diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 5cd95a14..dda3dfe8 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -23,9 +23,11 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
- python -m pip install .[entity_preprocessing]
+ python -m pip install .[preprocessing]
python -m pip install -r requirements-dev.txt
- name: Test
run: |
python -m pytest tests/test_get_dataloaders.py
python -m pytest tests/test_metadata_utils.py
+ python -m pytest tests/test_preprocessing_utils.py
+ python -m pytest tests/preprocessing_tools
diff --git a/bsmetadata/evaluate.py b/bsmetadata/evaluate.py
new file mode 100644
index 00000000..adc1f755
--- /dev/null
+++ b/bsmetadata/evaluate.py
@@ -0,0 +1,211 @@
+import dataclasses
+import gc
+import json
+import logging
+import math
+import os
+import sys
+from dataclasses import dataclass, field
+from functools import partial
+from typing import Optional
+
+import hydra
+import torch
+import torch.nn.functional as F
+import wandb
+from accelerate import Accelerator
+from datasets.features import Value
+from hydra.core.config_store import ConfigStore
+from omegaconf import OmegaConf
+from tqdm.auto import tqdm as original_tqdm
+from transformers import AdamW, AutoModelForCausalLM, AutoTokenizer, get_scheduler, set_seed
+from transformers.trainer_utils import IntervalStrategy
+
+from bsmetadata.input_pipeline import DataConfig, get_dataloaders
+
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class CFG:
+    """Hydra structured config of the checkpoint evaluation script."""
+    data_config: DataConfig = field(default_factory=DataConfig)  # one fresh DataConfig per CFG, no shared default
+    out_dir: str = field(
+        default="output_dir", metadata={"help": "The output directory in which the trained model is saved."}
+    )
+    training_jobid: Optional[str] = field(default=None, metadata={"help": "The jobid of the training run."})
+    jobid: Optional[str] = field(default=None, metadata={"help": "The jobid of the evaluation."})
+    checkpoints_to_evaluate: str = field(
+        default="all",
+        metadata={
+            "help": "Indicate whether all checkpoints should be evaluated ('all') or only the last one ('last')"
+        },
+    )
+    eval_name: str = field(
+        default="ppl on val without metadata",
+        metadata={"help": "Name of the evaluation, used as prefix of the logged metric names."},
+    )
+    seed: int = field(default=42, metadata={"help": "The seed used for RNG initialization."})
+    model_name: str = field(default="gpt2", metadata={"help": "The name of the pretrained model to use."})
+    project_name: str = field(default="metadata_lm", metadata={"help": "The project name."})
+    do_eval: bool = field(default=True, metadata={"help": "Whether to run eval on the dev set."})
+
+
+cs = ConfigStore.instance()
+cs.store(name="config", node=CFG)
+
+
+def show_help(context="", cls=CFG):
+    """Print `name: help (default=...)` for every field of `cls`, recursing into nested dataclass fields."""
+    defaults = cls()
+    for spec in dataclasses.fields(cls):
+        if dataclasses.is_dataclass(spec.type):
+            show_help(context=f"{context}{spec.name}.", cls=spec.type)
+            continue
+        description = spec.metadata.get("help", "")
+        # The default is read from a freshly built instance of `cls`.
+        default_repr = json.dumps(getattr(defaults, spec.name))
+        print(f"{context}{spec.name}: {description} (default={default_repr})")
+
+
+class Logger:  # wandb wrapper: every method is a no-op unless `is_local_main_process` is true
+    def __init__(self, is_local_main_process, *args, **kwargs):
+        self.is_local_main_process = is_local_main_process
+        if self.is_local_main_process:
+            self.run = wandb.init(*args, **kwargs)  # remaining arguments are forwarded to wandb.init unchanged
+
+    def log(self, dic):
+        if self.is_local_main_process:
+            wandb.log(dic)
+
+    def close(self):
+        if self.is_local_main_process:
+            wandb.finish()
+
+
+def loss_fn(batch, outputs, metadata_mask=None):
+    """Return the mean next-token cross-entropy over the positions kept by the attention mask (minus metadata)."""
+    b = outputs.logits.size(0)
+    lm_logits = outputs.logits
+    labels = batch["labels"]
+    attention_mask = batch["attention_mask"]
+    # Shift so that the logits at position i are scored against the label at position i + 1.
+    shift_logits = lm_logits[..., :-1, :].contiguous()
+    shift_labels = labels[..., 1:].contiguous()
+    if metadata_mask is not None:
+        loss_mask = torch.logical_and(attention_mask, ~metadata_mask)  # `~` presumes a boolean mask -- TODO confirm
+    else:
+        loss_mask = attention_mask
+    shift_mask = loss_mask[..., 1:].contiguous()
+    # Flatten the tokens
+    loss = F.cross_entropy(
+        shift_logits.view(-1, shift_logits.size(-1)),
+        shift_labels.view(-1),
+        reduction="none",
+    ).view(b, -1)
+    # NOTE(review): the denominator is zero when every position is masked out -- confirm this cannot happen.
+    loss = (loss * shift_mask).sum() / shift_mask.sum()
+    return loss
+
+
+@hydra.main(config_path=None, config_name="config")
+def main(args: CFG) -> None:
+    """Evaluate the checkpoints of a training run on every eval dataloader and log the perplexities to wandb."""
+    print(OmegaConf.to_yaml(args))
+
+    # The dataset library use the hash of the arguments to create the cache
+    # name. Without this transformation the hash of args is not deterministic
+    args = OmegaConf.to_object(args)
+
+    set_seed(args.seed)
+    accelerator = Accelerator()
+    is_local_main_process = accelerator.is_local_main_process
+    tqdm = partial(original_tqdm, disable=not is_local_main_process, position=0)
+
+    os.makedirs(args.out_dir, exist_ok=True)
+
+    # Setup logging
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO if is_local_main_process else logging.WARN,
+    )
+
+    # get dataloaders
+    logger.info("Load tokenizer")
+    tokenizer = AutoTokenizer.from_pretrained(args.model_name)
+    tokenizer.pad_token = tokenizer.eos_token
+    logger.info("Load dataloaders")
+    # Only the second value returned by get_dataloaders (the eval dataloaders) is used here.
+    _, eval_dataloaders = get_dataloaders(tokenizer, args.data_config)
+    logger.info("The dataloaders have been build")
+
+    if not args.do_eval:
+        return
+
+    # get model
+    logger.info("Load model")
+    model = AutoModelForCausalLM.from_pretrained(args.model_name)
+
+    # Prepare everything
+    model = accelerator.prepare(model)
+    eval_dataloaders = {k: accelerator.prepare(v) for k, v in eval_dataloaders.items()}
+
+    @torch.no_grad()
+    def evaluate(eval_dataloader):
+        """Return {"perplexity": exp(mean gathered loss)} of `model` over `eval_dataloader`."""
+        model.eval()
+        losses = []
+        for batch in tqdm(eval_dataloader, desc="eval"):
+            labels = batch.pop("labels")
+            metadata_mask = batch.pop("metadata_mask", None)
+            outputs = model(**batch)
+            batch["labels"] = labels
+            loss = loss_fn(batch, outputs, metadata_mask)
+            # `loss` is a scalar: repeat it so that gather() receives per_device_eval_batch_size entries.
+            losses.append(accelerator.gather(loss.repeat(args.data_config.per_device_eval_batch_size)))
+
+        losses = torch.cat(losses)
+        perplexity = math.exp(torch.mean(losses))
+        model.train()
+        return {"perplexity": perplexity}
+
+    if args.training_jobid is None:
+        raise ValueError("'training_jobid' must be set: it names the sub-directory of 'out_dir' holding the checkpoints.")
+    checkpoint_dir = os.path.join(args.out_dir, args.training_jobid)
+    checkpoint_paths = sorted(
+        os.path.join(checkpoint_dir, file_name)
+        for file_name in os.listdir(checkpoint_dir)
+        if file_name.split(".")[-1] == "pt" and file_name.split("-")[0] == "checkpoint"
+    )
+    if not checkpoint_paths:
+        logger.warning(f"No checkpoint found in {checkpoint_dir}")
+    if args.checkpoints_to_evaluate == "last":
+        checkpoint_paths = checkpoint_paths[-1:]  # slicing keeps an empty list empty instead of raising IndexError
+    elif args.checkpoints_to_evaluate != "all":
+        raise ValueError("Wrong argument set for 'checkpoints_to_evaluate', valid possibilities are 'all' or 'last'.")
+
+    logger_metrics = Logger(is_local_main_process, project=args.project_name, config=args)
+
+    logger.info(f"Will evaluate the following checkpoints: {checkpoint_paths}")
+    for checkpoint_path in checkpoint_paths:
+        # `checkpoint_path` is already complete; the step is parsed from the file name only, not from the directories.
+        step = os.path.basename(checkpoint_path).split(".")[0].split("-")[-1].split("step")[0]
+        logger.info(f"Loading state dict for the checkpoint of step {step}")
+        state_dict = torch.load(checkpoint_path)["state_dict"]
+        logger.info("Loading state dict finished")
+        model.load_state_dict(state_dict)
+
+        logger.info(f"***** Evaluation step {step} *****")
+        for key, eval_dataloader in eval_dataloaders.items():
+            metrics = evaluate(eval_dataloader)
+            logger_metrics.log({f"{args.eval_name} {key}": metrics, "step": step})
+    logger_metrics.close()
+
+
+if __name__ == "__main__":
+    if "--help" in sys.argv or "-h" in sys.argv:  # print the per-field help of show_help() and exit before main()
+        show_help()
+        sys.exit()
+    main()
diff --git a/bsmetadata/experiments/with_metadata.py b/bsmetadata/experiments/with_metadata.py
index c48802f3..f431074e 100644
--- a/bsmetadata/experiments/with_metadata.py
+++ b/bsmetadata/experiments/with_metadata.py
@@ -147,6 +147,32 @@ def create_labels_column(examples):
logger.info(f" Num train examples = {len(train_dataset)}")
logger.info(f" Num validation examples = {len(val_dataset)}")
+ for idx in range(len(train_dataset)):
+ if 1 in train_dataset[idx]["metadata_mask"]:
+ logger.info(" Train sample with metadata")
+ logger.info(f" Train sample n°{idx} attention_mask:\n{train_dataset[idx]['attention_mask']}")
+ logger.info(f" Train sample n°{idx} metadata_mask:\n{train_dataset[idx]['metadata_mask']}")
+ logger.info(f" Train sample n°{idx} input_ids:\n{train_dataset[idx]['input_ids']}")
+ logger.info(
+ f" Train sample n°{idx} input_ids decoded:\n{tokenizer.decode(train_dataset[idx]['input_ids'])}"
+ )
+ logger.info(
+ f" Train sample n°{idx} tokens:\n{tokenizer.convert_ids_to_tokens(train_dataset[idx]['input_ids'])}"
+ )
+ break
+ for idx in range(len(train_dataset)):
+ if 1 not in train_dataset[idx]["metadata_mask"]:
+ logger.info(" Train sample without metadata")
+ logger.info(f" Train sample n°{idx} attention_mask:\n{train_dataset[idx]['attention_mask']}")
+ logger.info(f" Train sample n°{idx} metadata_mask:\n{train_dataset[idx]['metadata_mask']}")
+ logger.info(f" Train sample n°{idx} input_ids:\n{train_dataset[idx]['input_ids']}")
+ logger.info(
+ f" Train sample n°{idx} input_ids decoded:\n{tokenizer.decode(train_dataset[idx]['input_ids'])}"
+ )
+ logger.info(
+ f" Train sample n°{idx} tokens:\n{tokenizer.convert_ids_to_tokens(train_dataset[idx]['input_ids'])}"
+ )
+ break
# DataLoaders creation:
train_dataloader = DataLoader(
diff --git a/bsmetadata/experiments/without_metadata.py b/bsmetadata/experiments/without_metadata.py
index 8f6b197b..e890e44d 100644
--- a/bsmetadata/experiments/without_metadata.py
+++ b/bsmetadata/experiments/without_metadata.py
@@ -59,6 +59,7 @@ def get_dataloaders(tokenizer, args):
datasets = load_dataset(
args.dataset_name,
args.dataset_config_name,
+ data_files=data_files,
cache_dir=args.cache_dir,
keep_in_memory=False,
)
@@ -103,7 +104,7 @@ def get_dataloaders(tokenizer, args):
# Preprocessing the datasets.
# First we tokenize all the texts.
- column_names = datasets["train"].column_names
+ column_names = datasets["train"].column_names if "train" in datasets else datasets["validation"].column_names
text_column_name = "text" if "text" in column_names else column_names[0]
def tokenize_function(examples):
@@ -172,11 +173,30 @@ def group_texts(examples):
)
logger.info("Group texts finished")
- train_dataset = datasets["train"]
+ train_dataset = datasets["train"] if "train" in datasets else None
val_dataset = datasets["validation"]
- logger.info(f" Num train examples = {len(train_dataset)}")
+ if "train" in datasets:
+ logger.info(f" Num train examples = {len(train_dataset)}")
logger.info(f" Num validation examples = {len(val_dataset)}")
+ if "train" in datasets:
+ logger.info(" Train sample without metadata")
+ for idx in range(3):
+ logger.info(f" Train sample n°{idx} attention_mask:\n{train_dataset[idx]['attention_mask']}")
+ logger.info(f" Train sample n°{idx} input_ids:\n{train_dataset[idx]['input_ids']}")
+ logger.info(f" Train sample n°{idx} input_ids decoded:\n{tokenizer.decode(train_dataset[idx]['input_ids'])}")
+ logger.info(
+ f" Train sample n°{idx} tokens:\n{tokenizer.convert_ids_to_tokens(train_dataset[idx]['input_ids'])}"
+ )
+ else:
+ logger.info(" Validation sample without metadata")
+ for idx in range(3):
+ logger.info(f" Validation sample n°{idx} attention_mask:\n{val_dataset[idx]['attention_mask']}")
+ logger.info(f" Validation sample n°{idx} input_ids:\n{val_dataset[idx]['input_ids']}")
+ logger.info(f" Validation sample n°{idx} input_ids decoded:\n{tokenizer.decode(val_dataset[idx]['input_ids'])}")
+ logger.info(
+ f" Validation sample n°{idx} tokens:\n{tokenizer.convert_ids_to_tokens(val_dataset[idx]['input_ids'])}"
+ )
# DataLoaders creation:
train_dataloader = DataLoader(
@@ -184,7 +204,7 @@ def group_texts(examples):
shuffle=True,
collate_fn=default_data_collator,
batch_size=args.per_device_train_batch_size,
- )
+ ) if "train" in datasets else None
val_dataloader1 = DataLoader(
val_dataset,
collate_fn=default_data_collator,
diff --git a/bsmetadata/preprocessing_tools/__init__.py b/bsmetadata/preprocessing_tools/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/bsmetadata/preprocessing_tools/html_parser/__init__.py b/bsmetadata/preprocessing_tools/html_parser/__init__.py
new file mode 100644
index 00000000..7a0daba7
--- /dev/null
+++ b/bsmetadata/preprocessing_tools/html_parser/__init__.py
@@ -0,0 +1,36 @@
+from typing import List, Optional
+
+from bsmetadata.preprocessing_tools.html_parser.filters_and_cleaners import TextAndMetadataCleaner
+from bsmetadata.preprocessing_tools.html_parser.objects import TagToRemoveWithContent
+
+
+def get_clean_text_and_metadata(  # thin wrapper: builds a TextAndMetadataCleaner and returns its apply() result
+    html_str,
+    tags_to_remove_with_content: Optional[List[TagToRemoveWithContent]] = None,
+    tags_to_remove_alone: Optional[List[str]] = None,  # NOTE(review): the cleaner annotates a list of TagToRemove
+    attrs_to_keep: Optional[List[str]] = None,
+    consecutive_tags_to_fold: Optional[List[str]] = None,
+    convert_br_tag_to_breaking_line: Optional[bool] = False,
+    txt_max_chr_len_alone: float = -float("inf"),
+    txt_min_chr_len_alone: float = -float("inf"),
+    tags_exceptions_to_txt_max_min_chr_len_alone: List[str] = None,
+    txt_max_chr_len_with_content: float = -float("inf"),
+    txt_min_chr_len_with_content: float = -float("inf"),
+    tags_exceptions_to_txt_max_min_chr_len_with_content: List[str] = None,
+):
+    text_and_metadata_cleaner = TextAndMetadataCleaner(
+        html_str=html_str,
+        tags_to_remove_with_content=tags_to_remove_with_content,
+        tags_to_remove_alone=tags_to_remove_alone,
+        attrs_to_keep=attrs_to_keep,
+        start_parsing_at_tag="body",  # this entry point always starts the parsing at the body tag
+        consecutive_tags_to_fold=consecutive_tags_to_fold,
+        convert_br_tag_to_breaking_line=convert_br_tag_to_breaking_line,
+        txt_max_chr_len_alone=txt_max_chr_len_alone,
+        txt_min_chr_len_alone=txt_min_chr_len_alone,
+        tags_exceptions_to_txt_max_min_chr_len_alone=tags_exceptions_to_txt_max_min_chr_len_alone,
+        txt_max_chr_len_with_content=txt_max_chr_len_with_content,
+        txt_min_chr_len_with_content=txt_min_chr_len_with_content,
+        tags_exceptions_to_txt_max_min_chr_len_with_content=tags_exceptions_to_txt_max_min_chr_len_with_content,
+    )
+    return text_and_metadata_cleaner.apply()
diff --git a/bsmetadata/preprocessing_tools/html_parser/filters_and_cleaners.py b/bsmetadata/preprocessing_tools/html_parser/filters_and_cleaners.py
new file mode 100644
index 00000000..fc3c6bf9
--- /dev/null
+++ b/bsmetadata/preprocessing_tools/html_parser/filters_and_cleaners.py
@@ -0,0 +1,460 @@
+from typing import DefaultDict, List, Optional, Tuple
+
+import htmlmin
+from lxml import etree
+from lxml.html import fromstring
+
+from bsmetadata.preprocessing_tools.html_parser.objects import HtmlTag, Metadata, TagToRemove, TagToRemoveWithContent
+from bsmetadata.preprocessing_tools.html_parser.variables import (
+ BLOCK_CONTENT_SEPARATOR,
+ BLOCK_ELEMENTS,
+ FAKE_TAG_BASIC,
+ FAKE_TAG_BLOCK,
+ FAKE_TAG_INLINE,
+ INLINE_ELEMENTS_SPACING,
+ PLAIN_TEXT_SEPARATOR,
+ PRE_TAG,
+)
+
+
+class AttributeCleaner:
+    """Split a tag's attributes into parallel name/value lists, keeping only the allowed names."""
+
+    def __init__(self, attrs_to_keep: Optional[List[str]]):
+        # None means that every attribute is kept.
+        self.attrs_to_keep = attrs_to_keep
+
+    def _test(self, attr):
+        if self.attrs_to_keep is None:
+            return True
+        return attr in self.attrs_to_keep
+
+    def __call__(self, attrs: List[Tuple[str]]):
+        # A list is read as (name, value) pairs; anything else goes through dict() first.
+        pairs = attrs if isinstance(attrs, list) else dict(attrs).items()
+        kept_names, kept_values = [], []
+        for name, value in pairs:
+            if self._test(name):
+                kept_names.append(name)
+                kept_values.append(value)
+        return {
+            "attrs": kept_names,
+            "values": kept_values,
+        }
+
+
+class TagFilter:
+    """Decide, from content-length rules, which tags are dropped alone or together with their content."""
+
+    def __init__(
+        self,
+        tags_to_remove_alone: Optional[List[TagToRemove]],
+        tags_to_remove_with_content: Optional[List[TagToRemoveWithContent]],
+        txt_max_chr_len_alone: Optional[float] = -float("inf"),
+        txt_min_chr_len_alone: Optional[float] = -float("inf"),
+        tags_exceptions_alone: Optional[List[str]] = None,
+        txt_max_chr_len_with_content: Optional[float] = -float("inf"),
+        txt_min_chr_len_with_content: Optional[float] = -float("inf"),
+        tags_exceptions_with_content: Optional[List[str]] = None,
+    ):
+        self.txt_max_chr_len_alone = txt_max_chr_len_alone
+        self.txt_min_chr_len_alone = txt_min_chr_len_alone
+        self.tags_exceptions_alone = tags_exceptions_alone if tags_exceptions_alone is not None else []
+        self.txt_max_chr_len_with_content = txt_max_chr_len_with_content
+        self.txt_min_chr_len_with_content = txt_min_chr_len_with_content
+        self.tags_exceptions_with_content = (
+            tags_exceptions_with_content if tags_exceptions_with_content is not None else []
+        )
+        self.tags_to_remove_alone = (
+            {tag_to_remove.tag: tag_to_remove for tag_to_remove in tags_to_remove_alone}
+            if isinstance(tags_to_remove_alone, list)
+            else {}
+        )
+        self.tags_to_remove_with_content = (
+            {tag_to_remove.tag: tag_to_remove for tag_to_remove in tags_to_remove_with_content}
+            if isinstance(tags_to_remove_with_content, list)
+            else {}
+        )
+
+        for tag_to_remove_characteristics in self.tags_to_remove_with_content.values():
+            if tag_to_remove_characteristics.method not in ["top-down", "bottom-up"]:
+                raise ValueError(
+                    f"You have requested to remove {tag_to_remove_characteristics.tag} tags and their content if the "
+                    f"content has a size between {tag_to_remove_characteristics.content_min_char_length} and "
+                    f"{tag_to_remove_characteristics.content_max_char_length} with an invalid method "
+                    f"({tag_to_remove_characteristics.method}). Valid methods are 'top-down' and 'bottom-up'."
+                )
+        # todo sanitize tags_to_remove_with_content
+
+    def drop_tag(self, metadata_node):
+        """Return True if the node matches its per-tag `alone` rule or the global `txt_*_chr_len_alone` bounds."""
+        tag = str(metadata_node.value.tag)
+        drop_tag = False
+        content_char_length = (
+            metadata_node.char_end_idx - metadata_node.char_start_idx if metadata_node.char_end_idx is not None else 0
+        )
+
+        if (
+            tag in self.tags_to_remove_alone
+            and content_char_length <= self.tags_to_remove_alone[tag].content_max_char_length
+            and content_char_length >= self.tags_to_remove_alone[tag].content_min_char_length
+        ):
+            drop_tag = True
+
+        if tag not in self.tags_exceptions_alone:
+            if content_char_length <= self.txt_max_chr_len_alone and content_char_length >= self.txt_min_chr_len_alone:
+                drop_tag = True
+        return drop_tag
+
+    def drop_tag_and_content_top_down(self, tag: str, text: str):
+        """Return True if `len(text)` matches the tag's rule or the global bounds; bottom-up tags give False."""
+        if tag in self.tags_to_remove_with_content and self.tags_to_remove_with_content[tag].method != "top-down":
+            return False
+
+        drop_tag = False
+        content_char_length = len(text)
+        if (
+            tag in self.tags_to_remove_with_content
+            and content_char_length <= self.tags_to_remove_with_content[tag].content_max_char_length
+            and content_char_length >= self.tags_to_remove_with_content[tag].content_min_char_length
+        ):
+            drop_tag = True
+
+        if tag not in self.tags_exceptions_with_content:
+            if (
+                content_char_length <= self.txt_max_chr_len_with_content
+                and content_char_length >= self.txt_min_chr_len_with_content
+            ):
+                drop_tag = True
+        return drop_tag
+
+    def drop_tag_and_content_bottom_up(self, tag: str, text: str):
+        """Return True only for a tag configured as "bottom-up" whose `len(text)` lies within its bounds."""
+        if tag not in self.tags_to_remove_with_content:
+            return False
+
+        tag_to_remove_characteristics = self.tags_to_remove_with_content[tag]
+        if tag_to_remove_characteristics.method != "bottom-up":
+            return False
+
+        content_char_length = len(text)
+        if (
+            content_char_length <= tag_to_remove_characteristics.content_max_char_length
+            and content_char_length >= tag_to_remove_characteristics.content_min_char_length
+        ):
+            return True
+        return False
+
+
+class ConsecutiveTagCleaner:  # renames a tag nested as the only child of an identical tag to a fake tag
+    def __init__(
+        self,
+        block_elements: list,
+        consecutive_tags_to_fold: Optional[List[str]],
+    ):
+        self.consecutive_tags_to_fold = consecutive_tags_to_fold if isinstance(consecutive_tags_to_fold, list) else []
+        self.fake_tag_block = FAKE_TAG_BLOCK
+        self.fake_tag_inline = FAKE_TAG_INLINE
+        self.fake_tag_basic = FAKE_TAG_BASIC
+        self.attrib_separator = " "
+        self.block_elements = block_elements
+
+    def __call__(self, root):
+        tag = root.tag
+        if (tag in self.consecutive_tags_to_fold and len(root) == 1 and root[0].tag == tag) or (
+            tag in [FAKE_TAG_BLOCK, FAKE_TAG_INLINE, FAKE_TAG_BASIC]
+            and len(root) == 1
+            and "previous_tag" in root.attrib
+            and root[0].tag == root.attrib["previous_tag"]
+        ):  # has 1 child
+            # The fake tag given to the child depends on which element list `tag` belongs to.
+            if tag in self.block_elements:
+                root[0].tag = self.fake_tag_block
+            elif tag in INLINE_ELEMENTS_SPACING:
+                root[0].tag = self.fake_tag_inline
+            else:
+                root[0].tag = self.fake_tag_basic
+            # Climb to the nearest ancestor that is not a fake tag (`root` itself when it is a real tag).
+            parent_root = root
+            while parent_root.tag in [FAKE_TAG_BLOCK, FAKE_TAG_INLINE, FAKE_TAG_BASIC]:
+                parent_root = parent_root.getparent()
+            # Merge the child's attributes into that ancestor; values of shared keys are joined with a space.
+            for key, value in root[0].attrib.items():
+                if key in parent_root.attrib:
+                    parent_root.attrib[key] += self.attrib_separator + value
+                else:
+                    parent_root.attrib[key] = value
+            root[0].attrib["previous_tag"] = tag
+
+
+def _remove_keeping_tail(element):
+    """Save the tail text and then delete the element"""
+    _preserve_tail_before_delete(element)
+    element.getparent().remove(element)
+
+
+def _preserve_tail_before_delete(node):  # moves node.tail to the previous sibling or, failing that, to the parent
+    if node.tail:  # preserve the tail
+        previous = node.getprevious()
+        if previous is not None:  # if there is a previous sibling it will get the tail
+            if previous.tail is None:
+                previous.tail = node.tail
+            elif (  # NOTE(review): this branch and the next one write to previous.text, not previous.tail -- confirm
+                previous.text
+                and not previous.text.endswith(PLAIN_TEXT_SEPARATOR)
+                and not node.tail.startswith(PLAIN_TEXT_SEPARATOR)
+            ):
+                previous.text = previous.text + PLAIN_TEXT_SEPARATOR + node.tail
+            elif (
+                previous.text
+                and previous.text.endswith(PLAIN_TEXT_SEPARATOR)
+                and node.tail.startswith(PLAIN_TEXT_SEPARATOR)
+            ):
+                # Do not accumulate separators: drop the trailing one before appending the tail
+                previous.text = previous.text[: -len(PLAIN_TEXT_SEPARATOR)] + node.tail
+            elif (
+                previous.tail
+                and not previous.tail.endswith(PLAIN_TEXT_SEPARATOR)
+                and not node.tail.startswith(PLAIN_TEXT_SEPARATOR)
+            ):
+                previous.tail = previous.tail + PLAIN_TEXT_SEPARATOR + node.tail
+            else:
+                previous.tail = previous.tail + node.tail
+        else:  # The parent gets the tail as text
+            parent = node.getparent()
+            if parent.text is None:
+                parent.text = node.tail
+            elif not parent.text.endswith(PLAIN_TEXT_SEPARATOR) and not node.tail.startswith(PLAIN_TEXT_SEPARATOR):
+                parent.text = parent.text + PLAIN_TEXT_SEPARATOR + node.tail
+            elif parent.text.endswith(PLAIN_TEXT_SEPARATOR) and node.tail.startswith(PLAIN_TEXT_SEPARATOR):
+                # Do not accumulate separators: drop the trailing one before appending the tail
+                parent.text = parent.text[: -len(PLAIN_TEXT_SEPARATOR)] + node.tail
+            else:
+                parent.text = parent.text + node.tail
+
+
+class TextAndMetadataCleaner:
+    def __init__(
+        self,
+        html_str,
+        tags_to_remove_with_content: Optional[List[TagToRemoveWithContent]] = None,
+        tags_to_remove_alone: Optional[List[TagToRemove]] = None,
+        attrs_to_keep: Optional[List[str]] = None,
+        start_parsing_at_tag: Optional[str] = "body",
+        consecutive_tags_to_fold: Optional[List[str]] = None,
+        convert_br_tag_to_breaking_line: Optional[bool] = False,
+        txt_max_chr_len_alone: float = -float("inf"),
+        txt_min_chr_len_alone: float = -float("inf"),
+        tags_exceptions_to_txt_max_min_chr_len_alone: List[str] = None,
+        txt_max_chr_len_with_content: float = -float("inf"),
+        txt_min_chr_len_with_content: float = -float("inf"),
+        tags_exceptions_to_txt_max_min_chr_len_with_content: List[str] = None,
+    ):
+        """Store the cleaning options and build the tag folder, attribute cleaner and tag filter helpers."""
+        self.html_str = html_str
+        self.tags_to_remove_with_content = tags_to_remove_with_content
+        self.attrs_to_keep = attrs_to_keep
+        self.start_parsing_at_tag = start_parsing_at_tag
+        self.convert_br_tag_to_breaking_line = convert_br_tag_to_breaking_line
+
+        # Work on a copy: the fake tags (and "br") appended below must not leak into the caller's list.
+        self.tags_to_remove_alone = list(tags_to_remove_alone) if tags_to_remove_alone is not None else []
+        self.tags_to_remove_alone.extend(
+            [
+                TagToRemove(FAKE_TAG_BLOCK),
+                TagToRemove(FAKE_TAG_INLINE),
+                TagToRemove(FAKE_TAG_BASIC),
+            ]
+        )
+
+        self.block_elements = BLOCK_ELEMENTS.copy()
+        if self.convert_br_tag_to_breaking_line:
+            self.block_elements.remove("br")
+            self.tags_to_remove_alone.append(TagToRemove("br"))
+
+        self.consecutive_tag_cleaner = ConsecutiveTagCleaner(
+            block_elements=self.block_elements,
+            consecutive_tags_to_fold=consecutive_tags_to_fold,
+        )
+
+        self.attribute_cleaner = AttributeCleaner(attrs_to_keep=attrs_to_keep)
+        self.tag_filter = TagFilter(
+            txt_max_chr_len_alone=txt_max_chr_len_alone,
+            txt_min_chr_len_alone=txt_min_chr_len_alone,
+            tags_exceptions_alone=tags_exceptions_to_txt_max_min_chr_len_alone,
+            txt_max_chr_len_with_content=txt_max_chr_len_with_content,
+            txt_min_chr_len_with_content=txt_min_chr_len_with_content,
+            tags_exceptions_with_content=tags_exceptions_to_txt_max_min_chr_len_with_content,
+            tags_to_remove_alone=self.tags_to_remove_alone,
+            tags_to_remove_with_content=tags_to_remove_with_content,
+        )
+
+    def apply(self):
+        """Return `(plain_text, metadata)` extracted from `self.html_str`; ValueError if the start tag is absent."""
+        html_str = self.html_str
+        # Step 1: start the parsing at a specific tag (mostly tested with <body>)
+        if self.start_parsing_at_tag is not None:
+            root = fromstring(html_str)
+            find = etree.XPath(f"//{self.start_parsing_at_tag}")
+            try:
+                new_etree = find(root)[0]
+            except IndexError:
+                raise ValueError(
+                    f"You have asked to start parsing at the {self.start_parsing_at_tag} tag but the current example "
+                    "does not contain this tag"
+                )
+            html_str = etree.tostring(new_etree, method="html", encoding="UTF-8", pretty_print=False).decode("UTF-8")
+            if not html_str.startswith("<html>"):
+                self.tag_filter.tags_to_remove_alone.update({"html": TagToRemove("html")})
+
+                # Re-add an <html> wrapper, otherwise `fromstring` does something strange
+                html_str = f"<html>{html_str}</html>"
+
+        # Step 2: [all treatments impacting the chr_idx] remove sub-trees from the HTML and minify the html
+        html_str = htmlmin.minify(html_str, remove_comments=True, keep_pre=True)
+
+        new_etree = fromstring(html_str)
+        self._clean_etree(new_etree)
+
+        html_str = etree.tostring(new_etree, method="html", encoding="UTF-8", pretty_print=False).decode("UTF-8")
+        html_str = htmlmin.minify(html_str, keep_pre=True)  # NOTE(review): this string is never read afterwards
+
+        # Step 3: separate the text from the list of metadata that is kept
+        self.metadata = []
+        self._current_char_idx = 0
+        self._current_num_metadata_by_idx = DefaultDict(lambda: 0)
+        self.text = ""
+        self.last_tag = None
+
+        plain_text = self._get_text_and_update_metadata(new_etree)
+
+        self._clean_relative_pos(self.metadata)
+
+        return plain_text, self.metadata
+
+    def _br_conversion(self, tag):  # a "br" tag appends a line break to the text being built
+        if tag == "br":
+            self.text += "\n"
+
+    def _clean_relative_pos(self, metadata):
+        """Renumber, per character index, the relative start/end positions as 0, 1, 2, ... in their current order."""
+        metadata_dict_idx = DefaultDict(dict)
+        for metadata_node in metadata:
+            metadata_dict_idx[metadata_node.char_start_idx][metadata_node.relative_start_pos] = (
+                "start",
+                metadata_node,
+            )
+            metadata_dict_idx[metadata_node.char_end_idx][metadata_node.relative_end_pos] = ("end", metadata_node)
+
+        for absolute_idx, value in metadata_dict_idx.items():
+            pos_sorted = sorted(list(value.keys()))
+            idx = 0
+            for pos in pos_sorted:
+                if metadata_dict_idx[absolute_idx][pos][0] == "start":
+                    metadata_dict_idx[absolute_idx][pos][1].relative_start_pos = idx
+                    idx += 1
+                if metadata_dict_idx[absolute_idx][pos][0] == "end":
+                    metadata_dict_idx[absolute_idx][pos][1].relative_end_pos = idx
+                    idx += 1
+
+    def _add_text(self, tag, new_text):
+        """Append the separator required by `tag`, then `new_text`, and refresh the current character index."""
+        if tag in self.block_elements:
+            self.text = self._append_block_separator(self.text)
+        elif tag in INLINE_ELEMENTS_SPACING:
+            self.text = self._append_inline_element_separator(self.text)
+        if new_text:
+            self._append_text_content(new_text)
+
+        self._current_char_idx = len(self.text)
+
+    def _append_text_content(self, txt):
+        if self.current_tag == PRE_TAG:
+            self.text += txt
+        else:
+            txt = txt.replace("\u00a0", " ")
+            # Copy char by char, turning line breaks into spaces and skipping a space that follows whitespace.
+            c = " "
+            if len(self.text) > 0:
+                c = self.text[-1]
+            for i in range(len(txt)):
+                c2 = txt[i]
+                if c2 == "\r" or c2 == "\n":
+                    c2 = " "
+                if not c.isspace() or not c2.isspace():
+                    self.text += c2
+                c = c2
+
+    def _append_block_separator(self, sb):  # returns `sb` ending with a block separator; empty input is unchanged
+        length = len(sb)
+        if length > 0:
+            # remove white space before paragraph break
+            # if self.last_tag != PRE_TAG:
+            #     while (length > 0 and sb[-1] == PLAIN_TEXT_SEPARATOR):
+            #         sb = sb[:-len(PLAIN_TEXT_SEPARATOR)]
+            if length > 0 and sb[-1] == PLAIN_TEXT_SEPARATOR:
+                sb = sb[:-1] + BLOCK_CONTENT_SEPARATOR
+            elif length > 0 and sb[-1] != BLOCK_CONTENT_SEPARATOR:
+                sb += BLOCK_CONTENT_SEPARATOR
+        return sb
+
+    def _append_inline_element_separator(self, sb):  # adds a plain-text separator unless `sb` already ends with one
+        length = len(sb)
+        if length > 0:
+            last_buffer_char = sb[-1]
+            if last_buffer_char != PLAIN_TEXT_SEPARATOR and last_buffer_char != BLOCK_CONTENT_SEPARATOR:
+                sb += PLAIN_TEXT_SEPARATOR
+        return sb
+
+    def _get_text_and_update_metadata(self, root):
+        """Append the text under `root` to `self.text`, record a Metadata node per kept tag, return `self.text`."""
+        self.current_tag = root.tag
+        metadata_node = Metadata(
+            char_start_idx=self._current_char_idx,
+            relative_start_pos=self._current_num_metadata_by_idx[self._current_char_idx],
+            value=HtmlTag(tag=root.tag, attrs=self.attribute_cleaner(root.attrib)),
+        )
+
+        self._current_num_metadata_by_idx[self._current_char_idx] += 1
+
+        if self.convert_br_tag_to_breaking_line:
+            self._br_conversion(root.tag)
+
+        self._add_text(root.tag, root.text)
+        for idx, child in enumerate(root):
+            _ = self._get_text_and_update_metadata(child)
+
+        self.current_tag = root.tag
+
+        metadata_node.char_end_idx = self._current_char_idx
+        metadata_node.relative_end_pos = self._current_num_metadata_by_idx[self._current_char_idx]
+        self._current_num_metadata_by_idx[self._current_char_idx] += 1
+
+        self._add_text(root.tag, root.tail)
+
+        if not self.tag_filter.drop_tag(metadata_node=metadata_node):
+            self.metadata.append(metadata_node)
+
+        return self.text
+
+    def _clean_etree(
+        self,
+        root,
+    ):
+        """Recursively remove from the tree the tags (and their content) selected by the tag filter."""
+        self.consecutive_tag_cleaner(root)
+        # Top-Down deletion
+        plain_text = etree.tostring(root, method="text", encoding="UTF-8", pretty_print=False).decode("UTF-8")
+        text = plain_text[: -len(root.tail)] if root.tail else plain_text
+        if self.tag_filter.drop_tag_and_content_top_down(tag=root.tag, text=text):
+            _remove_keeping_tail(root)
+            return
+        # Iterate over a snapshot: a child may remove itself from `root` while it is being cleaned.
+        for child in list(root):
+            self._clean_etree(child)
+
+        # Bottom-UP deletion
+        plain_text = etree.tostring(root, method="text", encoding="UTF-8", pretty_print=False).decode("UTF-8")
+        text = plain_text[: -len(root.tail)] if root.tail else plain_text
+        if self.tag_filter.drop_tag_and_content_bottom_up(tag=root.tag, text=text):
+            _remove_keeping_tail(root)
diff --git a/bsmetadata/preprocessing_tools/html_parser/objects.py b/bsmetadata/preprocessing_tools/html_parser/objects.py
new file mode 100644
index 00000000..042b5c7f
--- /dev/null
+++ b/bsmetadata/preprocessing_tools/html_parser/objects.py
@@ -0,0 +1,56 @@
+from dataclasses import dataclass
+from typing import Optional, OrderedDict
+
+
+@dataclass
+class TagToRemove:  # rule naming a tag to drop when its content length lies within [min, max]
+    tag: str
+    content_min_char_length: float = 0
+    content_max_char_length: float = float("inf")
+
+
+# The "top-down" method goes through the DOM from the root to the leaves: a node that fulfills the elimination
+# conditions during this traversal is deleted.
+# Conversely, the "bottom-up" method goes through the DOM from the leaves to the root: a node that fulfills the
+# elimination conditions during this traversal is deleted.
+@dataclass
+class TagToRemoveWithContent:
+    tag: str
+    content_min_char_length: float = 0
+    content_max_char_length: float = float("inf")
+    method: str = "top-down"  # or "bottom-up"
+
+
+@dataclass
+class HtmlTag:  # a tag name together with its attributes
+    tag: str
+    attrs: dict  # presumably the {"attrs": [...], "values": [...]} mapping built by AttributeCleaner -- confirm
+
+
+@dataclass
+class Metadata:  # one HTML tag located in the extracted text by character indexes and relative positions
+    char_start_idx: int
+    relative_start_pos: int
+    value: HtmlTag
+    char_end_idx: Optional[int] = None  # None until the end of the tag is known
+    relative_end_pos: Optional[int] = None
+    key: str = "html"
+    type: str = "local"
+
+
+def convert_html_metadata_dataclass_to_dict(metadata: Metadata):  # flattens a Metadata node into an ordered dict
+    html_metadata_dict = OrderedDict(
+        {
+            "key": metadata.key,
+            "type": metadata.type,
+            "char_start_idx": metadata.char_start_idx,
+            "relative_start_pos": metadata.relative_start_pos,
+            "char_end_idx": metadata.char_end_idx,
+            "relative_end_pos": metadata.relative_end_pos,
+            # The information about the HTML tag is separated into two keys because the dictionary must have a stable
+            # format between the different types of metadata
+            "value": metadata.value.tag,
+            "html_attrs": metadata.value.attrs,
+        }
+    )
+    return html_metadata_dict
diff --git a/bsmetadata/preprocessing_tools/html_parser/variables.py b/bsmetadata/preprocessing_tools/html_parser/variables.py
new file mode 100644
index 00000000..25edb7d8
--- /dev/null
+++ b/bsmetadata/preprocessing_tools/html_parser/variables.py
@@ -0,0 +1,80 @@
+# Placeholder tag names. FAKE_TAG_BLOCK and FAKE_TAG_INLINE are registered in BLOCK_ELEMENTS and
+# INLINE_ELEMENTS_SPACING below; FAKE_TAG_BASIC belongs to neither list.
+FAKE_TAG_BLOCK = "fake_tag_block"
+FAKE_TAG_INLINE = "fake_tag_inline"
+FAKE_TAG_BASIC = "fake_tag_basic"
+
+# Tag names handled as block elements.
+# NOTE(review): presumably their content is delimited with BLOCK_CONTENT_SEPARATOR by the parser — confirm.
+BLOCK_ELEMENTS = [
+ "address",
+ "article",
+ "aside",
+ "blockquote",
+ "body",
+ "br",
+ "button",
+ "canvas",
+ "caption",
+ "col",
+ "colgroup",
+ "dd",
+ "div",
+ "dl",
+ "dt",
+ "embed",
+ "fieldset",
+ "figcaption",
+ "figure",
+ "footer",
+ "form",
+ "h1",
+ "h2",
+ "h3",
+ "h4",
+ "h5",
+ "h6",
+ "header",
+ "hgroup",
+ "hr",
+ "li",
+ "map",
+ "noscript",
+ "object",
+ "ol",
+ "output",
+ "p",
+ "pre",
+ "progress",
+ "section",
+ "table",
+ "tbody",
+ "textarea",
+ "tfoot",
+ "th",
+ "thead",
+ "tr",
+ "ul",
+ "video",
+ FAKE_TAG_BLOCK,
+]
+
+# Tag names handled as inline elements that require spacing.
+# NOTE(review): presumably their content is delimited with PLAIN_TEXT_SEPARATOR by the parser — confirm.
+# NOTE(review): "address" and "tbody" are also listed in BLOCK_ELEMENTS — confirm this overlap is intended and
+# which list takes precedence.
+INLINE_ELEMENTS_SPACING = [
+ "address",
+ "cite",
+ "details",
+ "datalist",
+ "iframe",
+ "img",
+ "input",
+ "label",
+ "legend",
+ "optgroup",
+ "q",
+ "select",
+ "summary",
+ "tbody",
+ "td",
+ "time",
+ FAKE_TAG_INLINE,
+]
+
+# Name of the preformatted-text tag, and the separators (a single space and a newline) used for plain text and for
+# block content respectively.
+PRE_TAG = "pre"
+PLAIN_TEXT_SEPARATOR = " "
+BLOCK_CONTENT_SEPARATOR = "\n"
diff --git a/bsmetadata/preprocessing_utils.py b/bsmetadata/preprocessing_utils.py
index 015a6ae7..0dd8c67e 100644
--- a/bsmetadata/preprocessing_utils.py
+++ b/bsmetadata/preprocessing_utils.py
@@ -24,6 +24,7 @@
from REL.ner import load_flair_ner
from REL.utils import process_results
+from bsmetadata.preprocessing_tools import html_parser
from bsmetadata.preprocessing_tools.website_desc_utils import WebsiteDescUtils
@@ -94,6 +95,47 @@ def _extract_timestamp_from_url(self, url: str) -> Optional[str]:
return date
+class HtmlPreprocessor(MetadataPreprocessor):
+    """Metadata preprocessor that turns raw HTML into plain text plus HTML metadata.
+
+    The HTML stored in the `name_html_column` column is split into a text and a list of HTML metadata containing
+    the tags, their attributes, their location in the text and their relative location to each other.
+    """
+
+    def __init__(self, name_html_column: str = "doc_html") -> None:
+        self.name_html_column = name_html_column
+        super().__init__()
+
+    def preprocess(self, examples: Dict[str, List]) -> Dict[str, List]:
+        """Extract the text and the HTML metadata of every example of the batch.
+
+        Args:
+            examples: Batch holding the HTML documents under `self.name_html_column` and, under "metadata", one
+                already-existing metadata list per example. Those lists are extended in place.
+
+        Returns:
+            The same `examples` mapping, with the extracted plain texts stored under "texts".
+        """
+        # Tags that are eliminated together with everything they contain ("footer" typically holds copyright).
+        tag_names_removed_with_content = ("script", "style", "header", "iframe", "footer", "form")
+        tags_to_remove_with_content = [
+            html_parser.objects.TagToRemoveWithContent(tag=tag_name) for tag_name in tag_names_removed_with_content
+        ]
+
+        html_docs = examples[self.name_html_column]
+        metadata_lists = examples["metadata"]  # the metadata column must already exist
+
+        plain_texts = []
+        for html_doc, metadata_list in zip(html_docs, metadata_lists):
+            text, html_nodes = html_parser.get_clean_text_and_metadata(
+                html_doc,
+                tags_to_remove_with_content=tags_to_remove_with_content,
+                consecutive_tags_to_fold=["div"],
+                convert_br_tag_to_breaking_line=True,
+            )
+            plain_texts.append(text)
+            metadata_list.extend(map(html_parser.objects.convert_html_metadata_dataclass_to_dict, html_nodes))
+
+        examples["texts"] = plain_texts
+        return examples
+
+
+
+
class WebsiteDescPreprocessor(MetadataPreprocessor):
"""Metadata preprocessor for adding website description based on URLs."""
diff --git a/experiments/jz/html/exp_1/README.md b/experiments/jz/html/exp_1/README.md
new file mode 100644
index 00000000..cb38feb3
--- /dev/null
+++ b/experiments/jz/html/exp_1/README.md
@@ -0,0 +1,3 @@
+# Experiment example
+
+This is a toy experiment example that can be run on JZ. This experiment is made up of sub-experiments, each corresponding to a run.
diff --git a/experiments/jz/html/exp_1/evaluate_without_metadata/00_create_evaluation_dataset.slurm b/experiments/jz/html/exp_1/evaluate_without_metadata/00_create_evaluation_dataset.slurm
new file mode 100644
index 00000000..e15c0de5
--- /dev/null
+++ b/experiments/jz/html/exp_1/evaluate_without_metadata/00_create_evaluation_dataset.slurm
@@ -0,0 +1,54 @@
+#!/bin/bash
+#SBATCH --job-name=modelling-metadata-html-metadata-exp1-evaluation-without-metadata-create-dataset # (change me!) job name
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node!
+#SBATCH --cpus-per-task=20 # (change me! between 0 and 40) number of cores per tasks
+#SBATCH --hint=nomultithread # we get physical cores not logical
+#SBATCH --time 02:00:00 # (change me! between 0 and 20h) maximum execution time (HH:MM:SS)
+#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name
+#SBATCH --error=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.err # error file name
+#SBATCH --account=six@cpu # account
+
+set -x -e
+
+# Next line will:
+# - load a conda environment with the dependencies on the master branch of github.com/bigscience-workshop/metadata/
+# - setup env vars ($HOME, $WORK, etc)
+# - load several modules (git)
+# Note: We can afford to have two conda environments: one stable for running experiments and one for development.
+# If there are new dependencies to install, you have to tell me about them and not do it in this script
+source $HOME/start-user
+
+# We are on an offline partition
+export HF_DATASETS_OFFLINE=1
+export TRANSFORMERS_OFFLINE=1
+
+# Folder for the clone of github.com/bigscience-workshop/metadata/
+cd $WORK/repos/sync/metadata/
+
+HUB_REPO_NAME='SaulLu/Natural_Questions_HTML_reduced_all' # (change me! e.g. SaulLu/Natural_Questions_HTML_Toy_v2)
+
+# We define the name of the folder in which the clone will be made
+#Define multi-character delimiter
+delimiter="/"
+#Concatenate the delimiter with the main string
+string=$HUB_REPO_NAME$delimiter
+
+#Split the text based on the delimiter
+myarray=()
+while [[ $string ]]; do
+ myarray+=("${string%%"$delimiter"*}")
+ string=${string#*"$delimiter"}
+done
+REPO_DIR="${DATASETS_CUSTOM}/${myarray[-1]}"
+
+THEPATH=$(scontrol show job $SLURM_JOBID | awk -F= '/Command=/{print $2}')
+THIS_SCRIPT_DIR=$( dirname $THEPATH)
+echo "$THIS_SCRIPT_DIR"
+
+python bsmetadata/evaluate.py \
+ --config-dir=$THIS_SCRIPT_DIR \
+ data_config.dataset_name="${REPO_DIR}" \
+ out_dir="${SCRATCH}/metadata_outputs/${SLURM_JOB_ID}" \
+ jobid="${SLURM_JOB_ID}" \
+ do_eval=false
diff --git a/experiments/jz/html/exp_1/evaluate_without_metadata/01_evaluate_without_metadata_subexp_1.slurm b/experiments/jz/html/exp_1/evaluate_without_metadata/01_evaluate_without_metadata_subexp_1.slurm
new file mode 100644
index 00000000..8dee0047
--- /dev/null
+++ b/experiments/jz/html/exp_1/evaluate_without_metadata/01_evaluate_without_metadata_subexp_1.slurm
@@ -0,0 +1,58 @@
+#!/bin/bash
+#SBATCH --job-name=modelling-metadata-html-exp1-subexp1-evaluation-without-metadata # (change me!) job name
+#SBATCH --ntasks=1 # number of MP tasks
+#SBATCH --constraint=v100-16g
+#SBATCH --gres=gpu:1 # number of GPUs per node
+#SBATCH --cpus-per-task=8 # (change me! between 0 and 40) number of cores per tasks
+#SBATCH --hint=nomultithread # we get physical cores not logical
+#SBATCH --time 20:00:00 # (change me! between 0 and 20h) maximum execution time (HH:MM:SS)
+#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name
+#SBATCH --error=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.err # error file name
+#SBATCH --account=six@gpu # account
+
+set -x -e
+
+# Next line will:
+# - load a conda environment with the dependencies on the master branch of github.com/bigscience-workshop/metadata/
+# - setup env vars ($HOME, $WORK, etc)
+# - load several modules (git)
+# Note: We can afford to have two conda environments: one stable for running experiments and one for development.
+# If there are new dependencies to install, you have to tell me about them and not do it in this script
+source $HOME/start-user
+
+# We are on an offline partition
+export HF_DATASETS_OFFLINE=1
+export TRANSFORMERS_OFFLINE=1
+# be careful about the cache folder for Wandb
+export WANDB_MODE=offline
+export WANDB_DIR=$SCRATCH
+
+# Folder for the clone of github.com/bigscience-workshop/metadata/
+cd $WORK/repos/sync/metadata/
+
+HUB_REPO_NAME='SaulLu/Natural_Questions_HTML_reduced_all' # (change me! e.g. SaulLu/Natural_Questions_HTML_Toy_v2)
+
+# We define the name of the folder in which the clone will be made
+#Define multi-character delimiter
+delimiter="/"
+#Concatenate the delimiter with the main string
+string=$HUB_REPO_NAME$delimiter
+
+#Split the text based on the delimiter
+myarray=()
+while [[ $string ]]; do
+ myarray+=("${string%%"$delimiter"*}")
+ string=${string#*"$delimiter"}
+done
+REPO_DIR="${DATASETS_CUSTOM}/${myarray[-1]}"
+
+THEPATH=$(scontrol show job $SLURM_JOBID | awk -F= '/Command=/{print $2}')
+THIS_SCRIPT_DIR=$( dirname $THEPATH)
+echo "$THIS_SCRIPT_DIR"
+
+# Launch the evaluation with the configuration of this directory; training_jobid identifies the training run.
+# The last argument carries no trailing backslash, so nothing appended after it can be swallowed by the command.
+python bsmetadata/evaluate.py \
+    --config-dir="$THIS_SCRIPT_DIR" \
+    data_config.dataset_name="${REPO_DIR}" \
+    out_dir="${SCRATCH}/metadata_outputs/" \
+    training_jobid="1857108" \
+    jobid="${SLURM_JOB_ID}"
diff --git a/experiments/jz/html/exp_1/evaluate_without_metadata/02_evaluate_without_metadata_subexp_2.slurm b/experiments/jz/html/exp_1/evaluate_without_metadata/02_evaluate_without_metadata_subexp_2.slurm
new file mode 100644
index 00000000..1fd0ecd2
--- /dev/null
+++ b/experiments/jz/html/exp_1/evaluate_without_metadata/02_evaluate_without_metadata_subexp_2.slurm
@@ -0,0 +1,58 @@
+#!/bin/bash
+#SBATCH --job-name=modelling-metadata-html-exp1-subexp2-evaluation-without-metadata # (change me!) job name
+#SBATCH --ntasks=1 # number of MP tasks
+#SBATCH --constraint=v100-16g
+#SBATCH --gres=gpu:1 # number of GPUs per node
+#SBATCH --cpus-per-task=8 # (change me! between 0 and 40) number of cores per tasks
+#SBATCH --hint=nomultithread # we get physical cores not logical
+#SBATCH --time 20:00:00 # (change me! between 0 and 20h) maximum execution time (HH:MM:SS)
+#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name
+#SBATCH --error=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.err # error file name
+#SBATCH --account=six@gpu # account
+
+set -x -e
+
+# Next line will:
+# - load a conda environment with the dependencies on the master branch of github.com/bigscience-workshop/metadata/
+# - setup env vars ($HOME, $WORK, etc)
+# - load several modules (git)
+# Note: We can afford to have two conda environments: one stable for running experiments and one for development.
+# If there are new dependencies to install, you have to tell me about them and not do it in this script
+source $HOME/start-user
+
+# We are on an offline partition
+export HF_DATASETS_OFFLINE=1
+export TRANSFORMERS_OFFLINE=1
+# be careful about the cache folder for Wandb
+export WANDB_MODE=offline
+export WANDB_DIR=$SCRATCH
+
+# Folder for the clone of github.com/bigscience-workshop/metadata/
+cd $WORK/repos/sync/metadata/
+
+HUB_REPO_NAME='SaulLu/Natural_Questions_HTML_reduced_all' # (change me! e.g. SaulLu/Natural_Questions_HTML_Toy_v2)
+
+# We define the name of the folder in which the clone will be made
+#Define multi-character delimiter
+delimiter="/"
+#Concatenate the delimiter with the main string
+string=$HUB_REPO_NAME$delimiter
+
+#Split the text based on the delimiter
+myarray=()
+while [[ $string ]]; do
+ myarray+=("${string%%"$delimiter"*}")
+ string=${string#*"$delimiter"}
+done
+REPO_DIR="${DATASETS_CUSTOM}/${myarray[-1]}"
+
+THEPATH=$(scontrol show job $SLURM_JOBID | awk -F= '/Command=/{print $2}')
+THIS_SCRIPT_DIR=$( dirname $THEPATH)
+echo "$THIS_SCRIPT_DIR"
+
+# Launch the evaluation with the configuration of this directory; training_jobid identifies the training run.
+# The last argument carries no trailing backslash, so nothing appended after it can be swallowed by the command.
+python bsmetadata/evaluate.py \
+    --config-dir="$THIS_SCRIPT_DIR" \
+    data_config.dataset_name="${REPO_DIR}" \
+    out_dir="${SCRATCH}/metadata_outputs/" \
+    training_jobid="1929863" \
+    jobid="${SLURM_JOB_ID}"
diff --git a/experiments/jz/html/exp_1/evaluate_without_metadata/03_evaluate_without_metadata_subexp_3.slurm b/experiments/jz/html/exp_1/evaluate_without_metadata/03_evaluate_without_metadata_subexp_3.slurm
new file mode 100644
index 00000000..ea794421
--- /dev/null
+++ b/experiments/jz/html/exp_1/evaluate_without_metadata/03_evaluate_without_metadata_subexp_3.slurm
@@ -0,0 +1,58 @@
+#!/bin/bash
+#SBATCH --job-name=modelling-metadata-html-exp1-subexp3-evaluation-without-metadata # (change me!) job name
+#SBATCH --ntasks=1 # number of MP tasks
+#SBATCH --constraint=v100-16g
+#SBATCH --gres=gpu:1 # number of GPUs per node
+#SBATCH --cpus-per-task=8 # (change me! between 0 and 40) number of cores per tasks
+#SBATCH --hint=nomultithread # we get physical cores not logical
+#SBATCH --time 20:00:00 # (change me! between 0 and 20h) maximum execution time (HH:MM:SS)
+#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name
+#SBATCH --error=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.err # error file name
+#SBATCH --account=six@gpu # account
+
+set -x -e
+
+# Next line will:
+# - load a conda environment with the dependencies on the master branch of github.com/bigscience-workshop/metadata/
+# - setup env vars ($HOME, $WORK, etc)
+# - load several modules (git)
+# Note: We can afford to have two conda environments: one stable for running experiments and one for development.
+# If there are new dependencies to install, you have to tell me about them and not do it in this script
+source $HOME/start-user
+
+# We are on an offline partition
+export HF_DATASETS_OFFLINE=1
+export TRANSFORMERS_OFFLINE=1
+# be careful about the cache folder for Wandb
+export WANDB_MODE=offline
+export WANDB_DIR=$SCRATCH
+
+# Folder for the clone of github.com/bigscience-workshop/metadata/
+cd $WORK/repos/sync/metadata/
+
+HUB_REPO_NAME='SaulLu/Natural_Questions_HTML_reduced_all' # (change me! e.g. SaulLu/Natural_Questions_HTML_Toy_v2)
+
+# We define the name of the folder in which the clone will be made
+#Define multi-character delimiter
+delimiter="/"
+#Concatenate the delimiter with the main string
+string=$HUB_REPO_NAME$delimiter
+
+#Split the text based on the delimiter
+myarray=()
+while [[ $string ]]; do
+ myarray+=("${string%%"$delimiter"*}")
+ string=${string#*"$delimiter"}
+done
+REPO_DIR="${DATASETS_CUSTOM}/${myarray[-1]}"
+
+THEPATH=$(scontrol show job $SLURM_JOBID | awk -F= '/Command=/{print $2}')
+THIS_SCRIPT_DIR=$( dirname $THEPATH)
+echo "$THIS_SCRIPT_DIR"
+
+# Launch the evaluation with the configuration of this directory; training_jobid identifies the training run.
+# The last argument carries no trailing backslash, so nothing appended after it can be swallowed by the command.
+python bsmetadata/evaluate.py \
+    --config-dir="$THIS_SCRIPT_DIR" \
+    data_config.dataset_name="${REPO_DIR}" \
+    out_dir="${SCRATCH}/metadata_outputs/" \
+    training_jobid="1898197" \
+    jobid="${SLURM_JOB_ID}"
diff --git a/experiments/jz/html/exp_1/evaluate_without_metadata/config.yaml b/experiments/jz/html/exp_1/evaluate_without_metadata/config.yaml
new file mode 100644
index 00000000..3bfa96c6
--- /dev/null
+++ b/experiments/jz/html/exp_1/evaluate_without_metadata/config.yaml
@@ -0,0 +1,24 @@
+# Hydra configuration shared by the evaluation .slurm scripts of this directory (passed through --config-dir).
+data_config:
+ experiment: without_metadata
+ per_device_eval_batch_size: 3
+ per_device_train_batch_size: 3
+ # Left null here: every .slurm script of this directory passes data_config.dataset_name on the command line.
+ dataset_name: null
+ dataset_config_name: null
+ train_file: null
+ validation_file: "nq-dev-*.jsonl.gz"
+ overwrite_cache: false
+ cache_dir: null
+ extension: json
+ preprocessing_num_workers: 80
+ validation_split_percentage: 5
+ block_size: null
+ map_batch_size: 3
+# Placeholders: the .slurm scripts override out_dir and jobid on the command line, and the 01-03 evaluation scripts
+# also set training_jobid.
+out_dir: output_dir
+training_jobid: null
+jobid: null
+# 'all' evaluates every checkpoint, 'last' only the last one.
+checkpoints_to_evaluate: all
+eval_name: ppl on val without metadata
+seed: 42
+model_name: gpt2
+project_name: metadata_lm
+# 00_create_evaluation_dataset.slurm overrides this with do_eval=false.
+do_eval: true
\ No newline at end of file
diff --git a/experiments/jz/html/exp_1/html_processor.py b/experiments/jz/html/exp_1/html_processor.py
new file mode 100644
index 00000000..8890a2ed
--- /dev/null
+++ b/experiments/jz/html/exp_1/html_processor.py
@@ -0,0 +1,159 @@
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Optional, Tuple
+
+from bsmetadata.input_pipeline import DataConfig
+from bsmetadata.metadata_processors import MetadataProcessor
+
+
+@dataclass
+class TagToRemove:
+    """Per-tag removal rule consumed by `TagFilter`.
+
+    A tag named `tag` is dropped by `TagFilter.drop_tag` when the character length of its content lies within
+    [`txt_min_chr_len`, `txt_max_chr_len`] (both bounds included).
+    """
+
+    tag: str
+    # Both bounds are floats so that the infinite default (and `.inf` values coming from YAML) are well typed.
+    txt_min_chr_len: float = 0
+    txt_max_chr_len: float = float("inf")
+
+
+# An HTML tag as rebuilt by `HtmlProcessor.process_local` from a stored metadata entry.
+@dataclass
+class HtmlTag:
+ # Tag name.
+ tag: str
+ # Attribute name -> attribute value mapping.
+ attrs: dict
+
+
+# One HTML metadata entry, in the form `TagFilter.drop_tag` expects.
+@dataclass
+class Metadata:
+ # Character index at which the tag starts.
+ char_start_idx: int
+ # The tag itself (name and attributes).
+ value: HtmlTag
+ # Character index at which the tag ends; when None, `TagFilter.drop_tag` treats the content length as 0.
+ char_end_idx: Optional[int] = None
+ # Metadata key and type; default to "html" and "local".
+ key: str = "html"
+ type: str = "local"
+
+
+@dataclass
+class AllTagsRules:
+    """Rules applied to every HTML tag.
+
+    `attributes_to_keep` lists the attributes `HtmlProcessor.process_local` renders in the opening tag. The two
+    length bounds form the global range used by `TagFilter.drop_tag`: a tag that is not listed in
+    `tags_exceptions_to_txt_max_min_chr_len` is dropped when its content length lies within
+    [`txt_min_chr_len`, `txt_max_chr_len`]. With the defaults (both minus infinity) no length can match.
+    """
+
+    attributes_to_keep: List[str] = field(default_factory=list, metadata={"help": "TODO."})
+    txt_max_chr_len: float = field(default=-float("inf"), metadata={"help": "TODO."})
+    txt_min_chr_len: float = field(default=-float("inf"), metadata={"help": "TODO."})
+    tags_exceptions_to_txt_max_min_chr_len: List[str] = field(default_factory=list, metadata={"help": "TODO."})
+
+
+@dataclass
+class HTMLParserConfig:
+    """Configuration of the HTML tag filtering performed by `HtmlProcessor`.
+
+    The three `tags_to_remove_alone_*` lists are parallel: `HtmlProcessor.__init__` zips them, so entry `i` of
+    each list describes one per-tag rule (tag name, maximum and minimum content length).
+    """
+
+    # Built by a factory: a dataclass instance is unhashable by default, and Python >= 3.11 rejects unhashable
+    # field defaults ("ValueError: mutable default ... use default_factory").
+    all_tags_rules: AllTagsRules = field(default_factory=AllTagsRules)
+    tags_to_remove_alone_tag_name: List[str] = field(
+        default_factory=list,
+        metadata={"help": "TODO."},
+    )
+    tags_to_remove_alone_txt_max_chr_len: List[float] = field(
+        default_factory=list,
+        metadata={"help": "TODO."},
+    )
+    tags_to_remove_alone_txt_min_chr_len: List[float] = field(
+        default_factory=list,
+        metadata={"help": "TODO."},
+    )
+
+
+class TagFilter:
+    """Decide whether the metadata of an HTML tag must be dropped.
+
+    Two kinds of rules are combined: per-tag rules (`tags_to_remove_alone`) and a global content-length range
+    that applies to every tag not listed in `tags_exceptions`.
+    """
+
+    def __init__(
+        self,
+        txt_max_chr_len: Optional[float] = -float("inf"),
+        txt_min_chr_len: Optional[float] = -float("inf"),
+        tags_exceptions: Optional[List[str]] = None,
+        tags_to_remove_alone: Optional[List[TagToRemove]] = None,
+    ):
+        # Per-tag rules indexed by tag name; any value that is not a list is ignored.
+        rules_by_tag = {}
+        if isinstance(tags_to_remove_alone, list):
+            for rule in tags_to_remove_alone:
+                rules_by_tag[rule.tag] = rule
+        self.tags_to_remove_alone = rules_by_tag
+        self.txt_max_chr_len = txt_max_chr_len
+        self.txt_min_chr_len = txt_min_chr_len
+        self.tags_exceptions = tags_exceptions or []
+
+    @staticmethod
+    def _length_in_range(length, upper, lower):
+        """Return True when `length` lies within [`lower`, `upper`] (both bounds included)."""
+        return length <= upper and length >= lower
+
+    def drop_tag(self, metadata_node):
+        """Return True when `metadata_node` matches a per-tag rule or the global length range.
+
+        The content length is `char_end_idx - char_start_idx`, or 0 when `char_end_idx` is None.
+        """
+        tag_name = str(metadata_node.value.tag)
+        end_idx = metadata_node.char_end_idx
+        span_length = 0 if end_idx is None else end_idx - metadata_node.char_start_idx
+
+        matches_tag_rule = False
+        if tag_name in self.tags_to_remove_alone:
+            tag_rule = self.tags_to_remove_alone[tag_name]
+            matches_tag_rule = self._length_in_range(
+                span_length, tag_rule.txt_max_chr_len, tag_rule.txt_min_chr_len
+            )
+
+        matches_global_rule = False
+        if tag_name not in self.tags_exceptions:
+            matches_global_rule = self._length_in_range(span_length, self.txt_max_chr_len, self.txt_min_chr_len)
+
+        return matches_tag_rule or matches_global_rule
+
+
+class HtmlProcessor(MetadataProcessor):
+    """Metadata processor that renders HTML tag metadata as opening and closing tag strings."""
+
+    def __init__(
+        self,
+        cfg: DataConfig,
+    ):
+        """
+        Args:
+            cfg: The data configuration to use. It must expose an `html_parser_config` attribute shaped like
+                `HTMLParserConfig`.
+        """
+        super().__init__(cfg)
+        parser_cfg = cfg.html_parser_config
+        all_tags_rules = parser_cfg.all_tags_rules
+
+        # The per-tag rules are stored as three parallel lists (tag name, max length, min length); `zip` pairs
+        # them up and stops at the shortest list.
+        tags_to_remove_alone = [
+            TagToRemove(tag=tag, txt_max_chr_len=tag_max_chr_len, txt_min_chr_len=tag_min_chr_len)
+            for (tag, tag_max_chr_len, tag_min_chr_len) in zip(
+                parser_cfg.tags_to_remove_alone_tag_name,
+                parser_cfg.tags_to_remove_alone_txt_max_chr_len,
+                parser_cfg.tags_to_remove_alone_txt_min_chr_len,
+            )
+        ]
+
+        self._tag_filter = TagFilter(
+            tags_to_remove_alone=tags_to_remove_alone,
+            txt_min_chr_len=all_tags_rules.txt_min_chr_len,
+            txt_max_chr_len=all_tags_rules.txt_max_chr_len,
+            tags_exceptions=all_tags_rules.tags_exceptions_to_txt_max_min_chr_len,
+        )
+        self._attributes_to_keep = all_tags_rules.attributes_to_keep
+
+    def process_local(self, metadata_attrs: Dict[str, Any]) -> Optional[Tuple[str, str]]:
+        """Render one HTML metadata entry as a pair of tag strings.
+
+        We represent a html tag `T` by enclosing the corresponding text span with "<T>" and "</T>".
+        Example: An <b>apple</b> is an edible fruit.
+
+        Args:
+            metadata_attrs: The metadata entry. The keys read are "char_start_idx", "char_end_idx", "key", "type",
+                "value" (the tag name) and "html_attrs", whose "attrs" and "values" entries are parallel
+                sequences of attribute names and attribute values.
+
+        Returns:
+            None when the tag filter drops the tag, otherwise the `(opening_tag, closing_tag)` strings. Only the
+            attributes listed in `attributes_to_keep` (all of them when it is None) appear in the opening tag.
+        """
+        tag_name = metadata_attrs["value"]
+        attr_pairs = list(zip(metadata_attrs["html_attrs"]["attrs"], metadata_attrs["html_attrs"]["values"]))
+
+        metadata_node = Metadata(
+            char_start_idx=metadata_attrs["char_start_idx"],
+            value=HtmlTag(tag=tag_name, attrs=dict(attr_pairs)),
+            char_end_idx=metadata_attrs["char_end_idx"],
+            key=metadata_attrs["key"],
+            type=metadata_attrs["type"],
+        )
+        if self._tag_filter.drop_tag(metadata_node):
+            return None
+
+        attributes = " ".join(
+            f"{attr}:{value}"
+            for attr, value in attr_pairs
+            if (self._attributes_to_keep is None or attr in self._attributes_to_keep)
+        )
+        if attributes:
+            attributes = " " + attributes
+        # The closing tag needs its leading "</", otherwise the end of the span is rendered as "T>".
+        return f"<{tag_name}{attributes}>", f"</{tag_name}>"
diff --git a/experiments/jz/html/exp_1/start_training.py b/experiments/jz/html/exp_1/start_training.py
new file mode 100644
index 00000000..8dba6362
--- /dev/null
+++ b/experiments/jz/html/exp_1/start_training.py
@@ -0,0 +1,62 @@
+import sys
+from dataclasses import dataclass, field
+
+from html_processor import AllTagsRules, HTMLParserConfig, HtmlProcessor, TagToRemove
+from hydra.core.config_store import ConfigStore
+
+from bsmetadata.input_pipeline import DataConfig, MetadataConfig
+from bsmetadata.metadata_processors import PROCESSORS
+from bsmetadata.train import CFG, main, show_help
+
+
+# Default per-tag removal rules: "body" keeps the default length range of `TagToRemove` (0 to infinity), while
+# "div" and "a" only match when their content length is at most 0, i.e. when they are empty.
+tags_to_remove_alone = [
+ TagToRemove("body"),
+ TagToRemove("div", txt_max_chr_len=0),
+ TagToRemove("a", txt_max_chr_len=0),
+]
+# Table-related tags. Every name needs its own comma: adjacent string literals are concatenated implicitly, so
+# `"table" "tr"` would silently become the single (meaningless) tag name "tabletr".
+tags_table = ["table", "tr", "th", "td", "caption", "colgroup", "thead", "tfoot", "tbody"]
+# List-related tags.
+tags_list = [
+    "li",
+    "ol",
+    "ul",
+]
+# Default values of the global tag rules (see `AllTagsRules`).
+attributes_to_keep = ["class", "id"]
+txt_max_chr_len = 128
+txt_min_chr_len = -float("inf")
+# Tags that are exempt from the global content-length rule.
+tags_exceptions = [
+    *tags_table,
+    *tags_list,
+    "span",
+]
+
+# Register `HtmlProcessor` under the "html" key of the imported PROCESSORS mapping.
+PROCESSORS["html"] = HtmlProcessor
+
+
+@dataclass
+class MetadataConfigWithHTML(MetadataConfig):
+    """`MetadataConfig` extended with the HTML parser configuration read by `HtmlProcessor`."""
+
+    # Built by a factory: a dataclass instance is unhashable by default, and Python >= 3.11 rejects unhashable
+    # field defaults ("ValueError: mutable default ... use default_factory").
+    html_parser_config: HTMLParserConfig = field(
+        default_factory=lambda: HTMLParserConfig(
+            AllTagsRules(
+                attributes_to_keep=attributes_to_keep,
+                txt_max_chr_len=txt_max_chr_len,
+                txt_min_chr_len=txt_min_chr_len,
+                tags_exceptions_to_txt_max_min_chr_len=tags_exceptions,
+            ),
+            tags_to_remove_alone_tag_name=[tag_to_remove.tag for tag_to_remove in tags_to_remove_alone],
+            tags_to_remove_alone_txt_max_chr_len=[
+                tag_to_remove.txt_max_chr_len for tag_to_remove in tags_to_remove_alone
+            ],
+            tags_to_remove_alone_txt_min_chr_len=[
+                tag_to_remove.txt_min_chr_len for tag_to_remove in tags_to_remove_alone
+            ],
+        )
+    )
+
+
+@dataclass
+class CFGAugmented(CFG):
+    """Training configuration whose data config carries the HTML-aware metadata configuration."""
+
+    # Built by a factory: a dataclass instance is unhashable by default, and Python >= 3.11 rejects unhashable
+    # field defaults ("ValueError: mutable default ... use default_factory").
+    data_config: DataConfig = field(default_factory=lambda: DataConfig(metadata_config=MetadataConfigWithHTML()))
+
+
+# Store the augmented configuration class in Hydra's config store under the name "config".
+cs = ConfigStore.instance()
+cs.store(name="config", node=CFGAugmented)
+
+if __name__ == "__main__":
+ # Help flags are handled here: show the help and exit without calling main().
+ if "--help" in sys.argv or "-h" in sys.argv:
+ show_help()
+ sys.exit()
+ main()
diff --git a/experiments/jz/html/exp_1/subexperiment_1/01_load_tokenizer_and_model.slurm b/experiments/jz/html/exp_1/subexperiment_1/01_load_tokenizer_and_model.slurm
new file mode 100644
index 00000000..055b0065
--- /dev/null
+++ b/experiments/jz/html/exp_1/subexperiment_1/01_load_tokenizer_and_model.slurm
@@ -0,0 +1,29 @@
+#!/bin/bash
+#SBATCH --job-name=modelling-metadata-exp1-subexp1-load-model-and-tokenizer # (change me!) job name
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node!
+#SBATCH --cpus-per-task=8 # (change me! between 0 and 48) number of cores per tasks
+#SBATCH --hint=nomultithread # we get physical cores not logical
+#SBATCH --gres=gpu:0 # (change me! between 0 and 1) number of gpus
+#SBATCH --time 00:10:00 # (change me! between 0 and 20h) maximum execution time (HH:MM:SS)
+#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name
+#SBATCH --error=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.err # error file name
+#SBATCH --account=six@gpu # account
+#SBATCH -p compil # partition with internet
+
+set -x -e
+
+# Next line will:
+# - load a conda environment with the dependencies on the master branch of github.com/bigscience-workshop/metadata/
+# - setup env vars ($HOME, $WORK, etc)
+# - load several modules (git)
+# Note: We can afford to have two conda environments: one stable for running experiments and one for development.
+# If there are new dependencies to install, you have to tell me about them and not do it in this script
+source $HOME/start-user
+
+# Folder for the clone of github.com/bigscience-workshop/metadata/
+cd $WORK/repos/sync/metadata/
+
+# Command to load the XXX model and tokenizer stored on https://huggingface.co/models
+python experiments/jz/utils/loading_script_utils/load_tokenizer_and_model.py \
+ model_name=gpt2 # (change me! e.g. gpt2)
diff --git a/experiments/jz/html/exp_1/subexperiment_1/02_load_dataset.slurm b/experiments/jz/html/exp_1/subexperiment_1/02_load_dataset.slurm
new file mode 100644
index 00000000..086ef44d
--- /dev/null
+++ b/experiments/jz/html/exp_1/subexperiment_1/02_load_dataset.slurm
@@ -0,0 +1,60 @@
+#!/bin/bash
+#SBATCH --job-name=modelling-metadata-exp1-subexp1-load-dataset # (change me!) job name
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node!
+#SBATCH --cpus-per-task=8 # (change me! between 0 and 48) number of cores per tasks
+#SBATCH --hint=nomultithread # we get physical cores not logical
+#SBATCH --gres=gpu:0 # (change me! between 0 and 1) number of gpus
+#SBATCH --time 00:10:00 # (change me! between 0 and 20h) maximum execution time (HH:MM:SS)
+#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name
+#SBATCH --error=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.err # error file name
+#SBATCH --account=six@gpu # account
+#SBATCH -p compil # partition with internet
+
+set -x -e
+
+# Next line will:
+# - load a conda environment with the dependencies on the master branch of github.com/bigscience-workshop/metadata/
+# - setup env vars ($HOME, $WORK, etc)
+# - load several modules (git)
+# Note: We can afford to have only two conda environments: one stable for running experiments and one for development.
+# If there are new dependencies to install, you have to tell me about them and not do it in this script
+source $HOME/start-user
+
+# For the moment we can't directly use the new dataset feature on JZ which would avoid having to clone the dataset
+# repo from the HUB. So the first thing to do is to clone the repo of the XXX dataset if it does not already exist.
+HUB_REPO_NAME='SaulLu/Natural_Questions_HTML_reduced_all' # (change me! e.g. SaulLu/Natural_Questions_HTML_Toy_v2)
+
+# We define the name of the folder in which the clone will be made
+#Define multi-character delimiter
+delimiter="/"
+#Concatenate the delimiter with the main string
+string=$HUB_REPO_NAME$delimiter
+
+#Split the text based on the delimiter
+myarray=()
+while [[ $string ]]; do
+ myarray+=("${string%%"$delimiter"*}")
+ string=${string#*"$delimiter"}
+done
+REPO_DIR="${DATASETS_CUSTOM}/${myarray[-1]}"
+
+# We clone the repo if it doesn't exist
+if [[ -d "${REPO_DIR}" ]]; then
+ echo "${REPO_DIR} already exists on your filesystem."
+else
+ echo "${REPO_DIR} doesn't exists on your filesystem."
+ cd $DATASETS_CUSTOM/
+ git clone "https://huggingface.co/datasets/${HUB_REPO_NAME}"
+ cd ${REPO_DIR}
+ git lfs install
+ git lfs pull origin master
+fi
+
+cd $WORK/repos/sync/metadata/
+
+# We check that the dataset can indeed be loaded
+python experiments/jz/utils/loading_script_utils/load_dataset.py \
+ dataset_name="${REPO_DIR}" \
+ train_file="nq-train-*.jsonl.gz" \
+ validation_file="nq-dev-*.jsonl.gz"
diff --git a/experiments/jz/html/exp_1/subexperiment_1/03_create_dataset.slurm b/experiments/jz/html/exp_1/subexperiment_1/03_create_dataset.slurm
new file mode 100644
index 00000000..55fd12c1
--- /dev/null
+++ b/experiments/jz/html/exp_1/subexperiment_1/03_create_dataset.slurm
@@ -0,0 +1,58 @@
+#!/bin/bash
+#SBATCH --job-name=modelling-metadata-html-metadata-exp1-subexp1-create-dataset # (change me!) job name
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node!
+#SBATCH --cpus-per-task=20 # (change me! between 0 and 40) number of cores per tasks
+#SBATCH --hint=nomultithread # we get physical cores not logical
+#SBATCH --time 02:00:00 # (change me! between 0 and 20h) maximum execution time (HH:MM:SS)
+#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name
+#SBATCH --error=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.err # error file name
+#SBATCH --account=six@cpu # account
+
+set -x -e
+
+# Next line will:
+# - load a conda environment with the dependencies on the master branch of github.com/bigscience-workshop/metadata/
+# - setup env vars ($HOME, $WORK, etc)
+# - load several modules (git)
+# Note: We can afford to have two conda environments: one stable for running experiments and one for development.
+# If there are new dependencies to install, you have to tell me about them and not do it in this script
+source $HOME/start-user
+
+# We are on an offline partition
+export HF_DATASETS_OFFLINE=1
+export TRANSFORMERS_OFFLINE=1
+
+# Folder for the clone of github.com/bigscience-workshop/metadata/
+cd $WORK/repos/sync/metadata/
+
+HUB_REPO_NAME='SaulLu/Natural_Questions_HTML_reduced_all' # (change me! e.g. SaulLu/Natural_Questions_HTML_Toy_v2)
+
+# We define the name of the folder in which the clone will be made
+#Define multi-character delimiter
+delimiter="/"
+#Concatenate the delimiter with the main string
+string=$HUB_REPO_NAME$delimiter
+
+#Split the text based on the delimiter
+myarray=()
+while [[ $string ]]; do
+ myarray+=("${string%%"$delimiter"*}")
+ string=${string#*"$delimiter"}
+done
+REPO_DIR="${DATASETS_CUSTOM}/${myarray[-1]}"
+
+THEPATH=$(scontrol show job $SLURM_JOBID | awk -F= '/Command=/{print $2}')
+THIS_SCRIPT_DIR=$( dirname $THEPATH)
+echo "$THIS_SCRIPT_DIR"
+
+# Now we launch the script that will perform the preprocessing of the dataset
+# Feel free to add any arguments you like (change me!)
+python experiments/jz/html/exp_1/start_training.py \
+ --config-dir=$THIS_SCRIPT_DIR \
+ data_config.dataset_name="${REPO_DIR}" \
+ data_config.overwrite_cache=true \
+ out_dir="${SCRATCH}/metadata_outputs" \
+ jobid="${SLURM_JOB_ID}" \
+ do_train=false \
+ do_eval=false
diff --git a/experiments/jz/html/exp_1/subexperiment_1/04_do_training.slurm b/experiments/jz/html/exp_1/subexperiment_1/04_do_training.slurm
new file mode 100644
index 00000000..28fb13d4
--- /dev/null
+++ b/experiments/jz/html/exp_1/subexperiment_1/04_do_training.slurm
@@ -0,0 +1,59 @@
+#!/bin/bash
+#SBATCH --job-name=modelling-metadata-html-exp1-subexp1-train # (change me!) job name
+#SBATCH --ntasks=1 # number of MP tasks
+#SBATCH --constraint=v100-16g
+#SBATCH --gres=gpu:1 # number of GPUs per node
+#SBATCH --cpus-per-task=8 # (change me! between 0 and 40) number of cores per tasks
+#SBATCH --hint=nomultithread # we get physical cores not logical
+#SBATCH --time 20:00:00 # (change me! between 0 and 20h) maximum execution time (HH:MM:SS)
+#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name
+#SBATCH --error=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.err # error file name
+#SBATCH --account=six@gpu # account
+
+set -x -e
+
+# Next line will:
+# - load a conda environment with the dependencies on the master branch of github.com/bigscience-workshop/metadata/
+# - setup env vars ($HOME, $WORK, etc)
+# - load several modules (git)
+# Note: We can afford to have two conda environments: one stable for running experiments and one for development.
+# If there are new dependencies to install, you have to tell me about them and not do it in this script
+source $HOME/start-user
+
+# We are on an offline partition
+export HF_DATASETS_OFFLINE=1
+export TRANSFORMERS_OFFLINE=1
+# be careful about the cache folder for Wandb
+export WANDB_MODE=offline
+export WANDB_DIR=$SCRATCH
+
+# Folder for the clone of github.com/bigscience-workshop/metadata/
+cd $WORK/repos/sync/metadata/
+
+HUB_REPO_NAME='SaulLu/Natural_Questions_HTML_reduced_all' # (change me! e.g. SaulLu/Natural_Questions_HTML_Toy_v2)
+
+# We define the name of the folder in which the clone will be made
+#Define multi-character delimiter
+delimiter="/"
+#Concatenate the delimiter with the main string
+string=$HUB_REPO_NAME$delimiter
+
+#Split the text based on the delimiter
+myarray=()
+while [[ $string ]]; do
+ myarray+=("${string%%"$delimiter"*}")
+ string=${string#*"$delimiter"}
+done
+REPO_DIR="${DATASETS_CUSTOM}/${myarray[-1]}"
+
+THEPATH=$(scontrol show job $SLURM_JOBID | awk -F= '/Command=/{print $2}')
+THIS_SCRIPT_DIR=$( dirname $THEPATH)
+echo "$THIS_SCRIPT_DIR"
+
+python experiments/jz/html/exp_1/start_training.py \
+ --config-dir=$THIS_SCRIPT_DIR \
+ data_config.dataset_name="${REPO_DIR}" \
+ out_dir="${SCRATCH}/metadata_outputs/${SLURM_JOB_ID}" \
+ jobid="${SLURM_JOB_ID}" \
+ do_train=true \
+ do_eval=true
diff --git a/experiments/jz/html/exp_1/subexperiment_1/config.yaml b/experiments/jz/html/exp_1/subexperiment_1/config.yaml
new file mode 100644
index 00000000..e24817e2
--- /dev/null
+++ b/experiments/jz/html/exp_1/subexperiment_1/config.yaml
@@ -0,0 +1,63 @@
+data_config:
+ metadata_config:
+ metadata_list:
+ - html
+ local_metadata_special_tokens:
+ html: htmlOn
+ metadata_sep: ' | '
+ metadata_key_value_sep: ': '
+ metadata_probability: 0.5
+ treat_local_metadata_as_regular_text: true
+ add_local_metadata_special_tokens_in_prefix: true
+ metadata_prefix_sep: ' |||'
+ metadata_prefix_start_seq: ' '
+ max_seq_len: 1024
+ html_parser_config:
+ all_tags_rules:
+ attributes_to_keep:
+ - class
+ - id
+ txt_max_chr_len: 0
+ txt_min_chr_len: 0
+ tags_exceptions_to_txt_max_min_chr_len: []
+ tags_to_remove_alone_tag_name:
+ - body
+ tags_to_remove_alone_txt_max_chr_len:
+ - .inf
+ tags_to_remove_alone_txt_min_chr_len:
+ - 0.0
+ experiment: with_metadata
+ per_device_eval_batch_size: 3
+ per_device_train_batch_size: 3
+ dataset_name: null
+ dataset_config_name: null
+ train_file: "nq-train-*.jsonl.gz"
+ validation_file: "nq-dev-*.jsonl.gz"
+ overwrite_cache: false
+ cache_dir: null
+ extension: null
+ preprocessing_num_workers: 80
+ validation_split_percentage: 5
+ block_size: null
+ map_batch_size: 1
+weight_decay: 0.0
+learning_rate: 5.0e-05
+gradient_accumulation_steps: 50
+num_train_epochs: 3
+max_train_steps: null
+lr_scheduler_type: linear
+num_warmup_steps: 1000
+seed: 42
+out_dir: metadata_outputs
+model_name: gpt2
+project_name: metadata_lm
+jobid: null
+start_with_eval: true
+evaluation_strategy: STEPS
+eval_num_per_epoch: 3
+eval_steps: 200
+save_strategy: STEPS
+save_num_per_epoch: 3
+save_steps: 200
+do_train: false
+do_eval: false
\ No newline at end of file
diff --git a/experiments/jz/html/exp_1/subexperiment_1/multi_steps.bash b/experiments/jz/html/exp_1/subexperiment_1/multi_steps.bash
new file mode 100644
index 00000000..ac06bb86
--- /dev/null
+++ b/experiments/jz/html/exp_1/subexperiment_1/multi_steps.bash
@@ -0,0 +1,4 @@
+JID_JOB1=$(sbatch 01_load_tokenizer_and_model.slurm | cut -d " " -f 4)
+JID_JOB2=$(sbatch --dependency=afterok:$JID_JOB1 02_load_dataset.slurm | cut -d " " -f 4)
+JID_JOB3=$(sbatch --dependency=afterok:$JID_JOB2 03_create_dataset.slurm | cut -d " " -f 4)
+sbatch --dependency=afterok:$JID_JOB3 04_do_training.slurm
diff --git a/experiments/jz/html/exp_1/subexperiment_1/multi_steps_03_and_04.bash b/experiments/jz/html/exp_1/subexperiment_1/multi_steps_03_and_04.bash
new file mode 100644
index 00000000..3605e25f
--- /dev/null
+++ b/experiments/jz/html/exp_1/subexperiment_1/multi_steps_03_and_04.bash
@@ -0,0 +1,2 @@
+JID_JOB3=$(sbatch 03_create_dataset.slurm | cut -d " " -f 4)
+sbatch --dependency=afterok:$JID_JOB3 04_do_training.slurm
diff --git a/experiments/jz/html/exp_1/subexperiment_2/01_load_tokenizer_and_model.slurm b/experiments/jz/html/exp_1/subexperiment_2/01_load_tokenizer_and_model.slurm
new file mode 100644
index 00000000..37839e89
--- /dev/null
+++ b/experiments/jz/html/exp_1/subexperiment_2/01_load_tokenizer_and_model.slurm
@@ -0,0 +1,29 @@
+#!/bin/bash
+#SBATCH --job-name=modelling-metadata-exp1-subexp2-load-model-and-tokenizer # (change me!) job name
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node!
+#SBATCH --cpus-per-task=8 # (change me! between 0 and 48) number of cores per tasks
+#SBATCH --hint=nomultithread # we get physical cores not logical
+#SBATCH --gres=gpu:0 # (change me! between 0 and 1) number of gpus
+#SBATCH --time 00:10:00 # (change me! between 0 and 20h) maximum execution time (HH:MM:SS)
+#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name
+#SBATCH --error=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.err # error file name
+#SBATCH --account=six@gpu # account
+#SBATCH -p compil # partition with internet
+
+set -x -e
+
+# Next line will:
+# - load a conda environment with the dependencies on the master branch of github.com/bigscience-workshop/metadata/
+# - setup env vars ($HOME, $WORK, etc)
+# - load several modules (git)
+# Note: We can afford to have two conda environments: one stable for running experiments and one for development.
+# If there are new dependencies to install, you have to tell me about them and not do it in this script
+source $HOME/start-user
+
+# Folder for the clone of github.com/bigscience-workshop/metadata/
+cd $WORK/repos/sync/metadata/
+
+# Command to load the XXX model and tokenizer stored on https://huggingface.co/models
+python experiments/jz/utils/loading_script_utils/load_tokenizer_and_model.py \
+ model_name=gpt2 # (change me! e.g. gpt2)
diff --git a/experiments/jz/html/exp_1/subexperiment_2/02_load_dataset.slurm b/experiments/jz/html/exp_1/subexperiment_2/02_load_dataset.slurm
new file mode 100644
index 00000000..3f4e1944
--- /dev/null
+++ b/experiments/jz/html/exp_1/subexperiment_2/02_load_dataset.slurm
@@ -0,0 +1,60 @@
+#!/bin/bash
+#SBATCH --job-name=modelling-metadata-exp1-subexp2-load-dataset # (change me!) job name
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node!
+#SBATCH --cpus-per-task=8 # (change me! between 0 and 48) number of cores per tasks
+#SBATCH --hint=nomultithread # we get physical cores not logical
+#SBATCH --gres=gpu:0 # (change me! between 0 and 1) number of gpus
+#SBATCH --time 00:10:00 # (change me! between 0 and 20h) maximum execution time (HH:MM:SS)
+#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name
+#SBATCH --error=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.err # error file name
+#SBATCH --account=six@gpu # account
+#SBATCH -p compil # partition with internet
+
+set -x -e
+
+# Next line will:
+# - load a conda environment with the dependencies on the master branch of github.com/bigscience-workshop/metadata/
+# - setup env vars ($HOME, $WORK, etc)
+# - load several modules (git)
+# Note: We can afford to have only two conda environments: one stable for running experiments and one for development.
+# If there are new dependencies to install, you have to tell me about them and not do it in this script
+source $HOME/start-user
+
+# For the moment we can't directly use the new dataset feature on JZ which would avoid having to clone the dataset
+# repo from the HUB. So the first thing to do is to clone the repo of the XXX dataset if it does not already exist.
+HUB_REPO_NAME='SaulLu/Natural_Questions_HTML_reduced_all' # (change me! e.g. SaulLu/Natural_Questions_HTML_Toy_v2)
+
+# We define the name of the folder in which the clone will be made
+# Define the delimiter on which the Hub repo name is split
+delimiter="/"
+#Concatenate the delimiter with the main string
+string=$HUB_REPO_NAME$delimiter
+
+#Split the text based on the delimiter
+myarray=()
+while [[ $string ]]; do
+ myarray+=("${string%%"$delimiter"*}")
+ string=${string#*"$delimiter"}
+done
+REPO_DIR="${DATASETS_CUSTOM}/${myarray[-1]}"
+
+# We clone the repo if it doesn't exist
+if [[ -d "${REPO_DIR}" ]]; then
+ echo "${REPO_DIR} already exists on your filesystem."
+else
+ echo "${REPO_DIR} doesn't exists on your filesystem."
+ cd $DATASETS_CUSTOM/
+ git clone "https://huggingface.co/datasets/${HUB_REPO_NAME}"
+ cd ${REPO_DIR}
+ git lfs install
+ git lfs pull origin master
+fi
+
+cd $WORK/repos/sync/metadata/
+
+# We check that the dataset can indeed be loaded
+python experiments/jz/utils/loading_script_utils/load_dataset.py \
+ dataset_name="${REPO_DIR}" \
+ train_file="nq-train-*.jsonl.gz" \
+ validation_file="nq-dev-*.jsonl.gz"
diff --git a/experiments/jz/html/exp_1/subexperiment_2/03_create_dataset.slurm b/experiments/jz/html/exp_1/subexperiment_2/03_create_dataset.slurm
new file mode 100644
index 00000000..b0365a55
--- /dev/null
+++ b/experiments/jz/html/exp_1/subexperiment_2/03_create_dataset.slurm
@@ -0,0 +1,58 @@
+#!/bin/bash
+#SBATCH --job-name=modelling-metadata-html-metadata-exp1-subexp2-create-dataset # (change me!) job name
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node!
+#SBATCH --cpus-per-task=20 # (change me! between 0 and 40) number of cores per tasks
+#SBATCH --hint=nomultithread # we get physical cores not logical
+#SBATCH --time 02:00:00 # (change me! between 0 and 20h) maximum execution time (HH:MM:SS)
+#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name
+#SBATCH --error=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.err # error file name
+#SBATCH --account=six@cpu # account
+
+set -x -e
+
+# Next line will:
+# - load a conda environment with the dependencies on the master branch of github.com/bigscience-workshop/metadata/
+# - setup env vars ($HOME, $WORK, etc)
+# - load several modules (git)
+# Note: We can afford to have two conda environments: one stable for running experiments and one for development.
+# If there are new dependencies to install, you have to tell me about them and not do it in this script
+source $HOME/start-user
+
+# We are on an offline partition
+export HF_DATASETS_OFFLINE=1
+export TRANSFORMERS_OFFLINE=1
+
+# Folder for the clone of github.com/bigscience-workshop/metadata/
+cd $WORK/repos/sync/metadata/
+
+HUB_REPO_NAME='SaulLu/Natural_Questions_HTML_reduced_all' # (change me! e.g. SaulLu/Natural_Questions_HTML_Toy_v2)
+
+# We define the name of the folder in which the clone will be made
+# Define the delimiter on which the Hub repo name is split
+delimiter="/"
+#Concatenate the delimiter with the main string
+string=$HUB_REPO_NAME$delimiter
+
+#Split the text based on the delimiter
+myarray=()
+while [[ $string ]]; do
+ myarray+=("${string%%"$delimiter"*}")
+ string=${string#*"$delimiter"}
+done
+REPO_DIR="${DATASETS_CUSTOM}/${myarray[-1]}"
+
+THEPATH=$(scontrol show job $SLURM_JOBID | awk -F= '/Command=/{print $2}')
+THIS_SCRIPT_DIR=$( dirname $THEPATH)
+echo "$THIS_SCRIPT_DIR"
+
+# Now we launch the script that will perform the preprocessing of the dataset
+# Feel free to add any arguments you like (change me!)
+python experiments/jz/html/exp_1/start_training.py \
+ --config-dir=$THIS_SCRIPT_DIR \
+ data_config.dataset_name="${REPO_DIR}" \
+ data_config.overwrite_cache=true \
+ out_dir="${SCRATCH}/metadata_outputs" \
+ jobid="${SLURM_JOB_ID}" \
+ do_train=false \
+ do_eval=false
diff --git a/experiments/jz/html/exp_1/subexperiment_2/04_do_training.slurm b/experiments/jz/html/exp_1/subexperiment_2/04_do_training.slurm
new file mode 100644
index 00000000..b258a8a1
--- /dev/null
+++ b/experiments/jz/html/exp_1/subexperiment_2/04_do_training.slurm
@@ -0,0 +1,59 @@
+#!/bin/bash
+#SBATCH --job-name=modelling-metadata-html-exp1-subexp2-train # (change me!) job name
+#SBATCH --ntasks=1 # number of MP tasks
+#SBATCH --constraint=v100-16g
+#SBATCH --gres=gpu:1 # number of GPUs per node
+#SBATCH --cpus-per-task=8 # (change me! between 0 and 40) number of cores per tasks
+#SBATCH --hint=nomultithread # we get physical cores not logical
+#SBATCH --time 20:00:00 # (change me! between 0 and 20h) maximum execution time (HH:MM:SS)
+#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name
+#SBATCH --error=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.err # error file name
+#SBATCH --account=six@gpu # account
+
+set -x -e
+
+# Next line will:
+# - load a conda environment with the dependencies on the master branch of github.com/bigscience-workshop/metadata/
+# - setup env vars ($HOME, $WORK, etc)
+# - load several modules (git)
+# Note: We can afford to have two conda environments: one stable for running experiments and one for development.
+# If there are new dependencies to install, you have to tell me about them and not do it in this script
+source $HOME/start-user
+
+# We are on an offline partition
+export HF_DATASETS_OFFLINE=1
+export TRANSFORMERS_OFFLINE=1
+# be careful about the cache folder for Wandb
+export WANDB_MODE=offline
+export WANDB_DIR=$SCRATCH
+
+# Folder for the clone of github.com/bigscience-workshop/metadata/
+cd $WORK/repos/sync/metadata/
+
+HUB_REPO_NAME='SaulLu/Natural_Questions_HTML_reduced_all' # (change me! e.g. SaulLu/Natural_Questions_HTML_Toy_v2)
+
+# We define the name of the folder in which the clone will be made
+# Define the delimiter on which the Hub repo name is split
+delimiter="/"
+#Concatenate the delimiter with the main string
+string=$HUB_REPO_NAME$delimiter
+
+#Split the text based on the delimiter
+myarray=()
+while [[ $string ]]; do
+ myarray+=("${string%%"$delimiter"*}")
+ string=${string#*"$delimiter"}
+done
+REPO_DIR="${DATASETS_CUSTOM}/${myarray[-1]}"
+
+THEPATH=$(scontrol show job $SLURM_JOBID | awk -F= '/Command=/{print $2}')
+THIS_SCRIPT_DIR=$( dirname $THEPATH)
+echo "$THIS_SCRIPT_DIR"
+
+python experiments/jz/html/exp_1/start_training.py \
+ --config-dir=$THIS_SCRIPT_DIR \
+ data_config.dataset_name="${REPO_DIR}" \
+ out_dir="${SCRATCH}/metadata_outputs/${SLURM_JOB_ID}" \
+ jobid="${SLURM_JOB_ID}" \
+ do_train=true \
+ do_eval=true
diff --git a/experiments/jz/html/exp_1/subexperiment_2/README.md b/experiments/jz/html/exp_1/subexperiment_2/README.md
new file mode 100644
index 00000000..921dda00
--- /dev/null
+++ b/experiments/jz/html/exp_1/subexperiment_2/README.md
@@ -0,0 +1 @@
+Baseline - experiment without metadata added
\ No newline at end of file
diff --git a/experiments/jz/html/exp_1/subexperiment_2/config.yaml b/experiments/jz/html/exp_1/subexperiment_2/config.yaml
new file mode 100644
index 00000000..0269bfc6
--- /dev/null
+++ b/experiments/jz/html/exp_1/subexperiment_2/config.yaml
@@ -0,0 +1,63 @@
+data_config:
+ metadata_config:
+ metadata_list:
+ - html
+ local_metadata_special_tokens:
+ html: htmlOn
+ metadata_sep: ' | '
+ metadata_key_value_sep: ': '
+ metadata_probability: 0.5
+ treat_local_metadata_as_regular_text: true
+ add_local_metadata_special_tokens_in_prefix: true
+ metadata_prefix_sep: ' |||'
+ metadata_prefix_start_seq: ' '
+ max_seq_len: 1024
+ html_parser_config:
+ all_tags_rules:
+ attributes_to_keep:
+ - class
+ - id
+ txt_max_chr_len: .inf
+ txt_min_chr_len: 0
+ tags_exceptions_to_txt_max_min_chr_len: []
+ tags_to_remove_alone_tag_name:
+ - body
+ tags_to_remove_alone_txt_max_chr_len:
+ - .inf
+ tags_to_remove_alone_txt_min_chr_len:
+ - 0.0
+ experiment: without_metadata
+ per_device_eval_batch_size: 3
+ per_device_train_batch_size: 3
+ dataset_name: null
+ dataset_config_name: null
+ train_file: "nq-train-*.jsonl.gz"
+ validation_file: "nq-dev-*.jsonl.gz"
+ overwrite_cache: false
+ cache_dir: null
+ extension: json
+ preprocessing_num_workers: 80
+ validation_split_percentage: 5
+ block_size: null
+ map_batch_size: 10
+weight_decay: 0.0
+learning_rate: 5.0e-05
+gradient_accumulation_steps: 50
+num_train_epochs: 3
+max_train_steps: null
+lr_scheduler_type: linear
+num_warmup_steps: 1000
+seed: 42
+out_dir: metadata_outputs
+model_name: gpt2
+project_name: metadata_lm
+jobid: null
+start_with_eval: true
+evaluation_strategy: STEPS
+eval_num_per_epoch: 3
+eval_steps: 200
+save_strategy: STEPS
+save_num_per_epoch: 3
+save_steps: 200
+do_train: false
+do_eval: false
\ No newline at end of file
diff --git a/experiments/jz/html/exp_1/subexperiment_2/multi_steps.bash b/experiments/jz/html/exp_1/subexperiment_2/multi_steps.bash
new file mode 100644
index 00000000..ac06bb86
--- /dev/null
+++ b/experiments/jz/html/exp_1/subexperiment_2/multi_steps.bash
@@ -0,0 +1,4 @@
+JID_JOB1=$(sbatch 01_load_tokenizer_and_model.slurm | cut -d " " -f 4)
+JID_JOB2=$(sbatch --dependency=afterok:$JID_JOB1 02_load_dataset.slurm | cut -d " " -f 4)
+JID_JOB3=$(sbatch --dependency=afterok:$JID_JOB2 03_create_dataset.slurm | cut -d " " -f 4)
+sbatch --dependency=afterok:$JID_JOB3 04_do_training.slurm
diff --git a/experiments/jz/html/exp_1/subexperiment_2/multi_steps_03_and_04.bash b/experiments/jz/html/exp_1/subexperiment_2/multi_steps_03_and_04.bash
new file mode 100644
index 00000000..3605e25f
--- /dev/null
+++ b/experiments/jz/html/exp_1/subexperiment_2/multi_steps_03_and_04.bash
@@ -0,0 +1,2 @@
+JID_JOB3=$(sbatch 03_create_dataset.slurm | cut -d " " -f 4)
+sbatch --dependency=afterok:$JID_JOB3 04_do_training.slurm
diff --git a/experiments/jz/html/exp_1/subexperiment_3/01_load_tokenizer_and_model.slurm b/experiments/jz/html/exp_1/subexperiment_3/01_load_tokenizer_and_model.slurm
new file mode 100644
index 00000000..f8620acd
--- /dev/null
+++ b/experiments/jz/html/exp_1/subexperiment_3/01_load_tokenizer_and_model.slurm
@@ -0,0 +1,29 @@
+#!/bin/bash
+#SBATCH --job-name=modelling-metadata-exp1-subexp3-load-model-and-tokenizer # (change me!) job name
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node!
+#SBATCH --cpus-per-task=8 # (change me! between 0 and 48) number of cores per tasks
+#SBATCH --hint=nomultithread # we get physical cores not logical
+#SBATCH --gres=gpu:0 # (change me! between 0 and 1) number of gpus
+#SBATCH --time 00:10:00 # (change me! between 0 and 20h) maximum execution time (HH:MM:SS)
+#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name
+#SBATCH --error=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.err # error file name
+#SBATCH --account=six@gpu # account
+#SBATCH -p compil # partition with internet
+
+set -x -e
+
+# Next line will:
+# - load a conda environment with the dependencies on the master branch of github.com/bigscience-workshop/metadata/
+# - setup env vars ($HOME, $WORK, etc)
+# - load several modules (git)
+# Note: We can afford to have two conda environments: one stable for running experiments and one for development.
+# If there are new dependencies to install, you have to tell me about them and not do it in this script
+source $HOME/start-user
+
+# Folder for the clone of github.com/bigscience-workshop/metadata/
+cd $WORK/repos/sync/metadata/
+
+# Command to load the XXX model and tokenizer stored on https://huggingface.co/models
+python experiments/jz/utils/loading_script_utils/load_tokenizer_and_model.py \
+ model_name=gpt2 # (change me! e.g. gpt2)
diff --git a/experiments/jz/html/exp_1/subexperiment_3/02_load_dataset.slurm b/experiments/jz/html/exp_1/subexperiment_3/02_load_dataset.slurm
new file mode 100644
index 00000000..083a458a
--- /dev/null
+++ b/experiments/jz/html/exp_1/subexperiment_3/02_load_dataset.slurm
@@ -0,0 +1,60 @@
+#!/bin/bash
+#SBATCH --job-name=modelling-metadata-exp1-subexp3-load-dataset # (change me!) job name
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node!
+#SBATCH --cpus-per-task=8 # (change me! between 0 and 48) number of cores per tasks
+#SBATCH --hint=nomultithread # we get physical cores not logical
+#SBATCH --gres=gpu:0 # (change me! between 0 and 1) number of gpus
+#SBATCH --time 00:10:00 # (change me! between 0 and 20h) maximum execution time (HH:MM:SS)
+#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name
+#SBATCH --error=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.err # error file name
+#SBATCH --account=six@gpu # account
+#SBATCH -p compil # partition with internet
+
+set -x -e
+
+# Next line will:
+# - load a conda environment with the dependencies on the master branch of github.com/bigscience-workshop/metadata/
+# - setup env vars ($HOME, $WORK, etc)
+# - load several modules (git)
+# Note: We can afford to have only two conda environments: one stable for running experiments and one for development.
+# If there are new dependencies to install, you have to tell me about them and not do it in this script
+source $HOME/start-user
+
+# For the moment we can't directly use the new dataset feature on JZ which would avoid having to clone the dataset
+# repo from the HUB. So the first thing to do is to clone the repo of the XXX dataset if it does not already exist.
+HUB_REPO_NAME='SaulLu/Natural_Questions_HTML_reduced_all' # (change me! e.g. SaulLu/Natural_Questions_HTML_Toy_v2)
+
+# We define the name of the folder in which the clone will be made
+# Define the delimiter on which the Hub repo name is split
+delimiter="/"
+#Concatenate the delimiter with the main string
+string=$HUB_REPO_NAME$delimiter
+
+#Split the text based on the delimiter
+myarray=()
+while [[ $string ]]; do
+ myarray+=("${string%%"$delimiter"*}")
+ string=${string#*"$delimiter"}
+done
+REPO_DIR="${DATASETS_CUSTOM}/${myarray[-1]}"
+
+# We clone the repo if it doesn't exist
+if [[ -d "${REPO_DIR}" ]]; then
+ echo "${REPO_DIR} already exists on your filesystem."
+else
+ echo "${REPO_DIR} doesn't exists on your filesystem."
+ cd $DATASETS_CUSTOM/
+ git clone "https://huggingface.co/datasets/${HUB_REPO_NAME}"
+ cd ${REPO_DIR}
+ git lfs install
+ git lfs pull origin master
+fi
+
+cd $WORK/repos/sync/metadata/
+
+# We check that the dataset can indeed be loaded
+python experiments/jz/utils/loading_script_utils/load_dataset.py \
+ dataset_name="${REPO_DIR}" \
+ train_file="nq-train-*.jsonl.gz" \
+ validation_file="nq-dev-*.jsonl.gz"
diff --git a/experiments/jz/html/exp_1/subexperiment_3/03_create_dataset.slurm b/experiments/jz/html/exp_1/subexperiment_3/03_create_dataset.slurm
new file mode 100644
index 00000000..890bdbec
--- /dev/null
+++ b/experiments/jz/html/exp_1/subexperiment_3/03_create_dataset.slurm
@@ -0,0 +1,58 @@
+#!/bin/bash
+#SBATCH --job-name=modelling-metadata-html-metadata-exp1-subexp3-create-dataset # (change me!) job name
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node!
+#SBATCH --cpus-per-task=20 # (change me! between 0 and 40) number of cores per tasks
+#SBATCH --hint=nomultithread # we get physical cores not logical
+#SBATCH --time 02:00:00 # (change me! between 0 and 20h) maximum execution time (HH:MM:SS)
+#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name
+#SBATCH --error=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.err # error file name
+#SBATCH --account=six@cpu # account
+
+set -x -e
+
+# Next line will:
+# - load a conda environment with the dependencies on the master branch of github.com/bigscience-workshop/metadata/
+# - setup env vars ($HOME, $WORK, etc)
+# - load several modules (git)
+# Note: We can afford to have two conda environments: one stable for running experiments and one for development.
+# If there are new dependencies to install, you have to tell me about them and not do it in this script
+source $HOME/start-user
+
+# We are on an offline partition
+export HF_DATASETS_OFFLINE=1
+export TRANSFORMERS_OFFLINE=1
+
+# Folder for the clone of github.com/bigscience-workshop/metadata/
+cd $WORK/repos/sync/metadata/
+
+HUB_REPO_NAME='SaulLu/Natural_Questions_HTML_reduced_all' # (change me! e.g. SaulLu/Natural_Questions_HTML_Toy_v2)
+
+# We define the name of the folder in which the clone will be made
+# Define the delimiter on which the Hub repo name is split
+delimiter="/"
+#Concatenate the delimiter with the main string
+string=$HUB_REPO_NAME$delimiter
+
+#Split the text based on the delimiter
+myarray=()
+while [[ $string ]]; do
+ myarray+=("${string%%"$delimiter"*}")
+ string=${string#*"$delimiter"}
+done
+REPO_DIR="${DATASETS_CUSTOM}/${myarray[-1]}"
+
+THEPATH=$(scontrol show job $SLURM_JOBID | awk -F= '/Command=/{print $2}')
+THIS_SCRIPT_DIR=$( dirname $THEPATH)
+echo "$THIS_SCRIPT_DIR"
+
+# Now we launch the script that will perform the preprocessing of the dataset
+# Feel free to add any arguments you like (change me!)
+python experiments/jz/html/exp_1/start_training.py \
+ --config-dir=$THIS_SCRIPT_DIR \
+ data_config.dataset_name="${REPO_DIR}" \
+ data_config.overwrite_cache=true \
+ out_dir="${SCRATCH}/metadata_outputs" \
+ jobid="${SLURM_JOB_ID}" \
+ do_train=false \
+ do_eval=false
diff --git a/experiments/jz/html/exp_1/subexperiment_3/04_do_training.slurm b/experiments/jz/html/exp_1/subexperiment_3/04_do_training.slurm
new file mode 100644
index 00000000..3c66eeb8
--- /dev/null
+++ b/experiments/jz/html/exp_1/subexperiment_3/04_do_training.slurm
@@ -0,0 +1,59 @@
+#!/bin/bash
+#SBATCH --job-name=modelling-metadata-html-exp1-subexp3-train # (change me!) job name
+#SBATCH --ntasks=1 # number of MP tasks
+#SBATCH --constraint=v100-16g
+#SBATCH --gres=gpu:1 # number of GPUs per node
+#SBATCH --cpus-per-task=8 # (change me! between 0 and 40) number of cores per tasks
+#SBATCH --hint=nomultithread # we get physical cores not logical
+#SBATCH --time 20:00:00 # (change me! between 0 and 20h) maximum execution time (HH:MM:SS)
+#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name
+#SBATCH --error=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.err # error file name
+#SBATCH --account=six@gpu # account
+
+set -x -e
+
+# Next line will:
+# - load a conda environment with the dependencies on the master branch of github.com/bigscience-workshop/metadata/
+# - setup env vars ($HOME, $WORK, etc)
+# - load several modules (git)
+# Note: We can afford to have two conda environments: one stable for running experiments and one for development.
+# If there are new dependencies to install, you have to tell me about them and not do it in this script
+source $HOME/start-user
+
+# We are on an offline partition
+export HF_DATASETS_OFFLINE=1
+export TRANSFORMERS_OFFLINE=1
+# be careful about the cache folder for Wandb
+export WANDB_MODE=offline
+export WANDB_DIR=$SCRATCH
+
+# Folder for the clone of github.com/bigscience-workshop/metadata/
+cd $WORK/repos/sync/metadata/
+
+HUB_REPO_NAME='SaulLu/Natural_Questions_HTML_reduced_all' # (change me! e.g. SaulLu/Natural_Questions_HTML_Toy_v2)
+
+# We define the name of the folder in which the clone will be made
+# Define the delimiter on which the Hub repo name is split
+delimiter="/"
+#Concatenate the delimiter with the main string
+string=$HUB_REPO_NAME$delimiter
+
+#Split the text based on the delimiter
+myarray=()
+while [[ $string ]]; do
+ myarray+=("${string%%"$delimiter"*}")
+ string=${string#*"$delimiter"}
+done
+REPO_DIR="${DATASETS_CUSTOM}/${myarray[-1]}"
+
+THEPATH=$(scontrol show job $SLURM_JOBID | awk -F= '/Command=/{print $2}')
+THIS_SCRIPT_DIR=$( dirname $THEPATH)
+echo "$THIS_SCRIPT_DIR"
+
+python experiments/jz/html/exp_1/start_training.py \
+ --config-dir=$THIS_SCRIPT_DIR \
+ data_config.dataset_name="${REPO_DIR}" \
+ out_dir="${SCRATCH}/metadata_outputs/${SLURM_JOB_ID}" \
+ jobid="${SLURM_JOB_ID}" \
+ do_train=true \
+ do_eval=true
diff --git a/experiments/jz/html/exp_1/subexperiment_3/README.md b/experiments/jz/html/exp_1/subexperiment_3/README.md
new file mode 100644
index 00000000..921dda00
--- /dev/null
+++ b/experiments/jz/html/exp_1/subexperiment_3/README.md
@@ -0,0 +1 @@
+Experiment with HTML metadata added (`experiment: with_metadata` in config.yaml)
\ No newline at end of file
diff --git a/experiments/jz/html/exp_1/subexperiment_3/config.yaml b/experiments/jz/html/exp_1/subexperiment_3/config.yaml
new file mode 100644
index 00000000..d1069be9
--- /dev/null
+++ b/experiments/jz/html/exp_1/subexperiment_3/config.yaml
@@ -0,0 +1,70 @@
+data_config:
+ metadata_config:
+ metadata_list:
+ - html
+ local_metadata_special_tokens:
+ html: htmlOn
+ metadata_sep: ' | '
+ metadata_key_value_sep: ': '
+ metadata_probability: 0.5
+ treat_local_metadata_as_regular_text: true
+ add_local_metadata_special_tokens_in_prefix: true
+ metadata_prefix_sep: ' |||'
+ metadata_prefix_start_seq: ' '
+ max_seq_len: 1024
+ html_parser_config:
+ all_tags_rules:
+ attributes_to_keep:
+ - class
+ - id
+ txt_max_chr_len: .inf
+ txt_min_chr_len: 0
+ tags_exceptions_to_txt_max_min_chr_len:
+ - h1
+ - h2
+ - h3
+ - h4
+ - h5
+ - h6
+ - p
+ tags_to_remove_alone_tag_name:
+ - body
+ tags_to_remove_alone_txt_max_chr_len:
+ - .inf
+ tags_to_remove_alone_txt_min_chr_len:
+ - 0.0
+ experiment: with_metadata
+ per_device_eval_batch_size: 3
+ per_device_train_batch_size: 3
+ dataset_name: null
+ dataset_config_name: null
+ train_file: "nq-train-*.jsonl.gz"
+ validation_file: "nq-dev-*.jsonl.gz"
+ overwrite_cache: false
+ cache_dir: null
+ extension: json
+ preprocessing_num_workers: 80
+ validation_split_percentage: 5
+ block_size: null
+ map_batch_size: 10
+weight_decay: 0.0
+learning_rate: 5.0e-05
+gradient_accumulation_steps: 50
+num_train_epochs: 3
+max_train_steps: null
+lr_scheduler_type: linear
+num_warmup_steps: 1000
+seed: 42
+out_dir: metadata_outputs
+model_name: gpt2
+project_name: metadata_lm
+jobid: null
+start_with_eval: true
+evaluation_strategy: STEPS
+eval_num_per_epoch: 3
+eval_steps: 200
+save_strategy: STEPS
+save_num_per_epoch: 3
+save_steps: 200
+do_train: false
+do_eval: false
\ No newline at end of file
diff --git a/experiments/jz/html/exp_1/subexperiment_3/multi_steps.bash b/experiments/jz/html/exp_1/subexperiment_3/multi_steps.bash
new file mode 100644
index 00000000..ac06bb86
--- /dev/null
+++ b/experiments/jz/html/exp_1/subexperiment_3/multi_steps.bash
@@ -0,0 +1,4 @@
+JID_JOB1=$(sbatch 01_load_tokenizer_and_model.slurm | cut -d " " -f 4)
+JID_JOB2=$(sbatch --dependency=afterok:$JID_JOB1 02_load_dataset.slurm | cut -d " " -f 4)
+JID_JOB3=$(sbatch --dependency=afterok:$JID_JOB2 03_create_dataset.slurm | cut -d " " -f 4)
+sbatch --dependency=afterok:$JID_JOB3 04_do_training.slurm
diff --git a/experiments/jz/html/exp_1/subexperiment_3/multi_steps_03_and_04.bash b/experiments/jz/html/exp_1/subexperiment_3/multi_steps_03_and_04.bash
new file mode 100644
index 00000000..3605e25f
--- /dev/null
+++ b/experiments/jz/html/exp_1/subexperiment_3/multi_steps_03_and_04.bash
@@ -0,0 +1,2 @@
+JID_JOB3=$(sbatch 03_create_dataset.slurm | cut -d " " -f 4)
+sbatch --dependency=afterok:$JID_JOB3 04_do_training.slurm
diff --git a/experiments/jz/html/exp_1/test_html_processor.py b/experiments/jz/html/exp_1/test_html_processor.py
new file mode 100644
index 00000000..535914e4
--- /dev/null
+++ b/experiments/jz/html/exp_1/test_html_processor.py
@@ -0,0 +1,224 @@
+import unittest
+
+from html_processor import AllTagsRules, HTMLParserConfig, HtmlProcessor, TagToRemove
+from start_training import MetadataConfigWithHTML
+from transformers import GPT2TokenizerFast
+
+from bsmetadata.metadata_processors import PROCESSORS, MetadataConfig
+from bsmetadata.metadata_utils import add_local_metadata_to_text, add_metadata_and_chunk_examples
+
+
+class MetadataUtilsTester(unittest.TestCase):
+ def setUp(self) -> None:
+ self.tokenizer = GPT2TokenizerFast.from_pretrained("gpt2-xl")
+ self.examples = [
+ {
+ "id": "0004",
+ "text": "useless text The Walking Dead (season 8)\n",
+ "metadata": [
+ {
+ "char_start_idx": 0,
+ "value": "a",
+ "html_attrs": {"attrs": [], "values": []},
+ "char_end_idx": 12,
+ "key": "html",
+ "type": "local",
+ },
+ {
+ "char_start_idx": 13,
+ "value": "div",
+ "html_attrs": {"attrs": ["id", "class"], "values": ["mw-page-base", "noprint"]},
+ "char_end_idx": 13,
+ "key": "html",
+ "type": "local",
+ },
+ {
+ "char_start_idx": 13,
+ "value": "div",
+ "html_attrs": {"attrs": ["id", "class"], "values": ["mw-head-base", "noprint"]},
+ "char_end_idx": 13,
+ "key": "html",
+ "type": "local",
+ },
+ {
+ "char_start_idx": 13,
+ "value": "a",
+ "html_attrs": {"attrs": ["id"], "values": ["top"]},
+ "char_end_idx": 13,
+ "key": "html",
+ "type": "local",
+ },
+ {
+ "char_start_idx": 13,
+ "value": "div",
+ "html_attrs": {
+ "attrs": ["id", "class"],
+ "values": ["siteNotice centralNotice", "mw-body-content"],
+ },
+ "char_end_idx": 13,
+ "key": "html",
+ "type": "local",
+ },
+ {
+ "char_start_idx": 13,
+ "value": "i",
+ "html_attrs": {"attrs": [], "values": []},
+ "char_end_idx": 29,
+ "key": "html",
+ "type": "local",
+ },
+ {
+ "char_start_idx": 13,
+ "value": "h1",
+ "html_attrs": {
+ "attrs": ["id", "class", "lang"],
+ "values": ["firstHeading", "firstHeading", "en"],
+ },
+ "char_end_idx": 40,
+ "key": "html",
+ "type": "local",
+ },
+ ],
+ },
+ {
+ "id": "0004",
+ "text": ("this is a title that we keep\n" "blablabla\n" "tidi tidi2 this one keep his tag\n"),
+ "metadata": [
+ {
+ "char_start_idx": 0,
+ "value": "h1",
+ "html_attrs": {"attrs": ["id"], "values": ["title"]},
+ "char_end_idx": 28,
+ "key": "html",
+ "type": "local",
+ },
+ {
+ "char_start_idx": 50,
+ "value": "span",
+ "html_attrs": {"attrs": ["id"], "values": ["3"]},
+ "char_end_idx": 71,
+ "key": "html",
+ "type": "local",
+ },
+ {
+ "char_start_idx": 29,
+ "value": "div",
+ "html_attrs": {
+ "attrs": ["class", "id", "href"],
+ "values": ["div-level-1 div-level-2", "1", "http"],
+ },
+ "char_end_idx": 72,
+ "key": "html",
+ "type": "local",
+ },
+ {
+ "char_start_idx": 0,
+ "value": "body",
+ "html_attrs": {"attrs": [], "values": []},
+ "char_end_idx": 72,
+ "key": "html",
+ "type": "local",
+ },
+ ],
+ },
+ ]
+
+ def test_add_html_tags(self):  # add_local_metadata_to_text on self.examples[0] must return exactly target_text
+ cfg = MetadataConfigWithHTML(
+ html_parser_config=HTMLParserConfig(
+ all_tags_rules=AllTagsRules(attributes_to_keep=["class", "id", "href"])  # presumably the only HTML attributes kept in the output — TODO confirm
+ )
+ )
+ cfg.metadata_list = ["html"]
+ PROCESSORS["html"] = HtmlProcessor  # mutates the shared PROCESSORS mapping; NOTE(review): not restored after the test
+
+ text1, mask1 = add_local_metadata_to_text(self.examples[0], cfg)  # mask1 is not checked by this test
+ self.maxDiff = None  # unittest: show the full diff when the assertion fails
+ target_text = "useless text The Walking Dead (season 8)
\n"  # NOTE(review): this literal looks damaged here (markup missing, split over two lines) — confirm against the repository
+
+ self.assertEqual(text1, target_text)
+
+ def test_add_html_tags_remove_tag(self):  # same check as test_add_html_tags, on self.examples[1] with tag-removal rules configured
+ tags_to_remove_alone = [TagToRemove("span", txt_max_chr_len=5), TagToRemove("body")]  # unpacked below into three parallel lists
+
+ cfg = MetadataConfigWithHTML(
+ html_parser_config=HTMLParserConfig(
+ all_tags_rules=AllTagsRules(attributes_to_keep=["class", "id", "href"]),
+ tags_to_remove_alone_tag_name=[tag_to_remove.tag for tag_to_remove in tags_to_remove_alone],
+ tags_to_remove_alone_txt_max_chr_len=[
+ tag_to_remove.txt_max_chr_len for tag_to_remove in tags_to_remove_alone
+ ],
+ tags_to_remove_alone_txt_min_chr_len=[
+ tag_to_remove.txt_min_chr_len for tag_to_remove in tags_to_remove_alone
+ ],
+ )
+ )
+ cfg.metadata_list = ["html"]
+ PROCESSORS["html"] = HtmlProcessor  # mutates the shared PROCESSORS mapping; NOTE(review): not restored after the test
+
+ text1, mask1 = add_local_metadata_to_text(self.examples[1], cfg)  # mask1 is not checked by this test
+ target_text = (
+ "this is a title that we keep
\n"
+ "blablabla\ntidi tidi2 this one keep his tag\n
"  # NOTE(review): this literal looks damaged here (markup missing, split across lines) — confirm against the repository
+ )
+
+ print(repr(text1))  # debug output only; NOTE(review): consider removing
+
+ self.assertEqual(text1, target_text)
+
+ def test_tmp(self):
+ tags_to_remove_alone = [
+ TagToRemove("body"),
+ # TagToRemove("div", txt_max_chr_len=0),
+ # TagToRemove("a", txt_max_chr_len=0),
+ ]
+
+ cfg = MetadataConfigWithHTML(
+ html_parser_config=HTMLParserConfig(
+ all_tags_rules=AllTagsRules(
+ attributes_to_keep=["class", "id"],
+ txt_max_chr_len=float("inf"),
+ txt_min_chr_len=float("inf"),
+ tags_exceptions_to_txt_max_min_chr_len=None,
+ ),
+ tags_to_remove_alone_tag_name=[tag_to_remove.tag for tag_to_remove in tags_to_remove_alone],
+ tags_to_remove_alone_txt_max_chr_len=[
+ tag_to_remove.txt_max_chr_len for tag_to_remove in tags_to_remove_alone
+ ],
+ tags_to_remove_alone_txt_min_chr_len=[
+ tag_to_remove.txt_min_chr_len for tag_to_remove in tags_to_remove_alone
+ ],
+ ),
+ add_local_metadata_special_tokens_in_prefix=True,
+ local_metadata_special_tokens={"html": "HtmlOn"},
+ treat_local_metadata_as_regular_text=True,
+ )
+ cfg.metadata_list = ["html"]
+ PROCESSORS["html"] = HtmlProcessor
+
+ import json
+ import os
+
+ with open(os.path.join(os.path.dirname(os.path.abspath(__file__)), "sample.json")) as json_file:
+ data = json.load(json_file)
+
+ text1, mask1 = add_local_metadata_to_text(data, cfg)
+
+ with open(os.path.join(os.path.dirname(os.path.abspath(__file__)), "text_with_local.txt"), "w") as f:
+ f.write(text1)
+
+ with open(os.path.join(os.path.dirname(os.path.abspath(__file__)), "mask_text_with_local.json"), "w") as f:
+ f.write(json.dumps(mask1))
+
+ print(text1)
+
+ output = add_metadata_and_chunk_examples(
+ {key: [value] for key, value in data.items()}, tokenizer=self.tokenizer, cfg=cfg
+ )
+ with open(os.path.join(os.path.dirname(os.path.abspath(__file__)), "ext_with_local_output.json"), "w") as f:
+ output["tokens"] = [self.tokenizer.convert_ids_to_tokens(input_ids) for input_ids in output["input_ids"]]
+ f.write(json.dumps(output))
+
+ # import pprint
+
+ # pprint.pprint(mask1)
diff --git a/experiments/jz/html/exp_1/test_local/config.yaml b/experiments/jz/html/exp_1/test_local/config.yaml
new file mode 100644
index 00000000..b6b7ccde
--- /dev/null
+++ b/experiments/jz/html/exp_1/test_local/config.yaml
@@ -0,0 +1,70 @@
+data_config:  # settings for the exp_1 "test_local" run; create_dataset.sh in this directory loads this file via --config-dir
+ metadata_config:
+ metadata_list:
+ - html
+ local_metadata_special_tokens:
+ html: htmlOn
+ metadata_sep: ' | '
+ metadata_key_value_sep: ': '
+ metadata_probability: 1
+ treat_local_metadata_as_regular_text: true
+ add_local_metadata_special_tokens_in_prefix: true
+ metadata_prefix_sep: ' |||'
+ metadata_prefix_start_seq: ' '
+ max_seq_len: 1024
+ html_parser_config:
+ all_tags_rules:
+ attributes_to_keep:
+ - class
+ - id
+ txt_max_chr_len: .inf  # YAML float infinity
+ txt_min_chr_len: 0
+ tags_exceptions_to_txt_max_min_chr_len:
+ - h1
+ - h2
+ - h3
+ - h4
+ - h5
+ - h6
+ - p
+ tags_to_remove_alone_tag_name:  # the three tags_to_remove_alone_* lists each hold one entry, for "body"
+ - body
+ tags_to_remove_alone_txt_max_chr_len:
+ - .inf
+ tags_to_remove_alone_txt_min_chr_len:
+ - 0.0
+ experiment: with_metadata
+ per_device_eval_batch_size: 3
+ per_device_train_batch_size: 3
+ dataset_name: null
+ dataset_config_name: null
+ train_file: "nq-train-*.jsonl.gz"  # overridden on the command line by create_dataset.sh
+ validation_file: "nq-dev-*.jsonl.gz"  # overridden to null by create_dataset.sh
+ overwrite_cache: false  # overridden to true by create_dataset.sh
+ cache_dir: null
+ extension: json
+ preprocessing_num_workers: 80  # NOTE(review): high for a single-sample local run — confirm
+ validation_split_percentage: 5
+ block_size: null
+ map_batch_size: 2
+weight_decay: 0.0
+learning_rate: 5.0e-05
+gradient_accumulation_steps: 50
+num_train_epochs: 3
+max_train_steps: null
+lr_scheduler_type: linear
+num_warmup_steps: 1000
+seed: 42
+out_dir: metadata_outputs  # overridden on the command line by create_dataset.sh
+model_name: gpt2
+project_name: metadata_lm
+jobid: null
+start_with_eval: true
+evaluation_strategy: STEPS
+eval_num_per_epoch: 3
+eval_steps: 200
+save_strategy: STEPS
+save_num_per_epoch: 3
+save_steps: 200
+do_train: false  # also passed explicitly by create_dataset.sh
+do_eval: false  # also passed explicitly by create_dataset.sh
\ No newline at end of file
diff --git a/experiments/jz/html/exp_1/test_local/create_dataset.sh b/experiments/jz/html/exp_1/test_local/create_dataset.sh
new file mode 100644
index 00000000..736390d5
--- /dev/null
+++ b/experiments/jz/html/exp_1/test_local/create_dataset.sh
@@ -0,0 +1,13 @@
+
+BASEDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+echo "$BASEDIR"
+
+# change proba
+python experiments/jz/html/exp_1/start_training.py \
+ --config-dir=$BASEDIR \
+ data_config.train_file="/mnt/storage/Documents/hugging_face/bigscience/jz/jz-code/sync/metadata/experiments/jz/html/exp_1/test_local/inputs/sample.json" \
+ data_config.validation_file=null \
+ out_dir="/mnt/storage/Documents/hugging_face/bigscience/jz/jz-code/sync/metadata/experiments/jz/html/exp_1/test_local/outputs" \
+ data_config.overwrite_cache=true \
+ do_train=false \
+ do_eval=false
\ No newline at end of file
diff --git a/experiments/jz/html/exp_1/test_local/inputs/sample.json b/experiments/jz/html/exp_1/test_local/inputs/sample.json
new file mode 100644
index 00000000..44079c16
--- /dev/null
+++ b/experiments/jz/html/exp_1/test_local/inputs/sample.json
@@ -0,0 +1 @@
+{"text": "Trade winds\nFrom Wikipedia, the free encyclopedia\nJump to: navigation, search\nThis article is about the weather phenomenon. For other uses, see Tradewind.\nThe westerlies (blue arrows) and trade winds (yellow and brown arrows)\nThe trade winds are the prevailing pattern of easterly surface winds found in the tropics, within the lower portion of the Earth's atmosphere, in the lower section of the troposphere near the Earth's equator. The trade winds blow predominantly from the northeast in the Northern Hemisphere and from the southeast in the Southern Hemisphere, strengthening during the winter and when the Arctic oscillation is in its warm phase. Trade winds have been used by captains of sailing ships to cross the world's oceans for centuries, and enabled European empire expansion into the Americas and trade routes to become established across the Atlantic and Pacific oceans.\nIn meteorology, the trade winds act as the steering flow for tropical storms that form over the Atlantic, Pacific, and southern Indian Oceans and make landfall in North America, Southeast Asia, and Madagascar and eastern Africa, respectively. Trade winds also transport African dust westward across the Atlantic Ocean into the Caribbean Sea, as well as portions of southeastern North America. Shallow cumulus clouds are seen within trade wind regimes, and are capped from becoming taller by a trade wind inversion, which is caused by descending air aloft from within the subtropical ridge. 
The weaker the trade winds become, the more rainfall can be expected in the neighboring landmasses.\nContents\n[hide]\n1 History\n2 Cause\n3 Weather effects\n4 See also\n5 References\nHistory[edit]\nA Spanish galleon\nSee also: Age of Discovery, Volta do Mar, and Age of sail\nThe term trade winds originally derives from the early fourteenth century late Middle English word 'trade,' meaning \"path\" or \"track.\"[1] The Portuguese recognized the importance of the trade winds (then the Volta do mar, meaning in Portuguese \"turn of the sea\" but also \"return from the sea\") in navigation in both the north and south Atlantic ocean as early as the 15th century.[2] From West Africa, the Portuguese had to sail away from continental Portugal, that is, to west and northwest. They could then turn northeast, to the area around the Azores islands, and finally east to mainland Europe. They also learned that to reach South Africa, they needed to go far out in the ocean, head for Brazil, and around 30\u00b0S go east again. Following the African coast southbound means upwind in the Southern hemisphere. In the Pacific ocean, the full wind circulation, which included both the trade wind easterlies and higher-latitude Westerlies, was unknown to Europeans until Andres de Urdaneta's voyage in 1565.[3]\nThe captain of a sailing ship seeks a course along which the winds can be expected to blow in the direction of travel.[4] During the Age of Sail, the pattern of prevailing winds made various points of the globe easy or difficult to access, and therefore had a direct effect on European empire-building and thus on modern political geography. 
For example, Manila galleons could not sail into the wind at all.[3]\nBy the 18th century the importance of the trade winds to England's merchant fleet for crossing the Atlantic Ocean had led both the general public and etymologists to identify the name with a later meaning of 'trade', \"(foreign) commerce\".[5] Between 1847 and 1849, Matthew Fontaine Maury collected enough information to create wind and current charts for the world's oceans.[6]\nCause[edit]\n3D map showing Hadley cells in relationship to trade winds on the surface.\nSee also: Air mass, Hadley cell, Humidity, Intertropical Convergence Zone, Monsoon, Monsoon trough, Near-equatorial trough, and Prevailing winds\nAs part of the Hadley cell circulation, surface air flows toward the equator while the flow aloft is towards the poles. A low-pressure area of calm, light variable winds near the equator is known as the doldrums,[7] near-equatorial trough,[8] intertropical front, or the Intertropical Convergence Zone.[9] When located within a monsoon region, this zone of low pressure and wind convergence is also known as the monsoon trough.[10] Around 30\u00b0 in both hemispheres, air begins to descend toward the surface in subtropical high-pressure belts known as subtropical ridges. The subsident (sinking) air is relatively dry because as it descends, the temperature increases, but the absolute humidity remains constant, which lowers the relative humidity of the air mass. This warm, dry air is known as a superior air mass and normally resides above a maritime tropical (warm and moist) air mass. An increase of temperature with height is known as a temperature inversion. 
When it occurs within a trade wind regime, it is known as a trade wind inversion.[11]\nThe surface air that flows from these subtropical high-pressure belts toward the Equator is deflected toward the west in both hemispheres by the Coriolis effect.[12] These winds blow predominantly from the northeast in the Northern Hemisphere and from the southeast in the Southern Hemisphere.[13] Because winds are named for the direction from which the wind is blowing,[14] these winds are called the northeasterly trade winds in the Northern Hemisphere and the southeasterly trade winds in the Southern Hemisphere. The trade winds of both hemispheres meet at the doldrums.[7]\nAs they blow across tropical regions, air masses heat up over lower latitudes due to more direct sunlight. Those that develop over land (continental) are drier and hotter than those that develop over oceans (maritime), and travel northward on the western periphery of the subtropical ridge.[15] Maritime tropical air masses are sometimes referred to as trade air masses.[16] The one region of the Earth which has an absence of trade winds is the north Indian ocean.[17]\nWeather effects[edit]\nClouds which form above regions within trade wind regimes are typically composed of cumulus which extend no more than 4 kilometres (13,000 ft) in height, and are capped from being taller by the trade wind inversion.[18] Trade winds originate more from the direction of the poles (northeast in the Northern Hemisphere, southeast in the Southern Hemisphere) during the cold season, and are stronger in the winter than the summer.[19] As an example, the windy season in the Guianas, which lie at low latitudes in South America, occurs between January and April.[20] When the phase of the Arctic oscillation (AO) is warm, trade winds are stronger within the tropics. 
The cold phase of the AO leads to weaker trade winds.[21] When the trade winds are weaker, more extensive areas of rain fall upon landmasses within the tropics, such as Central America.[22]\nDuring mid-summer in the Northern Hemisphere (July), the westward-moving trade winds south of the northward-moving subtropical ridge expand northwestward from the Caribbean sea into southeastern North America (Florida and Gulf Coast). When dust from the Sahara moving around the southern periphery of the ridge travels over land, rainfall is suppressed and the sky changes from a blue to a white appearance which leads to an increase in red sunsets. Its presence negatively impacts air quality by adding to the count of airborne particulates.[23] Although the Southeast USA has some of the cleanest air in North America, much of the African dust that reaches the United States affects Florida.[24] Since 1970, dust outbreaks have worsened due to periods of drought in Africa. There is a large variability in the dust transport to the Caribbean and Florida from year to year.[25] Dust events have been linked to a decline in the health of coral reefs across the Caribbean and Florida, primarily since the 1970s.[26]\nSee also[edit]\nIntertropical Convergence Zone\nWinds in the Age of Sail\nWesterly wind burst\nVolta do mar\nReferences[edit]\nJump up ^ Carol G. Braham; Enid Pearsons; Deborah M. Posner; Georgia S. Maas & Richard Goodman (2001). Random House Webster's College Dictionary (second ed.). Random House. p. 1385. ISBN 0-375-42560-8.\nJump up ^ Hermann R. Muelder (2007). Years of This Land - A Geographical History of the United States. Read Books. p. 38. ISBN 978-1-4067-7740-6. Retrieved 2009-11-09.\n^ Jump up to: a b Derek Hayes (2001). Historical atlas of the North Pacific Ocean: maps of discovery and scientific exploration, 1500-2000. Douglas & McIntyre. p. 18. ISBN 978-1-55054-865-5. Retrieved 2009-11-08.\nJump up ^ Cyrus Cornelius Adams (1904). A text-book of commercial geography. D. 
Appleton and company. p. 19. Retrieved 2009-11-07.\nJump up ^ Oxford English Dictionary (2 ed.). p. 225.\nJump up ^ Derek Hayes (2001). Historical atlas of the North Pacific Ocean: maps of discovery and scientific exploration, 1500-2000. Douglas & McIntyre. p. 152. ISBN 978-1-55054-865-5. Retrieved 2009-11-08.\n^ Jump up to: a b Sverre Petterssen (1941). Introduction to Meteorology. Mcgraw-Hill Book Company, Inc. p. 110. ISBN 978-1-4437-2300-8. Retrieved 2009-11-09.\nJump up ^ Glossary of Meteorology (June 2000). \"Doldrums\". American Meteorological Society. Retrieved 2009-11-09.\nJump up ^ Glossary of Meteorology (June 2000). \"Intertropical Convergence Zone\". American Meteorological Society. Retrieved 2009-11-09.\nJump up ^ Glossary of Meteorology (June 2000). \"Monsoon Trough\". American Meteorological Society. Archived from the original on 2009-06-17. Retrieved 2009-11-09.\nJump up ^ Glossary of Meteorology (June 2000). \"Superior air\". American Meteorological Society. Retrieved 2009-10-28.\nJump up ^ Glossary of Meteorology (2009). \"trade winds\". Glossary of Meteorology. American Meteorological Society. Archived from the original on 2008-12-11. Retrieved 2008-09-08.\nJump up ^ Ralph Stockman Tarr and Frank Morton McMurry (1909).Advanced geography. W.W. Shannon, State Printing, pp. 246. Retrieved on 2009-04-15.\nJump up ^ JetStream (2008). \"How to read weather maps\". National Weather Service. Retrieved 2007-05-16.\nJump up ^ Glossary of Meteorology (June 2000). \"Tropical air\". American Meteorological Society. Retrieved 2009-10-28.\nJump up ^ Glossary of Meteorology (June 2000). \"Trade air\". American Meteorological Society. Retrieved 2009-10-28.\nJump up ^ John E. Oliver (2005). Encyclopedia of world climatology. Springer. p. 128. ISBN 978-1-4020-3264-6. Retrieved 2009-11-09.\nJump up ^ Bob Rauber (2009-05-22). \"Research-The Rain in Cumulus over the Ocean Campaign\". Retrieved 2009-11-08.\nJump up ^ James P. Terry (2007). 
Tropical cyclones: climatology and impacts in the South Pacific. Springer. p. 8. ISBN 978-0-387-71542-1. Retrieved 2009-11-08.\nJump up ^ G. E. Pieter & F. Augustinus. \"The influence of the trade winds on the coastal development of the Guianas at various scale levels: a synthesis\". Marine Geology. 208 (2-4): 145\u2013151. Bibcode:2004MGeol.208..145A. doi:10.1016/j.margeo.2004.04.007.\nJump up ^ Robert R. Steward (2005). \"The Ocean's Influence on North American Drought\". Texas A&M University. Retrieved 2009-11-08.\nJump up ^ John E. Oliver (2005). Encyclopedia of world climatology. Springer. p. 185. ISBN 978-1-4020-3264-6. Retrieved 2009-11-08.\nJump up ^ Science Daily (1999-07-14). African Dust Called A Major Factor Affecting Southeast U.S. Air Quality. Retrieved on 2007-06-10.\nJump up ^ Science Daily (2001-06-15). Microbes And The Dust They Ride In On Pose Potential Health Risks. Retrieved on 2007-06-10.\nJump up ^ Usinfo.state.gov (2003). Study Says African Dust Affects Climate in U.S., Caribbean. Archived 2007-06-20 at the Wayback Machine. Retrieved on 2007-06-10.\nJump up ^ U. S. Geological Survey (2006). Coral Mortality and African Dust. 
Retrieved on 2007-06-10.\nRetrieved from \"https://en.wikipedia.org/w/index.php?title=Trade_winds&oldid=817251427\"\nCategories:\nClimate patterns\nAtmospheric dynamics\nWind\nAge of Sail\nHidden categories:\nWebarchive template wayback links\nGood articles\nNavigation menu\nPersonal tools\nNot logged in\nTalk\nContributions\nCreate account\nLog in\nNamespaces\nArticle\nTalk\nVariants\nViews\nRead\nEdit\nView history\nMore\nSearch\nNavigation\nMain page\nContents\nFeatured content\nCurrent events\nRandom article\nDonate to Wikipedia\nWikipedia store\nInteraction\nHelp\nAbout Wikipedia\nCommunity portal\nRecent changes\nContact page\nTools\nWhat links here\nRelated changes\nUpload file\nSpecial pages\nPermanent link\nPage information\nWikidata item\nCite this page\nPrint/export\nCreate a book\nDownload as PDF\nPrintable version\nLanguages\n\u0627\u0644\u0639\u0631\u0628\u064a\u0629\nAz\u0259rbaycanca\nB\u00e2n-l\u00e2m-g\u00fa\n\u0411\u0435\u043b\u0430\u0440\u0443\u0441\u043a\u0430\u044f\n\u0411\u044a\u043b\u0433\u0430\u0440\u0441\u043a\u0438\nCatal\u00e0\n\u010ce\u0161tina\nDansk\nDeutsch\nEesti\n\u0395\u03bb\u03bb\u03b7\u03bd\u03b9\u03ba\u03ac\nEspa\u00f1ol\nEsperanto\nEuskara\n\u0641\u0627\u0631\u0633\u06cc\nFran\u00e7ais\nGaeilge\nGalego\n\ud55c\uad6d\uc5b4\n\u0540\u0561\u0575\u0565\u0580\u0565\u0576\n\u0939\u093f\u0928\u094d\u0926\u0940\nHrvatski\nIdo\nBahasa Indonesia\n\u0418\u0440\u043e\u043d\n\u00cdslenska\nItaliano\n\u05e2\u05d1\u05e8\u05d9\u05ea\n\u10e5\u10d0\u10e0\u10d7\u10e3\u10da\u10d8\n\u049a\u0430\u0437\u0430\u049b\u0448\u0430\nKrey\u00f2l ayisyen\nLatina\nLatvie\u0161u\nLietuvi\u0173\nLimburgs\nMagyar\n\u0d2e\u0d32\u0d2f\u0d3e\u0d33\u0d02\nBahasa Melayu\n\u1019\u103c\u1014\u103a\u1019\u102c\u1018\u102c\u101e\u102c\nNederlands\n\u65e5\u672c\u8a9e\nNorsk\nNorsk 
nynorsk\nOccitan\nO\u02bbzbekcha/\u045e\u0437\u0431\u0435\u043a\u0447\u0430\nPolski\nPortugu\u00eas\nRom\u00e2n\u0103\n\u0420\u0443\u0441\u0441\u043a\u0438\u0439\nShqip\nSloven\u010dina\nSloven\u0161\u010dina\n\u0421\u0440\u043f\u0441\u043a\u0438 / srpski\nSrpskohrvatski / \u0441\u0440\u043f\u0441\u043a\u043e\u0445\u0440\u0432\u0430\u0442\u0441\u043a\u0438\nSuomi\nSvenska\n\u0ba4\u0bae\u0bbf\u0bb4\u0bcd\n\u0e44\u0e17\u0e22\nT\u00fcrk\u00e7e\n\u0423\u043a\u0440\u0430\u0457\u043d\u0441\u044c\u043a\u0430\nTi\u1ebfng Vi\u1ec7t\n\u7cb5\u8a9e\n\u4e2d\u6587\nEdit links\nThis page was last edited on 27 December 2017, at 05:08.\nText is available under the Creative Commons Attribution-ShareAlike License; additional terms may apply. By using this site, you agree to the Terms of Use and Privacy Policy. Wikipedia\u00ae is a registered trademark of the Wikimedia Foundation, Inc., a non-profit organization.\nPrivacy policy\nAbout Wikipedia\nDisclaimers\nContact Wikipedia\nDevelopers\nCookie statement\nMobile view\nEnable previews\n", "metadata": [{"key": "html", "type": "local", "char_start_idx": 0, "relative_start_pos": 1, "char_end_idx": 0, "relative_end_pos": 2, "value": "div", "html_attrs": {"attrs": ["id", "class"], "values": ["mw-page-base", "noprint"]}}, {"key": "html", "type": "local", "char_start_idx": 0, "relative_start_pos": 3, "char_end_idx": 0, "relative_end_pos": 4, "value": "div", "html_attrs": {"attrs": ["id", "class"], "values": ["mw-head-base", "noprint"]}}, {"key": "html", "type": "local", "char_start_idx": 0, "relative_start_pos": 6, "char_end_idx": 0, "relative_end_pos": 7, "value": "a", "html_attrs": {"attrs": ["id"], "values": ["top"]}}, {"key": "html", "type": "local", "char_start_idx": 0, "relative_start_pos": 8, "char_end_idx": 0, "relative_end_pos": 9, "value": "div", "html_attrs": {"attrs": ["id", "class"], "values": ["siteNotice centralNotice", "mw-body-content"]}}, {"key": "html", "type": "local", "char_start_idx": 0, "relative_start_pos": 12, 
"char_end_idx": 0, "relative_end_pos": 13, "value": "img", "html_attrs": {"attrs": ["alt", "src", "width", "height", "srcset", "data-file-width", "data-file-height"], "values": ["This is a good article. Follow the link for more information.", "//upload.wikimedia.org/wikipedia/en/thumb/9/94/Symbol_support_vote.svg/19px-Symbol_support_vote.svg.png", "19", "20", "//upload.wikimedia.org/wikipedia/en/thumb/9/94/Symbol_support_vote.svg/29px-Symbol_support_vote.svg.png 1.5x, //upload.wikimedia.org/wikipedia/en/thumb/9/94/Symbol_support_vote.svg/39px-Symbol_support_vote.svg.png 2x", "180", "185"]}}, {"key": "html", "type": "local", "char_start_idx": 0, "relative_start_pos": 11, "char_end_idx": 0, "relative_end_pos": 14, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/Wikipedia:Good_articles", "This is a good article. Follow the link for more information."]}}, {"key": "html", "type": "local", "char_start_idx": 0, "relative_start_pos": 10, "char_end_idx": 0, "relative_end_pos": 15, "value": "div", "html_attrs": {"attrs": ["class", "id"], "values": ["mw-indicators mw-body-content mw-indicator", "mw-indicator-good-star"]}}, {"key": "html", "type": "local", "char_start_idx": 0, "relative_start_pos": 16, "char_end_idx": 11, "relative_end_pos": 0, "value": "h1", "html_attrs": {"attrs": ["id", "class", "lang"], "values": ["firstHeading", "firstHeading", "en"]}}, {"key": "html", "type": "local", "char_start_idx": 12, "relative_start_pos": 1, "char_end_idx": 49, "relative_end_pos": 0, "value": "div", "html_attrs": {"attrs": ["id", "class"], "values": ["siteSub", "noprint"]}}, {"key": "html", "type": "local", "char_start_idx": 50, "relative_start_pos": 0, "char_end_idx": 50, "relative_end_pos": 1, "value": "div", "html_attrs": {"attrs": ["id"], "values": ["contentSub"]}}, {"key": "html", "type": "local", "char_start_idx": 59, "relative_start_pos": 0, "char_end_idx": 69, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href"], "values": 
["#mw-head"]}}, {"key": "html", "type": "local", "char_start_idx": 71, "relative_start_pos": 0, "char_end_idx": 77, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href"], "values": ["#p-search"]}}, {"key": "html", "type": "local", "char_start_idx": 50, "relative_start_pos": 2, "char_end_idx": 78, "relative_end_pos": 0, "value": "div", "html_attrs": {"attrs": ["id", "class"], "values": ["jump-to-nav", "mw-jump"]}}, {"key": "html", "type": "local", "char_start_idx": 144, "relative_start_pos": 0, "char_end_idx": 153, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "class", "title"], "values": ["/wiki/Tradewind_(disambiguation)", "mw-redirect mw-disambig", "Tradewind (disambiguation)"]}}, {"key": "html", "type": "local", "char_start_idx": 78, "relative_start_pos": 3, "char_end_idx": 154, "relative_end_pos": 0, "value": "div", "html_attrs": {"attrs": ["role", "class"], "values": ["note", "hatnote navigation-not-searchable"]}}, {"key": "html", "type": "local", "char_start_idx": 155, "relative_start_pos": 2, "char_end_idx": 155, "relative_end_pos": 3, "value": "img", "html_attrs": {"attrs": ["alt", "src", "width", "height", "class", "srcset", "data-file-width", "data-file-height"], "values": ["", "//upload.wikimedia.org/wikipedia/commons/thumb/1/18/Map_prevailing_winds_on_earth.png/300px-Map_prevailing_winds_on_earth.png", "300", "132", "thumbimage", "//upload.wikimedia.org/wikipedia/commons/thumb/1/18/Map_prevailing_winds_on_earth.png/450px-Map_prevailing_winds_on_earth.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/1/18/Map_prevailing_winds_on_earth.png/600px-Map_prevailing_winds_on_earth.png 2x", "1425", "625"]}}, {"key": "html", "type": "local", "char_start_idx": 155, "relative_start_pos": 1, "char_end_idx": 155, "relative_end_pos": 4, "value": "a", "html_attrs": {"attrs": ["href", "class"], "values": ["/wiki/File:Map_prevailing_winds_on_earth.png", "image"]}}, {"key": "html", "type": "local", "char_start_idx": 155, 
"relative_start_pos": 7, "char_end_idx": 155, "relative_end_pos": 8, "value": "a", "html_attrs": {"attrs": ["href", "class", "title"], "values": ["/wiki/File:Map_prevailing_winds_on_earth.png", "internal", "Enlarge"]}}, {"key": "html", "type": "local", "char_start_idx": 155, "relative_start_pos": 6, "char_end_idx": 155, "relative_end_pos": 9, "value": "div", "html_attrs": {"attrs": ["class"], "values": ["magnify"]}}, {"key": "html", "type": "local", "char_start_idx": 159, "relative_start_pos": 0, "char_end_idx": 169, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/Westerlies", "Westerlies"]}}, {"key": "html", "type": "local", "char_start_idx": 155, "relative_start_pos": 5, "char_end_idx": 225, "relative_end_pos": 0, "value": "div", "html_attrs": {"attrs": ["class"], "values": ["thumbcaption"]}}, {"key": "html", "type": "local", "char_start_idx": 155, "relative_start_pos": 0, "char_end_idx": 226, "relative_end_pos": 0, "value": "div", "html_attrs": {"attrs": ["class", "style"], "values": ["thumb tright thumbinner", "width:302px;"]}}, {"key": "html", "type": "local", "char_start_idx": 230, "relative_start_pos": 0, "char_end_idx": 241, "relative_end_pos": 0, "value": "b", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 250, "relative_start_pos": 0, "char_end_idx": 268, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/Prevailing_winds", "Prevailing winds"]}}, {"key": "html", "type": "local", "char_start_idx": 289, "relative_start_pos": 0, "char_end_idx": 294, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/Wind", "Wind"]}}, {"key": "html", "type": "local", "char_start_idx": 308, "relative_start_pos": 0, "char_end_idx": 315, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/Tropics", "Tropics"]}}, {"key": "html", "type": "local", 
"char_start_idx": 349, "relative_start_pos": 0, "char_end_idx": 367, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "class", "title"], "values": ["/wiki/Earth%27s_atmosphere", "mw-redirect", "Earth's atmosphere"]}}, {"key": "html", "type": "local", "char_start_idx": 397, "relative_start_pos": 0, "char_end_idx": 408, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/Troposphere", "Troposphere"]}}, {"key": "html", "type": "local", "char_start_idx": 426, "relative_start_pos": 0, "char_end_idx": 433, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/Equator", "Equator"]}}, {"key": "html", "type": "local", "char_start_idx": 496, "relative_start_pos": 0, "char_end_idx": 515, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/Northern_Hemisphere", "Northern Hemisphere"]}}, {"key": "html", "type": "local", "char_start_idx": 546, "relative_start_pos": 0, "char_end_idx": 565, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/Southern_Hemisphere", "Southern Hemisphere"]}}, {"key": "html", "type": "local", "char_start_idx": 612, "relative_start_pos": 0, "char_end_idx": 630, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/Arctic_oscillation", "Arctic oscillation"]}}, {"key": "html", "type": "local", "char_start_idx": 695, "relative_start_pos": 0, "char_end_idx": 708, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/Sailing_ship", "Sailing ship"]}}, {"key": "html", "type": "local", "char_start_idx": 764, "relative_start_pos": 0, "char_end_idx": 779, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/Colonial_empire", "Colonial empire"]}}, {"key": "html", "type": "local", "char_start_idx": 858, "relative_start_pos": 0, 
"char_end_idx": 866, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "class", "title"], "values": ["/wiki/Atlantic_ocean", "mw-redirect", "Atlantic ocean"]}}, {"key": "html", "type": "local", "char_start_idx": 871, "relative_start_pos": 0, "char_end_idx": 885, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "class", "title"], "values": ["/wiki/Pacific_ocean", "mw-redirect", "Pacific ocean"]}}, {"key": "html", "type": "local", "char_start_idx": 226, "relative_start_pos": 1, "char_end_idx": 886, "relative_end_pos": 0, "value": "p", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 890, "relative_start_pos": 0, "char_end_idx": 901, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/Meteorology", "Meteorology"]}}, {"key": "html", "type": "local", "char_start_idx": 930, "relative_start_pos": 0, "char_end_idx": 943, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/Prevailing_winds", "Prevailing winds"]}}, {"key": "html", "type": "local", "char_start_idx": 948, "relative_start_pos": 0, "char_end_idx": 963, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "class", "title"], "values": ["/wiki/Tropical_storms", "mw-redirect", "Tropical storms"]}}, {"key": "html", "type": "local", "char_start_idx": 1015, "relative_start_pos": 0, "char_end_idx": 1028, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/Indian_Ocean", "Indian Ocean"]}}, {"key": "html", "type": "local", "char_start_idx": 1190, "relative_start_pos": 0, "char_end_idx": 1204, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/Atlantic_Ocean", "Atlantic Ocean"]}}, {"key": "html", "type": "local", "char_start_idx": 1214, "relative_start_pos": 0, "char_end_idx": 1227, "relative_end_pos": 0, "value": "a", "html_attrs": 
{"attrs": ["href", "title"], "values": ["/wiki/Caribbean_Sea", "Caribbean Sea"]}}, {"key": "html", "type": "local", "char_start_idx": 1265, "relative_start_pos": 0, "char_end_idx": 1278, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/North_America", "North America"]}}, {"key": "html", "type": "local", "char_start_idx": 1288, "relative_start_pos": 0, "char_end_idx": 1302, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/Cumulus_cloud", "Cumulus cloud"]}}, {"key": "html", "type": "local", "char_start_idx": 1458, "relative_start_pos": 0, "char_end_idx": 1475, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/Subtropical_ridge", "Subtropical ridge"]}}, {"key": "html", "type": "local", "char_start_idx": 887, "relative_start_pos": 0, "char_end_idx": 1576, "relative_end_pos": 0, "value": "p", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 1577, "relative_start_pos": 0, "char_end_idx": 1577, "relative_end_pos": 1, "value": "p", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 1577, "relative_start_pos": 4, "char_end_idx": 1585, "relative_end_pos": 0, "value": "h2", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 1587, "relative_start_pos": 0, "char_end_idx": 1591, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["role", "tabindex", "class"], "values": ["button", "0", "togglelink"]}}, {"key": "html", "type": "local", "char_start_idx": 1586, "relative_start_pos": 0, "char_end_idx": 1593, "relative_end_pos": 0, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["toctoggle"]}}, {"key": "html", "type": "local", "char_start_idx": 1577, "relative_start_pos": 3, "char_end_idx": 1593, "relative_end_pos": 1, "value": "div", "html_attrs": {"attrs": ["class"], "values": 
["toctitle"]}}, {"key": "html", "type": "local", "char_start_idx": 1593, "relative_start_pos": 5, "char_end_idx": 1594, "relative_end_pos": 0, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["tocnumber"]}}, {"key": "html", "type": "local", "char_start_idx": 1595, "relative_start_pos": 0, "char_end_idx": 1602, "relative_end_pos": 0, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["toctext"]}}, {"key": "html", "type": "local", "char_start_idx": 1593, "relative_start_pos": 4, "char_end_idx": 1602, "relative_end_pos": 1, "value": "a", "html_attrs": {"attrs": ["href"], "values": ["#History"]}}, {"key": "html", "type": "local", "char_start_idx": 1593, "relative_start_pos": 3, "char_end_idx": 1602, "relative_end_pos": 2, "value": "li", "html_attrs": {"attrs": ["class"], "values": ["toclevel-1 tocsection-1"]}}, {"key": "html", "type": "local", "char_start_idx": 1603, "relative_start_pos": 2, "char_end_idx": 1604, "relative_end_pos": 0, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["tocnumber"]}}, {"key": "html", "type": "local", "char_start_idx": 1605, "relative_start_pos": 0, "char_end_idx": 1610, "relative_end_pos": 0, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["toctext"]}}, {"key": "html", "type": "local", "char_start_idx": 1603, "relative_start_pos": 1, "char_end_idx": 1610, "relative_end_pos": 1, "value": "a", "html_attrs": {"attrs": ["href"], "values": ["#Cause"]}}, {"key": "html", "type": "local", "char_start_idx": 1603, "relative_start_pos": 0, "char_end_idx": 1610, "relative_end_pos": 2, "value": "li", "html_attrs": {"attrs": ["class"], "values": ["toclevel-1 tocsection-2"]}}, {"key": "html", "type": "local", "char_start_idx": 1611, "relative_start_pos": 2, "char_end_idx": 1612, "relative_end_pos": 0, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["tocnumber"]}}, {"key": "html", "type": "local", "char_start_idx": 1613, "relative_start_pos": 0, "char_end_idx": 1628, 
"relative_end_pos": 0, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["toctext"]}}, {"key": "html", "type": "local", "char_start_idx": 1611, "relative_start_pos": 1, "char_end_idx": 1628, "relative_end_pos": 1, "value": "a", "html_attrs": {"attrs": ["href"], "values": ["#Weather_effects"]}}, {"key": "html", "type": "local", "char_start_idx": 1611, "relative_start_pos": 0, "char_end_idx": 1628, "relative_end_pos": 2, "value": "li", "html_attrs": {"attrs": ["class"], "values": ["toclevel-1 tocsection-3"]}}, {"key": "html", "type": "local", "char_start_idx": 1629, "relative_start_pos": 2, "char_end_idx": 1630, "relative_end_pos": 0, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["tocnumber"]}}, {"key": "html", "type": "local", "char_start_idx": 1631, "relative_start_pos": 0, "char_end_idx": 1639, "relative_end_pos": 0, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["toctext"]}}, {"key": "html", "type": "local", "char_start_idx": 1629, "relative_start_pos": 1, "char_end_idx": 1639, "relative_end_pos": 1, "value": "a", "html_attrs": {"attrs": ["href"], "values": ["#See_also"]}}, {"key": "html", "type": "local", "char_start_idx": 1629, "relative_start_pos": 0, "char_end_idx": 1639, "relative_end_pos": 2, "value": "li", "html_attrs": {"attrs": ["class"], "values": ["toclevel-1 tocsection-4"]}}, {"key": "html", "type": "local", "char_start_idx": 1640, "relative_start_pos": 2, "char_end_idx": 1641, "relative_end_pos": 0, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["tocnumber"]}}, {"key": "html", "type": "local", "char_start_idx": 1642, "relative_start_pos": 0, "char_end_idx": 1652, "relative_end_pos": 0, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["toctext"]}}, {"key": "html", "type": "local", "char_start_idx": 1640, "relative_start_pos": 1, "char_end_idx": 1652, "relative_end_pos": 1, "value": "a", "html_attrs": {"attrs": ["href"], "values": ["#References"]}}, {"key": "html", "type": 
"local", "char_start_idx": 1640, "relative_start_pos": 0, "char_end_idx": 1652, "relative_end_pos": 2, "value": "li", "html_attrs": {"attrs": ["class"], "values": ["toclevel-1 tocsection-5"]}}, {"key": "html", "type": "local", "char_start_idx": 1593, "relative_start_pos": 2, "char_end_idx": 1653, "relative_end_pos": 0, "value": "ul", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 1577, "relative_start_pos": 2, "char_end_idx": 1653, "relative_end_pos": 1, "value": "div", "html_attrs": {"attrs": ["id", "class"], "values": ["toc", "toc"]}}, {"key": "html", "type": "local", "char_start_idx": 1653, "relative_start_pos": 2, "char_end_idx": 1653, "relative_end_pos": 3, "value": "p", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 1653, "relative_start_pos": 5, "char_end_idx": 1660, "relative_end_pos": 0, "value": "span", "html_attrs": {"attrs": ["class", "id"], "values": ["mw-headline", "History"]}}, {"key": "html", "type": "local", "char_start_idx": 1660, "relative_start_pos": 2, "char_end_idx": 1661, "relative_end_pos": 0, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["mw-editsection-bracket"]}}, {"key": "html", "type": "local", "char_start_idx": 1661, "relative_start_pos": 1, "char_end_idx": 1665, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/w/index.php?title=Trade_winds&action=edit§ion=1", "Edit section: History"]}}, {"key": "html", "type": "local", "char_start_idx": 1665, "relative_start_pos": 1, "char_end_idx": 1666, "relative_end_pos": 0, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["mw-editsection-bracket"]}}, {"key": "html", "type": "local", "char_start_idx": 1660, "relative_start_pos": 1, "char_end_idx": 1666, "relative_end_pos": 1, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["mw-editsection"]}}, {"key": "html", "type": "local", "char_start_idx": 1653, 
"relative_start_pos": 4, "char_end_idx": 1666, "relative_end_pos": 2, "value": "h2", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 1667, "relative_start_pos": 2, "char_end_idx": 1667, "relative_end_pos": 3, "value": "img", "html_attrs": {"attrs": ["alt", "src", "width", "height", "class", "srcset", "data-file-width", "data-file-height"], "values": ["", "//upload.wikimedia.org/wikipedia/commons/thumb/0/08/Spanish_Galleon.jpg/200px-Spanish_Galleon.jpg", "200", "275", "thumbimage", "//upload.wikimedia.org/wikipedia/commons/0/08/Spanish_Galleon.jpg 1.5x", "300", "412"]}}, {"key": "html", "type": "local", "char_start_idx": 1667, "relative_start_pos": 1, "char_end_idx": 1667, "relative_end_pos": 4, "value": "a", "html_attrs": {"attrs": ["href", "class"], "values": ["/wiki/File:Spanish_Galleon.jpg", "image"]}}, {"key": "html", "type": "local", "char_start_idx": 1667, "relative_start_pos": 6, "char_end_idx": 1667, "relative_end_pos": 7, "value": "a", "html_attrs": {"attrs": ["href", "class", "title"], "values": ["/wiki/File:Spanish_Galleon.jpg", "internal", "Enlarge"]}}, {"key": "html", "type": "local", "char_start_idx": 1667, "relative_start_pos": 5, "char_end_idx": 1684, "relative_end_pos": 0, "value": "div", "html_attrs": {"attrs": ["class"], "values": ["thumbcaption magnify"]}}, {"key": "html", "type": "local", "char_start_idx": 1667, "relative_start_pos": 0, "char_end_idx": 1685, "relative_end_pos": 0, "value": "div", "html_attrs": {"attrs": ["class", "style"], "values": ["thumb tright thumbinner", "width:202px;"]}}, {"key": "html", "type": "local", "char_start_idx": 1695, "relative_start_pos": 0, "char_end_idx": 1711, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/Age_of_Discovery", "Age of Discovery"]}}, {"key": "html", "type": "local", "char_start_idx": 1713, "relative_start_pos": 0, "char_end_idx": 1725, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": 
["href", "class", "title"], "values": ["/wiki/Volta_do_Mar", "mw-redirect", "Volta do Mar"]}}, {"key": "html", "type": "local", "char_start_idx": 1731, "relative_start_pos": 0, "char_end_idx": 1742, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "class", "title"], "values": ["/wiki/Age_of_sail", "mw-redirect", "Age of sail"]}}, {"key": "html", "type": "local", "char_start_idx": 1685, "relative_start_pos": 1, "char_end_idx": 1742, "relative_end_pos": 1, "value": "div", "html_attrs": {"attrs": ["role", "class"], "values": ["note", "hatnote navigation-not-searchable"]}}, {"key": "html", "type": "local", "char_start_idx": 1752, "relative_start_pos": 0, "char_end_idx": 1763, "relative_end_pos": 0, "value": "i", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 1822, "relative_start_pos": 0, "char_end_idx": 1836, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/Middle_English", "Middle English"]}}, {"key": "html", "type": "local", "char_start_idx": 1877, "relative_start_pos": 1, "char_end_idx": 1880, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href"], "values": ["#cite_note-1"]}}, {"key": "html", "type": "local", "char_start_idx": 1877, "relative_start_pos": 0, "char_end_idx": 1880, "relative_end_pos": 1, "value": "sup", "html_attrs": {"attrs": ["id", "class"], "values": ["cite_ref-1", "reference"]}}, {"key": "html", "type": "local", "char_start_idx": 1951, "relative_start_pos": 1, "char_end_idx": 1963, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/Volta_do_mar", "Volta do mar"]}}, {"key": "html", "type": "local", "char_start_idx": 1951, "relative_start_pos": 0, "char_end_idx": 1963, "relative_end_pos": 1, "value": "i", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 2123, "relative_start_pos": 1, "char_end_idx": 2126, "relative_end_pos": 
0, "value": "a", "html_attrs": {"attrs": ["href"], "values": ["#cite_note-2"]}}, {"key": "html", "type": "local", "char_start_idx": 2123, "relative_start_pos": 0, "char_end_idx": 2126, "relative_end_pos": 1, "value": "sup", "html_attrs": {"attrs": ["id", "class"], "values": ["cite_ref-2", "reference"]}}, {"key": "html", "type": "local", "char_start_idx": 2291, "relative_start_pos": 0, "char_end_idx": 2297, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/Azores", "Azores"]}}, {"key": "html", "type": "local", "char_start_idx": 2673, "relative_start_pos": 0, "char_end_idx": 2683, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/Westerlies", "Westerlies"]}}, {"key": "html", "type": "local", "char_start_idx": 2716, "relative_start_pos": 0, "char_end_idx": 2734, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "class", "title"], "values": ["/wiki/Andres_de_Urdaneta", "mw-redirect", "Andres de Urdaneta"]}}, {"key": "html", "type": "local", "char_start_idx": 2752, "relative_start_pos": 1, "char_end_idx": 2755, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href"], "values": ["#cite_note-pg18-3"]}}, {"key": "html", "type": "local", "char_start_idx": 2752, "relative_start_pos": 0, "char_end_idx": 2755, "relative_end_pos": 1, "value": "sup", "html_attrs": {"attrs": ["id", "class"], "values": ["cite_ref-pg18_3-0", "reference"]}}, {"key": "html", "type": "local", "char_start_idx": 1743, "relative_start_pos": 0, "char_end_idx": 2755, "relative_end_pos": 2, "value": "p", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 2874, "relative_start_pos": 1, "char_end_idx": 2877, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href"], "values": ["#cite_note-4"]}}, {"key": "html", "type": "local", "char_start_idx": 2874, "relative_start_pos": 0, "char_end_idx": 2877, "relative_end_pos": 1, 
"value": "sup", "html_attrs": {"attrs": ["id", "class"], "values": ["cite_ref-4", "reference"]}}, {"key": "html", "type": "local", "char_start_idx": 2889, "relative_start_pos": 0, "char_end_idx": 2900, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/Age_of_Sail", "Age of Sail"]}}, {"key": "html", "type": "local", "char_start_idx": 2917, "relative_start_pos": 0, "char_end_idx": 2933, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/Prevailing_winds", "Prevailing winds"]}}, {"key": "html", "type": "local", "char_start_idx": 3111, "relative_start_pos": 0, "char_end_idx": 3126, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/Manila_galleon", "Manila galleon"]}}, {"key": "html", "type": "local", "char_start_idx": 3163, "relative_start_pos": 1, "char_end_idx": 3166, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href"], "values": ["#cite_note-pg18-3"]}}, {"key": "html", "type": "local", "char_start_idx": 3163, "relative_start_pos": 0, "char_end_idx": 3166, "relative_end_pos": 1, "value": "sup", "html_attrs": {"attrs": ["id", "class"], "values": ["cite_ref-pg18_3-1", "reference"]}}, {"key": "html", "type": "local", "char_start_idx": 2756, "relative_start_pos": 0, "char_end_idx": 3166, "relative_end_pos": 2, "value": "p", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 3405, "relative_start_pos": 1, "char_end_idx": 3408, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href"], "values": ["#cite_note-5"]}}, {"key": "html", "type": "local", "char_start_idx": 3405, "relative_start_pos": 0, "char_end_idx": 3408, "relative_end_pos": 1, "value": "sup", "html_attrs": {"attrs": ["id", "class"], "values": ["cite_ref-5", "reference"]}}, {"key": "html", "type": "local", "char_start_idx": 3432, "relative_start_pos": 0, "char_end_idx": 3454, "relative_end_pos": 0, 
"value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/Matthew_Fontaine_Maury", "Matthew Fontaine Maury"]}}, {"key": "html", "type": "local", "char_start_idx": 3541, "relative_start_pos": 1, "char_end_idx": 3544, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href"], "values": ["#cite_note-6"]}}, {"key": "html", "type": "local", "char_start_idx": 3541, "relative_start_pos": 0, "char_end_idx": 3544, "relative_end_pos": 1, "value": "sup", "html_attrs": {"attrs": ["id", "class"], "values": ["cite_ref-6", "reference"]}}, {"key": "html", "type": "local", "char_start_idx": 3167, "relative_start_pos": 0, "char_end_idx": 3544, "relative_end_pos": 2, "value": "p", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 3545, "relative_start_pos": 1, "char_end_idx": 3550, "relative_end_pos": 0, "value": "span", "html_attrs": {"attrs": ["class", "id"], "values": ["mw-headline", "Cause"]}}, {"key": "html", "type": "local", "char_start_idx": 3550, "relative_start_pos": 2, "char_end_idx": 3551, "relative_end_pos": 0, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["mw-editsection-bracket"]}}, {"key": "html", "type": "local", "char_start_idx": 3551, "relative_start_pos": 1, "char_end_idx": 3555, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/w/index.php?title=Trade_winds&action=edit§ion=2", "Edit section: Cause"]}}, {"key": "html", "type": "local", "char_start_idx": 3555, "relative_start_pos": 1, "char_end_idx": 3556, "relative_end_pos": 0, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["mw-editsection-bracket"]}}, {"key": "html", "type": "local", "char_start_idx": 3550, "relative_start_pos": 1, "char_end_idx": 3556, "relative_end_pos": 1, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["mw-editsection"]}}, {"key": "html", "type": "local", "char_start_idx": 3545, "relative_start_pos": 0, "char_end_idx": 3556, 
"relative_end_pos": 2, "value": "h2", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 3557, "relative_start_pos": 2, "char_end_idx": 3557, "relative_end_pos": 3, "value": "img", "html_attrs": {"attrs": ["alt", "src", "width", "height", "class", "srcset", "data-file-width", "data-file-height"], "values": ["", "//upload.wikimedia.org/wikipedia/commons/thumb/9/9c/Earth_Global_Circulation_-_en.svg/400px-Earth_Global_Circulation_-_en.svg.png", "400", "334", "thumbimage", "//upload.wikimedia.org/wikipedia/commons/thumb/9/9c/Earth_Global_Circulation_-_en.svg/600px-Earth_Global_Circulation_-_en.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/9/9c/Earth_Global_Circulation_-_en.svg/800px-Earth_Global_Circulation_-_en.svg.png 2x", "1000", "835"]}}, {"key": "html", "type": "local", "char_start_idx": 3557, "relative_start_pos": 1, "char_end_idx": 3557, "relative_end_pos": 4, "value": "a", "html_attrs": {"attrs": ["href", "class"], "values": ["/wiki/File:Earth_Global_Circulation_-_en.svg", "image"]}}, {"key": "html", "type": "local", "char_start_idx": 3557, "relative_start_pos": 7, "char_end_idx": 3557, "relative_end_pos": 8, "value": "a", "html_attrs": {"attrs": ["href", "class", "title"], "values": ["/wiki/File:Earth_Global_Circulation_-_en.svg", "internal", "Enlarge"]}}, {"key": "html", "type": "local", "char_start_idx": 3557, "relative_start_pos": 6, "char_end_idx": 3557, "relative_end_pos": 9, "value": "div", "html_attrs": {"attrs": ["class"], "values": ["magnify"]}}, {"key": "html", "type": "local", "char_start_idx": 3572, "relative_start_pos": 0, "char_end_idx": 3584, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/Hadley_cell", "Hadley cell"]}}, {"key": "html", "type": "local", "char_start_idx": 3557, "relative_start_pos": 5, "char_end_idx": 3631, "relative_end_pos": 0, "value": "div", "html_attrs": {"attrs": ["class"], "values": ["thumbcaption"]}}, {"key": 
"html", "type": "local", "char_start_idx": 3557, "relative_start_pos": 0, "char_end_idx": 3632, "relative_end_pos": 0, "value": "div", "html_attrs": {"attrs": ["class", "style"], "values": ["thumb tright thumbinner", "width:402px;"]}}, {"key": "html", "type": "local", "char_start_idx": 3642, "relative_start_pos": 0, "char_end_idx": 3650, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/Air_mass", "Air mass"]}}, {"key": "html", "type": "local", "char_start_idx": 3652, "relative_start_pos": 0, "char_end_idx": 3663, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/Hadley_cell", "Hadley cell"]}}, {"key": "html", "type": "local", "char_start_idx": 3665, "relative_start_pos": 0, "char_end_idx": 3673, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/Humidity", "Humidity"]}}, {"key": "html", "type": "local", "char_start_idx": 3675, "relative_start_pos": 0, "char_end_idx": 3705, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/Intertropical_Convergence_Zone", "Intertropical Convergence Zone"]}}, {"key": "html", "type": "local", "char_start_idx": 3707, "relative_start_pos": 0, "char_end_idx": 3714, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/Monsoon", "Monsoon"]}}, {"key": "html", "type": "local", "char_start_idx": 3716, "relative_start_pos": 0, "char_end_idx": 3730, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/Monsoon_trough", "Monsoon trough"]}}, {"key": "html", "type": "local", "char_start_idx": 3732, "relative_start_pos": 0, "char_end_idx": 3754, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "class", "title"], "values": ["/wiki/Near-equatorial_trough", "mw-redirect", "Near-equatorial trough"]}}, {"key": "html", "type": "local", "char_start_idx": 
3760, "relative_start_pos": 0, "char_end_idx": 3776, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/Prevailing_winds", "Prevailing winds"]}}, {"key": "html", "type": "local", "char_start_idx": 3632, "relative_start_pos": 1, "char_end_idx": 3776, "relative_end_pos": 1, "value": "div", "html_attrs": {"attrs": ["role", "class"], "values": ["note", "hatnote navigation-not-searchable"]}}, {"key": "html", "type": "local", "char_start_idx": 3792, "relative_start_pos": 0, "char_end_idx": 3803, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/Hadley_cell", "Hadley cell"]}}, {"key": "html", "type": "local", "char_start_idx": 3890, "relative_start_pos": 0, "char_end_idx": 3895, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "class", "title"], "values": ["/wiki/Geographic_pole", "mw-redirect", "Geographic pole"]}}, {"key": "html", "type": "local", "char_start_idx": 3899, "relative_start_pos": 0, "char_end_idx": 3916, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/Low-pressure_area", "Low-pressure area"]}}, {"key": "html", "type": "local", "char_start_idx": 3980, "relative_start_pos": 0, "char_end_idx": 3988, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/Doldrums", "Doldrums"]}}, {"key": "html", "type": "local", "char_start_idx": 3989, "relative_start_pos": 1, "char_end_idx": 3992, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href"], "values": ["#cite_note-dold-7"]}}, {"key": "html", "type": "local", "char_start_idx": 3989, "relative_start_pos": 0, "char_end_idx": 3992, "relative_end_pos": 1, "value": "sup", "html_attrs": {"attrs": ["id", "class"], "values": ["cite_ref-dold_7-0", "reference"]}}, {"key": "html", "type": "local", "char_start_idx": 4016, "relative_start_pos": 1, "char_end_idx": 4019, "relative_end_pos": 0, "value": "a", 
"html_attrs": {"attrs": ["href"], "values": ["#cite_note-8"]}}, {"key": "html", "type": "local", "char_start_idx": 4016, "relative_start_pos": 0, "char_end_idx": 4019, "relative_end_pos": 1, "value": "sup", "html_attrs": {"attrs": ["id", "class"], "values": ["cite_ref-8", "reference"]}}, {"key": "html", "type": "local", "char_start_idx": 4048, "relative_start_pos": 0, "char_end_idx": 4078, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/Intertropical_Convergence_Zone", "Intertropical Convergence Zone"]}}, {"key": "html", "type": "local", "char_start_idx": 4079, "relative_start_pos": 1, "char_end_idx": 4082, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href"], "values": ["#cite_note-9"]}}, {"key": "html", "type": "local", "char_start_idx": 4079, "relative_start_pos": 0, "char_end_idx": 4082, "relative_end_pos": 1, "value": "sup", "html_attrs": {"attrs": ["id", "class"], "values": ["cite_ref-9", "reference"]}}, {"key": "html", "type": "local", "char_start_idx": 4105, "relative_start_pos": 0, "char_end_idx": 4112, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/Monsoon", "Monsoon"]}}, {"key": "html", "type": "local", "char_start_idx": 4189, "relative_start_pos": 0, "char_end_idx": 4203, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/Monsoon_trough", "Monsoon trough"]}}, {"key": "html", "type": "local", "char_start_idx": 4204, "relative_start_pos": 1, "char_end_idx": 4208, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href"], "values": ["#cite_note-10"]}}, {"key": "html", "type": "local", "char_start_idx": 4204, "relative_start_pos": 0, "char_end_idx": 4208, "relative_end_pos": 1, "value": "sup", "html_attrs": {"attrs": ["id", "class"], "values": ["cite_ref-10", "reference"]}}, {"key": "html", "type": "local", "char_start_idx": 4326, "relative_start_pos": 0, "char_end_idx": 4344, 
"relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/Subtropical_ridge", "Subtropical ridge"]}}, {"key": "html", "type": "local", "char_start_idx": 4821, "relative_start_pos": 1, "char_end_idx": 4825, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href"], "values": ["#cite_note-11"]}}, {"key": "html", "type": "local", "char_start_idx": 4821, "relative_start_pos": 0, "char_end_idx": 4825, "relative_end_pos": 1, "value": "sup", "html_attrs": {"attrs": ["id", "class"], "values": ["cite_ref-11", "reference"]}}, {"key": "html", "type": "local", "char_start_idx": 3777, "relative_start_pos": 0, "char_end_idx": 4825, "relative_end_pos": 2, "value": "p", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 4971, "relative_start_pos": 0, "char_end_idx": 4986, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "class", "title"], "values": ["/wiki/Coriolis_effect", "mw-redirect", "Coriolis effect"]}}, {"key": "html", "type": "local", "char_start_idx": 4987, "relative_start_pos": 1, "char_end_idx": 4991, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href"], "values": ["#cite_note-12"]}}, {"key": "html", "type": "local", "char_start_idx": 4987, "relative_start_pos": 0, "char_end_idx": 4991, "relative_end_pos": 1, "value": "sup", "html_attrs": {"attrs": ["id", "class"], "values": ["cite_ref-12", "reference"]}}, {"key": "html", "type": "local", "char_start_idx": 5049, "relative_start_pos": 0, "char_end_idx": 5068, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/Northern_Hemisphere", "Northern Hemisphere"]}}, {"key": "html", "type": "local", "char_start_idx": 5099, "relative_start_pos": 0, "char_end_idx": 5118, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/Southern_Hemisphere", "Southern Hemisphere"]}}, {"key": "html", "type": "local", 
"char_start_idx": 5119, "relative_start_pos": 1, "char_end_idx": 5123, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href"], "values": ["#cite_note-Ralph-13"]}}, {"key": "html", "type": "local", "char_start_idx": 5119, "relative_start_pos": 0, "char_end_idx": 5123, "relative_end_pos": 1, "value": "sup", "html_attrs": {"attrs": ["id", "class"], "values": ["cite_ref-Ralph_13-0", "reference"]}}, {"key": "html", "type": "local", "char_start_idx": 5197, "relative_start_pos": 1, "char_end_idx": 5201, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href"], "values": ["#cite_note-HOWTOREAD-14"]}}, {"key": "html", "type": "local", "char_start_idx": 5197, "relative_start_pos": 0, "char_end_idx": 5201, "relative_end_pos": 1, "value": "sup", "html_attrs": {"attrs": ["id", "class"], "values": ["cite_ref-HOWTOREAD_14-0", "reference"]}}, {"key": "html", "type": "local", "char_start_idx": 5392, "relative_start_pos": 0, "char_end_idx": 5400, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/Doldrums", "Doldrums"]}}, {"key": "html", "type": "local", "char_start_idx": 5401, "relative_start_pos": 1, "char_end_idx": 5404, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href"], "values": ["#cite_note-dold-7"]}}, {"key": "html", "type": "local", "char_start_idx": 5401, "relative_start_pos": 0, "char_end_idx": 5404, "relative_end_pos": 1, "value": "sup", "html_attrs": {"attrs": ["id", "class"], "values": ["cite_ref-dold_7-1", "reference"]}}, {"key": "html", "type": "local", "char_start_idx": 4826, "relative_start_pos": 0, "char_end_idx": 5404, "relative_end_pos": 2, "value": "p", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 5695, "relative_start_pos": 1, "char_end_idx": 5699, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href"], "values": ["#cite_note-15"]}}, {"key": "html", "type": "local", "char_start_idx": 5695, 
"relative_start_pos": 0, "char_end_idx": 5699, "relative_end_pos": 1, "value": "sup", "html_attrs": {"attrs": ["id", "class"], "values": ["cite_ref-15", "reference"]}}, {"key": "html", "type": "local", "char_start_idx": 5775, "relative_start_pos": 1, "char_end_idx": 5779, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href"], "values": ["#cite_note-16"]}}, {"key": "html", "type": "local", "char_start_idx": 5775, "relative_start_pos": 0, "char_end_idx": 5779, "relative_end_pos": 1, "value": "sup", "html_attrs": {"attrs": ["id", "class"], "values": ["cite_ref-16", "reference"]}}, {"key": "html", "type": "local", "char_start_idx": 5802, "relative_start_pos": 0, "char_end_idx": 5807, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/Earth", "Earth"]}}, {"key": "html", "type": "local", "char_start_idx": 5857, "relative_start_pos": 0, "char_end_idx": 5869, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "class", "title"], "values": ["/wiki/Indian_ocean", "mw-redirect", "Indian ocean"]}}, {"key": "html", "type": "local", "char_start_idx": 5870, "relative_start_pos": 1, "char_end_idx": 5874, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href"], "values": ["#cite_note-17"]}}, {"key": "html", "type": "local", "char_start_idx": 5870, "relative_start_pos": 0, "char_end_idx": 5874, "relative_end_pos": 1, "value": "sup", "html_attrs": {"attrs": ["id", "class"], "values": ["cite_ref-17", "reference"]}}, {"key": "html", "type": "local", "char_start_idx": 5405, "relative_start_pos": 0, "char_end_idx": 5874, "relative_end_pos": 2, "value": "p", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 5875, "relative_start_pos": 1, "char_end_idx": 5890, "relative_end_pos": 0, "value": "span", "html_attrs": {"attrs": ["class", "id"], "values": ["mw-headline", "Weather_effects"]}}, {"key": "html", "type": "local", "char_start_idx": 5890, 
"relative_start_pos": 2, "char_end_idx": 5891, "relative_end_pos": 0, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["mw-editsection-bracket"]}}, {"key": "html", "type": "local", "char_start_idx": 5891, "relative_start_pos": 1, "char_end_idx": 5895, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/w/index.php?title=Trade_winds&action=edit§ion=3", "Edit section: Weather effects"]}}, {"key": "html", "type": "local", "char_start_idx": 5895, "relative_start_pos": 1, "char_end_idx": 5896, "relative_end_pos": 0, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["mw-editsection-bracket"]}}, {"key": "html", "type": "local", "char_start_idx": 5890, "relative_start_pos": 1, "char_end_idx": 5896, "relative_end_pos": 1, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["mw-editsection"]}}, {"key": "html", "type": "local", "char_start_idx": 5875, "relative_start_pos": 0, "char_end_idx": 5896, "relative_end_pos": 2, "value": "h2", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 6112, "relative_start_pos": 1, "char_end_idx": 6116, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href"], "values": ["#cite_note-18"]}}, {"key": "html", "type": "local", "char_start_idx": 6112, "relative_start_pos": 0, "char_end_idx": 6116, "relative_end_pos": 1, "value": "sup", "html_attrs": {"attrs": ["id", "class"], "values": ["cite_ref-18", "reference"]}}, {"key": "html", "type": "local", "char_start_idx": 6324, "relative_start_pos": 1, "char_end_idx": 6328, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href"], "values": ["#cite_note-19"]}}, {"key": "html", "type": "local", "char_start_idx": 6324, "relative_start_pos": 0, "char_end_idx": 6328, "relative_end_pos": 1, "value": "sup", "html_attrs": {"attrs": ["id", "class"], "values": ["cite_ref-19", "reference"]}}, {"key": "html", "type": "local", "char_start_idx": 6368, 
"relative_start_pos": 0, "char_end_idx": 6375, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/Guyana", "Guyana"]}}, {"key": "html", "type": "local", "char_start_idx": 6407, "relative_start_pos": 0, "char_end_idx": 6420, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/South_America", "South America"]}}, {"key": "html", "type": "local", "char_start_idx": 6455, "relative_start_pos": 1, "char_end_idx": 6459, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href"], "values": ["#cite_note-20"]}}, {"key": "html", "type": "local", "char_start_idx": 6455, "relative_start_pos": 0, "char_end_idx": 6459, "relative_end_pos": 1, "value": "sup", "html_attrs": {"attrs": ["id", "class"], "values": ["cite_ref-20", "reference"]}}, {"key": "html", "type": "local", "char_start_idx": 6482, "relative_start_pos": 0, "char_end_idx": 6500, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/Arctic_oscillation", "Arctic oscillation"]}}, {"key": "html", "type": "local", "char_start_idx": 6613, "relative_start_pos": 1, "char_end_idx": 6617, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href"], "values": ["#cite_note-21"]}}, {"key": "html", "type": "local", "char_start_idx": 6613, "relative_start_pos": 0, "char_end_idx": 6617, "relative_end_pos": 1, "value": "sup", "html_attrs": {"attrs": ["id", "class"], "values": ["cite_ref-21", "reference"]}}, {"key": "html", "type": "local", "char_start_idx": 6729, "relative_start_pos": 0, "char_end_idx": 6744, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/Central_America", "Central America"]}}, {"key": "html", "type": "local", "char_start_idx": 6745, "relative_start_pos": 1, "char_end_idx": 6749, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href"], "values": ["#cite_note-22"]}}, {"key": "html", "type": "local", 
"char_start_idx": 6745, "relative_start_pos": 0, "char_end_idx": 6749, "relative_end_pos": 1, "value": "sup", "html_attrs": {"attrs": ["id", "class"], "values": ["cite_ref-22", "reference"]}}, {"key": "html", "type": "local", "char_start_idx": 5897, "relative_start_pos": 0, "char_end_idx": 6749, "relative_end_pos": 2, "value": "p", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 6865, "relative_start_pos": 0, "char_end_idx": 6882, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/Subtropical_ridge", "Subtropical ridge"]}}, {"key": "html", "type": "local", "char_start_idx": 6913, "relative_start_pos": 0, "char_end_idx": 6926, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "class", "title"], "values": ["/wiki/Caribbean_sea", "mw-redirect", "Caribbean sea"]}}, {"key": "html", "type": "local", "char_start_idx": 7004, "relative_start_pos": 0, "char_end_idx": 7010, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/Sahara", "Sahara"]}}, {"key": "html", "type": "local", "char_start_idx": 7232, "relative_start_pos": 0, "char_end_idx": 7243, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "class", "title"], "values": ["/wiki/Air_quality", "mw-redirect", "Air quality"]}}, {"key": "html", "type": "local", "char_start_idx": 7292, "relative_start_pos": 1, "char_end_idx": 7296, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href"], "values": ["#cite_note-pooraq-23"]}}, {"key": "html", "type": "local", "char_start_idx": 7292, "relative_start_pos": 0, "char_end_idx": 7296, "relative_end_pos": 1, "value": "sup", "html_attrs": {"attrs": ["id", "class"], "values": ["cite_ref-pooraq_23-0", "reference"]}}, {"key": "html", "type": "local", "char_start_idx": 7443, "relative_start_pos": 1, "char_end_idx": 7447, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href"], 
"values": ["#cite_note-24"]}}, {"key": "html", "type": "local", "char_start_idx": 7443, "relative_start_pos": 0, "char_end_idx": 7447, "relative_end_pos": 1, "value": "sup", "html_attrs": {"attrs": ["id", "class"], "values": ["cite_ref-24", "reference"]}}, {"key": "html", "type": "local", "char_start_idx": 7624, "relative_start_pos": 1, "char_end_idx": 7628, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href"], "values": ["#cite_note-25"]}}, {"key": "html", "type": "local", "char_start_idx": 7624, "relative_start_pos": 0, "char_end_idx": 7628, "relative_end_pos": 1, "value": "sup", "html_attrs": {"attrs": ["id", "class"], "values": ["cite_ref-25", "reference"]}}, {"key": "html", "type": "local", "char_start_idx": 7688, "relative_start_pos": 0, "char_end_idx": 7699, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/Coral_reef", "Coral reef"]}}, {"key": "html", "type": "local", "char_start_idx": 7760, "relative_start_pos": 1, "char_end_idx": 7764, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href"], "values": ["#cite_note-26"]}}, {"key": "html", "type": "local", "char_start_idx": 7760, "relative_start_pos": 0, "char_end_idx": 7764, "relative_end_pos": 1, "value": "sup", "html_attrs": {"attrs": ["id", "class"], "values": ["cite_ref-26", "reference"]}}, {"key": "html", "type": "local", "char_start_idx": 6750, "relative_start_pos": 0, "char_end_idx": 7764, "relative_end_pos": 2, "value": "p", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 7765, "relative_start_pos": 1, "char_end_idx": 7773, "relative_end_pos": 0, "value": "span", "html_attrs": {"attrs": ["class", "id"], "values": ["mw-headline", "See_also"]}}, {"key": "html", "type": "local", "char_start_idx": 7773, "relative_start_pos": 2, "char_end_idx": 7774, "relative_end_pos": 0, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["mw-editsection-bracket"]}}, {"key": "html", 
"type": "local", "char_start_idx": 7774, "relative_start_pos": 1, "char_end_idx": 7778, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/w/index.php?title=Trade_winds&action=edit§ion=4", "Edit section: See also"]}}, {"key": "html", "type": "local", "char_start_idx": 7778, "relative_start_pos": 1, "char_end_idx": 7779, "relative_end_pos": 0, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["mw-editsection-bracket"]}}, {"key": "html", "type": "local", "char_start_idx": 7773, "relative_start_pos": 1, "char_end_idx": 7779, "relative_end_pos": 1, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["mw-editsection"]}}, {"key": "html", "type": "local", "char_start_idx": 7765, "relative_start_pos": 0, "char_end_idx": 7779, "relative_end_pos": 2, "value": "h2", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 7780, "relative_start_pos": 2, "char_end_idx": 7810, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/Intertropical_Convergence_Zone", "Intertropical Convergence Zone"]}}, {"key": "html", "type": "local", "char_start_idx": 7780, "relative_start_pos": 1, "char_end_idx": 7810, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 7811, "relative_start_pos": 1, "char_end_idx": 7835, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/Winds_in_the_Age_of_Sail", "Winds in the Age of Sail"]}}, {"key": "html", "type": "local", "char_start_idx": 7811, "relative_start_pos": 0, "char_end_idx": 7835, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 7836, "relative_start_pos": 1, "char_end_idx": 7855, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": 
["/wiki/Westerly_wind_burst", "Westerly wind burst"]}}, {"key": "html", "type": "local", "char_start_idx": 7836, "relative_start_pos": 0, "char_end_idx": 7855, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 7856, "relative_start_pos": 1, "char_end_idx": 7868, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/Volta_do_mar", "Volta do mar"]}}, {"key": "html", "type": "local", "char_start_idx": 7856, "relative_start_pos": 0, "char_end_idx": 7868, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 7780, "relative_start_pos": 0, "char_end_idx": 7869, "relative_end_pos": 0, "value": "ul", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 7869, "relative_start_pos": 2, "char_end_idx": 7879, "relative_end_pos": 0, "value": "span", "html_attrs": {"attrs": ["class", "id"], "values": ["mw-headline", "References"]}}, {"key": "html", "type": "local", "char_start_idx": 7879, "relative_start_pos": 2, "char_end_idx": 7880, "relative_end_pos": 0, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["mw-editsection-bracket"]}}, {"key": "html", "type": "local", "char_start_idx": 7880, "relative_start_pos": 1, "char_end_idx": 7884, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/w/index.php?title=Trade_winds&action=edit§ion=5", "Edit section: References"]}}, {"key": "html", "type": "local", "char_start_idx": 7884, "relative_start_pos": 1, "char_end_idx": 7885, "relative_end_pos": 0, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["mw-editsection-bracket"]}}, {"key": "html", "type": "local", "char_start_idx": 7879, "relative_start_pos": 1, "char_end_idx": 7885, "relative_end_pos": 1, "value": "span", "html_attrs": {"attrs": ["class"], "values": 
["mw-editsection"]}}, {"key": "html", "type": "local", "char_start_idx": 7869, "relative_start_pos": 1, "char_end_idx": 7885, "relative_end_pos": 2, "value": "h2", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 7886, "relative_start_pos": 6, "char_end_idx": 7894, "relative_end_pos": 0, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["cite-accessibility-label"]}}, {"key": "html", "type": "local", "char_start_idx": 7886, "relative_start_pos": 5, "char_end_idx": 7895, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href"], "values": ["#cite_ref-1"]}}, {"key": "html", "type": "local", "char_start_idx": 7886, "relative_start_pos": 4, "char_end_idx": 7895, "relative_end_pos": 1, "value": "b", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 7886, "relative_start_pos": 3, "char_end_idx": 7895, "relative_end_pos": 2, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["mw-cite-backlink"]}}, {"key": "html", "type": "local", "char_start_idx": 7989, "relative_start_pos": 0, "char_end_idx": 8030, "relative_end_pos": 0, "value": "i", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 8045, "relative_start_pos": 0, "char_end_idx": 8057, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/Random_House", "Random House"]}}, {"key": "html", "type": "local", "char_start_idx": 8068, "relative_start_pos": 0, "char_end_idx": 8072, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/International_Standard_Book_Number", "International Standard Book Number"]}}, {"key": "html", "type": "local", "char_start_idx": 8073, "relative_start_pos": 0, "char_end_idx": 8086, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/Special:BookSources/0-375-42560-8", 
"Special:BookSources/0-375-42560-8"]}}, {"key": "html", "type": "local", "char_start_idx": 7896, "relative_start_pos": 1, "char_end_idx": 8087, "relative_end_pos": 0, "value": "cite", "html_attrs": {"attrs": ["class"], "values": ["citation book"]}}, {"key": "html", "type": "local", "char_start_idx": 8088, "relative_start_pos": 1, "char_end_idx": 8088, "relative_end_pos": 2, "value": "span", "html_attrs": {"attrs": ["style"], "values": ["display:none;"]}}, {"key": "html", "type": "local", "char_start_idx": 8088, "relative_start_pos": 0, "char_end_idx": 8088, "relative_end_pos": 3, "value": "span", "html_attrs": {"attrs": ["title", "class"], "values": ["ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=book&rft.btitle=Random+House+Webster%27s+College+Dictionary&rft.pages=1385&rft.edition=second&rft.pub=Random+House&rft.date=2001&rft.isbn=0-375-42560-8&rft.au=Carol+G.+Braham&rft.au=Enid+Pearsons&rft.au=Deborah+M.+Posner&rft.au=Georgia+S.+Maas&rft.au=Richard+Goodman&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATrade+winds", "Z3988"]}}, {"key": "html", "type": "local", "char_start_idx": 7896, "relative_start_pos": 0, "char_end_idx": 8088, "relative_end_pos": 4, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["reference-text"]}}, {"key": "html", "type": "local", "char_start_idx": 7886, "relative_start_pos": 2, "char_end_idx": 8088, "relative_end_pos": 5, "value": "li", "html_attrs": {"attrs": ["id"], "values": ["cite_note-1"]}}, {"key": "html", "type": "local", "char_start_idx": 8088, "relative_start_pos": 10, "char_end_idx": 8096, "relative_end_pos": 0, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["cite-accessibility-label"]}}, {"key": "html", "type": "local", "char_start_idx": 8088, "relative_start_pos": 9, "char_end_idx": 8097, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href"], "values": ["#cite_ref-2"]}}, {"key": "html", "type": "local", "char_start_idx": 8088, "relative_start_pos": 8, 
"char_end_idx": 8097, "relative_end_pos": 1, "value": "b", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 8088, "relative_start_pos": 7, "char_end_idx": 8097, "relative_end_pos": 2, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["mw-cite-backlink"]}}, {"key": "html", "type": "local", "char_start_idx": 8125, "relative_start_pos": 1, "char_end_idx": 8189, "relative_end_pos": 0, "value": "i", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 8125, "relative_start_pos": 0, "char_end_idx": 8189, "relative_end_pos": 1, "value": "a", "html_attrs": {"attrs": ["rel", "class", "href"], "values": ["nofollow", "external text", "https://books.google.com/books?id=w47gOifvK6EC&pg=PA38&lpg=PA38&dq=knowledge+of+atlantic+winds+and+currents+history#v=onepage&q=&f=false"]}}, {"key": "html", "type": "local", "char_start_idx": 8210, "relative_start_pos": 0, "char_end_idx": 8214, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/International_Standard_Book_Number", "International Standard Book Number"]}}, {"key": "html", "type": "local", "char_start_idx": 8215, "relative_start_pos": 0, "char_end_idx": 8232, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/Special:BookSources/978-1-4067-7740-6", "Special:BookSources/978-1-4067-7740-6"]}}, {"key": "html", "type": "local", "char_start_idx": 8244, "relative_start_pos": 0, "char_end_idx": 8254, "relative_end_pos": 0, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["nowrap"]}}, {"key": "html", "type": "local", "char_start_idx": 8232, "relative_start_pos": 1, "char_end_idx": 8254, "relative_end_pos": 1, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["reference-accessdate"]}}, {"key": "html", "type": "local", "char_start_idx": 8098, "relative_start_pos": 1, "char_end_idx": 8255, "relative_end_pos": 0, "value": 
"cite", "html_attrs": {"attrs": ["class"], "values": ["citation book"]}}, {"key": "html", "type": "local", "char_start_idx": 8256, "relative_start_pos": 1, "char_end_idx": 8256, "relative_end_pos": 2, "value": "span", "html_attrs": {"attrs": ["style"], "values": ["display:none;"]}}, {"key": "html", "type": "local", "char_start_idx": 8256, "relative_start_pos": 0, "char_end_idx": 8256, "relative_end_pos": 3, "value": "span", "html_attrs": {"attrs": ["title", "class"], "values": ["ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=book&rft.btitle=Years+of+This+Land+-+A+Geographical+History+of+the+United+States&rft.pages=38&rft.pub=Read+Books&rft.date=2007&rft.isbn=978-1-4067-7740-6&rft.au=Hermann+R.+Muelder&rft_id=https%3A%2F%2Fbooks.google.com%2Fbooks%3Fid%3Dw47gOifvK6EC%26pg%3DPA38%26lpg%3DPA38%26dq%3Dknowledge%2Bof%2Batlantic%2Bwinds%2Band%2Bcurrents%2Bhistory%23v%3Donepage%26q%3D%26f%3Dfalse&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATrade+winds", "Z3988"]}}, {"key": "html", "type": "local", "char_start_idx": 8098, "relative_start_pos": 0, "char_end_idx": 8256, "relative_end_pos": 4, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["reference-text"]}}, {"key": "html", "type": "local", "char_start_idx": 8088, "relative_start_pos": 6, "char_end_idx": 8256, "relative_end_pos": 5, "value": "li", "html_attrs": {"attrs": ["id"], "values": ["cite_note-2"]}}, {"key": "html", "type": "local", "char_start_idx": 8258, "relative_start_pos": 1, "char_end_idx": 8270, "relative_end_pos": 0, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["cite-accessibility-label"]}}, {"key": "html", "type": "local", "char_start_idx": 8270, "relative_start_pos": 3, "char_end_idx": 8271, "relative_end_pos": 0, "value": "b", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 8270, "relative_start_pos": 2, "char_end_idx": 8271, "relative_end_pos": 1, "value": "i", "html_attrs": {"attrs": [], "values": 
[]}}, {"key": "html", "type": "local", "char_start_idx": 8270, "relative_start_pos": 1, "char_end_idx": 8271, "relative_end_pos": 2, "value": "sup", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 8258, "relative_start_pos": 0, "char_end_idx": 8271, "relative_end_pos": 3, "value": "a", "html_attrs": {"attrs": ["href"], "values": ["#cite_ref-pg18_3-0"]}}, {"key": "html", "type": "local", "char_start_idx": 8272, "relative_start_pos": 3, "char_end_idx": 8273, "relative_end_pos": 0, "value": "b", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 8272, "relative_start_pos": 2, "char_end_idx": 8273, "relative_end_pos": 1, "value": "i", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 8272, "relative_start_pos": 1, "char_end_idx": 8273, "relative_end_pos": 2, "value": "sup", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 8272, "relative_start_pos": 0, "char_end_idx": 8273, "relative_end_pos": 3, "value": "a", "html_attrs": {"attrs": ["href"], "values": ["#cite_ref-pg18_3-1"]}}, {"key": "html", "type": "local", "char_start_idx": 8256, "relative_start_pos": 7, "char_end_idx": 8273, "relative_end_pos": 4, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["mw-cite-backlink"]}}, {"key": "html", "type": "local", "char_start_idx": 8294, "relative_start_pos": 1, "char_end_idx": 8394, "relative_end_pos": 0, "value": "i", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 8294, "relative_start_pos": 0, "char_end_idx": 8394, "relative_end_pos": 1, "value": "a", "html_attrs": {"attrs": ["rel", "class", "href"], "values": ["nofollow", "external text", "https://books.google.com/books?id=0Z26YL407SkC&pg=PA152&lpg=PA152&dq=sailing+historic+use+of+trade+winds+book#v=onepage&q=&f=false"]}}, {"key": "html", "type": "local", "char_start_idx": 8423, 
"relative_start_pos": 0, "char_end_idx": 8427, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/International_Standard_Book_Number", "International Standard Book Number"]}}, {"key": "html", "type": "local", "char_start_idx": 8428, "relative_start_pos": 0, "char_end_idx": 8445, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/Special:BookSources/978-1-55054-865-5", "Special:BookSources/978-1-55054-865-5"]}}, {"key": "html", "type": "local", "char_start_idx": 8457, "relative_start_pos": 0, "char_end_idx": 8467, "relative_end_pos": 0, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["nowrap"]}}, {"key": "html", "type": "local", "char_start_idx": 8445, "relative_start_pos": 1, "char_end_idx": 8467, "relative_end_pos": 1, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["reference-accessdate"]}}, {"key": "html", "type": "local", "char_start_idx": 8274, "relative_start_pos": 1, "char_end_idx": 8468, "relative_end_pos": 0, "value": "cite", "html_attrs": {"attrs": ["class"], "values": ["citation book"]}}, {"key": "html", "type": "local", "char_start_idx": 8469, "relative_start_pos": 1, "char_end_idx": 8469, "relative_end_pos": 2, "value": "span", "html_attrs": {"attrs": ["style"], "values": ["display:none;"]}}, {"key": "html", "type": "local", "char_start_idx": 8469, "relative_start_pos": 0, "char_end_idx": 8469, "relative_end_pos": 3, "value": "span", "html_attrs": {"attrs": ["title", "class"], "values": 
["ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=book&rft.btitle=Historical+atlas+of+the+North+Pacific+Ocean%3A+maps+of+discovery+and+scientific+exploration%2C+1500-2000&rft.pages=18&rft.pub=Douglas+%26+McIntyre&rft.date=2001&rft.isbn=978-1-55054-865-5&rft.au=Derek+Hayes&rft_id=https%3A%2F%2Fbooks.google.com%2Fbooks%3Fid%3D0Z26YL407SkC%26pg%3DPA152%26lpg%3DPA152%26dq%3Dsailing%2Bhistoric%2Buse%2Bof%2Btrade%2Bwinds%2Bbook%23v%3Donepage%26q%3D%26f%3Dfalse&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATrade+winds", "Z3988"]}}, {"key": "html", "type": "local", "char_start_idx": 8274, "relative_start_pos": 0, "char_end_idx": 8469, "relative_end_pos": 4, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["reference-text"]}}, {"key": "html", "type": "local", "char_start_idx": 8256, "relative_start_pos": 6, "char_end_idx": 8469, "relative_end_pos": 5, "value": "li", "html_attrs": {"attrs": ["id"], "values": ["cite_note-pg18-3"]}}, {"key": "html", "type": "local", "char_start_idx": 8469, "relative_start_pos": 10, "char_end_idx": 8477, "relative_end_pos": 0, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["cite-accessibility-label"]}}, {"key": "html", "type": "local", "char_start_idx": 8469, "relative_start_pos": 9, "char_end_idx": 8478, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href"], "values": ["#cite_ref-4"]}}, {"key": "html", "type": "local", "char_start_idx": 8469, "relative_start_pos": 8, "char_end_idx": 8478, "relative_end_pos": 1, "value": "b", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 8469, "relative_start_pos": 7, "char_end_idx": 8478, "relative_end_pos": 2, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["mw-cite-backlink"]}}, {"key": "html", "type": "local", "char_start_idx": 8509, "relative_start_pos": 1, "char_end_idx": 8544, "relative_end_pos": 0, "value": "i", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", 
"type": "local", "char_start_idx": 8509, "relative_start_pos": 0, "char_end_idx": 8544, "relative_end_pos": 1, "value": "a", "html_attrs": {"attrs": ["rel", "class", "href"], "values": ["nofollow", "external text", "https://books.google.com/books?id=V0wAAAAAYAAJ&pg=PA19&lpg=PA19&dq=sailing+using+the+trade+winds+book#v=onepage&q=&f=false"]}}, {"key": "html", "type": "local", "char_start_idx": 8588, "relative_start_pos": 0, "char_end_idx": 8598, "relative_end_pos": 0, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["nowrap"]}}, {"key": "html", "type": "local", "char_start_idx": 8576, "relative_start_pos": 0, "char_end_idx": 8598, "relative_end_pos": 1, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["reference-accessdate"]}}, {"key": "html", "type": "local", "char_start_idx": 8479, "relative_start_pos": 1, "char_end_idx": 8599, "relative_end_pos": 0, "value": "cite", "html_attrs": {"attrs": ["class"], "values": ["citation book"]}}, {"key": "html", "type": "local", "char_start_idx": 8600, "relative_start_pos": 1, "char_end_idx": 8600, "relative_end_pos": 2, "value": "span", "html_attrs": {"attrs": ["style"], "values": ["display:none;"]}}, {"key": "html", "type": "local", "char_start_idx": 8600, "relative_start_pos": 0, "char_end_idx": 8600, "relative_end_pos": 3, "value": "span", "html_attrs": {"attrs": ["title", "class"], "values": ["ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=book&rft.btitle=A+text-book+of+commercial+geography&rft.pages=19&rft.pub=D.+Appleton+and+company&rft.date=1904&rft.au=Cyrus+Cornelius+Adams&rft_id=https%3A%2F%2Fbooks.google.com%2Fbooks%3Fid%3DV0wAAAAAYAAJ%26pg%3DPA19%26lpg%3DPA19%26dq%3Dsailing%2Busing%2Bthe%2Btrade%2Bwinds%2Bbook%23v%3Donepage%26q%3D%26f%3Dfalse&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATrade+winds", "Z3988"]}}, {"key": "html", "type": "local", "char_start_idx": 8479, "relative_start_pos": 0, "char_end_idx": 8600, "relative_end_pos": 4, "value": "span", 
"html_attrs": {"attrs": ["class"], "values": ["reference-text"]}}, {"key": "html", "type": "local", "char_start_idx": 8469, "relative_start_pos": 6, "char_end_idx": 8600, "relative_end_pos": 5, "value": "li", "html_attrs": {"attrs": ["id"], "values": ["cite_note-4"]}}, {"key": "html", "type": "local", "char_start_idx": 8600, "relative_start_pos": 10, "char_end_idx": 8608, "relative_end_pos": 0, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["cite-accessibility-label"]}}, {"key": "html", "type": "local", "char_start_idx": 8600, "relative_start_pos": 9, "char_end_idx": 8609, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href"], "values": ["#cite_ref-5"]}}, {"key": "html", "type": "local", "char_start_idx": 8600, "relative_start_pos": 8, "char_end_idx": 8609, "relative_end_pos": 1, "value": "b", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 8600, "relative_start_pos": 7, "char_end_idx": 8609, "relative_end_pos": 2, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["mw-cite-backlink"]}}, {"key": "html", "type": "local", "char_start_idx": 8610, "relative_start_pos": 2, "char_end_idx": 8635, "relative_end_pos": 0, "value": "i", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 8610, "relative_start_pos": 1, "char_end_idx": 8652, "relative_end_pos": 0, "value": "cite", "html_attrs": {"attrs": ["class"], "values": ["citation book"]}}, {"key": "html", "type": "local", "char_start_idx": 8653, "relative_start_pos": 1, "char_end_idx": 8653, "relative_end_pos": 2, "value": "span", "html_attrs": {"attrs": ["style"], "values": ["display:none;"]}}, {"key": "html", "type": "local", "char_start_idx": 8653, "relative_start_pos": 0, "char_end_idx": 8653, "relative_end_pos": 3, "value": "span", "html_attrs": {"attrs": ["title", "class"], "values": 
["ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=book&rft.btitle=Oxford+English+Dictionary&rft.pages=225&rft.edition=2&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATrade+winds", "Z3988"]}}, {"key": "html", "type": "local", "char_start_idx": 8610, "relative_start_pos": 0, "char_end_idx": 8653, "relative_end_pos": 4, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["reference-text"]}}, {"key": "html", "type": "local", "char_start_idx": 8600, "relative_start_pos": 6, "char_end_idx": 8653, "relative_end_pos": 5, "value": "li", "html_attrs": {"attrs": ["id"], "values": ["cite_note-5"]}}, {"key": "html", "type": "local", "char_start_idx": 8653, "relative_start_pos": 10, "char_end_idx": 8661, "relative_end_pos": 0, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["cite-accessibility-label"]}}, {"key": "html", "type": "local", "char_start_idx": 8653, "relative_start_pos": 9, "char_end_idx": 8662, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href"], "values": ["#cite_ref-6"]}}, {"key": "html", "type": "local", "char_start_idx": 8653, "relative_start_pos": 8, "char_end_idx": 8662, "relative_end_pos": 1, "value": "b", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 8653, "relative_start_pos": 7, "char_end_idx": 8662, "relative_end_pos": 2, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["mw-cite-backlink"]}}, {"key": "html", "type": "local", "char_start_idx": 8683, "relative_start_pos": 1, "char_end_idx": 8783, "relative_end_pos": 0, "value": "i", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 8683, "relative_start_pos": 0, "char_end_idx": 8783, "relative_end_pos": 1, "value": "a", "html_attrs": {"attrs": ["rel", "class", "href"], "values": ["nofollow", "external text", 
"https://books.google.com/books?id=0Z26YL407SkC&pg=PA152&lpg=PA152&dq=sailing+historic+use+of+trade+winds+book#v=onepage&q=&f=false"]}}, {"key": "html", "type": "local", "char_start_idx": 8813, "relative_start_pos": 0, "char_end_idx": 8817, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/International_Standard_Book_Number", "International Standard Book Number"]}}, {"key": "html", "type": "local", "char_start_idx": 8818, "relative_start_pos": 0, "char_end_idx": 8835, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/Special:BookSources/978-1-55054-865-5", "Special:BookSources/978-1-55054-865-5"]}}, {"key": "html", "type": "local", "char_start_idx": 8847, "relative_start_pos": 0, "char_end_idx": 8857, "relative_end_pos": 0, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["nowrap"]}}, {"key": "html", "type": "local", "char_start_idx": 8835, "relative_start_pos": 1, "char_end_idx": 8857, "relative_end_pos": 1, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["reference-accessdate"]}}, {"key": "html", "type": "local", "char_start_idx": 8663, "relative_start_pos": 1, "char_end_idx": 8858, "relative_end_pos": 0, "value": "cite", "html_attrs": {"attrs": ["class"], "values": ["citation book"]}}, {"key": "html", "type": "local", "char_start_idx": 8859, "relative_start_pos": 1, "char_end_idx": 8859, "relative_end_pos": 2, "value": "span", "html_attrs": {"attrs": ["style"], "values": ["display:none;"]}}, {"key": "html", "type": "local", "char_start_idx": 8859, "relative_start_pos": 0, "char_end_idx": 8859, "relative_end_pos": 3, "value": "span", "html_attrs": {"attrs": ["title", "class"], "values": 
["ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=book&rft.btitle=Historical+atlas+of+the+North+Pacific+Ocean%3A+maps+of+discovery+and+scientific+exploration%2C+1500-2000&rft.pages=152&rft.pub=Douglas+%26+McIntyre&rft.date=2001&rft.isbn=978-1-55054-865-5&rft.au=Derek+Hayes&rft_id=https%3A%2F%2Fbooks.google.com%2Fbooks%3Fid%3D0Z26YL407SkC%26pg%3DPA152%26lpg%3DPA152%26dq%3Dsailing%2Bhistoric%2Buse%2Bof%2Btrade%2Bwinds%2Bbook%23v%3Donepage%26q%3D%26f%3Dfalse&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATrade+winds", "Z3988"]}}, {"key": "html", "type": "local", "char_start_idx": 8663, "relative_start_pos": 0, "char_end_idx": 8859, "relative_end_pos": 4, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["reference-text"]}}, {"key": "html", "type": "local", "char_start_idx": 8653, "relative_start_pos": 6, "char_end_idx": 8859, "relative_end_pos": 5, "value": "li", "html_attrs": {"attrs": ["id"], "values": ["cite_note-6"]}}, {"key": "html", "type": "local", "char_start_idx": 8861, "relative_start_pos": 1, "char_end_idx": 8873, "relative_end_pos": 0, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["cite-accessibility-label"]}}, {"key": "html", "type": "local", "char_start_idx": 8873, "relative_start_pos": 3, "char_end_idx": 8874, "relative_end_pos": 0, "value": "b", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 8873, "relative_start_pos": 2, "char_end_idx": 8874, "relative_end_pos": 1, "value": "i", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 8873, "relative_start_pos": 1, "char_end_idx": 8874, "relative_end_pos": 2, "value": "sup", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 8861, "relative_start_pos": 0, "char_end_idx": 8874, "relative_end_pos": 3, "value": "a", "html_attrs": {"attrs": ["href"], "values": ["#cite_ref-dold_7-0"]}}, {"key": "html", "type": "local", 
"char_start_idx": 8875, "relative_start_pos": 3, "char_end_idx": 8876, "relative_end_pos": 0, "value": "b", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 8875, "relative_start_pos": 2, "char_end_idx": 8876, "relative_end_pos": 1, "value": "i", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 8875, "relative_start_pos": 1, "char_end_idx": 8876, "relative_end_pos": 2, "value": "sup", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 8875, "relative_start_pos": 0, "char_end_idx": 8876, "relative_end_pos": 3, "value": "a", "html_attrs": {"attrs": ["href"], "values": ["#cite_ref-dold_7-1"]}}, {"key": "html", "type": "local", "char_start_idx": 8859, "relative_start_pos": 7, "char_end_idx": 8876, "relative_end_pos": 4, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["mw-cite-backlink"]}}, {"key": "html", "type": "local", "char_start_idx": 8903, "relative_start_pos": 1, "char_end_idx": 8930, "relative_end_pos": 0, "value": "i", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 8903, "relative_start_pos": 0, "char_end_idx": 8930, "relative_end_pos": 1, "value": "a", "html_attrs": {"attrs": ["rel", "class", "href"], "values": ["nofollow", "external text", "https://books.google.com/books?id=u-EMrG4bYJkC&pg=PA110&lpg=PA110&dq=trade+winds+converge+in+the+doldrums+book#v=onepage&q=&f=false"]}}, {"key": "html", "type": "local", "char_start_idx": 8971, "relative_start_pos": 0, "char_end_idx": 8975, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/International_Standard_Book_Number", "International Standard Book Number"]}}, {"key": "html", "type": "local", "char_start_idx": 8976, "relative_start_pos": 0, "char_end_idx": 8993, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": 
["/wiki/Special:BookSources/978-1-4437-2300-8", "Special:BookSources/978-1-4437-2300-8"]}}, {"key": "html", "type": "local", "char_start_idx": 9005, "relative_start_pos": 0, "char_end_idx": 9015, "relative_end_pos": 0, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["nowrap"]}}, {"key": "html", "type": "local", "char_start_idx": 8993, "relative_start_pos": 1, "char_end_idx": 9015, "relative_end_pos": 1, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["reference-accessdate"]}}, {"key": "html", "type": "local", "char_start_idx": 8877, "relative_start_pos": 1, "char_end_idx": 9016, "relative_end_pos": 0, "value": "cite", "html_attrs": {"attrs": ["class"], "values": ["citation book"]}}, {"key": "html", "type": "local", "char_start_idx": 9017, "relative_start_pos": 1, "char_end_idx": 9017, "relative_end_pos": 2, "value": "span", "html_attrs": {"attrs": ["style"], "values": ["display:none;"]}}, {"key": "html", "type": "local", "char_start_idx": 9017, "relative_start_pos": 0, "char_end_idx": 9017, "relative_end_pos": 3, "value": "span", "html_attrs": {"attrs": ["title", "class"], "values": ["ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=book&rft.btitle=Introduction+to+Meteorology&rft.pages=110&rft.pub=Mcgraw-Hill+Book+Company%2C+Inc.&rft.date=1941&rft.isbn=978-1-4437-2300-8&rft.au=Sverre+Petterssen&rft_id=https%3A%2F%2Fbooks.google.com%2Fbooks%3Fid%3Du-EMrG4bYJkC%26pg%3DPA110%26lpg%3DPA110%26dq%3Dtrade%2Bwinds%2Bconverge%2Bin%2Bthe%2Bdoldrums%2Bbook%23v%3Donepage%26q%3D%26f%3Dfalse&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATrade+winds", "Z3988"]}}, {"key": "html", "type": "local", "char_start_idx": 8877, "relative_start_pos": 0, "char_end_idx": 9017, "relative_end_pos": 4, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["reference-text"]}}, {"key": "html", "type": "local", "char_start_idx": 8859, "relative_start_pos": 6, "char_end_idx": 9017, "relative_end_pos": 5, "value": "li", "html_attrs": 
{"attrs": ["id"], "values": ["cite_note-dold-7"]}}, {"key": "html", "type": "local", "char_start_idx": 9017, "relative_start_pos": 10, "char_end_idx": 9025, "relative_end_pos": 0, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["cite-accessibility-label"]}}, {"key": "html", "type": "local", "char_start_idx": 9017, "relative_start_pos": 9, "char_end_idx": 9026, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href"], "values": ["#cite_ref-8"]}}, {"key": "html", "type": "local", "char_start_idx": 9017, "relative_start_pos": 8, "char_end_idx": 9026, "relative_end_pos": 1, "value": "b", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 9017, "relative_start_pos": 7, "char_end_idx": 9026, "relative_end_pos": 2, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["mw-cite-backlink"]}}, {"key": "html", "type": "local", "char_start_idx": 9064, "relative_start_pos": 0, "char_end_idx": 9074, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["rel", "class", "href"], "values": ["nofollow", "external text", "http://amsglossary.allenpress.com/glossary/search?p=1&query=doldrums&submit=Search"]}}, {"key": "html", "type": "local", "char_start_idx": 9076, "relative_start_pos": 0, "char_end_idx": 9107, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/American_Meteorological_Society", "American Meteorological Society"]}}, {"key": "html", "type": "local", "char_start_idx": 9119, "relative_start_pos": 0, "char_end_idx": 9129, "relative_end_pos": 0, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["nowrap"]}}, {"key": "html", "type": "local", "char_start_idx": 9107, "relative_start_pos": 1, "char_end_idx": 9129, "relative_end_pos": 1, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["reference-accessdate"]}}, {"key": "html", "type": "local", "char_start_idx": 9027, "relative_start_pos": 1, "char_end_idx": 9130, 
"relative_end_pos": 0, "value": "cite", "html_attrs": {"attrs": ["class"], "values": ["citation web"]}}, {"key": "html", "type": "local", "char_start_idx": 9131, "relative_start_pos": 1, "char_end_idx": 9131, "relative_end_pos": 2, "value": "span", "html_attrs": {"attrs": ["style"], "values": ["display:none;"]}}, {"key": "html", "type": "local", "char_start_idx": 9131, "relative_start_pos": 0, "char_end_idx": 9131, "relative_end_pos": 3, "value": "span", "html_attrs": {"attrs": ["title", "class"], "values": ["ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=unknown&rft.btitle=Doldrums&rft.pub=American+Meteorological+Society&rft.date=2000-06&rft.au=Glossary+of+Meteorology&rft_id=http%3A%2F%2Famsglossary.allenpress.com%2Fglossary%2Fsearch%3Fp%3D1%26query%3Ddoldrums%26submit%3DSearch&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATrade+winds", "Z3988"]}}, {"key": "html", "type": "local", "char_start_idx": 9027, "relative_start_pos": 0, "char_end_idx": 9131, "relative_end_pos": 4, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["reference-text"]}}, {"key": "html", "type": "local", "char_start_idx": 9017, "relative_start_pos": 6, "char_end_idx": 9131, "relative_end_pos": 5, "value": "li", "html_attrs": {"attrs": ["id"], "values": ["cite_note-8"]}}, {"key": "html", "type": "local", "char_start_idx": 9131, "relative_start_pos": 10, "char_end_idx": 9139, "relative_end_pos": 0, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["cite-accessibility-label"]}}, {"key": "html", "type": "local", "char_start_idx": 9131, "relative_start_pos": 9, "char_end_idx": 9140, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href"], "values": ["#cite_ref-9"]}}, {"key": "html", "type": "local", "char_start_idx": 9131, "relative_start_pos": 8, "char_end_idx": 9140, "relative_end_pos": 1, "value": "b", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 9131, "relative_start_pos": 7, 
"char_end_idx": 9140, "relative_end_pos": 2, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["mw-cite-backlink"]}}, {"key": "html", "type": "local", "char_start_idx": 9178, "relative_start_pos": 0, "char_end_idx": 9210, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["rel", "class", "href"], "values": ["nofollow", "external text", "http://amsglossary.allenpress.com/glossary/search?p=1&query=intertropical+convergence+zone&submit=Search"]}}, {"key": "html", "type": "local", "char_start_idx": 9212, "relative_start_pos": 0, "char_end_idx": 9243, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/American_Meteorological_Society", "American Meteorological Society"]}}, {"key": "html", "type": "local", "char_start_idx": 9255, "relative_start_pos": 0, "char_end_idx": 9265, "relative_end_pos": 0, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["nowrap"]}}, {"key": "html", "type": "local", "char_start_idx": 9243, "relative_start_pos": 1, "char_end_idx": 9265, "relative_end_pos": 1, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["reference-accessdate"]}}, {"key": "html", "type": "local", "char_start_idx": 9141, "relative_start_pos": 1, "char_end_idx": 9266, "relative_end_pos": 0, "value": "cite", "html_attrs": {"attrs": ["class"], "values": ["citation web"]}}, {"key": "html", "type": "local", "char_start_idx": 9267, "relative_start_pos": 1, "char_end_idx": 9267, "relative_end_pos": 2, "value": "span", "html_attrs": {"attrs": ["style"], "values": ["display:none;"]}}, {"key": "html", "type": "local", "char_start_idx": 9267, "relative_start_pos": 0, "char_end_idx": 9267, "relative_end_pos": 3, "value": "span", "html_attrs": {"attrs": ["title", "class"], "values": 
["ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=unknown&rft.btitle=Intertropical+Convergence+Zone&rft.pub=American+Meteorological+Society&rft.date=2000-06&rft.au=Glossary+of+Meteorology&rft_id=http%3A%2F%2Famsglossary.allenpress.com%2Fglossary%2Fsearch%3Fp%3D1%26query%3Dintertropical%2Bconvergence%2Bzone%26submit%3DSearch&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATrade+winds", "Z3988"]}}, {"key": "html", "type": "local", "char_start_idx": 9141, "relative_start_pos": 0, "char_end_idx": 9267, "relative_end_pos": 4, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["reference-text"]}}, {"key": "html", "type": "local", "char_start_idx": 9131, "relative_start_pos": 6, "char_end_idx": 9267, "relative_end_pos": 5, "value": "li", "html_attrs": {"attrs": ["id"], "values": ["cite_note-9"]}}, {"key": "html", "type": "local", "char_start_idx": 9267, "relative_start_pos": 10, "char_end_idx": 9275, "relative_end_pos": 0, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["cite-accessibility-label"]}}, {"key": "html", "type": "local", "char_start_idx": 9267, "relative_start_pos": 9, "char_end_idx": 9276, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href"], "values": ["#cite_ref-10"]}}, {"key": "html", "type": "local", "char_start_idx": 9267, "relative_start_pos": 8, "char_end_idx": 9276, "relative_end_pos": 1, "value": "b", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 9267, "relative_start_pos": 7, "char_end_idx": 9276, "relative_end_pos": 2, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["mw-cite-backlink"]}}, {"key": "html", "type": "local", "char_start_idx": 9314, "relative_start_pos": 0, "char_end_idx": 9330, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["rel", "class", "href"], "values": ["nofollow", "external text", 
"https://web.archive.org/web/20090617192300/http://amsglossary.allenpress.com/glossary/search?p=1&query=monsoon+trough&submit=Search"]}}, {"key": "html", "type": "local", "char_start_idx": 9332, "relative_start_pos": 0, "char_end_idx": 9363, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/American_Meteorological_Society", "American Meteorological Society"]}}, {"key": "html", "type": "local", "char_start_idx": 9379, "relative_start_pos": 0, "char_end_idx": 9391, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["rel", "class", "href"], "values": ["nofollow", "external text", "http://amsglossary.allenpress.com/glossary/search?p=1&query=monsoon+trough&submit=Search"]}}, {"key": "html", "type": "local", "char_start_idx": 9417, "relative_start_pos": 0, "char_end_idx": 9427, "relative_end_pos": 0, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["nowrap"]}}, {"key": "html", "type": "local", "char_start_idx": 9405, "relative_start_pos": 0, "char_end_idx": 9427, "relative_end_pos": 1, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["reference-accessdate"]}}, {"key": "html", "type": "local", "char_start_idx": 9277, "relative_start_pos": 1, "char_end_idx": 9428, "relative_end_pos": 0, "value": "cite", "html_attrs": {"attrs": ["class"], "values": ["citation web"]}}, {"key": "html", "type": "local", "char_start_idx": 9429, "relative_start_pos": 1, "char_end_idx": 9429, "relative_end_pos": 2, "value": "span", "html_attrs": {"attrs": ["style"], "values": ["display:none;"]}}, {"key": "html", "type": "local", "char_start_idx": 9429, "relative_start_pos": 0, "char_end_idx": 9429, "relative_end_pos": 3, "value": "span", "html_attrs": {"attrs": ["title", "class"], "values": 
["ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=unknown&rft.btitle=Monsoon+Trough&rft.pub=American+Meteorological+Society&rft.date=2000-06&rft.au=Glossary+of+Meteorology&rft_id=http%3A%2F%2Famsglossary.allenpress.com%2Fglossary%2Fsearch%3Fp%3D1%26query%3Dmonsoon%2Btrough%26submit%3DSearch&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATrade+winds", "Z3988"]}}, {"key": "html", "type": "local", "char_start_idx": 9277, "relative_start_pos": 0, "char_end_idx": 9429, "relative_end_pos": 4, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["reference-text"]}}, {"key": "html", "type": "local", "char_start_idx": 9267, "relative_start_pos": 6, "char_end_idx": 9429, "relative_end_pos": 5, "value": "li", "html_attrs": {"attrs": ["id"], "values": ["cite_note-10"]}}, {"key": "html", "type": "local", "char_start_idx": 9429, "relative_start_pos": 10, "char_end_idx": 9437, "relative_end_pos": 0, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["cite-accessibility-label"]}}, {"key": "html", "type": "local", "char_start_idx": 9429, "relative_start_pos": 9, "char_end_idx": 9438, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href"], "values": ["#cite_ref-11"]}}, {"key": "html", "type": "local", "char_start_idx": 9429, "relative_start_pos": 8, "char_end_idx": 9438, "relative_end_pos": 1, "value": "b", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 9429, "relative_start_pos": 7, "char_end_idx": 9438, "relative_end_pos": 2, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["mw-cite-backlink"]}}, {"key": "html", "type": "local", "char_start_idx": 9476, "relative_start_pos": 0, "char_end_idx": 9490, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["rel", "class", "href"], "values": ["nofollow", "external text", "http://amsglossary.allenpress.com/glossary/search?p=1&query=superior+air&submit=Search"]}}, {"key": "html", "type": "local", 
"char_start_idx": 9492, "relative_start_pos": 0, "char_end_idx": 9523, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/American_Meteorological_Society", "American Meteorological Society"]}}, {"key": "html", "type": "local", "char_start_idx": 9535, "relative_start_pos": 0, "char_end_idx": 9545, "relative_end_pos": 0, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["nowrap"]}}, {"key": "html", "type": "local", "char_start_idx": 9523, "relative_start_pos": 1, "char_end_idx": 9545, "relative_end_pos": 1, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["reference-accessdate"]}}, {"key": "html", "type": "local", "char_start_idx": 9439, "relative_start_pos": 1, "char_end_idx": 9546, "relative_end_pos": 0, "value": "cite", "html_attrs": {"attrs": ["class"], "values": ["citation web"]}}, {"key": "html", "type": "local", "char_start_idx": 9547, "relative_start_pos": 1, "char_end_idx": 9547, "relative_end_pos": 2, "value": "span", "html_attrs": {"attrs": ["style"], "values": ["display:none;"]}}, {"key": "html", "type": "local", "char_start_idx": 9547, "relative_start_pos": 0, "char_end_idx": 9547, "relative_end_pos": 3, "value": "span", "html_attrs": {"attrs": ["title", "class"], "values": ["ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=unknown&rft.btitle=Superior+air&rft.pub=American+Meteorological+Society&rft.date=2000-06&rft.au=Glossary+of+Meteorology&rft_id=http%3A%2F%2Famsglossary.allenpress.com%2Fglossary%2Fsearch%3Fp%3D1%26query%3Dsuperior%2Bair%26submit%3DSearch&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATrade+winds", "Z3988"]}}, {"key": "html", "type": "local", "char_start_idx": 9439, "relative_start_pos": 0, "char_end_idx": 9547, "relative_end_pos": 4, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["reference-text"]}}, {"key": "html", "type": "local", "char_start_idx": 9429, "relative_start_pos": 6, "char_end_idx": 9547, "relative_end_pos": 5, 
"value": "li", "html_attrs": {"attrs": ["id"], "values": ["cite_note-11"]}}, {"key": "html", "type": "local", "char_start_idx": 9547, "relative_start_pos": 10, "char_end_idx": 9555, "relative_end_pos": 0, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["cite-accessibility-label"]}}, {"key": "html", "type": "local", "char_start_idx": 9547, "relative_start_pos": 9, "char_end_idx": 9556, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href"], "values": ["#cite_ref-12"]}}, {"key": "html", "type": "local", "char_start_idx": 9547, "relative_start_pos": 8, "char_end_idx": 9556, "relative_end_pos": 1, "value": "b", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 9547, "relative_start_pos": 7, "char_end_idx": 9556, "relative_end_pos": 2, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["mw-cite-backlink"]}}, {"key": "html", "type": "local", "char_start_idx": 9589, "relative_start_pos": 0, "char_end_idx": 9602, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["rel", "class", "href"], "values": ["nofollow", "external text", "https://web.archive.org/web/20081211050708/http://amsglossary.allenpress.com/glossary/search?id=trade-winds1"]}}, {"key": "html", "type": "local", "char_start_idx": 9604, "relative_start_pos": 0, "char_end_idx": 9627, "relative_end_pos": 0, "value": "i", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 9676, "relative_start_pos": 0, "char_end_idx": 9688, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["rel", "class", "href"], "values": ["nofollow", "external text", "http://amsglossary.allenpress.com/glossary/search?id=trade-winds1"]}}, {"key": "html", "type": "local", "char_start_idx": 9714, "relative_start_pos": 0, "char_end_idx": 9724, "relative_end_pos": 0, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["nowrap"]}}, {"key": "html", "type": "local", "char_start_idx": 9702, 
"relative_start_pos": 0, "char_end_idx": 9724, "relative_end_pos": 1, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["reference-accessdate"]}}, {"key": "html", "type": "local", "char_start_idx": 9557, "relative_start_pos": 1, "char_end_idx": 9725, "relative_end_pos": 0, "value": "cite", "html_attrs": {"attrs": ["class"], "values": ["citation web"]}}, {"key": "html", "type": "local", "char_start_idx": 9726, "relative_start_pos": 1, "char_end_idx": 9726, "relative_end_pos": 2, "value": "span", "html_attrs": {"attrs": ["style"], "values": ["display:none;"]}}, {"key": "html", "type": "local", "char_start_idx": 9726, "relative_start_pos": 0, "char_end_idx": 9726, "relative_end_pos": 3, "value": "span", "html_attrs": {"attrs": ["title", "class"], "values": ["ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=Glossary+of+Meteorology&rft.atitle=trade+winds&rft.date=2009&rft.au=Glossary+of+Meteorology&rft_id=http%3A%2F%2Famsglossary.allenpress.com%2Fglossary%2Fsearch%3Fid%3Dtrade-winds1&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATrade+winds", "Z3988"]}}, {"key": "html", "type": "local", "char_start_idx": 9557, "relative_start_pos": 0, "char_end_idx": 9726, "relative_end_pos": 4, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["reference-text"]}}, {"key": "html", "type": "local", "char_start_idx": 9547, "relative_start_pos": 6, "char_end_idx": 9726, "relative_end_pos": 5, "value": "li", "html_attrs": {"attrs": ["id"], "values": ["cite_note-12"]}}, {"key": "html", "type": "local", "char_start_idx": 9726, "relative_start_pos": 10, "char_end_idx": 9734, "relative_end_pos": 0, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["cite-accessibility-label"]}}, {"key": "html", "type": "local", "char_start_idx": 9726, "relative_start_pos": 9, "char_end_idx": 9735, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href"], "values": ["#cite_ref-Ralph_13-0"]}}, {"key": "html", "type": 
"local", "char_start_idx": 9726, "relative_start_pos": 8, "char_end_idx": 9735, "relative_end_pos": 1, "value": "b", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 9726, "relative_start_pos": 7, "char_end_idx": 9735, "relative_end_pos": 2, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["mw-cite-backlink"]}}, {"key": "html", "type": "local", "char_start_idx": 9788, "relative_start_pos": 0, "char_end_idx": 9807, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["rel", "class", "href"], "values": ["nofollow", "external text", "https://books.google.com/books?id=OLMXAAAAIAAJ&pg=PA246&lpg=PA246&dq=direction+of+the+prevailing+westerlies+in+northern+hemisphere+southwest&source=bl&ots=C2SzvUDsje&sig=dkFc55QfdoJhBCygMCVIu9u-4_s&hl=en&ei=5eXlSfzwKIvKMJOUlYQJ&sa=X&oi=book_result&ct=result&resnum=3"]}}, {"key": "html", "type": "local", "char_start_idx": 9736, "relative_start_pos": 0, "char_end_idx": 9871, "relative_end_pos": 0, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["reference-text"]}}, {"key": "html", "type": "local", "char_start_idx": 9726, "relative_start_pos": 6, "char_end_idx": 9871, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["id"], "values": ["cite_note-Ralph-13"]}}, {"key": "html", "type": "local", "char_start_idx": 9872, "relative_start_pos": 4, "char_end_idx": 9880, "relative_end_pos": 0, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["cite-accessibility-label"]}}, {"key": "html", "type": "local", "char_start_idx": 9872, "relative_start_pos": 3, "char_end_idx": 9881, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href"], "values": ["#cite_ref-HOWTOREAD_14-0"]}}, {"key": "html", "type": "local", "char_start_idx": 9872, "relative_start_pos": 2, "char_end_idx": 9881, "relative_end_pos": 1, "value": "b", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 9872, 
"relative_start_pos": 1, "char_end_idx": 9881, "relative_end_pos": 2, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["mw-cite-backlink"]}}, {"key": "html", "type": "local", "char_start_idx": 9900, "relative_start_pos": 0, "char_end_idx": 9926, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["rel", "class", "href"], "values": ["nofollow", "external text", "http://www.srh.weather.gov/srh/jetstream/synoptic/wxmaps.htm"]}}, {"key": "html", "type": "local", "char_start_idx": 9928, "relative_start_pos": 0, "char_end_idx": 9952, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/National_Weather_Service", "National Weather Service"]}}, {"key": "html", "type": "local", "char_start_idx": 9964, "relative_start_pos": 0, "char_end_idx": 9974, "relative_end_pos": 0, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["nowrap"]}}, {"key": "html", "type": "local", "char_start_idx": 9952, "relative_start_pos": 1, "char_end_idx": 9974, "relative_end_pos": 1, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["reference-accessdate"]}}, {"key": "html", "type": "local", "char_start_idx": 9882, "relative_start_pos": 1, "char_end_idx": 9975, "relative_end_pos": 0, "value": "cite", "html_attrs": {"attrs": ["class"], "values": ["citation web"]}}, {"key": "html", "type": "local", "char_start_idx": 9976, "relative_start_pos": 1, "char_end_idx": 9976, "relative_end_pos": 2, "value": "span", "html_attrs": {"attrs": ["style"], "values": ["display:none;"]}}, {"key": "html", "type": "local", "char_start_idx": 9976, "relative_start_pos": 0, "char_end_idx": 9976, "relative_end_pos": 3, "value": "span", "html_attrs": {"attrs": ["title", "class"], "values": 
["ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=unknown&rft.btitle=How+to+read+weather+maps&rft.pub=National+Weather+Service&rft.date=2008&rft.au=JetStream&rft_id=http%3A%2F%2Fwww.srh.weather.gov%2Fsrh%2Fjetstream%2Fsynoptic%2Fwxmaps.htm&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATrade+winds", "Z3988"]}}, {"key": "html", "type": "local", "char_start_idx": 9882, "relative_start_pos": 0, "char_end_idx": 9976, "relative_end_pos": 4, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["reference-text"]}}, {"key": "html", "type": "local", "char_start_idx": 9872, "relative_start_pos": 0, "char_end_idx": 9976, "relative_end_pos": 5, "value": "li", "html_attrs": {"attrs": ["id"], "values": ["cite_note-HOWTOREAD-14"]}}, {"key": "html", "type": "local", "char_start_idx": 9976, "relative_start_pos": 10, "char_end_idx": 9984, "relative_end_pos": 0, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["cite-accessibility-label"]}}, {"key": "html", "type": "local", "char_start_idx": 9976, "relative_start_pos": 9, "char_end_idx": 9985, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href"], "values": ["#cite_ref-15"]}}, {"key": "html", "type": "local", "char_start_idx": 9976, "relative_start_pos": 8, "char_end_idx": 9985, "relative_end_pos": 1, "value": "b", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 9976, "relative_start_pos": 7, "char_end_idx": 9985, "relative_end_pos": 2, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["mw-cite-backlink"]}}, {"key": "html", "type": "local", "char_start_idx": 10023, "relative_start_pos": 0, "char_end_idx": 10037, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["rel", "class", "href"], "values": ["nofollow", "external text", "http://amsglossary.allenpress.com/glossary/search?p=1&query=tropical+air&submit=Search"]}}, {"key": "html", "type": "local", "char_start_idx": 10039, "relative_start_pos": 0, 
"char_end_idx": 10070, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/American_Meteorological_Society", "American Meteorological Society"]}}, {"key": "html", "type": "local", "char_start_idx": 10082, "relative_start_pos": 0, "char_end_idx": 10092, "relative_end_pos": 0, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["nowrap"]}}, {"key": "html", "type": "local", "char_start_idx": 10070, "relative_start_pos": 1, "char_end_idx": 10092, "relative_end_pos": 1, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["reference-accessdate"]}}, {"key": "html", "type": "local", "char_start_idx": 9986, "relative_start_pos": 1, "char_end_idx": 10093, "relative_end_pos": 0, "value": "cite", "html_attrs": {"attrs": ["class"], "values": ["citation web"]}}, {"key": "html", "type": "local", "char_start_idx": 10094, "relative_start_pos": 1, "char_end_idx": 10094, "relative_end_pos": 2, "value": "span", "html_attrs": {"attrs": ["style"], "values": ["display:none;"]}}, {"key": "html", "type": "local", "char_start_idx": 10094, "relative_start_pos": 0, "char_end_idx": 10094, "relative_end_pos": 3, "value": "span", "html_attrs": {"attrs": ["title", "class"], "values": ["ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=unknown&rft.btitle=Tropical+air&rft.pub=American+Meteorological+Society&rft.date=2000-06&rft.au=Glossary+of+Meteorology&rft_id=http%3A%2F%2Famsglossary.allenpress.com%2Fglossary%2Fsearch%3Fp%3D1%26query%3Dtropical%2Bair%26submit%3DSearch&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATrade+winds", "Z3988"]}}, {"key": "html", "type": "local", "char_start_idx": 9986, "relative_start_pos": 0, "char_end_idx": 10094, "relative_end_pos": 4, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["reference-text"]}}, {"key": "html", "type": "local", "char_start_idx": 9976, "relative_start_pos": 6, "char_end_idx": 10094, "relative_end_pos": 5, "value": "li", "html_attrs": 
{"attrs": ["id"], "values": ["cite_note-15"]}}, {"key": "html", "type": "local", "char_start_idx": 10094, "relative_start_pos": 10, "char_end_idx": 10102, "relative_end_pos": 0, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["cite-accessibility-label"]}}, {"key": "html", "type": "local", "char_start_idx": 10094, "relative_start_pos": 9, "char_end_idx": 10103, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href"], "values": ["#cite_ref-16"]}}, {"key": "html", "type": "local", "char_start_idx": 10094, "relative_start_pos": 8, "char_end_idx": 10103, "relative_end_pos": 1, "value": "b", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 10094, "relative_start_pos": 7, "char_end_idx": 10103, "relative_end_pos": 2, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["mw-cite-backlink"]}}, {"key": "html", "type": "local", "char_start_idx": 10141, "relative_start_pos": 0, "char_end_idx": 10152, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["rel", "class", "href"], "values": ["nofollow", "external text", "http://amsglossary.allenpress.com/glossary/search?id=trade-air1"]}}, {"key": "html", "type": "local", "char_start_idx": 10154, "relative_start_pos": 0, "char_end_idx": 10185, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/American_Meteorological_Society", "American Meteorological Society"]}}, {"key": "html", "type": "local", "char_start_idx": 10197, "relative_start_pos": 0, "char_end_idx": 10207, "relative_end_pos": 0, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["nowrap"]}}, {"key": "html", "type": "local", "char_start_idx": 10185, "relative_start_pos": 1, "char_end_idx": 10207, "relative_end_pos": 1, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["reference-accessdate"]}}, {"key": "html", "type": "local", "char_start_idx": 10104, "relative_start_pos": 1, "char_end_idx": 10208, 
"relative_end_pos": 0, "value": "cite", "html_attrs": {"attrs": ["class"], "values": ["citation web"]}}, {"key": "html", "type": "local", "char_start_idx": 10209, "relative_start_pos": 1, "char_end_idx": 10209, "relative_end_pos": 2, "value": "span", "html_attrs": {"attrs": ["style"], "values": ["display:none;"]}}, {"key": "html", "type": "local", "char_start_idx": 10209, "relative_start_pos": 0, "char_end_idx": 10209, "relative_end_pos": 3, "value": "span", "html_attrs": {"attrs": ["title", "class"], "values": ["ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=unknown&rft.btitle=Trade+air&rft.pub=American+Meteorological+Society&rft.date=2000-06&rft.au=Glossary+of+Meteorology&rft_id=http%3A%2F%2Famsglossary.allenpress.com%2Fglossary%2Fsearch%3Fid%3Dtrade-air1&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATrade+winds", "Z3988"]}}, {"key": "html", "type": "local", "char_start_idx": 10104, "relative_start_pos": 0, "char_end_idx": 10209, "relative_end_pos": 4, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["reference-text"]}}, {"key": "html", "type": "local", "char_start_idx": 10094, "relative_start_pos": 6, "char_end_idx": 10209, "relative_end_pos": 5, "value": "li", "html_attrs": {"attrs": ["id"], "values": ["cite_note-16"]}}, {"key": "html", "type": "local", "char_start_idx": 10209, "relative_start_pos": 10, "char_end_idx": 10217, "relative_end_pos": 0, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["cite-accessibility-label"]}}, {"key": "html", "type": "local", "char_start_idx": 10209, "relative_start_pos": 9, "char_end_idx": 10218, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href"], "values": ["#cite_ref-17"]}}, {"key": "html", "type": "local", "char_start_idx": 10209, "relative_start_pos": 8, "char_end_idx": 10218, "relative_end_pos": 1, "value": "b", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 10209, "relative_start_pos": 7, "char_end_idx": 
10218, "relative_end_pos": 2, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["mw-cite-backlink"]}}, {"key": "html", "type": "local", "char_start_idx": 10242, "relative_start_pos": 1, "char_end_idx": 10275, "relative_end_pos": 0, "value": "i", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 10242, "relative_start_pos": 0, "char_end_idx": 10275, "relative_end_pos": 1, "value": "a", "html_attrs": {"attrs": ["rel", "class", "href"], "values": ["nofollow", "external text", "https://books.google.com/books?id=-mwbAsxpRr0C&pg=PA406&dq=oxford+english+dictionary+origin+of+trade+wind#v=onepage&q=trade%20wind&f=false"]}}, {"key": "html", "type": "local", "char_start_idx": 10295, "relative_start_pos": 0, "char_end_idx": 10299, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/International_Standard_Book_Number", "International Standard Book Number"]}}, {"key": "html", "type": "local", "char_start_idx": 10300, "relative_start_pos": 0, "char_end_idx": 10317, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/Special:BookSources/978-1-4020-3264-6", "Special:BookSources/978-1-4020-3264-6"]}}, {"key": "html", "type": "local", "char_start_idx": 10329, "relative_start_pos": 0, "char_end_idx": 10339, "relative_end_pos": 0, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["nowrap"]}}, {"key": "html", "type": "local", "char_start_idx": 10317, "relative_start_pos": 1, "char_end_idx": 10339, "relative_end_pos": 1, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["reference-accessdate"]}}, {"key": "html", "type": "local", "char_start_idx": 10219, "relative_start_pos": 1, "char_end_idx": 10340, "relative_end_pos": 0, "value": "cite", "html_attrs": {"attrs": ["class"], "values": ["citation book"]}}, {"key": "html", "type": "local", "char_start_idx": 10341, "relative_start_pos": 1, "char_end_idx": 10341, 
"relative_end_pos": 2, "value": "span", "html_attrs": {"attrs": ["style"], "values": ["display:none;"]}}, {"key": "html", "type": "local", "char_start_idx": 10341, "relative_start_pos": 0, "char_end_idx": 10341, "relative_end_pos": 3, "value": "span", "html_attrs": {"attrs": ["title", "class"], "values": ["ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=book&rft.btitle=Encyclopedia+of+world+climatology&rft.pages=128&rft.pub=Springer&rft.date=2005&rft.isbn=978-1-4020-3264-6&rft.au=John+E.+Oliver&rft_id=https%3A%2F%2Fbooks.google.com%2Fbooks%3Fid%3D-mwbAsxpRr0C%26pg%3DPA406%26dq%3Doxford%2Benglish%2Bdictionary%2Borigin%2Bof%2Btrade%2Bwind%23v%3Donepage%26q%3Dtrade%2520wind%26f%3Dfalse&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATrade+winds", "Z3988"]}}, {"key": "html", "type": "local", "char_start_idx": 10219, "relative_start_pos": 0, "char_end_idx": 10341, "relative_end_pos": 4, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["reference-text"]}}, {"key": "html", "type": "local", "char_start_idx": 10209, "relative_start_pos": 6, "char_end_idx": 10341, "relative_end_pos": 5, "value": "li", "html_attrs": {"attrs": ["id"], "values": ["cite_note-17"]}}, {"key": "html", "type": "local", "char_start_idx": 10341, "relative_start_pos": 10, "char_end_idx": 10349, "relative_end_pos": 0, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["cite-accessibility-label"]}}, {"key": "html", "type": "local", "char_start_idx": 10341, "relative_start_pos": 9, "char_end_idx": 10350, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href"], "values": ["#cite_ref-18"]}}, {"key": "html", "type": "local", "char_start_idx": 10341, "relative_start_pos": 8, "char_end_idx": 10350, "relative_end_pos": 1, "value": "b", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 10341, "relative_start_pos": 7, "char_end_idx": 10350, "relative_end_pos": 2, "value": "span", "html_attrs": {"attrs": 
["class"], "values": ["mw-cite-backlink"]}}, {"key": "html", "type": "local", "char_start_idx": 10376, "relative_start_pos": 0, "char_end_idx": 10430, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["rel", "class", "href"], "values": ["nofollow", "external text", "http://www.atmos.uiuc.edu/~rauber/researchRICO.htm"]}}, {"key": "html", "type": "local", "char_start_idx": 10442, "relative_start_pos": 0, "char_end_idx": 10452, "relative_end_pos": 0, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["nowrap"]}}, {"key": "html", "type": "local", "char_start_idx": 10430, "relative_start_pos": 1, "char_end_idx": 10452, "relative_end_pos": 1, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["reference-accessdate"]}}, {"key": "html", "type": "local", "char_start_idx": 10351, "relative_start_pos": 1, "char_end_idx": 10453, "relative_end_pos": 0, "value": "cite", "html_attrs": {"attrs": ["class"], "values": ["citation web"]}}, {"key": "html", "type": "local", "char_start_idx": 10454, "relative_start_pos": 1, "char_end_idx": 10454, "relative_end_pos": 2, "value": "span", "html_attrs": {"attrs": ["style"], "values": ["display:none;"]}}, {"key": "html", "type": "local", "char_start_idx": 10454, "relative_start_pos": 0, "char_end_idx": 10454, "relative_end_pos": 3, "value": "span", "html_attrs": {"attrs": ["title", "class"], "values": ["ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=unknown&rft.btitle=Research-The+Rain+in+Cumulus+over+the+Ocean+Campaign&rft.date=2009-05-22&rft.au=Bob+Rauber&rft_id=http%3A%2F%2Fwww.atmos.uiuc.edu%2F~rauber%2FresearchRICO.htm&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATrade+winds", "Z3988"]}}, {"key": "html", "type": "local", "char_start_idx": 10351, "relative_start_pos": 0, "char_end_idx": 10454, "relative_end_pos": 4, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["reference-text"]}}, {"key": "html", "type": "local", "char_start_idx": 10341, "relative_start_pos": 
6, "char_end_idx": 10454, "relative_end_pos": 5, "value": "li", "html_attrs": {"attrs": ["id"], "values": ["cite_note-18"]}}, {"key": "html", "type": "local", "char_start_idx": 10454, "relative_start_pos": 10, "char_end_idx": 10462, "relative_end_pos": 0, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["cite-accessibility-label"]}}, {"key": "html", "type": "local", "char_start_idx": 10454, "relative_start_pos": 9, "char_end_idx": 10463, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href"], "values": ["#cite_ref-19"]}}, {"key": "html", "type": "local", "char_start_idx": 10454, "relative_start_pos": 8, "char_end_idx": 10463, "relative_end_pos": 1, "value": "b", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 10454, "relative_start_pos": 7, "char_end_idx": 10463, "relative_end_pos": 2, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["mw-cite-backlink"]}}, {"key": "html", "type": "local", "char_start_idx": 10487, "relative_start_pos": 1, "char_end_idx": 10550, "relative_end_pos": 0, "value": "i", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 10487, "relative_start_pos": 0, "char_end_idx": 10550, "relative_end_pos": 1, "value": "a", "html_attrs": {"attrs": ["rel", "class", "href"], "values": ["nofollow", "external text", "https://books.google.com/books?id=syqPSpliRCwC&pg=PA8&lpg=PA8&dq=behavior+of+trade+winds+climatology+by+season#v=onepage&q=&f=false"]}}, {"key": "html", "type": "local", "char_start_idx": 10568, "relative_start_pos": 0, "char_end_idx": 10572, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/International_Standard_Book_Number", "International Standard Book Number"]}}, {"key": "html", "type": "local", "char_start_idx": 10573, "relative_start_pos": 0, "char_end_idx": 10590, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": 
["/wiki/Special:BookSources/978-0-387-71542-1", "Special:BookSources/978-0-387-71542-1"]}}, {"key": "html", "type": "local", "char_start_idx": 10602, "relative_start_pos": 0, "char_end_idx": 10612, "relative_end_pos": 0, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["nowrap"]}}, {"key": "html", "type": "local", "char_start_idx": 10590, "relative_start_pos": 1, "char_end_idx": 10612, "relative_end_pos": 1, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["reference-accessdate"]}}, {"key": "html", "type": "local", "char_start_idx": 10464, "relative_start_pos": 1, "char_end_idx": 10613, "relative_end_pos": 0, "value": "cite", "html_attrs": {"attrs": ["class"], "values": ["citation book"]}}, {"key": "html", "type": "local", "char_start_idx": 10614, "relative_start_pos": 1, "char_end_idx": 10614, "relative_end_pos": 2, "value": "span", "html_attrs": {"attrs": ["style"], "values": ["display:none;"]}}, {"key": "html", "type": "local", "char_start_idx": 10614, "relative_start_pos": 0, "char_end_idx": 10614, "relative_end_pos": 3, "value": "span", "html_attrs": {"attrs": ["title", "class"], "values": ["ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=book&rft.btitle=Tropical+cyclones%3A+climatology+and+impacts+in+the+South+Pacific&rft.pages=8&rft.pub=Springer&rft.date=2007&rft.isbn=978-0-387-71542-1&rft.au=James+P.+Terry&rft_id=https%3A%2F%2Fbooks.google.com%2Fbooks%3Fid%3DsyqPSpliRCwC%26pg%3DPA8%26lpg%3DPA8%26dq%3Dbehavior%2Bof%2Btrade%2Bwinds%2Bclimatology%2Bby%2Bseason%23v%3Donepage%26q%3D%26f%3Dfalse&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATrade+winds", "Z3988"]}}, {"key": "html", "type": "local", "char_start_idx": 10464, "relative_start_pos": 0, "char_end_idx": 10614, "relative_end_pos": 4, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["reference-text"]}}, {"key": "html", "type": "local", "char_start_idx": 10454, "relative_start_pos": 6, "char_end_idx": 10614, "relative_end_pos": 5, "value": 
"li", "html_attrs": {"attrs": ["id"], "values": ["cite_note-19"]}}, {"key": "html", "type": "local", "char_start_idx": 10614, "relative_start_pos": 10, "char_end_idx": 10622, "relative_end_pos": 0, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["cite-accessibility-label"]}}, {"key": "html", "type": "local", "char_start_idx": 10614, "relative_start_pos": 9, "char_end_idx": 10623, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href"], "values": ["#cite_ref-20"]}}, {"key": "html", "type": "local", "char_start_idx": 10614, "relative_start_pos": 8, "char_end_idx": 10623, "relative_end_pos": 1, "value": "b", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 10614, "relative_start_pos": 7, "char_end_idx": 10623, "relative_end_pos": 2, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["mw-cite-backlink"]}}, {"key": "html", "type": "local", "char_start_idx": 10654, "relative_start_pos": 0, "char_end_idx": 10767, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["rel", "class", "href"], "values": ["nofollow", "external text", "http://www.sciencedirect.com/science?_ob=ArticleURL&_udi=B6V6M-4CMJDBT-1&_user=10&_rdoc=1&_fmt=&_orig=search&_sort=d&_docanchor=&view=c&_searchStrId=1083422780&_rerunOrigin=google&_acct=C000050221&_version=1&_urlVersion=0&_userid=10&md5=78a803b1c209929e82ce69c1ee2a005e"]}}, {"key": "html", "type": "local", "char_start_idx": 10769, "relative_start_pos": 0, "char_end_idx": 10783, "relative_end_pos": 0, "value": "i", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 10785, "relative_start_pos": 0, "char_end_idx": 10788, "relative_end_pos": 0, "value": "b", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 10805, "relative_start_pos": 0, "char_end_idx": 10812, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/Bibcode", 
"Bibcode"]}}, {"key": "html", "type": "local", "char_start_idx": 10813, "relative_start_pos": 0, "char_end_idx": 10832, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["rel", "class", "href"], "values": ["nofollow", "external text", "http://adsabs.harvard.edu/abs/2004MGeol.208..145A"]}}, {"key": "html", "type": "local", "char_start_idx": 10834, "relative_start_pos": 0, "char_end_idx": 10837, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/Digital_object_identifier", "Digital object identifier"]}}, {"key": "html", "type": "local", "char_start_idx": 10838, "relative_start_pos": 0, "char_end_idx": 10866, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["rel", "class", "href"], "values": ["nofollow", "external text", "//doi.org/10.1016%2Fj.margeo.2004.04.007"]}}, {"key": "html", "type": "local", "char_start_idx": 10624, "relative_start_pos": 1, "char_end_idx": 10867, "relative_end_pos": 0, "value": "cite", "html_attrs": {"attrs": ["class"], "values": ["citation journal"]}}, {"key": "html", "type": "local", "char_start_idx": 10868, "relative_start_pos": 1, "char_end_idx": 10868, "relative_end_pos": 2, "value": "span", "html_attrs": {"attrs": ["style"], "values": ["display:none;"]}}, {"key": "html", "type": "local", "char_start_idx": 10868, "relative_start_pos": 0, "char_end_idx": 10868, "relative_end_pos": 3, "value": "span", "html_attrs": {"attrs": ["title", "class"], "values": 
["ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Marine+Geology&rft.atitle=The+influence+of+the+trade+winds+on+the+coastal+development+of+the+Guianas+at+various+scale+levels%3A+a+synthesis&rft.volume=208&rft.issue=2-4&rft.pages=145-151&rft_id=info%3Adoi%2F10.1016%2Fj.margeo.2004.04.007&rft_id=info%3Abibcode%2F2004MGeol.208..145A&rft.au=G.+E.+Pieter&rft.au=F.+Augustinus&rft_id=http%3A%2F%2Fwww.sciencedirect.com%2Fscience%3F_ob%3DArticleURL%26_udi%3DB6V6M-4CMJDBT-1%26_user%3D10%26_rdoc%3D1%26_fmt%3D%26_orig%3Dsearch%26_sort%3Dd%26_docanchor%3D%26view%3Dc%26_searchStrId%3D1083422780%26_rerunOrigin%3Dgoogle%26_acct%3DC000050221%26_version%3D1%26_urlVersion%3D0%26_userid%3D10%26md5%3D78a803b1c209929e82ce69c1ee2a005e&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATrade+winds", "Z3988"]}}, {"key": "html", "type": "local", "char_start_idx": 10624, "relative_start_pos": 0, "char_end_idx": 10868, "relative_end_pos": 4, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["reference-text"]}}, {"key": "html", "type": "local", "char_start_idx": 10614, "relative_start_pos": 6, "char_end_idx": 10868, "relative_end_pos": 5, "value": "li", "html_attrs": {"attrs": ["id"], "values": ["cite_note-20"]}}, {"key": "html", "type": "local", "char_start_idx": 10868, "relative_start_pos": 10, "char_end_idx": 10876, "relative_end_pos": 0, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["cite-accessibility-label"]}}, {"key": "html", "type": "local", "char_start_idx": 10868, "relative_start_pos": 9, "char_end_idx": 10877, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href"], "values": ["#cite_ref-21"]}}, {"key": "html", "type": "local", "char_start_idx": 10868, "relative_start_pos": 8, "char_end_idx": 10877, "relative_end_pos": 1, "value": "b", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 10868, "relative_start_pos": 7, "char_end_idx": 10877, 
"relative_end_pos": 2, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["mw-cite-backlink"]}}, {"key": "html", "type": "local", "char_start_idx": 10904, "relative_start_pos": 0, "char_end_idx": 10953, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["rel", "class", "href"], "values": ["nofollow", "external text", "http://oceanworld.tamu.edu/resources/oceanography-book/oceananddrought.html"]}}, {"key": "html", "type": "local", "char_start_idx": 10955, "relative_start_pos": 0, "char_end_idx": 10975, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/Texas_A%26M_University", "Texas A&M University"]}}, {"key": "html", "type": "local", "char_start_idx": 10987, "relative_start_pos": 0, "char_end_idx": 10997, "relative_end_pos": 0, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["nowrap"]}}, {"key": "html", "type": "local", "char_start_idx": 10975, "relative_start_pos": 1, "char_end_idx": 10997, "relative_end_pos": 1, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["reference-accessdate"]}}, {"key": "html", "type": "local", "char_start_idx": 10878, "relative_start_pos": 1, "char_end_idx": 10998, "relative_end_pos": 0, "value": "cite", "html_attrs": {"attrs": ["class"], "values": ["citation web"]}}, {"key": "html", "type": "local", "char_start_idx": 10999, "relative_start_pos": 1, "char_end_idx": 10999, "relative_end_pos": 2, "value": "span", "html_attrs": {"attrs": ["style"], "values": ["display:none;"]}}, {"key": "html", "type": "local", "char_start_idx": 10999, "relative_start_pos": 0, "char_end_idx": 10999, "relative_end_pos": 3, "value": "span", "html_attrs": {"attrs": ["title", "class"], "values": 
["ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=unknown&rft.btitle=The+Ocean%27s+Influence+on+North+American+Drought&rft.pub=Texas+A%26M+University&rft.date=2005&rft.au=Robert+R.+Steward&rft_id=http%3A%2F%2Foceanworld.tamu.edu%2Fresources%2Foceanography-book%2Foceananddrought.html&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATrade+winds", "Z3988"]}}, {"key": "html", "type": "local", "char_start_idx": 10878, "relative_start_pos": 0, "char_end_idx": 10999, "relative_end_pos": 4, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["reference-text"]}}, {"key": "html", "type": "local", "char_start_idx": 10868, "relative_start_pos": 6, "char_end_idx": 10999, "relative_end_pos": 5, "value": "li", "html_attrs": {"attrs": ["id"], "values": ["cite_note-21"]}}, {"key": "html", "type": "local", "char_start_idx": 10999, "relative_start_pos": 10, "char_end_idx": 11007, "relative_end_pos": 0, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["cite-accessibility-label"]}}, {"key": "html", "type": "local", "char_start_idx": 10999, "relative_start_pos": 9, "char_end_idx": 11008, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href"], "values": ["#cite_ref-22"]}}, {"key": "html", "type": "local", "char_start_idx": 10999, "relative_start_pos": 8, "char_end_idx": 11008, "relative_end_pos": 1, "value": "b", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 10999, "relative_start_pos": 7, "char_end_idx": 11008, "relative_end_pos": 2, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["mw-cite-backlink"]}}, {"key": "html", "type": "local", "char_start_idx": 11032, "relative_start_pos": 1, "char_end_idx": 11065, "relative_end_pos": 0, "value": "i", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 11032, "relative_start_pos": 0, "char_end_idx": 11065, "relative_end_pos": 1, "value": "a", "html_attrs": {"attrs": ["rel", 
"class", "href"], "values": ["nofollow", "external text", "https://books.google.com/books?id=-mwbAsxpRr0C&pg=PA185&lpg=PA185&dq=behavior+of+trade+winds+climatology+by+season#v=onepage&q=&f=false"]}}, {"key": "html", "type": "local", "char_start_idx": 11085, "relative_start_pos": 0, "char_end_idx": 11089, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/International_Standard_Book_Number", "International Standard Book Number"]}}, {"key": "html", "type": "local", "char_start_idx": 11090, "relative_start_pos": 0, "char_end_idx": 11107, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/Special:BookSources/978-1-4020-3264-6", "Special:BookSources/978-1-4020-3264-6"]}}, {"key": "html", "type": "local", "char_start_idx": 11119, "relative_start_pos": 0, "char_end_idx": 11129, "relative_end_pos": 0, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["nowrap"]}}, {"key": "html", "type": "local", "char_start_idx": 11107, "relative_start_pos": 1, "char_end_idx": 11129, "relative_end_pos": 1, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["reference-accessdate"]}}, {"key": "html", "type": "local", "char_start_idx": 11009, "relative_start_pos": 1, "char_end_idx": 11130, "relative_end_pos": 0, "value": "cite", "html_attrs": {"attrs": ["class"], "values": ["citation book"]}}, {"key": "html", "type": "local", "char_start_idx": 11131, "relative_start_pos": 1, "char_end_idx": 11131, "relative_end_pos": 2, "value": "span", "html_attrs": {"attrs": ["style"], "values": ["display:none;"]}}, {"key": "html", "type": "local", "char_start_idx": 11131, "relative_start_pos": 0, "char_end_idx": 11131, "relative_end_pos": 3, "value": "span", "html_attrs": {"attrs": ["title", "class"], "values": 
["ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=book&rft.btitle=Encyclopedia+of+world+climatology&rft.pages=185&rft.pub=Springer&rft.date=2005&rft.isbn=978-1-4020-3264-6&rft.au=John+E.+Oliver&rft_id=https%3A%2F%2Fbooks.google.com%2Fbooks%3Fid%3D-mwbAsxpRr0C%26pg%3DPA185%26lpg%3DPA185%26dq%3Dbehavior%2Bof%2Btrade%2Bwinds%2Bclimatology%2Bby%2Bseason%23v%3Donepage%26q%3D%26f%3Dfalse&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATrade+winds", "Z3988"]}}, {"key": "html", "type": "local", "char_start_idx": 11009, "relative_start_pos": 0, "char_end_idx": 11131, "relative_end_pos": 4, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["reference-text"]}}, {"key": "html", "type": "local", "char_start_idx": 10999, "relative_start_pos": 6, "char_end_idx": 11131, "relative_end_pos": 5, "value": "li", "html_attrs": {"attrs": ["id"], "values": ["cite_note-22"]}}, {"key": "html", "type": "local", "char_start_idx": 11131, "relative_start_pos": 10, "char_end_idx": 11139, "relative_end_pos": 0, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["cite-accessibility-label"]}}, {"key": "html", "type": "local", "char_start_idx": 11131, "relative_start_pos": 9, "char_end_idx": 11140, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href"], "values": ["#cite_ref-pooraq_23-0"]}}, {"key": "html", "type": "local", "char_start_idx": 11131, "relative_start_pos": 8, "char_end_idx": 11140, "relative_end_pos": 1, "value": "b", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 11131, "relative_start_pos": 7, "char_end_idx": 11140, "relative_end_pos": 2, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["mw-cite-backlink"]}}, {"key": "html", "type": "local", "char_start_idx": 11169, "relative_start_pos": 0, "char_end_idx": 11241, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["rel", "class", "href"], "values": ["nofollow", "external text", 
"http://www.sciencedaily.com/releases/1999/07/990714073433.htm"]}}, {"key": "html", "type": "local", "char_start_idx": 11141, "relative_start_pos": 0, "char_end_idx": 11266, "relative_end_pos": 0, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["reference-text"]}}, {"key": "html", "type": "local", "char_start_idx": 11131, "relative_start_pos": 6, "char_end_idx": 11266, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["id"], "values": ["cite_note-pooraq-23"]}}, {"key": "html", "type": "local", "char_start_idx": 11267, "relative_start_pos": 4, "char_end_idx": 11275, "relative_end_pos": 0, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["cite-accessibility-label"]}}, {"key": "html", "type": "local", "char_start_idx": 11267, "relative_start_pos": 3, "char_end_idx": 11276, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href"], "values": ["#cite_ref-24"]}}, {"key": "html", "type": "local", "char_start_idx": 11267, "relative_start_pos": 2, "char_end_idx": 11276, "relative_end_pos": 1, "value": "b", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 11267, "relative_start_pos": 1, "char_end_idx": 11276, "relative_end_pos": 2, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["mw-cite-backlink"]}}, {"key": "html", "type": "local", "char_start_idx": 11305, "relative_start_pos": 0, "char_end_idx": 11371, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["rel", "class", "href"], "values": ["nofollow", "external text", "http://www.sciencedaily.com/releases/2001/06/010615071508.htm"]}}, {"key": "html", "type": "local", "char_start_idx": 11277, "relative_start_pos": 0, "char_end_idx": 11396, "relative_end_pos": 0, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["reference-text"]}}, {"key": "html", "type": "local", "char_start_idx": 11267, "relative_start_pos": 0, "char_end_idx": 11396, "relative_end_pos": 1, "value": "li", "html_attrs": 
{"attrs": ["id"], "values": ["cite_note-24"]}}, {"key": "html", "type": "local", "char_start_idx": 11397, "relative_start_pos": 4, "char_end_idx": 11405, "relative_end_pos": 0, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["cite-accessibility-label"]}}, {"key": "html", "type": "local", "char_start_idx": 11397, "relative_start_pos": 3, "char_end_idx": 11406, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href"], "values": ["#cite_ref-25"]}}, {"key": "html", "type": "local", "char_start_idx": 11397, "relative_start_pos": 2, "char_end_idx": 11406, "relative_end_pos": 1, "value": "b", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 11397, "relative_start_pos": 1, "char_end_idx": 11406, "relative_end_pos": 2, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["mw-cite-backlink"]}}, {"key": "html", "type": "local", "char_start_idx": 11432, "relative_start_pos": 0, "char_end_idx": 11491, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["rel", "class", "href"], "values": ["nofollow", "external text", "http://www.gcrio.org/OnLnDoc/pdf/african_dust.pdf"]}}, {"key": "html", "type": "local", "char_start_idx": 11492, "relative_start_pos": 0, "char_end_idx": 11500, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["rel", "class", "href"], "values": ["nofollow", "external text", "https://web.archive.org/web/20070620013708/http://www.gcrio.org/OnLnDoc/pdf/african_dust.pdf"]}}, {"key": "html", "type": "local", "char_start_idx": 11519, "relative_start_pos": 0, "char_end_idx": 11534, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/Wayback_Machine", "Wayback Machine"]}}, {"key": "html", "type": "local", "char_start_idx": 11407, "relative_start_pos": 0, "char_end_idx": 11560, "relative_end_pos": 0, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["reference-text"]}}, {"key": "html", "type": "local", 
"char_start_idx": 11397, "relative_start_pos": 0, "char_end_idx": 11560, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["id"], "values": ["cite_note-25"]}}, {"key": "html", "type": "local", "char_start_idx": 11561, "relative_start_pos": 4, "char_end_idx": 11569, "relative_end_pos": 0, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["cite-accessibility-label"]}}, {"key": "html", "type": "local", "char_start_idx": 11561, "relative_start_pos": 3, "char_end_idx": 11570, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href"], "values": ["#cite_ref-26"]}}, {"key": "html", "type": "local", "char_start_idx": 11561, "relative_start_pos": 2, "char_end_idx": 11570, "relative_end_pos": 1, "value": "b", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 11561, "relative_start_pos": 1, "char_end_idx": 11570, "relative_end_pos": 2, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["mw-cite-backlink"]}}, {"key": "html", "type": "local", "char_start_idx": 11571, "relative_start_pos": 1, "char_end_idx": 11594, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "class", "title"], "values": ["/wiki/U._S._Geological_Survey", "mw-redirect", "U. S. 
Geological Survey"]}}, {"key": "html", "type": "local", "char_start_idx": 11603, "relative_start_pos": 0, "char_end_idx": 11636, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["rel", "href", "class"], "values": ["nofollow", "http://coastal.er.usgs.gov/african_dust/", "external text"]}}, {"key": "html", "type": "local", "char_start_idx": 11571, "relative_start_pos": 0, "char_end_idx": 11661, "relative_end_pos": 0, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["reference-text"]}}, {"key": "html", "type": "local", "char_start_idx": 11561, "relative_start_pos": 0, "char_end_idx": 11661, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["id"], "values": ["cite_note-26"]}}, {"key": "html", "type": "local", "char_start_idx": 7886, "relative_start_pos": 1, "char_end_idx": 11662, "relative_end_pos": 0, "value": "ol", "html_attrs": {"attrs": ["class"], "values": ["references"]}}, {"key": "html", "type": "local", "char_start_idx": 7886, "relative_start_pos": 0, "char_end_idx": 11662, "relative_end_pos": 1, "value": "div", "html_attrs": {"attrs": ["class", "style"], "values": ["reflist columns references-column-width", "-moz-column-width: 30em; -webkit-column-width: 30em; column-width: 30em; list-style-type: decimal;"]}}, {"key": "html", "type": "local", "char_start_idx": 78, "relative_start_pos": 2, "char_end_idx": 11662, "relative_end_pos": 2, "value": "div", "html_attrs": {"attrs": ["class"], "values": ["mw-parser-output"]}}, {"key": "html", "type": "local", "char_start_idx": 11662, "relative_start_pos": 4, "char_end_idx": 11662, "relative_end_pos": 5, "value": "img", "html_attrs": {"attrs": ["src", "alt", "title", "width", "height", "style"], "values": ["//en.wikipedia.org/wiki/Special:CentralAutoLogin/start?type=1x1", "", "", "1", "1", "border: none; position: absolute;"]}}, {"key": "html", "type": "local", "char_start_idx": 11662, "relative_start_pos": 3, "char_end_idx": 11662, "relative_end_pos": 6, "value": "noscript", 
"html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 78, "relative_start_pos": 1, "char_end_idx": 11662, "relative_end_pos": 7, "value": "div", "html_attrs": {"attrs": ["id", "lang", "dir", "class"], "values": ["mw-content-text", "en", "ltr", "mw-content-ltr"]}}, {"key": "html", "type": "local", "char_start_idx": 11678, "relative_start_pos": 0, "char_end_idx": 11748, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["dir", "href"], "values": ["ltr", "https://en.wikipedia.org/w/index.php?title=Trade_winds&oldid=817251427"]}}, {"key": "html", "type": "local", "char_start_idx": 11662, "relative_start_pos": 8, "char_end_idx": 11750, "relative_end_pos": 0, "value": "div", "html_attrs": {"attrs": ["class"], "values": ["printfooter"]}}, {"key": "html", "type": "local", "char_start_idx": 11750, "relative_start_pos": 3, "char_end_idx": 11760, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/Help:Category", "Help:Category"]}}, {"key": "html", "type": "local", "char_start_idx": 11762, "relative_start_pos": 2, "char_end_idx": 11778, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/Category:Climate_patterns", "Category:Climate patterns"]}}, {"key": "html", "type": "local", "char_start_idx": 11762, "relative_start_pos": 1, "char_end_idx": 11778, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 11779, "relative_start_pos": 1, "char_end_idx": 11799, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/Category:Atmospheric_dynamics", "Category:Atmospheric dynamics"]}}, {"key": "html", "type": "local", "char_start_idx": 11779, "relative_start_pos": 0, "char_end_idx": 11799, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 
11800, "relative_start_pos": 1, "char_end_idx": 11804, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/Category:Wind", "Category:Wind"]}}, {"key": "html", "type": "local", "char_start_idx": 11800, "relative_start_pos": 0, "char_end_idx": 11804, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 11805, "relative_start_pos": 1, "char_end_idx": 11816, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/Category:Age_of_Sail", "Category:Age of Sail"]}}, {"key": "html", "type": "local", "char_start_idx": 11805, "relative_start_pos": 0, "char_end_idx": 11816, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 11762, "relative_start_pos": 0, "char_end_idx": 11817, "relative_end_pos": 0, "value": "ul", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 11750, "relative_start_pos": 2, "char_end_idx": 11817, "relative_end_pos": 1, "value": "div", "html_attrs": {"attrs": ["id", "class"], "values": ["mw-normal-catlinks", "mw-normal-catlinks"]}}, {"key": "html", "type": "local", "char_start_idx": 11836, "relative_start_pos": 2, "char_end_idx": 11869, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/Category:Webarchive_template_wayback_links", "Category:Webarchive template wayback links"]}}, {"key": "html", "type": "local", "char_start_idx": 11836, "relative_start_pos": 1, "char_end_idx": 11869, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 11870, "relative_start_pos": 1, "char_end_idx": 11883, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/Category:Good_articles", "Category:Good articles"]}}, 
{"key": "html", "type": "local", "char_start_idx": 11870, "relative_start_pos": 0, "char_end_idx": 11883, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 11836, "relative_start_pos": 0, "char_end_idx": 11884, "relative_end_pos": 0, "value": "ul", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 11817, "relative_start_pos": 2, "char_end_idx": 11884, "relative_end_pos": 1, "value": "div", "html_attrs": {"attrs": ["id", "class"], "values": ["mw-hidden-catlinks", "mw-hidden-catlinks mw-hidden-cats-hidden"]}}, {"key": "html", "type": "local", "char_start_idx": 11750, "relative_start_pos": 1, "char_end_idx": 11884, "relative_end_pos": 2, "value": "div", "html_attrs": {"attrs": ["id", "class", "data-mw"], "values": ["catlinks", "catlinks", "interface"]}}, {"key": "html", "type": "local", "char_start_idx": 11884, "relative_start_pos": 3, "char_end_idx": 11884, "relative_end_pos": 4, "value": "div", "html_attrs": {"attrs": ["class"], "values": ["visualClear"]}}, {"key": "html", "type": "local", "char_start_idx": 12, "relative_start_pos": 0, "char_end_idx": 11884, "relative_end_pos": 5, "value": "div", "html_attrs": {"attrs": ["id", "class"], "values": ["bodyContent", "mw-body-content"]}}, {"key": "html", "type": "local", "char_start_idx": 0, "relative_start_pos": 5, "char_end_idx": 11884, "relative_end_pos": 6, "value": "div", "html_attrs": {"attrs": ["id", "class", "role"], "values": ["content", "mw-body", "main"]}}, {"key": "html", "type": "local", "char_start_idx": 11884, "relative_start_pos": 8, "char_end_idx": 11899, "relative_end_pos": 0, "value": "h2", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 11900, "relative_start_pos": 2, "char_end_idx": 11914, "relative_end_pos": 0, "value": "h3", "html_attrs": {"attrs": ["id"], "values": ["p-personal-label"]}}, {"key": "html", "type": "local", 
"char_start_idx": 11915, "relative_start_pos": 1, "char_end_idx": 11928, "relative_end_pos": 0, "value": "li", "html_attrs": {"attrs": ["id"], "values": ["pt-anonuserpage"]}}, {"key": "html", "type": "local", "char_start_idx": 11929, "relative_start_pos": 1, "char_end_idx": 11933, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title", "accesskey"], "values": ["/wiki/Special:MyTalk", "Discussion about edits from this IP address [ctrl-alt-n]", "n"]}}, {"key": "html", "type": "local", "char_start_idx": 11929, "relative_start_pos": 0, "char_end_idx": 11933, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["id"], "values": ["pt-anontalk"]}}, {"key": "html", "type": "local", "char_start_idx": 11934, "relative_start_pos": 1, "char_end_idx": 11947, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title", "accesskey"], "values": ["/wiki/Special:MyContributions", "A list of edits made from this IP address [ctrl-alt-y]", "y"]}}, {"key": "html", "type": "local", "char_start_idx": 11934, "relative_start_pos": 0, "char_end_idx": 11947, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["id"], "values": ["pt-anoncontribs"]}}, {"key": "html", "type": "local", "char_start_idx": 11948, "relative_start_pos": 1, "char_end_idx": 11962, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/w/index.php?title=Special:CreateAccount&returnto=Trade+winds", "You are encouraged to create an account and log in; however, it is not mandatory"]}}, {"key": "html", "type": "local", "char_start_idx": 11948, "relative_start_pos": 0, "char_end_idx": 11962, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["id"], "values": ["pt-createaccount"]}}, {"key": "html", "type": "local", "char_start_idx": 11963, "relative_start_pos": 1, "char_end_idx": 11969, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title", "accesskey"], "values": 
["/w/index.php?title=Special:UserLogin&returnto=Trade+winds", "You're encouraged to log in; however, it's not mandatory. [ctrl-alt-o]", "o"]}}, {"key": "html", "type": "local", "char_start_idx": 11963, "relative_start_pos": 0, "char_end_idx": 11969, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["id"], "values": ["pt-login"]}}, {"key": "html", "type": "local", "char_start_idx": 11915, "relative_start_pos": 0, "char_end_idx": 11970, "relative_end_pos": 0, "value": "ul", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 11900, "relative_start_pos": 1, "char_end_idx": 11970, "relative_end_pos": 1, "value": "div", "html_attrs": {"attrs": ["id", "role", "class", "aria-labelledby"], "values": ["p-personal", "navigation", "", "p-personal-label"]}}, {"key": "html", "type": "local", "char_start_idx": 11970, "relative_start_pos": 4, "char_end_idx": 11980, "relative_end_pos": 0, "value": "h3", "html_attrs": {"attrs": ["id"], "values": ["p-namespaces-label"]}}, {"key": "html", "type": "local", "char_start_idx": 11981, "relative_start_pos": 3, "char_end_idx": 11988, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title", "accesskey"], "values": ["/wiki/Trade_winds", "View the content page [ctrl-alt-c]", "c"]}}, {"key": "html", "type": "local", "char_start_idx": 11981, "relative_start_pos": 2, "char_end_idx": 11988, "relative_end_pos": 1, "value": "span", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 11981, "relative_start_pos": 1, "char_end_idx": 11988, "relative_end_pos": 2, "value": "li", "html_attrs": {"attrs": ["id", "class"], "values": ["ca-nstab-main", "selected"]}}, {"key": "html", "type": "local", "char_start_idx": 11989, "relative_start_pos": 2, "char_end_idx": 11993, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "rel", "title", "accesskey"], "values": ["/wiki/Talk:Trade_winds", "discussion", "Discussion about the 
content page [ctrl-alt-t]", "t"]}}, {"key": "html", "type": "local", "char_start_idx": 11989, "relative_start_pos": 1, "char_end_idx": 11993, "relative_end_pos": 1, "value": "span", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 11989, "relative_start_pos": 0, "char_end_idx": 11993, "relative_end_pos": 2, "value": "li", "html_attrs": {"attrs": ["id"], "values": ["ca-talk"]}}, {"key": "html", "type": "local", "char_start_idx": 11981, "relative_start_pos": 0, "char_end_idx": 11994, "relative_end_pos": 0, "value": "ul", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 11970, "relative_start_pos": 3, "char_end_idx": 11994, "relative_end_pos": 1, "value": "div", "html_attrs": {"attrs": ["id", "role", "class", "aria-labelledby"], "values": ["p-namespaces", "navigation", "vectorTabs", "p-namespaces-label"]}}, {"key": "html", "type": "local", "char_start_idx": 11994, "relative_start_pos": 4, "char_end_idx": 12002, "relative_end_pos": 0, "value": "span", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 11994, "relative_start_pos": 3, "char_end_idx": 12003, "relative_end_pos": 0, "value": "h3", "html_attrs": {"attrs": ["id", "tabindex"], "values": ["p-variants-label", "0"]}}, {"key": "html", "type": "local", "char_start_idx": 12003, "relative_start_pos": 2, "char_end_idx": 12003, "relative_end_pos": 3, "value": "ul", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 12003, "relative_start_pos": 1, "char_end_idx": 12003, "relative_end_pos": 4, "value": "div", "html_attrs": {"attrs": ["class"], "values": ["menu"]}}, {"key": "html", "type": "local", "char_start_idx": 11994, "relative_start_pos": 2, "char_end_idx": 12003, "relative_end_pos": 5, "value": "div", "html_attrs": {"attrs": ["id", "role", "class", "aria-labelledby"], "values": ["p-variants", "navigation", "vectorMenu emptyPortlet", 
"p-variants-label"]}}, {"key": "html", "type": "local", "char_start_idx": 11970, "relative_start_pos": 2, "char_end_idx": 12003, "relative_end_pos": 6, "value": "div", "html_attrs": {"attrs": ["id"], "values": ["left-navigation"]}}, {"key": "html", "type": "local", "char_start_idx": 12003, "relative_start_pos": 9, "char_end_idx": 12008, "relative_end_pos": 0, "value": "h3", "html_attrs": {"attrs": ["id"], "values": ["p-views-label"]}}, {"key": "html", "type": "local", "char_start_idx": 12009, "relative_start_pos": 3, "char_end_idx": 12013, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href"], "values": ["/wiki/Trade_winds"]}}, {"key": "html", "type": "local", "char_start_idx": 12009, "relative_start_pos": 2, "char_end_idx": 12013, "relative_end_pos": 1, "value": "span", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 12009, "relative_start_pos": 1, "char_end_idx": 12013, "relative_end_pos": 2, "value": "li", "html_attrs": {"attrs": ["id", "class"], "values": ["ca-view", "collapsible selected"]}}, {"key": "html", "type": "local", "char_start_idx": 12014, "relative_start_pos": 2, "char_end_idx": 12018, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title", "accesskey"], "values": ["/w/index.php?title=Trade_winds&action=edit", "Edit this page [ctrl-alt-e]", "e"]}}, {"key": "html", "type": "local", "char_start_idx": 12014, "relative_start_pos": 1, "char_end_idx": 12018, "relative_end_pos": 1, "value": "span", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 12014, "relative_start_pos": 0, "char_end_idx": 12018, "relative_end_pos": 2, "value": "li", "html_attrs": {"attrs": ["id", "class"], "values": ["ca-edit", "collapsible"]}}, {"key": "html", "type": "local", "char_start_idx": 12019, "relative_start_pos": 2, "char_end_idx": 12031, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title", "accesskey"], "values": 
["/w/index.php?title=Trade_winds&action=history", "Past revisions of this page [ctrl-alt-h]", "h"]}}, {"key": "html", "type": "local", "char_start_idx": 12019, "relative_start_pos": 1, "char_end_idx": 12031, "relative_end_pos": 1, "value": "span", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 12019, "relative_start_pos": 0, "char_end_idx": 12031, "relative_end_pos": 2, "value": "li", "html_attrs": {"attrs": ["id", "class"], "values": ["ca-history", "collapsible"]}}, {"key": "html", "type": "local", "char_start_idx": 12009, "relative_start_pos": 0, "char_end_idx": 12032, "relative_end_pos": 0, "value": "ul", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 12003, "relative_start_pos": 8, "char_end_idx": 12032, "relative_end_pos": 1, "value": "div", "html_attrs": {"attrs": ["id", "role", "class", "aria-labelledby"], "values": ["p-views", "navigation", "vectorTabs", "p-views-label"]}}, {"key": "html", "type": "local", "char_start_idx": 12032, "relative_start_pos": 4, "char_end_idx": 12036, "relative_end_pos": 0, "value": "span", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 12032, "relative_start_pos": 3, "char_end_idx": 12036, "relative_end_pos": 1, "value": "h3", "html_attrs": {"attrs": ["id", "tabindex"], "values": ["p-cactions-label", "0"]}}, {"key": "html", "type": "local", "char_start_idx": 12037, "relative_start_pos": 1, "char_end_idx": 12037, "relative_end_pos": 2, "value": "ul", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 12037, "relative_start_pos": 0, "char_end_idx": 12037, "relative_end_pos": 3, "value": "div", "html_attrs": {"attrs": ["class"], "values": ["menu"]}}, {"key": "html", "type": "local", "char_start_idx": 12032, "relative_start_pos": 2, "char_end_idx": 12037, "relative_end_pos": 4, "value": "div", "html_attrs": {"attrs": ["id", "role", "class", 
"aria-labelledby"], "values": ["p-cactions", "navigation", "vectorMenu emptyPortlet", "p-cactions-label"]}}, {"key": "html", "type": "local", "char_start_idx": 12037, "relative_start_pos": 7, "char_end_idx": 12043, "relative_end_pos": 0, "value": "label", "html_attrs": {"attrs": ["for"], "values": ["searchInput"]}}, {"key": "html", "type": "local", "char_start_idx": 12037, "relative_start_pos": 6, "char_end_idx": 12044, "relative_end_pos": 0, "value": "h3", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 12037, "relative_start_pos": 5, "char_end_idx": 12044, "relative_end_pos": 1, "value": "div", "html_attrs": {"attrs": ["id", "role"], "values": ["p-search", "search"]}}, {"key": "html", "type": "local", "char_start_idx": 12003, "relative_start_pos": 7, "char_end_idx": 12044, "relative_end_pos": 2, "value": "div", "html_attrs": {"attrs": ["id"], "values": ["right-navigation"]}}, {"key": "html", "type": "local", "char_start_idx": 11900, "relative_start_pos": 0, "char_end_idx": 12044, "relative_end_pos": 3, "value": "div", "html_attrs": {"attrs": ["id"], "values": ["mw-head"]}}, {"key": "html", "type": "local", "char_start_idx": 12044, "relative_start_pos": 6, "char_end_idx": 12044, "relative_end_pos": 7, "value": "a", "html_attrs": {"attrs": ["class", "href", "title"], "values": ["mw-wiki-logo", "/wiki/Main_Page", "Visit the main page"]}}, {"key": "html", "type": "local", "char_start_idx": 12044, "relative_start_pos": 5, "char_end_idx": 12044, "relative_end_pos": 8, "value": "div", "html_attrs": {"attrs": ["id", "role"], "values": ["p-logo", "banner"]}}, {"key": "html", "type": "local", "char_start_idx": 12044, "relative_start_pos": 10, "char_end_idx": 12054, "relative_end_pos": 0, "value": "h3", "html_attrs": {"attrs": ["id"], "values": ["p-navigation-label"]}}, {"key": "html", "type": "local", "char_start_idx": 12055, "relative_start_pos": 3, "char_end_idx": 12064, "relative_end_pos": 0, "value": "a", "html_attrs": 
{"attrs": ["href", "title", "accesskey"], "values": ["/wiki/Main_Page", "Visit the main page [ctrl-alt-z]", "z"]}}, {"key": "html", "type": "local", "char_start_idx": 12055, "relative_start_pos": 2, "char_end_idx": 12064, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["id"], "values": ["n-mainpage-description"]}}, {"key": "html", "type": "local", "char_start_idx": 12065, "relative_start_pos": 1, "char_end_idx": 12073, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/Portal:Contents", "Guides to browsing Wikipedia"]}}, {"key": "html", "type": "local", "char_start_idx": 12065, "relative_start_pos": 0, "char_end_idx": 12073, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["id"], "values": ["n-contents"]}}, {"key": "html", "type": "local", "char_start_idx": 12074, "relative_start_pos": 1, "char_end_idx": 12090, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/Portal:Featured_content", "Featured content \u2013 the best of Wikipedia"]}}, {"key": "html", "type": "local", "char_start_idx": 12074, "relative_start_pos": 0, "char_end_idx": 12090, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["id"], "values": ["n-featuredcontent"]}}, {"key": "html", "type": "local", "char_start_idx": 12091, "relative_start_pos": 1, "char_end_idx": 12105, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/Portal:Current_events", "Find background information on current events"]}}, {"key": "html", "type": "local", "char_start_idx": 12091, "relative_start_pos": 0, "char_end_idx": 12105, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["id"], "values": ["n-currentevents"]}}, {"key": "html", "type": "local", "char_start_idx": 12106, "relative_start_pos": 1, "char_end_idx": 12120, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title", "accesskey"], "values": 
["/wiki/Special:Random", "Load a random article [ctrl-alt-x]", "x"]}}, {"key": "html", "type": "local", "char_start_idx": 12106, "relative_start_pos": 0, "char_end_idx": 12120, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["id"], "values": ["n-randompage"]}}, {"key": "html", "type": "local", "char_start_idx": 12121, "relative_start_pos": 1, "char_end_idx": 12140, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["https://donate.wikimedia.org/wiki/Special:FundraiserRedirector?utm_source=donate&utm_medium=sidebar&utm_campaign=C13_en.wikipedia.org&uselang=en", "Support us"]}}, {"key": "html", "type": "local", "char_start_idx": 12121, "relative_start_pos": 0, "char_end_idx": 12140, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["id"], "values": ["n-sitesupport"]}}, {"key": "html", "type": "local", "char_start_idx": 12141, "relative_start_pos": 1, "char_end_idx": 12156, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["//shop.wikimedia.org", "Visit the Wikipedia store"]}}, {"key": "html", "type": "local", "char_start_idx": 12141, "relative_start_pos": 0, "char_end_idx": 12156, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["id"], "values": ["n-shoplink"]}}, {"key": "html", "type": "local", "char_start_idx": 12055, "relative_start_pos": 1, "char_end_idx": 12157, "relative_end_pos": 0, "value": "ul", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 12055, "relative_start_pos": 0, "char_end_idx": 12157, "relative_end_pos": 1, "value": "div", "html_attrs": {"attrs": ["class"], "values": ["body"]}}, {"key": "html", "type": "local", "char_start_idx": 12044, "relative_start_pos": 9, "char_end_idx": 12157, "relative_end_pos": 2, "value": "div", "html_attrs": {"attrs": ["class", "role", "id", "aria-labelledby"], "values": ["portal", "navigation", "p-navigation", "p-navigation-label"]}}, {"key": "html", 
"type": "local", "char_start_idx": 12157, "relative_start_pos": 4, "char_end_idx": 12168, "relative_end_pos": 0, "value": "h3", "html_attrs": {"attrs": ["id"], "values": ["p-interaction-label"]}}, {"key": "html", "type": "local", "char_start_idx": 12169, "relative_start_pos": 3, "char_end_idx": 12173, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/Help:Contents", "Guidance on how to use and edit Wikipedia"]}}, {"key": "html", "type": "local", "char_start_idx": 12169, "relative_start_pos": 2, "char_end_idx": 12173, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["id"], "values": ["n-help"]}}, {"key": "html", "type": "local", "char_start_idx": 12174, "relative_start_pos": 1, "char_end_idx": 12189, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/Wikipedia:About", "Find out about Wikipedia"]}}, {"key": "html", "type": "local", "char_start_idx": 12174, "relative_start_pos": 0, "char_end_idx": 12189, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["id"], "values": ["n-aboutsite"]}}, {"key": "html", "type": "local", "char_start_idx": 12190, "relative_start_pos": 1, "char_end_idx": 12206, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/Wikipedia:Community_portal", "About the project, what you can do, where to find things"]}}, {"key": "html", "type": "local", "char_start_idx": 12190, "relative_start_pos": 0, "char_end_idx": 12206, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["id"], "values": ["n-portal"]}}, {"key": "html", "type": "local", "char_start_idx": 12207, "relative_start_pos": 1, "char_end_idx": 12221, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title", "accesskey"], "values": ["/wiki/Special:RecentChanges", "A list of recent changes in the wiki [ctrl-alt-r]", "r"]}}, {"key": "html", "type": "local", "char_start_idx": 12207, 
"relative_start_pos": 0, "char_end_idx": 12221, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["id"], "values": ["n-recentchanges"]}}, {"key": "html", "type": "local", "char_start_idx": 12222, "relative_start_pos": 1, "char_end_idx": 12234, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["//en.wikipedia.org/wiki/Wikipedia:Contact_us", "How to contact Wikipedia"]}}, {"key": "html", "type": "local", "char_start_idx": 12222, "relative_start_pos": 0, "char_end_idx": 12234, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["id"], "values": ["n-contactpage"]}}, {"key": "html", "type": "local", "char_start_idx": 12169, "relative_start_pos": 1, "char_end_idx": 12235, "relative_end_pos": 0, "value": "ul", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 12169, "relative_start_pos": 0, "char_end_idx": 12235, "relative_end_pos": 1, "value": "div", "html_attrs": {"attrs": ["class"], "values": ["body"]}}, {"key": "html", "type": "local", "char_start_idx": 12157, "relative_start_pos": 3, "char_end_idx": 12235, "relative_end_pos": 2, "value": "div", "html_attrs": {"attrs": ["class", "role", "id", "aria-labelledby"], "values": ["portal", "navigation", "p-interaction", "p-interaction-label"]}}, {"key": "html", "type": "local", "char_start_idx": 12235, "relative_start_pos": 4, "char_end_idx": 12240, "relative_end_pos": 0, "value": "h3", "html_attrs": {"attrs": ["id"], "values": ["p-tb-label"]}}, {"key": "html", "type": "local", "char_start_idx": 12241, "relative_start_pos": 3, "char_end_idx": 12256, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title", "accesskey"], "values": ["/wiki/Special:WhatLinksHere/Trade_winds", "List of all English Wikipedia pages containing links to this page [ctrl-alt-j]", "j"]}}, {"key": "html", "type": "local", "char_start_idx": 12241, "relative_start_pos": 2, "char_end_idx": 12256, "relative_end_pos": 1, "value": 
"li", "html_attrs": {"attrs": ["id"], "values": ["t-whatlinkshere"]}}, {"key": "html", "type": "local", "char_start_idx": 12257, "relative_start_pos": 1, "char_end_idx": 12272, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "rel", "title", "accesskey"], "values": ["/wiki/Special:RecentChangesLinked/Trade_winds", "nofollow", "Recent changes in pages linked from this page [ctrl-alt-k]", "k"]}}, {"key": "html", "type": "local", "char_start_idx": 12257, "relative_start_pos": 0, "char_end_idx": 12272, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["id"], "values": ["t-recentchangeslinked"]}}, {"key": "html", "type": "local", "char_start_idx": 12273, "relative_start_pos": 1, "char_end_idx": 12284, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title", "accesskey"], "values": ["/wiki/Wikipedia:File_Upload_Wizard", "Upload files [ctrl-alt-u]", "u"]}}, {"key": "html", "type": "local", "char_start_idx": 12273, "relative_start_pos": 0, "char_end_idx": 12284, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["id"], "values": ["t-upload"]}}, {"key": "html", "type": "local", "char_start_idx": 12285, "relative_start_pos": 1, "char_end_idx": 12298, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title", "accesskey"], "values": ["/wiki/Special:SpecialPages", "A list of all special pages [ctrl-alt-q]", "q"]}}, {"key": "html", "type": "local", "char_start_idx": 12285, "relative_start_pos": 0, "char_end_idx": 12298, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["id"], "values": ["t-specialpages"]}}, {"key": "html", "type": "local", "char_start_idx": 12299, "relative_start_pos": 1, "char_end_idx": 12313, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/w/index.php?title=Trade_winds&oldid=817251427", "Permanent link to this revision of the page"]}}, {"key": "html", "type": "local", "char_start_idx": 12299, "relative_start_pos": 
0, "char_end_idx": 12313, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["id"], "values": ["t-permalink"]}}, {"key": "html", "type": "local", "char_start_idx": 12314, "relative_start_pos": 1, "char_end_idx": 12330, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/w/index.php?title=Trade_winds&action=info", "More information about this page"]}}, {"key": "html", "type": "local", "char_start_idx": 12314, "relative_start_pos": 0, "char_end_idx": 12330, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["id"], "values": ["t-info"]}}, {"key": "html", "type": "local", "char_start_idx": 12331, "relative_start_pos": 1, "char_end_idx": 12344, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title", "accesskey"], "values": ["https://www.wikidata.org/wiki/Special:EntityPage/Q160603", "Link to connected data repository item [ctrl-alt-g]", "g"]}}, {"key": "html", "type": "local", "char_start_idx": 12331, "relative_start_pos": 0, "char_end_idx": 12344, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["id"], "values": ["t-wikibase"]}}, {"key": "html", "type": "local", "char_start_idx": 12345, "relative_start_pos": 1, "char_end_idx": 12359, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/w/index.php?title=Special:CiteThisPage&page=Trade_winds&id=817251427", "Information on how to cite this page"]}}, {"key": "html", "type": "local", "char_start_idx": 12345, "relative_start_pos": 0, "char_end_idx": 12359, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["id"], "values": ["t-cite"]}}, {"key": "html", "type": "local", "char_start_idx": 12241, "relative_start_pos": 1, "char_end_idx": 12360, "relative_end_pos": 0, "value": "ul", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 12241, "relative_start_pos": 0, "char_end_idx": 12360, "relative_end_pos": 1, "value": "div", 
"html_attrs": {"attrs": ["class"], "values": ["body"]}}, {"key": "html", "type": "local", "char_start_idx": 12235, "relative_start_pos": 3, "char_end_idx": 12360, "relative_end_pos": 2, "value": "div", "html_attrs": {"attrs": ["class", "role", "id", "aria-labelledby"], "values": ["portal", "navigation", "p-tb", "p-tb-label"]}}, {"key": "html", "type": "local", "char_start_idx": 12360, "relative_start_pos": 4, "char_end_idx": 12372, "relative_end_pos": 0, "value": "h3", "html_attrs": {"attrs": ["id"], "values": ["p-coll-print_export-label"]}}, {"key": "html", "type": "local", "char_start_idx": 12373, "relative_start_pos": 3, "char_end_idx": 12386, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href"], "values": ["/w/index.php?title=Special:Book&bookcmd=book_creator&referer=Trade+winds"]}}, {"key": "html", "type": "local", "char_start_idx": 12373, "relative_start_pos": 2, "char_end_idx": 12386, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["id"], "values": ["coll-create_a_book"]}}, {"key": "html", "type": "local", "char_start_idx": 12387, "relative_start_pos": 1, "char_end_idx": 12402, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href"], "values": ["/w/index.php?title=Special:ElectronPdf&page=Trade+winds&action=show-download-screen"]}}, {"key": "html", "type": "local", "char_start_idx": 12387, "relative_start_pos": 0, "char_end_idx": 12402, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["id"], "values": ["coll-download-as-rdf2latex"]}}, {"key": "html", "type": "local", "char_start_idx": 12403, "relative_start_pos": 1, "char_end_idx": 12420, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title", "accesskey"], "values": ["/w/index.php?title=Trade_winds&printable=yes", "Printable version of this page [ctrl-alt-p]", "p"]}}, {"key": "html", "type": "local", "char_start_idx": 12403, "relative_start_pos": 0, "char_end_idx": 12420, "relative_end_pos": 1, "value": "li", "html_attrs": 
{"attrs": ["id"], "values": ["t-print"]}}, {"key": "html", "type": "local", "char_start_idx": 12373, "relative_start_pos": 1, "char_end_idx": 12421, "relative_end_pos": 0, "value": "ul", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 12373, "relative_start_pos": 0, "char_end_idx": 12421, "relative_end_pos": 1, "value": "div", "html_attrs": {"attrs": ["class"], "values": ["body"]}}, {"key": "html", "type": "local", "char_start_idx": 12360, "relative_start_pos": 3, "char_end_idx": 12421, "relative_end_pos": 2, "value": "div", "html_attrs": {"attrs": ["class", "role", "id", "aria-labelledby"], "values": ["portal", "navigation", "p-coll-print_export", "p-coll-print_export-label"]}}, {"key": "html", "type": "local", "char_start_idx": 12421, "relative_start_pos": 4, "char_end_idx": 12421, "relative_end_pos": 5, "value": "span", "html_attrs": {"attrs": ["class", "title", "tabindex", "role", "aria-haspopup"], "values": ["uls-settings-trigger", "Language settings", "0", "button", "true"]}}, {"key": "html", "type": "local", "char_start_idx": 12421, "relative_start_pos": 6, "char_end_idx": 12430, "relative_end_pos": 0, "value": "h3", "html_attrs": {"attrs": ["id"], "values": ["p-lang-label"]}}, {"key": "html", "type": "local", "char_start_idx": 12431, "relative_start_pos": 3, "char_end_idx": 12438, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title", "lang", "hreflang", "class"], "values": ["https://ar.wikipedia.org/wiki/%D8%B1%D9%8A%D8%A7%D8%AD_%D8%A7%D9%84%D8%AA%D8%AC%D8%A7%D8%B1%D8%A9", "\u0631\u064a\u0627\u062d \u0627\u0644\u062a\u062c\u0627\u0631\u0629 \u2013 Arabic", "ar", "ar", "interlanguage-link-target"]}}, {"key": "html", "type": "local", "char_start_idx": 12431, "relative_start_pos": 2, "char_end_idx": 12438, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["class"], "values": ["interlanguage-link interwiki-ar"]}}, {"key": "html", "type": "local", "char_start_idx": 12439, 
"relative_start_pos": 1, "char_end_idx": 12451, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title", "lang", "hreflang", "class"], "values": ["https://az.wikipedia.org/wiki/Passatlar", "Passatlar \u2013 Azerbaijani", "az", "az", "interlanguage-link-target"]}}, {"key": "html", "type": "local", "char_start_idx": 12439, "relative_start_pos": 0, "char_end_idx": 12451, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["class"], "values": ["interlanguage-link interwiki-az"]}}, {"key": "html", "type": "local", "char_start_idx": 12452, "relative_start_pos": 1, "char_end_idx": 12462, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title", "lang", "hreflang", "class"], "values": ["https://zh-min-nan.wikipedia.org/wiki/B%C5%8D%CD%98-e%CC%8Dk-hong", "B\u014d\u0358-e\u030dk-hong \u2013 Chinese (Min Nan)", "zh-min-nan", "zh-min-nan", "interlanguage-link-target"]}}, {"key": "html", "type": "local", "char_start_idx": 12452, "relative_start_pos": 0, "char_end_idx": 12462, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["class"], "values": ["interlanguage-link interwiki-zh-min-nan"]}}, {"key": "html", "type": "local", "char_start_idx": 12463, "relative_start_pos": 1, "char_end_idx": 12473, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title", "lang", "hreflang", "class"], "values": ["https://be.wikipedia.org/wiki/%D0%9F%D0%B0%D1%81%D0%B0%D1%82", "\u041f\u0430\u0441\u0430\u0442 \u2013 Belarusian", "be", "be", "interlanguage-link-target"]}}, {"key": "html", "type": "local", "char_start_idx": 12463, "relative_start_pos": 0, "char_end_idx": 12473, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["class"], "values": ["interlanguage-link interwiki-be"]}}, {"key": "html", "type": "local", "char_start_idx": 12474, "relative_start_pos": 1, "char_end_idx": 12483, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title", "lang", "hreflang", "class"], 
"values": ["https://bg.wikipedia.org/wiki/%D0%9F%D0%B0%D1%81%D0%B0%D1%82", "\u041f\u0430\u0441\u0430\u0442 \u2013 Bulgarian", "bg", "bg", "interlanguage-link-target"]}}, {"key": "html", "type": "local", "char_start_idx": 12474, "relative_start_pos": 0, "char_end_idx": 12483, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["class"], "values": ["interlanguage-link interwiki-bg"]}}, {"key": "html", "type": "local", "char_start_idx": 12484, "relative_start_pos": 1, "char_end_idx": 12490, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title", "lang", "hreflang", "class"], "values": ["https://ca.wikipedia.org/wiki/Vents_alisis", "Vents alisis \u2013 Catalan", "ca", "ca", "interlanguage-link-target"]}}, {"key": "html", "type": "local", "char_start_idx": 12484, "relative_start_pos": 0, "char_end_idx": 12490, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["class"], "values": ["interlanguage-link interwiki-ca"]}}, {"key": "html", "type": "local", "char_start_idx": 12491, "relative_start_pos": 1, "char_end_idx": 12498, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title", "lang", "hreflang", "class"], "values": ["https://cs.wikipedia.org/wiki/Pas%C3%A1t", "Pas\u00e1t \u2013 Czech", "cs", "cs", "interlanguage-link-target"]}}, {"key": "html", "type": "local", "char_start_idx": 12491, "relative_start_pos": 0, "char_end_idx": 12498, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["class"], "values": ["interlanguage-link interwiki-cs"]}}, {"key": "html", "type": "local", "char_start_idx": 12499, "relative_start_pos": 1, "char_end_idx": 12504, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title", "lang", "hreflang", "class"], "values": ["https://da.wikipedia.org/wiki/Passat", "Passat \u2013 Danish", "da", "da", "interlanguage-link-target"]}}, {"key": "html", "type": "local", "char_start_idx": 12499, "relative_start_pos": 0, "char_end_idx": 12504, 
"relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["class"], "values": ["interlanguage-link interwiki-da"]}}, {"key": "html", "type": "local", "char_start_idx": 12505, "relative_start_pos": 1, "char_end_idx": 12512, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title", "lang", "hreflang", "class"], "values": ["https://de.wikipedia.org/wiki/Passat_(Windsystem)", "Passat (Windsystem) \u2013 German", "de", "de", "interlanguage-link-target"]}}, {"key": "html", "type": "local", "char_start_idx": 12505, "relative_start_pos": 0, "char_end_idx": 12512, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["class"], "values": ["interlanguage-link interwiki-de"]}}, {"key": "html", "type": "local", "char_start_idx": 12513, "relative_start_pos": 1, "char_end_idx": 12518, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title", "lang", "hreflang", "class"], "values": ["https://et.wikipedia.org/wiki/Passaat", "Passaat \u2013 Estonian", "et", "et", "interlanguage-link-target"]}}, {"key": "html", "type": "local", "char_start_idx": 12513, "relative_start_pos": 0, "char_end_idx": 12518, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["class"], "values": ["interlanguage-link interwiki-et"]}}, {"key": "html", "type": "local", "char_start_idx": 12519, "relative_start_pos": 1, "char_end_idx": 12527, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title", "lang", "hreflang", "class"], "values": ["https://el.wikipedia.org/wiki/%CE%91%CE%BB%CE%B7%CE%B3%CE%B5%CE%AF%CF%82_%CE%AC%CE%BD%CE%B5%CE%BC%CE%BF%CE%B9", "\u0391\u03bb\u03b7\u03b3\u03b5\u03af\u03c2 \u03ac\u03bd\u03b5\u03bc\u03bf\u03b9 \u2013 Greek", "el", "el", "interlanguage-link-target"]}}, {"key": "html", "type": "local", "char_start_idx": 12519, "relative_start_pos": 0, "char_end_idx": 12527, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["class"], "values": ["interlanguage-link interwiki-el"]}}, {"key": "html", 
"type": "local", "char_start_idx": 12528, "relative_start_pos": 1, "char_end_idx": 12535, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title", "lang", "hreflang", "class"], "values": ["https://es.wikipedia.org/wiki/Vientos_alisios", "Vientos alisios \u2013 Spanish", "es", "es", "interlanguage-link-target"]}}, {"key": "html", "type": "local", "char_start_idx": 12528, "relative_start_pos": 0, "char_end_idx": 12535, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["class"], "values": ["interlanguage-link interwiki-es"]}}, {"key": "html", "type": "local", "char_start_idx": 12536, "relative_start_pos": 1, "char_end_idx": 12545, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title", "lang", "hreflang", "class"], "values": ["https://eo.wikipedia.org/wiki/Pasato", "Pasato \u2013 Esperanto", "eo", "eo", "interlanguage-link-target"]}}, {"key": "html", "type": "local", "char_start_idx": 12536, "relative_start_pos": 0, "char_end_idx": 12545, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["class"], "values": ["interlanguage-link interwiki-eo"]}}, {"key": "html", "type": "local", "char_start_idx": 12546, "relative_start_pos": 1, "char_end_idx": 12553, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title", "lang", "hreflang", "class"], "values": ["https://eu.wikipedia.org/wiki/Alisio", "Alisio \u2013 Basque", "eu", "eu", "interlanguage-link-target"]}}, {"key": "html", "type": "local", "char_start_idx": 12546, "relative_start_pos": 0, "char_end_idx": 12553, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["class"], "values": ["interlanguage-link interwiki-eu"]}}, {"key": "html", "type": "local", "char_start_idx": 12554, "relative_start_pos": 1, "char_end_idx": 12559, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title", "lang", "hreflang", "class"], "values": 
["https://fa.wikipedia.org/wiki/%D8%A8%D8%A7%D8%AF%D9%87%D8%A7%DB%8C_%D8%A8%D8%B3%D8%A7%D9%85%D8%A7%D9%86", "\u0628\u0627\u062f\u0647\u0627\u06cc \u0628\u0633\u0627\u0645\u0627\u0646 \u2013 Persian", "fa", "fa", "interlanguage-link-target"]}}, {"key": "html", "type": "local", "char_start_idx": 12554, "relative_start_pos": 0, "char_end_idx": 12559, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["class"], "values": ["interlanguage-link interwiki-fa"]}}, {"key": "html", "type": "local", "char_start_idx": 12560, "relative_start_pos": 1, "char_end_idx": 12568, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title", "lang", "hreflang", "class"], "values": ["https://fr.wikipedia.org/wiki/Aliz%C3%A9", "Aliz\u00e9 \u2013 French", "fr", "fr", "interlanguage-link-target"]}}, {"key": "html", "type": "local", "char_start_idx": 12560, "relative_start_pos": 0, "char_end_idx": 12568, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["class"], "values": ["interlanguage-link interwiki-fr"]}}, {"key": "html", "type": "local", "char_start_idx": 12569, "relative_start_pos": 1, "char_end_idx": 12576, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title", "lang", "hreflang", "class"], "values": ["https://ga.wikipedia.org/wiki/Tr%C3%A1dghaotha", "Tr\u00e1dghaotha \u2013 Irish", "ga", "ga", "interlanguage-link-target"]}}, {"key": "html", "type": "local", "char_start_idx": 12569, "relative_start_pos": 0, "char_end_idx": 12576, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["class"], "values": ["interlanguage-link interwiki-ga"]}}, {"key": "html", "type": "local", "char_start_idx": 12577, "relative_start_pos": 1, "char_end_idx": 12583, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title", "lang", "hreflang", "class"], "values": ["https://gl.wikipedia.org/wiki/Alisio", "Alisio \u2013 Galician", "gl", "gl", "interlanguage-link-target"]}}, {"key": "html", "type": "local", 
"char_start_idx": 12577, "relative_start_pos": 0, "char_end_idx": 12583, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["class"], "values": ["interlanguage-link interwiki-gl"]}}, {"key": "html", "type": "local", "char_start_idx": 12584, "relative_start_pos": 1, "char_end_idx": 12587, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title", "lang", "hreflang", "class"], "values": ["https://ko.wikipedia.org/wiki/%EB%AC%B4%EC%97%AD%ED%92%8D", "\ubb34\uc5ed\ud48d \u2013 Korean", "ko", "ko", "interlanguage-link-target"]}}, {"key": "html", "type": "local", "char_start_idx": 12584, "relative_start_pos": 0, "char_end_idx": 12587, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["class"], "values": ["interlanguage-link interwiki-ko"]}}, {"key": "html", "type": "local", "char_start_idx": 12588, "relative_start_pos": 1, "char_end_idx": 12595, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title", "lang", "hreflang", "class"], "values": ["https://hy.wikipedia.org/wiki/%D5%8A%D5%A1%D5%BD%D5%BD%D5%A1%D5%BF%D5%B6%D5%A5%D6%80", "\u054a\u0561\u057d\u057d\u0561\u057f\u0576\u0565\u0580 \u2013 Armenian", "hy", "hy", "interlanguage-link-target"]}}, {"key": "html", "type": "local", "char_start_idx": 12588, "relative_start_pos": 0, "char_end_idx": 12595, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["class"], "values": ["interlanguage-link interwiki-hy"]}}, {"key": "html", "type": "local", "char_start_idx": 12596, "relative_start_pos": 1, "char_end_idx": 12602, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title", "lang", "hreflang", "class"], "values": ["https://hi.wikipedia.org/wiki/%E0%A4%B5%E0%A5%8D%E0%A4%AF%E0%A4%BE%E0%A4%AA%E0%A4%BE%E0%A4%B0%E0%A4%BF%E0%A4%95_%E0%A4%AA%E0%A4%B5%E0%A4%A8", "\u0935\u094d\u092f\u093e\u092a\u093e\u0930\u093f\u0915 \u092a\u0935\u0928 \u2013 Hindi", "hi", "hi", "interlanguage-link-target"]}}, {"key": "html", "type": "local", 
"char_start_idx": 12596, "relative_start_pos": 0, "char_end_idx": 12602, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["class"], "values": ["interlanguage-link interwiki-hi"]}}, {"key": "html", "type": "local", "char_start_idx": 12603, "relative_start_pos": 1, "char_end_idx": 12611, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title", "lang", "hreflang", "class"], "values": ["https://hr.wikipedia.org/wiki/Pasati", "Pasati \u2013 Croatian", "hr", "hr", "interlanguage-link-target"]}}, {"key": "html", "type": "local", "char_start_idx": 12603, "relative_start_pos": 0, "char_end_idx": 12611, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["class"], "values": ["interlanguage-link interwiki-hr"]}}, {"key": "html", "type": "local", "char_start_idx": 12612, "relative_start_pos": 1, "char_end_idx": 12615, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title", "lang", "hreflang", "class"], "values": ["https://io.wikipedia.org/wiki/Alizeo", "Alizeo \u2013 Ido", "io", "io", "interlanguage-link-target"]}}, {"key": "html", "type": "local", "char_start_idx": 12612, "relative_start_pos": 0, "char_end_idx": 12615, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["class"], "values": ["interlanguage-link interwiki-io"]}}, {"key": "html", "type": "local", "char_start_idx": 12616, "relative_start_pos": 1, "char_end_idx": 12632, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title", "lang", "hreflang", "class"], "values": ["https://id.wikipedia.org/wiki/Angin_pasat", "Angin pasat \u2013 Indonesian", "id", "id", "interlanguage-link-target"]}}, {"key": "html", "type": "local", "char_start_idx": 12616, "relative_start_pos": 0, "char_end_idx": 12632, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["class"], "values": ["interlanguage-link interwiki-id"]}}, {"key": "html", "type": "local", "char_start_idx": 12633, "relative_start_pos": 1, "char_end_idx": 12637, 
"relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title", "lang", "hreflang", "class"], "values": ["https://os.wikipedia.org/wiki/%D0%9F%D0%B0%D1%81%D1%81%D0%B0%D1%82", "\u041f\u0430\u0441\u0441\u0430\u0442 \u2013 Ossetic", "os", "os", "interlanguage-link-target"]}}, {"key": "html", "type": "local", "char_start_idx": 12633, "relative_start_pos": 0, "char_end_idx": 12637, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["class"], "values": ["interlanguage-link interwiki-os"]}}, {"key": "html", "type": "local", "char_start_idx": 12638, "relative_start_pos": 1, "char_end_idx": 12646, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title", "lang", "hreflang", "class"], "values": ["https://is.wikipedia.org/wiki/Sta%C3%B0vindur", "Sta\u00f0vindur \u2013 Icelandic", "is", "is", "interlanguage-link-target"]}}, {"key": "html", "type": "local", "char_start_idx": 12638, "relative_start_pos": 0, "char_end_idx": 12646, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["class"], "values": ["interlanguage-link interwiki-is"]}}, {"key": "html", "type": "local", "char_start_idx": 12647, "relative_start_pos": 1, "char_end_idx": 12655, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title", "lang", "hreflang", "class"], "values": ["https://it.wikipedia.org/wiki/Aliseo", "Aliseo \u2013 Italian", "it", "it", "interlanguage-link-target"]}}, {"key": "html", "type": "local", "char_start_idx": 12647, "relative_start_pos": 0, "char_end_idx": 12655, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["class"], "values": ["interlanguage-link interwiki-it"]}}, {"key": "html", "type": "local", "char_start_idx": 12656, "relative_start_pos": 1, "char_end_idx": 12661, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title", "lang", "hreflang", "class"], "values": ["https://he.wikipedia.org/wiki/%D7%A8%D7%95%D7%97%D7%95%D7%AA_%D7%94%D7%A1%D7%97%D7%A8", 
"\u05e8\u05d5\u05d7\u05d5\u05ea \u05d4\u05e1\u05d7\u05e8 \u2013 Hebrew", "he", "he", "interlanguage-link-target"]}}, {"key": "html", "type": "local", "char_start_idx": 12656, "relative_start_pos": 0, "char_end_idx": 12661, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["class"], "values": ["interlanguage-link interwiki-he"]}}, {"key": "html", "type": "local", "char_start_idx": 12662, "relative_start_pos": 1, "char_end_idx": 12669, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title", "lang", "hreflang", "class"], "values": ["https://ka.wikipedia.org/wiki/%E1%83%9E%E1%83%90%E1%83%A1%E1%83%90%E1%83%A2%E1%83%94%E1%83%91%E1%83%98", "\u10de\u10d0\u10e1\u10d0\u10e2\u10d4\u10d1\u10d8 \u2013 Georgian", "ka", "ka", "interlanguage-link-target"]}}, {"key": "html", "type": "local", "char_start_idx": 12662, "relative_start_pos": 0, "char_end_idx": 12669, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["class"], "values": ["interlanguage-link interwiki-ka"]}}, {"key": "html", "type": "local", "char_start_idx": 12670, "relative_start_pos": 1, "char_end_idx": 12677, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title", "lang", "hreflang", "class"], "values": ["https://kk.wikipedia.org/wiki/%D0%9F%D0%B0%D1%81%D1%81%D0%B0%D1%82", "\u041f\u0430\u0441\u0441\u0430\u0442 \u2013 Kazakh", "kk", "kk", "interlanguage-link-target"]}}, {"key": "html", "type": "local", "char_start_idx": 12670, "relative_start_pos": 0, "char_end_idx": 12677, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["class"], "values": ["interlanguage-link interwiki-kk"]}}, {"key": "html", "type": "local", "char_start_idx": 12678, "relative_start_pos": 1, "char_end_idx": 12692, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title", "lang", "hreflang", "class"], "values": ["https://ht.wikipedia.org/wiki/Alize", "Alize \u2013 Haitian Creole", "ht", "ht", "interlanguage-link-target"]}}, {"key": "html", 
"type": "local", "char_start_idx": 12678, "relative_start_pos": 0, "char_end_idx": 12692, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["class"], "values": ["interlanguage-link interwiki-ht"]}}, {"key": "html", "type": "local", "char_start_idx": 12693, "relative_start_pos": 1, "char_end_idx": 12699, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title", "lang", "hreflang", "class"], "values": ["https://la.wikipedia.org/wiki/Venti_anniversarii", "Venti anniversarii \u2013 Latin", "la", "la", "interlanguage-link-target"]}}, {"key": "html", "type": "local", "char_start_idx": 12693, "relative_start_pos": 0, "char_end_idx": 12699, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["class"], "values": ["interlanguage-link interwiki-la"]}}, {"key": "html", "type": "local", "char_start_idx": 12700, "relative_start_pos": 1, "char_end_idx": 12708, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title", "lang", "hreflang", "class"], "values": ["https://lv.wikipedia.org/wiki/Pas%C4%81ts", "Pas\u0101ts \u2013 Latvian", "lv", "lv", "interlanguage-link-target"]}}, {"key": "html", "type": "local", "char_start_idx": 12700, "relative_start_pos": 0, "char_end_idx": 12708, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["class"], "values": ["interlanguage-link interwiki-lv"]}}, {"key": "html", "type": "local", "char_start_idx": 12709, "relative_start_pos": 1, "char_end_idx": 12717, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title", "lang", "hreflang", "class"], "values": ["https://lt.wikipedia.org/wiki/Pasatas", "Pasatas \u2013 Lithuanian", "lt", "lt", "interlanguage-link-target"]}}, {"key": "html", "type": "local", "char_start_idx": 12709, "relative_start_pos": 0, "char_end_idx": 12717, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["class"], "values": ["interlanguage-link interwiki-lt"]}}, {"key": "html", "type": "local", "char_start_idx": 12718, 
"relative_start_pos": 1, "char_end_idx": 12726, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title", "lang", "hreflang", "class"], "values": ["https://li.wikipedia.org/wiki/Passaatwindj", "Passaatwindj \u2013 Limburgish", "li", "li", "interlanguage-link-target"]}}, {"key": "html", "type": "local", "char_start_idx": 12718, "relative_start_pos": 0, "char_end_idx": 12726, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["class"], "values": ["interlanguage-link interwiki-li"]}}, {"key": "html", "type": "local", "char_start_idx": 12727, "relative_start_pos": 1, "char_end_idx": 12733, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title", "lang", "hreflang", "class"], "values": ["https://hu.wikipedia.org/wiki/Passz%C3%A1tsz%C3%A9l", "Passz\u00e1tsz\u00e9l \u2013 Hungarian", "hu", "hu", "interlanguage-link-target"]}}, {"key": "html", "type": "local", "char_start_idx": 12727, "relative_start_pos": 0, "char_end_idx": 12733, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["class"], "values": ["interlanguage-link interwiki-hu"]}}, {"key": "html", "type": "local", "char_start_idx": 12734, "relative_start_pos": 1, "char_end_idx": 12740, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title", "lang", "hreflang", "class"], "values": ["https://ml.wikipedia.org/wiki/%E0%B4%B5%E0%B4%BE%E0%B4%A3%E0%B4%BF%E0%B4%9C%E0%B5%8D%E0%B4%AF%E0%B4%95%E0%B5%8D%E0%B4%95%E0%B4%BE%E0%B4%B1%E0%B5%8D%E0%B4%B1%E0%B5%81%E0%B4%95%E0%B5%BE", "\u0d35\u0d3e\u0d23\u0d3f\u0d1c\u0d4d\u0d2f\u0d15\u0d4d\u0d15\u0d3e\u0d31\u0d4d\u0d31\u0d41\u0d15\u0d7e \u2013 Malayalam", "ml", "ml", "interlanguage-link-target"]}}, {"key": "html", "type": "local", "char_start_idx": 12734, "relative_start_pos": 0, "char_end_idx": 12740, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["class"], "values": ["interlanguage-link interwiki-ml"]}}, {"key": "html", "type": "local", "char_start_idx": 12741, 
"relative_start_pos": 1, "char_end_idx": 12754, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title", "lang", "hreflang", "class"], "values": ["https://ms.wikipedia.org/wiki/Angin_pasat", "Angin pasat \u2013 Malay", "ms", "ms", "interlanguage-link-target"]}}, {"key": "html", "type": "local", "char_start_idx": 12741, "relative_start_pos": 0, "char_end_idx": 12754, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["class"], "values": ["interlanguage-link interwiki-ms"]}}, {"key": "html", "type": "local", "char_start_idx": 12755, "relative_start_pos": 1, "char_end_idx": 12765, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title", "lang", "hreflang", "class"], "values": ["https://my.wikipedia.org/wiki/%E1%80%80%E1%80%AF%E1%80%94%E1%80%BA%E1%80%9E%E1%80%8A%E1%80%BA%E1%80%9C%E1%80%B1", "\u1000\u102f\u1014\u103a\u101e\u100a\u103a\u101c\u1031 \u2013 Burmese", "my", "my", "interlanguage-link-target"]}}, {"key": "html", "type": "local", "char_start_idx": 12755, "relative_start_pos": 0, "char_end_idx": 12765, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["class"], "values": ["interlanguage-link interwiki-my"]}}, {"key": "html", "type": "local", "char_start_idx": 12766, "relative_start_pos": 1, "char_end_idx": 12776, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title", "lang", "hreflang", "class"], "values": ["https://nl.wikipedia.org/wiki/Passaat", "Passaat \u2013 Dutch", "nl", "nl", "interlanguage-link-target"]}}, {"key": "html", "type": "local", "char_start_idx": 12766, "relative_start_pos": 0, "char_end_idx": 12776, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["class"], "values": ["interlanguage-link interwiki-nl"]}}, {"key": "html", "type": "local", "char_start_idx": 12777, "relative_start_pos": 1, "char_end_idx": 12780, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title", "lang", "hreflang", "class"], "values": 
["https://ja.wikipedia.org/wiki/%E8%B2%BF%E6%98%93%E9%A2%A8", "\u8cbf\u6613\u98a8 \u2013 Japanese", "ja", "ja", "interlanguage-link-target"]}}, {"key": "html", "type": "local", "char_start_idx": 12777, "relative_start_pos": 0, "char_end_idx": 12780, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["class"], "values": ["interlanguage-link interwiki-ja"]}}, {"key": "html", "type": "local", "char_start_idx": 12781, "relative_start_pos": 1, "char_end_idx": 12786, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title", "lang", "hreflang", "class"], "values": ["https://no.wikipedia.org/wiki/Passatvind", "Passatvind \u2013 Norwegian", "no", "no", "interlanguage-link-target"]}}, {"key": "html", "type": "local", "char_start_idx": 12781, "relative_start_pos": 0, "char_end_idx": 12786, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["class"], "values": ["interlanguage-link interwiki-no"]}}, {"key": "html", "type": "local", "char_start_idx": 12787, "relative_start_pos": 1, "char_end_idx": 12800, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title", "lang", "hreflang", "class"], "values": ["https://nn.wikipedia.org/wiki/Passatvind", "Passatvind \u2013 Norwegian Nynorsk", "nn", "nn", "interlanguage-link-target"]}}, {"key": "html", "type": "local", "char_start_idx": 12787, "relative_start_pos": 0, "char_end_idx": 12800, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["class"], "values": ["interlanguage-link interwiki-nn"]}}, {"key": "html", "type": "local", "char_start_idx": 12801, "relative_start_pos": 1, "char_end_idx": 12808, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title", "lang", "hreflang", "class"], "values": ["https://oc.wikipedia.org/wiki/Alisis", "Alisis \u2013 Occitan", "oc", "oc", "interlanguage-link-target"]}}, {"key": "html", "type": "local", "char_start_idx": 12801, "relative_start_pos": 0, "char_end_idx": 12808, "relative_end_pos": 1, "value": 
"li", "html_attrs": {"attrs": ["class"], "values": ["interlanguage-link interwiki-oc"]}}, {"key": "html", "type": "local", "char_start_idx": 12809, "relative_start_pos": 1, "char_end_idx": 12826, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title", "lang", "hreflang", "class"], "values": ["https://uz.wikipedia.org/wiki/Passatlar", "Passatlar \u2013 Uzbek", "uz", "uz", "interlanguage-link-target"]}}, {"key": "html", "type": "local", "char_start_idx": 12809, "relative_start_pos": 0, "char_end_idx": 12826, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["class"], "values": ["interlanguage-link interwiki-uz"]}}, {"key": "html", "type": "local", "char_start_idx": 12827, "relative_start_pos": 1, "char_end_idx": 12833, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title", "lang", "hreflang", "class"], "values": ["https://pl.wikipedia.org/wiki/Pasat", "Pasat \u2013 Polish", "pl", "pl", "interlanguage-link-target"]}}, {"key": "html", "type": "local", "char_start_idx": 12827, "relative_start_pos": 0, "char_end_idx": 12833, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["class"], "values": ["interlanguage-link interwiki-pl"]}}, {"key": "html", "type": "local", "char_start_idx": 12834, "relative_start_pos": 1, "char_end_idx": 12843, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title", "lang", "hreflang", "class"], "values": ["https://pt.wikipedia.org/wiki/Al%C3%ADsios", "Al\u00edsios \u2013 Portuguese", "pt", "pt", "interlanguage-link-target"]}}, {"key": "html", "type": "local", "char_start_idx": 12834, "relative_start_pos": 0, "char_end_idx": 12843, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["class"], "values": ["interlanguage-link interwiki-pt"]}}, {"key": "html", "type": "local", "char_start_idx": 12844, "relative_start_pos": 1, "char_end_idx": 12850, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title", "lang", "hreflang", 
"class"], "values": ["https://ro.wikipedia.org/wiki/Alizeu", "Alizeu \u2013 Romanian", "ro", "ro", "interlanguage-link-target"]}}, {"key": "html", "type": "local", "char_start_idx": 12844, "relative_start_pos": 0, "char_end_idx": 12850, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["class"], "values": ["interlanguage-link interwiki-ro"]}}, {"key": "html", "type": "local", "char_start_idx": 12851, "relative_start_pos": 1, "char_end_idx": 12858, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title", "lang", "hreflang", "class"], "values": ["https://ru.wikipedia.org/wiki/%D0%9F%D0%B0%D1%81%D1%81%D0%B0%D1%82", "\u041f\u0430\u0441\u0441\u0430\u0442 \u2013 Russian", "ru", "ru", "interlanguage-link-target"]}}, {"key": "html", "type": "local", "char_start_idx": 12851, "relative_start_pos": 0, "char_end_idx": 12858, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["class"], "values": ["interlanguage-link interwiki-ru"]}}, {"key": "html", "type": "local", "char_start_idx": 12859, "relative_start_pos": 1, "char_end_idx": 12864, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title", "lang", "hreflang", "class"], "values": ["https://sq.wikipedia.org/wiki/Pasatet", "Pasatet \u2013 Albanian", "sq", "sq", "interlanguage-link-target"]}}, {"key": "html", "type": "local", "char_start_idx": 12859, "relative_start_pos": 0, "char_end_idx": 12864, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["class"], "values": ["interlanguage-link interwiki-sq"]}}, {"key": "html", "type": "local", "char_start_idx": 12865, "relative_start_pos": 1, "char_end_idx": 12875, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title", "lang", "hreflang", "class"], "values": ["https://sk.wikipedia.org/wiki/Pas%C3%A1t", "Pas\u00e1t \u2013 Slovak", "sk", "sk", "interlanguage-link-target"]}}, {"key": "html", "type": "local", "char_start_idx": 12865, "relative_start_pos": 0, "char_end_idx": 12875, 
"relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["class"], "values": ["interlanguage-link interwiki-sk"]}}, {"key": "html", "type": "local", "char_start_idx": 12876, "relative_start_pos": 1, "char_end_idx": 12887, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title", "lang", "hreflang", "class"], "values": ["https://sl.wikipedia.org/wiki/Pasat", "Pasat \u2013 Slovenian", "sl", "sl", "interlanguage-link-target"]}}, {"key": "html", "type": "local", "char_start_idx": 12876, "relative_start_pos": 0, "char_end_idx": 12887, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["class"], "values": ["interlanguage-link interwiki-sl"]}}, {"key": "html", "type": "local", "char_start_idx": 12888, "relative_start_pos": 1, "char_end_idx": 12903, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title", "lang", "hreflang", "class"], "values": ["https://sr.wikipedia.org/wiki/%D0%9F%D0%B0%D1%81%D0%B0%D1%82%D0%B8", "\u041f\u0430\u0441\u0430\u0442\u0438 \u2013 Serbian", "sr", "sr", "interlanguage-link-target"]}}, {"key": "html", "type": "local", "char_start_idx": 12888, "relative_start_pos": 0, "char_end_idx": 12903, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["class"], "values": ["interlanguage-link interwiki-sr"]}}, {"key": "html", "type": "local", "char_start_idx": 12904, "relative_start_pos": 1, "char_end_idx": 12935, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title", "lang", "hreflang", "class"], "values": ["https://sh.wikipedia.org/wiki/Pasati", "Pasati \u2013 Serbo-Croatian", "sh", "sh", "interlanguage-link-target"]}}, {"key": "html", "type": "local", "char_start_idx": 12904, "relative_start_pos": 0, "char_end_idx": 12935, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["class"], "values": ["interlanguage-link interwiki-sh"]}}, {"key": "html", "type": "local", "char_start_idx": 12936, "relative_start_pos": 1, "char_end_idx": 12941, 
"relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title", "lang", "hreflang", "class"], "values": ["https://fi.wikipedia.org/wiki/Pasaatituuli", "Pasaatituuli \u2013 Finnish", "fi", "fi", "interlanguage-link-target"]}}, {"key": "html", "type": "local", "char_start_idx": 12936, "relative_start_pos": 0, "char_end_idx": 12941, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["class"], "values": ["interlanguage-link interwiki-fi"]}}, {"key": "html", "type": "local", "char_start_idx": 12942, "relative_start_pos": 1, "char_end_idx": 12949, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title", "lang", "hreflang", "class"], "values": ["https://sv.wikipedia.org/wiki/Passadvind", "Passadvind \u2013 Swedish", "sv", "sv", "interlanguage-link-target"]}}, {"key": "html", "type": "local", "char_start_idx": 12942, "relative_start_pos": 0, "char_end_idx": 12949, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["class"], "values": ["interlanguage-link interwiki-sv"]}}, {"key": "html", "type": "local", "char_start_idx": 12950, "relative_start_pos": 1, "char_end_idx": 12955, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title", "lang", "hreflang", "class"], "values": ["https://ta.wikipedia.org/wiki/%E0%AE%B5%E0%AE%A3%E0%AE%BF%E0%AE%95%E0%AE%95%E0%AF%8D_%E0%AE%95%E0%AE%BE%E0%AE%B1%E0%AF%8D%E0%AE%B1%E0%AF%81", "\u0bb5\u0ba3\u0bbf\u0b95\u0b95\u0bcd \u0b95\u0bbe\u0bb1\u0bcd\u0bb1\u0bc1 \u2013 Tamil", "ta", "ta", "interlanguage-link-target"]}}, {"key": "html", "type": "local", "char_start_idx": 12950, "relative_start_pos": 0, "char_end_idx": 12955, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["class"], "values": ["interlanguage-link interwiki-ta"]}}, {"key": "html", "type": "local", "char_start_idx": 12956, "relative_start_pos": 1, "char_end_idx": 12959, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title", "lang", "hreflang", "class"], "values": 
["https://th.wikipedia.org/wiki/%E0%B8%A5%E0%B8%A1%E0%B8%84%E0%B9%89%E0%B8%B2", "\u0e25\u0e21\u0e04\u0e49\u0e32 \u2013 Thai", "th", "th", "interlanguage-link-target"]}}, {"key": "html", "type": "local", "char_start_idx": 12956, "relative_start_pos": 0, "char_end_idx": 12959, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["class"], "values": ["interlanguage-link interwiki-th"]}}, {"key": "html", "type": "local", "char_start_idx": 12960, "relative_start_pos": 1, "char_end_idx": 12966, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title", "lang", "hreflang", "class"], "values": ["https://tr.wikipedia.org/wiki/Alizeler", "Alizeler \u2013 Turkish", "tr", "tr", "interlanguage-link-target"]}}, {"key": "html", "type": "local", "char_start_idx": 12960, "relative_start_pos": 0, "char_end_idx": 12966, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["class"], "values": ["interlanguage-link interwiki-tr"]}}, {"key": "html", "type": "local", "char_start_idx": 12967, "relative_start_pos": 1, "char_end_idx": 12977, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title", "lang", "hreflang", "class"], "values": ["https://uk.wikipedia.org/wiki/%D0%9F%D0%B0%D1%81%D0%B0%D1%82%D0%B8", "\u041f\u0430\u0441\u0430\u0442\u0438 \u2013 Ukrainian", "uk", "uk", "interlanguage-link-target"]}}, {"key": "html", "type": "local", "char_start_idx": 12967, "relative_start_pos": 0, "char_end_idx": 12977, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["class"], "values": ["interlanguage-link interwiki-uk"]}}, {"key": "html", "type": "local", "char_start_idx": 12978, "relative_start_pos": 1, "char_end_idx": 12988, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title", "lang", "hreflang", "class"], "values": ["https://vi.wikipedia.org/wiki/Gi%C3%B3_m%E1%BA%ADu_d%E1%BB%8Bch", "Gi\u00f3 m\u1eadu d\u1ecbch \u2013 Vietnamese", "vi", "vi", "interlanguage-link-target"]}}, {"key": "html", "type": 
"local", "char_start_idx": 12978, "relative_start_pos": 0, "char_end_idx": 12988, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["class"], "values": ["interlanguage-link interwiki-vi"]}}, {"key": "html", "type": "local", "char_start_idx": 12989, "relative_start_pos": 1, "char_end_idx": 12991, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title", "lang", "hreflang", "class"], "values": ["https://zh-yue.wikipedia.org/wiki/%E4%BF%A1%E9%A2%A8", "\u4fe1\u98a8 \u2013 Cantonese", "zh-yue", "zh-yue", "interlanguage-link-target"]}}, {"key": "html", "type": "local", "char_start_idx": 12989, "relative_start_pos": 0, "char_end_idx": 12991, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["class"], "values": ["interlanguage-link interwiki-zh-yue"]}}, {"key": "html", "type": "local", "char_start_idx": 12992, "relative_start_pos": 1, "char_end_idx": 12994, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title", "lang", "hreflang", "class"], "values": ["https://zh.wikipedia.org/wiki/%E4%BF%A1%E9%A2%A8", "\u4fe1\u98a8 \u2013 Chinese", "zh", "zh", "interlanguage-link-target"]}}, {"key": "html", "type": "local", "char_start_idx": 12992, "relative_start_pos": 0, "char_end_idx": 12994, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["class"], "values": ["interlanguage-link interwiki-zh"]}}, {"key": "html", "type": "local", "char_start_idx": 12431, "relative_start_pos": 1, "char_end_idx": 12995, "relative_end_pos": 0, "value": "ul", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 12995, "relative_start_pos": 3, "char_end_idx": 13005, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title", "class"], "values": ["https://www.wikidata.org/wiki/Special:EntityPage/Q160603#sitelinks-wikipedia", "Edit interlanguage links", "wbc-editpage"]}}, {"key": "html", "type": "local", "char_start_idx": 12995, "relative_start_pos": 2, 
"char_end_idx": 13005, "relative_end_pos": 1, "value": "span", "html_attrs": {"attrs": ["class"], "values": ["wb-langlinks-edit wb-langlinks-link"]}}, {"key": "html", "type": "local", "char_start_idx": 12995, "relative_start_pos": 1, "char_end_idx": 13005, "relative_end_pos": 2, "value": "div", "html_attrs": {"attrs": ["class"], "values": ["after-portlet after-portlet-lang"]}}, {"key": "html", "type": "local", "char_start_idx": 12431, "relative_start_pos": 0, "char_end_idx": 13006, "relative_end_pos": 0, "value": "div", "html_attrs": {"attrs": ["class"], "values": ["body"]}}, {"key": "html", "type": "local", "char_start_idx": 12421, "relative_start_pos": 3, "char_end_idx": 13006, "relative_end_pos": 1, "value": "div", "html_attrs": {"attrs": ["class", "role", "id", "aria-labelledby"], "values": ["portal", "navigation", "p-lang", "p-lang-label"]}}, {"key": "html", "type": "local", "char_start_idx": 12044, "relative_start_pos": 4, "char_end_idx": 13006, "relative_end_pos": 2, "value": "div", "html_attrs": {"attrs": ["id"], "values": ["mw-panel"]}}, {"key": "html", "type": "local", "char_start_idx": 11884, "relative_start_pos": 7, "char_end_idx": 13006, "relative_end_pos": 3, "value": "div", "html_attrs": {"attrs": ["id"], "values": ["mw-navigation"]}}, {"key": "html", "type": "local", "char_start_idx": 13006, "relative_start_pos": 6, "char_end_idx": 13062, "relative_end_pos": 0, "value": "li", "html_attrs": {"attrs": ["id"], "values": ["footer-info-lastmod"]}}, {"key": "html", "type": "local", "char_start_idx": 13091, "relative_start_pos": 0, "char_end_idx": 13138, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["rel", "href"], "values": ["license", "//en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License"]}}, {"key": "html", "type": "local", "char_start_idx": 13138, "relative_start_pos": 1, "char_end_idx": 13138, "relative_end_pos": 2, "value": "a", "html_attrs": {"attrs": ["rel", "href", "style"], 
"values": ["license", "//creativecommons.org/licenses/by-sa/3.0/", "display:none;"]}}, {"key": "html", "type": "local", "char_start_idx": 13205, "relative_start_pos": 0, "char_end_idx": 13217, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href"], "values": ["//wikimediafoundation.org/wiki/Terms_of_Use"]}}, {"key": "html", "type": "local", "char_start_idx": 13222, "relative_start_pos": 0, "char_end_idx": 13236, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href"], "values": ["//wikimediafoundation.org/wiki/Privacy_policy"]}}, {"key": "html", "type": "local", "char_start_idx": 13282, "relative_start_pos": 0, "char_end_idx": 13308, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href"], "values": ["//www.wikimediafoundation.org/"]}}, {"key": "html", "type": "local", "char_start_idx": 13063, "relative_start_pos": 0, "char_end_idx": 13336, "relative_end_pos": 0, "value": "li", "html_attrs": {"attrs": ["id"], "values": ["footer-info-copyright"]}}, {"key": "html", "type": "local", "char_start_idx": 13006, "relative_start_pos": 5, "char_end_idx": 13337, "relative_end_pos": 0, "value": "ul", "html_attrs": {"attrs": ["id"], "values": ["footer-info"]}}, {"key": "html", "type": "local", "char_start_idx": 13337, "relative_start_pos": 3, "char_end_idx": 13351, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "class", "title"], "values": ["https://wikimediafoundation.org/wiki/Privacy_policy", "extiw", "wmf:Privacy policy"]}}, {"key": "html", "type": "local", "char_start_idx": 13337, "relative_start_pos": 2, "char_end_idx": 13351, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["id"], "values": ["footer-places-privacy"]}}, {"key": "html", "type": "local", "char_start_idx": 13352, "relative_start_pos": 1, "char_end_idx": 13367, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/Wikipedia:About", "Wikipedia:About"]}}, {"key": "html", "type": "local", 
"char_start_idx": 13352, "relative_start_pos": 0, "char_end_idx": 13367, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["id"], "values": ["footer-places-about"]}}, {"key": "html", "type": "local", "char_start_idx": 13368, "relative_start_pos": 1, "char_end_idx": 13379, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "title"], "values": ["/wiki/Wikipedia:General_disclaimer", "Wikipedia:General disclaimer"]}}, {"key": "html", "type": "local", "char_start_idx": 13368, "relative_start_pos": 0, "char_end_idx": 13379, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["id"], "values": ["footer-places-disclaimer"]}}, {"key": "html", "type": "local", "char_start_idx": 13380, "relative_start_pos": 1, "char_end_idx": 13397, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href"], "values": ["//en.wikipedia.org/wiki/Wikipedia:Contact_us"]}}, {"key": "html", "type": "local", "char_start_idx": 13380, "relative_start_pos": 0, "char_end_idx": 13397, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["id"], "values": ["footer-places-contact"]}}, {"key": "html", "type": "local", "char_start_idx": 13398, "relative_start_pos": 1, "char_end_idx": 13408, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href"], "values": ["https://www.mediawiki.org/wiki/Special:MyLanguage/How_to_contribute"]}}, {"key": "html", "type": "local", "char_start_idx": 13398, "relative_start_pos": 0, "char_end_idx": 13408, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["id"], "values": ["footer-places-developers"]}}, {"key": "html", "type": "local", "char_start_idx": 13409, "relative_start_pos": 1, "char_end_idx": 13425, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href"], "values": ["https://wikimediafoundation.org/wiki/Cookie_statement"]}}, {"key": "html", "type": "local", "char_start_idx": 13409, "relative_start_pos": 0, "char_end_idx": 13425, "relative_end_pos": 1, "value": "li", 
"html_attrs": {"attrs": ["id"], "values": ["footer-places-cookiestatement"]}}, {"key": "html", "type": "local", "char_start_idx": 13426, "relative_start_pos": 1, "char_end_idx": 13437, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href", "class"], "values": ["//en.m.wikipedia.org/w/index.php?title=Trade_winds&mobileaction=toggle_view_mobile", "noprint stopMobileRedirectToggle"]}}, {"key": "html", "type": "local", "char_start_idx": 13426, "relative_start_pos": 0, "char_end_idx": 13437, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["id"], "values": ["footer-places-mobileview"]}}, {"key": "html", "type": "local", "char_start_idx": 13438, "relative_start_pos": 1, "char_end_idx": 13453, "relative_end_pos": 0, "value": "a", "html_attrs": {"attrs": ["href"], "values": ["#"]}}, {"key": "html", "type": "local", "char_start_idx": 13438, "relative_start_pos": 0, "char_end_idx": 13453, "relative_end_pos": 1, "value": "li", "html_attrs": {"attrs": ["style"], "values": [""]}}, {"key": "html", "type": "local", "char_start_idx": 13337, "relative_start_pos": 1, "char_end_idx": 13454, "relative_end_pos": 0, "value": "ul", "html_attrs": {"attrs": ["id"], "values": ["footer-places"]}}, {"key": "html", "type": "local", "char_start_idx": 13454, "relative_start_pos": 4, "char_end_idx": 13454, "relative_end_pos": 5, "value": "img", "html_attrs": {"attrs": ["src", "srcset", "width", "height", "alt"], "values": ["/static/images/wikimedia-button.png", "/static/images/wikimedia-button-1.5x.png 1.5x, /static/images/wikimedia-button-2x.png 2x", "88", "31", "Wikimedia Foundation"]}}, {"key": "html", "type": "local", "char_start_idx": 13454, "relative_start_pos": 3, "char_end_idx": 13454, "relative_end_pos": 6, "value": "a", "html_attrs": {"attrs": ["href"], "values": ["https://wikimediafoundation.org/"]}}, {"key": "html", "type": "local", "char_start_idx": 13454, "relative_start_pos": 2, "char_end_idx": 13454, "relative_end_pos": 7, "value": "li", "html_attrs": 
{"attrs": ["id"], "values": ["footer-copyrightico"]}}, {"key": "html", "type": "local", "char_start_idx": 13454, "relative_start_pos": 10, "char_end_idx": 13454, "relative_end_pos": 11, "value": "img", "html_attrs": {"attrs": ["src", "alt", "srcset", "width", "height"], "values": ["/static/images/poweredby_mediawiki_88x31.png", "Powered by MediaWiki", "/static/images/poweredby_mediawiki_132x47.png 1.5x, /static/images/poweredby_mediawiki_176x62.png 2x", "88", "31"]}}, {"key": "html", "type": "local", "char_start_idx": 13454, "relative_start_pos": 9, "char_end_idx": 13454, "relative_end_pos": 12, "value": "a", "html_attrs": {"attrs": ["href"], "values": ["//www.mediawiki.org/"]}}, {"key": "html", "type": "local", "char_start_idx": 13454, "relative_start_pos": 8, "char_end_idx": 13454, "relative_end_pos": 13, "value": "li", "html_attrs": {"attrs": ["id"], "values": ["footer-poweredbyico"]}}, {"key": "html", "type": "local", "char_start_idx": 13454, "relative_start_pos": 1, "char_end_idx": 13454, "relative_end_pos": 14, "value": "ul", "html_attrs": {"attrs": ["id", "class"], "values": ["footer-icons", "noprint"]}}, {"key": "html", "type": "local", "char_start_idx": 13454, "relative_start_pos": 15, "char_end_idx": 13454, "relative_end_pos": 16, "value": "div", "html_attrs": {"attrs": ["style"], "values": ["clear: both;"]}}, {"key": "html", "type": "local", "char_start_idx": 13006, "relative_start_pos": 4, "char_end_idx": 13454, "relative_end_pos": 17, "value": "div", "html_attrs": {"attrs": ["id", "role"], "values": ["footer", "contentinfo"]}}, {"key": "html", "type": "local", "char_start_idx": 13454, "relative_start_pos": 19, "char_end_idx": 13454, "relative_end_pos": 20, "value": "div", "html_attrs": {"attrs": ["class"], "values": ["suggestions-results"]}}, {"key": "html", "type": "local", "char_start_idx": 13454, "relative_start_pos": 21, "char_end_idx": 13454, "relative_end_pos": 22, "value": "div", "html_attrs": {"attrs": ["class"], "values": 
["suggestions-special"]}}, {"key": "html", "type": "local", "char_start_idx": 13454, "relative_start_pos": 18, "char_end_idx": 13454, "relative_end_pos": 23, "value": "div", "html_attrs": {"attrs": ["class", "style"], "values": ["suggestions", "display: none; font-size: 13px;"]}}, {"key": "html", "type": "local", "char_start_idx": 13454, "relative_start_pos": 28, "char_end_idx": 13454, "relative_end_pos": 29, "value": "polygon", "html_attrs": {"attrs": ["points"], "values": ["0 8, 10 8, 18 0, 26 8, 1000 8, 1000 1000, 0 1000"]}}, {"key": "html", "type": "local", "char_start_idx": 13454, "relative_start_pos": 27, "char_end_idx": 13454, "relative_end_pos": 30, "value": "clippath", "html_attrs": {"attrs": ["id"], "values": ["mwe-popups-mask"]}}, {"key": "html", "type": "local", "char_start_idx": 13454, "relative_start_pos": 32, "char_end_idx": 13454, "relative_end_pos": 33, "value": "polygon", "html_attrs": {"attrs": ["points"], "values": ["0 8, 274 8, 282 0, 290 8, 1000 8, 1000 1000, 0 1000"]}}, {"key": "html", "type": "local", "char_start_idx": 13454, "relative_start_pos": 31, "char_end_idx": 13454, "relative_end_pos": 34, "value": "clippath", "html_attrs": {"attrs": ["id"], "values": ["mwe-popups-mask-flip"]}}, {"key": "html", "type": "local", "char_start_idx": 13454, "relative_start_pos": 36, "char_end_idx": 13454, "relative_end_pos": 37, "value": "polygon", "html_attrs": {"attrs": ["points"], "values": ["0 8, 174 8, 182 0, 190 8, 1000 8, 1000 1000, 0 1000"]}}, {"key": "html", "type": "local", "char_start_idx": 13454, "relative_start_pos": 35, "char_end_idx": 13454, "relative_end_pos": 38, "value": "clippath", "html_attrs": {"attrs": ["id"], "values": ["mwe-popups-landscape-mask"]}}, {"key": "html", "type": "local", "char_start_idx": 13454, "relative_start_pos": 40, "char_end_idx": 13454, "relative_end_pos": 41, "value": "polygon", "html_attrs": {"attrs": ["points"], "values": ["0 0, 1000 0, 1000 242, 190 242, 182 250, 174 242, 0 242"]}}, {"key": "html", "type": 
"local", "char_start_idx": 13454, "relative_start_pos": 39, "char_end_idx": 13454, "relative_end_pos": 42, "value": "clippath", "html_attrs": {"attrs": ["id"], "values": ["mwe-popups-landscape-mask-flip"]}}, {"key": "html", "type": "local", "char_start_idx": 13454, "relative_start_pos": 26, "char_end_idx": 13454, "relative_end_pos": 43, "value": "defs", "html_attrs": {"attrs": [], "values": []}}, {"key": "html", "type": "local", "char_start_idx": 13454, "relative_start_pos": 25, "char_end_idx": 13454, "relative_end_pos": 44, "value": "svg", "html_attrs": {"attrs": ["width", "height"], "values": ["0", "0"]}}, {"key": "html", "type": "local", "char_start_idx": 13454, "relative_start_pos": 24, "char_end_idx": 13454, "relative_end_pos": 45, "value": "div", "html_attrs": {"attrs": ["id"], "values": ["mwe-popups-svg"]}}, {"key": "html", "type": "local", "char_start_idx": 0, "relative_start_pos": 0, "char_end_idx": 13454, "relative_end_pos": 46, "value": "body", "html_attrs": {"attrs": ["class"], "values": ["mediawiki ltr sitedir-ltr mw-hide-empty-elt ns-0 ns-subject page-Trade_winds rootpage-Trade_winds skin-vector action-view"]}}]}
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 534c6998..95d55f06 100644
--- a/setup.py
+++ b/setup.py
@@ -9,6 +9,17 @@ def req_file(filename):
install_requires = req_file("requirements.txt")
+preprocessing_requires = {
+    "html": ["lxml==4.6.3", "htmlmin==0.1.12"],
+    "entity": ["REL @ git+https://github.com/manandey/REL.git#egg=REL"],
+    "timestamp": ["bs_dateutil @ git+https://github.com/cccntu/dateutil@2.8.5"],
+    "website_description": ["wikipedia2vec==1.0.5", "nltk==3.6.5"],
+}
+
+# Flatten the per-tool requirement lists into the single "preprocessing" extra.
+# NOTE: bs_dateutil is fetched over git+https because GitHub no longer serves the unauthenticated git:// protocol.
+preprocessing_dependencies = [dep for deps in preprocessing_requires.values() for dep in deps]
+
setup(
name="bsmetadata",
python_requires=">=3.7.11, <3.10",
@@ -19,9 +30,5 @@ def req_file(filename):
description="Codebase for including metadata (e.g., URLs, timestamps, HTML tags) during language model pretraining.",
packages=find_packages(),
install_requires=install_requires,
- extras_require={
- "entity_preprocessing": ["REL @ git+https://github.com/manandey/REL.git#egg=REL"],
- "timestamp": ["bs_dateutil @ git+git://github.com/cccntu/dateutil@2.8.5"],
- "website_description_preprocessing": ["wikipedia2vec==1.0.5", "nltk==3.6.5"],
- },
+ extras_require={"preprocessing": preprocessing_dependencies},
)
diff --git a/tests/preprocessing_tools/html_parser/test_html_parser.py b/tests/preprocessing_tools/html_parser/test_html_parser.py
new file mode 100644
index 00000000..eca1ac9b
--- /dev/null
+++ b/tests/preprocessing_tools/html_parser/test_html_parser.py
@@ -0,0 +1,1064 @@
+from typing import DefaultDict
+
+from bsmetadata.preprocessing_tools.html_parser import get_clean_text_and_metadata
+from bsmetadata.preprocessing_tools.html_parser.objects import TagToRemove, TagToRemoveWithContent
+
+
+def check_content_parsing(target_content_plain_text: dict, target_metadata_tags, metadata, plain_text):
+    """Assert that every expected (tag, text) pair is matched by a node in `metadata`.
+
+    `target_content_plain_text` maps a tag name to the list of texts expected for it; a node matches when
+    its `value.tag` is that tag and `plain_text[char_start_idx:char_end_idx]` is one of those texts.
+    Both `target_content_plain_text` and `target_metadata_tags` are consumed in place and must end up empty.
+    """
+    # One entry per expected occurrence; snapshot first because the dict is mutated in the loop below.
+    target_list_tags = [tag for tag, texts in target_content_plain_text.items() for _ in texts]
+
+    for target_tag in target_list_tags:
+        assert target_tag in target_metadata_tags
+        target_metadata_tags.remove(target_tag)
+        expected_texts = target_content_plain_text[target_tag]
+        find = False
+        for metadata_node in metadata:
+            node_text = plain_text[metadata_node.char_start_idx : metadata_node.char_end_idx]
+            if metadata_node.value.tag == target_tag and node_text in expected_texts:
+                find = True
+                expected_texts.remove(node_text)
+                if not expected_texts:
+                    target_content_plain_text.pop(target_tag)
+                break
+
+        if not find:
+            retrieved_plain_text = "\n ".join(
+                f"{node.value.tag}: {repr(plain_text[node.char_start_idx : node.char_end_idx])}" for node in metadata
+            )
+            # Report the expected texts themselves: the loop variable is unbound when `metadata` is empty.
+            raise AssertionError(
+                f"Plain text not found for the tag '{target_tag}'\nThe plain text associated with each tags are:\n"
+                f" {retrieved_plain_text} \nand the text to match with:\n{expected_texts!r}"
+            )
+
+    assert not target_content_plain_text
+    assert not target_metadata_tags
+
+
+def check_content_parsing_and_metadata(target_content_plain_text: dict, target_metadata_tags, metadata, plain_text):
+    """Assert that every expected (tag, text, attrs) triple is matched by a node in `metadata`.
+
+    `target_content_plain_text` maps a tag name to a list of `(text, attrs)` pairs; a node matches when its
+    `value.tag` is that tag and `(plain_text[char_start_idx:char_end_idx], value.attrs)` is one of the pairs.
+    Both `target_content_plain_text` and `target_metadata_tags` are consumed in place and must end up empty.
+    """
+    # One entry per expected occurrence; snapshot first because the dict is mutated in the loop below.
+    target_list_tags = [tag for tag, pairs in target_content_plain_text.items() for _ in pairs]
+
+    for target_tag in target_list_tags:
+        assert target_tag in target_metadata_tags
+        target_metadata_tags.remove(target_tag)
+        expected_pairs = target_content_plain_text[target_tag]
+        find = False
+        for metadata_node in metadata:
+            node_pair = (
+                plain_text[metadata_node.char_start_idx : metadata_node.char_end_idx],
+                metadata_node.value.attrs,
+            )
+            # Compare text and attrs as one pair; independent checks could cross-match and break `remove`.
+            if metadata_node.value.tag == target_tag and node_pair in expected_pairs:
+                find = True
+                expected_pairs.remove(node_pair)
+                if not expected_pairs:
+                    target_content_plain_text.pop(target_tag)
+                break
+
+        error_msg = f"Plain text not found for the tag '{target_tag}'"
+        if not find:
+            retrieved_plain_text = "\n ".join(
+                f"{node.value.tag}: {repr(plain_text[node.char_start_idx : node.char_end_idx])} {node.value.attrs}"
+                for node in metadata
+            )
+            error_msg = f"{error_msg}\nThe plain text associated with each tags are:\n {retrieved_plain_text}"
+        assert find, error_msg
+
+    assert not target_content_plain_text
+    assert not target_metadata_tags
+
+
+def test_parse_simple_html():
+ html = """
+
+
+
+
+ This is a title
+
+
+"""
+ plain_text, metadata = get_clean_text_and_metadata(html)
+    assert plain_text == "This is a title\n"  # the spaces are due to the block contents
+
+ metadata_tags = [metadata_node.value.tag for metadata_node in metadata]
+ assert len(metadata) == 2
+ assert "html" not in metadata_tags
+ assert "head" not in metadata_tags
+ assert "body" in metadata_tags
+ assert "h1" in metadata_tags
+
+ for metadata_node in metadata:
+ if metadata_node.value.tag == "h1":
+ metadata_h1 = metadata_node
+ break
+ assert plain_text[metadata_h1.char_start_idx : metadata_h1.char_end_idx] == "This is a title"
+ return (plain_text, metadata)
+
+
+def test_parse_html_remove_tag_alone():
+ html = """
+
+
+
+
+ This is a title
+
+
+"""
+ tags_to_remove_alone = [TagToRemove("body")]
+ plain_text, metadata = get_clean_text_and_metadata(html, tags_to_remove_alone=tags_to_remove_alone)
+ assert plain_text == "This is a title\n"
+
+ metadata_tags = [metadata_node.value.tag for metadata_node in metadata]
+ assert len(metadata) == 1
+ assert "html" not in metadata_tags
+ assert "head" not in metadata_tags
+ assert "body" not in metadata_tags
+ assert "h1" in metadata_tags
+
+ for metadata_node in metadata:
+ if metadata_node.value.tag == "h1":
+ metadata_h1 = metadata_node
+ break
+ assert plain_text[metadata_h1.char_start_idx : metadata_h1.char_end_idx] == "This is a title"
+ return (plain_text, metadata)
+
+
+def test_parse_html_remove_tag_and_content():
+ html = """
+
+
+
+
+ This is a title
+
+
This is a first paragraph in div
+
This is a second paragraph in div
+
+ This is a paragraph not in div
+
+
+"""
+ tags_to_remove_with_content = [TagToRemoveWithContent(tag="div")]
+ plain_text, metadata = get_clean_text_and_metadata(html, tags_to_remove_with_content=tags_to_remove_with_content)
+ assert plain_text == (
+ """This is a title
+This is a paragraph not in div
+"""
+    )  # the spaces are due to the block contents
+
+ metadata_tags = [metadata_node.value.tag for metadata_node in metadata]
+
+ assert len(metadata) == 3
+ assert "html" not in metadata_tags
+ assert "head" not in metadata_tags
+ assert "body" in metadata_tags
+ assert "h1" in metadata_tags
+ assert "p" in metadata_tags
+
+ for metadata_node in metadata:
+ if metadata_node.value.tag == "h1":
+ metadata_h1 = metadata_node
+ break
+ assert plain_text[metadata_h1.char_start_idx : metadata_h1.char_end_idx] == "This is a title"
+
+ for metadata_node in metadata:
+ if metadata_node.value.tag == "p":
+ metadata_p = metadata_node
+ break
+ assert plain_text[metadata_p.char_start_idx : metadata_p.char_end_idx] == "This is a paragraph not in div"
+ return (plain_text, metadata)
+
+
+def test_parse_html_nested_example():
+ html = """
+
+
+
+
+ This is a title
+
+
This is a first sub-div in div
+
This is a second sub-div in div
+
+ This is a paragraph not in div
+
+
+"""
+ plain_text, metadata = get_clean_text_and_metadata(html)
+ assert plain_text == (
+ """This is a title
+This is a first sub-div in div
+This is a second sub-div in div
+This is a paragraph not in div
+"""
+ )
+
+ metadata_tags = [metadata_node.value.tag for metadata_node in metadata]
+
+ assert len(metadata) == 6
+
+ target_content_plain_text = {
+ "body": [
+ """This is a title
+This is a first sub-div in div
+This is a second sub-div in div
+This is a paragraph not in div
+"""
+ ],
+ "h1": ["This is a title"],
+ "p": ["This is a paragraph not in div"],
+ "div": [
+ "This is a first sub-div in div",
+ "This is a second sub-div in div",
+ "This is a first sub-div in div\nThis is a second sub-div in div\n",
+ ],
+ }
+
+ check_content_parsing(
+ target_content_plain_text=target_content_plain_text,
+ target_metadata_tags=metadata_tags,
+ metadata=metadata,
+ plain_text=plain_text,
+ )
+
+
+def test_parse_html_nested_example_2():
+ html = """
+
+
+
+
+ This is a title
+
+
This is a
first
sub-div in div
+
This is a
second
sub-div in div
+
+ This is a paragraph not in div
+
+
+"""
+ plain_text, metadata = get_clean_text_and_metadata(html)
+ assert (
+ plain_text
+ == """This is a title
+This is a
+first
+sub-div in div
+This is a
+second
+sub-div in div
+This is a paragraph not in div
+"""
+ )
+
+ metadata_tags = [metadata_node.value.tag for metadata_node in metadata]
+
+ assert len(metadata) == 8
+
+ target_content_plain_text = {
+ "body": [
+ """This is a title
+This is a
+first
+sub-div in div
+This is a
+second
+sub-div in div
+This is a paragraph not in div
+"""
+ ],
+ "h1": ["This is a title"],
+ "p": ["This is a paragraph not in div"],
+ "div": [
+ "first",
+ "second",
+ "This is a\nfirst\nsub-div in div",
+ "This is a\nsecond\nsub-div in div",
+ "This is a\nfirst\nsub-div in div\nThis is a\nsecond\nsub-div in div\n",
+ ],
+ }
+
+ check_content_parsing(
+ target_content_plain_text=target_content_plain_text,
+ target_metadata_tags=metadata_tags,
+ metadata=metadata,
+ plain_text=plain_text,
+ )
+
+
+def test_parse_html_nested_example_max_length():
+ html = """
+
+
+
+
+ This is a title
+
+
This is a
first
sub-div in div
+
This is a
second
sub-div in div
+
+ This is a paragraph not in div
+
+
+"""
+ tags_to_remove_with_content = [TagToRemoveWithContent(tag="div", content_max_char_length=6)]
+ plain_text, metadata = get_clean_text_and_metadata(html, tags_to_remove_with_content=tags_to_remove_with_content)
+ assert plain_text == (
+ "This is a title\n"
+ "This is a sub-div in div\n"
+ "This is a sub-div in div\n"
+ "This is a paragraph not in div\n"
+ )
+
+ metadata_tags = [metadata_node.value.tag for metadata_node in metadata]
+
+ assert len(metadata) == 6
+
+ target_content_plain_text = {
+ "body": [
+ (
+ "This is a title\n"
+ "This is a sub-div in div\n"
+ "This is a sub-div in div\n"
+ "This is a paragraph not in div\n"
+ )
+ ],
+ "h1": ["This is a title"],
+ "p": ["This is a paragraph not in div"],
+ "div": [
+ "This is a sub-div in div",
+ "This is a sub-div in div",
+ ("This is a sub-div in div\n" "This is a sub-div in div\n"),
+ ],
+ }
+
+ check_content_parsing(
+ target_content_plain_text=target_content_plain_text,
+ target_metadata_tags=metadata_tags,
+ metadata=metadata,
+ plain_text=plain_text,
+ )
+
+
+def test_parse_html_nested_example_min_length():
+ html = """
+
+
+
+
+ This is a title
+ small
+
+
This is a
first
sub-div in div
+
This is a
second
sub-div in div
+
+ This is a paragraph not in div
+
+
+"""
+ tags_to_remove_with_content = [TagToRemoveWithContent(tag="div", content_min_char_length=7, method="top-down")]
+ plain_text, metadata = get_clean_text_and_metadata(html, tags_to_remove_with_content=tags_to_remove_with_content)
+ assert plain_text == ("This is a title\n" "small\n" "This is a paragraph not in div\n")
+
+ metadata_tags = [metadata_node.value.tag for metadata_node in metadata]
+
+ assert len(metadata) == 4
+
+ target_content_plain_text = {
+ "body": [("This is a title\n" "small\n" "This is a paragraph not in div\n")],
+ "h1": ["This is a title"],
+ "p": ["This is a paragraph not in div"],
+ "div": ["small"],
+ }
+
+ check_content_parsing(
+ target_content_plain_text=target_content_plain_text,
+ target_metadata_tags=metadata_tags,
+ metadata=metadata,
+ plain_text=plain_text,
+ )
+
+
+def test_remove_all_table():
+ html = """
+
+
+| |
+Jeux olympiques d'été
+ |
+ |
+Jeux olympiques d'hiver
+ |
+
+| 2032 |
+XXXV |
+Brisbane (1) |
+ Australie (3) |
+Océanie (3) |
+ |
+ |
+ |
+
+ |
"""
+ tags_to_remove_with_content = [
+ TagToRemoveWithContent(tag="tbody"),
+ TagToRemoveWithContent(tag="td"),
+ ]
+ attrs_to_keep = ["class", "id"]
+ plain_text, metadata = get_clean_text_and_metadata(
+ html,
+ tags_to_remove_with_content=tags_to_remove_with_content,
+ attrs_to_keep=attrs_to_keep,
+ )
+ assert plain_text == ""
+
+ metadata_tags = [metadata_node.value.tag for metadata_node in metadata]
+
+ assert len(metadata) == 2
+
+ target_content_plain_text = {
+ "body": [""],
+ "caption": [""],
+ }
+
+ check_content_parsing(
+ target_content_plain_text=target_content_plain_text,
+ target_metadata_tags=metadata_tags,
+ metadata=metadata,
+ plain_text=plain_text,
+ )
+
+
+def test_table():
+ html = """
+
+
+ | The table header |
+
+
+
+
+ | The table body |
+ with two columns |
+
+
+
"""
+ tags_to_remove_with_content = [
+ TagToRemoveWithContent(tag="table", content_min_char_length=54),
+ ]
+ attrs_to_keep = ["class", "id"]
+ plain_text, metadata = get_clean_text_and_metadata(
+ html,
+ tags_to_remove_with_content=tags_to_remove_with_content,
+ attrs_to_keep=attrs_to_keep,
+ )
+ assert plain_text == ""
+
+ metadata_tags = [metadata_node.value.tag for metadata_node in metadata]
+
+ assert len(metadata) == 1
+
+ target_content_plain_text = {
+ "body": [""],
+ }
+
+ check_content_parsing(
+ target_content_plain_text=target_content_plain_text,
+ target_metadata_tags=metadata_tags,
+ metadata=metadata,
+ plain_text=plain_text,
+ )
+
+
+def test_table_keep_everything():
+ html = """
+
+
+ | The table header |
+
+
+
+
+ | The table body |
+ with two columns |
+
+
+
"""
+ plain_text, metadata = get_clean_text_and_metadata(
+ html,
+ )
+ assert plain_text == "The table header\nThe table body with two columns\n"
+
+ metadata_tags = [metadata_node.value.tag for metadata_node in metadata]
+
+ assert len(metadata) == 9
+
+ target_content_plain_text = {
+ "table": ["The table header\nThe table body with two columns\n"],
+ "thead": ["The table header\n"],
+ "tr": ["The table header\n", "The table body with two columns\n"],
+ "th": ["The table header"],
+ "tbody": ["The table body with two columns\n"],
+ "td": ["The table body", "with two columns"],
+ "body": ["The table header\nThe table body with two columns\n"],
+ }
+
+ check_content_parsing(
+ target_content_plain_text=target_content_plain_text,
+ target_metadata_tags=metadata_tags,
+ metadata=metadata,
+ plain_text=plain_text,
+ )
+
+
+def test_behavior_on_corrupt_examples():
+ # Corrupt 1: missing end tag value
+ html = """ test >"""
+ plain_text, metadata = get_clean_text_and_metadata(
+ html,
+ # start_parsing_at_tag=None,
+ )
+ assert plain_text == "test >\n"
+
+ metadata_tags = [metadata_node.value.tag for metadata_node in metadata]
+
+ assert len(metadata) == 2
+
+ target_content_plain_text = {
+ "p": ["test >"],
+ "body": ["test >\n"],
+ }
+
+ check_content_parsing(
+ target_content_plain_text=target_content_plain_text,
+ target_metadata_tags=metadata_tags,
+ metadata=metadata,
+ plain_text=plain_text,
+ )
+ assert metadata[0].value.attrs == {"attrs": [], "values": []}
+
+ # Corrupt 2: unnecessary "
+ html = """ test """
+ plain_text, metadata = get_clean_text_and_metadata(
+ html,
+ # start_parsing_at_tag=None,
+ )
+ assert plain_text == "test\n"
+
+ metadata_tags = [metadata_node.value.tag for metadata_node in metadata]
+
+ assert len(metadata) == 2
+
+ target_content_plain_text = {
+ "a": ["test\n"],
+ "body": ["test\n"],
+ }
+ check_content_parsing(
+ target_content_plain_text=target_content_plain_text,
+ target_metadata_tags=metadata_tags,
+ metadata=metadata,
+ plain_text=plain_text,
+ )
+ assert metadata[0].value.attrs == {
+ "attrs": ["href"],
+ "values": ["http://example.com"],
+ }
+
+
+def test_attribs():
+ html = (
+ "
"
+ "this is a title that we keep
"
+ ''
+ ""
+ )
+ plain_text, metadata = get_clean_text_and_metadata(
+ html,
+ )
+ assert plain_text == ("this is a title that we keep\n" "blablabla\n" "tidi tidi\n")
+
+ metadata_tags = [metadata_node.value.tag for metadata_node in metadata]
+
+ assert len(metadata) == 4
+
+ target_content_plain_text = {
+ "body": [
+ (
+ "this is a title that we keep\n" "blablabla\n" "tidi tidi\n",
+ {"attrs": [], "values": []},
+ )
+ ],
+ "h1": [("this is a title that we keep", {"attrs": [], "values": []})],
+ "div": [
+ ("blablabla\ntidi tidi\n", {"attrs": ["class"], "values": ["div-level-1"]}),
+ ("\ntidi tidi", {"attrs": ["class"], "values": ["div-level-2"]}),
+ ],
+ }
+
+ check_content_parsing_and_metadata(
+ target_content_plain_text=target_content_plain_text,
+ target_metadata_tags=metadata_tags,
+ metadata=metadata,
+ plain_text=plain_text,
+ )
+
+
+def test_remove_consecutive_tag():
+ html = (
+ ""
+ "this is a title that we keep
"
+ ''
+ ""
+ )
+ consecutive_tags_to_fold = ["div"]
+ plain_text, metadata = get_clean_text_and_metadata(html, consecutive_tags_to_fold=consecutive_tags_to_fold)
+ assert plain_text == ("this is a title that we keep\n" "blablabla\n" "tidi tidi\n")
+
+ metadata_tags = [metadata_node.value.tag for metadata_node in metadata]
+
+ assert len(metadata) == 3
+
+ target_content_plain_text = {
+ "body": [
+ (
+ "this is a title that we keep\n" "blablabla\n" "tidi tidi\n",
+ {"attrs": [], "values": []},
+ )
+ ],
+ "h1": [("this is a title that we keep", {"attrs": [], "values": []})],
+ "div": [
+ (
+ "blablabla\ntidi tidi\n",
+ {
+ "attrs": ["class", "id", "href"],
+ "values": ["div-level-1 div-level-2", "1", "http"],
+ },
+ ),
+ ],
+ }
+
+ check_content_parsing_and_metadata(
+ target_content_plain_text=target_content_plain_text,
+ target_metadata_tags=metadata_tags,
+ metadata=metadata,
+ plain_text=plain_text,
+ )
+
+
+def test_remove_consecutive_tag_with_tag_to_remove():
+ html = (
+ ""
+ "this is a title that we keep
"
+ ''
+ ""
+ )
+ consecutive_tags_to_fold = ["div"]
+ tags_to_remove_alone = [TagToRemove("span")]
+ plain_text, metadata = get_clean_text_and_metadata(
+ html,
+ consecutive_tags_to_fold=consecutive_tags_to_fold,
+ tags_to_remove_alone=tags_to_remove_alone,
+ )
+ assert plain_text == ("this is a title that we keep\n" "blablabla\n" "tidi tidi\n")
+
+ metadata_tags = [metadata_node.value.tag for metadata_node in metadata]
+
+ assert len(metadata) == 3
+
+ target_content_plain_text = {
+ "body": [
+ (
+ "this is a title that we keep\n" "blablabla\n" "tidi tidi\n",
+ {"attrs": [], "values": []},
+ )
+ ],
+ "h1": [("this is a title that we keep", {"attrs": ["id"], "values": ["title"]})],
+ "div": [
+ (
+ "blablabla\ntidi tidi\n",
+ {
+ "attrs": ["class", "id", "href"],
+ "values": ["div-level-1 div-level-2", "1", "http"],
+ },
+ ),
+ ],
+ }
+
+ check_content_parsing_and_metadata(
+ target_content_plain_text=target_content_plain_text,
+ target_metadata_tags=metadata_tags,
+ metadata=metadata,
+ plain_text=plain_text,
+ )
+
+
+def test_remove_consecutive_tag_very_nested():
+ html = (
+ ""
+ "this is a title that we keep
"
+ ''
+ ""
+ )
+ consecutive_tags_to_fold = ["div"]
+ tags_to_remove_alone = [TagToRemove("span")]
+ plain_text, metadata = get_clean_text_and_metadata(
+ html,
+ consecutive_tags_to_fold=consecutive_tags_to_fold,
+ tags_to_remove_alone=tags_to_remove_alone,
+ )
+ assert plain_text == ("this is a title that we keep\n" "blablabla\n" "tidi\ntidi2\n")
+
+ metadata_tags = [metadata_node.value.tag for metadata_node in metadata]
+
+ assert len(metadata) == 3
+
+ target_content_plain_text = {
+ "body": [
+ (
+ "this is a title that we keep\n" "blablabla\n" "tidi\ntidi2\n",
+ {"attrs": [], "values": []},
+ )
+ ],
+ "h1": [("this is a title that we keep", {"attrs": ["id"], "values": ["title"]})],
+ "div": [
+ (
+ "blablabla\ntidi\ntidi2\n",
+ {
+ "attrs": ["class", "id", "href"],
+ "values": ["div-level-1 div-level-2", "1 3", "http"],
+ },
+ ),
+ ],
+ }
+
+ check_content_parsing_and_metadata(
+ target_content_plain_text=target_content_plain_text,
+ target_metadata_tags=metadata_tags,
+ metadata=metadata,
+ plain_text=plain_text,
+ )
+
+
+def test_min_len_to_include_tag():
+ html = (
+ ""
+ "this is a title that we keep
"
+ 'blablabla
tidi tidi2 this one keep his tag
'
+ ""
+ )
+ consecutive_tags_to_fold = ["div"]
+ tags_to_remove_alone = [TagToRemove("span", content_max_char_length=5)]
+ plain_text, metadata = get_clean_text_and_metadata(
+ html,
+ consecutive_tags_to_fold=consecutive_tags_to_fold,
+ tags_to_remove_alone=tags_to_remove_alone,
+ )
+ assert plain_text == ("this is a title that we keep\n" "blablabla\n" "tidi tidi2 this one keep his tag\n")
+
+ metadata_tags = [metadata_node.value.tag for metadata_node in metadata]
+
+ assert len(metadata) == 4
+
+ target_content_plain_text = {
+ "body": [
+ (
+ "this is a title that we keep\n" "blablabla\n" "tidi tidi2 this one keep his tag\n",
+ {"attrs": [], "values": []},
+ )
+ ],
+ "h1": [("this is a title that we keep", {"attrs": ["id"], "values": ["title"]})],
+ "div": [
+ (
+ "blablabla\ntidi tidi2 this one keep his tag\n",
+ {
+ "attrs": ["class", "id", "href"],
+ "values": ["div-level-1 div-level-2", "1", "http"],
+ },
+ ),
+ ],
+ "span": [("this one keep his tag", {"attrs": ["id"], "values": ["3"]})],
+ }
+
+ check_content_parsing_and_metadata(
+ target_content_plain_text=target_content_plain_text,
+ target_metadata_tags=metadata_tags,
+ metadata=metadata,
+ plain_text=plain_text,
+ )
+
+
+def test_idx_order():
+ html = (
+ ""
+ "this is a title that we keep
"
+ ' blablabla tidi tidi2
this one keep his tag '
+ ""
+ )
+ plain_text, metadata = get_clean_text_and_metadata(
+ html,
+ )
+
+ metadata_tags = [metadata_node.value.tag for metadata_node in metadata]
+
+ target_content_plain_text = {
+ "body": [
+ (
+ "this is a title that we keep\n" "blablabla tidi tidi2\n" "this one keep his tag\n",
+ {"attrs": [], "values": []},
+ )
+ ],
+ "h1": [
+ (
+ "this is a title that we keep",
+ {"attrs": ["id"], "values": ["title"]},
+ )
+ ],
+ "div": [
+ (
+ "blablabla tidi tidi2\nthis one keep his tag",
+ {"attrs": ["class", "href"], "values": ["div-level-2", "http"]},
+ ),
+ (
+ "blablabla tidi tidi2",
+ {"attrs": ["class"], "values": ["div-level-3"]},
+ ),
+ (
+ "blablabla tidi tidi2\nthis one keep his tag\n",
+ {"attrs": ["class", "id"], "values": ["div-level-1", "1"]},
+ ),
+ ],
+ "span": [
+ (
+ "this one keep his tag",
+ {"attrs": ["id"], "values": ["2"]},
+ ),
+ ("tidi2", {"attrs": ["id"], "values": ["3"]}),
+ ],
+ }
+ metadata_sorted_by_start_idx = DefaultDict(list)
+ metadata_sorted_by_end_idx = DefaultDict(list)
+
+ metadata_dict_start_idx = DefaultDict(dict)
+ metadata_dict_end_idx = DefaultDict(dict)
+ for metadata_node in metadata:
+ metadata_dict_start_idx[metadata_node.char_start_idx][metadata_node.relative_start_pos] = metadata_node
+ metadata_dict_end_idx[metadata_node.char_end_idx][metadata_node.relative_end_pos] = metadata_node
+
+ for key, value in metadata_dict_start_idx.items():
+ pos_sorted = sorted(list(value.keys()))
+ metadata_sorted_by_start_idx[key] = [value[pos] for pos in pos_sorted]
+
+ for key, value in metadata_dict_end_idx.items():
+ pos_sorted = sorted(list(value.keys()))
+ metadata_sorted_by_end_idx[key] = [value[pos] for pos in pos_sorted]
+
+ metadata_sorted_by_start_idx_simplify = dict()
+ metadata_sorted_by_end_idx_simplify = dict()
+ for key, value in metadata_sorted_by_start_idx.items():
+ metadata_sorted_by_start_idx_simplify[key] = [
+ (metadata_node.value.tag, metadata_node.value.attrs) for metadata_node in value
+ ]
+
+ for key, value in metadata_sorted_by_end_idx.items():
+ metadata_sorted_by_end_idx_simplify[key] = [
+ (metadata_node.value.tag, metadata_node.value.attrs) for metadata_node in value
+ ]
+
+ metadata_sorted_by_start_idx_simplify_true = {
+ 0: [
+ ("body", {"attrs": [], "values": []}),
+ ("h1", {"attrs": ["id"], "values": ["title"]}),
+ ],
+ 29: [
+ ("div", {"attrs": ["class", "id"], "values": ["div-level-1", "1"]}),
+ ("div", {"attrs": ["class", "href"], "values": ["div-level-2", "http"]}),
+ ("div", {"attrs": ["class"], "values": ["div-level-3"]}),
+ ],
+ 44: [("span", {"attrs": ["id"], "values": ["3"]})],
+ 50: [("span", {"attrs": ["id"], "values": ["2"]})],
+ }
+
+ metadata_sorted_by_end_idx_simplify_true = {
+ 28: [("h1", {"attrs": ["id"], "values": ["title"]})],
+ 49: [
+ ("span", {"attrs": ["id"], "values": ["3"]}),
+ ("div", {"attrs": ["class"], "values": ["div-level-3"]}),
+ ],
+ 71: [
+ ("span", {"attrs": ["id"], "values": ["2"]}),
+ ("div", {"attrs": ["class", "href"], "values": ["div-level-2", "http"]}),
+ ],
+ 72: [
+ ("div", {"attrs": ["class", "id"], "values": ["div-level-1", "1"]}),
+ ("body", {"attrs": [], "values": []}),
+ ],
+ }
+
+ assert metadata_sorted_by_start_idx_simplify_true == metadata_sorted_by_start_idx_simplify
+ assert metadata_sorted_by_end_idx_simplify_true == metadata_sorted_by_end_idx_simplify
+
+ check_content_parsing_and_metadata(
+ target_content_plain_text=target_content_plain_text,
+ target_metadata_tags=metadata_tags,
+ metadata=metadata,
+ plain_text=plain_text,
+ )
+
+
+def test_idx_order_with_br():  # Pins the char indices and relative positions reported for tags (including "br") that share a character offset.
+ html = (  # NOTE(review): the markup inside this literal looks stripped by extraction - restore it from the upstream file before relying on it
+ ""
+ "this is a title that we keep
"
+ '
blablabla tidi tidi2
this one keep his tag '
+ ""
+ )
+ plain_text, metadata = get_clean_text_and_metadata(  # called with the HTML only, no keyword options
+ html,
+ )
+
+ metadata_tags = [metadata_node.value.tag for metadata_node in metadata]  # tag names in the order the nodes were returned
+
+ target_content_plain_text = {  # tag -> list of (text, attrs) pairs; presumably the text each tag encloses - confirm in check_content_parsing_and_metadata
+ "body": [
+ (
+ "this is a title that we keep\n" "blablabla tidi tidi2\n" "this one keep his tag\n",
+ {"attrs": [], "values": []},
+ )
+ ],
+ "h1": [
+ (
+ "this is a title that we keep",
+ {"attrs": ["id"], "values": ["title"]},
+ )
+ ],
+ "div": [
+ (
+ "blablabla tidi tidi2\nthis one keep his tag",
+ {"attrs": ["class", "href"], "values": ["div-level-2", "http"]},
+ ),
+ (
+ "blablabla tidi tidi2",
+ {"attrs": ["class"], "values": ["div-level-3"]},
+ ),
+ (
+ "blablabla tidi tidi2\nthis one keep his tag\n",
+ {"attrs": ["class", "id"], "values": ["div-level-1", "1"]},
+ ),
+ ],
+ "span": [
+ (
+ "this one keep his tag",
+ {"attrs": ["id"], "values": ["2"]},
+ ),
+ ("tidi2", {"attrs": ["id"], "values": ["3"]}),
+ ],
+ "br": [  # both "br" entries are expected with empty text and no attributes
+ (
+ "",
+ {"attrs": [], "values": []},
+ ),
+ (
+ "",
+ {"attrs": [], "values": []},
+ ),
+ ],
+ }
+
+ metadata_dict_start_idx = DefaultDict(dict)  # char_start_idx -> {relative_start_pos: (tag, attrs)}
+ metadata_dict_end_idx = DefaultDict(dict)  # char_end_idx -> {relative_end_pos: (tag, attrs)}
+ for metadata_node in metadata:
+ metadata_dict_start_idx[metadata_node.char_start_idx][metadata_node.relative_start_pos] = (
+ metadata_node.value.tag,
+ metadata_node.value.attrs,
+ )
+ metadata_dict_end_idx[metadata_node.char_end_idx][metadata_node.relative_end_pos] = (
+ metadata_node.value.tag,
+ metadata_node.value.attrs,
+ )
+
+ metadata_sorted_by_start_idx_simplify_true = {  # expected start table; the relative positions themselves are compared, not just their order
+ 0: {
+ 0: ("body", {"attrs": [], "values": []}),
+ 1: ("h1", {"attrs": ["id"], "values": ["title"]}),
+ },
+ 29: {  # positions 1 and 6 are missing here because they are listed under index 29 in the end table below
+ 0: ("br", {"attrs": [], "values": []}),
+ 2: ("div", {"attrs": ["class", "id"], "values": ["div-level-1", "1"]}),
+ 3: ("div", {"attrs": ["class", "href"], "values": ["div-level-2", "http"]}),
+ 4: ("div", {"attrs": ["class"], "values": ["div-level-3"]}),
+ 5: ("br", {"attrs": [], "values": []}),
+ },
+ 44: {0: ("span", {"attrs": ["id"], "values": ["3"]})},
+ 50: {0: ("span", {"attrs": ["id"], "values": ["2"]})},
+ }
+
+ metadata_sorted_by_end_idx_simplify_true = {  # expected end table, keyed the same way as the start table
+ 28: {0: ("h1", {"attrs": ["id"], "values": ["title"]})},
+ 29: {  # both "br" nodes are expected to end at the index where they start
+ 1: ("br", {"attrs": [], "values": []}),
+ 6: ("br", {"attrs": [], "values": []}),
+ },
+ 49: {
+ 0: ("span", {"attrs": ["id"], "values": ["3"]}),
+ 1: ("div", {"attrs": ["class"], "values": ["div-level-3"]}),
+ },
+ 71: {
+ 0: ("span", {"attrs": ["id"], "values": ["2"]}),
+ 1: ("div", {"attrs": ["class", "href"], "values": ["div-level-2", "http"]}),
+ },
+ 72: {
+ 0: ("div", {"attrs": ["class", "id"], "values": ["div-level-1", "1"]}),
+ 1: ("body", {"attrs": [], "values": []}),
+ },
+ }
+
+ assert metadata_sorted_by_start_idx_simplify_true == metadata_dict_start_idx  # nested dicts compared directly against the collected table
+ assert metadata_sorted_by_end_idx_simplify_true == metadata_dict_end_idx
+
+ check_content_parsing_and_metadata(  # helper defined outside this view; receives the expected texts plus the parsed output
+ target_content_plain_text=target_content_plain_text,
+ target_metadata_tags=metadata_tags,
+ metadata=metadata,
+ plain_text=plain_text,
+ )
+
+
+def test_convert_br_tag():  # Exercises convert_br_tag_to_breaking_line: "br" must be absent from metadata when it is True and present when it is omitted.
+ html = "" "first line
" "second line" ""
+ plain_text, metadata = get_clean_text_and_metadata(html, convert_br_tag_to_breaking_line=True)  # NOTE(review): the html literals in this test look stripped of markup by extraction - confirm against upstream
+ assert plain_text == "first line\nsecond line\n"
+ assert "br" not in [html_tag.value.tag for html_tag in metadata]  # with the flag on, no "br" node may remain in the metadata
+
+ html = "" "first line
" "second line" ""
+ plain_text, metadata = get_clean_text_and_metadata(html, convert_br_tag_to_breaking_line=True)
+ assert plain_text == "first line\n\n\nsecond line\n"  # three newlines expected - presumably the original fixture had several consecutive <br> tags; verify
+ assert "br" not in [html_tag.value.tag for html_tag in metadata]
+
+ html = "" "first line
" "second line" ""
+ plain_text, metadata = get_clean_text_and_metadata(  # flag omitted, so the function's default applies
+ html,
+ )
+ assert plain_text == "first line\nsecond line\n"
+ assert "br" in [html_tag.value.tag for html_tag in metadata]  # without the flag, "br" is reported as a metadata node
+
+ html = "" "first line
" "second line" ""
+ plain_text, metadata = get_clean_text_and_metadata(html, convert_br_tag_to_breaking_line=True)
+ assert plain_text == "first line\nsecond line\n"  # same expected text as the first case; the fixture presumably differed in markup - verify
+ assert "br" not in [html_tag.value.tag for html_tag in metadata]
diff --git a/tests/test_preprocessing_utils.py b/tests/test_preprocessing_utils.py
index 2061f4e2..1bc3e609 100644
--- a/tests/test_preprocessing_utils.py
+++ b/tests/test_preprocessing_utils.py
@@ -4,7 +4,7 @@
from datasets import Dataset
from mocks.mock_dump_db import MockDumpDB
-from bsmetadata.preprocessing_utils import WebsiteDescPreprocessor
+from bsmetadata.preprocessing_utils import HtmlPreprocessor, WebsiteDescPreprocessor
def mock_sent_tokenize(text):
@@ -51,5 +51,138 @@ def test_website_metadata_processor(self):
self.assertEqual(ds[:]["metadata"], target_metadata)
+class HtmlPreprocessorTester(unittest.TestCase):  # unittest suite for HtmlPreprocessor.preprocess on a four-document toy dataset
+ def setUp(self) -> None:  # builds an HtmlPreprocessor with default arguments before each test
+ self.html_processor = HtmlPreprocessor()
+
+ def test_toy_dataset(self):  # maps preprocess over the toy dataset and compares the "texts" and "metadata" columns with the targets
+ # Define toy data: four HTML documents (NOTE(review): the markup in the strings below looks stripped by extraction - confirm against upstream)
+ my_dict = {
+ "doc_html": [
+ "\n \n \n \n \n This is a title
\n \n \n",
+ "this is a simple paragraph
",
+ "paragraph 1
paragraph 2
",
+ '',
+ ],
+ "metadata": [[], [], [], []],  # every document starts with an empty metadata list
+ }
+
+ # Define target values: expected plain text, one string per document
+ target_texts = [
+ "This is a title\n",
+ "this is a simple paragraph\n",
+ "paragraph 1\nparagraph 2\n",
+ "blablabla\ntidi tidi\n",
+ ]
+
+ target_metadata = [  # expected metadata per document: one dict per HTML tag, with "key" set to "html" and "type" set to "local"
+ [  # document 1: h1 and body
+ {
+ "char_end_idx": 15,
+ "char_start_idx": 0,
+ "html_attrs": {"attrs": [], "values": []},
+ "key": "html",
+ "relative_end_pos": 0,
+ "relative_start_pos": 1,
+ "type": "local",
+ "value": "h1",
+ },
+ {
+ "char_end_idx": 16,
+ "char_start_idx": 0,
+ "html_attrs": {"attrs": [], "values": []},
+ "key": "html",
+ "relative_end_pos": 0,
+ "relative_start_pos": 0,
+ "type": "local",
+ "value": "body",
+ },
+ ],
+ [  # document 2: p and body
+ {
+ "char_end_idx": 26,
+ "char_start_idx": 0,
+ "html_attrs": {"attrs": [], "values": []},
+ "key": "html",
+ "relative_end_pos": 0,
+ "relative_start_pos": 1,
+ "type": "local",
+ "value": "p",
+ },
+ {
+ "char_end_idx": 27,
+ "char_start_idx": 0,
+ "html_attrs": {"attrs": [], "values": []},
+ "key": "html",
+ "relative_end_pos": 0,
+ "relative_start_pos": 0,
+ "type": "local",
+ "value": "body",
+ },
+ ],
+ [  # document 3: two p tags (ids "1" and "2") and body
+ {
+ "char_end_idx": 11,
+ "char_start_idx": 0,
+ "html_attrs": {"attrs": ["id"], "values": ["1"]},
+ "key": "html",
+ "relative_end_pos": 0,
+ "relative_start_pos": 1,
+ "type": "local",
+ "value": "p",
+ },
+ {
+ "char_end_idx": 23,
+ "char_start_idx": 12,
+ "html_attrs": {"attrs": ["id"], "values": ["2"]},
+ "key": "html",
+ "relative_end_pos": 0,
+ "relative_start_pos": 0,
+ "type": "local",
+ "value": "p",
+ },
+ {
+ "char_end_idx": 24,
+ "char_start_idx": 0,
+ "html_attrs": {"attrs": [], "values": []},
+ "key": "html",
+ "relative_end_pos": 0,
+ "relative_start_pos": 0,
+ "type": "local",
+ "value": "body",
+ },
+ ],
+ [  # document 4: one div carrying both class names, and body; both end at index 20
+ {
+ "char_end_idx": 20,
+ "char_start_idx": 0,
+ "html_attrs": {"attrs": ["class"], "values": ["div-level-1 div-level-2"]},
+ "key": "html",
+ "relative_end_pos": 0,
+ "relative_start_pos": 1,
+ "type": "local",
+ "value": "div",
+ },
+ {
+ "char_end_idx": 20,
+ "char_start_idx": 0,
+ "html_attrs": {"attrs": [], "values": []},
+ "key": "html",
+ "relative_end_pos": 1,
+ "relative_start_pos": 0,
+ "type": "local",
+ "value": "body",
+ },
+ ],
+ ]
+
+ # Apply function: batched map with batch_size=3 over 4 examples
+ ds = Dataset.from_dict(my_dict)
+ ds = ds.map(lambda ex: self.html_processor.preprocess(ex), batched=True, batch_size=3)
+
+ self.assertEqual(ds[:]["texts"], target_texts)  # the plain text is read back from the "texts" column
+ self.assertEqual(ds[:]["metadata"], target_metadata)
+
+
if __name__ == "__main__":
unittest.main()