From da38891f7ca4b2afb2f1998f8f826be40b8865a5 Mon Sep 17 00:00:00 2001 From: SaulLu Date: Fri, 27 Aug 2021 14:42:02 +0000 Subject: [PATCH 01/84] simplify code --- bsmetadata/experiments/with_metadata.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bsmetadata/experiments/with_metadata.py b/bsmetadata/experiments/with_metadata.py index 02378a1f..2ef2800b 100644 --- a/bsmetadata/experiments/with_metadata.py +++ b/bsmetadata/experiments/with_metadata.py @@ -68,7 +68,8 @@ def get_dataloaders(tokenizer, args): data_files["train"] = args.train_file if args.validation_file is not None: data_files["validation"] = args.validation_file - extension = args.train_file.split(".")[-1] + + extension = args.train_file.split(".")[-1] if not args.extension else args.extension if extension == "txt": raise ValueError( "You have entered a text file for the train data, but this type of file cannot contain metadata " From 3c1d121aca9038f560c3271f206fc68e34ea5ae6 Mon Sep 17 00:00:00 2001 From: SaulLu Date: Fri, 27 Aug 2021 14:42:51 +0000 Subject: [PATCH 02/84] change how the tags are added - difference between self closing tag and standard tags --- bsmetadata/metadata_utils.py | 62 +++++++++++++++++++++++++++++------- 1 file changed, 51 insertions(+), 11 deletions(-) diff --git a/bsmetadata/metadata_utils.py b/bsmetadata/metadata_utils.py index 7268102a..e54d51f2 100644 --- a/bsmetadata/metadata_utils.py +++ b/bsmetadata/metadata_utils.py @@ -16,6 +16,7 @@ import random from collections import defaultdict from typing import Any, Dict, List, Tuple +from dataclasses import field, dataclass from transformers import PreTrainedTokenizerFast @@ -122,6 +123,14 @@ def create_global_metadata_prefix(example: Dict[str, Any], cfg: DataConfig) -> s return cfg.metadata_sep.join(sorted_metadata) + cfg.global_metadata_sep if sorted_metadata else "" +@dataclass +class MetadataIdxStorage: + start_idx_tag_with_content: dict = field(default_factory=(lambda: defaultdict(list))) + end_idx_tag_with_content: dict = field(default_factory=(lambda: defaultdict(list))) + start_idx_tag_without_content: dict = field(default_factory=(lambda: defaultdict(list))) + end_idx_tag_without_content: dict = field(default_factory=(lambda: defaultdict(list))) + + def add_local_metadata_to_text(example: Dict[str, Any], cfg: DataConfig) -> Tuple[str, List[bool]]: """Adds local metadata (such as HTML tags and entity names) to the given input text. @@ -134,7 +143,7 @@ def add_local_metadata_to_text(example: Dict[str, Any], cfg: DataConfig) -> Tupl - the first element is the text with metadata; - the second element is a boolean mask where `mask[i]` is set iff `text[i]` is some kind of metadata. """ - metadata_start_texts, metadata_end_texts = defaultdict(list), defaultdict(list) + metadata_idx_storage = MetadataIdxStorage() # Filter and sort all metadata so that they are processed in the requested order. 
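A minimal, editor-added sketch (not part of the patch) of the call this hunk implements: it separates tags that wrap a text span from tags whose start and end indices coincide, so that tags sharing a character index come out properly nested. The example values are hypothetical, and it assumes the default `html` processor registered in `bsmetadata.metadata_processors` together with the `DataConfig` fields used elsewhere in this series:

from bsmetadata.input_pipeline import DataConfig
from bsmetadata.metadata_utils import add_local_metadata_to_text

cfg = DataConfig()
cfg.metadata_list = ["html"]
example = {
    "text": "Hello world\n",
    "metadata": [
        # a tag with content: wraps "Hello" (char_start_idx != char_end_idx)
        {"key": "html", "type": "local", "char_start_idx": 0, "char_end_idx": 5,
         "value": {"tag": "i", "attrs": {"attr": [], "value": []}}},
        # a tag without content: char_start_idx == char_end_idx, emitted as an empty element
        {"key": "html", "type": "local", "char_start_idx": 6, "char_end_idx": 6,
         "value": {"tag": "div", "attrs": {"attr": [], "value": []}}},
    ],
}
text_with_metadata, metadata_mask = add_local_metadata_to_text(example, cfg)
# Expected shape of the output, assuming the html processor renders plain tags:
# "<i>Hello</i> <div></div>world\n", with metadata_mask True exactly on the inserted tag characters.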
filtered_metadata = [md for md in example["metadata"] if md["type"] == "local" and md["key"] in cfg.metadata_list] @@ -152,27 +161,58 @@ def add_local_metadata_to_text(example: Dict[str, Any], cfg: DataConfig) -> Tupl char_start_idx = metadata.get("char_start_idx", -1) char_end_idx = metadata.get("char_end_idx", -1) - metadata_start_texts[char_start_idx].insert(0, start_text) - metadata_end_texts[char_end_idx].append(end_text) + if char_start_idx == char_end_idx: + metadata_idx_storage.start_idx_tag_without_content[char_start_idx].insert(0, start_text) + metadata_idx_storage.end_idx_tag_without_content[char_end_idx].append(end_text) + else: + metadata_idx_storage.start_idx_tag_with_content[char_start_idx].insert(0, start_text) + metadata_idx_storage.end_idx_tag_with_content[char_end_idx].append(end_text) # Build the final text with local metadata and the corresponding mask. text_with_local_metadata = [] metadata_mask = [] + def _add_metadata_to_text(metadata_text_list, text_with_local_metadata, metadata_mask): + for metadata_text in metadata_text_list: + text_with_local_metadata.append(metadata_text) + metadata_mask += [True] * len(metadata_text) + for idx, char in enumerate(example["text"]): + if idx in metadata_idx_storage.end_idx_tag_with_content: + metadata_text_list = metadata_idx_storage.end_idx_tag_with_content[idx] + _add_metadata_to_text(metadata_text_list, text_with_local_metadata, metadata_mask) + + if idx in metadata_idx_storage.start_idx_tag_without_content: + metadata_text_list = metadata_idx_storage.start_idx_tag_without_content[idx] + _add_metadata_to_text(metadata_text_list, text_with_local_metadata, metadata_mask) - if idx in metadata_start_texts: - for start_text in metadata_start_texts[idx]: - text_with_local_metadata.append(start_text) - metadata_mask += [True] * len(start_text) + if idx in metadata_idx_storage.end_idx_tag_without_content: + metadata_text_list = metadata_idx_storage.end_idx_tag_without_content[idx] + _add_metadata_to_text(metadata_text_list, text_with_local_metadata, metadata_mask) + + if idx in metadata_idx_storage.start_idx_tag_with_content: + metadata_text_list = metadata_idx_storage.start_idx_tag_with_content[idx] + _add_metadata_to_text(metadata_text_list, text_with_local_metadata, metadata_mask) text_with_local_metadata.append(char) metadata_mask += [False] - if idx + 1 in metadata_end_texts: - for end_text in metadata_end_texts[idx + 1]: - text_with_local_metadata.append(end_text) - metadata_mask += [True] * len(end_text) + idx += 1 + if idx in metadata_idx_storage.end_idx_tag_with_content: + metadata_text_list = metadata_idx_storage.end_idx_tag_with_content[idx] + _add_metadata_to_text(metadata_text_list, text_with_local_metadata, metadata_mask) + + if idx in metadata_idx_storage.start_idx_tag_without_content: + metadata_text_list = metadata_idx_storage.start_idx_tag_without_content[idx] + _add_metadata_to_text(metadata_text_list, text_with_local_metadata, metadata_mask) + + if idx in metadata_idx_storage.end_idx_tag_without_content: + metadata_text_list = metadata_idx_storage.end_idx_tag_without_content[idx] + _add_metadata_to_text(metadata_text_list, text_with_local_metadata, metadata_mask) + + if idx in metadata_idx_storage.start_idx_tag_with_content: + metadata_text_list = metadata_idx_storage.start_idx_tag_with_content[idx] + _add_metadata_to_text(metadata_text_list, text_with_local_metadata, metadata_mask) return "".join(text_with_local_metadata), metadata_mask From 321411f53b0ddeff4f6acc234c50d407a32bc673 Mon Sep 17 00:00:00 2001 From: 
SaulLu Date: Fri, 27 Aug 2021 14:43:32 +0000 Subject: [PATCH 03/84] add test for the new way to add local metadata --- tests/test_metadata_utils.py | 85 +++++++++++++++++++++++++++++++++++- 1 file changed, 84 insertions(+), 1 deletion(-) diff --git a/tests/test_metadata_utils.py b/tests/test_metadata_utils.py index c54c2826..7febfa53 100644 --- a/tests/test_metadata_utils.py +++ b/tests/test_metadata_utils.py @@ -5,7 +5,7 @@ from transformers import GPT2TokenizerFast from bsmetadata.input_pipeline import DataConfig -from bsmetadata.metadata_processors import PROCESSORS, MetadataProcessor +from bsmetadata.metadata_processors import PROCESSORS, MetadataProcessor, HtmlProcessor from bsmetadata.metadata_utils import ( add_local_metadata_to_text, add_metadata_and_chunk_examples, @@ -57,6 +57,79 @@ def setUp(self) -> None: {"key": "url", "type": "global", "value": "callto:RickAndMorty/Year%202021/"}, ], }, + { + "id": "0004", + "text": "useless text The Walking Dead (season 8)\n", + "metadata": [ + { + "char_start_idx": 0, + "value": {"tag": "a", "attrs": {"attr": [], "value": []}}, + "char_end_idx": 12, + "key": "html", + "type": "local", + }, + { + "char_start_idx": 13, + "value": { + "tag": "div", + "attrs": {"attr": ["id", "class"], "value": ["mw-page-base", "noprint"]}, + }, + "char_end_idx": 13, + "key": "html", + "type": "local", + }, + { + "char_start_idx": 13, + "value": { + "tag": "div", + "attrs": {"attr": ["id", "class"], "value": ["mw-head-base", "noprint"]}, + }, + "char_end_idx": 13, + "key": "html", + "type": "local", + }, + { + "char_start_idx": 13, + "value": {"tag": "a", "attrs": {"attr": ["id"], "value": ["top"]}}, + "char_end_idx": 13, + "key": "html", + "type": "local", + }, + { + "char_start_idx": 13, + "value": { + "tag": "div", + "attrs": { + "attr": ["id", "class"], + "value": ["siteNotice centralNotice", "mw-body-content"], + }, + }, + "char_end_idx": 13, + "key": "html", + "type": "local", + }, + { + "char_start_idx": 13, + "value": {"tag": "i", "attrs": {"attr": [], "value": []}}, + "char_end_idx": 29, + "key": "html", + "type": "local", + }, + { + "char_start_idx": 13, + "value": { + "tag": "h1", + "attrs": { + "attr": ["id", "class", "lang"], + "value": ["firstHeading", "firstHeading", "en"], + }, + }, + "char_end_idx": 40, + "key": "html", + "type": "local", + }, + ], + }, ] def test_chunks(self): @@ -133,6 +206,16 @@ def test_add_no_metadata_and_chunk_examples(self): for example in mapped_ds: self.assertTrue(all(not x for x in example["metadata_mask"])) + def test_add_html_tags(self): + cfg = DataConfig() + cfg.metadata_list = ["html"] + PROCESSORS["html"] = HtmlProcessor + + text1, mask1 = add_local_metadata_to_text(self.examples[3], cfg) + target_text = 'useless text
</a> <div id:"siteNotice centralNotice" class:"mw-body-content"><a id:"top"><div id:"mw-head-base" class:"noprint"><div id:"mw-page-base" class:"noprint"></div></div></a></div><h1 id:"firstHeading" class:"firstHeading" lang:"en"><i>The Walking Dead (season 8)</i></h1>
\n' + + self.assertEqual(text1, target_text) + def test_add_metadata_and_chunk_examples(self): cfg = DataConfig() cfg.metadata_list = ["url", "timestamp", "html", "entity"] From 3088c91e2fbe5809ed7299b6523d17561614c7e0 Mon Sep 17 00:00:00 2001 From: SaulLu Date: Fri, 27 Aug 2021 14:43:55 +0000 Subject: [PATCH 04/84] add extension --- bsmetadata/input_pipeline.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bsmetadata/input_pipeline.py b/bsmetadata/input_pipeline.py index 04420869..422c7152 100644 --- a/bsmetadata/input_pipeline.py +++ b/bsmetadata/input_pipeline.py @@ -50,6 +50,7 @@ class DataConfig: cache_dir: Optional[str] = field( default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3?"} ) + extension: Optional[str] = field(default=None, metadata={"help": "the file extension of the dataset"}) preprocessing_num_workers: Optional[int] = field( default=None, metadata={"help": "The number of processes to use for the preprocessing."} ) From eadde2c196d27adde430e0a4cf6fd90e5b4770cc Mon Sep 17 00:00:00 2001 From: SaulLu Date: Fri, 27 Aug 2021 14:44:30 +0000 Subject: [PATCH 05/84] create special html processor --- experiments/html/html_processor.py | 121 +++++++++++++++++++++++++++++ 1 file changed, 121 insertions(+) create mode 100644 experiments/html/html_processor.py diff --git a/experiments/html/html_processor.py b/experiments/html/html_processor.py new file mode 100644 index 00000000..07a8e1ad --- /dev/null +++ b/experiments/html/html_processor.py @@ -0,0 +1,121 @@ +import datetime +from typing import Any, Dict, Optional, Tuple, List +from urllib.parse import unquote_plus +from dataclasses import dataclass + +from bsmetadata.input_pipeline import DataConfig +from bsmetadata.metadata_processors import MetadataProcessor + + +@dataclass +class TagToRemove: + tag: str + content_min_char_length: int = 0 + content_max_char_length: int = float("inf") + + +@dataclass +class HtmlTag: + tag: str + attrs: dict + + +@dataclass +class Metadata: + char_start_idx: int + value: HtmlTag + char_end_idx: Optional[int] = None + key: str = "html" + type: str = "local" + + +class TagFilter: + def __init__( + self, + content_max_char_length: Optional[float] = float("inf"), + content_min_char_length: Optional[float] = 0, + tags_exceptions: Optional[List[str]] = None, + tags_to_remove_alone: Optional[List[TagToRemove]] = None, + ): + self.tags_to_remove_alone = ( + {tag_to_remove.tag: tag_to_remove for tag_to_remove in tags_to_remove_alone} + if isinstance(tags_to_remove_alone, list) + else {} + ) + self.content_max_char_length = content_max_char_length + self.content_min_char_length = content_min_char_length + self.tags_exceptions = tags_exceptions if tags_exceptions else [] + + def drop_tag(self, metadata_node): + tag = str(metadata_node.value.tag) + + drop_tag = False + content_char_length = ( + metadata_node.char_end_idx - metadata_node.char_start_idx if metadata_node.char_end_idx is not None else 0 + ) + if tag in self.tags_to_remove_alone: + tag_to_remove_characteristics = self.tags_to_remove_alone[tag] + if ( + content_char_length <= tag_to_remove_characteristics.content_max_char_length + and content_char_length >= tag_to_remove_characteristics.content_min_char_length + ): + drop_tag = True + + if tag not in self.tags_exceptions: + if ( + content_char_length <= self.content_max_char_length + and content_char_length >= self.content_min_char_length + ): + drop_tag = True + + # raise TypeError(f"tag need to be a string not a {type(tag)}") + return drop_tag + + 
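A rough, editor-added usage sketch for the filter defined above (hypothetical values; the length thresholds are passed explicitly rather than relying on the defaults):

# Drop a tag either because it matches a TagToRemove rule, or because its wrapped
# text length falls inside the generic min/max bounds and it is not an exception.
tag_filter = TagFilter(
    content_max_char_length=0,  # only tags wrapping no text are dropped by the generic rule
    tags_to_remove_alone=[TagToRemove("div", content_max_char_length=0)],
)
empty_div = Metadata(
    char_start_idx=13,
    char_end_idx=13,
    value=HtmlTag(tag="div", attrs={"attr": [], "value": []}),
)
heading = Metadata(
    char_start_idx=0,
    char_end_idx=28,
    value=HtmlTag(tag="h1", attrs={"attr": ["id"], "value": ["title"]}),
)
assert tag_filter.drop_tag(empty_div)      # zero-length content -> dropped
assert not tag_filter.drop_tag(heading)    # wraps 28 characters of text -> kept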
+
+class HtmlProcessor(MetadataProcessor): + """An example metadata processor for HTMl tags.""" + + def __init__( + self, + cfg: DataConfig, + attributes_to_keep=None, + content_max_char_length: Optional[float] = float("inf"), + content_min_char_length: Optional[float] = 0, + tags_exceptions: Optional[List[str]] = None, + tags_to_remove_alone: Optional[List[TagToRemove]] = None, + ): + """ + Args: + cfg: The data configuration to use. + """ + super().__init__(cfg) + self._tag_filter = TagFilter( + tags_to_remove_alone=tags_to_remove_alone, + content_min_char_length=content_min_char_length, + content_max_char_length=content_max_char_length, + tags_exceptions=tags_exceptions, + ) + self._attributes_to_keep = attributes_to_keep + + def process_local(self, metadata_attrs: Dict[str, Any]) -> Optional[Tuple[str, str]]: + # We represent a html tag `T` by enclosing the corresponding text span with "<T>" and "</T>". + # Example: <b>An apple is an edible fruit.</b> + if self._tag_filter.drop_tag( + Metadata( + char_start_idx=metadata_attrs["char_start_idx"], + value=HtmlTag(tag=metadata_attrs["value"]["tag"], attrs=metadata_attrs["value"]["attrs"]), + char_end_idx=metadata_attrs["char_end_idx"], + key=metadata_attrs["key"], + type=metadata_attrs["type"], + ) + ): + return None + + attributes = " ".join( + f'{attr}="{value}"' + for attr, value in zip(metadata_attrs["value"]["attrs"]["attr"], metadata_attrs["value"]["attrs"]["value"]) + if (self._attributes_to_keep is None or attr in self._attributes_to_keep) + ) + if attributes: + attributes = " " + attributes + return f"<{metadata_attrs['value']['tag']}{attributes}>", f"</{metadata_attrs['value']['tag']}>" From 1ef6ba928cf593e817d253dcc1080dcb19abfe7f Mon Sep 17 00:00:00 2001 From: SaulLu Date: Fri, 27 Aug 2021 14:44:53 +0000 Subject: [PATCH 06/84] add attributes to regular html processor --- bsmetadata/metadata_processors.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/bsmetadata/metadata_processors.py b/bsmetadata/metadata_processors.py index f4a3f121..169cead1 100644 --- a/bsmetadata/metadata_processors.py +++ b/bsmetadata/metadata_processors.py @@ -92,7 +92,13 @@ class HtmlProcessor(MetadataProcessor): def process_local(self, metadata_attrs: Dict[str, Any]) -> Optional[Tuple[str, str]]: # We represent a html tag `T` by enclosing the corresponding text span with "<T>" and "</T>". # Example: <b>An apple is an edible fruit.</b> 
- return f"<{metadata_attrs['value']}>", f"" + attributes = " ".join( + f'{attr}:"{value}"' + for attr, value in zip(metadata_attrs["value"]["attrs"]["attr"], metadata_attrs["value"]["attrs"]["value"]) + ) + if attributes: + attributes = " " + attributes + return f"<{metadata_attrs['value']['tag']}{attributes}>", f"" class UrlProcessor(MetadataProcessor): From 26385c752ccf06b0488806b60cad8dfe51cb6d34 Mon Sep 17 00:00:00 2001 From: SaulLu Date: Fri, 27 Aug 2021 14:45:12 +0000 Subject: [PATCH 07/84] add test to custom html processor --- experiments/html/test_html_processor.py | 162 ++++++++++++++++++++++++ 1 file changed, 162 insertions(+) create mode 100644 experiments/html/test_html_processor.py diff --git a/experiments/html/test_html_processor.py b/experiments/html/test_html_processor.py new file mode 100644 index 00000000..199edd64 --- /dev/null +++ b/experiments/html/test_html_processor.py @@ -0,0 +1,162 @@ +import functools +from functools import partial +import unittest + +from transformers import GPT2TokenizerFast + +from bsmetadata.input_pipeline import DataConfig +from bsmetadata.metadata_processors import PROCESSORS, MetadataProcessor +from bsmetadata.metadata_utils import ( + add_local_metadata_to_text, + add_metadata_and_chunk_examples, + chunks, + create_global_metadata_prefix, +) + +from html_processor import HtmlProcessor, TagToRemove + + +class MetadataUtilsTester(unittest.TestCase): + def setUp(self) -> None: + self.tokenizer = GPT2TokenizerFast.from_pretrained("gpt2-xl") + self.examples = [ + { + "id": "0004", + "text": "useless text The Walking Dead (season 8)\n", + "metadata": [ + { + "char_start_idx": 0, + "value": {"tag": "a", "attrs": {"attr": [], "value": []}}, + "char_end_idx": 12, + "key": "html", + "type": "local", + }, + { + "char_start_idx": 13, + "value": { + "tag": "div", + "attrs": {"attr": ["id", "class"], "value": ["mw-page-base", "noprint"]}, + }, + "char_end_idx": 13, + "key": "html", + "type": "local", + }, + { + "char_start_idx": 13, + "value": { + "tag": "div", + "attrs": {"attr": ["id", "class"], "value": ["mw-head-base", "noprint"]}, + }, + "char_end_idx": 13, + "key": "html", + "type": "local", + }, + { + "char_start_idx": 13, + "value": {"tag": "a", "attrs": {"attr": ["id"], "value": ["top"]}}, + "char_end_idx": 13, + "key": "html", + "type": "local", + }, + { + "char_start_idx": 13, + "value": { + "tag": "div", + "attrs": { + "attr": ["id", "class"], + "value": ["siteNotice centralNotice", "mw-body-content"], + }, + }, + "char_end_idx": 13, + "key": "html", + "type": "local", + }, + { + "char_start_idx": 13, + "value": {"tag": "i", "attrs": {"attr": [], "value": []}}, + "char_end_idx": 29, + "key": "html", + "type": "local", + }, + { + "char_start_idx": 13, + "value": { + "tag": "h1", + "attrs": { + "attr": ["id", "class", "lang"], + "value": ["firstHeading", "firstHeading", "en"], + }, + }, + "char_end_idx": 40, + "key": "html", + "type": "local", + }, + ], + }, + { + "id": "0004", + "text": ("this is a title that we keep\n" "blablabla\n" "tidi tidi2 this one keep his tag\n"), + "metadata": [ + { + "char_start_idx": 0, + "value": {"tag": "h1", "attrs": {"attr": ["id"], "value": ["title"]}}, + "char_end_idx": 28, + "key": "html", + "type": "local", + }, + { + "char_start_idx": 50, + "value": {"tag": "span", "attrs": {"attr": ["id"], "value": ["3"]}}, + "char_end_idx": 71, + "key": "html", + "type": "local", + }, + { + "char_start_idx": 29, + "value": { + "tag": "div", + "attrs": { + "attr": ["class", "id", "href"], + "value": ["div-level-1 
div-level-2", "1", "http"], + }, + }, + "char_end_idx": 72, + "key": "html", + "type": "local", + }, + { + "char_start_idx": 0, + "value": {"tag": "body", "attrs": {"attr": [], "value": []}}, + "char_end_idx": 72, + "key": "html", + "type": "local", + }, + ], + }, + ] + + def test_add_html_tags(self): + cfg = DataConfig() + cfg.metadata_list = ["html"] + PROCESSORS["html"] = HtmlProcessor + + text1, mask1 = add_local_metadata_to_text(self.examples[0], cfg) + target_text = 'useless text

The Walking Dead (season 8)

\n' + + self.assertEqual(text1, target_text) + + def test_add_html_tags_remove_tag(self): + cfg = DataConfig() + cfg.metadata_list = ["html"] + tags_to_remove_alone = [TagToRemove("span", content_max_char_length=5), TagToRemove("body")] + PROCESSORS["html"] = partial(HtmlProcessor, tags_to_remove_alone=tags_to_remove_alone) + + text1, mask1 = add_local_metadata_to_text(self.examples[1], cfg) + target_text = ( + '
<h1 id="title">this is a title that we keep</h1>
\n' + '
<div class="div-level-1 div-level-2" id="1" href="http">blablabla\ntidi tidi2 <span id="3">this one keep his tag</span>\n</div>
' + ) + + print(repr(text1)) + + self.assertEqual(text1, target_text) From 8b7655d7c177cade60fd0f3df7b2af590657ea22 Mon Sep 17 00:00:00 2001 From: SaulLu Date: Fri, 27 Aug 2021 14:45:55 +0000 Subject: [PATCH 08/84] add baby training script to test --- experiments/html/start_training.py | 70 ++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) create mode 100644 experiments/html/start_training.py diff --git a/experiments/html/start_training.py b/experiments/html/start_training.py new file mode 100644 index 00000000..9fbede41 --- /dev/null +++ b/experiments/html/start_training.py @@ -0,0 +1,70 @@ +from functools import partial +from bsmetadata.experiments.with_metadata import get_dataloaders +from bsmetadata.input_pipeline import DataConfig +from bsmetadata.metadata_utils import ( + add_metadata_and_chunk_examples, + create_global_metadata_prefix, + add_local_metadata_to_text, + chunks, +) +from bsmetadata.metadata_processors import PROCESSORS +from transformers import AutoTokenizer + +from datasets import load_dataset + +from html_processor import HtmlProcessor, TagToRemove + +tags_to_remove_alone = [ + TagToRemove("body"), + TagToRemove("div", content_max_char_length=0), + TagToRemove("a", content_max_char_length=0), +] +tags_table = ["table" "tr", "th", "td", "caption", "colgroup", "thead", "tfoot", "tbody"] +tags_list = [ + "li", + "ol", + "ul", +] +PROCESSORS["html"] = partial( + HtmlProcessor, + tags_to_remove_alone=tags_to_remove_alone, + attributes_to_keep=["class", "id"], + content_max_char_length=128, + tags_exceptions=[ + *tags_table, + *tags_list, + "span", + ], +) + +args = DataConfig( + train_file="/home/lucile/mini-html-parser/data/v1.0/pre-process-body-v3/nq-train-00.jsonl.gz", + extension="json", + metadata_list=["html"], + preprocessing_num_workers=8 +) +tokenizer = AutoTokenizer.from_pretrained("gpt2") + +# dataloaders = get_dataloaders(tokenizer, args) + +# dataloaders + +# train_dataloader = dataloaders[0] + +# sample = next(iter(train_dataloader)) +# print(tokenizer.convert_ids_to_tokens(sample["input_ids"][0])) +# dataset = load_dataset(args.extension, data_files=[args.train_file]) + +# # dataset["train"][0] + +# examples = dataset["train"][:2] + +# output = add_metadata_and_chunk_examples(examples=examples, tokenizer=tokenizer, cfg=args) + + +# print("******") +# print(tokenizer.decode(output["input_ids"][0])) + + +dataloaders = get_dataloaders(tokenizer, args) +print(dataloaders) \ No newline at end of file From 7ae45dd44317b419c5c50da1de33932c4d59a59f Mon Sep 17 00:00:00 2001 From: SaulLu Date: Fri, 27 Aug 2021 17:00:39 +0000 Subject: [PATCH 09/84] change requirements --- bsmetadata/__init__.py | 0 requirements.txt | 5 +++-- setup.py | 4 ++-- 3 files changed, 5 insertions(+), 4 deletions(-) create mode 100644 bsmetadata/__init__.py diff --git a/bsmetadata/__init__.py b/bsmetadata/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/requirements.txt b/requirements.txt index 8c6321ee..1671047b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,7 @@ torch==1.8.1 hydra_core==1.0.6 wandb==0.10.30 -transformers==4.6.0 +transformers accelerate==0.3.0 -datasets==1.11.0 +git+https://github.com/huggingface/datasets.git + diff --git a/setup.py b/setup.py index 65bf2dea..aa5d1ba3 100644 --- a/setup.py +++ b/setup.py @@ -7,7 +7,7 @@ def req_file(filename): return [x.strip() for x in content] -install_requires = req_file("requirements.txt") +# install_requires = req_file("requirements.txt") setup( name="bsmetadata", @@ -17,5 +17,5 @@ def 
req_file(filename): author_email="xxx", description="Codebase for including metadata (e.g., URLs, timestamps, HTML tags) during language model pretraining.", packages=find_packages(), - install_requires=install_requires, + # install_requires=install_requires, ) From 4d98709ee4930cd4da1c9f084944c8b5fadb9add Mon Sep 17 00:00:00 2001 From: SaulLu Date: Mon, 30 Aug 2021 09:39:46 +0000 Subject: [PATCH 10/84] add do train attribute --- bsmetadata/train.py | 102 ++++++++++++++++++++++---------------------- 1 file changed, 52 insertions(+), 50 deletions(-) diff --git a/bsmetadata/train.py b/bsmetadata/train.py index a5d37049..7eb3334d 100644 --- a/bsmetadata/train.py +++ b/bsmetadata/train.py @@ -45,7 +45,8 @@ class CFG: num_eval: int = field(default=3, metadata={"help": "The number of evaluations to perform during training."}) model_name: str = field(default="gpt2", metadata={"help": "The name of the pretrained model to use."}) project_name: str = field(default="metadata_lm", metadata={"help": "The project name."}) - + do_train: bool = field(default=True, metadata={"help": "Whether to run training."}) + do_eval: bool = field(default=True, metadata={"help": "Whether to run eval on the dev set."}) cs = ConfigStore.instance() cs.store(name="config", node=CFG) @@ -181,55 +182,56 @@ def evaluate(eval_dataloader): model.train() return {"perplexity": perplexity} - progress_bar = tqdm(range(args.max_train_steps), desc="training") - completed_steps = 0 - logger = Logger(is_local_main_process, project=args.project_name, config=args) - for epoch in range(args.num_train_epochs): - model.train() - for step, batch in enumerate(train_dataloader): - # pop labels because we want to calculate loss ourselves - labels = batch.pop("labels") - metadata_mask = batch.pop("metadata_mask", None) - outputs = model(**batch) - batch["labels"] = labels - loss = loss_fn(batch, outputs, metadata_mask) - - logger.log({"loss": loss}) - loss = loss / args.gradient_accumulation_steps - accelerator.backward(loss) - - do_step = step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1 - if do_step: - # accelerator.clip_grad_norm_(model.parameters(), 1.0) - optimizer.step() - scheduler.step() - optimizer.zero_grad() - progress_bar.update(1) - completed_steps += 1 - else: - continue - do_eval = completed_steps > 0 and completed_steps % eval_per_n_step == 0 - if do_eval: - for key, eval_dataloader in eval_dataloaders.items(): - metrics = evaluate(eval_dataloader) - logger.log({key: metrics}) - - # logger.info(f"epoch {epoch}: perplexity: {perplexity}") - if is_local_main_process: - save_dict = { - "epoch": epoch + 1, - "state_dict": accelerator.unwrap_model(model).state_dict(), - "optimizer": optimizer.state_dict(), - "scheduler": scheduler.state_dict(), - } - torch.save( - save_dict, - os.path.join(args.out_dir, f"checkpoint-{completed_steps}step.pt"), - ) - del save_dict - gc.collect() - if completed_steps >= args.max_train_steps: - break + if args.do_train: + progress_bar = tqdm(range(args.max_train_steps), desc="training") + completed_steps = 0 + logger = Logger(is_local_main_process, project=args.project_name, config=args) + for epoch in range(args.num_train_epochs): + model.train() + for step, batch in enumerate(train_dataloader): + # pop labels because we want to calculate loss ourselves + labels = batch.pop("labels") + metadata_mask = batch.pop("metadata_mask", None) + outputs = model(**batch) + batch["labels"] = labels + loss = loss_fn(batch, outputs, metadata_mask) + + logger.log({"loss": loss}) + loss 
= loss / args.gradient_accumulation_steps + accelerator.backward(loss) + + do_step = step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1 + if do_step: + # accelerator.clip_grad_norm_(model.parameters(), 1.0) + optimizer.step() + scheduler.step() + optimizer.zero_grad() + progress_bar.update(1) + completed_steps += 1 + else: + continue + do_eval = args.do_train and completed_steps > 0 and completed_steps % eval_per_n_step == 0 + if do_eval: + for key, eval_dataloader in eval_dataloaders.items(): + metrics = evaluate(eval_dataloader) + logger.log({key: metrics}) + + # logger.info(f"epoch {epoch}: perplexity: {perplexity}") + if is_local_main_process: + save_dict = { + "epoch": epoch + 1, + "state_dict": accelerator.unwrap_model(model).state_dict(), + "optimizer": optimizer.state_dict(), + "scheduler": scheduler.state_dict(), + } + torch.save( + save_dict, + os.path.join(args.out_dir, f"checkpoint-{completed_steps}step.pt"), + ) + del save_dict + gc.collect() + if completed_steps >= args.max_train_steps: + break logger.close() if is_local_main_process and args.out_dir is not None: From d0ca7dca738ef50790bf8f596950c48636f64180 Mon Sep 17 00:00:00 2001 From: SaulLu Date: Mon, 30 Aug 2021 09:40:13 +0000 Subject: [PATCH 11/84] add strat_training_example_script --- experiments/html/start_training.py | 64 +++++++++++++++++------------- 1 file changed, 37 insertions(+), 27 deletions(-) diff --git a/experiments/html/start_training.py b/experiments/html/start_training.py index 9fbede41..67a1684d 100644 --- a/experiments/html/start_training.py +++ b/experiments/html/start_training.py @@ -1,18 +1,21 @@ +import sys from functools import partial + +from datasets import load_dataset +from html_processor import HtmlProcessor, TagToRemove +from transformers import AutoTokenizer + from bsmetadata.experiments.with_metadata import get_dataloaders from bsmetadata.input_pipeline import DataConfig +from bsmetadata.metadata_processors import PROCESSORS from bsmetadata.metadata_utils import ( - add_metadata_and_chunk_examples, - create_global_metadata_prefix, add_local_metadata_to_text, + add_metadata_and_chunk_examples, chunks, + create_global_metadata_prefix, ) -from bsmetadata.metadata_processors import PROCESSORS -from transformers import AutoTokenizer +from bsmetadata.train import main, show_help -from datasets import load_dataset - -from html_processor import HtmlProcessor, TagToRemove tags_to_remove_alone = [ TagToRemove("body"), @@ -37,34 +40,41 @@ ], ) -args = DataConfig( - train_file="/home/lucile/mini-html-parser/data/v1.0/pre-process-body-v3/nq-train-00.jsonl.gz", - extension="json", - metadata_list=["html"], - preprocessing_num_workers=8 -) -tokenizer = AutoTokenizer.from_pretrained("gpt2") +# args = DataConfig( +# train_file="/home/lucile/mini-html-parser/data/v1.0/pre-process-body-v3/nq-train-00.jsonl.gz", +# extension="json", +# metadata_list=["html"], +# preprocessing_num_workers=8 +# ) +# tokenizer = AutoTokenizer.from_pretrained("gpt2") -# dataloaders = get_dataloaders(tokenizer, args) +# # dataloaders = get_dataloaders(tokenizer, args) -# dataloaders +# # dataloaders -# train_dataloader = dataloaders[0] +# # train_dataloader = dataloaders[0] -# sample = next(iter(train_dataloader)) -# print(tokenizer.convert_ids_to_tokens(sample["input_ids"][0])) -# dataset = load_dataset(args.extension, data_files=[args.train_file]) +# # sample = next(iter(train_dataloader)) +# # print(tokenizer.convert_ids_to_tokens(sample["input_ids"][0])) +# # dataset = 
load_dataset(args.extension, data_files=[args.train_file]) -# # dataset["train"][0] +# # # dataset["train"][0] -# examples = dataset["train"][:2] +# # examples = dataset["train"][:2] -# output = add_metadata_and_chunk_examples(examples=examples, tokenizer=tokenizer, cfg=args) +# # output = add_metadata_and_chunk_examples(examples=examples, tokenizer=tokenizer, cfg=args) -# print("******") -# print(tokenizer.decode(output["input_ids"][0])) +# # print("******") +# # print(tokenizer.decode(output["input_ids"][0])) -dataloaders = get_dataloaders(tokenizer, args) -print(dataloaders) \ No newline at end of file +# dataloaders = get_dataloaders(tokenizer, args) +# print(list(next(iter(dataloaders[0])).keys())) +# print(tokenizer.decode(next(iter(dataloaders[0]))["input_ids"][0])) + +if __name__ == "__main__": + if "--help" in sys.argv or "-h" in sys.argv: + show_help() + sys.exit() + main() From a3d60c7e757ff238d69e1b83d4df8e8fa9aae895 Mon Sep 17 00:00:00 2001 From: SaulLu Date: Mon, 30 Aug 2021 09:43:10 +0000 Subject: [PATCH 12/84] format + fix content min char --- bsmetadata/metadata_utils.py | 2 +- experiments/html/html_processor.py | 12 ++++++------ experiments/html/test_html_processor.py | 5 ++--- tests/test_metadata_utils.py | 2 +- 4 files changed, 10 insertions(+), 11 deletions(-) diff --git a/bsmetadata/metadata_utils.py b/bsmetadata/metadata_utils.py index e54d51f2..78941141 100644 --- a/bsmetadata/metadata_utils.py +++ b/bsmetadata/metadata_utils.py @@ -15,8 +15,8 @@ """ import random from collections import defaultdict +from dataclasses import dataclass, field from typing import Any, Dict, List, Tuple -from dataclasses import field, dataclass from transformers import PreTrainedTokenizerFast diff --git a/experiments/html/html_processor.py b/experiments/html/html_processor.py index 07a8e1ad..235bb4b4 100644 --- a/experiments/html/html_processor.py +++ b/experiments/html/html_processor.py @@ -1,7 +1,7 @@ import datetime -from typing import Any, Dict, Optional, Tuple, List -from urllib.parse import unquote_plus from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Tuple +from urllib.parse import unquote_plus from bsmetadata.input_pipeline import DataConfig from bsmetadata.metadata_processors import MetadataProcessor @@ -32,8 +32,8 @@ class Metadata: class TagFilter: def __init__( self, - content_max_char_length: Optional[float] = float("inf"), - content_min_char_length: Optional[float] = 0, + content_max_char_length: Optional[float] = - float("inf"), + content_min_char_length: Optional[float] = - float("inf"), tags_exceptions: Optional[List[str]] = None, tags_to_remove_alone: Optional[List[TagToRemove]] = None, ): @@ -79,8 +79,8 @@ def __init__( self, cfg: DataConfig, attributes_to_keep=None, - content_max_char_length: Optional[float] = float("inf"), - content_min_char_length: Optional[float] = 0, + content_max_char_length: Optional[float] = - float("inf"), + content_min_char_length: Optional[float] = - float("inf"), tags_exceptions: Optional[List[str]] = None, tags_to_remove_alone: Optional[List[TagToRemove]] = None, ): diff --git a/experiments/html/test_html_processor.py b/experiments/html/test_html_processor.py index 199edd64..0b18cedd 100644 --- a/experiments/html/test_html_processor.py +++ b/experiments/html/test_html_processor.py @@ -1,7 +1,8 @@ import functools -from functools import partial import unittest +from functools import partial +from html_processor import HtmlProcessor, TagToRemove from transformers import GPT2TokenizerFast from 
bsmetadata.input_pipeline import DataConfig @@ -13,8 +14,6 @@ create_global_metadata_prefix, ) -from html_processor import HtmlProcessor, TagToRemove - class MetadataUtilsTester(unittest.TestCase): def setUp(self) -> None: diff --git a/tests/test_metadata_utils.py b/tests/test_metadata_utils.py index 7febfa53..0d10fe9d 100644 --- a/tests/test_metadata_utils.py +++ b/tests/test_metadata_utils.py @@ -5,7 +5,7 @@ from transformers import GPT2TokenizerFast from bsmetadata.input_pipeline import DataConfig -from bsmetadata.metadata_processors import PROCESSORS, MetadataProcessor, HtmlProcessor +from bsmetadata.metadata_processors import PROCESSORS, HtmlProcessor, MetadataProcessor from bsmetadata.metadata_utils import ( add_local_metadata_to_text, add_metadata_and_chunk_examples, From 4363a9b011a23c4f3e912743075e2b2b39f10e7d Mon Sep 17 00:00:00 2001 From: SaulLu Date: Mon, 30 Aug 2021 09:43:46 +0000 Subject: [PATCH 13/84] aff file name in addition to dataset name --- bsmetadata/experiments/with_metadata.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/bsmetadata/experiments/with_metadata.py b/bsmetadata/experiments/with_metadata.py index 2ef2800b..66a5c90a 100644 --- a/bsmetadata/experiments/with_metadata.py +++ b/bsmetadata/experiments/with_metadata.py @@ -40,11 +40,18 @@ def get_dataloaders(tokenizer, args): # # In distributed training, the load_dataset function guarantees that only one local process can concurrently # download the dataset. + data_files = {} + if args.train_file is not None: + data_files["train"] = args.train_file + if args.validation_file is not None: + data_files["validation"] = args.validation_file + if args.dataset_name is not None: # Downloading and loading a dataset from the hub. raw_datasets = load_dataset( args.dataset_name, args.dataset_config_name, + data_files=data_files, cache_dir=args.cache_dir, keep_in_memory=False, ) @@ -63,12 +70,6 @@ def get_dataloaders(tokenizer, args): cache_dir=args.cache_dir, ) else: - data_files = {} - if args.train_file is not None: - data_files["train"] = args.train_file - if args.validation_file is not None: - data_files["validation"] = args.validation_file - extension = args.train_file.split(".")[-1] if not args.extension else args.extension if extension == "txt": raise ValueError( From d402effb74c42401368b54a4870868cab64e362e Mon Sep 17 00:00:00 2001 From: SaulLu Date: Mon, 30 Aug 2021 16:22:40 +0200 Subject: [PATCH 14/84] add example script --- experiments/html/example_script.sh | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 experiments/html/example_script.sh diff --git a/experiments/html/example_script.sh b/experiments/html/example_script.sh new file mode 100644 index 00000000..1a501528 --- /dev/null +++ b/experiments/html/example_script.sh @@ -0,0 +1,11 @@ +python experiments/html/start_training.py \ +data_config.experiment="with_metadata" \ +data_config.metadata_list=["html"] \ +data_config.max_seq_len=1024 \ +data_config.dataset_name="SaulLu/Natural_Questions_HTML_Toy" \ +data_config.train_file="nq-train-*.jsonl.gz" \ +data_config.validation_file="nq-dev-*.jsonl.gz" \ +data_config.extension="json" \ +data_config.preprocessing_num_workers=6 \ +do_train=False \ +do_eval=False \ From 070b285f7412139f440e85069e8f7fc2f085a459 Mon Sep 17 00:00:00 2001 From: SaulLu Date: Mon, 30 Aug 2021 16:23:46 +0200 Subject: [PATCH 15/84] add html parser dataclass --- bsmetadata/train.py | 2 +- experiments/html/html_processor.py | 72 +++++++++++----- experiments/html/start_training.py | 108 
++++++++++++++---------- experiments/html/test_html_processor.py | 2 +- 4 files changed, 116 insertions(+), 68 deletions(-) diff --git a/bsmetadata/train.py b/bsmetadata/train.py index 7eb3334d..a3b75cf0 100644 --- a/bsmetadata/train.py +++ b/bsmetadata/train.py @@ -232,7 +232,7 @@ def evaluate(eval_dataloader): gc.collect() if completed_steps >= args.max_train_steps: break - logger.close() + logger.close() if is_local_main_process and args.out_dir is not None: accelerator.wait_for_everyone() diff --git a/experiments/html/html_processor.py b/experiments/html/html_processor.py index 235bb4b4..fd959803 100644 --- a/experiments/html/html_processor.py +++ b/experiments/html/html_processor.py @@ -1,5 +1,5 @@ import datetime -from dataclasses import dataclass +from dataclasses import dataclass, field from typing import Any, Dict, List, Optional, Tuple from urllib.parse import unquote_plus @@ -10,8 +10,8 @@ @dataclass class TagToRemove: tag: str - content_min_char_length: int = 0 - content_max_char_length: int = float("inf") + txt_min_chr_len: int = 0 + txt_max_chr_len: int = float("inf") @dataclass @@ -29,11 +29,38 @@ class Metadata: type: str = "local" +@dataclass +class AllTagsRules: + attributes_to_keep: List[str] = field(default_factory=(lambda: []), metadata={"help": "TODO."}) + txt_max_chr_len: float = field(default= -float("inf"), metadata={"help": "TODO."}) + txt_min_chr_len: float = field(default= -float("inf"), metadata={"help": "TODO."}) + tags_exceptions_to_txt_max_min_chr_len: List[str] = field( + default_factory=(lambda: []), metadata={"help": "TODO."} + ) + + +@dataclass +class HTMLParserConfig: + all_tags_rules: AllTagsRules = AllTagsRules() + tags_to_remove_alone_tag_name: List[str] = field( + default_factory=(lambda: []), + metadata={"help": "TODO."}, + ) + tags_to_remove_alone_txt_max_chr_len: List[float] = field( + default_factory=(lambda: []), + metadata={"help": "TODO."}, + ) + tags_to_remove_alone_txt_min_chr_len: List[float] = field( + default_factory=(lambda: []), + metadata={"help": "TODO."}, + ) + + class TagFilter: def __init__( self, - content_max_char_length: Optional[float] = - float("inf"), - content_min_char_length: Optional[float] = - float("inf"), + txt_max_chr_len: Optional[float] = -float("inf"), + txt_min_chr_len: Optional[float] = -float("inf"), tags_exceptions: Optional[List[str]] = None, tags_to_remove_alone: Optional[List[TagToRemove]] = None, ): @@ -42,8 +69,8 @@ def __init__( if isinstance(tags_to_remove_alone, list) else {} ) - self.content_max_char_length = content_max_char_length - self.content_min_char_length = content_min_char_length + self.txt_max_chr_len = txt_max_chr_len + self.txt_min_chr_len = txt_min_chr_len self.tags_exceptions = tags_exceptions if tags_exceptions else [] def drop_tag(self, metadata_node): @@ -56,16 +83,13 @@ def drop_tag(self, metadata_node): if tag in self.tags_to_remove_alone: tag_to_remove_characteristics = self.tags_to_remove_alone[tag] if ( - content_char_length <= tag_to_remove_characteristics.content_max_char_length - and content_char_length >= tag_to_remove_characteristics.content_min_char_length + content_char_length <= tag_to_remove_characteristics.txt_max_chr_len + and content_char_length >= tag_to_remove_characteristics.txt_min_chr_len ): drop_tag = True if tag not in self.tags_exceptions: - if ( - content_char_length <= self.content_max_char_length - and content_char_length >= self.content_min_char_length - ): + if content_char_length <= self.txt_max_chr_len and content_char_length >= self.txt_min_chr_len: 
drop_tag = True # raise TypeError(f"tag need to be a string not a {type(tag)}") @@ -78,21 +102,29 @@ class HtmlProcessor(MetadataProcessor): def __init__( self, cfg: DataConfig, - attributes_to_keep=None, - content_max_char_length: Optional[float] = - float("inf"), - content_min_char_length: Optional[float] = - float("inf"), - tags_exceptions: Optional[List[str]] = None, - tags_to_remove_alone: Optional[List[TagToRemove]] = None, ): """ Args: cfg: The data configuration to use. """ super().__init__(cfg) + attributes_to_keep = cfg.html_parser_config.all_tags_rules.attributes_to_keep + txt_max_chr_len = cfg.html_parser_config.all_tags_rules.txt_max_chr_len + txt_min_chr_len = cfg.html_parser_config.all_tags_rules.txt_min_chr_len + tags_exceptions = cfg.html_parser_config.all_tags_rules.tags_exceptions_to_txt_max_min_chr_len + tags_to_remove_alone = [ + TagToRemove(tag=tag, txt_max_chr_len=txt_max_chr_len, txt_min_chr_len=txt_min_chr_len) + for (tag, txt_max_chr_len, txt_min_chr_len) in zip( + cfg.html_parser_config.tags_to_remove_alone_tag_name, + cfg.html_parser_config.tags_to_remove_alone_txt_max_chr_len, + cfg.html_parser_config.tags_to_remove_alone_txt_min_chr_len, + ) + ] + self._tag_filter = TagFilter( tags_to_remove_alone=tags_to_remove_alone, - content_min_char_length=content_min_char_length, - content_max_char_length=content_max_char_length, + txt_min_chr_len=txt_min_chr_len, + txt_max_chr_len=txt_max_chr_len, tags_exceptions=tags_exceptions, ) self._attributes_to_keep = attributes_to_keep diff --git a/experiments/html/start_training.py b/experiments/html/start_training.py index 67a1684d..d592ffe7 100644 --- a/experiments/html/start_training.py +++ b/experiments/html/start_training.py @@ -1,8 +1,11 @@ import sys +from dataclasses import dataclass, field from functools import partial +from typing import Optional, List, Any, Tuple from datasets import load_dataset -from html_processor import HtmlProcessor, TagToRemove +from html_processor import HtmlProcessor, TagToRemove, HTMLParserConfig, AllTagsRules +from hydra.core.config_store import ConfigStore from transformers import AutoTokenizer from bsmetadata.experiments.with_metadata import get_dataloaders @@ -19,8 +22,8 @@ tags_to_remove_alone = [ TagToRemove("body"), - TagToRemove("div", content_max_char_length=0), - TagToRemove("a", content_max_char_length=0), + TagToRemove("div", txt_max_chr_len=0), + TagToRemove("a", txt_max_chr_len=0), ] tags_table = ["table" "tr", "th", "td", "caption", "colgroup", "thead", "tfoot", "tbody"] tags_list = [ @@ -28,50 +31,63 @@ "ol", "ul", ] -PROCESSORS["html"] = partial( - HtmlProcessor, - tags_to_remove_alone=tags_to_remove_alone, - attributes_to_keep=["class", "id"], - content_max_char_length=128, - tags_exceptions=[ - *tags_table, - *tags_list, - "span", - ], -) - -# args = DataConfig( -# train_file="/home/lucile/mini-html-parser/data/v1.0/pre-process-body-v3/nq-train-00.jsonl.gz", -# extension="json", -# metadata_list=["html"], -# preprocessing_num_workers=8 -# ) -# tokenizer = AutoTokenizer.from_pretrained("gpt2") - -# # dataloaders = get_dataloaders(tokenizer, args) - -# # dataloaders - -# # train_dataloader = dataloaders[0] - -# # sample = next(iter(train_dataloader)) -# # print(tokenizer.convert_ids_to_tokens(sample["input_ids"][0])) -# # dataset = load_dataset(args.extension, data_files=[args.train_file]) - -# # # dataset["train"][0] - -# # examples = dataset["train"][:2] - -# # output = add_metadata_and_chunk_examples(examples=examples, tokenizer=tokenizer, cfg=args) - - -# # 
print("******") -# # print(tokenizer.decode(output["input_ids"][0])) - +attributes_to_keep = ["class", "id"] +txt_max_chr_len = 128 +txt_min_chr_len = -float("inf") +tags_exceptions = [ + *tags_table, + *tags_list, + "span", +] -# dataloaders = get_dataloaders(tokenizer, args) -# print(list(next(iter(dataloaders[0])).keys())) -# print(tokenizer.decode(next(iter(dataloaders[0]))["input_ids"][0])) +PROCESSORS["html"] = HtmlProcessor + + +@dataclass +class DataConfigWithHTML(DataConfig): + html_parser_config: HTMLParserConfig = HTMLParserConfig( + AllTagsRules( + attributes_to_keep=attributes_to_keep, + txt_max_chr_len=txt_max_chr_len, + txt_min_chr_len=txt_min_chr_len, + tags_exceptions_to_txt_max_min_chr_len=tags_exceptions + ), + tags_to_remove_alone_tag_name=[tag_to_remove.tag for tag_to_remove in tags_to_remove_alone], + tags_to_remove_alone_txt_max_chr_len=[tag_to_remove.txt_max_chr_len for tag_to_remove in tags_to_remove_alone], + tags_to_remove_alone_txt_min_chr_len=[tag_to_remove.txt_min_chr_len for tag_to_remove in tags_to_remove_alone] + ) + + +@dataclass +class CFG: + data_config: DataConfigWithHTML = DataConfigWithHTML() + weight_decay: float = field(default=0.0, metadata={"help": "The weight decay to use for training."}) + learning_rate: float = field(default=5e-5, metadata={"help": "The initial learning rate."}) + gradient_accumulation_steps: int = field( + default=1, + metadata={"help": "The number of gradient accumulation steps to perform before updating model parameters."}, + ) + num_train_epochs: int = field(default=1, metadata={"help": "The number of epochs to train the model for."}) + max_train_steps: Optional[int] = field( + default=None, metadata={"help": "The maximum number of training steps (overrides num_train_epochs)."} + ) + lr_scheduler_type: str = field(default="linear", metadata={"help": "The type of learning rate schedule to use."}) + num_warmup_steps: int = field( + default=1000, metadata={"help": "The number of warmup steps during which the learning rate is increased."} + ) + seed: int = field(default=42, metadata={"help": "The seed used for RNG initialization."}) + out_dir: str = field( + default="output_dir", metadata={"help": "The output directory in which the trained model is saved."} + ) + num_eval: int = field(default=3, metadata={"help": "The number of evaluations to perform during training."}) + model_name: str = field(default="gpt2", metadata={"help": "The name of the pretrained model to use."}) + project_name: str = field(default="metadata_lm", metadata={"help": "The project name."}) + do_train: bool = field(default=True, metadata={"help": "Whether to run training."}) + do_eval: bool = field(default=True, metadata={"help": "Whether to run eval on the dev set."}) + + +cs = ConfigStore.instance() +cs.store(name="config", node=CFG) if __name__ == "__main__": if "--help" in sys.argv or "-h" in sys.argv: diff --git a/experiments/html/test_html_processor.py b/experiments/html/test_html_processor.py index 0b18cedd..89f05208 100644 --- a/experiments/html/test_html_processor.py +++ b/experiments/html/test_html_processor.py @@ -147,7 +147,7 @@ def test_add_html_tags(self): def test_add_html_tags_remove_tag(self): cfg = DataConfig() cfg.metadata_list = ["html"] - tags_to_remove_alone = [TagToRemove("span", content_max_char_length=5), TagToRemove("body")] + tags_to_remove_alone = [TagToRemove("span", txt_max_chr_len=5), TagToRemove("body")] PROCESSORS["html"] = partial(HtmlProcessor, tags_to_remove_alone=tags_to_remove_alone) text1, mask1 = 
add_local_metadata_to_text(self.examples[1], cfg) From 830744d682d7f633c7694bda7b2deadd8f94fec1 Mon Sep 17 00:00:00 2001 From: SaulLu Date: Mon, 30 Aug 2021 18:13:23 +0200 Subject: [PATCH 16/84] add hash --- experiments/html/hash_investigate.txt | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 experiments/html/hash_investigate.txt diff --git a/experiments/html/hash_investigate.txt b/experiments/html/hash_investigate.txt new file mode 100644 index 00000000..6554bf94 --- /dev/null +++ b/experiments/html/hash_investigate.txt @@ -0,0 +1,20 @@ +experiment hash key | value: d054d90240b85e16 | 344c3dc4b5cf8646 +per_device_eval_batch_size hash key | value: 67dae3bf9c1036df | d3680cf2f313c8fc +per_device_train_batch_size hash key | value: 530047abd92ccf24 | d3680cf2f313c8fc +metadata_list hash key | value: 4bd839407ac25327 | 1334f528d7dbe450 +metadata_sep hash key | value: 1726fe05238500f6 | 040238c86b64577a +metadata_key_value_sep hash key | value: af649de4f4919728 | 33dd19cdc1300084 +metadata_probability hash key | value: cdcd0263a3874515 | fea5c87c419b5847 +global_metadata_sep hash key | value: 52422b0116a8167e | 9d5790dd15155529 +max_seq_len hash key | value: 09f0965bf157c2bd | 3fd3922da80411e2 +dataset_name hash key | value: 8cdcb241ac2c7029 | 1928a0ada5a73e07 +dataset_config_name hash key | value: b1e8d2394418d360 | 4d8c0405832b0f7e +train_file hash key | value: dd1e5e2401c1bcfa | b0123c2d43e2982d +validation_file hash key | value: 93f8998bdf1c060f | faae0565261050bc +overwrite_cache hash key | value: 0053471c52542656 | aef99bec791b0f18 +cache_dir hash key | value: 136582c7d6f5b69d | 4d8c0405832b0f7e +extension hash key | value: fb3553b1bec24680 | f6caa0871f48acbd +preprocessing_num_workers hash key | value: c55b211054a90a87 | 433aeac0a2f51423 +validation_split_percentage hash key | value: c8a6d71d862df501 | 9897c07112dce998 +block_size hash key | value: 81dc640ef3b11f40 | 4d8c0405832b0f7e +html_parser_config hash key | value: 3844a6686d6b07c7 | b507f2f4a7200b74 \ No newline at end of file From a8e3f4fe98b34449349128ef115c3e0a180ac7b7 Mon Sep 17 00:00:00 2001 From: SaulLu Date: Mon, 30 Aug 2021 19:01:31 +0200 Subject: [PATCH 17/84] add html --- experiments/html/hash_investigate.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/experiments/html/hash_investigate.txt b/experiments/html/hash_investigate.txt index 6554bf94..c504de12 100644 --- a/experiments/html/hash_investigate.txt +++ b/experiments/html/hash_investigate.txt @@ -17,4 +17,5 @@ extension hash key | value: fb3553b1bec24680 | f6caa0871f48acbd preprocessing_num_workers hash key | value: c55b211054a90a87 | 433aeac0a2f51423 validation_split_percentage hash key | value: c8a6d71d862df501 | 9897c07112dce998 block_size hash key | value: 81dc640ef3b11f40 | 4d8c0405832b0f7e -html_parser_config hash key | value: 3844a6686d6b07c7 | b507f2f4a7200b74 \ No newline at end of file +html_parser_config hash key | value: 3844a6686d6b07c7 | b507f2f4a7200b74 +"['html']" \ No newline at end of file From 7a1b699e9f90b528be499e75cee0fa41ad8f2359 Mon Sep 17 00:00:00 2001 From: SaulLu Date: Mon, 30 Aug 2021 20:15:37 +0200 Subject: [PATCH 18/84] format + make hashables the args --- bsmetadata/train.py | 4 +++ experiments/html/html_processor.py | 9 +++---- experiments/html/start_training.py | 34 +++++++++---------------- experiments/html/test_html_processor.py | 34 +++++++++++++++++-------- 4 files changed, 43 insertions(+), 38 deletions(-) diff --git a/bsmetadata/train.py b/bsmetadata/train.py index 
a3b75cf0..6d41374e 100644 --- a/bsmetadata/train.py +++ b/bsmetadata/train.py @@ -48,6 +48,7 @@ class CFG: do_train: bool = field(default=True, metadata={"help": "Whether to run training."}) do_eval: bool = field(default=True, metadata={"help": "Whether to run eval on the dev set."}) + cs = ConfigStore.instance() cs.store(name="config", node=CFG) @@ -109,6 +110,9 @@ def loss_fn(batch, outputs, metadata_mask=None): def main(args: CFG) -> None: print(OmegaConf.to_yaml(args)) + # The following line is very important for the object to be hashable (property used by datasets) + args = OmegaConf.to_object(args) + set_seed(args.seed) accelerator = Accelerator() is_local_main_process = accelerator.is_local_main_process diff --git a/experiments/html/html_processor.py b/experiments/html/html_processor.py index fd959803..0a7ed0bd 100644 --- a/experiments/html/html_processor.py +++ b/experiments/html/html_processor.py @@ -1,4 +1,3 @@ -import datetime from dataclasses import dataclass, field from typing import Any, Dict, List, Optional, Tuple from urllib.parse import unquote_plus @@ -32,11 +31,9 @@ class Metadata: @dataclass class AllTagsRules: attributes_to_keep: List[str] = field(default_factory=(lambda: []), metadata={"help": "TODO."}) - txt_max_chr_len: float = field(default= -float("inf"), metadata={"help": "TODO."}) - txt_min_chr_len: float = field(default= -float("inf"), metadata={"help": "TODO."}) - tags_exceptions_to_txt_max_min_chr_len: List[str] = field( - default_factory=(lambda: []), metadata={"help": "TODO."} - ) + txt_max_chr_len: float = field(default=-float("inf"), metadata={"help": "TODO."}) + txt_min_chr_len: float = field(default=-float("inf"), metadata={"help": "TODO."}) + tags_exceptions_to_txt_max_min_chr_len: List[str] = field(default_factory=(lambda: []), metadata={"help": "TODO."}) @dataclass diff --git a/experiments/html/start_training.py b/experiments/html/start_training.py index d592ffe7..039d4a2e 100644 --- a/experiments/html/start_training.py +++ b/experiments/html/start_training.py @@ -1,22 +1,12 @@ import sys from dataclasses import dataclass, field -from functools import partial -from typing import Optional, List, Any, Tuple +from typing import Optional -from datasets import load_dataset -from html_processor import HtmlProcessor, TagToRemove, HTMLParserConfig, AllTagsRules +from html_processor import AllTagsRules, HTMLParserConfig, HtmlProcessor, TagToRemove from hydra.core.config_store import ConfigStore -from transformers import AutoTokenizer -from bsmetadata.experiments.with_metadata import get_dataloaders from bsmetadata.input_pipeline import DataConfig from bsmetadata.metadata_processors import PROCESSORS -from bsmetadata.metadata_utils import ( - add_local_metadata_to_text, - add_metadata_and_chunk_examples, - chunks, - create_global_metadata_prefix, -) from bsmetadata.train import main, show_help @@ -45,16 +35,16 @@ @dataclass class DataConfigWithHTML(DataConfig): - html_parser_config: HTMLParserConfig = HTMLParserConfig( - AllTagsRules( - attributes_to_keep=attributes_to_keep, - txt_max_chr_len=txt_max_chr_len, - txt_min_chr_len=txt_min_chr_len, - tags_exceptions_to_txt_max_min_chr_len=tags_exceptions - ), - tags_to_remove_alone_tag_name=[tag_to_remove.tag for tag_to_remove in tags_to_remove_alone], - tags_to_remove_alone_txt_max_chr_len=[tag_to_remove.txt_max_chr_len for tag_to_remove in tags_to_remove_alone], - tags_to_remove_alone_txt_min_chr_len=[tag_to_remove.txt_min_chr_len for tag_to_remove in tags_to_remove_alone] + html_parser_config: HTMLParserConfig = 
HTMLParserConfig( + AllTagsRules( + attributes_to_keep=attributes_to_keep, + txt_max_chr_len=txt_max_chr_len, + txt_min_chr_len=txt_min_chr_len, + tags_exceptions_to_txt_max_min_chr_len=tags_exceptions, + ), + tags_to_remove_alone_tag_name=[tag_to_remove.tag for tag_to_remove in tags_to_remove_alone], + tags_to_remove_alone_txt_max_chr_len=[tag_to_remove.txt_max_chr_len for tag_to_remove in tags_to_remove_alone], + tags_to_remove_alone_txt_min_chr_len=[tag_to_remove.txt_min_chr_len for tag_to_remove in tags_to_remove_alone], ) diff --git a/experiments/html/test_html_processor.py b/experiments/html/test_html_processor.py index 89f05208..402863bc 100644 --- a/experiments/html/test_html_processor.py +++ b/experiments/html/test_html_processor.py @@ -1,12 +1,10 @@ -import functools import unittest -from functools import partial -from html_processor import HtmlProcessor, TagToRemove +from html_processor import AllTagsRules, HTMLParserConfig, HtmlProcessor, TagToRemove +from start_training import DataConfigWithHTML from transformers import GPT2TokenizerFast -from bsmetadata.input_pipeline import DataConfig -from bsmetadata.metadata_processors import PROCESSORS, MetadataProcessor +from bsmetadata.metadata_processors import PROCESSORS from bsmetadata.metadata_utils import ( add_local_metadata_to_text, add_metadata_and_chunk_examples, @@ -135,20 +133,36 @@ def setUp(self) -> None: ] def test_add_html_tags(self): - cfg = DataConfig() + cfg = DataConfigWithHTML( + html_parser_config=HTMLParserConfig( + all_tags_rules=AllTagsRules(attributes_to_keep=["class", "id", "href"]) + ) + ) cfg.metadata_list = ["html"] PROCESSORS["html"] = HtmlProcessor text1, mask1 = add_local_metadata_to_text(self.examples[0], cfg) - target_text = 'useless text
The Walking Dead (season 8)
\n' + target_text = 'useless text
The Walking Dead (season 8)
\n' self.assertEqual(text1, target_text) def test_add_html_tags_remove_tag(self): - cfg = DataConfig() - cfg.metadata_list = ["html"] tags_to_remove_alone = [TagToRemove("span", txt_max_chr_len=5), TagToRemove("body")] - PROCESSORS["html"] = partial(HtmlProcessor, tags_to_remove_alone=tags_to_remove_alone) + + cfg = DataConfigWithHTML( + html_parser_config=HTMLParserConfig( + all_tags_rules=AllTagsRules(attributes_to_keep=["class", "id", "href"]), + tags_to_remove_alone_tag_name=[tag_to_remove.tag for tag_to_remove in tags_to_remove_alone], + tags_to_remove_alone_txt_max_chr_len=[ + tag_to_remove.txt_max_chr_len for tag_to_remove in tags_to_remove_alone + ], + tags_to_remove_alone_txt_min_chr_len=[ + tag_to_remove.txt_min_chr_len for tag_to_remove in tags_to_remove_alone + ], + ) + ) + cfg.metadata_list = ["html"] + PROCESSORS["html"] = HtmlProcessor text1, mask1 = add_local_metadata_to_text(self.examples[1], cfg) target_text = ( From b3a20df2b40b73293bd4fcfcb9c7935489b24d65 Mon Sep 17 00:00:00 2001 From: SaulLu Date: Mon, 30 Aug 2021 20:34:03 +0200 Subject: [PATCH 19/84] change requirements for the new method used --- requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 1671047b..7eb152ba 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,8 @@ torch==1.8.1 -hydra_core==1.0.6 +hydra_core==1.1.* wandb==0.10.30 transformers accelerate==0.3.0 +omegaconf==2.1.1 git+https://github.com/huggingface/datasets.git From 70205a173c57606dcae7b5a1dbdb3e9427b0f261 Mon Sep 17 00:00:00 2001 From: SaulLu Date: Tue, 31 Aug 2021 14:09:00 +0200 Subject: [PATCH 20/84] add script to run experiment --- .../init_experiment/create_dataset.slurm | 34 +++++++++ .../SLURM/init_experiment/do_training.slurm | 4 + .../SLURM/init_experiment/load_dataset.py | 73 +++++++++++++++++++ .../SLURM/init_experiment/load_dataset.slurm | 27 +++++++ .../init_experiment}/start_training.py | 0 5 files changed, 138 insertions(+) create mode 100644 experiments/html/SLURM/init_experiment/create_dataset.slurm create mode 100644 experiments/html/SLURM/init_experiment/do_training.slurm create mode 100644 experiments/html/SLURM/init_experiment/load_dataset.py create mode 100644 experiments/html/SLURM/init_experiment/load_dataset.slurm rename experiments/html/{ => SLURM/init_experiment}/start_training.py (100%) diff --git a/experiments/html/SLURM/init_experiment/create_dataset.slurm b/experiments/html/SLURM/init_experiment/create_dataset.slurm new file mode 100644 index 00000000..ade2c583 --- /dev/null +++ b/experiments/html/SLURM/init_experiment/create_dataset.slurm @@ -0,0 +1,34 @@ + +#!/bin/bash +#SBATCH --job-name=modelling-metadata-html-create-dataset-test # job name + +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! 
+#SBATCH --cpus-per-task=8 # number of cores per tasks +#SBATCH --hint=nomultithread # we get physical cores not logical +#SBATCH --gres=gpu:0 # number of gpus +#SBATCH --time 00:02:00 # maximum execution time (HH:MM:SS) +#SBATCH --output=%x-%j.out # output file name +#SBATCH --error=%x-%j.err # error file name +#SBATCH --account=six@cpu # account +#SBATCH -p prepost # partition with internet + +set -x -e + +source $HOME/start-user + +nvidia-smi + +cd $WORK/repos/metadata/ + +python experiments/html/start_training.py \ +data_config.experiment="with_metadata" \ +data_config.metadata_list=["html"] \ +data_config.max_seq_len=1024 \ +data_config.dataset_name="SaulLu/Natural_Questions_HTML_Toy" \ +data_config.train_file="nq-train-*.jsonl.gz" \ +data_config.validation_file="nq-dev-*.jsonl.gz" \ +data_config.extension="json" \ +data_config.preprocessing_num_workers=8 \ +do_train=False \ +do_eval=False \ \ No newline at end of file diff --git a/experiments/html/SLURM/init_experiment/do_training.slurm b/experiments/html/SLURM/init_experiment/do_training.slurm new file mode 100644 index 00000000..bc925e22 --- /dev/null +++ b/experiments/html/SLURM/init_experiment/do_training.slurm @@ -0,0 +1,4 @@ + +export HF_DATASETS_OFFLINE=1 +export TRANSFORMERS_OFFLINE=1 +export WANDB_MODE=offline \ No newline at end of file diff --git a/experiments/html/SLURM/init_experiment/load_dataset.py b/experiments/html/SLURM/init_experiment/load_dataset.py new file mode 100644 index 00000000..3d69ee2e --- /dev/null +++ b/experiments/html/SLURM/init_experiment/load_dataset.py @@ -0,0 +1,73 @@ +import hydra +import sys +from datasets import load_dataset +from hydra.core.config_store import ConfigStore + +from bsmetadata.input_pipeline import DataConfig +from bsmetadata.train import show_help + +cs = ConfigStore.instance() +cs.store(name="data_config", node=DataConfig) + +@hydra.main(config_name="config") +def main(args: DataConfig) -> None: + data_files = {} + if args.train_file is not None: + data_files["train"] = args.train_file + if args.validation_file is not None: + data_files["validation"] = args.validation_file + + if args.dataset_name is not None: + # Downloading and loading a dataset from the hub. + raw_datasets = load_dataset( + args.dataset_name, + args.dataset_config_name, + data_files=data_files, + cache_dir=args.cache_dir, + keep_in_memory=False, + ) + + if "validation" not in raw_datasets.keys(): + raw_datasets["validation"] = load_dataset( + args.dataset_name, + args.dataset_config_name, + split=f"train[:{args.validation_split_percentage}%]", + cache_dir=args.cache_dir, + ) + raw_datasets["train"] = load_dataset( + args.dataset_name, + args.dataset_config_name, + split=f"train[{args.validation_split_percentage}%:]", + cache_dir=args.cache_dir, + ) + else: + extension = args.train_file.split(".")[-1] if not args.extension else args.extension + if extension == "txt": + raise ValueError( + "You have entered a text file for the train data, but this type of file cannot contain metadata " + "columns. Wouldn't you rather have a file in json/jsonl or pandas format?" 
+ ) + if extension == "jsonl": + extension = "json" + raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=args.cache_dir) + + if "validation" not in raw_datasets.keys(): + raw_datasets["validation"] = load_dataset( + extension, + data_files=data_files, + split=f"train[:{args.validation_split_percentage}%]", + cache_dir=args.cache_dir, + ) + raw_datasets["train"] = load_dataset( + extension, + data_files=data_files, + split=f"train[{args.validation_split_percentage}%:]", + cache_dir=args.cache_dir, + ) + + +if __name__ == "__main__": + if "--help" in sys.argv or "-h" in sys.argv: + show_help() + sys.exit() + main() \ No newline at end of file diff --git a/experiments/html/SLURM/init_experiment/load_dataset.slurm b/experiments/html/SLURM/init_experiment/load_dataset.slurm new file mode 100644 index 00000000..3187730c --- /dev/null +++ b/experiments/html/SLURM/init_experiment/load_dataset.slurm @@ -0,0 +1,27 @@ + +#!/bin/bash +#SBATCH --job-name=modelling-metadata-html-create-dataset-test # job name + +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! +#SBATCH --cpus-per-task=8 # number of cores per tasks +#SBATCH --hint=nomultithread # we get physical cores not logical +#SBATCH --gres=gpu:0 # number of gpus +#SBATCH --time 00:02:00 # maximum execution time (HH:MM:SS) +#SBATCH --output=%x-%j.out # output file name +#SBATCH --error=%x-%j.err # error file name +#SBATCH --account=six@cpu # account +#SBATCH -p prepost # partition with internet + +set -x -e + +source $HOME/start-user + +nvidia-smi + +cd $WORK/repos/metadata/ + +python experiments/html/SLURM/init_experiment/load_dataset.py \ +dataset_name="SaulLu/Natural_Questions_HTML_Toy" \ +train_file="nq-train-*.jsonl.gz" \ +validation_file="nq-dev-*.jsonl.gz" \ No newline at end of file diff --git a/experiments/html/start_training.py b/experiments/html/SLURM/init_experiment/start_training.py similarity index 100% rename from experiments/html/start_training.py rename to experiments/html/SLURM/init_experiment/start_training.py From cb3f9de08779a0d92f0720ad73e9180ea660666f Mon Sep 17 00:00:00 2001 From: SaulLu Date: Tue, 31 Aug 2021 14:11:59 +0200 Subject: [PATCH 21/84] fix batch script --- experiments/html/SLURM/init_experiment/load_dataset.slurm | 1 - 1 file changed, 1 deletion(-) diff --git a/experiments/html/SLURM/init_experiment/load_dataset.slurm b/experiments/html/SLURM/init_experiment/load_dataset.slurm index 3187730c..485e3d44 100644 --- a/experiments/html/SLURM/init_experiment/load_dataset.slurm +++ b/experiments/html/SLURM/init_experiment/load_dataset.slurm @@ -1,4 +1,3 @@ - #!/bin/bash #SBATCH --job-name=modelling-metadata-html-create-dataset-test # job name From ad808d3ae760dc2575113547c40c3ed4d046e8ea Mon Sep 17 00:00:00 2001 From: SaulLu Date: Tue, 31 Aug 2021 15:05:24 +0200 Subject: [PATCH 22/84] fix code --- .../html/SLURM/init_experiment/load_dataset.py | 2 +- .../html/SLURM/init_experiment/load_dataset.slurm | 11 ++++------- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/experiments/html/SLURM/init_experiment/load_dataset.py b/experiments/html/SLURM/init_experiment/load_dataset.py index 3d69ee2e..60bca6d2 100644 --- a/experiments/html/SLURM/init_experiment/load_dataset.py +++ b/experiments/html/SLURM/init_experiment/load_dataset.py @@ -9,7 +9,7 @@ cs = ConfigStore.instance() cs.store(name="data_config", node=DataConfig) -@hydra.main(config_name="config") +@hydra.main(config_name="data_config") def main(args: DataConfig) -> None: data_files = {} if 
args.train_file is not None: diff --git a/experiments/html/SLURM/init_experiment/load_dataset.slurm b/experiments/html/SLURM/init_experiment/load_dataset.slurm index 485e3d44..5c40d79a 100644 --- a/experiments/html/SLURM/init_experiment/load_dataset.slurm +++ b/experiments/html/SLURM/init_experiment/load_dataset.slurm @@ -1,23 +1,20 @@ #!/bin/bash #SBATCH --job-name=modelling-metadata-html-create-dataset-test # job name - #SBATCH --nodes=1 #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! #SBATCH --cpus-per-task=8 # number of cores per tasks #SBATCH --hint=nomultithread # we get physical cores not logical #SBATCH --gres=gpu:0 # number of gpus #SBATCH --time 00:02:00 # maximum execution time (HH:MM:SS) -#SBATCH --output=%x-%j.out # output file name -#SBATCH --error=%x-%j.err # error file name -#SBATCH --account=six@cpu # account -#SBATCH -p prepost # partition with internet +#SBATCH --output=%x-%j.out # output file name +#SBATCH --error=%x-%j.err # error file name +#SBATCH --account=six@cpu # account +#SBATCH -p compil # partition with internet set -x -e source $HOME/start-user -nvidia-smi - cd $WORK/repos/metadata/ python experiments/html/SLURM/init_experiment/load_dataset.py \ From 4618c394625ff93ff94a5f6be9cd3d942ddff9d4 Mon Sep 17 00:00:00 2001 From: SaulLu Date: Tue, 31 Aug 2021 15:07:12 +0200 Subject: [PATCH 23/84] change alloc --- experiments/html/SLURM/init_experiment/load_dataset.slurm | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/experiments/html/SLURM/init_experiment/load_dataset.slurm b/experiments/html/SLURM/init_experiment/load_dataset.slurm index 5c40d79a..56ff6172 100644 --- a/experiments/html/SLURM/init_experiment/load_dataset.slurm +++ b/experiments/html/SLURM/init_experiment/load_dataset.slurm @@ -8,8 +8,8 @@ #SBATCH --time 00:02:00 # maximum execution time (HH:MM:SS) #SBATCH --output=%x-%j.out # output file name #SBATCH --error=%x-%j.err # error file name -#SBATCH --account=six@cpu # account -#SBATCH -p compil # partition with internet +#SBATCH --account=six@gpu # account +#SBATCH -p compil # partition with internet set -x -e From c4969af0dc030104cb4468b783c228e21edc4855 Mon Sep 17 00:00:00 2001 From: SaulLu Date: Tue, 31 Aug 2021 15:17:49 +0200 Subject: [PATCH 24/84] adjust bash script for preprocess --- experiments/html/SLURM/init_experiment/create_dataset.slurm | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/experiments/html/SLURM/init_experiment/create_dataset.slurm b/experiments/html/SLURM/init_experiment/create_dataset.slurm index ade2c583..bd80ccd0 100644 --- a/experiments/html/SLURM/init_experiment/create_dataset.slurm +++ b/experiments/html/SLURM/init_experiment/create_dataset.slurm @@ -1,24 +1,20 @@ #!/bin/bash #SBATCH --job-name=modelling-metadata-html-create-dataset-test # job name - #SBATCH --nodes=1 #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! 
#SBATCH --cpus-per-task=8 # number of cores per tasks #SBATCH --hint=nomultithread # we get physical cores not logical #SBATCH --gres=gpu:0 # number of gpus -#SBATCH --time 00:02:00 # maximum execution time (HH:MM:SS) +#SBATCH --time 00:10:00 # maximum execution time (HH:MM:SS) #SBATCH --output=%x-%j.out # output file name #SBATCH --error=%x-%j.err # error file name #SBATCH --account=six@cpu # account -#SBATCH -p prepost # partition with internet set -x -e source $HOME/start-user -nvidia-smi - cd $WORK/repos/metadata/ python experiments/html/start_training.py \ From e0c7246f6f2792049a539a320fa8a1161b06e894 Mon Sep 17 00:00:00 2001 From: SaulLu Date: Tue, 31 Aug 2021 15:18:54 +0200 Subject: [PATCH 25/84] space top file removed --- experiments/html/SLURM/init_experiment/create_dataset.slurm | 1 - 1 file changed, 1 deletion(-) diff --git a/experiments/html/SLURM/init_experiment/create_dataset.slurm b/experiments/html/SLURM/init_experiment/create_dataset.slurm index bd80ccd0..4f61be78 100644 --- a/experiments/html/SLURM/init_experiment/create_dataset.slurm +++ b/experiments/html/SLURM/init_experiment/create_dataset.slurm @@ -1,4 +1,3 @@ - #!/bin/bash #SBATCH --job-name=modelling-metadata-html-create-dataset-test # job name #SBATCH --nodes=1 From 7335e349a441a961a561023b966a029f1107d2e7 Mon Sep 17 00:00:00 2001 From: SaulLu Date: Tue, 31 Aug 2021 15:19:34 +0200 Subject: [PATCH 26/84] remove ngpu --- experiments/html/SLURM/init_experiment/create_dataset.slurm | 1 - 1 file changed, 1 deletion(-) diff --git a/experiments/html/SLURM/init_experiment/create_dataset.slurm b/experiments/html/SLURM/init_experiment/create_dataset.slurm index 4f61be78..035a003f 100644 --- a/experiments/html/SLURM/init_experiment/create_dataset.slurm +++ b/experiments/html/SLURM/init_experiment/create_dataset.slurm @@ -4,7 +4,6 @@ #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! #SBATCH --cpus-per-task=8 # number of cores per tasks #SBATCH --hint=nomultithread # we get physical cores not logical -#SBATCH --gres=gpu:0 # number of gpus #SBATCH --time 00:10:00 # maximum execution time (HH:MM:SS) #SBATCH --output=%x-%j.out # output file name #SBATCH --error=%x-%j.err # error file name From 6a77f4a896f3463a4467cd06cc653c72dc65157c Mon Sep 17 00:00:00 2001 From: SaulLu Date: Tue, 31 Aug 2021 15:22:20 +0200 Subject: [PATCH 27/84] fix path --- experiments/html/SLURM/init_experiment/load_dataset.slurm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/experiments/html/SLURM/init_experiment/load_dataset.slurm b/experiments/html/SLURM/init_experiment/load_dataset.slurm index 56ff6172..e8c36d4d 100644 --- a/experiments/html/SLURM/init_experiment/load_dataset.slurm +++ b/experiments/html/SLURM/init_experiment/load_dataset.slurm @@ -1,5 +1,5 @@ #!/bin/bash -#SBATCH --job-name=modelling-metadata-html-create-dataset-test # job name +#SBATCH --job-name=modelling-metadata-html-download-dataset-test # job name #SBATCH --nodes=1 #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! 
#SBATCH --cpus-per-task=8 # number of cores per tasks From 18d0465550d128d662653e803dfe13767b418816 Mon Sep 17 00:00:00 2001 From: SaulLu Date: Tue, 31 Aug 2021 15:23:40 +0200 Subject: [PATCH 28/84] fix path --- experiments/html/SLURM/init_experiment/create_dataset.slurm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/experiments/html/SLURM/init_experiment/create_dataset.slurm b/experiments/html/SLURM/init_experiment/create_dataset.slurm index 035a003f..e6377d10 100644 --- a/experiments/html/SLURM/init_experiment/create_dataset.slurm +++ b/experiments/html/SLURM/init_experiment/create_dataset.slurm @@ -15,7 +15,7 @@ source $HOME/start-user cd $WORK/repos/metadata/ -python experiments/html/start_training.py \ +python experiments/html/SLURM/init_experiment/start_training.py \ data_config.experiment="with_metadata" \ data_config.metadata_list=["html"] \ data_config.max_seq_len=1024 \ From 8532f1bf92de62372e017755816d599f725503df Mon Sep 17 00:00:00 2001 From: SaulLu Date: Tue, 31 Aug 2021 15:26:30 +0200 Subject: [PATCH 29/84] refactor --- experiments/html/SLURM/init_experiment/create_dataset.slurm | 2 +- experiments/html/{SLURM/init_experiment => }/start_training.py | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename experiments/html/{SLURM/init_experiment => }/start_training.py (100%) diff --git a/experiments/html/SLURM/init_experiment/create_dataset.slurm b/experiments/html/SLURM/init_experiment/create_dataset.slurm index e6377d10..035a003f 100644 --- a/experiments/html/SLURM/init_experiment/create_dataset.slurm +++ b/experiments/html/SLURM/init_experiment/create_dataset.slurm @@ -15,7 +15,7 @@ source $HOME/start-user cd $WORK/repos/metadata/ -python experiments/html/SLURM/init_experiment/start_training.py \ +python experiments/html/start_training.py \ data_config.experiment="with_metadata" \ data_config.metadata_list=["html"] \ data_config.max_seq_len=1024 \ diff --git a/experiments/html/SLURM/init_experiment/start_training.py b/experiments/html/start_training.py similarity index 100% rename from experiments/html/SLURM/init_experiment/start_training.py rename to experiments/html/start_training.py From b0ddd7f64c3c2cd4afe999bee04258123b96872f Mon Sep 17 00:00:00 2001 From: SaulLu Date: Tue, 31 Aug 2021 15:43:08 +0200 Subject: [PATCH 30/84] add config_path --- bsmetadata/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bsmetadata/train.py b/bsmetadata/train.py index 6d41374e..65fa007a 100644 --- a/bsmetadata/train.py +++ b/bsmetadata/train.py @@ -106,7 +106,7 @@ def loss_fn(batch, outputs, metadata_mask=None): return loss -@hydra.main(config_name="config") +@hydra.main(config_path=None, config_name="config") def main(args: CFG) -> None: print(OmegaConf.to_yaml(args)) From 55e7cd40b1109408b44cc0c906c7f8eede7338d9 Mon Sep 17 00:00:00 2001 From: SaulLu Date: Tue, 31 Aug 2021 16:31:34 +0200 Subject: [PATCH 31/84] add logging --- bsmetadata/experiments/with_metadata.py | 7 +++++++ bsmetadata/train.py | 21 ++++++++++++++++----- 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/bsmetadata/experiments/with_metadata.py b/bsmetadata/experiments/with_metadata.py index 66a5c90a..5ad1679c 100644 --- a/bsmetadata/experiments/with_metadata.py +++ b/bsmetadata/experiments/with_metadata.py @@ -46,6 +46,8 @@ def get_dataloaders(tokenizer, args): if args.validation_file is not None: data_files["validation"] = args.validation_file + logger.info("Start to load dataset") + logger.warning("Start to load dataset") if args.dataset_name is not None: # 
Downloading and loading a dataset from the hub. raw_datasets = load_dataset( @@ -93,12 +95,14 @@ def get_dataloaders(tokenizer, args): split=f"train[{args.validation_split_percentage}%:]", cache_dir=args.cache_dir, ) + logger.info("Dataset loaded") # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. # Preprocessing the datasets. column_names = raw_datasets["train"].column_names + logger.info("Start to add metadata and chunk examples") # First we pre-process our text and metadata lm_datasets = raw_datasets.map( functools.partial(add_metadata_and_chunk_examples, tokenizer=tokenizer, cfg=args), @@ -108,11 +112,13 @@ def get_dataloaders(tokenizer, args): desc="Pre-process the text and metadata to create new samples", remove_columns=column_names, ) + logger.info("Add metadata and chunk examples finished") def create_labels_column(examples): examples["labels"] = examples["input_ids"].copy() return examples + logger.info("Create labels column") # Then we add the column containing the labels lm_datasets = lm_datasets.map( create_labels_column, @@ -121,6 +127,7 @@ def create_labels_column(examples): load_from_cache_file=not args.overwrite_cache, desc="Create labels column", ) + logger.info("Creating labels column finished") train_dataset = lm_datasets["train"] val_dataset = lm_datasets["validation"] diff --git a/bsmetadata/train.py b/bsmetadata/train.py index 65fa007a..02af462f 100644 --- a/bsmetadata/train.py +++ b/bsmetadata/train.py @@ -1,4 +1,5 @@ import dataclasses +import logging import gc import json import math @@ -120,10 +121,18 @@ def main(args: CFG) -> None: os.makedirs(args.out_dir, exist_ok=True) + # Setup logging + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO if is_local_main_process else logging.WARN, + ) + # get dataloaders tokenizer = AutoTokenizer.from_pretrained(args.model_name) tokenizer.pad_token = tokenizer.eos_token train_dataloader, eval_dataloaders = get_dataloaders(tokenizer, args.data_config) + logger.info("The dataloaders have been build") # get model model = AutoModelForCausalLM.from_pretrained(args.model_name) @@ -187,9 +196,11 @@ def evaluate(eval_dataloader): return {"perplexity": perplexity} if args.do_train: + # Train! 
+ logger.info("***** Running training *****") progress_bar = tqdm(range(args.max_train_steps), desc="training") completed_steps = 0 - logger = Logger(is_local_main_process, project=args.project_name, config=args) + logger_metrics = Logger(is_local_main_process, project=args.project_name, config=args) for epoch in range(args.num_train_epochs): model.train() for step, batch in enumerate(train_dataloader): @@ -200,7 +211,7 @@ def evaluate(eval_dataloader): batch["labels"] = labels loss = loss_fn(batch, outputs, metadata_mask) - logger.log({"loss": loss}) + logger_metrics.log({"loss": loss}) loss = loss / args.gradient_accumulation_steps accelerator.backward(loss) @@ -218,9 +229,9 @@ def evaluate(eval_dataloader): if do_eval: for key, eval_dataloader in eval_dataloaders.items(): metrics = evaluate(eval_dataloader) - logger.log({key: metrics}) + logger_metrics.log({key: metrics}) - # logger.info(f"epoch {epoch}: perplexity: {perplexity}") + # logger_metrics.info(f"epoch {epoch}: perplexity: {perplexity}") if is_local_main_process: save_dict = { "epoch": epoch + 1, @@ -236,7 +247,7 @@ def evaluate(eval_dataloader): gc.collect() if completed_steps >= args.max_train_steps: break - logger.close() + logger_metrics.close() if is_local_main_process and args.out_dir is not None: accelerator.wait_for_everyone() From c5c0663c78685d417681f93f9bdf863e1ed57944 Mon Sep 17 00:00:00 2001 From: SaulLu Date: Tue, 31 Aug 2021 17:04:14 +0200 Subject: [PATCH 32/84] load model and dataset --- .../load_tokenizer_and_model.py | 26 +++++++++++++++++++ .../load_tokenizer_and_model.slurm | 21 +++++++++++++++ 2 files changed, 47 insertions(+) create mode 100644 experiments/html/SLURM/loading_scripts/load_tokenizer_and_model.py create mode 100644 experiments/html/SLURM/loading_scripts/load_tokenizer_and_model.slurm diff --git a/experiments/html/SLURM/loading_scripts/load_tokenizer_and_model.py b/experiments/html/SLURM/loading_scripts/load_tokenizer_and_model.py new file mode 100644 index 00000000..2585239c --- /dev/null +++ b/experiments/html/SLURM/loading_scripts/load_tokenizer_and_model.py @@ -0,0 +1,26 @@ +import hydra +import sys +from datasets import load_dataset +from hydra.core.config_store import ConfigStore +from transformers import AdamW, AutoModelForCausalLM, AutoTokenizer + +from bsmetadata.input_pipeline import DataConfig +from bsmetadata.train import show_help, CFG + +cs = ConfigStore.instance() +cs.store(name="config", node=CFG) + +@hydra.main(config_name="config") +def main(args: CFG) -> None: + # get dataloaders + tokenizer = AutoTokenizer.from_pretrained(args.model_name) + + # get model + model = AutoModelForCausalLM.from_pretrained(args.model_name) + + +if __name__ == "__main__": + if "--help" in sys.argv or "-h" in sys.argv: + show_help() + sys.exit() + main() \ No newline at end of file diff --git a/experiments/html/SLURM/loading_scripts/load_tokenizer_and_model.slurm b/experiments/html/SLURM/loading_scripts/load_tokenizer_and_model.slurm new file mode 100644 index 00000000..d504b821 --- /dev/null +++ b/experiments/html/SLURM/loading_scripts/load_tokenizer_and_model.slurm @@ -0,0 +1,21 @@ +#!/bin/bash +#SBATCH --job-name=modelling-metadata-html-download-dataset-test # job name +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! 
+#SBATCH --cpus-per-task=8 # number of cores per tasks +#SBATCH --hint=nomultithread # we get physical cores not logical +#SBATCH --gres=gpu:0 # number of gpus +#SBATCH --time 00:02:00 # maximum execution time (HH:MM:SS) +#SBATCH --output=%x-%j.out # output file name +#SBATCH --error=%x-%j.err # error file name +#SBATCH --account=six@gpu # account +#SBATCH -p compil # partition with internet + +set -x -e + +source $HOME/start-user + +cd $WORK/repos/metadata/ + +python experiments/html/SLURM/loading_scripts/load_tokenizer_and_model.py \ +model_name=gpt2 \ \ No newline at end of file From 266a5ef5a82cc6d71d1bff83b65ddc70ae226fe5 Mon Sep 17 00:00:00 2001 From: SaulLu Date: Tue, 31 Aug 2021 17:06:15 +0200 Subject: [PATCH 33/84] hydra config_path --- .../html/SLURM/loading_scripts/load_tokenizer_and_model.py | 2 +- .../html/SLURM/loading_scripts/load_tokenizer_and_model.slurm | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/experiments/html/SLURM/loading_scripts/load_tokenizer_and_model.py b/experiments/html/SLURM/loading_scripts/load_tokenizer_and_model.py index 2585239c..deafe6af 100644 --- a/experiments/html/SLURM/loading_scripts/load_tokenizer_and_model.py +++ b/experiments/html/SLURM/loading_scripts/load_tokenizer_and_model.py @@ -10,7 +10,7 @@ cs = ConfigStore.instance() cs.store(name="config", node=CFG) -@hydra.main(config_name="config") +@hydra.main(config_path=None, config_name="config") def main(args: CFG) -> None: # get dataloaders tokenizer = AutoTokenizer.from_pretrained(args.model_name) diff --git a/experiments/html/SLURM/loading_scripts/load_tokenizer_and_model.slurm b/experiments/html/SLURM/loading_scripts/load_tokenizer_and_model.slurm index d504b821..f46b320f 100644 --- a/experiments/html/SLURM/loading_scripts/load_tokenizer_and_model.slurm +++ b/experiments/html/SLURM/loading_scripts/load_tokenizer_and_model.slurm @@ -1,11 +1,11 @@ #!/bin/bash -#SBATCH --job-name=modelling-metadata-html-download-dataset-test # job name +#SBATCH --job-name=modelling-metadata-html-download-tokenizer-and-model # job name #SBATCH --nodes=1 #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! 
#SBATCH --cpus-per-task=8 # number of cores per tasks #SBATCH --hint=nomultithread # we get physical cores not logical #SBATCH --gres=gpu:0 # number of gpus -#SBATCH --time 00:02:00 # maximum execution time (HH:MM:SS) +#SBATCH --time 00:10:00 # maximum execution time (HH:MM:SS) #SBATCH --output=%x-%j.out # output file name #SBATCH --error=%x-%j.err # error file name #SBATCH --account=six@gpu # account From 040cb12fb1fc451cf7314cfc66bd14c337c92425 Mon Sep 17 00:00:00 2001 From: SaulLu Date: Tue, 31 Aug 2021 17:10:52 +0200 Subject: [PATCH 34/84] add logging --- bsmetadata/train.py | 5 ++++- .../html/SLURM/init_experiment/load_dataset.py | 7 +++++-- .../loading_scripts/load_tokenizer_and_model.py | 17 ++++++++++++++--- experiments/html/start_training.py | 1 + 4 files changed, 24 insertions(+), 6 deletions(-) diff --git a/bsmetadata/train.py b/bsmetadata/train.py index 02af462f..e674fc4f 100644 --- a/bsmetadata/train.py +++ b/bsmetadata/train.py @@ -1,7 +1,7 @@ import dataclasses -import logging import gc import json +import logging import math import os import sys @@ -22,6 +22,9 @@ from bsmetadata.input_pipeline import DataConfig, get_dataloaders +logger = logging.getLogger(__name__) + + @dataclass class CFG: data_config: DataConfig = DataConfig() diff --git a/experiments/html/SLURM/init_experiment/load_dataset.py b/experiments/html/SLURM/init_experiment/load_dataset.py index 60bca6d2..fe97171d 100644 --- a/experiments/html/SLURM/init_experiment/load_dataset.py +++ b/experiments/html/SLURM/init_experiment/load_dataset.py @@ -1,14 +1,17 @@ -import hydra import sys + +import hydra from datasets import load_dataset from hydra.core.config_store import ConfigStore from bsmetadata.input_pipeline import DataConfig from bsmetadata.train import show_help + cs = ConfigStore.instance() cs.store(name="data_config", node=DataConfig) + @hydra.main(config_name="data_config") def main(args: DataConfig) -> None: data_files = {} @@ -70,4 +73,4 @@ def main(args: DataConfig) -> None: if "--help" in sys.argv or "-h" in sys.argv: show_help() sys.exit() - main() \ No newline at end of file + main() diff --git a/experiments/html/SLURM/loading_scripts/load_tokenizer_and_model.py b/experiments/html/SLURM/loading_scripts/load_tokenizer_and_model.py index deafe6af..cfe848c0 100644 --- a/experiments/html/SLURM/loading_scripts/load_tokenizer_and_model.py +++ b/experiments/html/SLURM/loading_scripts/load_tokenizer_and_model.py @@ -1,15 +1,26 @@ -import hydra +import logging import sys + +import hydra from datasets import load_dataset from hydra.core.config_store import ConfigStore from transformers import AdamW, AutoModelForCausalLM, AutoTokenizer from bsmetadata.input_pipeline import DataConfig -from bsmetadata.train import show_help, CFG +from bsmetadata.train import CFG, show_help + + +# Setup logging +logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO +) + +logger = logging.getLogger(__name__) cs = ConfigStore.instance() cs.store(name="config", node=CFG) + @hydra.main(config_path=None, config_name="config") def main(args: CFG) -> None: # get dataloaders @@ -23,4 +34,4 @@ def main(args: CFG) -> None: if "--help" in sys.argv or "-h" in sys.argv: show_help() sys.exit() - main() \ No newline at end of file + main() diff --git a/experiments/html/start_training.py b/experiments/html/start_training.py index 039d4a2e..cdcb7a7a 100644 --- a/experiments/html/start_training.py +++ b/experiments/html/start_training.py @@ -1,3 +1,4 @@ +import logging 
import sys from dataclasses import dataclass, field from typing import Optional From e33566f1f499d39ebfabedee33ee70616d34a6d5 Mon Sep 17 00:00:00 2001 From: SaulLu Date: Tue, 31 Aug 2021 17:17:18 +0200 Subject: [PATCH 35/84] set transformers login info --- .../html/SLURM/loading_scripts/load_tokenizer_and_model.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/experiments/html/SLURM/loading_scripts/load_tokenizer_and_model.py b/experiments/html/SLURM/loading_scripts/load_tokenizer_and_model.py index cfe848c0..8c6cb94b 100644 --- a/experiments/html/SLURM/loading_scripts/load_tokenizer_and_model.py +++ b/experiments/html/SLURM/loading_scripts/load_tokenizer_and_model.py @@ -1,6 +1,7 @@ import logging import sys +import transformers.utils.logging as logging_transformers import hydra from datasets import load_dataset from hydra.core.config_store import ConfigStore @@ -14,6 +15,9 @@ logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO ) +logging_transformers.set_verbosity_info() +logging_transformers.enable_default_handler() +logging_transformers.enable_explicit_format() logger = logging.getLogger(__name__) From 5dd779bd7894e596315059502d4b026750f8a12c Mon Sep 17 00:00:00 2001 From: SaulLu Date: Tue, 31 Aug 2021 17:28:02 +0200 Subject: [PATCH 36/84] add loggings --- bsmetadata/train.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bsmetadata/train.py b/bsmetadata/train.py index e674fc4f..3f22de84 100644 --- a/bsmetadata/train.py +++ b/bsmetadata/train.py @@ -132,8 +132,10 @@ def main(args: CFG) -> None: ) # get dataloaders + logger.info("Load tokenizer") tokenizer = AutoTokenizer.from_pretrained(args.model_name) tokenizer.pad_token = tokenizer.eos_token + logger.info("Load dataloaders") train_dataloader, eval_dataloaders = get_dataloaders(tokenizer, args.data_config) logger.info("The dataloaders have been build") From e509b6dfc528e4e3d6f5a40d4ec9cc83a6b0a7cf Mon Sep 17 00:00:00 2001 From: SaulLu Date: Tue, 31 Aug 2021 17:48:44 +0200 Subject: [PATCH 37/84] add offilne mode --- .../html/SLURM/loading_scripts/load_tokenizer_and_model.slurm | 3 +++ 1 file changed, 3 insertions(+) diff --git a/experiments/html/SLURM/loading_scripts/load_tokenizer_and_model.slurm b/experiments/html/SLURM/loading_scripts/load_tokenizer_and_model.slurm index f46b320f..928c9244 100644 --- a/experiments/html/SLURM/loading_scripts/load_tokenizer_and_model.slurm +++ b/experiments/html/SLURM/loading_scripts/load_tokenizer_and_model.slurm @@ -15,6 +15,9 @@ set -x -e source $HOME/start-user +export HF_DATASETS_OFFLINE=1 +export TRANSFORMERS_OFFLINE=1 + cd $WORK/repos/metadata/ python experiments/html/SLURM/loading_scripts/load_tokenizer_and_model.py \ From 4d982a27d9f1b0a9a0e897d8e490f7f5ae84a951 Mon Sep 17 00:00:00 2001 From: SaulLu Date: Tue, 31 Aug 2021 17:49:29 +0200 Subject: [PATCH 38/84] add offline mode --- experiments/html/SLURM/init_experiment/create_dataset.slurm | 3 +++ .../html/SLURM/loading_scripts/load_tokenizer_and_model.slurm | 3 --- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/experiments/html/SLURM/init_experiment/create_dataset.slurm b/experiments/html/SLURM/init_experiment/create_dataset.slurm index 035a003f..9d36f76b 100644 --- a/experiments/html/SLURM/init_experiment/create_dataset.slurm +++ b/experiments/html/SLURM/init_experiment/create_dataset.slurm @@ -13,6 +13,9 @@ set -x -e source $HOME/start-user +export HF_DATASETS_OFFLINE=1 +export TRANSFORMERS_OFFLINE=1 + cd $WORK/repos/metadata/ python 
experiments/html/start_training.py \ diff --git a/experiments/html/SLURM/loading_scripts/load_tokenizer_and_model.slurm b/experiments/html/SLURM/loading_scripts/load_tokenizer_and_model.slurm index 928c9244..f46b320f 100644 --- a/experiments/html/SLURM/loading_scripts/load_tokenizer_and_model.slurm +++ b/experiments/html/SLURM/loading_scripts/load_tokenizer_and_model.slurm @@ -15,9 +15,6 @@ set -x -e source $HOME/start-user -export HF_DATASETS_OFFLINE=1 -export TRANSFORMERS_OFFLINE=1 - cd $WORK/repos/metadata/ python experiments/html/SLURM/loading_scripts/load_tokenizer_and_model.py \ From 28a9095786f47554d8c6faa8936c27a195d10fdf Mon Sep 17 00:00:00 2001 From: SaulLu Date: Tue, 31 Aug 2021 19:15:12 +0200 Subject: [PATCH 39/84] add logging info --- bsmetadata/build_dataset/with_metadata.py | 125 ++++++++++++++++++ .../SLURM/init_experiment/load_dataset.py | 3 + 2 files changed, 128 insertions(+) create mode 100644 bsmetadata/build_dataset/with_metadata.py diff --git a/bsmetadata/build_dataset/with_metadata.py b/bsmetadata/build_dataset/with_metadata.py new file mode 100644 index 00000000..d2351f79 --- /dev/null +++ b/bsmetadata/build_dataset/with_metadata.py @@ -0,0 +1,125 @@ +import functools +import logging + +from datasets import load_dataset +from torch.utils.data import DataLoader +from transformers import default_data_collator + +from bsmetadata.metadata_utils import add_metadata_and_chunk_examples + + +logger = logging.getLogger(__name__) + + +def build_dataset(tokenizer, args): + """ + Args: + tokenizer: a huggingface/transformers tokenizer + args: a DataConfig + Returns: + a training dataloader and one or more validation dataloaders + validation dataloaders should be in a dictionary + each dataloader should yield {str: torch.Tensor(cpu) } + dictionary keys may have 'metadata_mask' + other fields will be passed to model + note: metadata_mask should be padded + Example: + train_dataloader, val_dataloaders = get_dataloaders(...) + for batch in train_dataloader: + metadata_mask = batch.get('metadata_mask', None) + outputs = model(**batch) + metrics = loss_fn(batch, outputs, metadata_mask) + """ + # Mostly copy/paste from https://github.com/huggingface/transformers/blob/master/examples/pytorch/language-modeling/run_clm_no_trainer.py + # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) + # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ + # (the dataset will be downloaded automatically from the datasets Hub). + # + # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called + # 'text' is found. You can easily tweak this behavior (see below). + # + # In distributed training, the load_dataset function guarantees that only one local process can concurrently + # download the dataset. + data_files = {} + if args.train_file is not None: + data_files["train"] = args.train_file + if args.validation_file is not None: + data_files["validation"] = args.validation_file + + if args.dataset_name is not None: + # Downloading and loading a dataset from the hub. 
+ raw_datasets = load_dataset( + args.dataset_name, + args.dataset_config_name, + data_files=data_files, + cache_dir=args.cache_dir, + keep_in_memory=False, + ) + + if "validation" not in raw_datasets.keys(): + raw_datasets["validation"] = load_dataset( + args.dataset_name, + args.dataset_config_name, + split=f"train[:{args.validation_split_percentage}%]", + cache_dir=args.cache_dir, + ) + raw_datasets["train"] = load_dataset( + args.dataset_name, + args.dataset_config_name, + split=f"train[{args.validation_split_percentage}%:]", + cache_dir=args.cache_dir, + ) + else: + extension = args.train_file.split(".")[-1] if not args.extension else args.extension + if extension == "txt": + raise ValueError( + "You have entered a text file for the train data, but this type of file cannot contain metadata " + "columns. Wouldn't you rather have a file in json/jsonl or pandas format?" + ) + if extension == "jsonl": + extension = "json" + raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=args.cache_dir) + + if "validation" not in raw_datasets.keys(): + raw_datasets["validation"] = load_dataset( + extension, + data_files=data_files, + split=f"train[:{args.validation_split_percentage}%]", + cache_dir=args.cache_dir, + ) + raw_datasets["train"] = load_dataset( + extension, + data_files=data_files, + split=f"train[{args.validation_split_percentage}%:]", + cache_dir=args.cache_dir, + ) + # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at + # https://huggingface.co/docs/datasets/loading_datasets.html. + + # Preprocessing the datasets. + column_names = raw_datasets["train"].column_names + + # First we pre-process our text and metadata + lm_datasets = raw_datasets.map( + functools.partial(add_metadata_and_chunk_examples, tokenizer=tokenizer, cfg=args), + batched=True, + num_proc=args.preprocessing_num_workers, + load_from_cache_file=not args.overwrite_cache, + desc="Pre-process the text and metadata to create new samples", + remove_columns=column_names, + ) + + def create_labels_column(examples): + examples["labels"] = examples["input_ids"].copy() + return examples + + # Then we add the column containing the labels + lm_datasets = lm_datasets.map( + create_labels_column, + batched=True, + num_proc=args.preprocessing_num_workers, + load_from_cache_file=not args.overwrite_cache, + desc="Create labels column", + ) + + lm_datasets.save_to_disk(args.dataset_saving_dir) diff --git a/experiments/html/SLURM/init_experiment/load_dataset.py b/experiments/html/SLURM/init_experiment/load_dataset.py index fe97171d..31590594 100644 --- a/experiments/html/SLURM/init_experiment/load_dataset.py +++ b/experiments/html/SLURM/init_experiment/load_dataset.py @@ -1,3 +1,4 @@ +import logging import sys import hydra @@ -7,6 +8,7 @@ from bsmetadata.input_pipeline import DataConfig from bsmetadata.train import show_help +logger = logging.getLogger(__name__) cs = ConfigStore.instance() cs.store(name="data_config", node=DataConfig) @@ -21,6 +23,7 @@ def main(args: DataConfig) -> None: data_files["validation"] = args.validation_file if args.dataset_name is not None: + logger.info("Downloading and loading a dataset from the hub") # Downloading and loading a dataset from the hub. 
raw_datasets = load_dataset( args.dataset_name, From 2a8d1d774afd564e954113b80abba6fa334d575d Mon Sep 17 00:00:00 2001 From: SaulLu Date: Tue, 31 Aug 2021 19:22:09 +0200 Subject: [PATCH 40/84] add log --- bsmetadata/experiments/with_metadata.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bsmetadata/experiments/with_metadata.py b/bsmetadata/experiments/with_metadata.py index 5ad1679c..9e1d086b 100644 --- a/bsmetadata/experiments/with_metadata.py +++ b/bsmetadata/experiments/with_metadata.py @@ -49,6 +49,7 @@ def get_dataloaders(tokenizer, args): logger.info("Start to load dataset") logger.warning("Start to load dataset") if args.dataset_name is not None: + logger.info("Downloading and loading a dataset from the hub") # Downloading and loading a dataset from the hub. raw_datasets = load_dataset( args.dataset_name, @@ -72,6 +73,7 @@ def get_dataloaders(tokenizer, args): cache_dir=args.cache_dir, ) else: + logger.info("Loading dataset from extension script") extension = args.train_file.split(".")[-1] if not args.extension else args.extension if extension == "txt": raise ValueError( From 9e6890322e948af229ac06fd23f35e67f81cf855 Mon Sep 17 00:00:00 2001 From: SaulLu Date: Tue, 31 Aug 2021 19:29:22 +0200 Subject: [PATCH 41/84] remove dataset offline --- experiments/html/SLURM/init_experiment/create_dataset.slurm | 1 - 1 file changed, 1 deletion(-) diff --git a/experiments/html/SLURM/init_experiment/create_dataset.slurm b/experiments/html/SLURM/init_experiment/create_dataset.slurm index 9d36f76b..28bd0afa 100644 --- a/experiments/html/SLURM/init_experiment/create_dataset.slurm +++ b/experiments/html/SLURM/init_experiment/create_dataset.slurm @@ -13,7 +13,6 @@ set -x -e source $HOME/start-user -export HF_DATASETS_OFFLINE=1 export TRANSFORMERS_OFFLINE=1 cd $WORK/repos/metadata/ From 090188f9208b0e23207e627862b5495846f0ecdf Mon Sep 17 00:00:00 2001 From: SaulLu Date: Tue, 31 Aug 2021 19:41:27 +0200 Subject: [PATCH 42/84] add even more logs --- bsmetadata/experiments/with_metadata.py | 8 +++++++- experiments/html/SLURM/init_experiment/load_dataset.py | 6 +++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/bsmetadata/experiments/with_metadata.py b/bsmetadata/experiments/with_metadata.py index 9e1d086b..48d6b37c 100644 --- a/bsmetadata/experiments/with_metadata.py +++ b/bsmetadata/experiments/with_metadata.py @@ -49,7 +49,10 @@ def get_dataloaders(tokenizer, args): logger.info("Start to load dataset") logger.warning("Start to load dataset") if args.dataset_name is not None: - logger.info("Downloading and loading a dataset from the hub") + logger.info( + "Downloading and loading a dataset from the hub" + f"{args.dataset_name}, {args.dataset_config_name}, data_files={data_files}, cache_dir={args.cache_dir}," + ) # Downloading and loading a dataset from the hub. 
raw_datasets = load_dataset( args.dataset_name, @@ -60,6 +63,9 @@ def get_dataloaders(tokenizer, args): ) if "validation" not in raw_datasets.keys(): + logger.info( + "validation not in raw_datasets.keys()" + ) raw_datasets["validation"] = load_dataset( args.dataset_name, args.dataset_config_name, diff --git a/experiments/html/SLURM/init_experiment/load_dataset.py b/experiments/html/SLURM/init_experiment/load_dataset.py index 31590594..1b7f4209 100644 --- a/experiments/html/SLURM/init_experiment/load_dataset.py +++ b/experiments/html/SLURM/init_experiment/load_dataset.py @@ -23,7 +23,10 @@ def main(args: DataConfig) -> None: data_files["validation"] = args.validation_file if args.dataset_name is not None: - logger.info("Downloading and loading a dataset from the hub") + logger.info( + "Downloading and loading a dataset from the hub" + f"{args.dataset_name}, {args.dataset_config_name}, data_files={data_files}, cache_dir={args.cache_dir}," + ) # Downloading and loading a dataset from the hub. raw_datasets = load_dataset( args.dataset_name, @@ -34,6 +37,7 @@ def main(args: DataConfig) -> None: ) if "validation" not in raw_datasets.keys(): + logger.info("validation not in raw_datasets.keys()") raw_datasets["validation"] = load_dataset( args.dataset_name, args.dataset_config_name, From ca8ba98e7e8aae012a23ea761a666267d8c99eb0 Mon Sep 17 00:00:00 2001 From: SaulLu Date: Tue, 31 Aug 2021 20:01:16 +0200 Subject: [PATCH 43/84] add log cache dir --- bsmetadata/experiments/with_metadata.py | 2 ++ experiments/html/SLURM/init_experiment/load_dataset.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/bsmetadata/experiments/with_metadata.py b/bsmetadata/experiments/with_metadata.py index 48d6b37c..2721c51d 100644 --- a/bsmetadata/experiments/with_metadata.py +++ b/bsmetadata/experiments/with_metadata.py @@ -1,5 +1,6 @@ import functools import logging +from datasets import config from datasets import load_dataset from torch.utils.data import DataLoader @@ -48,6 +49,7 @@ def get_dataloaders(tokenizer, args): logger.info("Start to load dataset") logger.warning("Start to load dataset") + logger.info(config.HF_DATASETS_CACHE) if args.dataset_name is not None: logger.info( "Downloading and loading a dataset from the hub" diff --git a/experiments/html/SLURM/init_experiment/load_dataset.py b/experiments/html/SLURM/init_experiment/load_dataset.py index 1b7f4209..36f557e2 100644 --- a/experiments/html/SLURM/init_experiment/load_dataset.py +++ b/experiments/html/SLURM/init_experiment/load_dataset.py @@ -1,5 +1,6 @@ import logging import sys +from datasets import config import hydra from datasets import load_dataset @@ -22,6 +23,7 @@ def main(args: DataConfig) -> None: if args.validation_file is not None: data_files["validation"] = args.validation_file + logger.info(config.HF_DATASETS_CACHE) if args.dataset_name is not None: logger.info( "Downloading and loading a dataset from the hub" From a633377209eb268a8290411feb4b3f03bd39e8d6 Mon Sep 17 00:00:00 2001 From: SaulLu Date: Tue, 31 Aug 2021 20:15:08 +0200 Subject: [PATCH 44/84] test dataset avec squad --- .../SLURM/squad_test/create_dataset.slurm | 28 +++++++++++++++++++ .../html/SLURM/squad_test/load_dataset.slurm | 21 ++++++++++++++ 2 files changed, 49 insertions(+) create mode 100644 experiments/html/SLURM/squad_test/create_dataset.slurm create mode 100644 experiments/html/SLURM/squad_test/load_dataset.slurm diff --git a/experiments/html/SLURM/squad_test/create_dataset.slurm b/experiments/html/SLURM/squad_test/create_dataset.slurm new file mode 100644 index 
00000000..7ae66624 --- /dev/null +++ b/experiments/html/SLURM/squad_test/create_dataset.slurm @@ -0,0 +1,28 @@ +#!/bin/bash +#SBATCH --job-name=modelling-metadata-html-create-dataset-test # job name +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! +#SBATCH --cpus-per-task=8 # number of cores per tasks +#SBATCH --hint=nomultithread # we get physical cores not logical +#SBATCH --time 00:10:00 # maximum execution time (HH:MM:SS) +#SBATCH --output=%x-%j.out # output file name +#SBATCH --error=%x-%j.err # error file name +#SBATCH --account=six@cpu # account + +set -x -e + +source $HOME/start-user + +export TRANSFORMERS_OFFLINE=1 + +cd $WORK/repos/metadata/ + +python experiments/html/start_training.py \ +data_config.experiment="with_metadata" \ +data_config.metadata_list=["html"] \ +data_config.max_seq_len=1024 \ +data_config.dataset_name="squad" \ +data_config.extension="json" \ +data_config.preprocessing_num_workers=8 \ +do_train=False \ +do_eval=False \ \ No newline at end of file diff --git a/experiments/html/SLURM/squad_test/load_dataset.slurm b/experiments/html/SLURM/squad_test/load_dataset.slurm new file mode 100644 index 00000000..7cf9c725 --- /dev/null +++ b/experiments/html/SLURM/squad_test/load_dataset.slurm @@ -0,0 +1,21 @@ +#!/bin/bash +#SBATCH --job-name=modelling-metadata-html-download-dataset-test # job name +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! +#SBATCH --cpus-per-task=8 # number of cores per tasks +#SBATCH --hint=nomultithread # we get physical cores not logical +#SBATCH --gres=gpu:0 # number of gpus +#SBATCH --time 00:02:00 # maximum execution time (HH:MM:SS) +#SBATCH --output=%x-%j.out # output file name +#SBATCH --error=%x-%j.err # error file name +#SBATCH --account=six@gpu # account +#SBATCH -p compil # partition with internet + +set -x -e + +source $HOME/start-user + +cd $WORK/repos/metadata/ + +python experiments/html/SLURM/init_experiment/load_dataset.py \ +dataset_name="squad" \ \ No newline at end of file From eb3c0796bae22d9bc6f58f10fe3cb6a13c860830 Mon Sep 17 00:00:00 2001 From: SaulLu Date: Wed, 1 Sep 2021 09:39:09 +0200 Subject: [PATCH 45/84] data_files is None if empty --- bsmetadata/experiments/with_metadata.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/bsmetadata/experiments/with_metadata.py b/bsmetadata/experiments/with_metadata.py index 2721c51d..77033467 100644 --- a/bsmetadata/experiments/with_metadata.py +++ b/bsmetadata/experiments/with_metadata.py @@ -46,6 +46,9 @@ def get_dataloaders(tokenizer, args): data_files["train"] = args.train_file if args.validation_file is not None: data_files["validation"] = args.validation_file + + if not data_files: + data_files = None logger.info("Start to load dataset") logger.warning("Start to load dataset") From a3d877cab17d9334489dad6d134ad3d3f5525495 Mon Sep 17 00:00:00 2001 From: SaulLu Date: Wed, 1 Sep 2021 09:42:00 +0200 Subject: [PATCH 46/84] data_files is None if empty --- experiments/html/SLURM/init_experiment/load_dataset.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/experiments/html/SLURM/init_experiment/load_dataset.py b/experiments/html/SLURM/init_experiment/load_dataset.py index 36f557e2..1a8c65de 100644 --- a/experiments/html/SLURM/init_experiment/load_dataset.py +++ b/experiments/html/SLURM/init_experiment/load_dataset.py @@ -23,6 +23,9 @@ def main(args: DataConfig) -> None: if args.validation_file is not None: data_files["validation"] = args.validation_file + if not data_files: + data_files = 
None + logger.info(config.HF_DATASETS_CACHE) if args.dataset_name is not None: logger.info( From 07b44d208cd1401369236a4b693aa290e3f267f5 Mon Sep 17 00:00:00 2001 From: SaulLu Date: Wed, 1 Sep 2021 09:44:18 +0200 Subject: [PATCH 47/84] replace squad by crime_and_punish --- .../create_dataset.slurm | 2 +- .../crime_and_punish_test/do_training.slurm | 4 + .../crime_and_punish_test/load_dataset.py | 88 +++++++++++++++++++ .../load_dataset.slurm | 2 +- 4 files changed, 94 insertions(+), 2 deletions(-) rename experiments/html/SLURM/{squad_test => crime_and_punish_test}/create_dataset.slurm (95%) create mode 100644 experiments/html/SLURM/crime_and_punish_test/do_training.slurm create mode 100644 experiments/html/SLURM/crime_and_punish_test/load_dataset.py rename experiments/html/SLURM/{squad_test => crime_and_punish_test}/load_dataset.slurm (96%) diff --git a/experiments/html/SLURM/squad_test/create_dataset.slurm b/experiments/html/SLURM/crime_and_punish_test/create_dataset.slurm similarity index 95% rename from experiments/html/SLURM/squad_test/create_dataset.slurm rename to experiments/html/SLURM/crime_and_punish_test/create_dataset.slurm index 7ae66624..c51058be 100644 --- a/experiments/html/SLURM/squad_test/create_dataset.slurm +++ b/experiments/html/SLURM/crime_and_punish_test/create_dataset.slurm @@ -21,7 +21,7 @@ python experiments/html/start_training.py \ data_config.experiment="with_metadata" \ data_config.metadata_list=["html"] \ data_config.max_seq_len=1024 \ -data_config.dataset_name="squad" \ +data_config.dataset_name="crime_and_punish" \ data_config.extension="json" \ data_config.preprocessing_num_workers=8 \ do_train=False \ diff --git a/experiments/html/SLURM/crime_and_punish_test/do_training.slurm b/experiments/html/SLURM/crime_and_punish_test/do_training.slurm new file mode 100644 index 00000000..bc925e22 --- /dev/null +++ b/experiments/html/SLURM/crime_and_punish_test/do_training.slurm @@ -0,0 +1,4 @@ + +export HF_DATASETS_OFFLINE=1 +export TRANSFORMERS_OFFLINE=1 +export WANDB_MODE=offline \ No newline at end of file diff --git a/experiments/html/SLURM/crime_and_punish_test/load_dataset.py b/experiments/html/SLURM/crime_and_punish_test/load_dataset.py new file mode 100644 index 00000000..1a8c65de --- /dev/null +++ b/experiments/html/SLURM/crime_and_punish_test/load_dataset.py @@ -0,0 +1,88 @@ +import logging +import sys +from datasets import config + +import hydra +from datasets import load_dataset +from hydra.core.config_store import ConfigStore + +from bsmetadata.input_pipeline import DataConfig +from bsmetadata.train import show_help + +logger = logging.getLogger(__name__) + +cs = ConfigStore.instance() +cs.store(name="data_config", node=DataConfig) + + +@hydra.main(config_name="data_config") +def main(args: DataConfig) -> None: + data_files = {} + if args.train_file is not None: + data_files["train"] = args.train_file + if args.validation_file is not None: + data_files["validation"] = args.validation_file + + if not data_files: + data_files = None + + logger.info(config.HF_DATASETS_CACHE) + if args.dataset_name is not None: + logger.info( + "Downloading and loading a dataset from the hub" + f"{args.dataset_name}, {args.dataset_config_name}, data_files={data_files}, cache_dir={args.cache_dir}," + ) + # Downloading and loading a dataset from the hub. 
+ raw_datasets = load_dataset( + args.dataset_name, + args.dataset_config_name, + data_files=data_files, + cache_dir=args.cache_dir, + keep_in_memory=False, + ) + + if "validation" not in raw_datasets.keys(): + logger.info("validation not in raw_datasets.keys()") + raw_datasets["validation"] = load_dataset( + args.dataset_name, + args.dataset_config_name, + split=f"train[:{args.validation_split_percentage}%]", + cache_dir=args.cache_dir, + ) + raw_datasets["train"] = load_dataset( + args.dataset_name, + args.dataset_config_name, + split=f"train[{args.validation_split_percentage}%:]", + cache_dir=args.cache_dir, + ) + else: + extension = args.train_file.split(".")[-1] if not args.extension else args.extension + if extension == "txt": + raise ValueError( + "You have entered a text file for the train data, but this type of file cannot contain metadata " + "columns. Wouldn't you rather have a file in json/jsonl or pandas format?" + ) + if extension == "jsonl": + extension = "json" + raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=args.cache_dir) + + if "validation" not in raw_datasets.keys(): + raw_datasets["validation"] = load_dataset( + extension, + data_files=data_files, + split=f"train[:{args.validation_split_percentage}%]", + cache_dir=args.cache_dir, + ) + raw_datasets["train"] = load_dataset( + extension, + data_files=data_files, + split=f"train[{args.validation_split_percentage}%:]", + cache_dir=args.cache_dir, + ) + + +if __name__ == "__main__": + if "--help" in sys.argv or "-h" in sys.argv: + show_help() + sys.exit() + main() diff --git a/experiments/html/SLURM/squad_test/load_dataset.slurm b/experiments/html/SLURM/crime_and_punish_test/load_dataset.slurm similarity index 96% rename from experiments/html/SLURM/squad_test/load_dataset.slurm rename to experiments/html/SLURM/crime_and_punish_test/load_dataset.slurm index 7cf9c725..ecf3139b 100644 --- a/experiments/html/SLURM/squad_test/load_dataset.slurm +++ b/experiments/html/SLURM/crime_and_punish_test/load_dataset.slurm @@ -18,4 +18,4 @@ source $HOME/start-user cd $WORK/repos/metadata/ python experiments/html/SLURM/init_experiment/load_dataset.py \ -dataset_name="squad" \ \ No newline at end of file +dataset_name="crime_and_punish" \ \ No newline at end of file From 35dbe235a1d5893570ec2c6e62f0bdcd7ca9bc35 Mon Sep 17 00:00:00 2001 From: SaulLu Date: Wed, 1 Sep 2021 10:10:48 +0200 Subject: [PATCH 48/84] create local repo --- experiments/html/SLURM/init_experiment/create_dataset.slurm | 3 ++- experiments/html/SLURM/init_experiment/load_dataset.slurm | 6 +++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/experiments/html/SLURM/init_experiment/create_dataset.slurm b/experiments/html/SLURM/init_experiment/create_dataset.slurm index 28bd0afa..a4d7df41 100644 --- a/experiments/html/SLURM/init_experiment/create_dataset.slurm +++ b/experiments/html/SLURM/init_experiment/create_dataset.slurm @@ -13,6 +13,7 @@ set -x -e source $HOME/start-user +export HF_DATASETS_OFFLINE=1 export TRANSFORMERS_OFFLINE=1 cd $WORK/repos/metadata/ @@ -21,7 +22,7 @@ python experiments/html/start_training.py \ data_config.experiment="with_metadata" \ data_config.metadata_list=["html"] \ data_config.max_seq_len=1024 \ -data_config.dataset_name="SaulLu/Natural_Questions_HTML_Toy" \ +data_config.dataset_name="${DATASETS_CUSTOM}/SaulLu/Natural_Questions_HTML_Toy" \ data_config.train_file="nq-train-*.jsonl.gz" \ data_config.validation_file="nq-dev-*.jsonl.gz" \ data_config.extension="json" \ diff --git 
a/experiments/html/SLURM/init_experiment/load_dataset.slurm b/experiments/html/SLURM/init_experiment/load_dataset.slurm index e8c36d4d..0336f7d8 100644 --- a/experiments/html/SLURM/init_experiment/load_dataset.slurm +++ b/experiments/html/SLURM/init_experiment/load_dataset.slurm @@ -15,9 +15,13 @@ set -x -e source $HOME/start-user +# Uncomment if the repo doesn't exist +# cd $DATASETS_CUSTOM/ +# git clone https://huggingface.co/datasets/SaulLu/Natural_Questions_HTML_Toy + cd $WORK/repos/metadata/ python experiments/html/SLURM/init_experiment/load_dataset.py \ -dataset_name="SaulLu/Natural_Questions_HTML_Toy" \ +dataset_name="${DATASETS_CUSTOM}/SaulLu/Natural_Questions_HTML_Toy" \ train_file="nq-train-*.jsonl.gz" \ validation_file="nq-dev-*.jsonl.gz" \ No newline at end of file From 59178efbc3cf754d8521474e28d726d731ef13ad Mon Sep 17 00:00:00 2001 From: SaulLu Date: Wed, 1 Sep 2021 10:13:53 +0200 Subject: [PATCH 49/84] fix repo name --- experiments/html/SLURM/init_experiment/create_dataset.slurm | 2 +- experiments/html/SLURM/init_experiment/load_dataset.slurm | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/experiments/html/SLURM/init_experiment/create_dataset.slurm b/experiments/html/SLURM/init_experiment/create_dataset.slurm index a4d7df41..ba75916d 100644 --- a/experiments/html/SLURM/init_experiment/create_dataset.slurm +++ b/experiments/html/SLURM/init_experiment/create_dataset.slurm @@ -22,7 +22,7 @@ python experiments/html/start_training.py \ data_config.experiment="with_metadata" \ data_config.metadata_list=["html"] \ data_config.max_seq_len=1024 \ -data_config.dataset_name="${DATASETS_CUSTOM}/SaulLu/Natural_Questions_HTML_Toy" \ +data_config.dataset_name="${DATASETS_CUSTOM}/Natural_Questions_HTML_Toy" \ data_config.train_file="nq-train-*.jsonl.gz" \ data_config.validation_file="nq-dev-*.jsonl.gz" \ data_config.extension="json" \ diff --git a/experiments/html/SLURM/init_experiment/load_dataset.slurm b/experiments/html/SLURM/init_experiment/load_dataset.slurm index 0336f7d8..b891d8a5 100644 --- a/experiments/html/SLURM/init_experiment/load_dataset.slurm +++ b/experiments/html/SLURM/init_experiment/load_dataset.slurm @@ -22,6 +22,6 @@ source $HOME/start-user cd $WORK/repos/metadata/ python experiments/html/SLURM/init_experiment/load_dataset.py \ -dataset_name="${DATASETS_CUSTOM}/SaulLu/Natural_Questions_HTML_Toy" \ +dataset_name="${DATASETS_CUSTOM}/Natural_Questions_HTML_Toy" \ train_file="nq-train-*.jsonl.gz" \ validation_file="nq-dev-*.jsonl.gz" \ No newline at end of file From 2bdff39647edc7d119e3b6da7b7cc6cfedb5b45f Mon Sep 17 00:00:00 2001 From: SaulLu Date: Wed, 1 Sep 2021 11:31:04 +0200 Subject: [PATCH 50/84] add do_training --- .../crime_and_punish_test/do_training.slurm | 4 --- .../SLURM/init_experiment/do_training.slurm | 32 ++++++++++++++++++- 2 files changed, 31 insertions(+), 5 deletions(-) delete mode 100644 experiments/html/SLURM/crime_and_punish_test/do_training.slurm diff --git a/experiments/html/SLURM/crime_and_punish_test/do_training.slurm b/experiments/html/SLURM/crime_and_punish_test/do_training.slurm deleted file mode 100644 index bc925e22..00000000 --- a/experiments/html/SLURM/crime_and_punish_test/do_training.slurm +++ /dev/null @@ -1,4 +0,0 @@ - -export HF_DATASETS_OFFLINE=1 -export TRANSFORMERS_OFFLINE=1 -export WANDB_MODE=offline \ No newline at end of file diff --git a/experiments/html/SLURM/init_experiment/do_training.slurm b/experiments/html/SLURM/init_experiment/do_training.slurm index bc925e22..fe055518 100644 --- 
a/experiments/html/SLURM/init_experiment/do_training.slurm +++ b/experiments/html/SLURM/init_experiment/do_training.slurm @@ -1,4 +1,34 @@ +#!/bin/bash +#SBATCH --job-name=modelling-metadata-html-do-train-test # job name +#SBATCH --ntasks=1 # number of MP tasks +#SBATCH --constraint=v100-16g +#SBATCH --gres=gpu:1 # number of GPUs per node +#SBATCH --cpus-per-task=8 # number of cores per tasks +#SBATCH --hint=nomultithread # we get physical cores not logical +#SBATCH --time 00:10:00 # maximum execution time (HH:MM:SS) +#SBATCH --output=%x-%j.out # output file name +#SBATCH --error=%x-%j.err # error file name +#SBATCH --account=six@gpu # account + +set -x -e + +source $HOME/start-user export HF_DATASETS_OFFLINE=1 export TRANSFORMERS_OFFLINE=1 -export WANDB_MODE=offline \ No newline at end of file +# be careful about the cache folder for Wandb +export WANDB_MODE=offline + +cd $WORK/repos/metadata/ + +python experiments/html/start_training.py \ +data_config.experiment="with_metadata" \ +data_config.metadata_list=["html"] \ +data_config.max_seq_len=1024 \ +data_config.dataset_name="${DATASETS_CUSTOM}/Natural_Questions_HTML_Toy" \ +data_config.train_file="nq-train-*.jsonl.gz" \ +data_config.validation_file="nq-dev-*.jsonl.gz" \ +data_config.extension="json" \ +data_config.preprocessing_num_workers=6 \ +do_train=True \ +do_eval=True \ \ No newline at end of file From 6c62c27511c130bc797d6cc27cecf5401386f836 Mon Sep 17 00:00:00 2001 From: SaulLu Date: Wed, 1 Sep 2021 11:31:28 +0200 Subject: [PATCH 51/84] add lines for repo init --- experiments/html/SLURM/init_experiment/load_dataset.slurm | 3 +++ 1 file changed, 3 insertions(+) diff --git a/experiments/html/SLURM/init_experiment/load_dataset.slurm b/experiments/html/SLURM/init_experiment/load_dataset.slurm index b891d8a5..12d1a125 100644 --- a/experiments/html/SLURM/init_experiment/load_dataset.slurm +++ b/experiments/html/SLURM/init_experiment/load_dataset.slurm @@ -18,6 +18,9 @@ source $HOME/start-user # Uncomment if the repo doesn't exist # cd $DATASETS_CUSTOM/ # git clone https://huggingface.co/datasets/SaulLu/Natural_Questions_HTML_Toy +# cd Natural_Questions_HTML_Toy/ +# git lfs install +# git lfs pull origin master cd $WORK/repos/metadata/ From 60e0e02cf9418d9f5b222b679ddc23e2a040b781 Mon Sep 17 00:00:00 2001 From: SaulLu Date: Wed, 1 Sep 2021 14:09:28 +0200 Subject: [PATCH 52/84] add experiment 1 --- bsmetadata/experiments/with_metadata.py | 8 +- bsmetadata/train.py | 4 +- experiments/html/SLURM/experiment_1/README.md | 9 ++ .../SLURM/experiment_1/create_dataset.slurm | 31 +++++++ .../html/SLURM/experiment_1/do_training.slurm | 35 ++++++++ .../html/SLURM/experiment_1/load_dataset.py | 88 +++++++++++++++++++ .../SLURM/experiment_1/load_dataset.slurm | 30 +++++++ .../SLURM/init_experiment/do_training.slurm | 1 + 8 files changed, 199 insertions(+), 7 deletions(-) create mode 100644 experiments/html/SLURM/experiment_1/README.md create mode 100644 experiments/html/SLURM/experiment_1/create_dataset.slurm create mode 100644 experiments/html/SLURM/experiment_1/do_training.slurm create mode 100644 experiments/html/SLURM/experiment_1/load_dataset.py create mode 100644 experiments/html/SLURM/experiment_1/load_dataset.slurm diff --git a/bsmetadata/experiments/with_metadata.py b/bsmetadata/experiments/with_metadata.py index 77033467..34fb09be 100644 --- a/bsmetadata/experiments/with_metadata.py +++ b/bsmetadata/experiments/with_metadata.py @@ -51,12 +51,11 @@ def get_dataloaders(tokenizer, args): data_files = None logger.info("Start to load dataset") - 
logger.warning("Start to load dataset") logger.info(config.HF_DATASETS_CACHE) if args.dataset_name is not None: logger.info( - "Downloading and loading a dataset from the hub" - f"{args.dataset_name}, {args.dataset_config_name}, data_files={data_files}, cache_dir={args.cache_dir}," + "Downloading and loading with arguments: " + f"dataset_name={args.dataset_name}, dataset_config_name={args.dataset_config_name}, data_files={data_files}, cache_dir={args.cache_dir}," ) # Downloading and loading a dataset from the hub. raw_datasets = load_dataset( @@ -68,9 +67,6 @@ def get_dataloaders(tokenizer, args): ) if "validation" not in raw_datasets.keys(): - logger.info( - "validation not in raw_datasets.keys()" - ) raw_datasets["validation"] = load_dataset( args.dataset_name, args.dataset_config_name, diff --git a/bsmetadata/train.py b/bsmetadata/train.py index 3f22de84..6c8ba286 100644 --- a/bsmetadata/train.py +++ b/bsmetadata/train.py @@ -140,6 +140,7 @@ def main(args: CFG) -> None: logger.info("The dataloaders have been build") # get model + logger.info("Load model") model = AutoModelForCausalLM.from_pretrained(args.model_name) # Optimizer @@ -200,9 +201,9 @@ def evaluate(eval_dataloader): model.train() return {"perplexity": perplexity} + logger.info("***** Start training *****") if args.do_train: # Train! - logger.info("***** Running training *****") progress_bar = tqdm(range(args.max_train_steps), desc="training") completed_steps = 0 logger_metrics = Logger(is_local_main_process, project=args.project_name, config=args) @@ -254,6 +255,7 @@ def evaluate(eval_dataloader): break logger_metrics.close() + logger.info("***** Training finished *****") if is_local_main_process and args.out_dir is not None: accelerator.wait_for_everyone() unwrapped_model = accelerator.unwrap_model(model) diff --git a/experiments/html/SLURM/experiment_1/README.md b/experiments/html/SLURM/experiment_1/README.md new file mode 100644 index 00000000..f0732a86 --- /dev/null +++ b/experiments/html/SLURM/experiment_1/README.md @@ -0,0 +1,9 @@ +# Experiment 1 + +## Run experiment on JZ + +1. Download the tokenizer and the model +2. Download the dataset on a partition with internet +3. Preprocess the dataset on a cpu-only partition +4. Run the training on a gpu 16gb partition + diff --git a/experiments/html/SLURM/experiment_1/create_dataset.slurm b/experiments/html/SLURM/experiment_1/create_dataset.slurm new file mode 100644 index 00000000..7c7a6747 --- /dev/null +++ b/experiments/html/SLURM/experiment_1/create_dataset.slurm @@ -0,0 +1,31 @@ +#!/bin/bash +#SBATCH --job-name=modelling-metadata-html-create-dataset-test # job name +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! 
+#SBATCH --cpus-per-task=8 # number of cores per tasks +#SBATCH --hint=nomultithread # we get physical cores not logical +#SBATCH --time 00:10:00 # maximum execution time (HH:MM:SS) +#SBATCH --output=%x-%j.out # output file name +#SBATCH --error=%x-%j.err # error file name +#SBATCH --account=six@cpu # account + +set -x -e + +source $HOME/start-user + +export HF_DATASETS_OFFLINE=1 +export TRANSFORMERS_OFFLINE=1 + +cd $WORK/repos/metadata/ + +python experiments/html/start_training.py \ +data_config.experiment="with_metadata" \ +data_config.metadata_list=["html"] \ +data_config.max_seq_len=1024 \ +data_config.dataset_name="${DATASETS_CUSTOM}/Natural_Questions_HTML" \ +data_config.train_file="nq-train-*.jsonl.gz" \ +data_config.validation_file="nq-dev-*.jsonl.gz" \ +data_config.extension="json" \ +data_config.preprocessing_num_workers=8 \ +do_train=False \ +do_eval=False \ \ No newline at end of file diff --git a/experiments/html/SLURM/experiment_1/do_training.slurm b/experiments/html/SLURM/experiment_1/do_training.slurm new file mode 100644 index 00000000..6579f812 --- /dev/null +++ b/experiments/html/SLURM/experiment_1/do_training.slurm @@ -0,0 +1,35 @@ +#!/bin/bash +#SBATCH --job-name=modelling-metadata-html-do-train-test # job name +#SBATCH --ntasks=1 # number of MP tasks +#SBATCH --constraint=v100-16g +#SBATCH --gres=gpu:1 # number of GPUs per node +#SBATCH --cpus-per-task=8 # number of cores per tasks +#SBATCH --hint=nomultithread # we get physical cores not logical +#SBATCH --time 00:10:00 # maximum execution time (HH:MM:SS) +#SBATCH --output=%x-%j.out # output file name +#SBATCH --error=%x-%j.err # error file name +#SBATCH --account=six@gpu # account + +set -x -e + +source $HOME/start-user + +export HF_DATASETS_OFFLINE=1 +export TRANSFORMERS_OFFLINE=1 +# be careful about the cache folder for Wandb +export WANDB_MODE=offline + +cd $WORK/repos/metadata/ + +python experiments/html/start_training.py \ +data_config.experiment="with_metadata" \ +data_config.metadata_list=["html"] \ +data_config.max_seq_len=1024 \ +data_config.dataset_name="${DATASETS_CUSTOM}/Natural_Questions_HTML" \ +data_config.train_file="nq-train-*.jsonl.gz" \ +data_config.validation_file="nq-dev-*.jsonl.gz" \ +data_config.extension="json" \ +data_config.preprocessing_num_workers=6 \ +out_dir="${SCRATCH}/metadata_outputs" \ +do_train=True \ +do_eval=True \ \ No newline at end of file diff --git a/experiments/html/SLURM/experiment_1/load_dataset.py b/experiments/html/SLURM/experiment_1/load_dataset.py new file mode 100644 index 00000000..1a8c65de --- /dev/null +++ b/experiments/html/SLURM/experiment_1/load_dataset.py @@ -0,0 +1,88 @@ +import logging +import sys +from datasets import config + +import hydra +from datasets import load_dataset +from hydra.core.config_store import ConfigStore + +from bsmetadata.input_pipeline import DataConfig +from bsmetadata.train import show_help + +logger = logging.getLogger(__name__) + +cs = ConfigStore.instance() +cs.store(name="data_config", node=DataConfig) + + +@hydra.main(config_name="data_config") +def main(args: DataConfig) -> None: + data_files = {} + if args.train_file is not None: + data_files["train"] = args.train_file + if args.validation_file is not None: + data_files["validation"] = args.validation_file + + if not data_files: + data_files = None + + logger.info(config.HF_DATASETS_CACHE) + if args.dataset_name is not None: + logger.info( + "Downloading and loading a dataset from the hub" + f"{args.dataset_name}, {args.dataset_config_name}, data_files={data_files}, 
cache_dir={args.cache_dir}," + ) + # Downloading and loading a dataset from the hub. + raw_datasets = load_dataset( + args.dataset_name, + args.dataset_config_name, + data_files=data_files, + cache_dir=args.cache_dir, + keep_in_memory=False, + ) + + if "validation" not in raw_datasets.keys(): + logger.info("validation not in raw_datasets.keys()") + raw_datasets["validation"] = load_dataset( + args.dataset_name, + args.dataset_config_name, + split=f"train[:{args.validation_split_percentage}%]", + cache_dir=args.cache_dir, + ) + raw_datasets["train"] = load_dataset( + args.dataset_name, + args.dataset_config_name, + split=f"train[{args.validation_split_percentage}%:]", + cache_dir=args.cache_dir, + ) + else: + extension = args.train_file.split(".")[-1] if not args.extension else args.extension + if extension == "txt": + raise ValueError( + "You have entered a text file for the train data, but this type of file cannot contain metadata " + "columns. Wouldn't you rather have a file in json/jsonl or pandas format?" + ) + if extension == "jsonl": + extension = "json" + raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=args.cache_dir) + + if "validation" not in raw_datasets.keys(): + raw_datasets["validation"] = load_dataset( + extension, + data_files=data_files, + split=f"train[:{args.validation_split_percentage}%]", + cache_dir=args.cache_dir, + ) + raw_datasets["train"] = load_dataset( + extension, + data_files=data_files, + split=f"train[{args.validation_split_percentage}%:]", + cache_dir=args.cache_dir, + ) + + +if __name__ == "__main__": + if "--help" in sys.argv or "-h" in sys.argv: + show_help() + sys.exit() + main() diff --git a/experiments/html/SLURM/experiment_1/load_dataset.slurm b/experiments/html/SLURM/experiment_1/load_dataset.slurm new file mode 100644 index 00000000..f6d84696 --- /dev/null +++ b/experiments/html/SLURM/experiment_1/load_dataset.slurm @@ -0,0 +1,30 @@ +#!/bin/bash +#SBATCH --job-name=modelling-metadata-html-download-dataset-test # job name +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! 
+#SBATCH --cpus-per-task=8 # number of cores per tasks +#SBATCH --hint=nomultithread # we get physical cores not logical +#SBATCH --gres=gpu:0 # number of gpus +#SBATCH --time 00:02:00 # maximum execution time (HH:MM:SS) +#SBATCH --output=%x-%j.out # output file name +#SBATCH --error=%x-%j.err # error file name +#SBATCH --account=six@gpu # account +#SBATCH -p compil # partition with internet + +set -x -e + +source $HOME/start-user + +# Uncomment if the repo doesn't exist +cd $DATASETS_CUSTOM/ +git clone https://huggingface.co/datasets/SaulLu/Natural_Questions_HTML +cd Natural_Questions_HTML_Toy/ +git lfs install +git lfs pull origin master + +cd $WORK/repos/metadata/ + +python experiments/html/SLURM/init_experiment/load_dataset.py \ +dataset_name="${DATASETS_CUSTOM}/Natural_Questions_HTML" \ +train_file="nq-train-*.jsonl.gz" \ +validation_file="nq-dev-*.jsonl.gz" \ No newline at end of file diff --git a/experiments/html/SLURM/init_experiment/do_training.slurm b/experiments/html/SLURM/init_experiment/do_training.slurm index fe055518..08281d6b 100644 --- a/experiments/html/SLURM/init_experiment/do_training.slurm +++ b/experiments/html/SLURM/init_experiment/do_training.slurm @@ -30,5 +30,6 @@ data_config.train_file="nq-train-*.jsonl.gz" \ data_config.validation_file="nq-dev-*.jsonl.gz" \ data_config.extension="json" \ data_config.preprocessing_num_workers=6 \ +out_dir="${SCRATCH}/metadata_outputs" \ do_train=True \ do_eval=True \ \ No newline at end of file From 5ebc1e752cea08bcec545b821cccd3df41e12c2a Mon Sep 17 00:00:00 2001 From: SaulLu Date: Wed, 1 Sep 2021 14:13:42 +0200 Subject: [PATCH 53/84] add multi_steps script --- experiments/html/SLURM/experiment_1/multi_steps.bash | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 experiments/html/SLURM/experiment_1/multi_steps.bash diff --git a/experiments/html/SLURM/experiment_1/multi_steps.bash b/experiments/html/SLURM/experiment_1/multi_steps.bash new file mode 100644 index 00000000..d9c33272 --- /dev/null +++ b/experiments/html/SLURM/experiment_1/multi_steps.bash @@ -0,0 +1,3 @@ +JID_JOB1=`sbatch load_dataset.slurm | cut -d " " -f 4` +JID_JOB2=`sbatch --dependency=afterok:$JID_JOB1 create_dataset.slurm | cut -d " " -f 4` +sbatch --dependency=afterok:$JID_JOB2 do_training.slurm \ No newline at end of file From 4f737357682cfd06bf47726a6f4b490bc274e568 Mon Sep 17 00:00:00 2001 From: SaulLu Date: Wed, 1 Sep 2021 14:21:26 +0200 Subject: [PATCH 54/84] change time --- .../html/SLURM/experiment_1/create_dataset.slurm | 2 +- .../html/SLURM/experiment_1/do_training.slurm | 2 +- .../html/SLURM/experiment_1/load_dataset.slurm | 12 ++++++------ 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/experiments/html/SLURM/experiment_1/create_dataset.slurm b/experiments/html/SLURM/experiment_1/create_dataset.slurm index 7c7a6747..e372c436 100644 --- a/experiments/html/SLURM/experiment_1/create_dataset.slurm +++ b/experiments/html/SLURM/experiment_1/create_dataset.slurm @@ -4,7 +4,7 @@ #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! 
#SBATCH --cpus-per-task=8 # number of cores per tasks #SBATCH --hint=nomultithread # we get physical cores not logical -#SBATCH --time 00:10:00 # maximum execution time (HH:MM:SS) +#SBATCH --time 01:00:00 # maximum execution time (HH:MM:SS) #SBATCH --output=%x-%j.out # output file name #SBATCH --error=%x-%j.err # error file name #SBATCH --account=six@cpu # account diff --git a/experiments/html/SLURM/experiment_1/do_training.slurm b/experiments/html/SLURM/experiment_1/do_training.slurm index 6579f812..d124e862 100644 --- a/experiments/html/SLURM/experiment_1/do_training.slurm +++ b/experiments/html/SLURM/experiment_1/do_training.slurm @@ -5,7 +5,7 @@ #SBATCH --gres=gpu:1 # number of GPUs per node #SBATCH --cpus-per-task=8 # number of cores per tasks #SBATCH --hint=nomultithread # we get physical cores not logical -#SBATCH --time 00:10:00 # maximum execution time (HH:MM:SS) +#SBATCH --time 04:00:00 # maximum execution time (HH:MM:SS) #SBATCH --output=%x-%j.out # output file name #SBATCH --error=%x-%j.err # error file name #SBATCH --account=six@gpu # account diff --git a/experiments/html/SLURM/experiment_1/load_dataset.slurm b/experiments/html/SLURM/experiment_1/load_dataset.slurm index f6d84696..24a66da6 100644 --- a/experiments/html/SLURM/experiment_1/load_dataset.slurm +++ b/experiments/html/SLURM/experiment_1/load_dataset.slurm @@ -5,7 +5,7 @@ #SBATCH --cpus-per-task=8 # number of cores per tasks #SBATCH --hint=nomultithread # we get physical cores not logical #SBATCH --gres=gpu:0 # number of gpus -#SBATCH --time 00:02:00 # maximum execution time (HH:MM:SS) +#SBATCH --time 00:30:00 # maximum execution time (HH:MM:SS) #SBATCH --output=%x-%j.out # output file name #SBATCH --error=%x-%j.err # error file name #SBATCH --account=six@gpu # account @@ -16,11 +16,11 @@ set -x -e source $HOME/start-user # Uncomment if the repo doesn't exist -cd $DATASETS_CUSTOM/ -git clone https://huggingface.co/datasets/SaulLu/Natural_Questions_HTML -cd Natural_Questions_HTML_Toy/ -git lfs install -git lfs pull origin master +# cd $DATASETS_CUSTOM/ +# git clone https://huggingface.co/datasets/SaulLu/Natural_Questions_HTML +# cd Natural_Questions_HTML_Toy/ +# git lfs install +# git lfs pull origin master cd $WORK/repos/metadata/ From f9f7dfe5aeb3b602271c432db1a21f4855a98233 Mon Sep 17 00:00:00 2001 From: SaulLu Date: Wed, 1 Sep 2021 15:06:29 +0200 Subject: [PATCH 55/84] change time load dataset --- experiments/html/SLURM/experiment_1/load_dataset.slurm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/experiments/html/SLURM/experiment_1/load_dataset.slurm b/experiments/html/SLURM/experiment_1/load_dataset.slurm index 24a66da6..1e36dbad 100644 --- a/experiments/html/SLURM/experiment_1/load_dataset.slurm +++ b/experiments/html/SLURM/experiment_1/load_dataset.slurm @@ -5,7 +5,7 @@ #SBATCH --cpus-per-task=8 # number of cores per tasks #SBATCH --hint=nomultithread # we get physical cores not logical #SBATCH --gres=gpu:0 # number of gpus -#SBATCH --time 00:30:00 # maximum execution time (HH:MM:SS) +#SBATCH --time 02:30:00 # maximum execution time (HH:MM:SS) #SBATCH --output=%x-%j.out # output file name #SBATCH --error=%x-%j.err # error file name #SBATCH --account=six@gpu # account From d563a63465085fd073e7a41941badc6dcb55d233 Mon Sep 17 00:00:00 2001 From: SaulLu Date: Wed, 1 Sep 2021 15:41:14 +0200 Subject: [PATCH 56/84] change time --- experiments/html/SLURM/experiment_1/create_dataset.slurm | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git 
a/experiments/html/SLURM/experiment_1/create_dataset.slurm b/experiments/html/SLURM/experiment_1/create_dataset.slurm index e372c436..6509b359 100644 --- a/experiments/html/SLURM/experiment_1/create_dataset.slurm +++ b/experiments/html/SLURM/experiment_1/create_dataset.slurm @@ -2,9 +2,9 @@ #SBATCH --job-name=modelling-metadata-html-create-dataset-test # job name #SBATCH --nodes=1 #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! -#SBATCH --cpus-per-task=8 # number of cores per tasks +#SBATCH --cpus-per-task=16 # number of cores per tasks #SBATCH --hint=nomultithread # we get physical cores not logical -#SBATCH --time 01:00:00 # maximum execution time (HH:MM:SS) +#SBATCH --time 03:00:00 # maximum execution time (HH:MM:SS) #SBATCH --output=%x-%j.out # output file name #SBATCH --error=%x-%j.err # error file name #SBATCH --account=six@cpu # account @@ -26,6 +26,6 @@ data_config.dataset_name="${DATASETS_CUSTOM}/Natural_Questions_HTML" \ data_config.train_file="nq-train-*.jsonl.gz" \ data_config.validation_file="nq-dev-*.jsonl.gz" \ data_config.extension="json" \ -data_config.preprocessing_num_workers=8 \ +data_config.preprocessing_num_workers=16 \ do_train=False \ do_eval=False \ \ No newline at end of file From 5a94022f20b2a17158b550202de7dfa92345260f Mon Sep 17 00:00:00 2001 From: SaulLu Date: Wed, 1 Sep 2021 15:42:16 +0200 Subject: [PATCH 57/84] change multi_batch --- experiments/html/SLURM/experiment_1/multi_steps.bash | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/experiments/html/SLURM/experiment_1/multi_steps.bash b/experiments/html/SLURM/experiment_1/multi_steps.bash index d9c33272..d6f4a745 100644 --- a/experiments/html/SLURM/experiment_1/multi_steps.bash +++ b/experiments/html/SLURM/experiment_1/multi_steps.bash @@ -1,3 +1,2 @@ -JID_JOB1=`sbatch load_dataset.slurm | cut -d " " -f 4` -JID_JOB2=`sbatch --dependency=afterok:$JID_JOB1 create_dataset.slurm | cut -d " " -f 4` -sbatch --dependency=afterok:$JID_JOB2 do_training.slurm \ No newline at end of file +JID_JOB1=`sbatch create_dataset.slurm | cut -d " " -f 4` +sbatch --dependency=afterok:$JID_JOB1 do_training.slurm \ No newline at end of file From 701f5da2e0315207488d993a88b024fc5bda1c4c Mon Sep 17 00:00:00 2001 From: SaulLu Date: Thu, 2 Sep 2021 11:01:20 +0200 Subject: [PATCH 58/84] remove useless file --- experiments/html/hash_investigate.txt | 21 --------------------- 1 file changed, 21 deletions(-) delete mode 100644 experiments/html/hash_investigate.txt diff --git a/experiments/html/hash_investigate.txt b/experiments/html/hash_investigate.txt deleted file mode 100644 index c504de12..00000000 --- a/experiments/html/hash_investigate.txt +++ /dev/null @@ -1,21 +0,0 @@ -experiment hash key | value: d054d90240b85e16 | 344c3dc4b5cf8646 -per_device_eval_batch_size hash key | value: 67dae3bf9c1036df | d3680cf2f313c8fc -per_device_train_batch_size hash key | value: 530047abd92ccf24 | d3680cf2f313c8fc -metadata_list hash key | value: 4bd839407ac25327 | 1334f528d7dbe450 -metadata_sep hash key | value: 1726fe05238500f6 | 040238c86b64577a -metadata_key_value_sep hash key | value: af649de4f4919728 | 33dd19cdc1300084 -metadata_probability hash key | value: cdcd0263a3874515 | fea5c87c419b5847 -global_metadata_sep hash key | value: 52422b0116a8167e | 9d5790dd15155529 -max_seq_len hash key | value: 09f0965bf157c2bd | 3fd3922da80411e2 -dataset_name hash key | value: 8cdcb241ac2c7029 | 1928a0ada5a73e07 -dataset_config_name hash key | value: b1e8d2394418d360 | 4d8c0405832b0f7e -train_file hash key | 
value: dd1e5e2401c1bcfa | b0123c2d43e2982d -validation_file hash key | value: 93f8998bdf1c060f | faae0565261050bc -overwrite_cache hash key | value: 0053471c52542656 | aef99bec791b0f18 -cache_dir hash key | value: 136582c7d6f5b69d | 4d8c0405832b0f7e -extension hash key | value: fb3553b1bec24680 | f6caa0871f48acbd -preprocessing_num_workers hash key | value: c55b211054a90a87 | 433aeac0a2f51423 -validation_split_percentage hash key | value: c8a6d71d862df501 | 9897c07112dce998 -block_size hash key | value: 81dc640ef3b11f40 | 4d8c0405832b0f7e -html_parser_config hash key | value: 3844a6686d6b07c7 | b507f2f4a7200b74 -"['html']" \ No newline at end of file From ea1219a9e63d3a6a16a4d8dcee802a33a31a5f33 Mon Sep 17 00:00:00 2001 From: SaulLu Date: Thu, 2 Sep 2021 11:19:21 +0200 Subject: [PATCH 59/84] add htlm wieghts --- perso/bart_state_dict.txt | 515 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 515 insertions(+) create mode 100644 perso/bart_state_dict.txt diff --git a/perso/bart_state_dict.txt b/perso/bart_state_dict.txt new file mode 100644 index 00000000..610cd1b1 --- /dev/null +++ b/perso/bart_state_dict.txt @@ -0,0 +1,515 @@ +encoder.version torch.Size([1]) +encoder.embed_tokens.weight torch.Size([50265, 1024]) +encoder.embed_positions.weight torch.Size([1026, 1024]) +encoder.layernorm_embedding.weight torch.Size([1024]) +encoder.layernorm_embedding.bias torch.Size([1024]) +encoder.layers.0.self_attn.k_proj.weight torch.Size([1024, 1024]) +encoder.layers.0.self_attn.k_proj.bias torch.Size([1024]) +encoder.layers.0.self_attn.v_proj.weight torch.Size([1024, 1024]) +encoder.layers.0.self_attn.v_proj.bias torch.Size([1024]) +encoder.layers.0.self_attn.q_proj.weight torch.Size([1024, 1024]) +encoder.layers.0.self_attn.q_proj.bias torch.Size([1024]) +encoder.layers.0.self_attn.out_proj.weight torch.Size([1024, 1024]) +encoder.layers.0.self_attn.out_proj.bias torch.Size([1024]) +encoder.layers.0.self_attn_layer_norm.weight torch.Size([1024]) +encoder.layers.0.self_attn_layer_norm.bias torch.Size([1024]) +encoder.layers.0.fc1.weight torch.Size([4096, 1024]) +encoder.layers.0.fc1.bias torch.Size([4096]) +encoder.layers.0.fc2.weight torch.Size([1024, 4096]) +encoder.layers.0.fc2.bias torch.Size([1024]) +encoder.layers.0.final_layer_norm.weight torch.Size([1024]) +encoder.layers.0.final_layer_norm.bias torch.Size([1024]) +encoder.layers.1.self_attn.k_proj.weight torch.Size([1024, 1024]) +encoder.layers.1.self_attn.k_proj.bias torch.Size([1024]) +encoder.layers.1.self_attn.v_proj.weight torch.Size([1024, 1024]) +encoder.layers.1.self_attn.v_proj.bias torch.Size([1024]) +encoder.layers.1.self_attn.q_proj.weight torch.Size([1024, 1024]) +encoder.layers.1.self_attn.q_proj.bias torch.Size([1024]) +encoder.layers.1.self_attn.out_proj.weight torch.Size([1024, 1024]) +encoder.layers.1.self_attn.out_proj.bias torch.Size([1024]) +encoder.layers.1.self_attn_layer_norm.weight torch.Size([1024]) +encoder.layers.1.self_attn_layer_norm.bias torch.Size([1024]) +encoder.layers.1.fc1.weight torch.Size([4096, 1024]) +encoder.layers.1.fc1.bias torch.Size([4096]) +encoder.layers.1.fc2.weight torch.Size([1024, 4096]) +encoder.layers.1.fc2.bias torch.Size([1024]) +encoder.layers.1.final_layer_norm.weight torch.Size([1024]) +encoder.layers.1.final_layer_norm.bias torch.Size([1024]) +encoder.layers.2.self_attn.k_proj.weight torch.Size([1024, 1024]) +encoder.layers.2.self_attn.k_proj.bias torch.Size([1024]) +encoder.layers.2.self_attn.v_proj.weight torch.Size([1024, 1024]) +encoder.layers.2.self_attn.v_proj.bias 
torch.Size([1024]) +encoder.layers.2.self_attn.q_proj.weight torch.Size([1024, 1024]) +encoder.layers.2.self_attn.q_proj.bias torch.Size([1024]) +encoder.layers.2.self_attn.out_proj.weight torch.Size([1024, 1024]) +encoder.layers.2.self_attn.out_proj.bias torch.Size([1024]) +encoder.layers.2.self_attn_layer_norm.weight torch.Size([1024]) +encoder.layers.2.self_attn_layer_norm.bias torch.Size([1024]) +encoder.layers.2.fc1.weight torch.Size([4096, 1024]) +encoder.layers.2.fc1.bias torch.Size([4096]) +encoder.layers.2.fc2.weight torch.Size([1024, 4096]) +encoder.layers.2.fc2.bias torch.Size([1024]) +encoder.layers.2.final_layer_norm.weight torch.Size([1024]) +encoder.layers.2.final_layer_norm.bias torch.Size([1024]) +encoder.layers.3.self_attn.k_proj.weight torch.Size([1024, 1024]) +encoder.layers.3.self_attn.k_proj.bias torch.Size([1024]) +encoder.layers.3.self_attn.v_proj.weight torch.Size([1024, 1024]) +encoder.layers.3.self_attn.v_proj.bias torch.Size([1024]) +encoder.layers.3.self_attn.q_proj.weight torch.Size([1024, 1024]) +encoder.layers.3.self_attn.q_proj.bias torch.Size([1024]) +encoder.layers.3.self_attn.out_proj.weight torch.Size([1024, 1024]) +encoder.layers.3.self_attn.out_proj.bias torch.Size([1024]) +encoder.layers.3.self_attn_layer_norm.weight torch.Size([1024]) +encoder.layers.3.self_attn_layer_norm.bias torch.Size([1024]) +encoder.layers.3.fc1.weight torch.Size([4096, 1024]) +encoder.layers.3.fc1.bias torch.Size([4096]) +encoder.layers.3.fc2.weight torch.Size([1024, 4096]) +encoder.layers.3.fc2.bias torch.Size([1024]) +encoder.layers.3.final_layer_norm.weight torch.Size([1024]) +encoder.layers.3.final_layer_norm.bias torch.Size([1024]) +encoder.layers.4.self_attn.k_proj.weight torch.Size([1024, 1024]) +encoder.layers.4.self_attn.k_proj.bias torch.Size([1024]) +encoder.layers.4.self_attn.v_proj.weight torch.Size([1024, 1024]) +encoder.layers.4.self_attn.v_proj.bias torch.Size([1024]) +encoder.layers.4.self_attn.q_proj.weight torch.Size([1024, 1024]) +encoder.layers.4.self_attn.q_proj.bias torch.Size([1024]) +encoder.layers.4.self_attn.out_proj.weight torch.Size([1024, 1024]) +encoder.layers.4.self_attn.out_proj.bias torch.Size([1024]) +encoder.layers.4.self_attn_layer_norm.weight torch.Size([1024]) +encoder.layers.4.self_attn_layer_norm.bias torch.Size([1024]) +encoder.layers.4.fc1.weight torch.Size([4096, 1024]) +encoder.layers.4.fc1.bias torch.Size([4096]) +encoder.layers.4.fc2.weight torch.Size([1024, 4096]) +encoder.layers.4.fc2.bias torch.Size([1024]) +encoder.layers.4.final_layer_norm.weight torch.Size([1024]) +encoder.layers.4.final_layer_norm.bias torch.Size([1024]) +encoder.layers.5.self_attn.k_proj.weight torch.Size([1024, 1024]) +encoder.layers.5.self_attn.k_proj.bias torch.Size([1024]) +encoder.layers.5.self_attn.v_proj.weight torch.Size([1024, 1024]) +encoder.layers.5.self_attn.v_proj.bias torch.Size([1024]) +encoder.layers.5.self_attn.q_proj.weight torch.Size([1024, 1024]) +encoder.layers.5.self_attn.q_proj.bias torch.Size([1024]) +encoder.layers.5.self_attn.out_proj.weight torch.Size([1024, 1024]) +encoder.layers.5.self_attn.out_proj.bias torch.Size([1024]) +encoder.layers.5.self_attn_layer_norm.weight torch.Size([1024]) +encoder.layers.5.self_attn_layer_norm.bias torch.Size([1024]) +encoder.layers.5.fc1.weight torch.Size([4096, 1024]) +encoder.layers.5.fc1.bias torch.Size([4096]) +encoder.layers.5.fc2.weight torch.Size([1024, 4096]) +encoder.layers.5.fc2.bias torch.Size([1024]) +encoder.layers.5.final_layer_norm.weight torch.Size([1024]) 
+encoder.layers.5.final_layer_norm.bias torch.Size([1024]) +encoder.layers.6.self_attn.k_proj.weight torch.Size([1024, 1024]) +encoder.layers.6.self_attn.k_proj.bias torch.Size([1024]) +encoder.layers.6.self_attn.v_proj.weight torch.Size([1024, 1024]) +encoder.layers.6.self_attn.v_proj.bias torch.Size([1024]) +encoder.layers.6.self_attn.q_proj.weight torch.Size([1024, 1024]) +encoder.layers.6.self_attn.q_proj.bias torch.Size([1024]) +encoder.layers.6.self_attn.out_proj.weight torch.Size([1024, 1024]) +encoder.layers.6.self_attn.out_proj.bias torch.Size([1024]) +encoder.layers.6.self_attn_layer_norm.weight torch.Size([1024]) +encoder.layers.6.self_attn_layer_norm.bias torch.Size([1024]) +encoder.layers.6.fc1.weight torch.Size([4096, 1024]) +encoder.layers.6.fc1.bias torch.Size([4096]) +encoder.layers.6.fc2.weight torch.Size([1024, 4096]) +encoder.layers.6.fc2.bias torch.Size([1024]) +encoder.layers.6.final_layer_norm.weight torch.Size([1024]) +encoder.layers.6.final_layer_norm.bias torch.Size([1024]) +encoder.layers.7.self_attn.k_proj.weight torch.Size([1024, 1024]) +encoder.layers.7.self_attn.k_proj.bias torch.Size([1024]) +encoder.layers.7.self_attn.v_proj.weight torch.Size([1024, 1024]) +encoder.layers.7.self_attn.v_proj.bias torch.Size([1024]) +encoder.layers.7.self_attn.q_proj.weight torch.Size([1024, 1024]) +encoder.layers.7.self_attn.q_proj.bias torch.Size([1024]) +encoder.layers.7.self_attn.out_proj.weight torch.Size([1024, 1024]) +encoder.layers.7.self_attn.out_proj.bias torch.Size([1024]) +encoder.layers.7.self_attn_layer_norm.weight torch.Size([1024]) +encoder.layers.7.self_attn_layer_norm.bias torch.Size([1024]) +encoder.layers.7.fc1.weight torch.Size([4096, 1024]) +encoder.layers.7.fc1.bias torch.Size([4096]) +encoder.layers.7.fc2.weight torch.Size([1024, 4096]) +encoder.layers.7.fc2.bias torch.Size([1024]) +encoder.layers.7.final_layer_norm.weight torch.Size([1024]) +encoder.layers.7.final_layer_norm.bias torch.Size([1024]) +encoder.layers.8.self_attn.k_proj.weight torch.Size([1024, 1024]) +encoder.layers.8.self_attn.k_proj.bias torch.Size([1024]) +encoder.layers.8.self_attn.v_proj.weight torch.Size([1024, 1024]) +encoder.layers.8.self_attn.v_proj.bias torch.Size([1024]) +encoder.layers.8.self_attn.q_proj.weight torch.Size([1024, 1024]) +encoder.layers.8.self_attn.q_proj.bias torch.Size([1024]) +encoder.layers.8.self_attn.out_proj.weight torch.Size([1024, 1024]) +encoder.layers.8.self_attn.out_proj.bias torch.Size([1024]) +encoder.layers.8.self_attn_layer_norm.weight torch.Size([1024]) +encoder.layers.8.self_attn_layer_norm.bias torch.Size([1024]) +encoder.layers.8.fc1.weight torch.Size([4096, 1024]) +encoder.layers.8.fc1.bias torch.Size([4096]) +encoder.layers.8.fc2.weight torch.Size([1024, 4096]) +encoder.layers.8.fc2.bias torch.Size([1024]) +encoder.layers.8.final_layer_norm.weight torch.Size([1024]) +encoder.layers.8.final_layer_norm.bias torch.Size([1024]) +encoder.layers.9.self_attn.k_proj.weight torch.Size([1024, 1024]) +encoder.layers.9.self_attn.k_proj.bias torch.Size([1024]) +encoder.layers.9.self_attn.v_proj.weight torch.Size([1024, 1024]) +encoder.layers.9.self_attn.v_proj.bias torch.Size([1024]) +encoder.layers.9.self_attn.q_proj.weight torch.Size([1024, 1024]) +encoder.layers.9.self_attn.q_proj.bias torch.Size([1024]) +encoder.layers.9.self_attn.out_proj.weight torch.Size([1024, 1024]) +encoder.layers.9.self_attn.out_proj.bias torch.Size([1024]) +encoder.layers.9.self_attn_layer_norm.weight torch.Size([1024]) +encoder.layers.9.self_attn_layer_norm.bias 
torch.Size([1024]) +encoder.layers.9.fc1.weight torch.Size([4096, 1024]) +encoder.layers.9.fc1.bias torch.Size([4096]) +encoder.layers.9.fc2.weight torch.Size([1024, 4096]) +encoder.layers.9.fc2.bias torch.Size([1024]) +encoder.layers.9.final_layer_norm.weight torch.Size([1024]) +encoder.layers.9.final_layer_norm.bias torch.Size([1024]) +encoder.layers.10.self_attn.k_proj.weight torch.Size([1024, 1024]) +encoder.layers.10.self_attn.k_proj.bias torch.Size([1024]) +encoder.layers.10.self_attn.v_proj.weight torch.Size([1024, 1024]) +encoder.layers.10.self_attn.v_proj.bias torch.Size([1024]) +encoder.layers.10.self_attn.q_proj.weight torch.Size([1024, 1024]) +encoder.layers.10.self_attn.q_proj.bias torch.Size([1024]) +encoder.layers.10.self_attn.out_proj.weight torch.Size([1024, 1024]) +encoder.layers.10.self_attn.out_proj.bias torch.Size([1024]) +encoder.layers.10.self_attn_layer_norm.weight torch.Size([1024]) +encoder.layers.10.self_attn_layer_norm.bias torch.Size([1024]) +encoder.layers.10.fc1.weight torch.Size([4096, 1024]) +encoder.layers.10.fc1.bias torch.Size([4096]) +encoder.layers.10.fc2.weight torch.Size([1024, 4096]) +encoder.layers.10.fc2.bias torch.Size([1024]) +encoder.layers.10.final_layer_norm.weight torch.Size([1024]) +encoder.layers.10.final_layer_norm.bias torch.Size([1024]) +encoder.layers.11.self_attn.k_proj.weight torch.Size([1024, 1024]) +encoder.layers.11.self_attn.k_proj.bias torch.Size([1024]) +encoder.layers.11.self_attn.v_proj.weight torch.Size([1024, 1024]) +encoder.layers.11.self_attn.v_proj.bias torch.Size([1024]) +encoder.layers.11.self_attn.q_proj.weight torch.Size([1024, 1024]) +encoder.layers.11.self_attn.q_proj.bias torch.Size([1024]) +encoder.layers.11.self_attn.out_proj.weight torch.Size([1024, 1024]) +encoder.layers.11.self_attn.out_proj.bias torch.Size([1024]) +encoder.layers.11.self_attn_layer_norm.weight torch.Size([1024]) +encoder.layers.11.self_attn_layer_norm.bias torch.Size([1024]) +encoder.layers.11.fc1.weight torch.Size([4096, 1024]) +encoder.layers.11.fc1.bias torch.Size([4096]) +encoder.layers.11.fc2.weight torch.Size([1024, 4096]) +encoder.layers.11.fc2.bias torch.Size([1024]) +encoder.layers.11.final_layer_norm.weight torch.Size([1024]) +encoder.layers.11.final_layer_norm.bias torch.Size([1024]) +decoder.version torch.Size([1]) +decoder.embed_tokens.weight torch.Size([50265, 1024]) +decoder.embed_positions.weight torch.Size([1026, 1024]) +decoder.layernorm_embedding.weight torch.Size([1024]) +decoder.layernorm_embedding.bias torch.Size([1024]) +decoder.layers.0.self_attn.k_proj.weight torch.Size([1024, 1024]) +decoder.layers.0.self_attn.k_proj.bias torch.Size([1024]) +decoder.layers.0.self_attn.v_proj.weight torch.Size([1024, 1024]) +decoder.layers.0.self_attn.v_proj.bias torch.Size([1024]) +decoder.layers.0.self_attn.q_proj.weight torch.Size([1024, 1024]) +decoder.layers.0.self_attn.q_proj.bias torch.Size([1024]) +decoder.layers.0.self_attn.out_proj.weight torch.Size([1024, 1024]) +decoder.layers.0.self_attn.out_proj.bias torch.Size([1024]) +decoder.layers.0.self_attn_layer_norm.weight torch.Size([1024]) +decoder.layers.0.self_attn_layer_norm.bias torch.Size([1024]) +decoder.layers.0.encoder_attn.k_proj.weight torch.Size([1024, 1024]) +decoder.layers.0.encoder_attn.k_proj.bias torch.Size([1024]) +decoder.layers.0.encoder_attn.v_proj.weight torch.Size([1024, 1024]) +decoder.layers.0.encoder_attn.v_proj.bias torch.Size([1024]) +decoder.layers.0.encoder_attn.q_proj.weight torch.Size([1024, 1024]) +decoder.layers.0.encoder_attn.q_proj.bias 
torch.Size([1024]) +decoder.layers.0.encoder_attn.out_proj.weight torch.Size([1024, 1024]) +decoder.layers.0.encoder_attn.out_proj.bias torch.Size([1024]) +decoder.layers.0.encoder_attn_layer_norm.weight torch.Size([1024]) +decoder.layers.0.encoder_attn_layer_norm.bias torch.Size([1024]) +decoder.layers.0.fc1.weight torch.Size([4096, 1024]) +decoder.layers.0.fc1.bias torch.Size([4096]) +decoder.layers.0.fc2.weight torch.Size([1024, 4096]) +decoder.layers.0.fc2.bias torch.Size([1024]) +decoder.layers.0.final_layer_norm.weight torch.Size([1024]) +decoder.layers.0.final_layer_norm.bias torch.Size([1024]) +decoder.layers.1.self_attn.k_proj.weight torch.Size([1024, 1024]) +decoder.layers.1.self_attn.k_proj.bias torch.Size([1024]) +decoder.layers.1.self_attn.v_proj.weight torch.Size([1024, 1024]) +decoder.layers.1.self_attn.v_proj.bias torch.Size([1024]) +decoder.layers.1.self_attn.q_proj.weight torch.Size([1024, 1024]) +decoder.layers.1.self_attn.q_proj.bias torch.Size([1024]) +decoder.layers.1.self_attn.out_proj.weight torch.Size([1024, 1024]) +decoder.layers.1.self_attn.out_proj.bias torch.Size([1024]) +decoder.layers.1.self_attn_layer_norm.weight torch.Size([1024]) +decoder.layers.1.self_attn_layer_norm.bias torch.Size([1024]) +decoder.layers.1.encoder_attn.k_proj.weight torch.Size([1024, 1024]) +decoder.layers.1.encoder_attn.k_proj.bias torch.Size([1024]) +decoder.layers.1.encoder_attn.v_proj.weight torch.Size([1024, 1024]) +decoder.layers.1.encoder_attn.v_proj.bias torch.Size([1024]) +decoder.layers.1.encoder_attn.q_proj.weight torch.Size([1024, 1024]) +decoder.layers.1.encoder_attn.q_proj.bias torch.Size([1024]) +decoder.layers.1.encoder_attn.out_proj.weight torch.Size([1024, 1024]) +decoder.layers.1.encoder_attn.out_proj.bias torch.Size([1024]) +decoder.layers.1.encoder_attn_layer_norm.weight torch.Size([1024]) +decoder.layers.1.encoder_attn_layer_norm.bias torch.Size([1024]) +decoder.layers.1.fc1.weight torch.Size([4096, 1024]) +decoder.layers.1.fc1.bias torch.Size([4096]) +decoder.layers.1.fc2.weight torch.Size([1024, 4096]) +decoder.layers.1.fc2.bias torch.Size([1024]) +decoder.layers.1.final_layer_norm.weight torch.Size([1024]) +decoder.layers.1.final_layer_norm.bias torch.Size([1024]) +decoder.layers.2.self_attn.k_proj.weight torch.Size([1024, 1024]) +decoder.layers.2.self_attn.k_proj.bias torch.Size([1024]) +decoder.layers.2.self_attn.v_proj.weight torch.Size([1024, 1024]) +decoder.layers.2.self_attn.v_proj.bias torch.Size([1024]) +decoder.layers.2.self_attn.q_proj.weight torch.Size([1024, 1024]) +decoder.layers.2.self_attn.q_proj.bias torch.Size([1024]) +decoder.layers.2.self_attn.out_proj.weight torch.Size([1024, 1024]) +decoder.layers.2.self_attn.out_proj.bias torch.Size([1024]) +decoder.layers.2.self_attn_layer_norm.weight torch.Size([1024]) +decoder.layers.2.self_attn_layer_norm.bias torch.Size([1024]) +decoder.layers.2.encoder_attn.k_proj.weight torch.Size([1024, 1024]) +decoder.layers.2.encoder_attn.k_proj.bias torch.Size([1024]) +decoder.layers.2.encoder_attn.v_proj.weight torch.Size([1024, 1024]) +decoder.layers.2.encoder_attn.v_proj.bias torch.Size([1024]) +decoder.layers.2.encoder_attn.q_proj.weight torch.Size([1024, 1024]) +decoder.layers.2.encoder_attn.q_proj.bias torch.Size([1024]) +decoder.layers.2.encoder_attn.out_proj.weight torch.Size([1024, 1024]) +decoder.layers.2.encoder_attn.out_proj.bias torch.Size([1024]) +decoder.layers.2.encoder_attn_layer_norm.weight torch.Size([1024]) +decoder.layers.2.encoder_attn_layer_norm.bias torch.Size([1024]) 
+decoder.layers.2.fc1.weight torch.Size([4096, 1024]) +decoder.layers.2.fc1.bias torch.Size([4096]) +decoder.layers.2.fc2.weight torch.Size([1024, 4096]) +decoder.layers.2.fc2.bias torch.Size([1024]) +decoder.layers.2.final_layer_norm.weight torch.Size([1024]) +decoder.layers.2.final_layer_norm.bias torch.Size([1024]) +decoder.layers.3.self_attn.k_proj.weight torch.Size([1024, 1024]) +decoder.layers.3.self_attn.k_proj.bias torch.Size([1024]) +decoder.layers.3.self_attn.v_proj.weight torch.Size([1024, 1024]) +decoder.layers.3.self_attn.v_proj.bias torch.Size([1024]) +decoder.layers.3.self_attn.q_proj.weight torch.Size([1024, 1024]) +decoder.layers.3.self_attn.q_proj.bias torch.Size([1024]) +decoder.layers.3.self_attn.out_proj.weight torch.Size([1024, 1024]) +decoder.layers.3.self_attn.out_proj.bias torch.Size([1024]) +decoder.layers.3.self_attn_layer_norm.weight torch.Size([1024]) +decoder.layers.3.self_attn_layer_norm.bias torch.Size([1024]) +decoder.layers.3.encoder_attn.k_proj.weight torch.Size([1024, 1024]) +decoder.layers.3.encoder_attn.k_proj.bias torch.Size([1024]) +decoder.layers.3.encoder_attn.v_proj.weight torch.Size([1024, 1024]) +decoder.layers.3.encoder_attn.v_proj.bias torch.Size([1024]) +decoder.layers.3.encoder_attn.q_proj.weight torch.Size([1024, 1024]) +decoder.layers.3.encoder_attn.q_proj.bias torch.Size([1024]) +decoder.layers.3.encoder_attn.out_proj.weight torch.Size([1024, 1024]) +decoder.layers.3.encoder_attn.out_proj.bias torch.Size([1024]) +decoder.layers.3.encoder_attn_layer_norm.weight torch.Size([1024]) +decoder.layers.3.encoder_attn_layer_norm.bias torch.Size([1024]) +decoder.layers.3.fc1.weight torch.Size([4096, 1024]) +decoder.layers.3.fc1.bias torch.Size([4096]) +decoder.layers.3.fc2.weight torch.Size([1024, 4096]) +decoder.layers.3.fc2.bias torch.Size([1024]) +decoder.layers.3.final_layer_norm.weight torch.Size([1024]) +decoder.layers.3.final_layer_norm.bias torch.Size([1024]) +decoder.layers.4.self_attn.k_proj.weight torch.Size([1024, 1024]) +decoder.layers.4.self_attn.k_proj.bias torch.Size([1024]) +decoder.layers.4.self_attn.v_proj.weight torch.Size([1024, 1024]) +decoder.layers.4.self_attn.v_proj.bias torch.Size([1024]) +decoder.layers.4.self_attn.q_proj.weight torch.Size([1024, 1024]) +decoder.layers.4.self_attn.q_proj.bias torch.Size([1024]) +decoder.layers.4.self_attn.out_proj.weight torch.Size([1024, 1024]) +decoder.layers.4.self_attn.out_proj.bias torch.Size([1024]) +decoder.layers.4.self_attn_layer_norm.weight torch.Size([1024]) +decoder.layers.4.self_attn_layer_norm.bias torch.Size([1024]) +decoder.layers.4.encoder_attn.k_proj.weight torch.Size([1024, 1024]) +decoder.layers.4.encoder_attn.k_proj.bias torch.Size([1024]) +decoder.layers.4.encoder_attn.v_proj.weight torch.Size([1024, 1024]) +decoder.layers.4.encoder_attn.v_proj.bias torch.Size([1024]) +decoder.layers.4.encoder_attn.q_proj.weight torch.Size([1024, 1024]) +decoder.layers.4.encoder_attn.q_proj.bias torch.Size([1024]) +decoder.layers.4.encoder_attn.out_proj.weight torch.Size([1024, 1024]) +decoder.layers.4.encoder_attn.out_proj.bias torch.Size([1024]) +decoder.layers.4.encoder_attn_layer_norm.weight torch.Size([1024]) +decoder.layers.4.encoder_attn_layer_norm.bias torch.Size([1024]) +decoder.layers.4.fc1.weight torch.Size([4096, 1024]) +decoder.layers.4.fc1.bias torch.Size([4096]) +decoder.layers.4.fc2.weight torch.Size([1024, 4096]) +decoder.layers.4.fc2.bias torch.Size([1024]) +decoder.layers.4.final_layer_norm.weight torch.Size([1024]) +decoder.layers.4.final_layer_norm.bias 
torch.Size([1024]) +decoder.layers.5.self_attn.k_proj.weight torch.Size([1024, 1024]) +decoder.layers.5.self_attn.k_proj.bias torch.Size([1024]) +decoder.layers.5.self_attn.v_proj.weight torch.Size([1024, 1024]) +decoder.layers.5.self_attn.v_proj.bias torch.Size([1024]) +decoder.layers.5.self_attn.q_proj.weight torch.Size([1024, 1024]) +decoder.layers.5.self_attn.q_proj.bias torch.Size([1024]) +decoder.layers.5.self_attn.out_proj.weight torch.Size([1024, 1024]) +decoder.layers.5.self_attn.out_proj.bias torch.Size([1024]) +decoder.layers.5.self_attn_layer_norm.weight torch.Size([1024]) +decoder.layers.5.self_attn_layer_norm.bias torch.Size([1024]) +decoder.layers.5.encoder_attn.k_proj.weight torch.Size([1024, 1024]) +decoder.layers.5.encoder_attn.k_proj.bias torch.Size([1024]) +decoder.layers.5.encoder_attn.v_proj.weight torch.Size([1024, 1024]) +decoder.layers.5.encoder_attn.v_proj.bias torch.Size([1024]) +decoder.layers.5.encoder_attn.q_proj.weight torch.Size([1024, 1024]) +decoder.layers.5.encoder_attn.q_proj.bias torch.Size([1024]) +decoder.layers.5.encoder_attn.out_proj.weight torch.Size([1024, 1024]) +decoder.layers.5.encoder_attn.out_proj.bias torch.Size([1024]) +decoder.layers.5.encoder_attn_layer_norm.weight torch.Size([1024]) +decoder.layers.5.encoder_attn_layer_norm.bias torch.Size([1024]) +decoder.layers.5.fc1.weight torch.Size([4096, 1024]) +decoder.layers.5.fc1.bias torch.Size([4096]) +decoder.layers.5.fc2.weight torch.Size([1024, 4096]) +decoder.layers.5.fc2.bias torch.Size([1024]) +decoder.layers.5.final_layer_norm.weight torch.Size([1024]) +decoder.layers.5.final_layer_norm.bias torch.Size([1024]) +decoder.layers.6.self_attn.k_proj.weight torch.Size([1024, 1024]) +decoder.layers.6.self_attn.k_proj.bias torch.Size([1024]) +decoder.layers.6.self_attn.v_proj.weight torch.Size([1024, 1024]) +decoder.layers.6.self_attn.v_proj.bias torch.Size([1024]) +decoder.layers.6.self_attn.q_proj.weight torch.Size([1024, 1024]) +decoder.layers.6.self_attn.q_proj.bias torch.Size([1024]) +decoder.layers.6.self_attn.out_proj.weight torch.Size([1024, 1024]) +decoder.layers.6.self_attn.out_proj.bias torch.Size([1024]) +decoder.layers.6.self_attn_layer_norm.weight torch.Size([1024]) +decoder.layers.6.self_attn_layer_norm.bias torch.Size([1024]) +decoder.layers.6.encoder_attn.k_proj.weight torch.Size([1024, 1024]) +decoder.layers.6.encoder_attn.k_proj.bias torch.Size([1024]) +decoder.layers.6.encoder_attn.v_proj.weight torch.Size([1024, 1024]) +decoder.layers.6.encoder_attn.v_proj.bias torch.Size([1024]) +decoder.layers.6.encoder_attn.q_proj.weight torch.Size([1024, 1024]) +decoder.layers.6.encoder_attn.q_proj.bias torch.Size([1024]) +decoder.layers.6.encoder_attn.out_proj.weight torch.Size([1024, 1024]) +decoder.layers.6.encoder_attn.out_proj.bias torch.Size([1024]) +decoder.layers.6.encoder_attn_layer_norm.weight torch.Size([1024]) +decoder.layers.6.encoder_attn_layer_norm.bias torch.Size([1024]) +decoder.layers.6.fc1.weight torch.Size([4096, 1024]) +decoder.layers.6.fc1.bias torch.Size([4096]) +decoder.layers.6.fc2.weight torch.Size([1024, 4096]) +decoder.layers.6.fc2.bias torch.Size([1024]) +decoder.layers.6.final_layer_norm.weight torch.Size([1024]) +decoder.layers.6.final_layer_norm.bias torch.Size([1024]) +decoder.layers.7.self_attn.k_proj.weight torch.Size([1024, 1024]) +decoder.layers.7.self_attn.k_proj.bias torch.Size([1024]) +decoder.layers.7.self_attn.v_proj.weight torch.Size([1024, 1024]) +decoder.layers.7.self_attn.v_proj.bias torch.Size([1024]) 
+decoder.layers.7.self_attn.q_proj.weight torch.Size([1024, 1024]) +decoder.layers.7.self_attn.q_proj.bias torch.Size([1024]) +decoder.layers.7.self_attn.out_proj.weight torch.Size([1024, 1024]) +decoder.layers.7.self_attn.out_proj.bias torch.Size([1024]) +decoder.layers.7.self_attn_layer_norm.weight torch.Size([1024]) +decoder.layers.7.self_attn_layer_norm.bias torch.Size([1024]) +decoder.layers.7.encoder_attn.k_proj.weight torch.Size([1024, 1024]) +decoder.layers.7.encoder_attn.k_proj.bias torch.Size([1024]) +decoder.layers.7.encoder_attn.v_proj.weight torch.Size([1024, 1024]) +decoder.layers.7.encoder_attn.v_proj.bias torch.Size([1024]) +decoder.layers.7.encoder_attn.q_proj.weight torch.Size([1024, 1024]) +decoder.layers.7.encoder_attn.q_proj.bias torch.Size([1024]) +decoder.layers.7.encoder_attn.out_proj.weight torch.Size([1024, 1024]) +decoder.layers.7.encoder_attn.out_proj.bias torch.Size([1024]) +decoder.layers.7.encoder_attn_layer_norm.weight torch.Size([1024]) +decoder.layers.7.encoder_attn_layer_norm.bias torch.Size([1024]) +decoder.layers.7.fc1.weight torch.Size([4096, 1024]) +decoder.layers.7.fc1.bias torch.Size([4096]) +decoder.layers.7.fc2.weight torch.Size([1024, 4096]) +decoder.layers.7.fc2.bias torch.Size([1024]) +decoder.layers.7.final_layer_norm.weight torch.Size([1024]) +decoder.layers.7.final_layer_norm.bias torch.Size([1024]) +decoder.layers.8.self_attn.k_proj.weight torch.Size([1024, 1024]) +decoder.layers.8.self_attn.k_proj.bias torch.Size([1024]) +decoder.layers.8.self_attn.v_proj.weight torch.Size([1024, 1024]) +decoder.layers.8.self_attn.v_proj.bias torch.Size([1024]) +decoder.layers.8.self_attn.q_proj.weight torch.Size([1024, 1024]) +decoder.layers.8.self_attn.q_proj.bias torch.Size([1024]) +decoder.layers.8.self_attn.out_proj.weight torch.Size([1024, 1024]) +decoder.layers.8.self_attn.out_proj.bias torch.Size([1024]) +decoder.layers.8.self_attn_layer_norm.weight torch.Size([1024]) +decoder.layers.8.self_attn_layer_norm.bias torch.Size([1024]) +decoder.layers.8.encoder_attn.k_proj.weight torch.Size([1024, 1024]) +decoder.layers.8.encoder_attn.k_proj.bias torch.Size([1024]) +decoder.layers.8.encoder_attn.v_proj.weight torch.Size([1024, 1024]) +decoder.layers.8.encoder_attn.v_proj.bias torch.Size([1024]) +decoder.layers.8.encoder_attn.q_proj.weight torch.Size([1024, 1024]) +decoder.layers.8.encoder_attn.q_proj.bias torch.Size([1024]) +decoder.layers.8.encoder_attn.out_proj.weight torch.Size([1024, 1024]) +decoder.layers.8.encoder_attn.out_proj.bias torch.Size([1024]) +decoder.layers.8.encoder_attn_layer_norm.weight torch.Size([1024]) +decoder.layers.8.encoder_attn_layer_norm.bias torch.Size([1024]) +decoder.layers.8.fc1.weight torch.Size([4096, 1024]) +decoder.layers.8.fc1.bias torch.Size([4096]) +decoder.layers.8.fc2.weight torch.Size([1024, 4096]) +decoder.layers.8.fc2.bias torch.Size([1024]) +decoder.layers.8.final_layer_norm.weight torch.Size([1024]) +decoder.layers.8.final_layer_norm.bias torch.Size([1024]) +decoder.layers.9.self_attn.k_proj.weight torch.Size([1024, 1024]) +decoder.layers.9.self_attn.k_proj.bias torch.Size([1024]) +decoder.layers.9.self_attn.v_proj.weight torch.Size([1024, 1024]) +decoder.layers.9.self_attn.v_proj.bias torch.Size([1024]) +decoder.layers.9.self_attn.q_proj.weight torch.Size([1024, 1024]) +decoder.layers.9.self_attn.q_proj.bias torch.Size([1024]) +decoder.layers.9.self_attn.out_proj.weight torch.Size([1024, 1024]) +decoder.layers.9.self_attn.out_proj.bias torch.Size([1024]) +decoder.layers.9.self_attn_layer_norm.weight 
torch.Size([1024]) +decoder.layers.9.self_attn_layer_norm.bias torch.Size([1024]) +decoder.layers.9.encoder_attn.k_proj.weight torch.Size([1024, 1024]) +decoder.layers.9.encoder_attn.k_proj.bias torch.Size([1024]) +decoder.layers.9.encoder_attn.v_proj.weight torch.Size([1024, 1024]) +decoder.layers.9.encoder_attn.v_proj.bias torch.Size([1024]) +decoder.layers.9.encoder_attn.q_proj.weight torch.Size([1024, 1024]) +decoder.layers.9.encoder_attn.q_proj.bias torch.Size([1024]) +decoder.layers.9.encoder_attn.out_proj.weight torch.Size([1024, 1024]) +decoder.layers.9.encoder_attn.out_proj.bias torch.Size([1024]) +decoder.layers.9.encoder_attn_layer_norm.weight torch.Size([1024]) +decoder.layers.9.encoder_attn_layer_norm.bias torch.Size([1024]) +decoder.layers.9.fc1.weight torch.Size([4096, 1024]) +decoder.layers.9.fc1.bias torch.Size([4096]) +decoder.layers.9.fc2.weight torch.Size([1024, 4096]) +decoder.layers.9.fc2.bias torch.Size([1024]) +decoder.layers.9.final_layer_norm.weight torch.Size([1024]) +decoder.layers.9.final_layer_norm.bias torch.Size([1024]) +decoder.layers.10.self_attn.k_proj.weight torch.Size([1024, 1024]) +decoder.layers.10.self_attn.k_proj.bias torch.Size([1024]) +decoder.layers.10.self_attn.v_proj.weight torch.Size([1024, 1024]) +decoder.layers.10.self_attn.v_proj.bias torch.Size([1024]) +decoder.layers.10.self_attn.q_proj.weight torch.Size([1024, 1024]) +decoder.layers.10.self_attn.q_proj.bias torch.Size([1024]) +decoder.layers.10.self_attn.out_proj.weight torch.Size([1024, 1024]) +decoder.layers.10.self_attn.out_proj.bias torch.Size([1024]) +decoder.layers.10.self_attn_layer_norm.weight torch.Size([1024]) +decoder.layers.10.self_attn_layer_norm.bias torch.Size([1024]) +decoder.layers.10.encoder_attn.k_proj.weight torch.Size([1024, 1024]) +decoder.layers.10.encoder_attn.k_proj.bias torch.Size([1024]) +decoder.layers.10.encoder_attn.v_proj.weight torch.Size([1024, 1024]) +decoder.layers.10.encoder_attn.v_proj.bias torch.Size([1024]) +decoder.layers.10.encoder_attn.q_proj.weight torch.Size([1024, 1024]) +decoder.layers.10.encoder_attn.q_proj.bias torch.Size([1024]) +decoder.layers.10.encoder_attn.out_proj.weight torch.Size([1024, 1024]) +decoder.layers.10.encoder_attn.out_proj.bias torch.Size([1024]) +decoder.layers.10.encoder_attn_layer_norm.weight torch.Size([1024]) +decoder.layers.10.encoder_attn_layer_norm.bias torch.Size([1024]) +decoder.layers.10.fc1.weight torch.Size([4096, 1024]) +decoder.layers.10.fc1.bias torch.Size([4096]) +decoder.layers.10.fc2.weight torch.Size([1024, 4096]) +decoder.layers.10.fc2.bias torch.Size([1024]) +decoder.layers.10.final_layer_norm.weight torch.Size([1024]) +decoder.layers.10.final_layer_norm.bias torch.Size([1024]) +decoder.layers.11.self_attn.k_proj.weight torch.Size([1024, 1024]) +decoder.layers.11.self_attn.k_proj.bias torch.Size([1024]) +decoder.layers.11.self_attn.v_proj.weight torch.Size([1024, 1024]) +decoder.layers.11.self_attn.v_proj.bias torch.Size([1024]) +decoder.layers.11.self_attn.q_proj.weight torch.Size([1024, 1024]) +decoder.layers.11.self_attn.q_proj.bias torch.Size([1024]) +decoder.layers.11.self_attn.out_proj.weight torch.Size([1024, 1024]) +decoder.layers.11.self_attn.out_proj.bias torch.Size([1024]) +decoder.layers.11.self_attn_layer_norm.weight torch.Size([1024]) +decoder.layers.11.self_attn_layer_norm.bias torch.Size([1024]) +decoder.layers.11.encoder_attn.k_proj.weight torch.Size([1024, 1024]) +decoder.layers.11.encoder_attn.k_proj.bias torch.Size([1024]) +decoder.layers.11.encoder_attn.v_proj.weight 
torch.Size([1024, 1024]) +decoder.layers.11.encoder_attn.v_proj.bias torch.Size([1024]) +decoder.layers.11.encoder_attn.q_proj.weight torch.Size([1024, 1024]) +decoder.layers.11.encoder_attn.q_proj.bias torch.Size([1024]) +decoder.layers.11.encoder_attn.out_proj.weight torch.Size([1024, 1024]) +decoder.layers.11.encoder_attn.out_proj.bias torch.Size([1024]) +decoder.layers.11.encoder_attn_layer_norm.weight torch.Size([1024]) +decoder.layers.11.encoder_attn_layer_norm.bias torch.Size([1024]) +decoder.layers.11.fc1.weight torch.Size([4096, 1024]) +decoder.layers.11.fc1.bias torch.Size([4096]) +decoder.layers.11.fc2.weight torch.Size([1024, 4096]) +decoder.layers.11.fc2.bias torch.Size([1024]) +decoder.layers.11.final_layer_norm.weight torch.Size([1024]) +decoder.layers.11.final_layer_norm.bias torch.Size([1024]) +decoder.output_projection.weight torch.Size([50265, 1024]) \ No newline at end of file From 80a7ce940a4bb0948728d1170188937acbb23d82 Mon Sep 17 00:00:00 2001 From: SaulLu Date: Fri, 3 Sep 2021 09:08:08 +0200 Subject: [PATCH 60/84] fix logs --- bsmetadata/train.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bsmetadata/train.py b/bsmetadata/train.py index 6c8ba286..631a4103 100644 --- a/bsmetadata/train.py +++ b/bsmetadata/train.py @@ -201,8 +201,8 @@ def evaluate(eval_dataloader): model.train() return {"perplexity": perplexity} - logger.info("***** Start training *****") if args.do_train: + logger.info("***** Start training *****") # Train! progress_bar = tqdm(range(args.max_train_steps), desc="training") completed_steps = 0 @@ -254,8 +254,8 @@ def evaluate(eval_dataloader): if completed_steps >= args.max_train_steps: break logger_metrics.close() - - logger.info("***** Training finished *****") + logger.info("***** Training finished *****") + if is_local_main_process and args.out_dir is not None: accelerator.wait_for_everyone() unwrapped_model = accelerator.unwrap_model(model) From 25acad467abd28a37483b93d03a512268537671d Mon Sep 17 00:00:00 2001 From: SaulLu Date: Mon, 6 Sep 2021 10:32:58 +0200 Subject: [PATCH 61/84] change to None empty data files --- bsmetadata/build_dataset/with_metadata.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/bsmetadata/build_dataset/with_metadata.py b/bsmetadata/build_dataset/with_metadata.py index d2351f79..07cb4009 100644 --- a/bsmetadata/build_dataset/with_metadata.py +++ b/bsmetadata/build_dataset/with_metadata.py @@ -46,6 +46,9 @@ def build_dataset(tokenizer, args): if args.validation_file is not None: data_files["validation"] = args.validation_file + if not data_files: + data_files = None + if args.dataset_name is not None: # Downloading and loading a dataset from the hub. 
raw_datasets = load_dataset( From b03cde347db5eefc1efb7fd102b6181f372fd426 Mon Sep 17 00:00:00 2001 From: SaulLu Date: Mon, 6 Sep 2021 10:33:12 +0200 Subject: [PATCH 62/84] see offline dataset --- .../html/SLURM/crime_and_punish_test/create_dataset.slurm | 1 + 1 file changed, 1 insertion(+) diff --git a/experiments/html/SLURM/crime_and_punish_test/create_dataset.slurm b/experiments/html/SLURM/crime_and_punish_test/create_dataset.slurm index c51058be..9eb3afa5 100644 --- a/experiments/html/SLURM/crime_and_punish_test/create_dataset.slurm +++ b/experiments/html/SLURM/crime_and_punish_test/create_dataset.slurm @@ -13,6 +13,7 @@ set -x -e source $HOME/start-user +export HF_DATASETS_OFFLINE=1 export TRANSFORMERS_OFFLINE=1 cd $WORK/repos/metadata/ From ef90eb9848b9c06d87eba38fbe30fc7a5c506a37 Mon Sep 17 00:00:00 2001 From: SaulLu Date: Mon, 6 Sep 2021 10:33:22 +0200 Subject: [PATCH 63/84] change requirements --- requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 7eb152ba..d81c860c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,6 +3,5 @@ hydra_core==1.1.* wandb==0.10.30 transformers accelerate==0.3.0 -omegaconf==2.1.1 git+https://github.com/huggingface/datasets.git From c2ab8229340aadf45ca39e8774850d3dadda2d72 Mon Sep 17 00:00:00 2001 From: SaulLu Date: Mon, 6 Sep 2021 10:42:06 +0200 Subject: [PATCH 64/84] remove unused files --- bsmetadata/build_dataset/with_metadata.py | 128 ------ perso/bart_state_dict.txt | 515 ---------------------- 2 files changed, 643 deletions(-) delete mode 100644 bsmetadata/build_dataset/with_metadata.py delete mode 100644 perso/bart_state_dict.txt diff --git a/bsmetadata/build_dataset/with_metadata.py b/bsmetadata/build_dataset/with_metadata.py deleted file mode 100644 index 07cb4009..00000000 --- a/bsmetadata/build_dataset/with_metadata.py +++ /dev/null @@ -1,128 +0,0 @@ -import functools -import logging - -from datasets import load_dataset -from torch.utils.data import DataLoader -from transformers import default_data_collator - -from bsmetadata.metadata_utils import add_metadata_and_chunk_examples - - -logger = logging.getLogger(__name__) - - -def build_dataset(tokenizer, args): - """ - Args: - tokenizer: a huggingface/transformers tokenizer - args: a DataConfig - Returns: - a training dataloader and one or more validation dataloaders - validation dataloaders should be in a dictionary - each dataloader should yield {str: torch.Tensor(cpu) } - dictionary keys may have 'metadata_mask' - other fields will be passed to model - note: metadata_mask should be padded - Example: - train_dataloader, val_dataloaders = get_dataloaders(...) - for batch in train_dataloader: - metadata_mask = batch.get('metadata_mask', None) - outputs = model(**batch) - metrics = loss_fn(batch, outputs, metadata_mask) - """ - # Mostly copy/paste from https://github.com/huggingface/transformers/blob/master/examples/pytorch/language-modeling/run_clm_no_trainer.py - # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) - # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ - # (the dataset will be downloaded automatically from the datasets Hub). - # - # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called - # 'text' is found. You can easily tweak this behavior (see below). 
- # - # In distributed training, the load_dataset function guarantees that only one local process can concurrently - # download the dataset. - data_files = {} - if args.train_file is not None: - data_files["train"] = args.train_file - if args.validation_file is not None: - data_files["validation"] = args.validation_file - - if not data_files: - data_files = None - - if args.dataset_name is not None: - # Downloading and loading a dataset from the hub. - raw_datasets = load_dataset( - args.dataset_name, - args.dataset_config_name, - data_files=data_files, - cache_dir=args.cache_dir, - keep_in_memory=False, - ) - - if "validation" not in raw_datasets.keys(): - raw_datasets["validation"] = load_dataset( - args.dataset_name, - args.dataset_config_name, - split=f"train[:{args.validation_split_percentage}%]", - cache_dir=args.cache_dir, - ) - raw_datasets["train"] = load_dataset( - args.dataset_name, - args.dataset_config_name, - split=f"train[{args.validation_split_percentage}%:]", - cache_dir=args.cache_dir, - ) - else: - extension = args.train_file.split(".")[-1] if not args.extension else args.extension - if extension == "txt": - raise ValueError( - "You have entered a text file for the train data, but this type of file cannot contain metadata " - "columns. Wouldn't you rather have a file in json/jsonl or pandas format?" - ) - if extension == "jsonl": - extension = "json" - raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=args.cache_dir) - - if "validation" not in raw_datasets.keys(): - raw_datasets["validation"] = load_dataset( - extension, - data_files=data_files, - split=f"train[:{args.validation_split_percentage}%]", - cache_dir=args.cache_dir, - ) - raw_datasets["train"] = load_dataset( - extension, - data_files=data_files, - split=f"train[{args.validation_split_percentage}%:]", - cache_dir=args.cache_dir, - ) - # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at - # https://huggingface.co/docs/datasets/loading_datasets.html. - - # Preprocessing the datasets. 
- column_names = raw_datasets["train"].column_names - - # First we pre-process our text and metadata - lm_datasets = raw_datasets.map( - functools.partial(add_metadata_and_chunk_examples, tokenizer=tokenizer, cfg=args), - batched=True, - num_proc=args.preprocessing_num_workers, - load_from_cache_file=not args.overwrite_cache, - desc="Pre-process the text and metadata to create new samples", - remove_columns=column_names, - ) - - def create_labels_column(examples): - examples["labels"] = examples["input_ids"].copy() - return examples - - # Then we add the column containing the labels - lm_datasets = lm_datasets.map( - create_labels_column, - batched=True, - num_proc=args.preprocessing_num_workers, - load_from_cache_file=not args.overwrite_cache, - desc="Create labels column", - ) - - lm_datasets.save_to_disk(args.dataset_saving_dir) diff --git a/perso/bart_state_dict.txt b/perso/bart_state_dict.txt deleted file mode 100644 index 610cd1b1..00000000 --- a/perso/bart_state_dict.txt +++ /dev/null @@ -1,515 +0,0 @@ -encoder.version torch.Size([1]) -encoder.embed_tokens.weight torch.Size([50265, 1024]) -encoder.embed_positions.weight torch.Size([1026, 1024]) -encoder.layernorm_embedding.weight torch.Size([1024]) -encoder.layernorm_embedding.bias torch.Size([1024]) -encoder.layers.0.self_attn.k_proj.weight torch.Size([1024, 1024]) -encoder.layers.0.self_attn.k_proj.bias torch.Size([1024]) -encoder.layers.0.self_attn.v_proj.weight torch.Size([1024, 1024]) -encoder.layers.0.self_attn.v_proj.bias torch.Size([1024]) -encoder.layers.0.self_attn.q_proj.weight torch.Size([1024, 1024]) -encoder.layers.0.self_attn.q_proj.bias torch.Size([1024]) -encoder.layers.0.self_attn.out_proj.weight torch.Size([1024, 1024]) -encoder.layers.0.self_attn.out_proj.bias torch.Size([1024]) -encoder.layers.0.self_attn_layer_norm.weight torch.Size([1024]) -encoder.layers.0.self_attn_layer_norm.bias torch.Size([1024]) -encoder.layers.0.fc1.weight torch.Size([4096, 1024]) -encoder.layers.0.fc1.bias torch.Size([4096]) -encoder.layers.0.fc2.weight torch.Size([1024, 4096]) -encoder.layers.0.fc2.bias torch.Size([1024]) -encoder.layers.0.final_layer_norm.weight torch.Size([1024]) -encoder.layers.0.final_layer_norm.bias torch.Size([1024]) -encoder.layers.1.self_attn.k_proj.weight torch.Size([1024, 1024]) -encoder.layers.1.self_attn.k_proj.bias torch.Size([1024]) -encoder.layers.1.self_attn.v_proj.weight torch.Size([1024, 1024]) -encoder.layers.1.self_attn.v_proj.bias torch.Size([1024]) -encoder.layers.1.self_attn.q_proj.weight torch.Size([1024, 1024]) -encoder.layers.1.self_attn.q_proj.bias torch.Size([1024]) -encoder.layers.1.self_attn.out_proj.weight torch.Size([1024, 1024]) -encoder.layers.1.self_attn.out_proj.bias torch.Size([1024]) -encoder.layers.1.self_attn_layer_norm.weight torch.Size([1024]) -encoder.layers.1.self_attn_layer_norm.bias torch.Size([1024]) -encoder.layers.1.fc1.weight torch.Size([4096, 1024]) -encoder.layers.1.fc1.bias torch.Size([4096]) -encoder.layers.1.fc2.weight torch.Size([1024, 4096]) -encoder.layers.1.fc2.bias torch.Size([1024]) -encoder.layers.1.final_layer_norm.weight torch.Size([1024]) -encoder.layers.1.final_layer_norm.bias torch.Size([1024]) -encoder.layers.2.self_attn.k_proj.weight torch.Size([1024, 1024]) -encoder.layers.2.self_attn.k_proj.bias torch.Size([1024]) -encoder.layers.2.self_attn.v_proj.weight torch.Size([1024, 1024]) -encoder.layers.2.self_attn.v_proj.bias torch.Size([1024]) -encoder.layers.2.self_attn.q_proj.weight torch.Size([1024, 1024]) -encoder.layers.2.self_attn.q_proj.bias 
torch.Size([1024]) -encoder.layers.2.self_attn.out_proj.weight torch.Size([1024, 1024]) -encoder.layers.2.self_attn.out_proj.bias torch.Size([1024]) -encoder.layers.2.self_attn_layer_norm.weight torch.Size([1024]) -encoder.layers.2.self_attn_layer_norm.bias torch.Size([1024]) -encoder.layers.2.fc1.weight torch.Size([4096, 1024]) -encoder.layers.2.fc1.bias torch.Size([4096]) -encoder.layers.2.fc2.weight torch.Size([1024, 4096]) -encoder.layers.2.fc2.bias torch.Size([1024]) -encoder.layers.2.final_layer_norm.weight torch.Size([1024]) -encoder.layers.2.final_layer_norm.bias torch.Size([1024]) -encoder.layers.3.self_attn.k_proj.weight torch.Size([1024, 1024]) -encoder.layers.3.self_attn.k_proj.bias torch.Size([1024]) -encoder.layers.3.self_attn.v_proj.weight torch.Size([1024, 1024]) -encoder.layers.3.self_attn.v_proj.bias torch.Size([1024]) -encoder.layers.3.self_attn.q_proj.weight torch.Size([1024, 1024]) -encoder.layers.3.self_attn.q_proj.bias torch.Size([1024]) -encoder.layers.3.self_attn.out_proj.weight torch.Size([1024, 1024]) -encoder.layers.3.self_attn.out_proj.bias torch.Size([1024]) -encoder.layers.3.self_attn_layer_norm.weight torch.Size([1024]) -encoder.layers.3.self_attn_layer_norm.bias torch.Size([1024]) -encoder.layers.3.fc1.weight torch.Size([4096, 1024]) -encoder.layers.3.fc1.bias torch.Size([4096]) -encoder.layers.3.fc2.weight torch.Size([1024, 4096]) -encoder.layers.3.fc2.bias torch.Size([1024]) -encoder.layers.3.final_layer_norm.weight torch.Size([1024]) -encoder.layers.3.final_layer_norm.bias torch.Size([1024]) -encoder.layers.4.self_attn.k_proj.weight torch.Size([1024, 1024]) -encoder.layers.4.self_attn.k_proj.bias torch.Size([1024]) -encoder.layers.4.self_attn.v_proj.weight torch.Size([1024, 1024]) -encoder.layers.4.self_attn.v_proj.bias torch.Size([1024]) -encoder.layers.4.self_attn.q_proj.weight torch.Size([1024, 1024]) -encoder.layers.4.self_attn.q_proj.bias torch.Size([1024]) -encoder.layers.4.self_attn.out_proj.weight torch.Size([1024, 1024]) -encoder.layers.4.self_attn.out_proj.bias torch.Size([1024]) -encoder.layers.4.self_attn_layer_norm.weight torch.Size([1024]) -encoder.layers.4.self_attn_layer_norm.bias torch.Size([1024]) -encoder.layers.4.fc1.weight torch.Size([4096, 1024]) -encoder.layers.4.fc1.bias torch.Size([4096]) -encoder.layers.4.fc2.weight torch.Size([1024, 4096]) -encoder.layers.4.fc2.bias torch.Size([1024]) -encoder.layers.4.final_layer_norm.weight torch.Size([1024]) -encoder.layers.4.final_layer_norm.bias torch.Size([1024]) -encoder.layers.5.self_attn.k_proj.weight torch.Size([1024, 1024]) -encoder.layers.5.self_attn.k_proj.bias torch.Size([1024]) -encoder.layers.5.self_attn.v_proj.weight torch.Size([1024, 1024]) -encoder.layers.5.self_attn.v_proj.bias torch.Size([1024]) -encoder.layers.5.self_attn.q_proj.weight torch.Size([1024, 1024]) -encoder.layers.5.self_attn.q_proj.bias torch.Size([1024]) -encoder.layers.5.self_attn.out_proj.weight torch.Size([1024, 1024]) -encoder.layers.5.self_attn.out_proj.bias torch.Size([1024]) -encoder.layers.5.self_attn_layer_norm.weight torch.Size([1024]) -encoder.layers.5.self_attn_layer_norm.bias torch.Size([1024]) -encoder.layers.5.fc1.weight torch.Size([4096, 1024]) -encoder.layers.5.fc1.bias torch.Size([4096]) -encoder.layers.5.fc2.weight torch.Size([1024, 4096]) -encoder.layers.5.fc2.bias torch.Size([1024]) -encoder.layers.5.final_layer_norm.weight torch.Size([1024]) -encoder.layers.5.final_layer_norm.bias torch.Size([1024]) -encoder.layers.6.self_attn.k_proj.weight torch.Size([1024, 1024]) 
-encoder.layers.6.self_attn.k_proj.bias torch.Size([1024]) -encoder.layers.6.self_attn.v_proj.weight torch.Size([1024, 1024]) -encoder.layers.6.self_attn.v_proj.bias torch.Size([1024]) -encoder.layers.6.self_attn.q_proj.weight torch.Size([1024, 1024]) -encoder.layers.6.self_attn.q_proj.bias torch.Size([1024]) -encoder.layers.6.self_attn.out_proj.weight torch.Size([1024, 1024]) -encoder.layers.6.self_attn.out_proj.bias torch.Size([1024]) -encoder.layers.6.self_attn_layer_norm.weight torch.Size([1024]) -encoder.layers.6.self_attn_layer_norm.bias torch.Size([1024]) -encoder.layers.6.fc1.weight torch.Size([4096, 1024]) -encoder.layers.6.fc1.bias torch.Size([4096]) -encoder.layers.6.fc2.weight torch.Size([1024, 4096]) -encoder.layers.6.fc2.bias torch.Size([1024]) -encoder.layers.6.final_layer_norm.weight torch.Size([1024]) -encoder.layers.6.final_layer_norm.bias torch.Size([1024]) -encoder.layers.7.self_attn.k_proj.weight torch.Size([1024, 1024]) -encoder.layers.7.self_attn.k_proj.bias torch.Size([1024]) -encoder.layers.7.self_attn.v_proj.weight torch.Size([1024, 1024]) -encoder.layers.7.self_attn.v_proj.bias torch.Size([1024]) -encoder.layers.7.self_attn.q_proj.weight torch.Size([1024, 1024]) -encoder.layers.7.self_attn.q_proj.bias torch.Size([1024]) -encoder.layers.7.self_attn.out_proj.weight torch.Size([1024, 1024]) -encoder.layers.7.self_attn.out_proj.bias torch.Size([1024]) -encoder.layers.7.self_attn_layer_norm.weight torch.Size([1024]) -encoder.layers.7.self_attn_layer_norm.bias torch.Size([1024]) -encoder.layers.7.fc1.weight torch.Size([4096, 1024]) -encoder.layers.7.fc1.bias torch.Size([4096]) -encoder.layers.7.fc2.weight torch.Size([1024, 4096]) -encoder.layers.7.fc2.bias torch.Size([1024]) -encoder.layers.7.final_layer_norm.weight torch.Size([1024]) -encoder.layers.7.final_layer_norm.bias torch.Size([1024]) -encoder.layers.8.self_attn.k_proj.weight torch.Size([1024, 1024]) -encoder.layers.8.self_attn.k_proj.bias torch.Size([1024]) -encoder.layers.8.self_attn.v_proj.weight torch.Size([1024, 1024]) -encoder.layers.8.self_attn.v_proj.bias torch.Size([1024]) -encoder.layers.8.self_attn.q_proj.weight torch.Size([1024, 1024]) -encoder.layers.8.self_attn.q_proj.bias torch.Size([1024]) -encoder.layers.8.self_attn.out_proj.weight torch.Size([1024, 1024]) -encoder.layers.8.self_attn.out_proj.bias torch.Size([1024]) -encoder.layers.8.self_attn_layer_norm.weight torch.Size([1024]) -encoder.layers.8.self_attn_layer_norm.bias torch.Size([1024]) -encoder.layers.8.fc1.weight torch.Size([4096, 1024]) -encoder.layers.8.fc1.bias torch.Size([4096]) -encoder.layers.8.fc2.weight torch.Size([1024, 4096]) -encoder.layers.8.fc2.bias torch.Size([1024]) -encoder.layers.8.final_layer_norm.weight torch.Size([1024]) -encoder.layers.8.final_layer_norm.bias torch.Size([1024]) -encoder.layers.9.self_attn.k_proj.weight torch.Size([1024, 1024]) -encoder.layers.9.self_attn.k_proj.bias torch.Size([1024]) -encoder.layers.9.self_attn.v_proj.weight torch.Size([1024, 1024]) -encoder.layers.9.self_attn.v_proj.bias torch.Size([1024]) -encoder.layers.9.self_attn.q_proj.weight torch.Size([1024, 1024]) -encoder.layers.9.self_attn.q_proj.bias torch.Size([1024]) -encoder.layers.9.self_attn.out_proj.weight torch.Size([1024, 1024]) -encoder.layers.9.self_attn.out_proj.bias torch.Size([1024]) -encoder.layers.9.self_attn_layer_norm.weight torch.Size([1024]) -encoder.layers.9.self_attn_layer_norm.bias torch.Size([1024]) -encoder.layers.9.fc1.weight torch.Size([4096, 1024]) -encoder.layers.9.fc1.bias torch.Size([4096]) 
-encoder.layers.9.fc2.weight torch.Size([1024, 4096]) -encoder.layers.9.fc2.bias torch.Size([1024]) -encoder.layers.9.final_layer_norm.weight torch.Size([1024]) -encoder.layers.9.final_layer_norm.bias torch.Size([1024]) -encoder.layers.10.self_attn.k_proj.weight torch.Size([1024, 1024]) -encoder.layers.10.self_attn.k_proj.bias torch.Size([1024]) -encoder.layers.10.self_attn.v_proj.weight torch.Size([1024, 1024]) -encoder.layers.10.self_attn.v_proj.bias torch.Size([1024]) -encoder.layers.10.self_attn.q_proj.weight torch.Size([1024, 1024]) -encoder.layers.10.self_attn.q_proj.bias torch.Size([1024]) -encoder.layers.10.self_attn.out_proj.weight torch.Size([1024, 1024]) -encoder.layers.10.self_attn.out_proj.bias torch.Size([1024]) -encoder.layers.10.self_attn_layer_norm.weight torch.Size([1024]) -encoder.layers.10.self_attn_layer_norm.bias torch.Size([1024]) -encoder.layers.10.fc1.weight torch.Size([4096, 1024]) -encoder.layers.10.fc1.bias torch.Size([4096]) -encoder.layers.10.fc2.weight torch.Size([1024, 4096]) -encoder.layers.10.fc2.bias torch.Size([1024]) -encoder.layers.10.final_layer_norm.weight torch.Size([1024]) -encoder.layers.10.final_layer_norm.bias torch.Size([1024]) -encoder.layers.11.self_attn.k_proj.weight torch.Size([1024, 1024]) -encoder.layers.11.self_attn.k_proj.bias torch.Size([1024]) -encoder.layers.11.self_attn.v_proj.weight torch.Size([1024, 1024]) -encoder.layers.11.self_attn.v_proj.bias torch.Size([1024]) -encoder.layers.11.self_attn.q_proj.weight torch.Size([1024, 1024]) -encoder.layers.11.self_attn.q_proj.bias torch.Size([1024]) -encoder.layers.11.self_attn.out_proj.weight torch.Size([1024, 1024]) -encoder.layers.11.self_attn.out_proj.bias torch.Size([1024]) -encoder.layers.11.self_attn_layer_norm.weight torch.Size([1024]) -encoder.layers.11.self_attn_layer_norm.bias torch.Size([1024]) -encoder.layers.11.fc1.weight torch.Size([4096, 1024]) -encoder.layers.11.fc1.bias torch.Size([4096]) -encoder.layers.11.fc2.weight torch.Size([1024, 4096]) -encoder.layers.11.fc2.bias torch.Size([1024]) -encoder.layers.11.final_layer_norm.weight torch.Size([1024]) -encoder.layers.11.final_layer_norm.bias torch.Size([1024]) -decoder.version torch.Size([1]) -decoder.embed_tokens.weight torch.Size([50265, 1024]) -decoder.embed_positions.weight torch.Size([1026, 1024]) -decoder.layernorm_embedding.weight torch.Size([1024]) -decoder.layernorm_embedding.bias torch.Size([1024]) -decoder.layers.0.self_attn.k_proj.weight torch.Size([1024, 1024]) -decoder.layers.0.self_attn.k_proj.bias torch.Size([1024]) -decoder.layers.0.self_attn.v_proj.weight torch.Size([1024, 1024]) -decoder.layers.0.self_attn.v_proj.bias torch.Size([1024]) -decoder.layers.0.self_attn.q_proj.weight torch.Size([1024, 1024]) -decoder.layers.0.self_attn.q_proj.bias torch.Size([1024]) -decoder.layers.0.self_attn.out_proj.weight torch.Size([1024, 1024]) -decoder.layers.0.self_attn.out_proj.bias torch.Size([1024]) -decoder.layers.0.self_attn_layer_norm.weight torch.Size([1024]) -decoder.layers.0.self_attn_layer_norm.bias torch.Size([1024]) -decoder.layers.0.encoder_attn.k_proj.weight torch.Size([1024, 1024]) -decoder.layers.0.encoder_attn.k_proj.bias torch.Size([1024]) -decoder.layers.0.encoder_attn.v_proj.weight torch.Size([1024, 1024]) -decoder.layers.0.encoder_attn.v_proj.bias torch.Size([1024]) -decoder.layers.0.encoder_attn.q_proj.weight torch.Size([1024, 1024]) -decoder.layers.0.encoder_attn.q_proj.bias torch.Size([1024]) -decoder.layers.0.encoder_attn.out_proj.weight torch.Size([1024, 1024]) 
-decoder.layers.0.encoder_attn.out_proj.bias torch.Size([1024]) -decoder.layers.0.encoder_attn_layer_norm.weight torch.Size([1024]) -decoder.layers.0.encoder_attn_layer_norm.bias torch.Size([1024]) -decoder.layers.0.fc1.weight torch.Size([4096, 1024]) -decoder.layers.0.fc1.bias torch.Size([4096]) -decoder.layers.0.fc2.weight torch.Size([1024, 4096]) -decoder.layers.0.fc2.bias torch.Size([1024]) -decoder.layers.0.final_layer_norm.weight torch.Size([1024]) -decoder.layers.0.final_layer_norm.bias torch.Size([1024]) -decoder.layers.1.self_attn.k_proj.weight torch.Size([1024, 1024]) -decoder.layers.1.self_attn.k_proj.bias torch.Size([1024]) -decoder.layers.1.self_attn.v_proj.weight torch.Size([1024, 1024]) -decoder.layers.1.self_attn.v_proj.bias torch.Size([1024]) -decoder.layers.1.self_attn.q_proj.weight torch.Size([1024, 1024]) -decoder.layers.1.self_attn.q_proj.bias torch.Size([1024]) -decoder.layers.1.self_attn.out_proj.weight torch.Size([1024, 1024]) -decoder.layers.1.self_attn.out_proj.bias torch.Size([1024]) -decoder.layers.1.self_attn_layer_norm.weight torch.Size([1024]) -decoder.layers.1.self_attn_layer_norm.bias torch.Size([1024]) -decoder.layers.1.encoder_attn.k_proj.weight torch.Size([1024, 1024]) -decoder.layers.1.encoder_attn.k_proj.bias torch.Size([1024]) -decoder.layers.1.encoder_attn.v_proj.weight torch.Size([1024, 1024]) -decoder.layers.1.encoder_attn.v_proj.bias torch.Size([1024]) -decoder.layers.1.encoder_attn.q_proj.weight torch.Size([1024, 1024]) -decoder.layers.1.encoder_attn.q_proj.bias torch.Size([1024]) -decoder.layers.1.encoder_attn.out_proj.weight torch.Size([1024, 1024]) -decoder.layers.1.encoder_attn.out_proj.bias torch.Size([1024]) -decoder.layers.1.encoder_attn_layer_norm.weight torch.Size([1024]) -decoder.layers.1.encoder_attn_layer_norm.bias torch.Size([1024]) -decoder.layers.1.fc1.weight torch.Size([4096, 1024]) -decoder.layers.1.fc1.bias torch.Size([4096]) -decoder.layers.1.fc2.weight torch.Size([1024, 4096]) -decoder.layers.1.fc2.bias torch.Size([1024]) -decoder.layers.1.final_layer_norm.weight torch.Size([1024]) -decoder.layers.1.final_layer_norm.bias torch.Size([1024]) -decoder.layers.2.self_attn.k_proj.weight torch.Size([1024, 1024]) -decoder.layers.2.self_attn.k_proj.bias torch.Size([1024]) -decoder.layers.2.self_attn.v_proj.weight torch.Size([1024, 1024]) -decoder.layers.2.self_attn.v_proj.bias torch.Size([1024]) -decoder.layers.2.self_attn.q_proj.weight torch.Size([1024, 1024]) -decoder.layers.2.self_attn.q_proj.bias torch.Size([1024]) -decoder.layers.2.self_attn.out_proj.weight torch.Size([1024, 1024]) -decoder.layers.2.self_attn.out_proj.bias torch.Size([1024]) -decoder.layers.2.self_attn_layer_norm.weight torch.Size([1024]) -decoder.layers.2.self_attn_layer_norm.bias torch.Size([1024]) -decoder.layers.2.encoder_attn.k_proj.weight torch.Size([1024, 1024]) -decoder.layers.2.encoder_attn.k_proj.bias torch.Size([1024]) -decoder.layers.2.encoder_attn.v_proj.weight torch.Size([1024, 1024]) -decoder.layers.2.encoder_attn.v_proj.bias torch.Size([1024]) -decoder.layers.2.encoder_attn.q_proj.weight torch.Size([1024, 1024]) -decoder.layers.2.encoder_attn.q_proj.bias torch.Size([1024]) -decoder.layers.2.encoder_attn.out_proj.weight torch.Size([1024, 1024]) -decoder.layers.2.encoder_attn.out_proj.bias torch.Size([1024]) -decoder.layers.2.encoder_attn_layer_norm.weight torch.Size([1024]) -decoder.layers.2.encoder_attn_layer_norm.bias torch.Size([1024]) -decoder.layers.2.fc1.weight torch.Size([4096, 1024]) -decoder.layers.2.fc1.bias torch.Size([4096]) 
-decoder.layers.2.fc2.weight torch.Size([1024, 4096]) -decoder.layers.2.fc2.bias torch.Size([1024]) -decoder.layers.2.final_layer_norm.weight torch.Size([1024]) -decoder.layers.2.final_layer_norm.bias torch.Size([1024]) -decoder.layers.3.self_attn.k_proj.weight torch.Size([1024, 1024]) -decoder.layers.3.self_attn.k_proj.bias torch.Size([1024]) -decoder.layers.3.self_attn.v_proj.weight torch.Size([1024, 1024]) -decoder.layers.3.self_attn.v_proj.bias torch.Size([1024]) -decoder.layers.3.self_attn.q_proj.weight torch.Size([1024, 1024]) -decoder.layers.3.self_attn.q_proj.bias torch.Size([1024]) -decoder.layers.3.self_attn.out_proj.weight torch.Size([1024, 1024]) -decoder.layers.3.self_attn.out_proj.bias torch.Size([1024]) -decoder.layers.3.self_attn_layer_norm.weight torch.Size([1024]) -decoder.layers.3.self_attn_layer_norm.bias torch.Size([1024]) -decoder.layers.3.encoder_attn.k_proj.weight torch.Size([1024, 1024]) -decoder.layers.3.encoder_attn.k_proj.bias torch.Size([1024]) -decoder.layers.3.encoder_attn.v_proj.weight torch.Size([1024, 1024]) -decoder.layers.3.encoder_attn.v_proj.bias torch.Size([1024]) -decoder.layers.3.encoder_attn.q_proj.weight torch.Size([1024, 1024]) -decoder.layers.3.encoder_attn.q_proj.bias torch.Size([1024]) -decoder.layers.3.encoder_attn.out_proj.weight torch.Size([1024, 1024]) -decoder.layers.3.encoder_attn.out_proj.bias torch.Size([1024]) -decoder.layers.3.encoder_attn_layer_norm.weight torch.Size([1024]) -decoder.layers.3.encoder_attn_layer_norm.bias torch.Size([1024]) -decoder.layers.3.fc1.weight torch.Size([4096, 1024]) -decoder.layers.3.fc1.bias torch.Size([4096]) -decoder.layers.3.fc2.weight torch.Size([1024, 4096]) -decoder.layers.3.fc2.bias torch.Size([1024]) -decoder.layers.3.final_layer_norm.weight torch.Size([1024]) -decoder.layers.3.final_layer_norm.bias torch.Size([1024]) -decoder.layers.4.self_attn.k_proj.weight torch.Size([1024, 1024]) -decoder.layers.4.self_attn.k_proj.bias torch.Size([1024]) -decoder.layers.4.self_attn.v_proj.weight torch.Size([1024, 1024]) -decoder.layers.4.self_attn.v_proj.bias torch.Size([1024]) -decoder.layers.4.self_attn.q_proj.weight torch.Size([1024, 1024]) -decoder.layers.4.self_attn.q_proj.bias torch.Size([1024]) -decoder.layers.4.self_attn.out_proj.weight torch.Size([1024, 1024]) -decoder.layers.4.self_attn.out_proj.bias torch.Size([1024]) -decoder.layers.4.self_attn_layer_norm.weight torch.Size([1024]) -decoder.layers.4.self_attn_layer_norm.bias torch.Size([1024]) -decoder.layers.4.encoder_attn.k_proj.weight torch.Size([1024, 1024]) -decoder.layers.4.encoder_attn.k_proj.bias torch.Size([1024]) -decoder.layers.4.encoder_attn.v_proj.weight torch.Size([1024, 1024]) -decoder.layers.4.encoder_attn.v_proj.bias torch.Size([1024]) -decoder.layers.4.encoder_attn.q_proj.weight torch.Size([1024, 1024]) -decoder.layers.4.encoder_attn.q_proj.bias torch.Size([1024]) -decoder.layers.4.encoder_attn.out_proj.weight torch.Size([1024, 1024]) -decoder.layers.4.encoder_attn.out_proj.bias torch.Size([1024]) -decoder.layers.4.encoder_attn_layer_norm.weight torch.Size([1024]) -decoder.layers.4.encoder_attn_layer_norm.bias torch.Size([1024]) -decoder.layers.4.fc1.weight torch.Size([4096, 1024]) -decoder.layers.4.fc1.bias torch.Size([4096]) -decoder.layers.4.fc2.weight torch.Size([1024, 4096]) -decoder.layers.4.fc2.bias torch.Size([1024]) -decoder.layers.4.final_layer_norm.weight torch.Size([1024]) -decoder.layers.4.final_layer_norm.bias torch.Size([1024]) -decoder.layers.5.self_attn.k_proj.weight torch.Size([1024, 1024]) 
-decoder.layers.5.self_attn.k_proj.bias torch.Size([1024]) -decoder.layers.5.self_attn.v_proj.weight torch.Size([1024, 1024]) -decoder.layers.5.self_attn.v_proj.bias torch.Size([1024]) -decoder.layers.5.self_attn.q_proj.weight torch.Size([1024, 1024]) -decoder.layers.5.self_attn.q_proj.bias torch.Size([1024]) -decoder.layers.5.self_attn.out_proj.weight torch.Size([1024, 1024]) -decoder.layers.5.self_attn.out_proj.bias torch.Size([1024]) -decoder.layers.5.self_attn_layer_norm.weight torch.Size([1024]) -decoder.layers.5.self_attn_layer_norm.bias torch.Size([1024]) -decoder.layers.5.encoder_attn.k_proj.weight torch.Size([1024, 1024]) -decoder.layers.5.encoder_attn.k_proj.bias torch.Size([1024]) -decoder.layers.5.encoder_attn.v_proj.weight torch.Size([1024, 1024]) -decoder.layers.5.encoder_attn.v_proj.bias torch.Size([1024]) -decoder.layers.5.encoder_attn.q_proj.weight torch.Size([1024, 1024]) -decoder.layers.5.encoder_attn.q_proj.bias torch.Size([1024]) -decoder.layers.5.encoder_attn.out_proj.weight torch.Size([1024, 1024]) -decoder.layers.5.encoder_attn.out_proj.bias torch.Size([1024]) -decoder.layers.5.encoder_attn_layer_norm.weight torch.Size([1024]) -decoder.layers.5.encoder_attn_layer_norm.bias torch.Size([1024]) -decoder.layers.5.fc1.weight torch.Size([4096, 1024]) -decoder.layers.5.fc1.bias torch.Size([4096]) -decoder.layers.5.fc2.weight torch.Size([1024, 4096]) -decoder.layers.5.fc2.bias torch.Size([1024]) -decoder.layers.5.final_layer_norm.weight torch.Size([1024]) -decoder.layers.5.final_layer_norm.bias torch.Size([1024]) -decoder.layers.6.self_attn.k_proj.weight torch.Size([1024, 1024]) -decoder.layers.6.self_attn.k_proj.bias torch.Size([1024]) -decoder.layers.6.self_attn.v_proj.weight torch.Size([1024, 1024]) -decoder.layers.6.self_attn.v_proj.bias torch.Size([1024]) -decoder.layers.6.self_attn.q_proj.weight torch.Size([1024, 1024]) -decoder.layers.6.self_attn.q_proj.bias torch.Size([1024]) -decoder.layers.6.self_attn.out_proj.weight torch.Size([1024, 1024]) -decoder.layers.6.self_attn.out_proj.bias torch.Size([1024]) -decoder.layers.6.self_attn_layer_norm.weight torch.Size([1024]) -decoder.layers.6.self_attn_layer_norm.bias torch.Size([1024]) -decoder.layers.6.encoder_attn.k_proj.weight torch.Size([1024, 1024]) -decoder.layers.6.encoder_attn.k_proj.bias torch.Size([1024]) -decoder.layers.6.encoder_attn.v_proj.weight torch.Size([1024, 1024]) -decoder.layers.6.encoder_attn.v_proj.bias torch.Size([1024]) -decoder.layers.6.encoder_attn.q_proj.weight torch.Size([1024, 1024]) -decoder.layers.6.encoder_attn.q_proj.bias torch.Size([1024]) -decoder.layers.6.encoder_attn.out_proj.weight torch.Size([1024, 1024]) -decoder.layers.6.encoder_attn.out_proj.bias torch.Size([1024]) -decoder.layers.6.encoder_attn_layer_norm.weight torch.Size([1024]) -decoder.layers.6.encoder_attn_layer_norm.bias torch.Size([1024]) -decoder.layers.6.fc1.weight torch.Size([4096, 1024]) -decoder.layers.6.fc1.bias torch.Size([4096]) -decoder.layers.6.fc2.weight torch.Size([1024, 4096]) -decoder.layers.6.fc2.bias torch.Size([1024]) -decoder.layers.6.final_layer_norm.weight torch.Size([1024]) -decoder.layers.6.final_layer_norm.bias torch.Size([1024]) -decoder.layers.7.self_attn.k_proj.weight torch.Size([1024, 1024]) -decoder.layers.7.self_attn.k_proj.bias torch.Size([1024]) -decoder.layers.7.self_attn.v_proj.weight torch.Size([1024, 1024]) -decoder.layers.7.self_attn.v_proj.bias torch.Size([1024]) -decoder.layers.7.self_attn.q_proj.weight torch.Size([1024, 1024]) -decoder.layers.7.self_attn.q_proj.bias torch.Size([1024]) 
-decoder.layers.7.self_attn.out_proj.weight torch.Size([1024, 1024]) -decoder.layers.7.self_attn.out_proj.bias torch.Size([1024]) -decoder.layers.7.self_attn_layer_norm.weight torch.Size([1024]) -decoder.layers.7.self_attn_layer_norm.bias torch.Size([1024]) -decoder.layers.7.encoder_attn.k_proj.weight torch.Size([1024, 1024]) -decoder.layers.7.encoder_attn.k_proj.bias torch.Size([1024]) -decoder.layers.7.encoder_attn.v_proj.weight torch.Size([1024, 1024]) -decoder.layers.7.encoder_attn.v_proj.bias torch.Size([1024]) -decoder.layers.7.encoder_attn.q_proj.weight torch.Size([1024, 1024]) -decoder.layers.7.encoder_attn.q_proj.bias torch.Size([1024]) -decoder.layers.7.encoder_attn.out_proj.weight torch.Size([1024, 1024]) -decoder.layers.7.encoder_attn.out_proj.bias torch.Size([1024]) -decoder.layers.7.encoder_attn_layer_norm.weight torch.Size([1024]) -decoder.layers.7.encoder_attn_layer_norm.bias torch.Size([1024]) -decoder.layers.7.fc1.weight torch.Size([4096, 1024]) -decoder.layers.7.fc1.bias torch.Size([4096]) -decoder.layers.7.fc2.weight torch.Size([1024, 4096]) -decoder.layers.7.fc2.bias torch.Size([1024]) -decoder.layers.7.final_layer_norm.weight torch.Size([1024]) -decoder.layers.7.final_layer_norm.bias torch.Size([1024]) -decoder.layers.8.self_attn.k_proj.weight torch.Size([1024, 1024]) -decoder.layers.8.self_attn.k_proj.bias torch.Size([1024]) -decoder.layers.8.self_attn.v_proj.weight torch.Size([1024, 1024]) -decoder.layers.8.self_attn.v_proj.bias torch.Size([1024]) -decoder.layers.8.self_attn.q_proj.weight torch.Size([1024, 1024]) -decoder.layers.8.self_attn.q_proj.bias torch.Size([1024]) -decoder.layers.8.self_attn.out_proj.weight torch.Size([1024, 1024]) -decoder.layers.8.self_attn.out_proj.bias torch.Size([1024]) -decoder.layers.8.self_attn_layer_norm.weight torch.Size([1024]) -decoder.layers.8.self_attn_layer_norm.bias torch.Size([1024]) -decoder.layers.8.encoder_attn.k_proj.weight torch.Size([1024, 1024]) -decoder.layers.8.encoder_attn.k_proj.bias torch.Size([1024]) -decoder.layers.8.encoder_attn.v_proj.weight torch.Size([1024, 1024]) -decoder.layers.8.encoder_attn.v_proj.bias torch.Size([1024]) -decoder.layers.8.encoder_attn.q_proj.weight torch.Size([1024, 1024]) -decoder.layers.8.encoder_attn.q_proj.bias torch.Size([1024]) -decoder.layers.8.encoder_attn.out_proj.weight torch.Size([1024, 1024]) -decoder.layers.8.encoder_attn.out_proj.bias torch.Size([1024]) -decoder.layers.8.encoder_attn_layer_norm.weight torch.Size([1024]) -decoder.layers.8.encoder_attn_layer_norm.bias torch.Size([1024]) -decoder.layers.8.fc1.weight torch.Size([4096, 1024]) -decoder.layers.8.fc1.bias torch.Size([4096]) -decoder.layers.8.fc2.weight torch.Size([1024, 4096]) -decoder.layers.8.fc2.bias torch.Size([1024]) -decoder.layers.8.final_layer_norm.weight torch.Size([1024]) -decoder.layers.8.final_layer_norm.bias torch.Size([1024]) -decoder.layers.9.self_attn.k_proj.weight torch.Size([1024, 1024]) -decoder.layers.9.self_attn.k_proj.bias torch.Size([1024]) -decoder.layers.9.self_attn.v_proj.weight torch.Size([1024, 1024]) -decoder.layers.9.self_attn.v_proj.bias torch.Size([1024]) -decoder.layers.9.self_attn.q_proj.weight torch.Size([1024, 1024]) -decoder.layers.9.self_attn.q_proj.bias torch.Size([1024]) -decoder.layers.9.self_attn.out_proj.weight torch.Size([1024, 1024]) -decoder.layers.9.self_attn.out_proj.bias torch.Size([1024]) -decoder.layers.9.self_attn_layer_norm.weight torch.Size([1024]) -decoder.layers.9.self_attn_layer_norm.bias torch.Size([1024]) -decoder.layers.9.encoder_attn.k_proj.weight 
torch.Size([1024, 1024]) -decoder.layers.9.encoder_attn.k_proj.bias torch.Size([1024]) -decoder.layers.9.encoder_attn.v_proj.weight torch.Size([1024, 1024]) -decoder.layers.9.encoder_attn.v_proj.bias torch.Size([1024]) -decoder.layers.9.encoder_attn.q_proj.weight torch.Size([1024, 1024]) -decoder.layers.9.encoder_attn.q_proj.bias torch.Size([1024]) -decoder.layers.9.encoder_attn.out_proj.weight torch.Size([1024, 1024]) -decoder.layers.9.encoder_attn.out_proj.bias torch.Size([1024]) -decoder.layers.9.encoder_attn_layer_norm.weight torch.Size([1024]) -decoder.layers.9.encoder_attn_layer_norm.bias torch.Size([1024]) -decoder.layers.9.fc1.weight torch.Size([4096, 1024]) -decoder.layers.9.fc1.bias torch.Size([4096]) -decoder.layers.9.fc2.weight torch.Size([1024, 4096]) -decoder.layers.9.fc2.bias torch.Size([1024]) -decoder.layers.9.final_layer_norm.weight torch.Size([1024]) -decoder.layers.9.final_layer_norm.bias torch.Size([1024]) -decoder.layers.10.self_attn.k_proj.weight torch.Size([1024, 1024]) -decoder.layers.10.self_attn.k_proj.bias torch.Size([1024]) -decoder.layers.10.self_attn.v_proj.weight torch.Size([1024, 1024]) -decoder.layers.10.self_attn.v_proj.bias torch.Size([1024]) -decoder.layers.10.self_attn.q_proj.weight torch.Size([1024, 1024]) -decoder.layers.10.self_attn.q_proj.bias torch.Size([1024]) -decoder.layers.10.self_attn.out_proj.weight torch.Size([1024, 1024]) -decoder.layers.10.self_attn.out_proj.bias torch.Size([1024]) -decoder.layers.10.self_attn_layer_norm.weight torch.Size([1024]) -decoder.layers.10.self_attn_layer_norm.bias torch.Size([1024]) -decoder.layers.10.encoder_attn.k_proj.weight torch.Size([1024, 1024]) -decoder.layers.10.encoder_attn.k_proj.bias torch.Size([1024]) -decoder.layers.10.encoder_attn.v_proj.weight torch.Size([1024, 1024]) -decoder.layers.10.encoder_attn.v_proj.bias torch.Size([1024]) -decoder.layers.10.encoder_attn.q_proj.weight torch.Size([1024, 1024]) -decoder.layers.10.encoder_attn.q_proj.bias torch.Size([1024]) -decoder.layers.10.encoder_attn.out_proj.weight torch.Size([1024, 1024]) -decoder.layers.10.encoder_attn.out_proj.bias torch.Size([1024]) -decoder.layers.10.encoder_attn_layer_norm.weight torch.Size([1024]) -decoder.layers.10.encoder_attn_layer_norm.bias torch.Size([1024]) -decoder.layers.10.fc1.weight torch.Size([4096, 1024]) -decoder.layers.10.fc1.bias torch.Size([4096]) -decoder.layers.10.fc2.weight torch.Size([1024, 4096]) -decoder.layers.10.fc2.bias torch.Size([1024]) -decoder.layers.10.final_layer_norm.weight torch.Size([1024]) -decoder.layers.10.final_layer_norm.bias torch.Size([1024]) -decoder.layers.11.self_attn.k_proj.weight torch.Size([1024, 1024]) -decoder.layers.11.self_attn.k_proj.bias torch.Size([1024]) -decoder.layers.11.self_attn.v_proj.weight torch.Size([1024, 1024]) -decoder.layers.11.self_attn.v_proj.bias torch.Size([1024]) -decoder.layers.11.self_attn.q_proj.weight torch.Size([1024, 1024]) -decoder.layers.11.self_attn.q_proj.bias torch.Size([1024]) -decoder.layers.11.self_attn.out_proj.weight torch.Size([1024, 1024]) -decoder.layers.11.self_attn.out_proj.bias torch.Size([1024]) -decoder.layers.11.self_attn_layer_norm.weight torch.Size([1024]) -decoder.layers.11.self_attn_layer_norm.bias torch.Size([1024]) -decoder.layers.11.encoder_attn.k_proj.weight torch.Size([1024, 1024]) -decoder.layers.11.encoder_attn.k_proj.bias torch.Size([1024]) -decoder.layers.11.encoder_attn.v_proj.weight torch.Size([1024, 1024]) -decoder.layers.11.encoder_attn.v_proj.bias torch.Size([1024]) -decoder.layers.11.encoder_attn.q_proj.weight 
torch.Size([1024, 1024]) -decoder.layers.11.encoder_attn.q_proj.bias torch.Size([1024]) -decoder.layers.11.encoder_attn.out_proj.weight torch.Size([1024, 1024]) -decoder.layers.11.encoder_attn.out_proj.bias torch.Size([1024]) -decoder.layers.11.encoder_attn_layer_norm.weight torch.Size([1024]) -decoder.layers.11.encoder_attn_layer_norm.bias torch.Size([1024]) -decoder.layers.11.fc1.weight torch.Size([4096, 1024]) -decoder.layers.11.fc1.bias torch.Size([4096]) -decoder.layers.11.fc2.weight torch.Size([1024, 4096]) -decoder.layers.11.fc2.bias torch.Size([1024]) -decoder.layers.11.final_layer_norm.weight torch.Size([1024]) -decoder.layers.11.final_layer_norm.bias torch.Size([1024]) -decoder.output_projection.weight torch.Size([50265, 1024]) \ No newline at end of file From 1d0d1b825ef48fb73a4ef8484fd299732a784d25 Mon Sep 17 00:00:00 2001 From: SaulLu Date: Mon, 6 Sep 2021 11:52:11 +0200 Subject: [PATCH 65/84] changes with metadata regarding dataset --- bsmetadata/experiments/with_metadata.py | 43 ++++++++++++++++--------- 1 file changed, 28 insertions(+), 15 deletions(-) diff --git a/bsmetadata/experiments/with_metadata.py b/bsmetadata/experiments/with_metadata.py index 34fb09be..7c9a90b3 100644 --- a/bsmetadata/experiments/with_metadata.py +++ b/bsmetadata/experiments/with_metadata.py @@ -1,7 +1,8 @@ import functools import logging from datasets import config - +import copy +from datasets.fingerprint import Hasher from datasets import load_dataset from torch.utils.data import DataLoader from transformers import default_data_collator @@ -58,7 +59,7 @@ def get_dataloaders(tokenizer, args): f"dataset_name={args.dataset_name}, dataset_config_name={args.dataset_config_name}, data_files={data_files}, cache_dir={args.cache_dir}," ) # Downloading and loading a dataset from the hub. - raw_datasets = load_dataset( + datasets = load_dataset( args.dataset_name, args.dataset_config_name, data_files=data_files, @@ -66,14 +67,14 @@ def get_dataloaders(tokenizer, args): keep_in_memory=False, ) - if "validation" not in raw_datasets.keys(): - raw_datasets["validation"] = load_dataset( + if "validation" not in datasets.keys(): + datasets["validation"] = load_dataset( args.dataset_name, args.dataset_config_name, split=f"train[:{args.validation_split_percentage}%]", cache_dir=args.cache_dir, ) - raw_datasets["train"] = load_dataset( + datasets["train"] = load_dataset( args.dataset_name, args.dataset_config_name, split=f"train[{args.validation_split_percentage}%:]", @@ -89,16 +90,16 @@ def get_dataloaders(tokenizer, args): ) if extension == "jsonl": extension = "json" - raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=args.cache_dir) + datasets = load_dataset(extension, data_files=data_files, cache_dir=args.cache_dir) - if "validation" not in raw_datasets.keys(): - raw_datasets["validation"] = load_dataset( + if "validation" not in datasets.keys(): + datasets["validation"] = load_dataset( extension, data_files=data_files, split=f"train[:{args.validation_split_percentage}%]", cache_dir=args.cache_dir, ) - raw_datasets["train"] = load_dataset( + datasets["train"] = load_dataset( extension, data_files=data_files, split=f"train[{args.validation_split_percentage}%:]", @@ -109,12 +110,21 @@ def get_dataloaders(tokenizer, args): # https://huggingface.co/docs/datasets/loading_datasets.html. # Preprocessing the datasets. 
- column_names = raw_datasets["train"].column_names + column_names = datasets["train"].column_names logger.info("Start to add metadata and chunk examples") + + tmp_data_args = copy.deepcopy(args) + tmp_data_args.preprocessing_num_workers = 80 + tmp_data_args.overwrite_cache = False + tmp_data_args.per_device_eval_batch_size = 2 + tmp_data_args.per_device_train_batch_size = 2 + + logger.info(f"Will store the cache with the hash for the tokenizer {Hasher.hash(tokenizer)} and the args {Hasher.hash(tmp_data_args)}") + # First we pre-process our text and metadata - lm_datasets = raw_datasets.map( - functools.partial(add_metadata_and_chunk_examples, tokenizer=tokenizer, cfg=args), + datasets = datasets.map( + functools.partial(add_metadata_and_chunk_examples, tokenizer=tokenizer, cfg=tmp_data_args), batched=True, num_proc=args.preprocessing_num_workers, load_from_cache_file=not args.overwrite_cache, @@ -129,7 +139,7 @@ def create_labels_column(examples): logger.info("Create labels column") # Then we add the column containing the labels - lm_datasets = lm_datasets.map( + datasets = datasets.map( create_labels_column, batched=True, num_proc=args.preprocessing_num_workers, @@ -138,8 +148,11 @@ def create_labels_column(examples): ) logger.info("Creating labels column finished") - train_dataset = lm_datasets["train"] - val_dataset = lm_datasets["validation"] + train_dataset = datasets["train"] + val_dataset = datasets["validation"] + + logger.info(f" Num train examples = {len(train_dataset)}") + logger.info(f" Num validation examples = {len(val_dataset)}") # DataLoaders creation: train_dataloader = DataLoader( From d7f7b9d59a1b72aa926966090fb5fd9e5440068a Mon Sep 17 00:00:00 2001 From: SaulLu Date: Mon, 6 Sep 2021 11:53:35 +0200 Subject: [PATCH 66/84] change train by adding new arguments --- bsmetadata/train.py | 70 +++++++++++++++++-- .../create_dataset.slurm | 2 +- .../crime_and_punish_test/load_dataset.slurm | 2 +- .../SLURM/experiment_1/create_dataset.slurm | 20 +++--- .../html/SLURM/experiment_1/do_training.slurm | 25 ++++--- .../SLURM/experiment_1/load_dataset.slurm | 6 +- .../init_experiment/create_dataset.slurm | 2 +- .../SLURM/init_experiment/do_training.slurm | 2 +- .../SLURM/init_experiment/load_dataset.slurm | 2 +- .../load_tokenizer_and_model.slurm | 2 +- experiments/html/example_script.sh | 4 ++ 11 files changed, 102 insertions(+), 35 deletions(-) diff --git a/bsmetadata/train.py b/bsmetadata/train.py index 631a4103..e57c36a0 100644 --- a/bsmetadata/train.py +++ b/bsmetadata/train.py @@ -18,6 +18,7 @@ from omegaconf import OmegaConf from tqdm.auto import tqdm as original_tqdm from transformers import AdamW, AutoModelForCausalLM, AutoTokenizer, get_scheduler, set_seed +from transformers.trainer_utils import EvaluationStrategy, IntervalStrategy from bsmetadata.input_pipeline import DataConfig, get_dataloaders @@ -46,9 +47,34 @@ class CFG: out_dir: str = field( default="output_dir", metadata={"help": "The output directory in which the trained model is saved."} ) - num_eval: int = field(default=3, metadata={"help": "The number of evaluations to perform during training."}) + + # logging_first_step : bool = field(default=False, metadata={"help": "Log the first global_step"}) + evaluation_strategy: IntervalStrategy = field( + default="STEPS", + metadata={"help": "The evaluation strategy to use."}, + ) + eval_num_per_epoch: int = field(default=3, metadata={"help": "If evaluation strategy is `epoch`. 
The number of evaluations to perform per epoch during training."}) + eval_steps: int = field(default=100, metadata={"help": "If evaluation strategy is `steps`. Run an evaluation every X steps."}) + + save_strategy: IntervalStrategy = field( + default="STEPS", + metadata={"help": "The checkpoint save strategy to use."}, + ) + save_num_per_epoch: int = field(default=3, metadata={"help": "If save strategy is `epoch`. The number of savings to perform per epoch during training."}) + save_steps: int = field(default=500, metadata={"help": "Save checkpoint every X updates steps."}) + save_total_limit: Optional[int] = field( + default=None, + metadata={ + "help": ( + "Limit the total amount of checkpoints." + "Deletes the older checkpoints in the output_dir. Default is unlimited checkpoints" + ) + }, + ) #TODO!!! + model_name: str = field(default="gpt2", metadata={"help": "The name of the pretrained model to use."}) project_name: str = field(default="metadata_lm", metadata={"help": "The project name."}) + do_train: bool = field(default=True, metadata={"help": "Whether to run training."}) do_eval: bool = field(default=True, metadata={"help": "Whether to run eval on the dev set."}) @@ -172,10 +198,26 @@ def main(args: CFG) -> None: else: args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) - if args.num_eval < 1: + if args.evaluation_strategy == IntervalStrategy.EPOCH: + if args.eval_num_per_epoch < 1: + eval_per_n_step = args.max_train_steps + 1 + else: + eval_per_n_step = args.max_train_steps // args.eval_num_per_epoch + elif args.evaluation_strategy == IntervalStrategy.STEPS: + eval_per_n_step = args.eval_steps + else: # IntervalStrategy.NO eval_per_n_step = args.max_train_steps + 1 - else: - eval_per_n_step = args.max_train_steps // args.num_eval + + if args.save_strategy == IntervalStrategy.EPOCH: + if args.save_num_per_epoch < 1: + save_per_n_step = args.max_train_steps + 1 + else: + save_per_n_step = args.max_train_steps // args.save_num_per_epoch + elif args.save_strategy == IntervalStrategy.STEPS: + save_per_n_step = args.save_steps + else: # IntervalStrategy.NO + save_per_n_step = args.max_train_steps + 1 + scheduler = get_scheduler( name=args.lr_scheduler_type, optimizer=optimizer, @@ -207,6 +249,15 @@ def evaluate(eval_dataloader): progress_bar = tqdm(range(args.max_train_steps), desc="training") completed_steps = 0 logger_metrics = Logger(is_local_main_process, project=args.project_name, config=args) + + do_eval = args.do_eval and completed_steps > 0 and completed_steps % eval_per_n_step == 0 + if do_eval: + logger.info("***** Evaluation *****") + for key, eval_dataloader in eval_dataloaders.items(): + metrics = evaluate(eval_dataloader) + logger_metrics.log({key: metrics}) + # logger_metrics.info(f"epoch {epoch}: perplexity: {perplexity}") + for epoch in range(args.num_train_epochs): model.train() for step, batch in enumerate(train_dataloader): @@ -231,13 +282,18 @@ def evaluate(eval_dataloader): completed_steps += 1 else: continue - do_eval = args.do_train and completed_steps > 0 and completed_steps % eval_per_n_step == 0 + + do_eval = args.do_eval and completed_steps > 0 and completed_steps % eval_per_n_step == 0 if do_eval: + logger.info("***** Evaluation *****") for key, eval_dataloader in eval_dataloaders.items(): metrics = evaluate(eval_dataloader) logger_metrics.log({key: metrics}) - # logger_metrics.info(f"epoch {epoch}: perplexity: {perplexity}") + + do_save = completed_steps > 0 and completed_steps % save_per_n_step == 0 + if do_save: + 
logger.info(f"***** Saving at {args.out_dir} *****") if is_local_main_process: save_dict = { "epoch": epoch + 1, @@ -255,7 +311,7 @@ def evaluate(eval_dataloader): break logger_metrics.close() logger.info("***** Training finished *****") - + if is_local_main_process and args.out_dir is not None: accelerator.wait_for_everyone() unwrapped_model = accelerator.unwrap_model(model) diff --git a/experiments/html/SLURM/crime_and_punish_test/create_dataset.slurm b/experiments/html/SLURM/crime_and_punish_test/create_dataset.slurm index c51058be..25eee0e6 100644 --- a/experiments/html/SLURM/crime_and_punish_test/create_dataset.slurm +++ b/experiments/html/SLURM/crime_and_punish_test/create_dataset.slurm @@ -15,7 +15,7 @@ source $HOME/start-user export TRANSFORMERS_OFFLINE=1 -cd $WORK/repos/metadata/ +cd $WORK/repos/test-sync/metadata/ python experiments/html/start_training.py \ data_config.experiment="with_metadata" \ diff --git a/experiments/html/SLURM/crime_and_punish_test/load_dataset.slurm b/experiments/html/SLURM/crime_and_punish_test/load_dataset.slurm index ecf3139b..85b2ef29 100644 --- a/experiments/html/SLURM/crime_and_punish_test/load_dataset.slurm +++ b/experiments/html/SLURM/crime_and_punish_test/load_dataset.slurm @@ -15,7 +15,7 @@ set -x -e source $HOME/start-user -cd $WORK/repos/metadata/ +cd $WORK/repos/test-sync/metadata/ python experiments/html/SLURM/init_experiment/load_dataset.py \ dataset_name="crime_and_punish" \ \ No newline at end of file diff --git a/experiments/html/SLURM/experiment_1/create_dataset.slurm b/experiments/html/SLURM/experiment_1/create_dataset.slurm index 6509b359..0d8c367d 100644 --- a/experiments/html/SLURM/experiment_1/create_dataset.slurm +++ b/experiments/html/SLURM/experiment_1/create_dataset.slurm @@ -1,12 +1,12 @@ #!/bin/bash #SBATCH --job-name=modelling-metadata-html-create-dataset-test # job name #SBATCH --nodes=1 -#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! -#SBATCH --cpus-per-task=16 # number of cores per tasks -#SBATCH --hint=nomultithread # we get physical cores not logical -#SBATCH --time 03:00:00 # maximum execution time (HH:MM:SS) -#SBATCH --output=%x-%j.out # output file name -#SBATCH --error=%x-%j.err # error file name +#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! 
+#SBATCH --cpus-per-task=20 # number of cores per tasks +#SBATCH --hint=nomultithread # we get physical cores not logical +#SBATCH --time 01:00:00 # maximum execution time (HH:MM:SS) +#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name +#SBATCH --error=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.err # error file name #SBATCH --account=six@cpu # account set -x -e @@ -16,16 +16,16 @@ source $HOME/start-user export HF_DATASETS_OFFLINE=1 export TRANSFORMERS_OFFLINE=1 -cd $WORK/repos/metadata/ +cd $WORK/repos/test-sync/metadata/ python experiments/html/start_training.py \ data_config.experiment="with_metadata" \ data_config.metadata_list=["html"] \ data_config.max_seq_len=1024 \ data_config.dataset_name="${DATASETS_CUSTOM}/Natural_Questions_HTML" \ -data_config.train_file="nq-train-*.jsonl.gz" \ -data_config.validation_file="nq-dev-*.jsonl.gz" \ +data_config.train_file="nq-train-0\[0-2\].jsonl.gz" \ +data_config.validation_file="nq-dev-00.jsonl.gz" \ data_config.extension="json" \ -data_config.preprocessing_num_workers=16 \ +data_config.preprocessing_num_workers=80 \ do_train=False \ do_eval=False \ \ No newline at end of file diff --git a/experiments/html/SLURM/experiment_1/do_training.slurm b/experiments/html/SLURM/experiment_1/do_training.slurm index d124e862..99ae0467 100644 --- a/experiments/html/SLURM/experiment_1/do_training.slurm +++ b/experiments/html/SLURM/experiment_1/do_training.slurm @@ -5,9 +5,9 @@ #SBATCH --gres=gpu:1 # number of GPUs per node #SBATCH --cpus-per-task=8 # number of cores per tasks #SBATCH --hint=nomultithread # we get physical cores not logical -#SBATCH --time 04:00:00 # maximum execution time (HH:MM:SS) -#SBATCH --output=%x-%j.out # output file name -#SBATCH --error=%x-%j.err # error file name +#SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS) +#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name +#SBATCH --error=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.err # error file name #SBATCH --account=six@gpu # account set -x -e @@ -18,18 +18,25 @@ export HF_DATASETS_OFFLINE=1 export TRANSFORMERS_OFFLINE=1 # be careful about the cache folder for Wandb export WANDB_MODE=offline +export WANDB_DIR=$SCRATCH -cd $WORK/repos/metadata/ +cd $WORK/repos/test-sync/metadata/ python experiments/html/start_training.py \ data_config.experiment="with_metadata" \ data_config.metadata_list=["html"] \ data_config.max_seq_len=1024 \ -data_config.dataset_name="${DATASETS_CUSTOM}/Natural_Questions_HTML" \ -data_config.train_file="nq-train-*.jsonl.gz" \ -data_config.validation_file="nq-dev-*.jsonl.gz" \ +data_config.dataset_name="${DATASETS_CUSTOM}/Natural_Questions_HTML" \ +data_config.train_file="nq-train-0\[0-2\].jsonl.gz" \ +data_config.validation_file="nq-dev-00.jsonl.gz" \ data_config.extension="json" \ -data_config.preprocessing_num_workers=6 \ +data_config.preprocessing_num_workers=80 \ +data_config.per_device_eval_batch_size=3 \ +data_config.per_device_train_batch_size=3 \ out_dir="${SCRATCH}/metadata_outputs" \ do_train=True \ -do_eval=True \ \ No newline at end of file +do_eval=True \ +evaluation_strategy=STEPS \ +eval_steps=1000 \ +save_strategy=STEPS \ +save_steps=1000 \ \ No newline at end of file diff --git a/experiments/html/SLURM/experiment_1/load_dataset.slurm b/experiments/html/SLURM/experiment_1/load_dataset.slurm index 1e36dbad..9a69dd42 100644 --- a/experiments/html/SLURM/experiment_1/load_dataset.slurm +++ b/experiments/html/SLURM/experiment_1/load_dataset.slurm @@ -6,8 +6,8 
@@ #SBATCH --hint=nomultithread # we get physical cores not logical #SBATCH --gres=gpu:0 # number of gpus #SBATCH --time 02:30:00 # maximum execution time (HH:MM:SS) -#SBATCH --output=%x-%j.out # output file name -#SBATCH --error=%x-%j.err # error file name +#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name +#SBATCH --error=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.err # error file name #SBATCH --account=six@gpu # account #SBATCH -p compil # partition with internet @@ -22,7 +22,7 @@ source $HOME/start-user # git lfs install # git lfs pull origin master -cd $WORK/repos/metadata/ +cd $WORK/repos/test-sync/metadata/ python experiments/html/SLURM/init_experiment/load_dataset.py \ dataset_name="${DATASETS_CUSTOM}/Natural_Questions_HTML" \ diff --git a/experiments/html/SLURM/init_experiment/create_dataset.slurm b/experiments/html/SLURM/init_experiment/create_dataset.slurm index ba75916d..8f003637 100644 --- a/experiments/html/SLURM/init_experiment/create_dataset.slurm +++ b/experiments/html/SLURM/init_experiment/create_dataset.slurm @@ -16,7 +16,7 @@ source $HOME/start-user export HF_DATASETS_OFFLINE=1 export TRANSFORMERS_OFFLINE=1 -cd $WORK/repos/metadata/ +cd $WORK/repos/test-sync/metadata/ python experiments/html/start_training.py \ data_config.experiment="with_metadata" \ diff --git a/experiments/html/SLURM/init_experiment/do_training.slurm b/experiments/html/SLURM/init_experiment/do_training.slurm index 08281d6b..a6b06497 100644 --- a/experiments/html/SLURM/init_experiment/do_training.slurm +++ b/experiments/html/SLURM/init_experiment/do_training.slurm @@ -19,7 +19,7 @@ export TRANSFORMERS_OFFLINE=1 # be careful about the cache folder for Wandb export WANDB_MODE=offline -cd $WORK/repos/metadata/ +cd $WORK/repos/test-sync/metadata/ python experiments/html/start_training.py \ data_config.experiment="with_metadata" \ diff --git a/experiments/html/SLURM/init_experiment/load_dataset.slurm b/experiments/html/SLURM/init_experiment/load_dataset.slurm index 12d1a125..1fb85e16 100644 --- a/experiments/html/SLURM/init_experiment/load_dataset.slurm +++ b/experiments/html/SLURM/init_experiment/load_dataset.slurm @@ -22,7 +22,7 @@ source $HOME/start-user # git lfs install # git lfs pull origin master -cd $WORK/repos/metadata/ +cd $WORK/repos/test-sync/metadata/ python experiments/html/SLURM/init_experiment/load_dataset.py \ dataset_name="${DATASETS_CUSTOM}/Natural_Questions_HTML_Toy" \ diff --git a/experiments/html/SLURM/loading_scripts/load_tokenizer_and_model.slurm b/experiments/html/SLURM/loading_scripts/load_tokenizer_and_model.slurm index f46b320f..7be1f6ac 100644 --- a/experiments/html/SLURM/loading_scripts/load_tokenizer_and_model.slurm +++ b/experiments/html/SLURM/loading_scripts/load_tokenizer_and_model.slurm @@ -15,7 +15,7 @@ set -x -e source $HOME/start-user -cd $WORK/repos/metadata/ +cd $WORK/repos/test-sync/metadata/ python experiments/html/SLURM/loading_scripts/load_tokenizer_and_model.py \ model_name=gpt2 \ \ No newline at end of file diff --git a/experiments/html/example_script.sh b/experiments/html/example_script.sh index 1a501528..1f3e76ec 100644 --- a/experiments/html/example_script.sh +++ b/experiments/html/example_script.sh @@ -9,3 +9,7 @@ data_config.extension="json" \ data_config.preprocessing_num_workers=6 \ do_train=False \ do_eval=False \ +evaluation_strategy=STEPS \ +eval_steps=50 \ +save_strategy=STEPS \ +save_steps=500 \ From af82ec0348a6e7d9bce47ea1b201fa4a7cedb628 Mon Sep 17 00:00:00 2001 From: SaulLu Date: Mon, 6 Sep 2021 
11:54:58 +0200 Subject: [PATCH 67/84] change personnal experiment --- experiments/html/start_training.py | 29 +++-------------------------- 1 file changed, 3 insertions(+), 26 deletions(-) diff --git a/experiments/html/start_training.py b/experiments/html/start_training.py index cdcb7a7a..d8ae0701 100644 --- a/experiments/html/start_training.py +++ b/experiments/html/start_training.py @@ -8,7 +8,7 @@ from bsmetadata.input_pipeline import DataConfig from bsmetadata.metadata_processors import PROCESSORS -from bsmetadata.train import main, show_help +from bsmetadata.train import main, show_help, CFG tags_to_remove_alone = [ @@ -50,35 +50,12 @@ class DataConfigWithHTML(DataConfig): @dataclass -class CFG: +class CFGAugmented(CFG): data_config: DataConfigWithHTML = DataConfigWithHTML() - weight_decay: float = field(default=0.0, metadata={"help": "The weight decay to use for training."}) - learning_rate: float = field(default=5e-5, metadata={"help": "The initial learning rate."}) - gradient_accumulation_steps: int = field( - default=1, - metadata={"help": "The number of gradient accumulation steps to perform before updating model parameters."}, - ) - num_train_epochs: int = field(default=1, metadata={"help": "The number of epochs to train the model for."}) - max_train_steps: Optional[int] = field( - default=None, metadata={"help": "The maximum number of training steps (overrides num_train_epochs)."} - ) - lr_scheduler_type: str = field(default="linear", metadata={"help": "The type of learning rate schedule to use."}) - num_warmup_steps: int = field( - default=1000, metadata={"help": "The number of warmup steps during which the learning rate is increased."} - ) - seed: int = field(default=42, metadata={"help": "The seed used for RNG initialization."}) - out_dir: str = field( - default="output_dir", metadata={"help": "The output directory in which the trained model is saved."} - ) - num_eval: int = field(default=3, metadata={"help": "The number of evaluations to perform during training."}) - model_name: str = field(default="gpt2", metadata={"help": "The name of the pretrained model to use."}) - project_name: str = field(default="metadata_lm", metadata={"help": "The project name."}) - do_train: bool = field(default=True, metadata={"help": "Whether to run training."}) - do_eval: bool = field(default=True, metadata={"help": "Whether to run eval on the dev set."}) cs = ConfigStore.instance() -cs.store(name="config", node=CFG) +cs.store(name="config", node=CFGAugmented) if __name__ == "__main__": if "--help" in sys.argv or "-h" in sys.argv: From 0ad2d2c7dff4bc902aa4d8e463b1929f1edc7f22 Mon Sep 17 00:00:00 2001 From: SaulLu Date: Mon, 6 Sep 2021 14:22:02 +0200 Subject: [PATCH 68/84] change batch size --- bsmetadata/experiments/with_metadata.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bsmetadata/experiments/with_metadata.py b/bsmetadata/experiments/with_metadata.py index 7c9a90b3..3d6d6c73 100644 --- a/bsmetadata/experiments/with_metadata.py +++ b/bsmetadata/experiments/with_metadata.py @@ -130,6 +130,7 @@ def get_dataloaders(tokenizer, args): load_from_cache_file=not args.overwrite_cache, desc="Pre-process the text and metadata to create new samples", remove_columns=column_names, + batch_size=1, ) logger.info("Add metadata and chunk examples finished") From 9c30ce637405995bbd85fbbd8f6d5b189949fbe9 Mon Sep 17 00:00:00 2001 From: SaulLu Date: Mon, 6 Sep 2021 14:50:51 +0200 Subject: [PATCH 69/84] change batch size --- bsmetadata/experiments/with_metadata.py | 1 + 1 file changed, 1 insertion(+) diff 
--git a/bsmetadata/experiments/with_metadata.py b/bsmetadata/experiments/with_metadata.py index 3d6d6c73..d9cca513 100644 --- a/bsmetadata/experiments/with_metadata.py +++ b/bsmetadata/experiments/with_metadata.py @@ -146,6 +146,7 @@ def create_labels_column(examples): num_proc=args.preprocessing_num_workers, load_from_cache_file=not args.overwrite_cache, desc="Create labels column", + batch_size=1, ) logger.info("Creating labels column finished") From 5753bb0d68df5063555385401ab9843ef86e6bae Mon Sep 17 00:00:00 2001 From: SaulLu Date: Mon, 6 Sep 2021 14:51:00 +0200 Subject: [PATCH 70/84] change experiment SLURM file --- experiments/html/SLURM/experiment_1/create_dataset.slurm | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/experiments/html/SLURM/experiment_1/create_dataset.slurm b/experiments/html/SLURM/experiment_1/create_dataset.slurm index 0d8c367d..868c9ae0 100644 --- a/experiments/html/SLURM/experiment_1/create_dataset.slurm +++ b/experiments/html/SLURM/experiment_1/create_dataset.slurm @@ -2,7 +2,7 @@ #SBATCH --job-name=modelling-metadata-html-create-dataset-test # job name #SBATCH --nodes=1 #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! -#SBATCH --cpus-per-task=20 # number of cores per tasks +#SBATCH --cpus-per-task=40 # number of cores per tasks #SBATCH --hint=nomultithread # we get physical cores not logical #SBATCH --time 01:00:00 # maximum execution time (HH:MM:SS) #SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name @@ -23,9 +23,10 @@ data_config.experiment="with_metadata" \ data_config.metadata_list=["html"] \ data_config.max_seq_len=1024 \ data_config.dataset_name="${DATASETS_CUSTOM}/Natural_Questions_HTML" \ -data_config.train_file="nq-train-0\[0-2\].jsonl.gz" \ -data_config.validation_file="nq-dev-00.jsonl.gz" \ +data_config.train_file="nq-train-*.jsonl.gz" \ +data_config.validation_file="nq-dev-*.jsonl.gz" \ data_config.extension="json" \ data_config.preprocessing_num_workers=80 \ +out_dir="${SCRATCH}/metadata_outputs" \ do_train=False \ do_eval=False \ \ No newline at end of file From 49f5ef4ab6ffcdfc0a1501e8ec91840bed99d355 Mon Sep 17 00:00:00 2001 From: SaulLu Date: Tue, 7 Sep 2021 12:46:14 +0200 Subject: [PATCH 71/84] add JZ doc + new dataloader --- .../with_metadata_and_baseline_val.py | 220 ++++++++++++++++++ bsmetadata/input_pipeline.py | 5 + bsmetadata/train.py | 4 +- .../create_dataset.slurm | 6 +- .../crime_and_punish_test/load_dataset.slurm | 6 +- .../SLURM/experiment_1/create_dataset.slurm | 2 +- .../html/SLURM/experiment_1/do_training.slurm | 2 +- .../SLURM/experiment_1/load_dataset.slurm | 2 +- experiments/html/SLURM/experiment_2/README.md | 9 + .../SLURM/experiment_2/create_dataset.slurm | 32 +++ .../html/SLURM/experiment_2/do_training.slurm | 43 ++++ .../html/SLURM/experiment_2/load_dataset.py | 88 +++++++ .../SLURM/experiment_2/load_dataset.slurm | 30 +++ .../html/SLURM/experiment_2/multi_steps.bash | 2 + .../init_experiment/create_dataset.slurm | 6 +- .../SLURM/init_experiment/do_training.slurm | 6 +- .../SLURM/init_experiment/load_dataset.slurm | 6 +- .../load_tokenizer_and_model.slurm | 6 +- experiments/html/SLURM/wandb/sync_wandb.slurm | 24 ++ experiments/jz_template/README.md | 6 + .../01_load_tokenizer_and_model.slurm | 21 ++ .../experiment_example/02_load_dataset.slurm | 30 +++ .../03_create_dataset.slurm | 34 +++ .../experiment_example/04_do_training.slurm | 43 ++++ .../SLURM/experiment_example/README.md | 9 + .../SLURM/experiment_example/multi_steps.bash | 2 + 
.../01_load_tokenizer_and_model.slurm | 29 +++ .../experiment_template/02_load_dataset.slurm | 30 +++ .../03_create_dataset.slurm | 34 +++ .../experiment_template/04_do_training.slurm | 43 ++++ .../SLURM/experiment_template/README.md | 9 + .../experiment_template/multi_steps.bash | 2 + .../loading_script_utils/load_dataset.py | 88 +++++++ .../load_tokenizer_and_model.py | 41 ++++ 34 files changed, 897 insertions(+), 23 deletions(-) create mode 100644 bsmetadata/experiments/with_metadata_and_baseline_val.py create mode 100644 experiments/html/SLURM/experiment_2/README.md create mode 100644 experiments/html/SLURM/experiment_2/create_dataset.slurm create mode 100644 experiments/html/SLURM/experiment_2/do_training.slurm create mode 100644 experiments/html/SLURM/experiment_2/load_dataset.py create mode 100644 experiments/html/SLURM/experiment_2/load_dataset.slurm create mode 100644 experiments/html/SLURM/experiment_2/multi_steps.bash create mode 100644 experiments/html/SLURM/wandb/sync_wandb.slurm create mode 100644 experiments/jz_template/README.md create mode 100644 experiments/jz_template/SLURM/experiment_example/01_load_tokenizer_and_model.slurm create mode 100644 experiments/jz_template/SLURM/experiment_example/02_load_dataset.slurm create mode 100644 experiments/jz_template/SLURM/experiment_example/03_create_dataset.slurm create mode 100644 experiments/jz_template/SLURM/experiment_example/04_do_training.slurm create mode 100644 experiments/jz_template/SLURM/experiment_example/README.md create mode 100644 experiments/jz_template/SLURM/experiment_example/multi_steps.bash create mode 100644 experiments/jz_template/SLURM/experiment_template/01_load_tokenizer_and_model.slurm create mode 100644 experiments/jz_template/SLURM/experiment_template/02_load_dataset.slurm create mode 100644 experiments/jz_template/SLURM/experiment_template/03_create_dataset.slurm create mode 100644 experiments/jz_template/SLURM/experiment_template/04_do_training.slurm create mode 100644 experiments/jz_template/SLURM/experiment_template/README.md create mode 100644 experiments/jz_template/SLURM/experiment_template/multi_steps.bash create mode 100644 experiments/jz_template/SLURM/loading_script_utils/load_dataset.py create mode 100644 experiments/jz_template/SLURM/loading_script_utils/load_tokenizer_and_model.py diff --git a/bsmetadata/experiments/with_metadata_and_baseline_val.py b/bsmetadata/experiments/with_metadata_and_baseline_val.py new file mode 100644 index 00000000..394089b9 --- /dev/null +++ b/bsmetadata/experiments/with_metadata_and_baseline_val.py @@ -0,0 +1,220 @@ +import copy +import functools +import logging + +from datasets import config, load_dataset +from torch.utils.data import DataLoader +from transformers import default_data_collator + +from bsmetadata.metadata_utils import add_metadata_and_chunk_examples + + +logger = logging.getLogger(__name__) + + +def get_dataloaders(tokenizer, args): + """ + Args: + tokenizer: a huggingface/transformers tokenizer + args: a DataConfig + Returns: + a training dataloader and one or more validation dataloaders + validation dataloaders should be in a dictionary + each dataloader should yield {str: torch.Tensor(cpu) } + dictionary keys may have 'metadata_mask' + other fields will be passed to model + note: metadata_mask should be padded + Example: + train_dataloader, val_dataloaders = get_dataloaders(...) 
+ for batch in train_dataloader: + metadata_mask = batch.get('metadata_mask', None) + outputs = model(**batch) + metrics = loss_fn(batch, outputs, metadata_mask) + """ + # Mostly copy/paste from https://github.com/huggingface/transformers/blob/master/examples/pytorch/language-modeling/run_clm_no_trainer.py + # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) + # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ + # (the dataset will be downloaded automatically from the datasets Hub). + # + # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called + # 'text' is found. You can easily tweak this behavior (see below). + # + # In distributed training, the load_dataset function guarantees that only one local process can concurrently + # download the dataset. + data_files = {} + if args.train_file is not None: + data_files["train"] = args.train_file + if args.validation_file is not None: + data_files["validation"] = args.validation_file + + if not data_files: + data_files = None + + logger.info(f"Start to load dataset, the result will be cached at {config.HF_DATASETS_CACHE}") + if args.dataset_name is not None: + logger.info( + "Downloading with arguments: " + f"dataset_name={args.dataset_name}, " + f"dataset_config_name={args.dataset_config_name}, " + f"data_files={data_files}, " + f"cache_dir={args.cache_dir}," + ) + # Downloading and loading a dataset from the hub. + datasets = load_dataset( + args.dataset_name, + args.dataset_config_name, + data_files=data_files, + cache_dir=args.cache_dir, + keep_in_memory=False, + ) + + if "validation" not in datasets.keys(): + datasets["validation"] = load_dataset( + args.dataset_name, + args.dataset_config_name, + split=f"train[:{args.validation_split_percentage}%]", + cache_dir=args.cache_dir, + ) + datasets["train"] = load_dataset( + args.dataset_name, + args.dataset_config_name, + split=f"train[{args.validation_split_percentage}%:]", + cache_dir=args.cache_dir, + ) + else: + logger.info("Loading dataset from extension script") + extension = args.train_file.split(".")[-1] if not args.extension else args.extension + if extension == "txt": + raise ValueError( + "You have entered a text file for the train data, but this type of file cannot contain metadata " + "columns. Wouldn't you rather have a file in json/jsonl or pandas format?" + ) + if extension == "jsonl": + extension = "json" + datasets = load_dataset(extension, data_files=data_files, cache_dir=args.cache_dir) + + if "validation" not in datasets.keys(): + datasets["validation"] = load_dataset( + extension, + data_files=data_files, + split=f"train[:{args.validation_split_percentage}%]", + cache_dir=args.cache_dir, + ) + datasets["train"] = load_dataset( + extension, + data_files=data_files, + split=f"train[{args.validation_split_percentage}%:]", + cache_dir=args.cache_dir, + ) + logger.info("Dataset loaded") + # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at + # https://huggingface.co/docs/datasets/loading_datasets.html. + + # Preprocessing the datasets. + column_names = datasets["train"].column_names + + logger.info("Start to add metadata and chunk examples") + + # Sets the attributes of the args object that have no influence on the calculation of the next map. This is useful + # for using the cache efficiently. 
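Per the docstring above, this experiment returns one training dataloader plus a dictionary of validation dataloaders (here "val1" built with metadata and "val2" as the baseline without it). A minimal consumer sketch, assuming `model`, `loss_fn`, `tokenizer` and `args` are set up as in `bsmetadata/train.py`, might look like:

    # Sketch only; mirrors the contract documented in the docstring above.
    train_dataloader, val_dataloaders = get_dataloaders(tokenizer, args)
    for split_name, loader in val_dataloaders.items():  # {"val1": ..., "val2": ...}
        for batch in loader:
            metadata_mask = batch.get("metadata_mask", None)
            outputs = model(**batch)
            metrics = loss_fn(batch, outputs, metadata_mask)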
+ tmp_data_args = copy.deepcopy(args) + tmp_data_args.preprocessing_num_workers = 80 + tmp_data_args.overwrite_cache = False + tmp_data_args.per_device_eval_batch_size = 2 + tmp_data_args.per_device_train_batch_size = 2 + tmp_data_args.map_batch_size = 1 + + # First we pre-process our text and metadata + datasets_with_metadata = datasets.map( + functools.partial(add_metadata_and_chunk_examples, tokenizer=tokenizer, cfg=tmp_data_args), + batched=True, + num_proc=args.preprocessing_num_workers, + load_from_cache_file=not args.overwrite_cache, + desc="Pre-process the text and metadata to create new samples", + remove_columns=column_names, + batch_size=args.map_batch_size, + ) + logger.info("Add metadata and chunk examples finished") + + def create_labels_column(examples): + examples["labels"] = examples["input_ids"].copy() + return examples + + logger.info("Create labels column") + # Then we add the column containing the labels + datasets_with_metadata = datasets_with_metadata.map( + create_labels_column, + batched=True, + num_proc=args.preprocessing_num_workers, + load_from_cache_file=not args.overwrite_cache, + desc="Create labels column", + batch_size=args.map_batch_size, + ) + logger.info("Creating labels column finished") + + train_dataset = datasets_with_metadata["train"] + val_dataset1 = datasets_with_metadata["validation"] + + # We create another validation dataset without metadata + logger.info("Start to add metadata and chunk examples") + tmp_data_args.metadata_probability = 0 + val_dataset_without_metadata = datasets["validation"].map( + functools.partial(add_metadata_and_chunk_examples, tokenizer=tokenizer, cfg=tmp_data_args), + batched=True, + num_proc=args.preprocessing_num_workers, + load_from_cache_file=not args.overwrite_cache, + desc="Pre-process the text and metadata to create new samples", + remove_columns=column_names, + batch_size=args.map_batch_size, + ) + logger.info("Add metadata and chunk examples finished") + + def create_labels_column(examples): + examples["labels"] = examples["input_ids"].copy() + return examples + + logger.info("Create labels column") + # Then we add the column containing the labels + val_dataset_without_metadata = val_dataset_without_metadata.map( + create_labels_column, + batched=True, + num_proc=args.preprocessing_num_workers, + load_from_cache_file=not args.overwrite_cache, + desc="Create labels column", + batch_size=args.map_batch_size, + ) + logger.info("Creating labels column finished") + val_dataset2 = val_dataset_without_metadata + + + logger.info(f" Num train examples = {len(train_dataset)}") + logger.info(f" Num validation examples dataloader 1 = {len(val_dataset1)}") + logger.info(f" Num validation examples dataloader 2 = {len(val_dataset2)}") + + logger.info(f' Train examples = {train_dataset[0]}') + logger.info(f' Validation examples dataloader 1 = {val_dataset1[0]}') + logger.info(f' Validation examples dataloader 2 = {val_dataset2[0]}') + + logger.info(f' Train examples = {tokenizer.convert_ids_to_tokens(train_dataset[0]["input_ids"])}') + logger.info(f' Validation examples dataloader 1 = {tokenizer.convert_ids_to_tokens(val_dataset1[0]["input_ids"])}') + logger.info(f' Validation examples dataloader 2 = {tokenizer.convert_ids_to_tokens(val_dataset2[0]["input_ids"])}') + + + # DataLoaders creation: + train_dataloader = DataLoader( + train_dataset, + shuffle=True, + collate_fn=default_data_collator, + batch_size=args.per_device_train_batch_size, + ) + val_dataloader1 = DataLoader( + val_dataset1, + collate_fn=default_data_collator, 
+ batch_size=args.per_device_eval_batch_size, + ) + val_dataloader2 = DataLoader( + val_dataset2, + collate_fn=default_data_collator, + batch_size=args.per_device_eval_batch_size, + ) + return train_dataloader, {"val1": val_dataloader1 , "val2": val_dataloader2 } diff --git a/bsmetadata/input_pipeline.py b/bsmetadata/input_pipeline.py index bfb77c3f..c46473f7 100644 --- a/bsmetadata/input_pipeline.py +++ b/bsmetadata/input_pipeline.py @@ -105,6 +105,11 @@ def get_dataloaders(tokenizer, cfg: DataConfig): if cfg.experiment == "with_metadata": from bsmetadata.experiments.with_metadata import get_dataloaders as fn + return fn(tokenizer, cfg) + + if cfg.experiment == "with_metadata_and_baseline_val": + from bsmetadata.experiments.with_metadata_and_baseline_val import get_dataloaders as fn + return fn(tokenizer, cfg) else: raise ValueError("You have not entered a valid experience name") diff --git a/bsmetadata/train.py b/bsmetadata/train.py index f1ab6df2..e419977a 100644 --- a/bsmetadata/train.py +++ b/bsmetadata/train.py @@ -251,7 +251,7 @@ def evaluate(eval_dataloader): completed_steps = 0 logger_metrics = Logger(is_local_main_process, project=args.project_name, config=args) - do_eval = args.do_eval and completed_steps > 0 and completed_steps % eval_per_n_step == 0 + do_eval = args.do_eval if do_eval: logger.info("***** Evaluation *****") for key, eval_dataloader in eval_dataloaders.items(): @@ -269,7 +269,7 @@ def evaluate(eval_dataloader): batch["labels"] = labels loss = loss_fn(batch, outputs, metadata_mask) - logger_metrics.log({"loss": loss}) + logger_metrics.log({"loss": loss, "lr": optimizer.param_groups[0]['lr']}) loss = loss / args.gradient_accumulation_steps accelerator.backward(loss) diff --git a/experiments/html/SLURM/crime_and_punish_test/create_dataset.slurm b/experiments/html/SLURM/crime_and_punish_test/create_dataset.slurm index ef6d8f54..e746380c 100644 --- a/experiments/html/SLURM/crime_and_punish_test/create_dataset.slurm +++ b/experiments/html/SLURM/crime_and_punish_test/create_dataset.slurm @@ -5,8 +5,8 @@ #SBATCH --cpus-per-task=8 # number of cores per tasks #SBATCH --hint=nomultithread # we get physical cores not logical #SBATCH --time 00:10:00 # maximum execution time (HH:MM:SS) -#SBATCH --output=%x-%j.out # output file name -#SBATCH --error=%x-%j.err # error file name +#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name +#SBATCH --error=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.err # error file name #SBATCH --account=six@cpu # account set -x -e @@ -16,7 +16,7 @@ source $HOME/start-user export HF_DATASETS_OFFLINE=1 export TRANSFORMERS_OFFLINE=1 -cd $WORK/repos/test-sync/metadata/ +cd $WORK/repos/sync/metadata/ python experiments/html/start_training.py \ data_config.experiment="with_metadata" \ diff --git a/experiments/html/SLURM/crime_and_punish_test/load_dataset.slurm b/experiments/html/SLURM/crime_and_punish_test/load_dataset.slurm index 85b2ef29..8c32659a 100644 --- a/experiments/html/SLURM/crime_and_punish_test/load_dataset.slurm +++ b/experiments/html/SLURM/crime_and_punish_test/load_dataset.slurm @@ -6,8 +6,8 @@ #SBATCH --hint=nomultithread # we get physical cores not logical #SBATCH --gres=gpu:0 # number of gpus #SBATCH --time 00:02:00 # maximum execution time (HH:MM:SS) -#SBATCH --output=%x-%j.out # output file name -#SBATCH --error=%x-%j.err # error file name +#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name +#SBATCH --error=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.err 
# error file name #SBATCH --account=six@gpu # account #SBATCH -p compil # partition with internet @@ -15,7 +15,7 @@ set -x -e source $HOME/start-user -cd $WORK/repos/test-sync/metadata/ +cd $WORK/repos/sync/metadata/ python experiments/html/SLURM/init_experiment/load_dataset.py \ dataset_name="crime_and_punish" \ \ No newline at end of file diff --git a/experiments/html/SLURM/experiment_1/create_dataset.slurm b/experiments/html/SLURM/experiment_1/create_dataset.slurm index 868c9ae0..99e09f09 100644 --- a/experiments/html/SLURM/experiment_1/create_dataset.slurm +++ b/experiments/html/SLURM/experiment_1/create_dataset.slurm @@ -16,7 +16,7 @@ source $HOME/start-user export HF_DATASETS_OFFLINE=1 export TRANSFORMERS_OFFLINE=1 -cd $WORK/repos/test-sync/metadata/ +cd $WORK/repos/sync/metadata/ python experiments/html/start_training.py \ data_config.experiment="with_metadata" \ diff --git a/experiments/html/SLURM/experiment_1/do_training.slurm b/experiments/html/SLURM/experiment_1/do_training.slurm index 99ae0467..8b2f2f71 100644 --- a/experiments/html/SLURM/experiment_1/do_training.slurm +++ b/experiments/html/SLURM/experiment_1/do_training.slurm @@ -20,7 +20,7 @@ export TRANSFORMERS_OFFLINE=1 export WANDB_MODE=offline export WANDB_DIR=$SCRATCH -cd $WORK/repos/test-sync/metadata/ +cd $WORK/repos/sync/metadata/ python experiments/html/start_training.py \ data_config.experiment="with_metadata" \ diff --git a/experiments/html/SLURM/experiment_1/load_dataset.slurm b/experiments/html/SLURM/experiment_1/load_dataset.slurm index 9a69dd42..ad6cb82c 100644 --- a/experiments/html/SLURM/experiment_1/load_dataset.slurm +++ b/experiments/html/SLURM/experiment_1/load_dataset.slurm @@ -22,7 +22,7 @@ source $HOME/start-user # git lfs install # git lfs pull origin master -cd $WORK/repos/test-sync/metadata/ +cd $WORK/repos/sync/metadata/ python experiments/html/SLURM/init_experiment/load_dataset.py \ dataset_name="${DATASETS_CUSTOM}/Natural_Questions_HTML" \ diff --git a/experiments/html/SLURM/experiment_2/README.md b/experiments/html/SLURM/experiment_2/README.md new file mode 100644 index 00000000..f0732a86 --- /dev/null +++ b/experiments/html/SLURM/experiment_2/README.md @@ -0,0 +1,9 @@ +# Experiment 1 + +## Run experiment on JZ + +1. Download the tokenizer and the model +2. Download the dataset on a partition with internet +3. Preprocess the dataset on a cpu-only partition +4. Run the training on a gpu 16gb partition + diff --git a/experiments/html/SLURM/experiment_2/create_dataset.slurm b/experiments/html/SLURM/experiment_2/create_dataset.slurm new file mode 100644 index 00000000..f68eacdb --- /dev/null +++ b/experiments/html/SLURM/experiment_2/create_dataset.slurm @@ -0,0 +1,32 @@ +#!/bin/bash +#SBATCH --job-name=modelling-metadata-html-create-dataset-test # job name +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! 
+#SBATCH --cpus-per-task=40 # number of cores per tasks +#SBATCH --hint=nomultithread # we get physical cores not logical +#SBATCH --time 01:00:00 # maximum execution time (HH:MM:SS) +#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name +#SBATCH --error=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.err # error file name +#SBATCH --account=six@cpu # account + +set -x -e + +source $HOME/start-user + +export HF_DATASETS_OFFLINE=1 +export TRANSFORMERS_OFFLINE=1 + +cd $WORK/repos/sync/metadata/ + +python experiments/html/start_training.py \ +data_config.experiment="with_metadata_and_baseline_val" \ +data_config.metadata_list=["html"] \ +data_config.max_seq_len=1024 \ +data_config.dataset_name="${DATASETS_CUSTOM}/Natural_Questions_HTML" \ +data_config.train_file="nq-train-0\[0-2\].jsonl.gz" \ +data_config.validation_file="nq-dev-00.jsonl.gz" \ +data_config.extension="json" \ +data_config.preprocessing_num_workers=80 \ +out_dir="${SCRATCH}/metadata_outputs" \ +do_train=False \ +do_eval=False \ \ No newline at end of file diff --git a/experiments/html/SLURM/experiment_2/do_training.slurm b/experiments/html/SLURM/experiment_2/do_training.slurm new file mode 100644 index 00000000..4edacac4 --- /dev/null +++ b/experiments/html/SLURM/experiment_2/do_training.slurm @@ -0,0 +1,43 @@ +#!/bin/bash +#SBATCH --job-name=modelling-metadata-html-do-train-test # job name +#SBATCH --ntasks=1 # number of MP tasks +#SBATCH --constraint=v100-16g +#SBATCH --gres=gpu:1 # number of GPUs per node +#SBATCH --cpus-per-task=8 # number of cores per tasks +#SBATCH --hint=nomultithread # we get physical cores not logical +#SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS) +#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name +#SBATCH --error=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.err # error file name +#SBATCH --account=six@gpu # account + +set -x -e + +source $HOME/start-user + +export HF_DATASETS_OFFLINE=1 +export TRANSFORMERS_OFFLINE=1 +# be careful about the cache folder for Wandb +export WANDB_MODE=offline +export WANDB_DIR=$SCRATCH + +cd $WORK/repos/sync/metadata/ + +python experiments/html/start_training.py \ +data_config.experiment="with_metadata_and_baseline_val" \ +data_config.metadata_list=["html"] \ +data_config.max_seq_len=1024 \ +data_config.dataset_name="${DATASETS_CUSTOM}/Natural_Questions_HTML" \ +data_config.train_file="nq-train-0\[0-2\].jsonl.gz" \ +data_config.validation_file="nq-dev-00.jsonl.gz" \ +data_config.extension="json" \ +data_config.preprocessing_num_workers=80 \ +data_config.per_device_eval_batch_size=3 \ +data_config.per_device_train_batch_size=3 \ +out_dir="${SCRATCH}/metadata_outputs/${SLURM_JOB_ID}" \ +do_train=True \ +do_eval=True \ +evaluation_strategy=STEPS \ +eval_steps=10 \ +save_strategy=STEPS \ +save_steps=10 \ +gradient_accumulation_steps=50\ \ No newline at end of file diff --git a/experiments/html/SLURM/experiment_2/load_dataset.py b/experiments/html/SLURM/experiment_2/load_dataset.py new file mode 100644 index 00000000..1a8c65de --- /dev/null +++ b/experiments/html/SLURM/experiment_2/load_dataset.py @@ -0,0 +1,88 @@ +import logging +import sys +from datasets import config + +import hydra +from datasets import load_dataset +from hydra.core.config_store import ConfigStore + +from bsmetadata.input_pipeline import DataConfig +from bsmetadata.train import show_help + +logger = logging.getLogger(__name__) + +cs = ConfigStore.instance() +cs.store(name="data_config", node=DataConfig) + + 
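Registering `DataConfig` with hydra's `ConfigStore` under the name `data_config` exposes every field of the dataclass as a plain `key=value` override on the command line, which is how the surrounding SLURM scripts drive this loader. A minimal sketch of such an invocation, assuming the same dataset layout as the rest of this series:

    python experiments/html/SLURM/experiment_2/load_dataset.py \
        dataset_name="${DATASETS_CUSTOM}/Natural_Questions_HTML" \
        train_file="nq-train-*.jsonl.gz" \
        validation_file="nq-dev-*.jsonl.gz"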
+@hydra.main(config_name="data_config") +def main(args: DataConfig) -> None: + data_files = {} + if args.train_file is not None: + data_files["train"] = args.train_file + if args.validation_file is not None: + data_files["validation"] = args.validation_file + + if not data_files: + data_files = None + + logger.info(config.HF_DATASETS_CACHE) + if args.dataset_name is not None: + logger.info( + "Downloading and loading a dataset from the hub" + f"{args.dataset_name}, {args.dataset_config_name}, data_files={data_files}, cache_dir={args.cache_dir}," + ) + # Downloading and loading a dataset from the hub. + raw_datasets = load_dataset( + args.dataset_name, + args.dataset_config_name, + data_files=data_files, + cache_dir=args.cache_dir, + keep_in_memory=False, + ) + + if "validation" not in raw_datasets.keys(): + logger.info("validation not in raw_datasets.keys()") + raw_datasets["validation"] = load_dataset( + args.dataset_name, + args.dataset_config_name, + split=f"train[:{args.validation_split_percentage}%]", + cache_dir=args.cache_dir, + ) + raw_datasets["train"] = load_dataset( + args.dataset_name, + args.dataset_config_name, + split=f"train[{args.validation_split_percentage}%:]", + cache_dir=args.cache_dir, + ) + else: + extension = args.train_file.split(".")[-1] if not args.extension else args.extension + if extension == "txt": + raise ValueError( + "You have entered a text file for the train data, but this type of file cannot contain metadata " + "columns. Wouldn't you rather have a file in json/jsonl or pandas format?" + ) + if extension == "jsonl": + extension = "json" + raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=args.cache_dir) + + if "validation" not in raw_datasets.keys(): + raw_datasets["validation"] = load_dataset( + extension, + data_files=data_files, + split=f"train[:{args.validation_split_percentage}%]", + cache_dir=args.cache_dir, + ) + raw_datasets["train"] = load_dataset( + extension, + data_files=data_files, + split=f"train[{args.validation_split_percentage}%:]", + cache_dir=args.cache_dir, + ) + + +if __name__ == "__main__": + if "--help" in sys.argv or "-h" in sys.argv: + show_help() + sys.exit() + main() diff --git a/experiments/html/SLURM/experiment_2/load_dataset.slurm b/experiments/html/SLURM/experiment_2/load_dataset.slurm new file mode 100644 index 00000000..ad6cb82c --- /dev/null +++ b/experiments/html/SLURM/experiment_2/load_dataset.slurm @@ -0,0 +1,30 @@ +#!/bin/bash +#SBATCH --job-name=modelling-metadata-html-download-dataset-test # job name +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! 
+#SBATCH --cpus-per-task=8 # number of cores per tasks +#SBATCH --hint=nomultithread # we get physical cores not logical +#SBATCH --gres=gpu:0 # number of gpus +#SBATCH --time 02:30:00 # maximum execution time (HH:MM:SS) +#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name +#SBATCH --error=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.err # error file name +#SBATCH --account=six@gpu # account +#SBATCH -p compil # partition with internet + +set -x -e + +source $HOME/start-user + +# Uncomment if the repo doesn't exist +# cd $DATASETS_CUSTOM/ +# git clone https://huggingface.co/datasets/SaulLu/Natural_Questions_HTML +# cd Natural_Questions_HTML_Toy/ +# git lfs install +# git lfs pull origin master + +cd $WORK/repos/sync/metadata/ + +python experiments/html/SLURM/init_experiment/load_dataset.py \ +dataset_name="${DATASETS_CUSTOM}/Natural_Questions_HTML" \ +train_file="nq-train-*.jsonl.gz" \ +validation_file="nq-dev-*.jsonl.gz" \ No newline at end of file diff --git a/experiments/html/SLURM/experiment_2/multi_steps.bash b/experiments/html/SLURM/experiment_2/multi_steps.bash new file mode 100644 index 00000000..d6f4a745 --- /dev/null +++ b/experiments/html/SLURM/experiment_2/multi_steps.bash @@ -0,0 +1,2 @@ +JID_JOB1=`sbatch create_dataset.slurm | cut -d " " -f 4` +sbatch --dependency=afterok:$JID_JOB1 do_training.slurm \ No newline at end of file diff --git a/experiments/html/SLURM/init_experiment/create_dataset.slurm b/experiments/html/SLURM/init_experiment/create_dataset.slurm index 8f003637..d4d2740c 100644 --- a/experiments/html/SLURM/init_experiment/create_dataset.slurm +++ b/experiments/html/SLURM/init_experiment/create_dataset.slurm @@ -5,8 +5,8 @@ #SBATCH --cpus-per-task=8 # number of cores per tasks #SBATCH --hint=nomultithread # we get physical cores not logical #SBATCH --time 00:10:00 # maximum execution time (HH:MM:SS) -#SBATCH --output=%x-%j.out # output file name -#SBATCH --error=%x-%j.err # error file name +#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name +#SBATCH --error=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.err # error file name #SBATCH --account=six@cpu # account set -x -e @@ -16,7 +16,7 @@ source $HOME/start-user export HF_DATASETS_OFFLINE=1 export TRANSFORMERS_OFFLINE=1 -cd $WORK/repos/test-sync/metadata/ +cd $WORK/repos/sync/metadata/ python experiments/html/start_training.py \ data_config.experiment="with_metadata" \ diff --git a/experiments/html/SLURM/init_experiment/do_training.slurm b/experiments/html/SLURM/init_experiment/do_training.slurm index a6b06497..00d7fc74 100644 --- a/experiments/html/SLURM/init_experiment/do_training.slurm +++ b/experiments/html/SLURM/init_experiment/do_training.slurm @@ -6,8 +6,8 @@ #SBATCH --cpus-per-task=8 # number of cores per tasks #SBATCH --hint=nomultithread # we get physical cores not logical #SBATCH --time 00:10:00 # maximum execution time (HH:MM:SS) -#SBATCH --output=%x-%j.out # output file name -#SBATCH --error=%x-%j.err # error file name +#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name +#SBATCH --error=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.err # error file name #SBATCH --account=six@gpu # account set -x -e @@ -19,7 +19,7 @@ export TRANSFORMERS_OFFLINE=1 # be careful about the cache folder for Wandb export WANDB_MODE=offline -cd $WORK/repos/test-sync/metadata/ +cd $WORK/repos/sync/metadata/ python experiments/html/start_training.py \ data_config.experiment="with_metadata" \ diff --git 
a/experiments/html/SLURM/init_experiment/load_dataset.slurm b/experiments/html/SLURM/init_experiment/load_dataset.slurm index 1fb85e16..6f0761f7 100644 --- a/experiments/html/SLURM/init_experiment/load_dataset.slurm +++ b/experiments/html/SLURM/init_experiment/load_dataset.slurm @@ -6,8 +6,8 @@ #SBATCH --hint=nomultithread # we get physical cores not logical #SBATCH --gres=gpu:0 # number of gpus #SBATCH --time 00:02:00 # maximum execution time (HH:MM:SS) -#SBATCH --output=%x-%j.out # output file name -#SBATCH --error=%x-%j.err # error file name +#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name +#SBATCH --error=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.err # error file name #SBATCH --account=six@gpu # account #SBATCH -p compil # partition with internet @@ -22,7 +22,7 @@ source $HOME/start-user # git lfs install # git lfs pull origin master -cd $WORK/repos/test-sync/metadata/ +cd $WORK/repos/sync/metadata/ python experiments/html/SLURM/init_experiment/load_dataset.py \ dataset_name="${DATASETS_CUSTOM}/Natural_Questions_HTML_Toy" \ diff --git a/experiments/html/SLURM/loading_scripts/load_tokenizer_and_model.slurm b/experiments/html/SLURM/loading_scripts/load_tokenizer_and_model.slurm index 7be1f6ac..63ef488f 100644 --- a/experiments/html/SLURM/loading_scripts/load_tokenizer_and_model.slurm +++ b/experiments/html/SLURM/loading_scripts/load_tokenizer_and_model.slurm @@ -6,8 +6,8 @@ #SBATCH --hint=nomultithread # we get physical cores not logical #SBATCH --gres=gpu:0 # number of gpus #SBATCH --time 00:10:00 # maximum execution time (HH:MM:SS) -#SBATCH --output=%x-%j.out # output file name -#SBATCH --error=%x-%j.err # error file name +#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name +#SBATCH --error=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.err # error file name #SBATCH --account=six@gpu # account #SBATCH -p compil # partition with internet @@ -15,7 +15,7 @@ set -x -e source $HOME/start-user -cd $WORK/repos/test-sync/metadata/ +cd $WORK/repos/sync/metadata/ python experiments/html/SLURM/loading_scripts/load_tokenizer_and_model.py \ model_name=gpt2 \ \ No newline at end of file diff --git a/experiments/html/SLURM/wandb/sync_wandb.slurm b/experiments/html/SLURM/wandb/sync_wandb.slurm new file mode 100644 index 00000000..1922e3a3 --- /dev/null +++ b/experiments/html/SLURM/wandb/sync_wandb.slurm @@ -0,0 +1,24 @@ +#!/bin/bash +#SBATCH --job-name=modelling-metadata-sync-wandb # job name +#SBATCH --ntasks=1 # number of MP tasks +#SBATCH --nodes=1 # number of nodes +#SBATCH --cpus-per-task=1 # number of cores per task +#SBATCH --hint=nomultithread # we get physical cores not logical +#SBATCH --time=2:00:00 # maximum execution time (HH:MM:SS) +#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name +#SBATCH --partition=compil +#SBATCH --account=six@cpu + +echo "START TIME: $(date)" + +source $HOME/start-user + +cd ${SCRATCH} + +while true +do + wandb sync --sync-all + sleep 30 +done + +echo "END TIME: $(date)" \ No newline at end of file diff --git a/experiments/jz_template/README.md b/experiments/jz_template/README.md new file mode 100644 index 00000000..8722c141 --- /dev/null +++ b/experiments/jz_template/README.md @@ -0,0 +1,6 @@ +# JZ templates + +The purpose of this repo is to provide templates for people who don't have direct access to JZ but who have been working on experiments that we want to run on JZ. 
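The experiment READMEs in this series all follow the same four-step workflow (load the tokenizer and model, download the dataset on the partition with internet, preprocess on a CPU-only partition, train on a GPU partition), chained with SLURM job dependencies as in `multi_steps.bash`. A sketch extending that chaining to all four numbered template scripts, assuming the file names used in these templates (`--parsable` simply prints the job id, avoiding the `cut` parsing used above):

    # Hypothetical extension of multi_steps.bash to the full four-step chain.
    JID1=$(sbatch --parsable 01_load_tokenizer_and_model.slurm)
    JID2=$(sbatch --parsable --dependency=afterok:$JID1 02_load_dataset.slurm)
    JID3=$(sbatch --parsable --dependency=afterok:$JID2 03_create_dataset.slurm)
    sbatch --dependency=afterok:$JID3 04_do_training.slurm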
+ +## How to design your experience for JZ + diff --git a/experiments/jz_template/SLURM/experiment_example/01_load_tokenizer_and_model.slurm b/experiments/jz_template/SLURM/experiment_example/01_load_tokenizer_and_model.slurm new file mode 100644 index 00000000..abcc05de --- /dev/null +++ b/experiments/jz_template/SLURM/experiment_example/01_load_tokenizer_and_model.slurm @@ -0,0 +1,21 @@ +#!/bin/bash +#SBATCH --job-name=modelling-metadata-html-download-tokenizer-and-model # job name +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! +#SBATCH --cpus-per-task=8 # number of cores per tasks +#SBATCH --hint=nomultithread # we get physical cores not logical +#SBATCH --gres=gpu:0 # number of gpus +#SBATCH --time 00:10:00 # maximum execution time (HH:MM:SS) +#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name +#SBATCH --error=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.err # error file name +#SBATCH --account=six@gpu # account +#SBATCH -p compil # partition with internet + +set -x -e + +source $HOME/start-user + +cd $WORK/repos/sync/metadata/ + +python experiments/html/SLURM/loading_scripts/load_tokenizer_and_model.py \ + model_name=gpt2 \ diff --git a/experiments/jz_template/SLURM/experiment_example/02_load_dataset.slurm b/experiments/jz_template/SLURM/experiment_example/02_load_dataset.slurm new file mode 100644 index 00000000..f657f1fb --- /dev/null +++ b/experiments/jz_template/SLURM/experiment_example/02_load_dataset.slurm @@ -0,0 +1,30 @@ +#!/bin/bash +#SBATCH --job-name=modelling-metadata-html-download-dataset-test # job name +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! +#SBATCH --cpus-per-task=8 # number of cores per tasks +#SBATCH --hint=nomultithread # we get physical cores not logical +#SBATCH --gres=gpu:0 # number of gpus +#SBATCH --time 02:30:00 # maximum execution time (HH:MM:SS) +#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name +#SBATCH --error=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.err # error file name +#SBATCH --account=six@gpu # account +#SBATCH -p compil # partition with internet + +set -x -e + +source $HOME/start-user + +# Uncomment if the repo doesn't exist +# cd $DATASETS_CUSTOM/ +# git clone https://huggingface.co/datasets/SaulLu/Natural_Questions_HTML +# cd Natural_Questions_HTML_Toy/ +# git lfs install +# git lfs pull origin master + +cd $WORK/repos/sync/metadata/ + +python experiments/html/SLURM/init_experiment/load_dataset.py \ + dataset_name="${DATASETS_CUSTOM}/Natural_Questions_HTML" \ + train_file="nq-train-*.jsonl.gz" \ + validation_file="nq-dev-*.jsonl.gz" diff --git a/experiments/jz_template/SLURM/experiment_example/03_create_dataset.slurm b/experiments/jz_template/SLURM/experiment_example/03_create_dataset.slurm new file mode 100644 index 00000000..adb290c3 --- /dev/null +++ b/experiments/jz_template/SLURM/experiment_example/03_create_dataset.slurm @@ -0,0 +1,34 @@ +#!/bin/bash +#SBATCH --job-name=modelling-metadata-html-create-dataset-test # job name +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! 
+#SBATCH --cpus-per-task=40 # number of cores per tasks +#SBATCH --hint=nomultithread # we get physical cores not logical +#SBATCH --time 01:00:00 # maximum execution time (HH:MM:SS) +#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name +#SBATCH --error=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.err # error file name +#SBATCH --account=six@cpu # account + +set -x -e + +source $HOME/start-user + +export HF_DATASETS_OFFLINE=1 +export TRANSFORMERS_OFFLINE=1 + +cd $WORK/repos/sync/metadata/ + +pip install . + +python experiments/html/start_training.py \ + data_config.experiment="with_metadata_and_baseline_val" \ + data_config.metadata_list=["html"] \ + data_config.max_seq_len=1024 \ + data_config.dataset_name="${DATASETS_CUSTOM}/Natural_Questions_HTML" \ + data_config.train_file="nq-train-0\[0-2\].jsonl.gz" \ + data_config.validation_file="nq-dev-00.jsonl.gz" \ + data_config.extension="json" \ + data_config.preprocessing_num_workers=80 \ + out_dir="${SCRATCH}/metadata_outputs" \ + do_train=False \ + do_eval=False \ diff --git a/experiments/jz_template/SLURM/experiment_example/04_do_training.slurm b/experiments/jz_template/SLURM/experiment_example/04_do_training.slurm new file mode 100644 index 00000000..69fa96c6 --- /dev/null +++ b/experiments/jz_template/SLURM/experiment_example/04_do_training.slurm @@ -0,0 +1,43 @@ +#!/bin/bash +#SBATCH --job-name=modelling-metadata-html-do-train-test # job name +#SBATCH --ntasks=1 # number of MP tasks +#SBATCH --constraint=v100-16g +#SBATCH --gres=gpu:1 # number of GPUs per node +#SBATCH --cpus-per-task=8 # number of cores per tasks +#SBATCH --hint=nomultithread # we get physical cores not logical +#SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS) +#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name +#SBATCH --error=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.err # error file name +#SBATCH --account=six@gpu # account + +set -x -e + +source $HOME/start-user + +export HF_DATASETS_OFFLINE=1 +export TRANSFORMERS_OFFLINE=1 +# be careful about the cache folder for Wandb +export WANDB_MODE=offline +export WANDB_DIR=$SCRATCH + +cd $WORK/repos/sync/metadata/ + +python experiments/html/start_training.py \ + data_config.experiment="with_metadata_and_baseline_val" \ + data_config.metadata_list=["html"] \ + data_config.max_seq_len=1024 \ + data_config.dataset_name="${DATASETS_CUSTOM}/Natural_Questions_HTML" \ + data_config.train_file="nq-train-0\[0-2\].jsonl.gz" \ + data_config.validation_file="nq-dev-00.jsonl.gz" \ + data_config.extension="json" \ + data_config.preprocessing_num_workers=80 \ + data_config.per_device_eval_batch_size=3 \ + data_config.per_device_train_batch_size=3 \ + out_dir="${SCRATCH}/metadata_outputs/${SLURM_JOB_ID}" \ + do_train=True \ + do_eval=True \ + evaluation_strategy=STEPS \ + eval_steps=10 \ + save_strategy=STEPS \ + save_steps=10 \ + gradient_accumulation_steps=50\ diff --git a/experiments/jz_template/SLURM/experiment_example/README.md b/experiments/jz_template/SLURM/experiment_example/README.md new file mode 100644 index 00000000..f0732a86 --- /dev/null +++ b/experiments/jz_template/SLURM/experiment_example/README.md @@ -0,0 +1,9 @@ +# Experiment 1 + +## Run experiment on JZ + +1. Download the tokenizer and the model +2. Download the dataset on a partition with internet +3. Preprocess the dataset on a cpu-only partition +4. 
Run the training on a gpu 16gb partition + diff --git a/experiments/jz_template/SLURM/experiment_example/multi_steps.bash b/experiments/jz_template/SLURM/experiment_example/multi_steps.bash new file mode 100644 index 00000000..16775a3f --- /dev/null +++ b/experiments/jz_template/SLURM/experiment_example/multi_steps.bash @@ -0,0 +1,2 @@ +JID_JOB1=$(sbatch create_dataset.slurm | cut -d " " -f 4) +sbatch --dependency=afterok:$JID_JOB1 do_training.slurm diff --git a/experiments/jz_template/SLURM/experiment_template/01_load_tokenizer_and_model.slurm b/experiments/jz_template/SLURM/experiment_template/01_load_tokenizer_and_model.slurm new file mode 100644 index 00000000..37ae21c9 --- /dev/null +++ b/experiments/jz_template/SLURM/experiment_template/01_load_tokenizer_and_model.slurm @@ -0,0 +1,29 @@ +#!/bin/bash +#SBATCH --job-name=modelling-metadata-XX # (change me!) job name +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! +#SBATCH --cpus-per-task=8 # (change me! between 0 and 48) number of cores per tasks +#SBATCH --hint=nomultithread # we get physical cores not logical +#SBATCH --gres=gpu:0 # (change me! between 0 and 1) number of gpus +#SBATCH --time 00:10:00 # (change me! between 0 and 20h) maximum execution time (HH:MM:SS) +#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name +#SBATCH --error=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.err # error file name +#SBATCH --account=six@gpu # account +#SBATCH -p compil # partition with internet + +set -x -e + +# Next line will: +# - load a conda environment with the dependencies on the master branch of github.com/bigscience-workshop/metadata/ +# - setup env vars ($HOME, $WORK, etc) +# - load several modules (git) +# Note: We can afford to have two conda environments: one stable for running experiments and one for development. +# If there are new dependencies to install, you have to tell me about them and not do it in this script +source $HOME/start-modelling-metadata-user + +# Folder for the clone of github.com/bigscience-workshop/metadata/ +cd $WORK/repos/metadata/ + +# Command to load the XXX model and tokenizer stored on https://huggingface.co/models +python experiments/jz_template/SLURM/loading_script_utils/load_tokenizer_and_model.py \ + model_name=XXX diff --git a/experiments/jz_template/SLURM/experiment_template/02_load_dataset.slurm b/experiments/jz_template/SLURM/experiment_template/02_load_dataset.slurm new file mode 100644 index 00000000..f657f1fb --- /dev/null +++ b/experiments/jz_template/SLURM/experiment_template/02_load_dataset.slurm @@ -0,0 +1,30 @@ +#!/bin/bash +#SBATCH --job-name=modelling-metadata-html-download-dataset-test # job name +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! 
+#SBATCH --cpus-per-task=8 # number of cores per tasks +#SBATCH --hint=nomultithread # we get physical cores not logical +#SBATCH --gres=gpu:0 # number of gpus +#SBATCH --time 02:30:00 # maximum execution time (HH:MM:SS) +#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name +#SBATCH --error=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.err # error file name +#SBATCH --account=six@gpu # account +#SBATCH -p compil # partition with internet + +set -x -e + +source $HOME/start-user + +# Uncomment if the repo doesn't exist +# cd $DATASETS_CUSTOM/ +# git clone https://huggingface.co/datasets/SaulLu/Natural_Questions_HTML +# cd Natural_Questions_HTML_Toy/ +# git lfs install +# git lfs pull origin master + +cd $WORK/repos/sync/metadata/ + +python experiments/html/SLURM/init_experiment/load_dataset.py \ + dataset_name="${DATASETS_CUSTOM}/Natural_Questions_HTML" \ + train_file="nq-train-*.jsonl.gz" \ + validation_file="nq-dev-*.jsonl.gz" diff --git a/experiments/jz_template/SLURM/experiment_template/03_create_dataset.slurm b/experiments/jz_template/SLURM/experiment_template/03_create_dataset.slurm new file mode 100644 index 00000000..adb290c3 --- /dev/null +++ b/experiments/jz_template/SLURM/experiment_template/03_create_dataset.slurm @@ -0,0 +1,34 @@ +#!/bin/bash +#SBATCH --job-name=modelling-metadata-html-create-dataset-test # job name +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! +#SBATCH --cpus-per-task=40 # number of cores per tasks +#SBATCH --hint=nomultithread # we get physical cores not logical +#SBATCH --time 01:00:00 # maximum execution time (HH:MM:SS) +#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name +#SBATCH --error=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.err # error file name +#SBATCH --account=six@cpu # account + +set -x -e + +source $HOME/start-user + +export HF_DATASETS_OFFLINE=1 +export TRANSFORMERS_OFFLINE=1 + +cd $WORK/repos/sync/metadata/ + +pip install . 
+ +python experiments/html/start_training.py \ + data_config.experiment="with_metadata_and_baseline_val" \ + data_config.metadata_list=["html"] \ + data_config.max_seq_len=1024 \ + data_config.dataset_name="${DATASETS_CUSTOM}/Natural_Questions_HTML" \ + data_config.train_file="nq-train-0\[0-2\].jsonl.gz" \ + data_config.validation_file="nq-dev-00.jsonl.gz" \ + data_config.extension="json" \ + data_config.preprocessing_num_workers=80 \ + out_dir="${SCRATCH}/metadata_outputs" \ + do_train=False \ + do_eval=False \ diff --git a/experiments/jz_template/SLURM/experiment_template/04_do_training.slurm b/experiments/jz_template/SLURM/experiment_template/04_do_training.slurm new file mode 100644 index 00000000..69fa96c6 --- /dev/null +++ b/experiments/jz_template/SLURM/experiment_template/04_do_training.slurm @@ -0,0 +1,43 @@ +#!/bin/bash +#SBATCH --job-name=modelling-metadata-html-do-train-test # job name +#SBATCH --ntasks=1 # number of MP tasks +#SBATCH --constraint=v100-16g +#SBATCH --gres=gpu:1 # number of GPUs per node +#SBATCH --cpus-per-task=8 # number of cores per tasks +#SBATCH --hint=nomultithread # we get physical cores not logical +#SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS) +#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name +#SBATCH --error=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.err # error file name +#SBATCH --account=six@gpu # account + +set -x -e + +source $HOME/start-user + +export HF_DATASETS_OFFLINE=1 +export TRANSFORMERS_OFFLINE=1 +# be careful about the cache folder for Wandb +export WANDB_MODE=offline +export WANDB_DIR=$SCRATCH + +cd $WORK/repos/sync/metadata/ + +python experiments/html/start_training.py \ + data_config.experiment="with_metadata_and_baseline_val" \ + data_config.metadata_list=["html"] \ + data_config.max_seq_len=1024 \ + data_config.dataset_name="${DATASETS_CUSTOM}/Natural_Questions_HTML" \ + data_config.train_file="nq-train-0\[0-2\].jsonl.gz" \ + data_config.validation_file="nq-dev-00.jsonl.gz" \ + data_config.extension="json" \ + data_config.preprocessing_num_workers=80 \ + data_config.per_device_eval_batch_size=3 \ + data_config.per_device_train_batch_size=3 \ + out_dir="${SCRATCH}/metadata_outputs/${SLURM_JOB_ID}" \ + do_train=True \ + do_eval=True \ + evaluation_strategy=STEPS \ + eval_steps=10 \ + save_strategy=STEPS \ + save_steps=10 \ + gradient_accumulation_steps=50\ diff --git a/experiments/jz_template/SLURM/experiment_template/README.md b/experiments/jz_template/SLURM/experiment_template/README.md new file mode 100644 index 00000000..f0732a86 --- /dev/null +++ b/experiments/jz_template/SLURM/experiment_template/README.md @@ -0,0 +1,9 @@ +# Experiment 1 + +## Run experiment on JZ + +1. Download the tokenizer and the model +2. Download the dataset on a partition with internet +3. Preprocess the dataset on a cpu-only partition +4. 
Run the training on a gpu 16gb partition + diff --git a/experiments/jz_template/SLURM/experiment_template/multi_steps.bash b/experiments/jz_template/SLURM/experiment_template/multi_steps.bash new file mode 100644 index 00000000..16775a3f --- /dev/null +++ b/experiments/jz_template/SLURM/experiment_template/multi_steps.bash @@ -0,0 +1,2 @@ +JID_JOB1=$(sbatch create_dataset.slurm | cut -d " " -f 4) +sbatch --dependency=afterok:$JID_JOB1 do_training.slurm diff --git a/experiments/jz_template/SLURM/loading_script_utils/load_dataset.py b/experiments/jz_template/SLURM/loading_script_utils/load_dataset.py new file mode 100644 index 00000000..1a8c65de --- /dev/null +++ b/experiments/jz_template/SLURM/loading_script_utils/load_dataset.py @@ -0,0 +1,88 @@ +import logging +import sys +from datasets import config + +import hydra +from datasets import load_dataset +from hydra.core.config_store import ConfigStore + +from bsmetadata.input_pipeline import DataConfig +from bsmetadata.train import show_help + +logger = logging.getLogger(__name__) + +cs = ConfigStore.instance() +cs.store(name="data_config", node=DataConfig) + + +@hydra.main(config_name="data_config") +def main(args: DataConfig) -> None: + data_files = {} + if args.train_file is not None: + data_files["train"] = args.train_file + if args.validation_file is not None: + data_files["validation"] = args.validation_file + + if not data_files: + data_files = None + + logger.info(config.HF_DATASETS_CACHE) + if args.dataset_name is not None: + logger.info( + "Downloading and loading a dataset from the hub" + f"{args.dataset_name}, {args.dataset_config_name}, data_files={data_files}, cache_dir={args.cache_dir}," + ) + # Downloading and loading a dataset from the hub. + raw_datasets = load_dataset( + args.dataset_name, + args.dataset_config_name, + data_files=data_files, + cache_dir=args.cache_dir, + keep_in_memory=False, + ) + + if "validation" not in raw_datasets.keys(): + logger.info("validation not in raw_datasets.keys()") + raw_datasets["validation"] = load_dataset( + args.dataset_name, + args.dataset_config_name, + split=f"train[:{args.validation_split_percentage}%]", + cache_dir=args.cache_dir, + ) + raw_datasets["train"] = load_dataset( + args.dataset_name, + args.dataset_config_name, + split=f"train[{args.validation_split_percentage}%:]", + cache_dir=args.cache_dir, + ) + else: + extension = args.train_file.split(".")[-1] if not args.extension else args.extension + if extension == "txt": + raise ValueError( + "You have entered a text file for the train data, but this type of file cannot contain metadata " + "columns. Wouldn't you rather have a file in json/jsonl or pandas format?" 
+ ) + if extension == "jsonl": + extension = "json" + raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=args.cache_dir) + + if "validation" not in raw_datasets.keys(): + raw_datasets["validation"] = load_dataset( + extension, + data_files=data_files, + split=f"train[:{args.validation_split_percentage}%]", + cache_dir=args.cache_dir, + ) + raw_datasets["train"] = load_dataset( + extension, + data_files=data_files, + split=f"train[{args.validation_split_percentage}%:]", + cache_dir=args.cache_dir, + ) + + +if __name__ == "__main__": + if "--help" in sys.argv or "-h" in sys.argv: + show_help() + sys.exit() + main() diff --git a/experiments/jz_template/SLURM/loading_script_utils/load_tokenizer_and_model.py b/experiments/jz_template/SLURM/loading_script_utils/load_tokenizer_and_model.py new file mode 100644 index 00000000..8c6cb94b --- /dev/null +++ b/experiments/jz_template/SLURM/loading_script_utils/load_tokenizer_and_model.py @@ -0,0 +1,41 @@ +import logging +import sys + +import transformers.utils.logging as logging_transformers +import hydra +from datasets import load_dataset +from hydra.core.config_store import ConfigStore +from transformers import AdamW, AutoModelForCausalLM, AutoTokenizer + +from bsmetadata.input_pipeline import DataConfig +from bsmetadata.train import CFG, show_help + + +# Setup logging +logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO +) +logging_transformers.set_verbosity_info() +logging_transformers.enable_default_handler() +logging_transformers.enable_explicit_format() + +logger = logging.getLogger(__name__) + +cs = ConfigStore.instance() +cs.store(name="config", node=CFG) + + +@hydra.main(config_path=None, config_name="config") +def main(args: CFG) -> None: + # get dataloaders + tokenizer = AutoTokenizer.from_pretrained(args.model_name) + + # get model + model = AutoModelForCausalLM.from_pretrained(args.model_name) + + +if __name__ == "__main__": + if "--help" in sys.argv or "-h" in sys.argv: + show_help() + sys.exit() + main() From 72e9976026fad77b07131620b443ef016af34e25 Mon Sep 17 00:00:00 2001 From: SaulLu Date: Tue, 7 Sep 2021 16:49:40 +0200 Subject: [PATCH 72/84] create dedicated dir --- experiments/{jz_template => jz}/README.md | 1 + .../SLURM/experiment_example/README.md | 4 ++ .../01_load_tokenizer_and_model.slurm | 29 ++++++++ .../subexperiment_1/02_load_dataset.slurm | 62 +++++++++++++++++ .../subexperiment_1/03_create_dataset.slurm | 57 ++++++++++++++++ .../subexperiment_1/04_do_training.slurm | 66 +++++++++++++++++++ .../subexperiment_1/multi_steps.bash | 4 ++ .../01_load_tokenizer_and_model.slurm | 4 +- .../experiment_template/02_load_dataset.slurm | 62 +++++++++++++++++ .../03_create_dataset.slurm | 57 ++++++++++++++++ .../experiment_template/04_do_training.slurm | 65 ++++++++++++++++++ .../SLURM/experiment_template/README.md | 4 ++ .../experiment_template/multi_steps.bash | 4 ++ .../loading_script_utils/load_dataset.py | 0 .../load_tokenizer_and_model.py | 0 experiments/jz/utils/sync_wandb.slurm | 24 +++++++ .../01_load_tokenizer_and_model.slurm | 21 ------ .../experiment_example/02_load_dataset.slurm | 30 --------- .../03_create_dataset.slurm | 34 ---------- .../experiment_example/04_do_training.slurm | 43 ------------ .../SLURM/experiment_example/multi_steps.bash | 2 - .../experiment_template/02_load_dataset.slurm | 30 --------- .../03_create_dataset.slurm | 34 ---------- .../experiment_template/04_do_training.slurm | 43 ------------ 
.../experiment_template/multi_steps.bash | 2 - 25 files changed, 441 insertions(+), 241 deletions(-) rename experiments/{jz_template => jz}/README.md (82%) rename experiments/{jz_template => jz/templates}/SLURM/experiment_example/README.md (78%) create mode 100644 experiments/jz/templates/SLURM/experiment_example/subexperiment_1/01_load_tokenizer_and_model.slurm create mode 100644 experiments/jz/templates/SLURM/experiment_example/subexperiment_1/02_load_dataset.slurm create mode 100644 experiments/jz/templates/SLURM/experiment_example/subexperiment_1/03_create_dataset.slurm create mode 100644 experiments/jz/templates/SLURM/experiment_example/subexperiment_1/04_do_training.slurm create mode 100644 experiments/jz/templates/SLURM/experiment_example/subexperiment_1/multi_steps.bash rename experiments/{jz_template => jz/templates}/SLURM/experiment_template/01_load_tokenizer_and_model.slurm (93%) create mode 100644 experiments/jz/templates/SLURM/experiment_template/02_load_dataset.slurm create mode 100644 experiments/jz/templates/SLURM/experiment_template/03_create_dataset.slurm create mode 100644 experiments/jz/templates/SLURM/experiment_template/04_do_training.slurm rename experiments/{jz_template => jz/templates}/SLURM/experiment_template/README.md (78%) create mode 100644 experiments/jz/templates/SLURM/experiment_template/multi_steps.bash rename experiments/{jz_template/SLURM => jz/utils}/loading_script_utils/load_dataset.py (100%) rename experiments/{jz_template/SLURM => jz/utils}/loading_script_utils/load_tokenizer_and_model.py (100%) create mode 100644 experiments/jz/utils/sync_wandb.slurm delete mode 100644 experiments/jz_template/SLURM/experiment_example/01_load_tokenizer_and_model.slurm delete mode 100644 experiments/jz_template/SLURM/experiment_example/02_load_dataset.slurm delete mode 100644 experiments/jz_template/SLURM/experiment_example/03_create_dataset.slurm delete mode 100644 experiments/jz_template/SLURM/experiment_example/04_do_training.slurm delete mode 100644 experiments/jz_template/SLURM/experiment_example/multi_steps.bash delete mode 100644 experiments/jz_template/SLURM/experiment_template/02_load_dataset.slurm delete mode 100644 experiments/jz_template/SLURM/experiment_template/03_create_dataset.slurm delete mode 100644 experiments/jz_template/SLURM/experiment_template/04_do_training.slurm delete mode 100644 experiments/jz_template/SLURM/experiment_template/multi_steps.bash diff --git a/experiments/jz_template/README.md b/experiments/jz/README.md similarity index 82% rename from experiments/jz_template/README.md rename to experiments/jz/README.md index 8722c141..3afd6e43 100644 --- a/experiments/jz_template/README.md +++ b/experiments/jz/README.md @@ -4,3 +4,4 @@ The purpose of this repo is to provide templates for people who don't have direc ## How to design your experience for JZ +Everything on the Hub: model, tokenizer, Dataset \ No newline at end of file diff --git a/experiments/jz_template/SLURM/experiment_example/README.md b/experiments/jz/templates/SLURM/experiment_example/README.md similarity index 78% rename from experiments/jz_template/SLURM/experiment_example/README.md rename to experiments/jz/templates/SLURM/experiment_example/README.md index f0732a86..8cfa4464 100644 --- a/experiments/jz_template/SLURM/experiment_example/README.md +++ b/experiments/jz/templates/SLURM/experiment_example/README.md @@ -7,3 +7,7 @@ 3. Preprocess the dataset on a cpu-only partition 4. Run the training on a gpu 16gb partition +Are you downloading a new: +- model ? +- tokenizer ? 
+- dataset ? \ No newline at end of file diff --git a/experiments/jz/templates/SLURM/experiment_example/subexperiment_1/01_load_tokenizer_and_model.slurm b/experiments/jz/templates/SLURM/experiment_example/subexperiment_1/01_load_tokenizer_and_model.slurm new file mode 100644 index 00000000..8544be60 --- /dev/null +++ b/experiments/jz/templates/SLURM/experiment_example/subexperiment_1/01_load_tokenizer_and_model.slurm @@ -0,0 +1,29 @@ +#!/bin/bash +#SBATCH --job-name=modelling-metadata-example-load-model-and-tokenizer # (change me!) job name +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! +#SBATCH --cpus-per-task=8 # (change me! between 0 and 48) number of cores per tasks +#SBATCH --hint=nomultithread # we get physical cores not logical +#SBATCH --gres=gpu:0 # (change me! between 0 and 1) number of gpus +#SBATCH --time 00:10:00 # (change me! between 0 and 20h) maximum execution time (HH:MM:SS) +#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name +#SBATCH --error=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.err # error file name +#SBATCH --account=six@gpu # account +#SBATCH -p compil # partition with internet + +set -x -e + +# Next line will: +# - load a conda environment with the dependencies on the master branch of github.com/bigscience-workshop/metadata/ +# - setup env vars ($HOME, $WORK, etc) +# - load several modules (git) +# Note: We can afford to have two conda environments: one stable for running experiments and one for development. +# If there are new dependencies to install, you have to tell me about them and not do it in this script +source $HOME/start-modelling-metadata-user + +# Folder for the clone of github.com/bigscience-workshop/metadata/ +cd $WORK/repos/metadata/ + +# Command to load the XXX model and tokenizer stored on https://huggingface.co/models +python experiments/jz/utils/loading_script_utils/load_tokenizer_and_model.py \ + model_name=gpt2 # (change me! e.g. gpt2) diff --git a/experiments/jz/templates/SLURM/experiment_example/subexperiment_1/02_load_dataset.slurm b/experiments/jz/templates/SLURM/experiment_example/subexperiment_1/02_load_dataset.slurm new file mode 100644 index 00000000..d6721310 --- /dev/null +++ b/experiments/jz/templates/SLURM/experiment_example/subexperiment_1/02_load_dataset.slurm @@ -0,0 +1,62 @@ +#!/bin/bash +#SBATCH --job-name=modelling-metadata-example-load-dataset # (change me!) job name +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! +#SBATCH --cpus-per-task=8 # (change me! between 0 and 48) number of cores per tasks +#SBATCH --hint=nomultithread # we get physical cores not logical +#SBATCH --gres=gpu:0 # (change me! between 0 and 1) number of gpus +#SBATCH --time 00:10:00 # (change me! between 0 and 20h) maximum execution time (HH:MM:SS) +#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name +#SBATCH --error=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.err # error file name +#SBATCH --account=six@gpu # account +#SBATCH -p compil # partition with internet + + +set -x -e + +# Next line will: +# - load a conda environment with the dependencies on the master branch of github.com/bigscience-workshop/metadata/ +# - setup env vars ($HOME, $WORK, etc) +# - load several modules (git) +# Note: We can afford to have only two conda environments: one stable for running experiments and one for development. 
+# If there are new dependencies to install, you have to tell me about them and not do it in this script +source $HOME/start-modelling-metadata-user + +# For the moment we can't directly use the new dataset feature on JZ which would avoid having to clone the dataset +# repo from the HUB. So the first thing to do is to clone the repo of the XXX dataset if it does not already exist. +HUB_REPO_NAME='SaulLu/Natural_Questions_HTML_Toy' # (change me! e.g. SaulLu/Natural_Questions_HTML_Toy) + +# We define the name of the folder in which the clone will be made +#Define multi-character delimiter +delimiter="/" +#Concatenate the delimiter with the main string +string=$HUB_REPO_NAME$delimiter + +#Split the text based on the delimiter +myarray=() +while [[ $string ]]; do + myarray+=( "${string%%"$delimiter"*}" ) + string=${string#*"$delimiter"} +done +REPO_DIR="${DATASETS_CUSTOM}/${myarray[-1]}" + +# We clone the repo if it doesn't exist +if [[ -d "${REPO_DIR}" ]] +then + echo "${REPO_DIR} already exists on your filesystem." +else + echo "${REPO_DIR} doesn't exists on your filesystem." + cd $DATASETS_CUSTOM/ + git clone "https://huggingface.co/datasets/${HUB_REPO_NAME}" + cd ${REPO_DIR} + git lfs install + git lfs pull origin master +fi + +cd $WORK/repos/sync/metadata/ + +# We check that the dataset can indeed be loaded +python experiments/jz/utils/loading_script_utils/load_dataset.py \ + dataset_name="${REPO_DIR}" \ + train_file="nq-train-*.jsonl.gz" \ # (change me! e.g "nq-train-*.jsonl.gz" or remove arg) + validation_file="nq-dev-*.jsonl.gz" # (change me! e.g. "nq-dev-*.jsonl.gz" or remove arg) diff --git a/experiments/jz/templates/SLURM/experiment_example/subexperiment_1/03_create_dataset.slurm b/experiments/jz/templates/SLURM/experiment_example/subexperiment_1/03_create_dataset.slurm new file mode 100644 index 00000000..6b23d34b --- /dev/null +++ b/experiments/jz/templates/SLURM/experiment_example/subexperiment_1/03_create_dataset.slurm @@ -0,0 +1,57 @@ +#!/bin/bash +#SBATCH --job-name=modelling-metadata-html-metadata-create-dataset # (change me!) job name +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! +#SBATCH --cpus-per-task=20 # (change me! between 0 and 40) number of cores per tasks +#SBATCH --hint=nomultithread # we get physical cores not logical +#SBATCH --time 01:00:00 # (change me! between 0 and 20h) maximum execution time (HH:MM:SS) +#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name +#SBATCH --error=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.err # error file name +#SBATCH --account=six@cpu # account + +set -x -e + +# Next line will: +# - load a conda environment with the dependencies on the master branch of github.com/bigscience-workshop/metadata/ +# - setup env vars ($HOME, $WORK, etc) +# - load several modules (git) +# Note: We can afford to have two conda environments: one stable for running experiments and one for development. +# If there are new dependencies to install, you have to tell me about them and not do it in this script +source $HOME/start-modelling-metadata-user + +# We are on an offline partition +export HF_DATASETS_OFFLINE=1 +export TRANSFORMERS_OFFLINE=1 + +# Folder for the clone of github.com/bigscience-workshop/metadata/ +cd $WORK/repos/metadata/ + +HUB_REPO_NAME='SaulLu/Natural_Questions_HTML_Toy' # (change me! e.g. 
SaulLu/Natural_Questions_HTML_Toy) + +# We define the name of the folder in which the clone will be made +#Define multi-character delimiter +delimiter="/" +#Concatenate the delimiter with the main string +string=$HUB_REPO_NAME$delimiter + +#Split the text based on the delimiter +myarray=() +while [[ $string ]]; do + myarray+=( "${string%%"$delimiter"*}" ) + string=${string#*"$delimiter"} +done +REPO_DIR="${DATASETS_CUSTOM}/${myarray[-1]}" + +# Now we launch the script that will perform the preprocessing of the dataset +# Feel free to add any arguments you like (change me!) +python bsmetadata/train.py \ # (change me! if you have a specific script) + data_config.experiment="with_metadata" \ + data_config.metadata_list=["html"] \ # (change me!) + data_config.max_seq_len=1024 \ + data_config.dataset_name="${REPO_DIR}" \ + data_config.train_file="nq-train-*.jsonl.gz" \ # (change me! e.g "nq-train-0\[0-2\].jsonl.gz" or remove arg) + data_config.validation_file="nq-dev-*.jsonl.gz"\ # (change me! e.g "nq-dev-00.jsonl.gz" or remove arg) + data_config.preprocessing_num_workers=80 \ + out_dir="${SCRATCH}/metadata_outputs" \ + do_train=False \ + do_eval=False \ diff --git a/experiments/jz/templates/SLURM/experiment_example/subexperiment_1/04_do_training.slurm b/experiments/jz/templates/SLURM/experiment_example/subexperiment_1/04_do_training.slurm new file mode 100644 index 00000000..c12c25e0 --- /dev/null +++ b/experiments/jz/templates/SLURM/experiment_example/subexperiment_1/04_do_training.slurm @@ -0,0 +1,66 @@ +#!/bin/bash +#SBATCH --job-name=modelling-metadata-html-XX # (change me!) job name +#SBATCH --ntasks=1 # number of MP tasks +#SBATCH --constraint=v100-16g +#SBATCH --gres=gpu:1 # number of GPUs per node +#SBATCH --cpus-per-task=8 # (change me! between 0 and 40) number of cores per tasks +#SBATCH --hint=nomultithread # we get physical cores not logical +#SBATCH --time 01:00:00 # (change me! between 0 and 20h) maximum execution time (HH:MM:SS) +#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name +#SBATCH --error=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.err # error file name # error file name +#SBATCH --account=six@gpu # account + +set -x -e + +# Next line will: +# - load a conda environment with the dependencies on the master branch of github.com/bigscience-workshop/metadata/ +# - setup env vars ($HOME, $WORK, etc) +# - load several modules (git) +# Note: We can afford to have two conda environments: one stable for running experiments and one for development. +# If there are new dependencies to install, you have to tell me about them and not do it in this script +source $HOME/start-modelling-metadata-user + +# We are on an offline partition +export HF_DATASETS_OFFLINE=1 +export TRANSFORMERS_OFFLINE=1 +# be careful about the cache folder for Wandb +export WANDB_MODE=offline +export WANDB_DIR=$SCRATCH + +# Folder for the clone of github.com/bigscience-workshop/metadata/ +cd $WORK/repos/metadata/ + +HUB_REPO_NAME='XXX' # (change me! e.g. SaulLu/Natural_Questions_HTML_Toy) + +# We define the name of the folder in which the clone will be made +#Define multi-character delimiter +delimiter="/" +#Concatenate the delimiter with the main string +string=$HUB_REPO_NAME$delimiter + +#Split the text based on the delimiter +myarray=() +while [[ $string ]]; do + myarray+=( "${string%%"$delimiter"*}" ) + string=${string#*"$delimiter"} +done +REPO_DIR="${DATASETS_CUSTOM}/${myarray[-1]}" + +python bsmetadata/train.py \ # (change me! 
if you have a specific script) + data_config.experiment="with_metadata" \ + data_config.metadata_list=["html"] \ + data_config.max_seq_len=1024 \ + data_config.dataset_name="${REPO_DIR}" \ + data_config.train_file="nq-train-*.jsonl.gz" \ # (change me! e.g "nq-train-0\[0-2\].jsonl.gz" or remove arg) + data_config.validation_file="nq-dev-*.jsonl.gz"\ # (change me! e.g "nq-dev-00.jsonl.gz" or remove arg) + data_config.preprocessing_num_workers=80 \ + data_config.per_device_eval_batch_size=3 \ + data_config.per_device_train_batch_size=3 \ + out_dir="${SCRATCH}/metadata_outputs/${SLURM_JOB_ID}" \ + do_train=True \ + do_eval=True \ + evaluation_strategy=STEPS \ + eval_steps=10 \ + save_strategy=STEPS \ + save_steps=10 \ + gradient_accumulation_steps=50\ # (change me!) diff --git a/experiments/jz/templates/SLURM/experiment_example/subexperiment_1/multi_steps.bash b/experiments/jz/templates/SLURM/experiment_example/subexperiment_1/multi_steps.bash new file mode 100644 index 00000000..ac06bb86 --- /dev/null +++ b/experiments/jz/templates/SLURM/experiment_example/subexperiment_1/multi_steps.bash @@ -0,0 +1,4 @@ +JID_JOB1=$(sbatch 01_load_tokenizer_and_model.slurm | cut -d " " -f 4) +JID_JOB2=$(sbatch --dependency=afterok:$JID_JOB1 02_load_dataset.slurm | cut -d " " -f 4) +JID_JOB3=$(sbatch --dependency=afterok:$JID_JOB2 03_create_dataset.slurm | cut -d " " -f 4) +sbatch --dependency=afterok:$JID_JOB3 04_do_training.slurm diff --git a/experiments/jz_template/SLURM/experiment_template/01_load_tokenizer_and_model.slurm b/experiments/jz/templates/SLURM/experiment_template/01_load_tokenizer_and_model.slurm similarity index 93% rename from experiments/jz_template/SLURM/experiment_template/01_load_tokenizer_and_model.slurm rename to experiments/jz/templates/SLURM/experiment_template/01_load_tokenizer_and_model.slurm index 37ae21c9..b402a379 100644 --- a/experiments/jz_template/SLURM/experiment_template/01_load_tokenizer_and_model.slurm +++ b/experiments/jz/templates/SLURM/experiment_template/01_load_tokenizer_and_model.slurm @@ -25,5 +25,5 @@ source $HOME/start-modelling-metadata-user cd $WORK/repos/metadata/ # Command to load the XXX model and tokenizer stored on https://huggingface.co/models -python experiments/jz_template/SLURM/loading_script_utils/load_tokenizer_and_model.py \ - model_name=XXX +python experiments/jz/utils/loading_script_utils/load_tokenizer_and_model.py \ + model_name=XXX # (change me! e.g. gpt2) diff --git a/experiments/jz/templates/SLURM/experiment_template/02_load_dataset.slurm b/experiments/jz/templates/SLURM/experiment_template/02_load_dataset.slurm new file mode 100644 index 00000000..5edc9faa --- /dev/null +++ b/experiments/jz/templates/SLURM/experiment_template/02_load_dataset.slurm @@ -0,0 +1,62 @@ +#!/bin/bash +#SBATCH --job-name=modelling-metadata-XX # (change me!) job name +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! +#SBATCH --cpus-per-task=8 # (change me! between 0 and 48) number of cores per tasks +#SBATCH --hint=nomultithread # we get physical cores not logical +#SBATCH --gres=gpu:0 # (change me! between 0 and 1) number of gpus +#SBATCH --time 00:10:00 # (change me! 
between 0 and 20h) maximum execution time (HH:MM:SS) +#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name +#SBATCH --error=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.err # error file name +#SBATCH --account=six@gpu # account +#SBATCH -p compil # partition with internet + + +set -x -e + +# Next line will: +# - load a conda environment with the dependencies on the master branch of github.com/bigscience-workshop/metadata/ +# - setup env vars ($HOME, $WORK, etc) +# - load several modules (git) +# Note: We can afford to have two conda environments: one stable for running experiments and one for development. +# If there are new dependencies to install, you have to tell me about them and not do it in this script +source $HOME/start-modelling-metadata-user + +# For the moment we can't directly use the new dataset feature on JZ which would avoid having to clone the dataset +# repo from the HUB. So the first thing to do is to clone the repo of the XXX dataset if it does not already exist. +HUB_REPO_NAME='XXX' # (change me! e.g. SaulLu/Natural_Questions_HTML_Toy) + +# We define the name of the folder in which the clone will be made +#Define multi-character delimiter +delimiter="/" +#Concatenate the delimiter with the main string +string=$HUB_REPO_NAME$delimiter + +#Split the text based on the delimiter +myarray=() +while [[ $string ]]; do + myarray+=( "${string%%"$delimiter"*}" ) + string=${string#*"$delimiter"} +done +REPO_DIR="${DATASETS_CUSTOM}/${myarray[-1]}" + +# We clone the repo if it doesn't exist +if [[ -d "${REPO_DIR}" ]] +then + echo "${REPO_DIR} already exists on your filesystem." +else + echo "${REPO_DIR} doesn't exists on your filesystem." + cd $DATASETS_CUSTOM/ + git clone "https://huggingface.co/datasets/${HUB_REPO_NAME}" + cd ${REPO_DIR} + git lfs install + git lfs pull origin master +fi + +cd $WORK/repos/sync/metadata/ + +# We check that the dataset can indeed be loaded +python experiments/jz/utils/loading_script_utils/load_dataset.py \ + dataset_name="${REPO_DIR}" \ + train_file="XXX" \ # (change me! e.g "nq-train-*.jsonl.gz" or remove arg) + validation_file="XXX" # (change me! e.g. "nq-dev-*.jsonl.gz" or remove arg) diff --git a/experiments/jz/templates/SLURM/experiment_template/03_create_dataset.slurm b/experiments/jz/templates/SLURM/experiment_template/03_create_dataset.slurm new file mode 100644 index 00000000..4afe7647 --- /dev/null +++ b/experiments/jz/templates/SLURM/experiment_template/03_create_dataset.slurm @@ -0,0 +1,57 @@ +#!/bin/bash +#SBATCH --job-name=modelling-metadata-html-XX # (change me!) job name +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! +#SBATCH --cpus-per-task=20 # (change me! between 0 and 40) number of cores per tasks +#SBATCH --hint=nomultithread # we get physical cores not logical +#SBATCH --time 01:00:00 # (change me! between 0 and 20h) maximum execution time (HH:MM:SS) +#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name +#SBATCH --error=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.err # error file name +#SBATCH --account=six@cpu # account + +set -x -e + +# Next line will: +# - load a conda environment with the dependencies on the master branch of github.com/bigscience-workshop/metadata/ +# - setup env vars ($HOME, $WORK, etc) +# - load several modules (git) +# Note: We can afford to have two conda environments: one stable for running experiments and one for development. 
+# If there are new dependencies to install, you have to tell me about them and not do it in this script +source $HOME/start-modelling-metadata-user + +# We are on an offline partition +export HF_DATASETS_OFFLINE=1 +export TRANSFORMERS_OFFLINE=1 + +# Folder for the clone of github.com/bigscience-workshop/metadata/ +cd $WORK/repos/metadata/ + +HUB_REPO_NAME='XXX' # (change me! e.g. SaulLu/Natural_Questions_HTML_Toy) + +# We define the name of the folder in which the clone will be made +#Define multi-character delimiter +delimiter="/" +#Concatenate the delimiter with the main string +string=$HUB_REPO_NAME$delimiter + +#Split the text based on the delimiter +myarray=() +while [[ $string ]]; do + myarray+=( "${string%%"$delimiter"*}" ) + string=${string#*"$delimiter"} +done +REPO_DIR="${DATASETS_CUSTOM}/${myarray[-1]}" + +# Now we launch the script that will perform the preprocessing of the dataset +# Feel free to add any arguments you like (change me!) +python bsmetadata/train.py \ # (change me! if you have a specific script) + data_config.experiment="with_metadata" \ + data_config.metadata_list=["html"] \ # (change me!) + data_config.max_seq_len=1024 \ + data_config.dataset_name="${REPO_DIR}" \ + data_config.train_file="XXX" \ # (change me! e.g "nq-train-0\[0-2\].jsonl.gz" or remove arg) + data_config.validation_file"XXX" \ # (change me! e.g "nq-dev-00.jsonl.gz" or remove arg) + data_config.preprocessing_num_workers=80 \ + out_dir="${SCRATCH}/metadata_outputs" \ + do_train=False \ + do_eval=False \ diff --git a/experiments/jz/templates/SLURM/experiment_template/04_do_training.slurm b/experiments/jz/templates/SLURM/experiment_template/04_do_training.slurm new file mode 100644 index 00000000..fdc281ad --- /dev/null +++ b/experiments/jz/templates/SLURM/experiment_template/04_do_training.slurm @@ -0,0 +1,65 @@ +#!/bin/bash +#SBATCH --job-name=modelling-metadata-html-XX # (change me!) job name +#SBATCH --ntasks=1 # number of MP tasks +#SBATCH --constraint=v100-16g +#SBATCH --cpus-per-task=8 # (change me! between 0 and 40) number of cores per tasks +#SBATCH --hint=nomultithread # we get physical cores not logical +#SBATCH --time 01:00:00 # (change me! between 0 and 20h) maximum execution time (HH:MM:SS) +#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name +#SBATCH --error=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.err # error file name # error file name +#SBATCH --account=six@gpu # account + +set -x -e + +# Next line will: +# - load a conda environment with the dependencies on the master branch of github.com/bigscience-workshop/metadata/ +# - setup env vars ($HOME, $WORK, etc) +# - load several modules (git) +# Note: We can afford to have two conda environments: one stable for running experiments and one for development. +# If there are new dependencies to install, you have to tell me about them and not do it in this script +source $HOME/start-modelling-metadata-user + +# We are on an offline partition +export HF_DATASETS_OFFLINE=1 +export TRANSFORMERS_OFFLINE=1 +# be careful about the cache folder for Wandb +export WANDB_MODE=offline +export WANDB_DIR=$SCRATCH + +# Folder for the clone of github.com/bigscience-workshop/metadata/ +cd $WORK/repos/metadata/ + +HUB_REPO_NAME='XXX' # (change me! e.g. 
SaulLu/Natural_Questions_HTML_Toy) + +# We define the name of the folder in which the clone will be made +#Define multi-character delimiter +delimiter="/" +#Concatenate the delimiter with the main string +string=$HUB_REPO_NAME$delimiter + +#Split the text based on the delimiter +myarray=() +while [[ $string ]]; do + myarray+=( "${string%%"$delimiter"*}" ) + string=${string#*"$delimiter"} +done +REPO_DIR="${DATASETS_CUSTOM}/${myarray[-1]}" + +python bsmetadata/train.py \ # (change me! if you have a specific script) + data_config.experiment="with_metadata" \ + data_config.metadata_list=["html"] \ + data_config.max_seq_len=1024 \ + data_config.dataset_name="${REPO_DIR}" \ + data_config.train_file="XXX" \ # (change me! e.g "nq-train-0\[0-2\].jsonl.gz" or remove arg) + data_config.validation_file"XXX" \ # (change me! e.g "nq-dev-00.jsonl.gz" or remove arg) + data_config.preprocessing_num_workers=80 \ + data_config.per_device_eval_batch_size=3 \ + data_config.per_device_train_batch_size=3 \ + out_dir="${SCRATCH}/metadata_outputs/${SLURM_JOB_ID}" \ + do_train=True \ + do_eval=True \ + evaluation_strategy=STEPS \ + eval_steps=10 \ + save_strategy=STEPS \ + save_steps=10 \ + gradient_accumulation_steps=50\ # (change me!) diff --git a/experiments/jz_template/SLURM/experiment_template/README.md b/experiments/jz/templates/SLURM/experiment_template/README.md similarity index 78% rename from experiments/jz_template/SLURM/experiment_template/README.md rename to experiments/jz/templates/SLURM/experiment_template/README.md index f0732a86..8cfa4464 100644 --- a/experiments/jz_template/SLURM/experiment_template/README.md +++ b/experiments/jz/templates/SLURM/experiment_template/README.md @@ -7,3 +7,7 @@ 3. Preprocess the dataset on a cpu-only partition 4. Run the training on a gpu 16gb partition +Are you downloading a new: +- model ? +- tokenizer ? +- dataset ? 
\ No newline at end of file diff --git a/experiments/jz/templates/SLURM/experiment_template/multi_steps.bash b/experiments/jz/templates/SLURM/experiment_template/multi_steps.bash new file mode 100644 index 00000000..ac06bb86 --- /dev/null +++ b/experiments/jz/templates/SLURM/experiment_template/multi_steps.bash @@ -0,0 +1,4 @@ +JID_JOB1=$(sbatch 01_load_tokenizer_and_model.slurm | cut -d " " -f 4) +JID_JOB2=$(sbatch --dependency=afterok:$JID_JOB1 02_load_dataset.slurm | cut -d " " -f 4) +JID_JOB3=$(sbatch --dependency=afterok:$JID_JOB2 03_create_dataset.slurm | cut -d " " -f 4) +sbatch --dependency=afterok:$JID_JOB3 04_do_training.slurm diff --git a/experiments/jz_template/SLURM/loading_script_utils/load_dataset.py b/experiments/jz/utils/loading_script_utils/load_dataset.py similarity index 100% rename from experiments/jz_template/SLURM/loading_script_utils/load_dataset.py rename to experiments/jz/utils/loading_script_utils/load_dataset.py diff --git a/experiments/jz_template/SLURM/loading_script_utils/load_tokenizer_and_model.py b/experiments/jz/utils/loading_script_utils/load_tokenizer_and_model.py similarity index 100% rename from experiments/jz_template/SLURM/loading_script_utils/load_tokenizer_and_model.py rename to experiments/jz/utils/loading_script_utils/load_tokenizer_and_model.py diff --git a/experiments/jz/utils/sync_wandb.slurm b/experiments/jz/utils/sync_wandb.slurm new file mode 100644 index 00000000..1922e3a3 --- /dev/null +++ b/experiments/jz/utils/sync_wandb.slurm @@ -0,0 +1,24 @@ +#!/bin/bash +#SBATCH --job-name=modelling-metadata-sync-wandb # job name +#SBATCH --ntasks=1 # number of MP tasks +#SBATCH --nodes=1 # number of nodes +#SBATCH --cpus-per-task=1 # number of cores per task +#SBATCH --hint=nomultithread # we get physical cores not logical +#SBATCH --time=2:00:00 # maximum execution time (HH:MM:SS) +#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name +#SBATCH --partition=compil +#SBATCH --account=six@cpu + +echo "START TIME: $(date)" + +source $HOME/start-user + +cd ${SCRATCH} + +while true +do + wandb sync --sync-all + sleep 30 +done + +echo "END TIME: $(date)" \ No newline at end of file diff --git a/experiments/jz_template/SLURM/experiment_example/01_load_tokenizer_and_model.slurm b/experiments/jz_template/SLURM/experiment_example/01_load_tokenizer_and_model.slurm deleted file mode 100644 index abcc05de..00000000 --- a/experiments/jz_template/SLURM/experiment_example/01_load_tokenizer_and_model.slurm +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash -#SBATCH --job-name=modelling-metadata-html-download-tokenizer-and-model # job name -#SBATCH --nodes=1 -#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! 
-#SBATCH --cpus-per-task=8 # number of cores per tasks -#SBATCH --hint=nomultithread # we get physical cores not logical -#SBATCH --gres=gpu:0 # number of gpus -#SBATCH --time 00:10:00 # maximum execution time (HH:MM:SS) -#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name -#SBATCH --error=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.err # error file name -#SBATCH --account=six@gpu # account -#SBATCH -p compil # partition with internet - -set -x -e - -source $HOME/start-user - -cd $WORK/repos/sync/metadata/ - -python experiments/html/SLURM/loading_scripts/load_tokenizer_and_model.py \ - model_name=gpt2 \ diff --git a/experiments/jz_template/SLURM/experiment_example/02_load_dataset.slurm b/experiments/jz_template/SLURM/experiment_example/02_load_dataset.slurm deleted file mode 100644 index f657f1fb..00000000 --- a/experiments/jz_template/SLURM/experiment_example/02_load_dataset.slurm +++ /dev/null @@ -1,30 +0,0 @@ -#!/bin/bash -#SBATCH --job-name=modelling-metadata-html-download-dataset-test # job name -#SBATCH --nodes=1 -#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! -#SBATCH --cpus-per-task=8 # number of cores per tasks -#SBATCH --hint=nomultithread # we get physical cores not logical -#SBATCH --gres=gpu:0 # number of gpus -#SBATCH --time 02:30:00 # maximum execution time (HH:MM:SS) -#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name -#SBATCH --error=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.err # error file name -#SBATCH --account=six@gpu # account -#SBATCH -p compil # partition with internet - -set -x -e - -source $HOME/start-user - -# Uncomment if the repo doesn't exist -# cd $DATASETS_CUSTOM/ -# git clone https://huggingface.co/datasets/SaulLu/Natural_Questions_HTML -# cd Natural_Questions_HTML_Toy/ -# git lfs install -# git lfs pull origin master - -cd $WORK/repos/sync/metadata/ - -python experiments/html/SLURM/init_experiment/load_dataset.py \ - dataset_name="${DATASETS_CUSTOM}/Natural_Questions_HTML" \ - train_file="nq-train-*.jsonl.gz" \ - validation_file="nq-dev-*.jsonl.gz" diff --git a/experiments/jz_template/SLURM/experiment_example/03_create_dataset.slurm b/experiments/jz_template/SLURM/experiment_example/03_create_dataset.slurm deleted file mode 100644 index adb290c3..00000000 --- a/experiments/jz_template/SLURM/experiment_example/03_create_dataset.slurm +++ /dev/null @@ -1,34 +0,0 @@ -#!/bin/bash -#SBATCH --job-name=modelling-metadata-html-create-dataset-test # job name -#SBATCH --nodes=1 -#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! -#SBATCH --cpus-per-task=40 # number of cores per tasks -#SBATCH --hint=nomultithread # we get physical cores not logical -#SBATCH --time 01:00:00 # maximum execution time (HH:MM:SS) -#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name -#SBATCH --error=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.err # error file name -#SBATCH --account=six@cpu # account - -set -x -e - -source $HOME/start-user - -export HF_DATASETS_OFFLINE=1 -export TRANSFORMERS_OFFLINE=1 - -cd $WORK/repos/sync/metadata/ - -pip install . 
- -python experiments/html/start_training.py \ - data_config.experiment="with_metadata_and_baseline_val" \ - data_config.metadata_list=["html"] \ - data_config.max_seq_len=1024 \ - data_config.dataset_name="${DATASETS_CUSTOM}/Natural_Questions_HTML" \ - data_config.train_file="nq-train-0\[0-2\].jsonl.gz" \ - data_config.validation_file="nq-dev-00.jsonl.gz" \ - data_config.extension="json" \ - data_config.preprocessing_num_workers=80 \ - out_dir="${SCRATCH}/metadata_outputs" \ - do_train=False \ - do_eval=False \ diff --git a/experiments/jz_template/SLURM/experiment_example/04_do_training.slurm b/experiments/jz_template/SLURM/experiment_example/04_do_training.slurm deleted file mode 100644 index 69fa96c6..00000000 --- a/experiments/jz_template/SLURM/experiment_example/04_do_training.slurm +++ /dev/null @@ -1,43 +0,0 @@ -#!/bin/bash -#SBATCH --job-name=modelling-metadata-html-do-train-test # job name -#SBATCH --ntasks=1 # number of MP tasks -#SBATCH --constraint=v100-16g -#SBATCH --gres=gpu:1 # number of GPUs per node -#SBATCH --cpus-per-task=8 # number of cores per tasks -#SBATCH --hint=nomultithread # we get physical cores not logical -#SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS) -#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name -#SBATCH --error=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.err # error file name -#SBATCH --account=six@gpu # account - -set -x -e - -source $HOME/start-user - -export HF_DATASETS_OFFLINE=1 -export TRANSFORMERS_OFFLINE=1 -# be careful about the cache folder for Wandb -export WANDB_MODE=offline -export WANDB_DIR=$SCRATCH - -cd $WORK/repos/sync/metadata/ - -python experiments/html/start_training.py \ - data_config.experiment="with_metadata_and_baseline_val" \ - data_config.metadata_list=["html"] \ - data_config.max_seq_len=1024 \ - data_config.dataset_name="${DATASETS_CUSTOM}/Natural_Questions_HTML" \ - data_config.train_file="nq-train-0\[0-2\].jsonl.gz" \ - data_config.validation_file="nq-dev-00.jsonl.gz" \ - data_config.extension="json" \ - data_config.preprocessing_num_workers=80 \ - data_config.per_device_eval_batch_size=3 \ - data_config.per_device_train_batch_size=3 \ - out_dir="${SCRATCH}/metadata_outputs/${SLURM_JOB_ID}" \ - do_train=True \ - do_eval=True \ - evaluation_strategy=STEPS \ - eval_steps=10 \ - save_strategy=STEPS \ - save_steps=10 \ - gradient_accumulation_steps=50\ diff --git a/experiments/jz_template/SLURM/experiment_example/multi_steps.bash b/experiments/jz_template/SLURM/experiment_example/multi_steps.bash deleted file mode 100644 index 16775a3f..00000000 --- a/experiments/jz_template/SLURM/experiment_example/multi_steps.bash +++ /dev/null @@ -1,2 +0,0 @@ -JID_JOB1=$(sbatch create_dataset.slurm | cut -d " " -f 4) -sbatch --dependency=afterok:$JID_JOB1 do_training.slurm diff --git a/experiments/jz_template/SLURM/experiment_template/02_load_dataset.slurm b/experiments/jz_template/SLURM/experiment_template/02_load_dataset.slurm deleted file mode 100644 index f657f1fb..00000000 --- a/experiments/jz_template/SLURM/experiment_template/02_load_dataset.slurm +++ /dev/null @@ -1,30 +0,0 @@ -#!/bin/bash -#SBATCH --job-name=modelling-metadata-html-download-dataset-test # job name -#SBATCH --nodes=1 -#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! 
-#SBATCH --cpus-per-task=8 # number of cores per tasks -#SBATCH --hint=nomultithread # we get physical cores not logical -#SBATCH --gres=gpu:0 # number of gpus -#SBATCH --time 02:30:00 # maximum execution time (HH:MM:SS) -#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name -#SBATCH --error=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.err # error file name -#SBATCH --account=six@gpu # account -#SBATCH -p compil # partition with internet - -set -x -e - -source $HOME/start-user - -# Uncomment if the repo doesn't exist -# cd $DATASETS_CUSTOM/ -# git clone https://huggingface.co/datasets/SaulLu/Natural_Questions_HTML -# cd Natural_Questions_HTML_Toy/ -# git lfs install -# git lfs pull origin master - -cd $WORK/repos/sync/metadata/ - -python experiments/html/SLURM/init_experiment/load_dataset.py \ - dataset_name="${DATASETS_CUSTOM}/Natural_Questions_HTML" \ - train_file="nq-train-*.jsonl.gz" \ - validation_file="nq-dev-*.jsonl.gz" diff --git a/experiments/jz_template/SLURM/experiment_template/03_create_dataset.slurm b/experiments/jz_template/SLURM/experiment_template/03_create_dataset.slurm deleted file mode 100644 index adb290c3..00000000 --- a/experiments/jz_template/SLURM/experiment_template/03_create_dataset.slurm +++ /dev/null @@ -1,34 +0,0 @@ -#!/bin/bash -#SBATCH --job-name=modelling-metadata-html-create-dataset-test # job name -#SBATCH --nodes=1 -#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! -#SBATCH --cpus-per-task=40 # number of cores per tasks -#SBATCH --hint=nomultithread # we get physical cores not logical -#SBATCH --time 01:00:00 # maximum execution time (HH:MM:SS) -#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name -#SBATCH --error=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.err # error file name -#SBATCH --account=six@cpu # account - -set -x -e - -source $HOME/start-user - -export HF_DATASETS_OFFLINE=1 -export TRANSFORMERS_OFFLINE=1 - -cd $WORK/repos/sync/metadata/ - -pip install . 
- -python experiments/html/start_training.py \ - data_config.experiment="with_metadata_and_baseline_val" \ - data_config.metadata_list=["html"] \ - data_config.max_seq_len=1024 \ - data_config.dataset_name="${DATASETS_CUSTOM}/Natural_Questions_HTML" \ - data_config.train_file="nq-train-0\[0-2\].jsonl.gz" \ - data_config.validation_file="nq-dev-00.jsonl.gz" \ - data_config.extension="json" \ - data_config.preprocessing_num_workers=80 \ - out_dir="${SCRATCH}/metadata_outputs" \ - do_train=False \ - do_eval=False \ diff --git a/experiments/jz_template/SLURM/experiment_template/04_do_training.slurm b/experiments/jz_template/SLURM/experiment_template/04_do_training.slurm deleted file mode 100644 index 69fa96c6..00000000 --- a/experiments/jz_template/SLURM/experiment_template/04_do_training.slurm +++ /dev/null @@ -1,43 +0,0 @@ -#!/bin/bash -#SBATCH --job-name=modelling-metadata-html-do-train-test # job name -#SBATCH --ntasks=1 # number of MP tasks -#SBATCH --constraint=v100-16g -#SBATCH --gres=gpu:1 # number of GPUs per node -#SBATCH --cpus-per-task=8 # number of cores per tasks -#SBATCH --hint=nomultithread # we get physical cores not logical -#SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS) -#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name -#SBATCH --error=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.err # error file name -#SBATCH --account=six@gpu # account - -set -x -e - -source $HOME/start-user - -export HF_DATASETS_OFFLINE=1 -export TRANSFORMERS_OFFLINE=1 -# be careful about the cache folder for Wandb -export WANDB_MODE=offline -export WANDB_DIR=$SCRATCH - -cd $WORK/repos/sync/metadata/ - -python experiments/html/start_training.py \ - data_config.experiment="with_metadata_and_baseline_val" \ - data_config.metadata_list=["html"] \ - data_config.max_seq_len=1024 \ - data_config.dataset_name="${DATASETS_CUSTOM}/Natural_Questions_HTML" \ - data_config.train_file="nq-train-0\[0-2\].jsonl.gz" \ - data_config.validation_file="nq-dev-00.jsonl.gz" \ - data_config.extension="json" \ - data_config.preprocessing_num_workers=80 \ - data_config.per_device_eval_batch_size=3 \ - data_config.per_device_train_batch_size=3 \ - out_dir="${SCRATCH}/metadata_outputs/${SLURM_JOB_ID}" \ - do_train=True \ - do_eval=True \ - evaluation_strategy=STEPS \ - eval_steps=10 \ - save_strategy=STEPS \ - save_steps=10 \ - gradient_accumulation_steps=50\ diff --git a/experiments/jz_template/SLURM/experiment_template/multi_steps.bash b/experiments/jz_template/SLURM/experiment_template/multi_steps.bash deleted file mode 100644 index 16775a3f..00000000 --- a/experiments/jz_template/SLURM/experiment_template/multi_steps.bash +++ /dev/null @@ -1,2 +0,0 @@ -JID_JOB1=$(sbatch create_dataset.slurm | cut -d " " -f 4) -sbatch --dependency=afterok:$JID_JOB1 do_training.slurm From eadefbe3e90ba603c49feb12391e06ff33206c71 Mon Sep 17 00:00:00 2001 From: SaulLu Date: Tue, 7 Sep 2021 17:43:28 +0200 Subject: [PATCH 73/84] fix comments in command --- .../01_load_tokenizer_and_model.slurm | 8 ++-- .../subexperiment_1/02_load_dataset.slurm | 20 +++++----- .../subexperiment_1/03_create_dataset.slurm | 26 ++++++------ .../subexperiment_1/04_do_training.slurm | 40 +++++++++---------- 4 files changed, 46 insertions(+), 48 deletions(-) diff --git a/experiments/jz/templates/SLURM/experiment_example/subexperiment_1/01_load_tokenizer_and_model.slurm b/experiments/jz/templates/SLURM/experiment_example/subexperiment_1/01_load_tokenizer_and_model.slurm index 8544be60..e0af2b19 100644 --- 
a/experiments/jz/templates/SLURM/experiment_example/subexperiment_1/01_load_tokenizer_and_model.slurm +++ b/experiments/jz/templates/SLURM/experiment_example/subexperiment_1/01_load_tokenizer_and_model.slurm @@ -1,11 +1,11 @@ #!/bin/bash -#SBATCH --job-name=modelling-metadata-example-load-model-and-tokenizer # (change me!) job name +#SBATCH --job-name=modelling-metadata-example-load-model-and-tokenizer # (change me!) job name #SBATCH --nodes=1 #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! -#SBATCH --cpus-per-task=8 # (change me! between 0 and 48) number of cores per tasks +#SBATCH --cpus-per-task=8 # (change me! between 0 and 48) number of cores per tasks #SBATCH --hint=nomultithread # we get physical cores not logical -#SBATCH --gres=gpu:0 # (change me! between 0 and 1) number of gpus -#SBATCH --time 00:10:00 # (change me! between 0 and 20h) maximum execution time (HH:MM:SS) +#SBATCH --gres=gpu:0 # (change me! between 0 and 1) number of gpus +#SBATCH --time 00:10:00 # (change me! between 0 and 20h) maximum execution time (HH:MM:SS) #SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name #SBATCH --error=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.err # error file name #SBATCH --account=six@gpu # account diff --git a/experiments/jz/templates/SLURM/experiment_example/subexperiment_1/02_load_dataset.slurm b/experiments/jz/templates/SLURM/experiment_example/subexperiment_1/02_load_dataset.slurm index d6721310..bd02527d 100644 --- a/experiments/jz/templates/SLURM/experiment_example/subexperiment_1/02_load_dataset.slurm +++ b/experiments/jz/templates/SLURM/experiment_example/subexperiment_1/02_load_dataset.slurm @@ -1,17 +1,16 @@ #!/bin/bash -#SBATCH --job-name=modelling-metadata-example-load-dataset # (change me!) job name +#SBATCH --job-name=modelling-metadata-example-load-dataset # (change me!) job name #SBATCH --nodes=1 #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! -#SBATCH --cpus-per-task=8 # (change me! between 0 and 48) number of cores per tasks +#SBATCH --cpus-per-task=8 # (change me! between 0 and 48) number of cores per tasks #SBATCH --hint=nomultithread # we get physical cores not logical -#SBATCH --gres=gpu:0 # (change me! between 0 and 1) number of gpus -#SBATCH --time 00:10:00 # (change me! between 0 and 20h) maximum execution time (HH:MM:SS) +#SBATCH --gres=gpu:0 # (change me! between 0 and 1) number of gpus +#SBATCH --time 00:10:00 # (change me! between 0 and 20h) maximum execution time (HH:MM:SS) #SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name #SBATCH --error=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.err # error file name #SBATCH --account=six@gpu # account #SBATCH -p compil # partition with internet - set -x -e # Next line will: @@ -35,14 +34,13 @@ string=$HUB_REPO_NAME$delimiter #Split the text based on the delimiter myarray=() while [[ $string ]]; do - myarray+=( "${string%%"$delimiter"*}" ) - string=${string#*"$delimiter"} + myarray+=("${string%%"$delimiter"*}") + string=${string#*"$delimiter"} done REPO_DIR="${DATASETS_CUSTOM}/${myarray[-1]}" # We clone the repo if it doesn't exist -if [[ -d "${REPO_DIR}" ]] -then +if [[ -d "${REPO_DIR}" ]]; then echo "${REPO_DIR} already exists on your filesystem." else echo "${REPO_DIR} doesn't exists on your filesystem." 
@@ -58,5 +56,5 @@ cd $WORK/repos/sync/metadata/ # We check that the dataset can indeed be loaded python experiments/jz/utils/loading_script_utils/load_dataset.py \ dataset_name="${REPO_DIR}" \ - train_file="nq-train-*.jsonl.gz" \ # (change me! e.g "nq-train-*.jsonl.gz" or remove arg) - validation_file="nq-dev-*.jsonl.gz" # (change me! e.g. "nq-dev-*.jsonl.gz" or remove arg) + train_file="nq-train-*.jsonl.gz" \ + validation_file="nq-dev-*.jsonl.gz" diff --git a/experiments/jz/templates/SLURM/experiment_example/subexperiment_1/03_create_dataset.slurm b/experiments/jz/templates/SLURM/experiment_example/subexperiment_1/03_create_dataset.slurm index 6b23d34b..8be44449 100644 --- a/experiments/jz/templates/SLURM/experiment_example/subexperiment_1/03_create_dataset.slurm +++ b/experiments/jz/templates/SLURM/experiment_example/subexperiment_1/03_create_dataset.slurm @@ -4,7 +4,7 @@ #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! #SBATCH --cpus-per-task=20 # (change me! between 0 and 40) number of cores per tasks #SBATCH --hint=nomultithread # we get physical cores not logical -#SBATCH --time 01:00:00 # (change me! between 0 and 20h) maximum execution time (HH:MM:SS) +#SBATCH --time 01:00:00 # (change me! between 0 and 20h) maximum execution time (HH:MM:SS) #SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name #SBATCH --error=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.err # error file name #SBATCH --account=six@cpu # account @@ -37,21 +37,21 @@ string=$HUB_REPO_NAME$delimiter #Split the text based on the delimiter myarray=() while [[ $string ]]; do - myarray+=( "${string%%"$delimiter"*}" ) + myarray+=("${string%%"$delimiter"*}") string=${string#*"$delimiter"} done REPO_DIR="${DATASETS_CUSTOM}/${myarray[-1]}" # Now we launch the script that will perform the preprocessing of the dataset # Feel free to add any arguments you like (change me!) -python bsmetadata/train.py \ # (change me! if you have a specific script) - data_config.experiment="with_metadata" \ - data_config.metadata_list=["html"] \ # (change me!) - data_config.max_seq_len=1024 \ - data_config.dataset_name="${REPO_DIR}" \ - data_config.train_file="nq-train-*.jsonl.gz" \ # (change me! e.g "nq-train-0\[0-2\].jsonl.gz" or remove arg) - data_config.validation_file="nq-dev-*.jsonl.gz"\ # (change me! e.g "nq-dev-00.jsonl.gz" or remove arg) - data_config.preprocessing_num_workers=80 \ - out_dir="${SCRATCH}/metadata_outputs" \ - do_train=False \ - do_eval=False \ +python bsmetadata/train.py \ + data_config.experiment="with_metadata" \ + data_config.metadata_list=["html"] \ + data_config.max_seq_len=1024 \ + data_config.dataset_name="${REPO_DIR}" \ + data_config.train_file="nq-train-*.jsonl.gz" \ + data_config.validation_file="nq-dev-*.jsonl.gz" \ + data_config.preprocessing_num_workers=80 \ + out_dir="${SCRATCH}/metadata_outputs" \ + do_train=False \ + do_eval=False diff --git a/experiments/jz/templates/SLURM/experiment_example/subexperiment_1/04_do_training.slurm b/experiments/jz/templates/SLURM/experiment_example/subexperiment_1/04_do_training.slurm index c12c25e0..227e512e 100644 --- a/experiments/jz/templates/SLURM/experiment_example/subexperiment_1/04_do_training.slurm +++ b/experiments/jz/templates/SLURM/experiment_example/subexperiment_1/04_do_training.slurm @@ -5,7 +5,7 @@ #SBATCH --gres=gpu:1 # number of GPUs per node #SBATCH --cpus-per-task=8 # (change me! 
between 0 and 40) number of cores per tasks #SBATCH --hint=nomultithread # we get physical cores not logical -#SBATCH --time 01:00:00 # (change me! between 0 and 20h) maximum execution time (HH:MM:SS) +#SBATCH --time 01:00:00 # (change me! between 0 and 20h) maximum execution time (HH:MM:SS) #SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name #SBATCH --error=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.err # error file name # error file name #SBATCH --account=six@gpu # account @@ -41,26 +41,26 @@ string=$HUB_REPO_NAME$delimiter #Split the text based on the delimiter myarray=() while [[ $string ]]; do - myarray+=( "${string%%"$delimiter"*}" ) + myarray+=("${string%%"$delimiter"*}") string=${string#*"$delimiter"} done REPO_DIR="${DATASETS_CUSTOM}/${myarray[-1]}" -python bsmetadata/train.py \ # (change me! if you have a specific script) - data_config.experiment="with_metadata" \ - data_config.metadata_list=["html"] \ - data_config.max_seq_len=1024 \ - data_config.dataset_name="${REPO_DIR}" \ - data_config.train_file="nq-train-*.jsonl.gz" \ # (change me! e.g "nq-train-0\[0-2\].jsonl.gz" or remove arg) - data_config.validation_file="nq-dev-*.jsonl.gz"\ # (change me! e.g "nq-dev-00.jsonl.gz" or remove arg) - data_config.preprocessing_num_workers=80 \ - data_config.per_device_eval_batch_size=3 \ - data_config.per_device_train_batch_size=3 \ - out_dir="${SCRATCH}/metadata_outputs/${SLURM_JOB_ID}" \ - do_train=True \ - do_eval=True \ - evaluation_strategy=STEPS \ - eval_steps=10 \ - save_strategy=STEPS \ - save_steps=10 \ - gradient_accumulation_steps=50\ # (change me!) +python bsmetadata/train.py \ + data_config.experiment="with_metadata" \ + data_config.metadata_list=["html"] \ + data_config.max_seq_len=1024 \ + data_config.dataset_name="${REPO_DIR}" \ + data_config.train_file="nq-train-*.jsonl.gz" \ + data_config.validation_file="nq-dev-*.jsonl.gz" \ + data_config.preprocessing_num_workers=80 \ + data_config.per_device_eval_batch_size=3 \ + data_config.per_device_train_batch_size=3 \ + out_dir="${SCRATCH}/metadata_outputs/${SLURM_JOB_ID}" \ + do_train=True \ + do_eval=True \ + evaluation_strategy=STEPS \ + eval_steps=10 \ + save_strategy=STEPS \ + save_steps=10 \ + gradient_accumulation_steps=50 From 01f846707839f51746ed8d1af83aee874762df05 Mon Sep 17 00:00:00 2001 From: SaulLu Date: Tue, 7 Sep 2021 18:21:00 +0200 Subject: [PATCH 74/84] add init in experiment folder --- bsmetadata/experiments/__init__.py | 0 bsmetadata/metadata_processors.py | 1 + 2 files changed, 1 insertion(+) create mode 100644 bsmetadata/experiments/__init__.py diff --git a/bsmetadata/experiments/__init__.py b/bsmetadata/experiments/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/bsmetadata/metadata_processors.py b/bsmetadata/metadata_processors.py index 169cead1..1aa3bd32 100644 --- a/bsmetadata/metadata_processors.py +++ b/bsmetadata/metadata_processors.py @@ -20,6 +20,7 @@ from bsmetadata.input_pipeline import DataConfig + class MetadataProcessor: """A metadata processor can be used to add both global and local metadata information to a given input text.""" From dc5b9c285ffe47155f651cebaf759b2243e14a3b Mon Sep 17 00:00:00 2001 From: SaulLu Date: Tue, 7 Sep 2021 21:36:23 +0200 Subject: [PATCH 75/84] change datasetname --- .../experiment_example/subexperiment_1/04_do_training.slurm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/experiments/jz/templates/SLURM/experiment_example/subexperiment_1/04_do_training.slurm 
b/experiments/jz/templates/SLURM/experiment_example/subexperiment_1/04_do_training.slurm
index 227e512e..22305b9c 100644
--- a/experiments/jz/templates/SLURM/experiment_example/subexperiment_1/04_do_training.slurm
+++ b/experiments/jz/templates/SLURM/experiment_example/subexperiment_1/04_do_training.slurm
@@ -30,7 +30,7 @@ export WANDB_DIR=$SCRATCH
 # Folder for the clone of github.com/bigscience-workshop/metadata/
 cd $WORK/repos/metadata/
 
-HUB_REPO_NAME='XXX' # (change me! e.g. SaulLu/Natural_Questions_HTML_Toy)
+HUB_REPO_NAME='SaulLu/Natural_Questions_HTML_Toy' # (change me! e.g. SaulLu/Natural_Questions_HTML_Toy)

From 6af9a191b8b954c392964cb25e3b1cef0d7e8e88 Mon Sep 17 00:00:00 2001
From: SaulLu
Date: Wed, 8 Sep 2021 15:08:38 +0200
Subject: [PATCH 76/84] add readmes

---
 experiments/jz/README.md                  | 56 ++++++++++++++++++-
 .../SLURM/experiment_example/README.md    | 14 +----
 2 files changed, 57 insertions(+), 13 deletions(-)

diff --git a/experiments/jz/README.md b/experiments/jz/README.md
index 3afd6e43..5e92174d 100644
--- a/experiments/jz/README.md
+++ b/experiments/jz/README.md
@@ -4,4 +4,58 @@
 
 ## How to design your experience for JZ
 
-Everything on the Hub: model, tokenizer, Dataset
\ No newline at end of file
+To design an experiment for JZ, you have to think in stages:
+
+1. Uploading stage: the stage where we download from the internet everything we need (dataset, model, tokenizer, dependencies, etc.);
+2. CPU stage: the stage in which only CPU operations are performed (typically pre-processing);
+3. GPU stage: the stage during which only CPU and GPU operations are performed. This is typically training and evaluation;
+4. Downloading stage: the stage during which the outputs (checkpoints, datasets, metrics) are retrieved from JZ.
+
+What I propose is to put on the Hub the data that will be uploaded to JZ (dataset, initial model, tokenizer, etc.).
+
+Concretely, to work on JZ, you have to prepare bash scripts (more precisely, SLURM files) which are put in a job queue to be executed. In the `experiments/jz/templates/SLURM/experiment_template` folder you will find script templates for an end-to-end experiment. Each of these scripts is composed of 2 sections:
+
+1. A section indicating the characteristics of the job to run (typically its duration and the hardware to use);
+2. A section which is a bash script in which you just have to list the terminal commands that launch a part of the experiment.
+
+You will also find in the `experiments/jz/templates/SLURM/experiment_example` folder an example of an experiment that could be launched on JZ.
+
+As you will certainly not be able to run these scripts yourself on JZ, what I suggest is that you write the bash instructions to be used for your experiments (keeping in mind the need to think of your experiment in steps, with one script per type of step). Don't hesitate to write down your doubts or questions while writing these scripts so that we can discuss them before they are executed on the cluster.
+
+As a tip, try to prepare a toy example to check that your scripts can run on JZ. By toy example I mean a dataset small enough that we can run the jobs with very little time and compute. Indeed, as the jobs are put in a queue, there is a priority system which means that small jobs are executed more quickly.
If there is ever a small bug in the code, it can be very useful to be able to debug it quickly.
+
+In summary, some interesting points to know about JZ:
+
+- the computational partitions **do not have access to the internet**. We use specific partitions for everything that needs the internet.
+- we try to use only **2 conda environments**: a stable one which corresponds to the dependencies on master, and a development one. If your experiment requires dependencies that are not on the master branch of our repository, you will have to tell the person who will run your experiment.
+- we have several storage partitions on JZ; if your code uses **paths**, you will also have to talk to the person who will launch your experiment. For your information:
+  1. The dataset clones are located at `$DATASETS_CUSTOM`
+  2. The clone of the repo on the master branch is at `$WORK/repos/metadata/`
+  3. The wandb logs are at `$SCRATCH/wandb` (deleted after 30 days if there is no access to the file in the meantime)
+  4. The checkpoints are located at `$SCRATCH/metadata_outputs/{job_id}` (deleted after 30 days if the file has not been accessed in the meantime)
+  5. For scripts requiring GPU computing, we try to use one 16GB V100 (maximum 20h).
+  6. For scripts requiring CPU computation, we try to use a maximum of 1 node (40 CPUs).
+
+If you ever get stuck while designing your experiment for JZ, contact me. The instructions will most likely change according to your needs.
+
+It might be interesting to plan some pair-programming sessions to prepare experiments that go beyond this very generic framework. But in any case, it will be useful to have a base of bash scripts to visualize the operations to perform.
+
+## Downloading from JZ
+
+This is not yet ready:
+
+- downloading the checkpoints (do we want to send them to the Hub?)
+- logging to TensorBoard to be able to use [this feature](https://huggingface.co/bigscience/tr3d-1B3-more-warmup-tensorboard/tensorboard) of the Hub
+
+What is ready:
+
+- synchronization of wandb logs (on request)
+
+## Checklist
+
+Here is a small checklist of information that the person running your script will probably want to know:
+
+- What do you need to download (dataset, template, tokenizer)?
+- Where are your scripts located in the repository, and in what order should they be run?
+- Are you using the master branch of modelling-metadata? If not, why not?
+- Do your dependencies match the dependencies listed on master?
diff --git a/experiments/jz/templates/SLURM/experiment_example/README.md b/experiments/jz/templates/SLURM/experiment_example/README.md
index 8cfa4464..cb38feb3 100644
--- a/experiments/jz/templates/SLURM/experiment_example/README.md
+++ b/experiments/jz/templates/SLURM/experiment_example/README.md
@@ -1,13 +1,3 @@
-# Experiment 1
+# Experiment example
 
-## Run experiment on JZ
-
-1. Download the tokenizer and the model
-2. Download the dataset on a partition with internet
-3. Preprocess the dataset on a cpu-only partition
-4. Run the training on a gpu 16gb partition
-
-Are you downloading a new:
-- model ?
-- tokenizer ?
-- dataset ?
\ No newline at end of file
+This is a toy experiment example that can be run on JZ. This experiment is made up of sub-experiments, each corresponding to a run.
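A note on a pattern that appears in several of the .slurm templates above: the while-loop that splits HUB_REPO_NAME on the "/" delimiter and keeps the last element can also be written with plain bash parameter expansion. The snippet below is only a minimal sketch of that alternative, not something the patches themselves change; it assumes, as in the templates, that DATASETS_CUSTOM is set and that the repo name has the namespace/name form:

    # Hypothetical simplification of the delimiter-splitting loop used in the templates.
    HUB_REPO_NAME='SaulLu/Natural_Questions_HTML_Toy'    # same example repo as in the scripts
    # "##*/" removes everything up to and including the last "/", i.e. what the loop keeps as "${myarray[-1]}"
    REPO_DIR="${DATASETS_CUSTOM}/${HUB_REPO_NAME##*/}"
    echo "${REPO_DIR}"                                   # -> $DATASETS_CUSTOM/Natural_Questions_HTML_Toy

Both forms keep only the part of the name after the last "/", so they give the same REPO_DIR for the repositories used here.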
From 263eabbc5adcfc3c1bdab99720ffc918acae0adf Mon Sep 17 00:00:00 2001 From: SaulLu Date: Wed, 8 Sep 2021 15:08:53 +0200 Subject: [PATCH 77/84] format --- .../experiment_template/01_load_tokenizer_and_model.slurm | 8 ++++---- .../SLURM/experiment_template/02_load_dataset.slurm | 4 ++-- .../SLURM/experiment_template/03_create_dataset.slurm | 8 ++++---- .../SLURM/experiment_template/04_do_training.slurm | 8 ++++---- 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/experiments/jz/templates/SLURM/experiment_template/01_load_tokenizer_and_model.slurm b/experiments/jz/templates/SLURM/experiment_template/01_load_tokenizer_and_model.slurm index b402a379..a1feb35d 100644 --- a/experiments/jz/templates/SLURM/experiment_template/01_load_tokenizer_and_model.slurm +++ b/experiments/jz/templates/SLURM/experiment_template/01_load_tokenizer_and_model.slurm @@ -1,11 +1,11 @@ #!/bin/bash -#SBATCH --job-name=modelling-metadata-XX # (change me!) job name +#SBATCH --job-name=modelling-metadata-XX # (change me!) job name #SBATCH --nodes=1 #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! -#SBATCH --cpus-per-task=8 # (change me! between 0 and 48) number of cores per tasks +#SBATCH --cpus-per-task=8 # (change me! between 0 and 48) number of cores per tasks #SBATCH --hint=nomultithread # we get physical cores not logical -#SBATCH --gres=gpu:0 # (change me! between 0 and 1) number of gpus -#SBATCH --time 00:10:00 # (change me! between 0 and 20h) maximum execution time (HH:MM:SS) +#SBATCH --gres=gpu:0 # (change me! between 0 and 1) number of gpus +#SBATCH --time 00:10:00 # (change me! between 0 and 20h) maximum execution time (HH:MM:SS) #SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name #SBATCH --error=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.err # error file name #SBATCH --account=six@gpu # account diff --git a/experiments/jz/templates/SLURM/experiment_template/02_load_dataset.slurm b/experiments/jz/templates/SLURM/experiment_template/02_load_dataset.slurm index 5edc9faa..7c4b1ce9 100644 --- a/experiments/jz/templates/SLURM/experiment_template/02_load_dataset.slurm +++ b/experiments/jz/templates/SLURM/experiment_template/02_load_dataset.slurm @@ -58,5 +58,5 @@ cd $WORK/repos/sync/metadata/ # We check that the dataset can indeed be loaded python experiments/jz/utils/loading_script_utils/load_dataset.py \ dataset_name="${REPO_DIR}" \ - train_file="XXX" \ # (change me! e.g "nq-train-*.jsonl.gz" or remove arg) - validation_file="XXX" # (change me! e.g. "nq-dev-*.jsonl.gz" or remove arg) + train_file="XXX" \ # (change me and remove the comment! e.g "nq-train-*.jsonl.gz" or remove arg) + validation_file="XXX" # (change me and remove the comment! e.g. "nq-dev-*.jsonl.gz" or remove arg) diff --git a/experiments/jz/templates/SLURM/experiment_template/03_create_dataset.slurm b/experiments/jz/templates/SLURM/experiment_template/03_create_dataset.slurm index 4afe7647..7579e57b 100644 --- a/experiments/jz/templates/SLURM/experiment_template/03_create_dataset.slurm +++ b/experiments/jz/templates/SLURM/experiment_template/03_create_dataset.slurm @@ -44,13 +44,13 @@ REPO_DIR="${DATASETS_CUSTOM}/${myarray[-1]}" # Now we launch the script that will perform the preprocessing of the dataset # Feel free to add any arguments you like (change me!) -python bsmetadata/train.py \ # (change me! if you have a specific script) +python bsmetadata/train.py \ # (change me and remove the comment! 
if you have a specific script) data_config.experiment="with_metadata" \ - data_config.metadata_list=["html"] \ # (change me!) + data_config.metadata_list=["html"] \ # (change me and remove the comment!) data_config.max_seq_len=1024 \ data_config.dataset_name="${REPO_DIR}" \ - data_config.train_file="XXX" \ # (change me! e.g "nq-train-0\[0-2\].jsonl.gz" or remove arg) - data_config.validation_file"XXX" \ # (change me! e.g "nq-dev-00.jsonl.gz" or remove arg) + data_config.train_file="XXX" \ # (change me and remove the comment! e.g "nq-train-0\[0-2\].jsonl.gz" or remove arg) + data_config.validation_file"XXX" \ # (change me and remove the comment! e.g "nq-dev-00.jsonl.gz" or remove arg) data_config.preprocessing_num_workers=80 \ out_dir="${SCRATCH}/metadata_outputs" \ do_train=False \ diff --git a/experiments/jz/templates/SLURM/experiment_template/04_do_training.slurm b/experiments/jz/templates/SLURM/experiment_template/04_do_training.slurm index fdc281ad..a1d90ff8 100644 --- a/experiments/jz/templates/SLURM/experiment_template/04_do_training.slurm +++ b/experiments/jz/templates/SLURM/experiment_template/04_do_training.slurm @@ -45,13 +45,13 @@ while [[ $string ]]; do done REPO_DIR="${DATASETS_CUSTOM}/${myarray[-1]}" -python bsmetadata/train.py \ # (change me! if you have a specific script) +python bsmetadata/train.py \ # (change me and remove the comment! if you have a specific script) data_config.experiment="with_metadata" \ data_config.metadata_list=["html"] \ data_config.max_seq_len=1024 \ data_config.dataset_name="${REPO_DIR}" \ - data_config.train_file="XXX" \ # (change me! e.g "nq-train-0\[0-2\].jsonl.gz" or remove arg) - data_config.validation_file"XXX" \ # (change me! e.g "nq-dev-00.jsonl.gz" or remove arg) + data_config.train_file="XXX" \ # (change me and remove the comment! e.g "nq-train-0\[0-2\].jsonl.gz" or remove arg) + data_config.validation_file"XXX" \ # (change me and remove the comment! e.g "nq-dev-00.jsonl.gz" or remove arg) data_config.preprocessing_num_workers=80 \ data_config.per_device_eval_batch_size=3 \ data_config.per_device_train_batch_size=3 \ @@ -62,4 +62,4 @@ python bsmetadata/train.py \ # (change me! if you have a specific script) eval_steps=10 \ save_strategy=STEPS \ save_steps=10 \ - gradient_accumulation_steps=50\ # (change me!) + gradient_accumulation_steps=50\ # (change me and remove the comment!) From 5b35b507541f6832dd6fb47b214a906a0a5bddd2 Mon Sep 17 00:00:00 2001 From: SaulLu Date: Wed, 8 Sep 2021 15:19:13 +0200 Subject: [PATCH 78/84] experiment template readme --- .../SLURM/experiment_template/README.md | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/experiments/jz/templates/SLURM/experiment_template/README.md b/experiments/jz/templates/SLURM/experiment_template/README.md index 8cfa4464..b483f392 100644 --- a/experiments/jz/templates/SLURM/experiment_template/README.md +++ b/experiments/jz/templates/SLURM/experiment_template/README.md @@ -1,13 +1,10 @@ -# Experiment 1 +# Experiment template -## Run experiment on JZ +In this folder you will find script templates to run a "typical" experiment on JZ. -1. Download the tokenizer and the model -2. Download the dataset on a partition with internet -3. Preprocess the dataset on a cpu-only partition -4. Run the training on a gpu 16gb partition +These scripts are designed to be run sequentially for: -Are you downloading a new: -- model ? -- tokenizer ? -- dataset ? \ No newline at end of file +1. 
Downloading the tokenizer and the model (`01_load_tokenizer_and_model.slurm`) +2. Downloading the dataset on a partition with internet ( `02_load_dataset.slurm`) +3. Preprocessing the dataset on a cpu-only partition (`03_create_dataset.slurm`) +4. Running the training on a gpu 16gb partition (`04_do_training.slurm`) From 1d9e2195a2ac4dd36cdd09e8709c0a5fcbe88b9b Mon Sep 17 00:00:00 2001 From: SaulLu Date: Wed, 8 Sep 2021 15:29:19 +0200 Subject: [PATCH 79/84] temporary change for tests --- .github/workflows/test.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 3a7dc9de..aa730b97 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -24,6 +24,7 @@ jobs: run: | python -m pip install --upgrade pip python -m pip install . + python -m pip install -r requirements.txt python -m pip install -r requirements-dev.txt - name: Test run: | From 1166d326b0e3c64c98e73fd99f8a8cd9628e1434 Mon Sep 17 00:00:00 2001 From: SaulLu Date: Wed, 8 Sep 2021 15:30:19 +0200 Subject: [PATCH 80/84] format --- .../with_metadata_and_baseline_val.py | 18 ++++++----- bsmetadata/input_pipeline.py | 2 +- bsmetadata/metadata_processors.py | 1 - bsmetadata/train.py | 32 ++++++++++++------- 4 files changed, 32 insertions(+), 21 deletions(-) diff --git a/bsmetadata/experiments/with_metadata_and_baseline_val.py b/bsmetadata/experiments/with_metadata_and_baseline_val.py index 394089b9..dc80eb92 100644 --- a/bsmetadata/experiments/with_metadata_and_baseline_val.py +++ b/bsmetadata/experiments/with_metadata_and_baseline_val.py @@ -185,20 +185,22 @@ def create_labels_column(examples): ) logger.info("Creating labels column finished") val_dataset2 = val_dataset_without_metadata - logger.info(f" Num train examples = {len(train_dataset)}") logger.info(f" Num validation examples dataloader 1 = {len(val_dataset1)}") logger.info(f" Num validation examples dataloader 2 = {len(val_dataset2)}") - logger.info(f' Train examples = {train_dataset[0]}') - logger.info(f' Validation examples dataloader 1 = {val_dataset1[0]}') - logger.info(f' Validation examples dataloader 2 = {val_dataset2[0]}') + logger.info(f" Train examples = {train_dataset[0]}") + logger.info(f" Validation examples dataloader 1 = {val_dataset1[0]}") + logger.info(f" Validation examples dataloader 2 = {val_dataset2[0]}") logger.info(f' Train examples = {tokenizer.convert_ids_to_tokens(train_dataset[0]["input_ids"])}') - logger.info(f' Validation examples dataloader 1 = {tokenizer.convert_ids_to_tokens(val_dataset1[0]["input_ids"])}') - logger.info(f' Validation examples dataloader 2 = {tokenizer.convert_ids_to_tokens(val_dataset2[0]["input_ids"])}') - + logger.info( + f' Validation examples dataloader 1 = {tokenizer.convert_ids_to_tokens(val_dataset1[0]["input_ids"])}' + ) + logger.info( + f' Validation examples dataloader 2 = {tokenizer.convert_ids_to_tokens(val_dataset2[0]["input_ids"])}' + ) # DataLoaders creation: train_dataloader = DataLoader( @@ -217,4 +219,4 @@ def create_labels_column(examples): collate_fn=default_data_collator, batch_size=args.per_device_eval_batch_size, ) - return train_dataloader, {"val1": val_dataloader1 , "val2": val_dataloader2 } + return train_dataloader, {"val1": val_dataloader1, "val2": val_dataloader2} diff --git a/bsmetadata/input_pipeline.py b/bsmetadata/input_pipeline.py index c46473f7..ee599285 100644 --- a/bsmetadata/input_pipeline.py +++ b/bsmetadata/input_pipeline.py @@ -106,7 +106,7 @@ def get_dataloaders(tokenizer, cfg: DataConfig): from 
bsmetadata.experiments.with_metadata import get_dataloaders as fn return fn(tokenizer, cfg) - + if cfg.experiment == "with_metadata_and_baseline_val": from bsmetadata.experiments.with_metadata_and_baseline_val import get_dataloaders as fn diff --git a/bsmetadata/metadata_processors.py b/bsmetadata/metadata_processors.py index 1aa3bd32..169cead1 100644 --- a/bsmetadata/metadata_processors.py +++ b/bsmetadata/metadata_processors.py @@ -20,7 +20,6 @@ from bsmetadata.input_pipeline import DataConfig - class MetadataProcessor: """A metadata processor can be used to add both global and local metadata information to a given input text.""" diff --git a/bsmetadata/train.py b/bsmetadata/train.py index e419977a..6f0ec454 100644 --- a/bsmetadata/train.py +++ b/bsmetadata/train.py @@ -53,14 +53,24 @@ class CFG: default="STEPS", metadata={"help": "The evaluation strategy to use."}, ) - eval_num_per_epoch: int = field(default=3, metadata={"help": "If evaluation strategy is `epoch`. The number of evaluations to perform per epoch during training."}) - eval_steps: int = field(default=100, metadata={"help": "If evaluation strategy is `steps`. Run an evaluation every X steps."}) + eval_num_per_epoch: int = field( + default=3, + metadata={ + "help": "If evaluation strategy is `epoch`. The number of evaluations to perform per epoch during training." + }, + ) + eval_steps: int = field( + default=100, metadata={"help": "If evaluation strategy is `steps`. Run an evaluation every X steps."} + ) save_strategy: IntervalStrategy = field( default="STEPS", metadata={"help": "The checkpoint save strategy to use."}, ) - save_num_per_epoch: int = field(default=3, metadata={"help": "If save strategy is `epoch`. The number of savings to perform per epoch during training."}) + save_num_per_epoch: int = field( + default=3, + metadata={"help": "If save strategy is `epoch`. The number of savings to perform per epoch during training."}, + ) save_steps: int = field(default=500, metadata={"help": "Save checkpoint every X updates steps."}) save_total_limit: Optional[int] = field( default=None, @@ -70,7 +80,7 @@ class CFG: "Deletes the older checkpoints in the output_dir. Default is unlimited checkpoints" ) }, - ) #TODO!!! + ) # TODO!!! 
model_name: str = field(default="gpt2", metadata={"help": "The name of the pretrained model to use."}) project_name: str = field(default="metadata_lm", metadata={"help": "The project name."}) @@ -205,8 +215,8 @@ def main(args: CFG) -> None: else: eval_per_n_step = args.max_train_steps // args.eval_num_per_epoch elif args.evaluation_strategy == IntervalStrategy.STEPS: - eval_per_n_step = args.eval_steps - else: # IntervalStrategy.NO + eval_per_n_step = args.eval_steps + else: # IntervalStrategy.NO eval_per_n_step = args.max_train_steps + 1 if args.save_strategy == IntervalStrategy.EPOCH: @@ -215,8 +225,8 @@ def main(args: CFG) -> None: else: save_per_n_step = args.max_train_steps // args.save_num_per_epoch elif args.save_strategy == IntervalStrategy.STEPS: - save_per_n_step = args.save_steps - else: # IntervalStrategy.NO + save_per_n_step = args.save_steps + else: # IntervalStrategy.NO save_per_n_step = args.max_train_steps + 1 scheduler = get_scheduler( @@ -250,7 +260,7 @@ def evaluate(eval_dataloader): progress_bar = tqdm(range(args.max_train_steps), desc="training") completed_steps = 0 logger_metrics = Logger(is_local_main_process, project=args.project_name, config=args) - + do_eval = args.do_eval if do_eval: logger.info("***** Evaluation *****") @@ -269,7 +279,7 @@ def evaluate(eval_dataloader): batch["labels"] = labels loss = loss_fn(batch, outputs, metadata_mask) - logger_metrics.log({"loss": loss, "lr": optimizer.param_groups[0]['lr']}) + logger_metrics.log({"loss": loss, "lr": optimizer.param_groups[0]["lr"]}) loss = loss / args.gradient_accumulation_steps accelerator.backward(loss) @@ -283,7 +293,7 @@ def evaluate(eval_dataloader): completed_steps += 1 else: continue - + do_eval = args.do_eval and completed_steps > 0 and completed_steps % eval_per_n_step == 0 if do_eval: logger.info("***** Evaluation *****") From 7683dd5b017ae7f848c0bad5d3418924b7ba5811 Mon Sep 17 00:00:00 2001 From: SaulLu Date: Wed, 8 Sep 2021 15:32:56 +0200 Subject: [PATCH 81/84] isort --- experiments/html/SLURM/crime_and_punish_test/load_dataset.py | 4 ++-- experiments/html/SLURM/experiment_1/load_dataset.py | 4 ++-- experiments/html/SLURM/experiment_2/load_dataset.py | 4 ++-- experiments/html/SLURM/init_experiment/load_dataset.py | 4 ++-- .../html/SLURM/loading_scripts/load_tokenizer_and_model.py | 2 +- experiments/html/start_training.py | 2 +- experiments/jz/utils/loading_script_utils/load_dataset.py | 4 ++-- .../jz/utils/loading_script_utils/load_tokenizer_and_model.py | 2 +- 8 files changed, 13 insertions(+), 13 deletions(-) diff --git a/experiments/html/SLURM/crime_and_punish_test/load_dataset.py b/experiments/html/SLURM/crime_and_punish_test/load_dataset.py index 1a8c65de..3ce34fd1 100644 --- a/experiments/html/SLURM/crime_and_punish_test/load_dataset.py +++ b/experiments/html/SLURM/crime_and_punish_test/load_dataset.py @@ -1,14 +1,14 @@ import logging import sys -from datasets import config import hydra -from datasets import load_dataset +from datasets import config, load_dataset from hydra.core.config_store import ConfigStore from bsmetadata.input_pipeline import DataConfig from bsmetadata.train import show_help + logger = logging.getLogger(__name__) cs = ConfigStore.instance() diff --git a/experiments/html/SLURM/experiment_1/load_dataset.py b/experiments/html/SLURM/experiment_1/load_dataset.py index 1a8c65de..3ce34fd1 100644 --- a/experiments/html/SLURM/experiment_1/load_dataset.py +++ b/experiments/html/SLURM/experiment_1/load_dataset.py @@ -1,14 +1,14 @@ import logging import sys -from datasets 
import config import hydra -from datasets import load_dataset +from datasets import config, load_dataset from hydra.core.config_store import ConfigStore from bsmetadata.input_pipeline import DataConfig from bsmetadata.train import show_help + logger = logging.getLogger(__name__) cs = ConfigStore.instance() diff --git a/experiments/html/SLURM/experiment_2/load_dataset.py b/experiments/html/SLURM/experiment_2/load_dataset.py index 1a8c65de..3ce34fd1 100644 --- a/experiments/html/SLURM/experiment_2/load_dataset.py +++ b/experiments/html/SLURM/experiment_2/load_dataset.py @@ -1,14 +1,14 @@ import logging import sys -from datasets import config import hydra -from datasets import load_dataset +from datasets import config, load_dataset from hydra.core.config_store import ConfigStore from bsmetadata.input_pipeline import DataConfig from bsmetadata.train import show_help + logger = logging.getLogger(__name__) cs = ConfigStore.instance() diff --git a/experiments/html/SLURM/init_experiment/load_dataset.py b/experiments/html/SLURM/init_experiment/load_dataset.py index 1a8c65de..3ce34fd1 100644 --- a/experiments/html/SLURM/init_experiment/load_dataset.py +++ b/experiments/html/SLURM/init_experiment/load_dataset.py @@ -1,14 +1,14 @@ import logging import sys -from datasets import config import hydra -from datasets import load_dataset +from datasets import config, load_dataset from hydra.core.config_store import ConfigStore from bsmetadata.input_pipeline import DataConfig from bsmetadata.train import show_help + logger = logging.getLogger(__name__) cs = ConfigStore.instance() diff --git a/experiments/html/SLURM/loading_scripts/load_tokenizer_and_model.py b/experiments/html/SLURM/loading_scripts/load_tokenizer_and_model.py index 8c6cb94b..9b726649 100644 --- a/experiments/html/SLURM/loading_scripts/load_tokenizer_and_model.py +++ b/experiments/html/SLURM/loading_scripts/load_tokenizer_and_model.py @@ -1,8 +1,8 @@ import logging import sys -import transformers.utils.logging as logging_transformers import hydra +import transformers.utils.logging as logging_transformers from datasets import load_dataset from hydra.core.config_store import ConfigStore from transformers import AdamW, AutoModelForCausalLM, AutoTokenizer diff --git a/experiments/html/start_training.py b/experiments/html/start_training.py index d8ae0701..63e3cf8f 100644 --- a/experiments/html/start_training.py +++ b/experiments/html/start_training.py @@ -8,7 +8,7 @@ from bsmetadata.input_pipeline import DataConfig from bsmetadata.metadata_processors import PROCESSORS -from bsmetadata.train import main, show_help, CFG +from bsmetadata.train import CFG, main, show_help tags_to_remove_alone = [ diff --git a/experiments/jz/utils/loading_script_utils/load_dataset.py b/experiments/jz/utils/loading_script_utils/load_dataset.py index 1a8c65de..3ce34fd1 100644 --- a/experiments/jz/utils/loading_script_utils/load_dataset.py +++ b/experiments/jz/utils/loading_script_utils/load_dataset.py @@ -1,14 +1,14 @@ import logging import sys -from datasets import config import hydra -from datasets import load_dataset +from datasets import config, load_dataset from hydra.core.config_store import ConfigStore from bsmetadata.input_pipeline import DataConfig from bsmetadata.train import show_help + logger = logging.getLogger(__name__) cs = ConfigStore.instance() diff --git a/experiments/jz/utils/loading_script_utils/load_tokenizer_and_model.py b/experiments/jz/utils/loading_script_utils/load_tokenizer_and_model.py index 8c6cb94b..9b726649 100644 --- 
a/experiments/jz/utils/loading_script_utils/load_tokenizer_and_model.py +++ b/experiments/jz/utils/loading_script_utils/load_tokenizer_and_model.py @@ -1,8 +1,8 @@ import logging import sys -import transformers.utils.logging as logging_transformers import hydra +import transformers.utils.logging as logging_transformers from datasets import load_dataset from hydra.core.config_store import ConfigStore from transformers import AdamW, AutoModelForCausalLM, AutoTokenizer From 099bbe2709bd8cd4f3c360a34ca7f1a299ae9950 Mon Sep 17 00:00:00 2001 From: SaulLu Date: Wed, 8 Sep 2021 15:37:28 +0200 Subject: [PATCH 82/84] flake 8 --- bsmetadata/train.py | 2 +- .../SLURM/loading_scripts/load_tokenizer_and_model.py | 8 +++----- experiments/html/html_processor.py | 1 - experiments/html/start_training.py | 4 +--- experiments/html/test_html_processor.py | 3 --- .../loading_script_utils/load_tokenizer_and_model.py | 8 +++----- 6 files changed, 8 insertions(+), 18 deletions(-) diff --git a/bsmetadata/train.py b/bsmetadata/train.py index 6f0ec454..7d4f5434 100644 --- a/bsmetadata/train.py +++ b/bsmetadata/train.py @@ -18,7 +18,7 @@ from omegaconf import OmegaConf from tqdm.auto import tqdm as original_tqdm from transformers import AdamW, AutoModelForCausalLM, AutoTokenizer, get_scheduler, set_seed -from transformers.trainer_utils import EvaluationStrategy, IntervalStrategy +from transformers.trainer_utils import IntervalStrategy from bsmetadata.input_pipeline import DataConfig, get_dataloaders diff --git a/experiments/html/SLURM/loading_scripts/load_tokenizer_and_model.py b/experiments/html/SLURM/loading_scripts/load_tokenizer_and_model.py index 9b726649..0c4c42dd 100644 --- a/experiments/html/SLURM/loading_scripts/load_tokenizer_and_model.py +++ b/experiments/html/SLURM/loading_scripts/load_tokenizer_and_model.py @@ -3,11 +3,9 @@ import hydra import transformers.utils.logging as logging_transformers -from datasets import load_dataset from hydra.core.config_store import ConfigStore -from transformers import AdamW, AutoModelForCausalLM, AutoTokenizer +from transformers import AutoModelForCausalLM, AutoTokenizer -from bsmetadata.input_pipeline import DataConfig from bsmetadata.train import CFG, show_help @@ -28,10 +26,10 @@ @hydra.main(config_path=None, config_name="config") def main(args: CFG) -> None: # get dataloaders - tokenizer = AutoTokenizer.from_pretrained(args.model_name) + _ = AutoTokenizer.from_pretrained(args.model_name) # get model - model = AutoModelForCausalLM.from_pretrained(args.model_name) + _ = AutoModelForCausalLM.from_pretrained(args.model_name) if __name__ == "__main__": diff --git a/experiments/html/html_processor.py b/experiments/html/html_processor.py index 0a7ed0bd..76357141 100644 --- a/experiments/html/html_processor.py +++ b/experiments/html/html_processor.py @@ -1,6 +1,5 @@ from dataclasses import dataclass, field from typing import Any, Dict, List, Optional, Tuple -from urllib.parse import unquote_plus from bsmetadata.input_pipeline import DataConfig from bsmetadata.metadata_processors import MetadataProcessor diff --git a/experiments/html/start_training.py b/experiments/html/start_training.py index 63e3cf8f..801e1d92 100644 --- a/experiments/html/start_training.py +++ b/experiments/html/start_training.py @@ -1,7 +1,5 @@ -import logging import sys -from dataclasses import dataclass, field -from typing import Optional +from dataclasses import dataclass from html_processor import AllTagsRules, HTMLParserConfig, HtmlProcessor, TagToRemove from hydra.core.config_store import ConfigStore 
diff --git a/experiments/html/test_html_processor.py b/experiments/html/test_html_processor.py index 402863bc..605f108b 100644 --- a/experiments/html/test_html_processor.py +++ b/experiments/html/test_html_processor.py @@ -7,9 +7,6 @@ from bsmetadata.metadata_processors import PROCESSORS from bsmetadata.metadata_utils import ( add_local_metadata_to_text, - add_metadata_and_chunk_examples, - chunks, - create_global_metadata_prefix, ) diff --git a/experiments/jz/utils/loading_script_utils/load_tokenizer_and_model.py b/experiments/jz/utils/loading_script_utils/load_tokenizer_and_model.py index 9b726649..0c4c42dd 100644 --- a/experiments/jz/utils/loading_script_utils/load_tokenizer_and_model.py +++ b/experiments/jz/utils/loading_script_utils/load_tokenizer_and_model.py @@ -3,11 +3,9 @@ import hydra import transformers.utils.logging as logging_transformers -from datasets import load_dataset from hydra.core.config_store import ConfigStore -from transformers import AdamW, AutoModelForCausalLM, AutoTokenizer +from transformers import AutoModelForCausalLM, AutoTokenizer -from bsmetadata.input_pipeline import DataConfig from bsmetadata.train import CFG, show_help @@ -28,10 +26,10 @@ @hydra.main(config_path=None, config_name="config") def main(args: CFG) -> None: # get dataloaders - tokenizer = AutoTokenizer.from_pretrained(args.model_name) + _ = AutoTokenizer.from_pretrained(args.model_name) # get model - model = AutoModelForCausalLM.from_pretrained(args.model_name) + _ = AutoModelForCausalLM.from_pretrained(args.model_name) if __name__ == "__main__": From d0254649b4e796ee06eff34f51c0ba0d5d097ba5 Mon Sep 17 00:00:00 2001 From: SaulLu Date: Wed, 8 Sep 2021 15:38:29 +0200 Subject: [PATCH 83/84] isort --- experiments/html/test_html_processor.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/experiments/html/test_html_processor.py b/experiments/html/test_html_processor.py index 605f108b..7fe710f6 100644 --- a/experiments/html/test_html_processor.py +++ b/experiments/html/test_html_processor.py @@ -5,9 +5,7 @@ from transformers import GPT2TokenizerFast from bsmetadata.metadata_processors import PROCESSORS -from bsmetadata.metadata_utils import ( - add_local_metadata_to_text, -) +from bsmetadata.metadata_utils import add_local_metadata_to_text class MetadataUtilsTester(unittest.TestCase): From 4cdbe2183e635086b56e40e4527427693e48b3f1 Mon Sep 17 00:00:00 2001 From: SaulLu Date: Wed, 8 Sep 2021 20:38:07 +0200 Subject: [PATCH 84/84] fix arg name --- README.md | 2 +- bsmetadata/metadata_utils.py | 2 +- tests/test_metadata_utils.py | 2 +- tests/test_train.py | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index eef50fde..36b08559 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ This repository contains code for including metadata such as URLs, timestamps, w ## Usage ```sh -accelerate launch --fp16 train.py max_train_steps=100 num_eval=1 data_config.per_device_eval_batch_size=4 +accelerate launch --fp16 train.py max_train_steps=100 eval_num_per_epoch=1 data_config.per_device_eval_batch_size=4 ``` ## Get Help diff --git a/bsmetadata/metadata_utils.py b/bsmetadata/metadata_utils.py index 216783a9..3a612f92 100644 --- a/bsmetadata/metadata_utils.py +++ b/bsmetadata/metadata_utils.py @@ -130,7 +130,7 @@ class MetadataIdxStorage: end_idx_tag_without_content: dict = field(default_factory=(lambda: defaultdict(list))) -def add_local_metadata_to_text(example: Dict[str, Any], cfg: DataConfig) -> Tuple[str, List[bool]]: +def 
add_local_metadata_to_text(example: Dict[str, Any], cfg: MetadataConfig) -> Tuple[str, List[bool]]: """Adds local metadata (such as HTML tags and entity names) to the given input text. Args: diff --git a/tests/test_metadata_utils.py b/tests/test_metadata_utils.py index 4dc7db9a..abf26d97 100644 --- a/tests/test_metadata_utils.py +++ b/tests/test_metadata_utils.py @@ -207,7 +207,7 @@ def test_add_no_metadata_and_chunk_examples(self): self.assertTrue(all(not x for x in example["metadata_mask"])) def test_add_html_tags(self): - cfg = DataConfig() + cfg = MetadataConfig() cfg.metadata_list = ["html"] PROCESSORS["html"] = HtmlProcessor diff --git a/tests/test_train.py b/tests/test_train.py index b6976564..04631c65 100644 --- a/tests/test_train.py +++ b/tests/test_train.py @@ -24,7 +24,7 @@ def test_toy_training_without_metadata(tmpdir): "data_config.experiment=without_metadata", f'data_config.train_file={os.path.join(path_test_folder,"data","train_toy_raw_wikitext.jsonl")}', f'data_config.validation_file={os.path.join(path_test_folder,"data","val_toy_raw_wikitext.jsonl")}', - "num_eval=2", + "eval_num_per_epoch=2", "data_config.block_size=20", f"out_dir={tmpdir}", "max_train_steps=4", @@ -55,7 +55,7 @@ def test_toy_training_with_metadata(tmpdir): "data_config.experiment=with_metadata", f'data_config.train_file={os.path.join(path_test_folder,"data","train_toy_wikitext_with_metadata.jsonl")}', f'data_config.validation_file={os.path.join(path_test_folder,"data","val_toy_wikitext_with_metadata.jsonl")}', - "num_eval=2", + "eval_num_per_epoch=2", f"out_dir={tmpdir}", "max_train_steps=4", ],
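
Taken together, the last patches of the series adjust the CI dependency installation (patch 79), apply formatting and lint fixes (patches 80–83), and rename the `num_eval` argument to `eval_num_per_epoch` while fixing the config class name used by `add_local_metadata_to_text` (patch 84). A quick local sanity check of the series might look like the sketch below; it mirrors the install commands from `.github/workflows/test.yml`, but the last line assumes `pytest` is the test runner provided by `requirements-dev.txt`, which the workflow excerpt does not show explicitly.

```bash
# Install the package and its requirements, mirroring .github/workflows/test.yml
python -m pip install --upgrade pip
python -m pip install .
python -m pip install -r requirements.txt
python -m pip install -r requirements-dev.txt

# Run the tests touched by patch 84 (assumption: pytest is the test runner)
python -m pytest tests/test_train.py tests/test_metadata_utils.py
```
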