diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 3a7dc9de..aa730b97 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -24,6 +24,7 @@ jobs: run: | python -m pip install --upgrade pip python -m pip install . + python -m pip install -r requirements.txt python -m pip install -r requirements-dev.txt - name: Test run: | diff --git a/README.md b/README.md index eef50fde..36b08559 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ This repository contains code for including metadata such as URLs, timestamps, w ## Usage ```sh -accelerate launch --fp16 train.py max_train_steps=100 num_eval=1 data_config.per_device_eval_batch_size=4 +accelerate launch --fp16 train.py max_train_steps=100 eval_num_per_epoch=1 data_config.per_device_eval_batch_size=4 ``` ## Get Help diff --git a/bsmetadata/experiments/with_metadata_and_baseline_val.py b/bsmetadata/experiments/with_metadata_and_baseline_val.py new file mode 100644 index 00000000..dc80eb92 --- /dev/null +++ b/bsmetadata/experiments/with_metadata_and_baseline_val.py @@ -0,0 +1,222 @@ +import copy +import functools +import logging + +from datasets import config, load_dataset +from torch.utils.data import DataLoader +from transformers import default_data_collator + +from bsmetadata.metadata_utils import add_metadata_and_chunk_examples + + +logger = logging.getLogger(__name__) + + +def get_dataloaders(tokenizer, args): + """ + Args: + tokenizer: a huggingface/transformers tokenizer + args: a DataConfig + Returns: + a training dataloader and one or more validation dataloaders + validation dataloaders should be in a dictionary + each dataloader should yield {str: torch.Tensor(cpu) } + dictionary keys may have 'metadata_mask' + other fields will be passed to model + note: metadata_mask should be padded + Example: + train_dataloader, val_dataloaders = get_dataloaders(...) 
+ for batch in train_dataloader: + metadata_mask = batch.get('metadata_mask', None) + outputs = model(**batch) + metrics = loss_fn(batch, outputs, metadata_mask) + """ + # Mostly copy/paste from https://github.com/huggingface/transformers/blob/master/examples/pytorch/language-modeling/run_clm_no_trainer.py + # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) + # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ + # (the dataset will be downloaded automatically from the datasets Hub). + # + # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called + # 'text' is found. You can easily tweak this behavior (see below). + # + # In distributed training, the load_dataset function guarantees that only one local process can concurrently + # download the dataset. + data_files = {} + if args.train_file is not None: + data_files["train"] = args.train_file + if args.validation_file is not None: + data_files["validation"] = args.validation_file + + if not data_files: + data_files = None + + logger.info(f"Start to load dataset, the result will be cached at {config.HF_DATASETS_CACHE}") + if args.dataset_name is not None: + logger.info( + "Downloading with arguments: " + f"dataset_name={args.dataset_name}, " + f"dataset_config_name={args.dataset_config_name}, " + f"data_files={data_files}, " + f"cache_dir={args.cache_dir}," + ) + # Downloading and loading a dataset from the hub. 
+ datasets = load_dataset( + args.dataset_name, + args.dataset_config_name, + data_files=data_files, + cache_dir=args.cache_dir, + keep_in_memory=False, + ) + + if "validation" not in datasets.keys(): + datasets["validation"] = load_dataset( + args.dataset_name, + args.dataset_config_name, + split=f"train[:{args.validation_split_percentage}%]", + cache_dir=args.cache_dir, + ) + datasets["train"] = load_dataset( + args.dataset_name, + args.dataset_config_name, + split=f"train[{args.validation_split_percentage}%:]", + cache_dir=args.cache_dir, + ) + else: + logger.info("Loading dataset from extension script") + extension = args.train_file.split(".")[-1] if not args.extension else args.extension + if extension == "txt": + raise ValueError( + "You have entered a text file for the train data, but this type of file cannot contain metadata " + "columns. Wouldn't you rather have a file in json/jsonl or pandas format?" + ) + if extension == "jsonl": + extension = "json" + datasets = load_dataset(extension, data_files=data_files, cache_dir=args.cache_dir) + + if "validation" not in datasets.keys(): + datasets["validation"] = load_dataset( + extension, + data_files=data_files, + split=f"train[:{args.validation_split_percentage}%]", + cache_dir=args.cache_dir, + ) + datasets["train"] = load_dataset( + extension, + data_files=data_files, + split=f"train[{args.validation_split_percentage}%:]", + cache_dir=args.cache_dir, + ) + logger.info("Dataset loaded") + # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at + # https://huggingface.co/docs/datasets/loading_datasets.html. + + # Preprocessing the datasets. + column_names = datasets["train"].column_names + + logger.info("Start to add metadata and chunk examples") + + # Sets the attributes of the args object that have no influence on the calculation of the next map. This is useful + # for using the cache efficiently. 
+ tmp_data_args = copy.deepcopy(args) + tmp_data_args.preprocessing_num_workers = 80 + tmp_data_args.overwrite_cache = False + tmp_data_args.per_device_eval_batch_size = 2 + tmp_data_args.per_device_train_batch_size = 2 + tmp_data_args.map_batch_size = 1 + + # First we pre-process our text and metadata + datasets_with_metadata = datasets.map( + functools.partial(add_metadata_and_chunk_examples, tokenizer=tokenizer, cfg=tmp_data_args), + batched=True, + num_proc=args.preprocessing_num_workers, + load_from_cache_file=not args.overwrite_cache, + desc="Pre-process the text and metadata to create new samples", + remove_columns=column_names, + batch_size=args.map_batch_size, + ) + logger.info("Add metadata and chunk examples finished") + + def create_labels_column(examples): + examples["labels"] = examples["input_ids"].copy() + return examples + + logger.info("Create labels column") + # Then we add the column containing the labels + datasets_with_metadata = datasets_with_metadata.map( + create_labels_column, + batched=True, + num_proc=args.preprocessing_num_workers, + load_from_cache_file=not args.overwrite_cache, + desc="Create labels column", + batch_size=args.map_batch_size, + ) + logger.info("Creating labels column finished") + + train_dataset = datasets_with_metadata["train"] + val_dataset1 = datasets_with_metadata["validation"] + + # We create another validation dataset without metadata + logger.info("Start to add metadata and chunk examples") + tmp_data_args.metadata_probability = 0 + val_dataset_without_metadata = datasets["validation"].map( + functools.partial(add_metadata_and_chunk_examples, tokenizer=tokenizer, cfg=tmp_data_args), + batched=True, + num_proc=args.preprocessing_num_workers, + load_from_cache_file=not args.overwrite_cache, + desc="Pre-process the text and metadata to create new samples", + remove_columns=column_names, + batch_size=args.map_batch_size, + ) + logger.info("Add metadata and chunk examples finished") + + def 
create_labels_column(examples): + examples["labels"] = examples["input_ids"].copy() + return examples + + logger.info("Create labels column") + # Then we add the column containing the labels + val_dataset_without_metadata = val_dataset_without_metadata.map( + create_labels_column, + batched=True, + num_proc=args.preprocessing_num_workers, + load_from_cache_file=not args.overwrite_cache, + desc="Create labels column", + batch_size=args.map_batch_size, + ) + logger.info("Creating labels column finished") + val_dataset2 = val_dataset_without_metadata + + logger.info(f" Num train examples = {len(train_dataset)}") + logger.info(f" Num validation examples dataloader 1 = {len(val_dataset1)}") + logger.info(f" Num validation examples dataloader 2 = {len(val_dataset2)}") + + logger.info(f" Train examples = {train_dataset[0]}") + logger.info(f" Validation examples dataloader 1 = {val_dataset1[0]}") + logger.info(f" Validation examples dataloader 2 = {val_dataset2[0]}") + + logger.info(f' Train examples = {tokenizer.convert_ids_to_tokens(train_dataset[0]["input_ids"])}') + logger.info( + f' Validation examples dataloader 1 = {tokenizer.convert_ids_to_tokens(val_dataset1[0]["input_ids"])}' + ) + logger.info( + f' Validation examples dataloader 2 = {tokenizer.convert_ids_to_tokens(val_dataset2[0]["input_ids"])}' + ) + + # DataLoaders creation: + train_dataloader = DataLoader( + train_dataset, + shuffle=True, + collate_fn=default_data_collator, + batch_size=args.per_device_train_batch_size, + ) + val_dataloader1 = DataLoader( + val_dataset1, + collate_fn=default_data_collator, + batch_size=args.per_device_eval_batch_size, + ) + val_dataloader2 = DataLoader( + val_dataset2, + collate_fn=default_data_collator, + batch_size=args.per_device_eval_batch_size, + ) + return train_dataloader, {"val1": val_dataloader1, "val2": val_dataloader2} diff --git a/bsmetadata/input_pipeline.py b/bsmetadata/input_pipeline.py index 5a18b4da..2d460cc6 100644 --- a/bsmetadata/input_pipeline.py +++ 
b/bsmetadata/input_pipeline.py @@ -88,6 +88,11 @@ def get_dataloaders(tokenizer, cfg: DataConfig): if cfg.experiment == "with_metadata": from bsmetadata.experiments.with_metadata import get_dataloaders as fn + return fn(tokenizer, cfg) + + if cfg.experiment == "with_metadata_and_baseline_val": + from bsmetadata.experiments.with_metadata_and_baseline_val import get_dataloaders as fn + return fn(tokenizer, cfg) else: raise ValueError("You have not entered a valid experience name") diff --git a/bsmetadata/metadata_processors.py b/bsmetadata/metadata_processors.py index 74113c41..3d292578 100644 --- a/bsmetadata/metadata_processors.py +++ b/bsmetadata/metadata_processors.py @@ -117,7 +117,14 @@ class HtmlProcessor(MetadataProcessor): def process_local(self, metadata_attrs: Dict[str, Any]) -> Optional[Tuple[str, str]]: # We represent a html tag `T` by enclosing the corresponding text span with "<T>" and "</T>". # Example: An <b>apple</b> is an edible fruit. - return f"<{metadata_attrs['value']}>", f"</{metadata_attrs['value']}>" + # Attributes are rendered as space-separated `attr:"value"` pairs after the tag name; the closing tag carries only the tag name. + attributes = " ".join( + f'{attr}:"{value}"' + for attr, value in zip(metadata_attrs["value"]["attrs"]["attr"], metadata_attrs["value"]["attrs"]["value"]) + ) + if attributes: + attributes = " " + attributes + return f"<{metadata_attrs['value']['tag']}{attributes}>", f"</{metadata_attrs['value']['tag']}>" class UrlProcessor(MetadataProcessor): diff --git a/bsmetadata/metadata_utils.py b/bsmetadata/metadata_utils.py index 598734a3..3a612f92 100644 --- a/bsmetadata/metadata_utils.py +++ b/bsmetadata/metadata_utils.py @@ -15,6 +15,7 @@ """ import random from collections import defaultdict +from dataclasses import dataclass, field from typing import Any, Dict, List, Tuple from transformers import PreTrainedTokenizerFast @@ -121,6 +122,14 @@ def create_global_metadata_prefix(example: Dict[str, Any], cfg: MetadataConfig) return cfg.metadata_sep.join(sorted_metadata) + cfg.global_metadata_sep if sorted_metadata else "" +@dataclass +class MetadataIdxStorage: + start_idx_tag_with_content: dict = field(default_factory=(lambda:
defaultdict(list))) + end_idx_tag_with_content: dict = field(default_factory=(lambda: defaultdict(list))) + start_idx_tag_without_content: dict = field(default_factory=(lambda: defaultdict(list))) + end_idx_tag_without_content: dict = field(default_factory=(lambda: defaultdict(list))) + + def add_local_metadata_to_text(example: Dict[str, Any], cfg: MetadataConfig) -> Tuple[str, List[bool]]: """Adds local metadata (such as HTML tags and entity names) to the given input text. @@ -133,7 +142,7 @@ def add_local_metadata_to_text(example: Dict[str, Any], cfg: MetadataConfig) -> - the first element is the text with metadata; - the second element is a boolean mask where `mask[i]` is set iff `text[i]` is some kind of metadata. """ - metadata_start_texts, metadata_end_texts = defaultdict(list), defaultdict(list) + metadata_idx_storage = MetadataIdxStorage() # Filter and sort all metadata so that they are processed in the requested order. filtered_metadata = [md for md in example["metadata"] if md["type"] == "local" and md["key"] in cfg.metadata_list] @@ -151,27 +160,58 @@ def add_local_metadata_to_text(example: Dict[str, Any], cfg: MetadataConfig) -> char_start_idx = metadata.get("char_start_idx", -1) char_end_idx = metadata.get("char_end_idx", -1) - metadata_start_texts[char_start_idx].insert(0, start_text) - metadata_end_texts[char_end_idx].append(end_text) + if char_start_idx == char_end_idx: + metadata_idx_storage.start_idx_tag_without_content[char_start_idx].insert(0, start_text) + metadata_idx_storage.end_idx_tag_without_content[char_end_idx].append(end_text) + else: + metadata_idx_storage.start_idx_tag_with_content[char_start_idx].insert(0, start_text) + metadata_idx_storage.end_idx_tag_with_content[char_end_idx].append(end_text) # Build the final text with local metadata and the corresponding mask. 
text_with_local_metadata = [] metadata_mask = [] + def _add_metadata_to_text(metadata_text_list, text_with_local_metadata, metadata_mask): + for metadata_text in metadata_text_list: + text_with_local_metadata.append(metadata_text) + metadata_mask += [True] * len(metadata_text) + for idx, char in enumerate(example["text"]): + if idx in metadata_idx_storage.end_idx_tag_with_content: + metadata_text_list = metadata_idx_storage.end_idx_tag_with_content[idx] + _add_metadata_to_text(metadata_text_list, text_with_local_metadata, metadata_mask) + + if idx in metadata_idx_storage.start_idx_tag_without_content: + metadata_text_list = metadata_idx_storage.start_idx_tag_without_content[idx] + _add_metadata_to_text(metadata_text_list, text_with_local_metadata, metadata_mask) - if idx in metadata_start_texts: - for start_text in metadata_start_texts[idx]: - text_with_local_metadata.append(start_text) - metadata_mask += [True] * len(start_text) + if idx in metadata_idx_storage.end_idx_tag_without_content: + metadata_text_list = metadata_idx_storage.end_idx_tag_without_content[idx] + _add_metadata_to_text(metadata_text_list, text_with_local_metadata, metadata_mask) + + if idx in metadata_idx_storage.start_idx_tag_with_content: + metadata_text_list = metadata_idx_storage.start_idx_tag_with_content[idx] + _add_metadata_to_text(metadata_text_list, text_with_local_metadata, metadata_mask) text_with_local_metadata.append(char) metadata_mask += [False] - if idx + 1 in metadata_end_texts: - for end_text in metadata_end_texts[idx + 1]: - text_with_local_metadata.append(end_text) - metadata_mask += [True] * len(end_text) + idx += 1 + if idx in metadata_idx_storage.end_idx_tag_with_content: + metadata_text_list = metadata_idx_storage.end_idx_tag_with_content[idx] + _add_metadata_to_text(metadata_text_list, text_with_local_metadata, metadata_mask) + + if idx in metadata_idx_storage.start_idx_tag_without_content: + metadata_text_list = metadata_idx_storage.start_idx_tag_without_content[idx] + 
_add_metadata_to_text(metadata_text_list, text_with_local_metadata, metadata_mask) + + if idx in metadata_idx_storage.end_idx_tag_without_content: + metadata_text_list = metadata_idx_storage.end_idx_tag_without_content[idx] + _add_metadata_to_text(metadata_text_list, text_with_local_metadata, metadata_mask) + + if idx in metadata_idx_storage.start_idx_tag_with_content: + metadata_text_list = metadata_idx_storage.start_idx_tag_with_content[idx] + _add_metadata_to_text(metadata_text_list, text_with_local_metadata, metadata_mask) return "".join(text_with_local_metadata), metadata_mask diff --git a/bsmetadata/train.py b/bsmetadata/train.py index cacaf4f9..7d4f5434 100644 --- a/bsmetadata/train.py +++ b/bsmetadata/train.py @@ -1,6 +1,7 @@ import dataclasses import gc import json +import logging import math import os import sys @@ -17,10 +18,14 @@ from omegaconf import OmegaConf from tqdm.auto import tqdm as original_tqdm from transformers import AdamW, AutoModelForCausalLM, AutoTokenizer, get_scheduler, set_seed +from transformers.trainer_utils import IntervalStrategy from bsmetadata.input_pipeline import DataConfig, get_dataloaders +logger = logging.getLogger(__name__) + + @dataclass class CFG: data_config: DataConfig = DataConfig() @@ -42,10 +47,47 @@ class CFG: out_dir: str = field( default="output_dir", metadata={"help": "The output directory in which the trained model is saved."} ) - num_eval: int = field(default=3, metadata={"help": "The number of evaluations to perform during training."}) + + # logging_first_step : bool = field(default=False, metadata={"help": "Log the first global_step"}) + evaluation_strategy: IntervalStrategy = field( + default="STEPS", + metadata={"help": "The evaluation strategy to use."}, + ) + eval_num_per_epoch: int = field( + default=3, + metadata={ + "help": "If evaluation strategy is `epoch`. The number of evaluations to perform per epoch during training." 
+ }, + ) + eval_steps: int = field( + default=100, metadata={"help": "If evaluation strategy is `steps`. Run an evaluation every X steps."} + ) + + save_strategy: IntervalStrategy = field( + default="STEPS", + metadata={"help": "The checkpoint save strategy to use."}, + ) + save_num_per_epoch: int = field( + default=3, + metadata={"help": "If save strategy is `epoch`. The number of savings to perform per epoch during training."}, + ) + save_steps: int = field(default=500, metadata={"help": "Save checkpoint every X updates steps."}) + save_total_limit: Optional[int] = field( + default=None, + metadata={ + "help": ( + "Limit the total amount of checkpoints." + "Deletes the older checkpoints in the output_dir. Default is unlimited checkpoints" + ) + }, + ) # TODO!!! + model_name: str = field(default="gpt2", metadata={"help": "The name of the pretrained model to use."}) project_name: str = field(default="metadata_lm", metadata={"help": "The project name."}) + do_train: bool = field(default=True, metadata={"help": "Whether to run training."}) + do_eval: bool = field(default=True, metadata={"help": "Whether to run eval on the dev set."}) + cs = ConfigStore.instance() cs.store(name="config", node=CFG) @@ -104,7 +146,7 @@ def loss_fn(batch, outputs, metadata_mask=None): return loss -@hydra.main(config_name="config") +@hydra.main(config_path=None, config_name="config") def main(args: CFG) -> None: print(OmegaConf.to_yaml(args)) @@ -119,12 +161,23 @@ def main(args: CFG) -> None: os.makedirs(args.out_dir, exist_ok=True) + # Setup logging + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO if is_local_main_process else logging.WARN, + ) + # get dataloaders + logger.info("Load tokenizer") tokenizer = AutoTokenizer.from_pretrained(args.model_name) tokenizer.pad_token = tokenizer.eos_token + logger.info("Load dataloaders") train_dataloader, eval_dataloaders = get_dataloaders(tokenizer, 
args.data_config) + logger.info("The dataloaders have been build") # get model + logger.info("Load model") model = AutoModelForCausalLM.from_pretrained(args.model_name) # Optimizer @@ -156,10 +209,26 @@ def main(args: CFG) -> None: else: args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) - if args.num_eval < 1: + if args.evaluation_strategy == IntervalStrategy.EPOCH: + if args.eval_num_per_epoch < 1: + eval_per_n_step = args.max_train_steps + 1 + else: + eval_per_n_step = args.max_train_steps // args.eval_num_per_epoch + elif args.evaluation_strategy == IntervalStrategy.STEPS: + eval_per_n_step = args.eval_steps + else: # IntervalStrategy.NO eval_per_n_step = args.max_train_steps + 1 - else: - eval_per_n_step = args.max_train_steps // args.num_eval + + if args.save_strategy == IntervalStrategy.EPOCH: + if args.save_num_per_epoch < 1: + save_per_n_step = args.max_train_steps + 1 + else: + save_per_n_step = args.max_train_steps // args.save_num_per_epoch + elif args.save_strategy == IntervalStrategy.STEPS: + save_per_n_step = args.save_steps + else: # IntervalStrategy.NO + save_per_n_step = args.max_train_steps + 1 + scheduler = get_scheduler( name=args.lr_scheduler_type, optimizer=optimizer, @@ -185,56 +254,74 @@ def evaluate(eval_dataloader): model.train() return {"perplexity": perplexity} - progress_bar = tqdm(range(args.max_train_steps), desc="training") - completed_steps = 0 - logger = Logger(is_local_main_process, project=args.project_name, config=args) - for epoch in range(args.num_train_epochs): - model.train() - for step, batch in enumerate(train_dataloader): - # pop labels because we want to calculate loss ourselves - labels = batch.pop("labels") - metadata_mask = batch.pop("metadata_mask", None) - outputs = model(**batch) - batch["labels"] = labels - loss = loss_fn(batch, outputs, metadata_mask) - - logger.log({"loss": loss}) - loss = loss / args.gradient_accumulation_steps - accelerator.backward(loss) - - do_step = step % 
args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1 - if do_step: - # accelerator.clip_grad_norm_(model.parameters(), 1.0) - optimizer.step() - scheduler.step() - optimizer.zero_grad() - progress_bar.update(1) - completed_steps += 1 - else: - continue - do_eval = completed_steps > 0 and completed_steps % eval_per_n_step == 0 - if do_eval: - for key, eval_dataloader in eval_dataloaders.items(): - metrics = evaluate(eval_dataloader) - logger.log({key: metrics}) - - # logger.info(f"epoch {epoch}: perplexity: {perplexity}") - if is_local_main_process: - save_dict = { - "epoch": epoch + 1, - "state_dict": accelerator.unwrap_model(model).state_dict(), - "optimizer": optimizer.state_dict(), - "scheduler": scheduler.state_dict(), - } - torch.save( - save_dict, - os.path.join(args.out_dir, f"checkpoint-{completed_steps}step.pt"), - ) - del save_dict - gc.collect() - if completed_steps >= args.max_train_steps: - break - logger.close() + if args.do_train: + logger.info("***** Start training *****") + # Train! 
+ progress_bar = tqdm(range(args.max_train_steps), desc="training") + completed_steps = 0 + logger_metrics = Logger(is_local_main_process, project=args.project_name, config=args) + + do_eval = args.do_eval + if do_eval: + logger.info("***** Evaluation *****") + for key, eval_dataloader in eval_dataloaders.items(): + metrics = evaluate(eval_dataloader) + logger_metrics.log({key: metrics}) + # logger_metrics.info(f"epoch {epoch}: perplexity: {perplexity}") + + for epoch in range(args.num_train_epochs): + model.train() + for step, batch in enumerate(train_dataloader): + # pop labels because we want to calculate loss ourselves + labels = batch.pop("labels") + metadata_mask = batch.pop("metadata_mask", None) + outputs = model(**batch) + batch["labels"] = labels + loss = loss_fn(batch, outputs, metadata_mask) + + logger_metrics.log({"loss": loss, "lr": optimizer.param_groups[0]["lr"]}) + loss = loss / args.gradient_accumulation_steps + accelerator.backward(loss) + + do_step = step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1 + if do_step: + # accelerator.clip_grad_norm_(model.parameters(), 1.0) + optimizer.step() + scheduler.step() + optimizer.zero_grad() + progress_bar.update(1) + completed_steps += 1 + else: + continue + + do_eval = args.do_eval and completed_steps > 0 and completed_steps % eval_per_n_step == 0 + if do_eval: + logger.info("***** Evaluation *****") + for key, eval_dataloader in eval_dataloaders.items(): + metrics = evaluate(eval_dataloader) + logger_metrics.log({key: metrics}) + # logger_metrics.info(f"epoch {epoch}: perplexity: {perplexity}") + + do_save = completed_steps > 0 and completed_steps % save_per_n_step == 0 + if do_save: + logger.info(f"***** Saving at {args.out_dir} *****") + if is_local_main_process: + save_dict = { + "epoch": epoch + 1, + "state_dict": accelerator.unwrap_model(model).state_dict(), + "optimizer": optimizer.state_dict(), + "scheduler": scheduler.state_dict(), + } + torch.save( + save_dict, + 
os.path.join(args.out_dir, f"checkpoint-{completed_steps}step.pt"), + ) + del save_dict + gc.collect() + if completed_steps >= args.max_train_steps: + break + logger_metrics.close() + logger.info("***** Training finished *****") if is_local_main_process and args.out_dir is not None: accelerator.wait_for_everyone() diff --git a/experiments/html/SLURM/crime_and_punish_test/create_dataset.slurm b/experiments/html/SLURM/crime_and_punish_test/create_dataset.slurm new file mode 100644 index 00000000..e746380c --- /dev/null +++ b/experiments/html/SLURM/crime_and_punish_test/create_dataset.slurm @@ -0,0 +1,29 @@ +#!/bin/bash +#SBATCH --job-name=modelling-metadata-html-create-dataset-test # job name +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! +#SBATCH --cpus-per-task=8 # number of cores per tasks +#SBATCH --hint=nomultithread # we get physical cores not logical +#SBATCH --time 00:10:00 # maximum execution time (HH:MM:SS) +#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name +#SBATCH --error=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.err # error file name +#SBATCH --account=six@cpu # account + +set -x -e + +source $HOME/start-user + +export HF_DATASETS_OFFLINE=1 +export TRANSFORMERS_OFFLINE=1 + +cd $WORK/repos/sync/metadata/ + +python experiments/html/start_training.py \ +data_config.experiment="with_metadata" \ +data_config.metadata_list=["html"] \ +data_config.max_seq_len=1024 \ +data_config.dataset_name="crime_and_punish" \ +data_config.extension="json" \ +data_config.preprocessing_num_workers=8 \ +do_train=False \ +do_eval=False \ \ No newline at end of file diff --git a/experiments/html/SLURM/crime_and_punish_test/load_dataset.py b/experiments/html/SLURM/crime_and_punish_test/load_dataset.py new file mode 100644 index 00000000..3ce34fd1 --- /dev/null +++ b/experiments/html/SLURM/crime_and_punish_test/load_dataset.py @@ -0,0 +1,88 @@ +import logging +import sys + +import hydra +from 
datasets import config, load_dataset +from hydra.core.config_store import ConfigStore + +from bsmetadata.input_pipeline import DataConfig +from bsmetadata.train import show_help + + +logger = logging.getLogger(__name__) + +cs = ConfigStore.instance() +cs.store(name="data_config", node=DataConfig) + + +@hydra.main(config_name="data_config") +def main(args: DataConfig) -> None: + data_files = {} + if args.train_file is not None: + data_files["train"] = args.train_file + if args.validation_file is not None: + data_files["validation"] = args.validation_file + + if not data_files: + data_files = None + + logger.info(config.HF_DATASETS_CACHE) + if args.dataset_name is not None: + logger.info( + "Downloading and loading a dataset from the hub" + f"{args.dataset_name}, {args.dataset_config_name}, data_files={data_files}, cache_dir={args.cache_dir}," + ) + # Downloading and loading a dataset from the hub. + raw_datasets = load_dataset( + args.dataset_name, + args.dataset_config_name, + data_files=data_files, + cache_dir=args.cache_dir, + keep_in_memory=False, + ) + + if "validation" not in raw_datasets.keys(): + logger.info("validation not in raw_datasets.keys()") + raw_datasets["validation"] = load_dataset( + args.dataset_name, + args.dataset_config_name, + split=f"train[:{args.validation_split_percentage}%]", + cache_dir=args.cache_dir, + ) + raw_datasets["train"] = load_dataset( + args.dataset_name, + args.dataset_config_name, + split=f"train[{args.validation_split_percentage}%:]", + cache_dir=args.cache_dir, + ) + else: + extension = args.train_file.split(".")[-1] if not args.extension else args.extension + if extension == "txt": + raise ValueError( + "You have entered a text file for the train data, but this type of file cannot contain metadata " + "columns. Wouldn't you rather have a file in json/jsonl or pandas format?" 
+ ) + if extension == "jsonl": + extension = "json" + raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=args.cache_dir) + + if "validation" not in raw_datasets.keys(): + raw_datasets["validation"] = load_dataset( + extension, + data_files=data_files, + split=f"train[:{args.validation_split_percentage}%]", + cache_dir=args.cache_dir, + ) + raw_datasets["train"] = load_dataset( + extension, + data_files=data_files, + split=f"train[{args.validation_split_percentage}%:]", + cache_dir=args.cache_dir, + ) + + +if __name__ == "__main__": + if "--help" in sys.argv or "-h" in sys.argv: + show_help() + sys.exit() + main() diff --git a/experiments/html/SLURM/crime_and_punish_test/load_dataset.slurm b/experiments/html/SLURM/crime_and_punish_test/load_dataset.slurm new file mode 100644 index 00000000..8c32659a --- /dev/null +++ b/experiments/html/SLURM/crime_and_punish_test/load_dataset.slurm @@ -0,0 +1,21 @@ +#!/bin/bash +#SBATCH --job-name=modelling-metadata-html-download-dataset-test # job name +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! 
+#SBATCH --cpus-per-task=8 # number of cores per tasks +#SBATCH --hint=nomultithread # we get physical cores not logical +#SBATCH --gres=gpu:0 # number of gpus +#SBATCH --time 00:02:00 # maximum execution time (HH:MM:SS) +#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name +#SBATCH --error=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.err # error file name +#SBATCH --account=six@gpu # account +#SBATCH -p compil # partition with internet + +set -x -e + +source $HOME/start-user + +cd $WORK/repos/sync/metadata/ + +python experiments/html/SLURM/crime_and_punish_test/load_dataset.py \ +dataset_name="crime_and_punish" \ \ No newline at end of file diff --git a/experiments/html/SLURM/experiment_1/README.md b/experiments/html/SLURM/experiment_1/README.md new file mode 100644 index 00000000..f0732a86 --- /dev/null +++ b/experiments/html/SLURM/experiment_1/README.md @@ -0,0 +1,9 @@ +# Experiment 1 + +## Run experiment on JZ + +1. Download the tokenizer and the model +2. Download the dataset on a partition with internet +3. Preprocess the dataset on a cpu-only partition +4. Run the training on a gpu 16gb partition + diff --git a/experiments/html/SLURM/experiment_1/create_dataset.slurm b/experiments/html/SLURM/experiment_1/create_dataset.slurm new file mode 100644 index 00000000..99e09f09 --- /dev/null +++ b/experiments/html/SLURM/experiment_1/create_dataset.slurm @@ -0,0 +1,32 @@ +#!/bin/bash +#SBATCH --job-name=modelling-metadata-html-create-dataset-test # job name +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node!
+#SBATCH --cpus-per-task=40 # number of cores per tasks +#SBATCH --hint=nomultithread # we get physical cores not logical +#SBATCH --time 01:00:00 # maximum execution time (HH:MM:SS) +#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name +#SBATCH --error=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.err # error file name +#SBATCH --account=six@cpu # account + +set -x -e + +source $HOME/start-user + +export HF_DATASETS_OFFLINE=1 +export TRANSFORMERS_OFFLINE=1 + +cd $WORK/repos/sync/metadata/ + +python experiments/html/start_training.py \ +data_config.experiment="with_metadata" \ +data_config.metadata_list=["html"] \ +data_config.max_seq_len=1024 \ +data_config.dataset_name="${DATASETS_CUSTOM}/Natural_Questions_HTML" \ +data_config.train_file="nq-train-*.jsonl.gz" \ +data_config.validation_file="nq-dev-*.jsonl.gz" \ +data_config.extension="json" \ +data_config.preprocessing_num_workers=80 \ +out_dir="${SCRATCH}/metadata_outputs" \ +do_train=False \ +do_eval=False \ \ No newline at end of file diff --git a/experiments/html/SLURM/experiment_1/do_training.slurm b/experiments/html/SLURM/experiment_1/do_training.slurm new file mode 100644 index 00000000..8b2f2f71 --- /dev/null +++ b/experiments/html/SLURM/experiment_1/do_training.slurm @@ -0,0 +1,42 @@ +#!/bin/bash +#SBATCH --job-name=modelling-metadata-html-do-train-test # job name +#SBATCH --ntasks=1 # number of MP tasks +#SBATCH --constraint=v100-16g +#SBATCH --gres=gpu:1 # number of GPUs per node +#SBATCH --cpus-per-task=8 # number of cores per tasks +#SBATCH --hint=nomultithread # we get physical cores not logical +#SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS) +#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name +#SBATCH --error=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.err # error file name +#SBATCH --account=six@gpu # account + +set -x -e + +source $HOME/start-user + +export HF_DATASETS_OFFLINE=1 +export 
TRANSFORMERS_OFFLINE=1 +# be careful about the cache folder for Wandb +export WANDB_MODE=offline +export WANDB_DIR=$SCRATCH + +cd $WORK/repos/sync/metadata/ + +python experiments/html/start_training.py \ +data_config.experiment="with_metadata" \ +data_config.metadata_list=["html"] \ +data_config.max_seq_len=1024 \ +data_config.dataset_name="${DATASETS_CUSTOM}/Natural_Questions_HTML" \ +data_config.train_file="nq-train-0\[0-2\].jsonl.gz" \ +data_config.validation_file="nq-dev-00.jsonl.gz" \ +data_config.extension="json" \ +data_config.preprocessing_num_workers=80 \ +data_config.per_device_eval_batch_size=3 \ +data_config.per_device_train_batch_size=3 \ +out_dir="${SCRATCH}/metadata_outputs" \ +do_train=True \ +do_eval=True \ +evaluation_strategy=STEPS \ +eval_steps=1000 \ +save_strategy=STEPS \ +save_steps=1000 \ \ No newline at end of file diff --git a/experiments/html/SLURM/experiment_1/load_dataset.py b/experiments/html/SLURM/experiment_1/load_dataset.py new file mode 100644 index 00000000..3ce34fd1 --- /dev/null +++ b/experiments/html/SLURM/experiment_1/load_dataset.py @@ -0,0 +1,88 @@ +import logging +import sys + +import hydra +from datasets import config, load_dataset +from hydra.core.config_store import ConfigStore + +from bsmetadata.input_pipeline import DataConfig +from bsmetadata.train import show_help + + +logger = logging.getLogger(__name__) + +cs = ConfigStore.instance() +cs.store(name="data_config", node=DataConfig) + + +@hydra.main(config_name="data_config") +def main(args: DataConfig) -> None: + data_files = {} + if args.train_file is not None: + data_files["train"] = args.train_file + if args.validation_file is not None: + data_files["validation"] = args.validation_file + + if not data_files: + data_files = None + + logger.info(config.HF_DATASETS_CACHE) + if args.dataset_name is not None: + logger.info( + "Downloading and loading a dataset from the hub" + f"{args.dataset_name}, {args.dataset_config_name}, data_files={data_files}, 
cache_dir={args.cache_dir}," + ) + # Downloading and loading a dataset from the hub. + raw_datasets = load_dataset( + args.dataset_name, + args.dataset_config_name, + data_files=data_files, + cache_dir=args.cache_dir, + keep_in_memory=False, + ) + + if "validation" not in raw_datasets.keys(): + logger.info("validation not in raw_datasets.keys()") + raw_datasets["validation"] = load_dataset( + args.dataset_name, + args.dataset_config_name, + split=f"train[:{args.validation_split_percentage}%]", + cache_dir=args.cache_dir, + ) + raw_datasets["train"] = load_dataset( + args.dataset_name, + args.dataset_config_name, + split=f"train[{args.validation_split_percentage}%:]", + cache_dir=args.cache_dir, + ) + else: + extension = args.train_file.split(".")[-1] if not args.extension else args.extension + if extension == "txt": + raise ValueError( + "You have entered a text file for the train data, but this type of file cannot contain metadata " + "columns. Wouldn't you rather have a file in json/jsonl or pandas format?" 
+ ) + if extension == "jsonl": + extension = "json" + raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=args.cache_dir) + + if "validation" not in raw_datasets.keys(): + raw_datasets["validation"] = load_dataset( + extension, + data_files=data_files, + split=f"train[:{args.validation_split_percentage}%]", + cache_dir=args.cache_dir, + ) + raw_datasets["train"] = load_dataset( + extension, + data_files=data_files, + split=f"train[{args.validation_split_percentage}%:]", + cache_dir=args.cache_dir, + ) + + +if __name__ == "__main__": + if "--help" in sys.argv or "-h" in sys.argv: + show_help() + sys.exit() + main() diff --git a/experiments/html/SLURM/experiment_1/load_dataset.slurm b/experiments/html/SLURM/experiment_1/load_dataset.slurm new file mode 100644 index 00000000..ad6cb82c --- /dev/null +++ b/experiments/html/SLURM/experiment_1/load_dataset.slurm @@ -0,0 +1,30 @@ +#!/bin/bash +#SBATCH --job-name=modelling-metadata-html-download-dataset-test # job name +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! 
+#SBATCH --cpus-per-task=8 # number of cores per tasks +#SBATCH --hint=nomultithread # we get physical cores not logical +#SBATCH --gres=gpu:0 # number of gpus +#SBATCH --time 02:30:00 # maximum execution time (HH:MM:SS) +#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name +#SBATCH --error=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.err # error file name +#SBATCH --account=six@gpu # account +#SBATCH -p compil # partition with internet + +set -x -e + +source $HOME/start-user + +# Uncomment if the repo doesn't exist +# cd $DATASETS_CUSTOM/ +# git clone https://huggingface.co/datasets/SaulLu/Natural_Questions_HTML +# cd Natural_Questions_HTML/ +# git lfs install +# git lfs pull origin master + +cd $WORK/repos/sync/metadata/ + +python experiments/html/SLURM/init_experiment/load_dataset.py \ +dataset_name="${DATASETS_CUSTOM}/Natural_Questions_HTML" \ +train_file="nq-train-*.jsonl.gz" \ +validation_file="nq-dev-*.jsonl.gz" \ No newline at end of file diff --git a/experiments/html/SLURM/experiment_1/multi_steps.bash b/experiments/html/SLURM/experiment_1/multi_steps.bash new file mode 100644 index 00000000..d6f4a745 --- /dev/null +++ b/experiments/html/SLURM/experiment_1/multi_steps.bash @@ -0,0 +1,2 @@ +JID_JOB1=`sbatch create_dataset.slurm | cut -d " " -f 4` +sbatch --dependency=afterok:$JID_JOB1 do_training.slurm \ No newline at end of file diff --git a/experiments/html/SLURM/experiment_2/README.md b/experiments/html/SLURM/experiment_2/README.md new file mode 100644 index 00000000..f0732a86 --- /dev/null +++ b/experiments/html/SLURM/experiment_2/README.md @@ -0,0 +1,9 @@ +# Experiment 2 + +## Run experiment on JZ + +1. Download the tokenizer and the model +2. Download the dataset on a partition with internet +3. Preprocess the dataset on a cpu-only partition +4. 
Run the training on a gpu 16gb partition + diff --git a/experiments/html/SLURM/experiment_2/create_dataset.slurm b/experiments/html/SLURM/experiment_2/create_dataset.slurm new file mode 100644 index 00000000..f68eacdb --- /dev/null +++ b/experiments/html/SLURM/experiment_2/create_dataset.slurm @@ -0,0 +1,32 @@ +#!/bin/bash +#SBATCH --job-name=modelling-metadata-html-create-dataset-test # job name +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! +#SBATCH --cpus-per-task=40 # number of cores per tasks +#SBATCH --hint=nomultithread # we get physical cores not logical +#SBATCH --time 01:00:00 # maximum execution time (HH:MM:SS) +#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name +#SBATCH --error=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.err # error file name +#SBATCH --account=six@cpu # account + +set -x -e + +source $HOME/start-user + +export HF_DATASETS_OFFLINE=1 +export TRANSFORMERS_OFFLINE=1 + +cd $WORK/repos/sync/metadata/ + +python experiments/html/start_training.py \ +data_config.experiment="with_metadata_and_baseline_val" \ +data_config.metadata_list=["html"] \ +data_config.max_seq_len=1024 \ +data_config.dataset_name="${DATASETS_CUSTOM}/Natural_Questions_HTML" \ +data_config.train_file="nq-train-0\[0-2\].jsonl.gz" \ +data_config.validation_file="nq-dev-00.jsonl.gz" \ +data_config.extension="json" \ +data_config.preprocessing_num_workers=80 \ +out_dir="${SCRATCH}/metadata_outputs" \ +do_train=False \ +do_eval=False \ \ No newline at end of file diff --git a/experiments/html/SLURM/experiment_2/do_training.slurm b/experiments/html/SLURM/experiment_2/do_training.slurm new file mode 100644 index 00000000..4edacac4 --- /dev/null +++ b/experiments/html/SLURM/experiment_2/do_training.slurm @@ -0,0 +1,43 @@ +#!/bin/bash +#SBATCH --job-name=modelling-metadata-html-do-train-test # job name +#SBATCH --ntasks=1 # number of MP tasks +#SBATCH --constraint=v100-16g +#SBATCH --gres=gpu:1 # 
number of GPUs per node +#SBATCH --cpus-per-task=8 # number of cores per tasks +#SBATCH --hint=nomultithread # we get physical cores not logical +#SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS) +#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name +#SBATCH --error=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.err # error file name +#SBATCH --account=six@gpu # account + +set -x -e + +source $HOME/start-user + +export HF_DATASETS_OFFLINE=1 +export TRANSFORMERS_OFFLINE=1 +# be careful about the cache folder for Wandb +export WANDB_MODE=offline +export WANDB_DIR=$SCRATCH + +cd $WORK/repos/sync/metadata/ + +python experiments/html/start_training.py \ +data_config.experiment="with_metadata_and_baseline_val" \ +data_config.metadata_list=["html"] \ +data_config.max_seq_len=1024 \ +data_config.dataset_name="${DATASETS_CUSTOM}/Natural_Questions_HTML" \ +data_config.train_file="nq-train-0\[0-2\].jsonl.gz" \ +data_config.validation_file="nq-dev-00.jsonl.gz" \ +data_config.extension="json" \ +data_config.preprocessing_num_workers=80 \ +data_config.per_device_eval_batch_size=3 \ +data_config.per_device_train_batch_size=3 \ +out_dir="${SCRATCH}/metadata_outputs/${SLURM_JOB_ID}" \ +do_train=True \ +do_eval=True \ +evaluation_strategy=STEPS \ +eval_steps=10 \ +save_strategy=STEPS \ +save_steps=10 \ +gradient_accumulation_steps=50\ \ No newline at end of file diff --git a/experiments/html/SLURM/experiment_2/load_dataset.py b/experiments/html/SLURM/experiment_2/load_dataset.py new file mode 100644 index 00000000..3ce34fd1 --- /dev/null +++ b/experiments/html/SLURM/experiment_2/load_dataset.py @@ -0,0 +1,88 @@ +import logging +import sys + +import hydra +from datasets import config, load_dataset +from hydra.core.config_store import ConfigStore + +from bsmetadata.input_pipeline import DataConfig +from bsmetadata.train import show_help + + +logger = logging.getLogger(__name__) + +cs = ConfigStore.instance() +cs.store(name="data_config", 
node=DataConfig) + + +@hydra.main(config_name="data_config") +def main(args: DataConfig) -> None: + data_files = {} + if args.train_file is not None: + data_files["train"] = args.train_file + if args.validation_file is not None: + data_files["validation"] = args.validation_file + + if not data_files: + data_files = None + + logger.info(config.HF_DATASETS_CACHE) + if args.dataset_name is not None: + logger.info( + "Downloading and loading a dataset from the hub" + f"{args.dataset_name}, {args.dataset_config_name}, data_files={data_files}, cache_dir={args.cache_dir}," + ) + # Downloading and loading a dataset from the hub. + raw_datasets = load_dataset( + args.dataset_name, + args.dataset_config_name, + data_files=data_files, + cache_dir=args.cache_dir, + keep_in_memory=False, + ) + + if "validation" not in raw_datasets.keys(): + logger.info("validation not in raw_datasets.keys()") + raw_datasets["validation"] = load_dataset( + args.dataset_name, + args.dataset_config_name, + split=f"train[:{args.validation_split_percentage}%]", + cache_dir=args.cache_dir, + ) + raw_datasets["train"] = load_dataset( + args.dataset_name, + args.dataset_config_name, + split=f"train[{args.validation_split_percentage}%:]", + cache_dir=args.cache_dir, + ) + else: + extension = args.train_file.split(".")[-1] if not args.extension else args.extension + if extension == "txt": + raise ValueError( + "You have entered a text file for the train data, but this type of file cannot contain metadata " + "columns. Wouldn't you rather have a file in json/jsonl or pandas format?" 
+ ) + if extension == "jsonl": + extension = "json" + raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=args.cache_dir) + + if "validation" not in raw_datasets.keys(): + raw_datasets["validation"] = load_dataset( + extension, + data_files=data_files, + split=f"train[:{args.validation_split_percentage}%]", + cache_dir=args.cache_dir, + ) + raw_datasets["train"] = load_dataset( + extension, + data_files=data_files, + split=f"train[{args.validation_split_percentage}%:]", + cache_dir=args.cache_dir, + ) + + +if __name__ == "__main__": + if "--help" in sys.argv or "-h" in sys.argv: + show_help() + sys.exit() + main() diff --git a/experiments/html/SLURM/experiment_2/load_dataset.slurm b/experiments/html/SLURM/experiment_2/load_dataset.slurm new file mode 100644 index 00000000..ad6cb82c --- /dev/null +++ b/experiments/html/SLURM/experiment_2/load_dataset.slurm @@ -0,0 +1,30 @@ +#!/bin/bash +#SBATCH --job-name=modelling-metadata-html-download-dataset-test # job name +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! 
+#SBATCH --cpus-per-task=8 # number of cores per tasks +#SBATCH --hint=nomultithread # we get physical cores not logical +#SBATCH --gres=gpu:0 # number of gpus +#SBATCH --time 02:30:00 # maximum execution time (HH:MM:SS) +#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name +#SBATCH --error=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.err # error file name +#SBATCH --account=six@gpu # account +#SBATCH -p compil # partition with internet + +set -x -e + +source $HOME/start-user + +# Uncomment if the repo doesn't exist +# cd $DATASETS_CUSTOM/ +# git clone https://huggingface.co/datasets/SaulLu/Natural_Questions_HTML +# cd Natural_Questions_HTML/ +# git lfs install +# git lfs pull origin master + +cd $WORK/repos/sync/metadata/ + +python experiments/html/SLURM/init_experiment/load_dataset.py \ +dataset_name="${DATASETS_CUSTOM}/Natural_Questions_HTML" \ +train_file="nq-train-*.jsonl.gz" \ +validation_file="nq-dev-*.jsonl.gz" \ No newline at end of file diff --git a/experiments/html/SLURM/experiment_2/multi_steps.bash b/experiments/html/SLURM/experiment_2/multi_steps.bash new file mode 100644 index 00000000..d6f4a745 --- /dev/null +++ b/experiments/html/SLURM/experiment_2/multi_steps.bash @@ -0,0 +1,2 @@ +JID_JOB1=`sbatch create_dataset.slurm | cut -d " " -f 4` +sbatch --dependency=afterok:$JID_JOB1 do_training.slurm \ No newline at end of file diff --git a/experiments/html/SLURM/init_experiment/create_dataset.slurm b/experiments/html/SLURM/init_experiment/create_dataset.slurm new file mode 100644 index 00000000..d4d2740c --- /dev/null +++ b/experiments/html/SLURM/init_experiment/create_dataset.slurm @@ -0,0 +1,31 @@ +#!/bin/bash +#SBATCH --job-name=modelling-metadata-html-create-dataset-test # job name +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! 
+#SBATCH --cpus-per-task=8 # number of cores per tasks +#SBATCH --hint=nomultithread # we get physical cores not logical +#SBATCH --time 00:10:00 # maximum execution time (HH:MM:SS) +#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name +#SBATCH --error=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.err # error file name +#SBATCH --account=six@cpu # account + +set -x -e + +source $HOME/start-user + +export HF_DATASETS_OFFLINE=1 +export TRANSFORMERS_OFFLINE=1 + +cd $WORK/repos/sync/metadata/ + +python experiments/html/start_training.py \ +data_config.experiment="with_metadata" \ +data_config.metadata_list=["html"] \ +data_config.max_seq_len=1024 \ +data_config.dataset_name="${DATASETS_CUSTOM}/Natural_Questions_HTML_Toy" \ +data_config.train_file="nq-train-*.jsonl.gz" \ +data_config.validation_file="nq-dev-*.jsonl.gz" \ +data_config.extension="json" \ +data_config.preprocessing_num_workers=8 \ +do_train=False \ +do_eval=False \ \ No newline at end of file diff --git a/experiments/html/SLURM/init_experiment/do_training.slurm b/experiments/html/SLURM/init_experiment/do_training.slurm new file mode 100644 index 00000000..00d7fc74 --- /dev/null +++ b/experiments/html/SLURM/init_experiment/do_training.slurm @@ -0,0 +1,35 @@ +#!/bin/bash +#SBATCH --job-name=modelling-metadata-html-do-train-test # job name +#SBATCH --ntasks=1 # number of MP tasks +#SBATCH --constraint=v100-16g +#SBATCH --gres=gpu:1 # number of GPUs per node +#SBATCH --cpus-per-task=8 # number of cores per tasks +#SBATCH --hint=nomultithread # we get physical cores not logical +#SBATCH --time 00:10:00 # maximum execution time (HH:MM:SS) +#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name +#SBATCH --error=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.err # error file name +#SBATCH --account=six@gpu # account + +set -x -e + +source $HOME/start-user + +export HF_DATASETS_OFFLINE=1 +export TRANSFORMERS_OFFLINE=1 +# be careful about the 
cache folder for Wandb +export WANDB_MODE=offline + +cd $WORK/repos/sync/metadata/ + +python experiments/html/start_training.py \ +data_config.experiment="with_metadata" \ +data_config.metadata_list=["html"] \ +data_config.max_seq_len=1024 \ +data_config.dataset_name="${DATASETS_CUSTOM}/Natural_Questions_HTML_Toy" \ +data_config.train_file="nq-train-*.jsonl.gz" \ +data_config.validation_file="nq-dev-*.jsonl.gz" \ +data_config.extension="json" \ +data_config.preprocessing_num_workers=6 \ +out_dir="${SCRATCH}/metadata_outputs" \ +do_train=True \ +do_eval=True \ \ No newline at end of file diff --git a/experiments/html/SLURM/init_experiment/load_dataset.py b/experiments/html/SLURM/init_experiment/load_dataset.py new file mode 100644 index 00000000..3ce34fd1 --- /dev/null +++ b/experiments/html/SLURM/init_experiment/load_dataset.py @@ -0,0 +1,88 @@ +import logging +import sys + +import hydra +from datasets import config, load_dataset +from hydra.core.config_store import ConfigStore + +from bsmetadata.input_pipeline import DataConfig +from bsmetadata.train import show_help + + +logger = logging.getLogger(__name__) + +cs = ConfigStore.instance() +cs.store(name="data_config", node=DataConfig) + + +@hydra.main(config_name="data_config") +def main(args: DataConfig) -> None: + data_files = {} + if args.train_file is not None: + data_files["train"] = args.train_file + if args.validation_file is not None: + data_files["validation"] = args.validation_file + + if not data_files: + data_files = None + + logger.info(config.HF_DATASETS_CACHE) + if args.dataset_name is not None: + logger.info( + "Downloading and loading a dataset from the hub" + f"{args.dataset_name}, {args.dataset_config_name}, data_files={data_files}, cache_dir={args.cache_dir}," + ) + # Downloading and loading a dataset from the hub. 
+ raw_datasets = load_dataset( + args.dataset_name, + args.dataset_config_name, + data_files=data_files, + cache_dir=args.cache_dir, + keep_in_memory=False, + ) + + if "validation" not in raw_datasets.keys(): + logger.info("validation not in raw_datasets.keys()") + raw_datasets["validation"] = load_dataset( + args.dataset_name, + args.dataset_config_name, + split=f"train[:{args.validation_split_percentage}%]", + cache_dir=args.cache_dir, + ) + raw_datasets["train"] = load_dataset( + args.dataset_name, + args.dataset_config_name, + split=f"train[{args.validation_split_percentage}%:]", + cache_dir=args.cache_dir, + ) + else: + extension = args.train_file.split(".")[-1] if not args.extension else args.extension + if extension == "txt": + raise ValueError( + "You have entered a text file for the train data, but this type of file cannot contain metadata " + "columns. Wouldn't you rather have a file in json/jsonl or pandas format?" + ) + if extension == "jsonl": + extension = "json" + raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=args.cache_dir) + + if "validation" not in raw_datasets.keys(): + raw_datasets["validation"] = load_dataset( + extension, + data_files=data_files, + split=f"train[:{args.validation_split_percentage}%]", + cache_dir=args.cache_dir, + ) + raw_datasets["train"] = load_dataset( + extension, + data_files=data_files, + split=f"train[{args.validation_split_percentage}%:]", + cache_dir=args.cache_dir, + ) + + +if __name__ == "__main__": + if "--help" in sys.argv or "-h" in sys.argv: + show_help() + sys.exit() + main() diff --git a/experiments/html/SLURM/init_experiment/load_dataset.slurm b/experiments/html/SLURM/init_experiment/load_dataset.slurm new file mode 100644 index 00000000..6f0761f7 --- /dev/null +++ b/experiments/html/SLURM/init_experiment/load_dataset.slurm @@ -0,0 +1,30 @@ +#!/bin/bash +#SBATCH --job-name=modelling-metadata-html-download-dataset-test # job name +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 # 
crucial - only 1 task per dist per node! +#SBATCH --cpus-per-task=8 # number of cores per tasks +#SBATCH --hint=nomultithread # we get physical cores not logical +#SBATCH --gres=gpu:0 # number of gpus +#SBATCH --time 00:02:00 # maximum execution time (HH:MM:SS) +#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name +#SBATCH --error=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.err # error file name +#SBATCH --account=six@gpu # account +#SBATCH -p compil # partition with internet + +set -x -e + +source $HOME/start-user + +# Uncomment if the repo doesn't exist +# cd $DATASETS_CUSTOM/ +# git clone https://huggingface.co/datasets/SaulLu/Natural_Questions_HTML_Toy +# cd Natural_Questions_HTML_Toy/ +# git lfs install +# git lfs pull origin master + +cd $WORK/repos/sync/metadata/ + +python experiments/html/SLURM/init_experiment/load_dataset.py \ +dataset_name="${DATASETS_CUSTOM}/Natural_Questions_HTML_Toy" \ +train_file="nq-train-*.jsonl.gz" \ +validation_file="nq-dev-*.jsonl.gz" \ No newline at end of file diff --git a/experiments/html/SLURM/loading_scripts/load_tokenizer_and_model.py b/experiments/html/SLURM/loading_scripts/load_tokenizer_and_model.py new file mode 100644 index 00000000..0c4c42dd --- /dev/null +++ b/experiments/html/SLURM/loading_scripts/load_tokenizer_and_model.py @@ -0,0 +1,39 @@ +import logging +import sys + +import hydra +import transformers.utils.logging as logging_transformers +from hydra.core.config_store import ConfigStore +from transformers import AutoModelForCausalLM, AutoTokenizer + +from bsmetadata.train import CFG, show_help + + +# Setup logging +logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO +) +logging_transformers.set_verbosity_info() +logging_transformers.enable_default_handler() +logging_transformers.enable_explicit_format() + +logger = logging.getLogger(__name__) + +cs = ConfigStore.instance() 
+cs.store(name="config", node=CFG) + + +@hydra.main(config_path=None, config_name="config") +def main(args: CFG) -> None: + # get dataloaders + _ = AutoTokenizer.from_pretrained(args.model_name) + + # get model + _ = AutoModelForCausalLM.from_pretrained(args.model_name) + + +if __name__ == "__main__": + if "--help" in sys.argv or "-h" in sys.argv: + show_help() + sys.exit() + main() diff --git a/experiments/html/SLURM/loading_scripts/load_tokenizer_and_model.slurm b/experiments/html/SLURM/loading_scripts/load_tokenizer_and_model.slurm new file mode 100644 index 00000000..63ef488f --- /dev/null +++ b/experiments/html/SLURM/loading_scripts/load_tokenizer_and_model.slurm @@ -0,0 +1,21 @@ +#!/bin/bash +#SBATCH --job-name=modelling-metadata-html-download-tokenizer-and-model # job name +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! +#SBATCH --cpus-per-task=8 # number of cores per tasks +#SBATCH --hint=nomultithread # we get physical cores not logical +#SBATCH --gres=gpu:0 # number of gpus +#SBATCH --time 00:10:00 # maximum execution time (HH:MM:SS) +#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name +#SBATCH --error=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.err # error file name +#SBATCH --account=six@gpu # account +#SBATCH -p compil # partition with internet + +set -x -e + +source $HOME/start-user + +cd $WORK/repos/sync/metadata/ + +python experiments/html/SLURM/loading_scripts/load_tokenizer_and_model.py \ +model_name=gpt2 \ \ No newline at end of file diff --git a/experiments/html/SLURM/wandb/sync_wandb.slurm b/experiments/html/SLURM/wandb/sync_wandb.slurm new file mode 100644 index 00000000..1922e3a3 --- /dev/null +++ b/experiments/html/SLURM/wandb/sync_wandb.slurm @@ -0,0 +1,24 @@ +#!/bin/bash +#SBATCH --job-name=modelling-metadata-sync-wandb # job name +#SBATCH --ntasks=1 # number of MP tasks +#SBATCH --nodes=1 # number of nodes +#SBATCH --cpus-per-task=1 # number of cores per 
task +#SBATCH --hint=nomultithread # we get physical cores not logical +#SBATCH --time=2:00:00 # maximum execution time (HH:MM:SS) +#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name +#SBATCH --partition=compil +#SBATCH --account=six@cpu + +echo "START TIME: $(date)" + +source $HOME/start-user + +cd ${SCRATCH} + +while true +do + wandb sync --sync-all + sleep 30 +done + +echo "END TIME: $(date)" \ No newline at end of file diff --git a/experiments/html/example_script.sh b/experiments/html/example_script.sh new file mode 100644 index 00000000..1f3e76ec --- /dev/null +++ b/experiments/html/example_script.sh @@ -0,0 +1,15 @@ +python experiments/html/start_training.py \ +data_config.experiment="with_metadata" \ +data_config.metadata_list=["html"] \ +data_config.max_seq_len=1024 \ +data_config.dataset_name="SaulLu/Natural_Questions_HTML_Toy" \ +data_config.train_file="nq-train-*.jsonl.gz" \ +data_config.validation_file="nq-dev-*.jsonl.gz" \ +data_config.extension="json" \ +data_config.preprocessing_num_workers=6 \ +do_train=False \ +do_eval=False \ +evaluation_strategy=STEPS \ +eval_steps=50 \ +save_strategy=STEPS \ +save_steps=500 \ diff --git a/experiments/html/html_processor.py b/experiments/html/html_processor.py new file mode 100644 index 00000000..76357141 --- /dev/null +++ b/experiments/html/html_processor.py @@ -0,0 +1,149 @@ +from dataclasses import dataclass, field +from typing import Any, Dict, List, Optional, Tuple + +from bsmetadata.input_pipeline import DataConfig +from bsmetadata.metadata_processors import MetadataProcessor + + +@dataclass +class TagToRemove: + tag: str + txt_min_chr_len: float = 0 # lower bound on the enclosed text length, in characters + txt_max_chr_len: float = float("inf") # no upper bound unless overridden + + +@dataclass +class HtmlTag: + tag: str + attrs: dict + + +@dataclass +class Metadata: + char_start_idx: int + value: HtmlTag + char_end_idx: Optional[int] = None + key: str = "html" + type: str = "local" + + +@dataclass +class AllTagsRules: + attributes_to_keep: List[str] = 
field(default_factory=(lambda: []), metadata={"help": "TODO."}) + txt_max_chr_len: float = field(default=-float("inf"), metadata={"help": "TODO."}) + txt_min_chr_len: float = field(default=-float("inf"), metadata={"help": "TODO."}) + tags_exceptions_to_txt_max_min_chr_len: List[str] = field(default_factory=(lambda: []), metadata={"help": "TODO."}) + + +@dataclass +class HTMLParserConfig: + all_tags_rules: AllTagsRules = field(default_factory=AllTagsRules) # fresh rules per config; Python 3.11+ rejects a dataclass instance as a field default + tags_to_remove_alone_tag_name: List[str] = field( + default_factory=(lambda: []), + metadata={"help": "TODO."}, + ) + tags_to_remove_alone_txt_max_chr_len: List[float] = field( + default_factory=(lambda: []), + metadata={"help": "TODO."}, + ) + tags_to_remove_alone_txt_min_chr_len: List[float] = field( + default_factory=(lambda: []), + metadata={"help": "TODO."}, + ) + + +class TagFilter: + def __init__( + self, + txt_max_chr_len: Optional[float] = -float("inf"), + txt_min_chr_len: Optional[float] = -float("inf"), + tags_exceptions: Optional[List[str]] = None, + tags_to_remove_alone: Optional[List[TagToRemove]] = None, + ): + self.tags_to_remove_alone = ( + {tag_to_remove.tag: tag_to_remove for tag_to_remove in tags_to_remove_alone} + if isinstance(tags_to_remove_alone, list) + else {} + ) + self.txt_max_chr_len = txt_max_chr_len + self.txt_min_chr_len = txt_min_chr_len + self.tags_exceptions = tags_exceptions if tags_exceptions else [] + + def drop_tag(self, metadata_node): + tag = str(metadata_node.value.tag) + + drop_tag = False + content_char_length = ( + metadata_node.char_end_idx - metadata_node.char_start_idx if metadata_node.char_end_idx is not None else 0 + ) + if tag in self.tags_to_remove_alone: + tag_to_remove_characteristics = self.tags_to_remove_alone[tag] + if ( + content_char_length <= tag_to_remove_characteristics.txt_max_chr_len + and content_char_length >= tag_to_remove_characteristics.txt_min_chr_len + ): + drop_tag = True + + if tag not in self.tags_exceptions: + if content_char_length <= self.txt_max_chr_len 
and content_char_length >= self.txt_min_chr_len: + drop_tag = True + + # raise TypeError(f"tag need to be a string not a {type(tag)}") + return drop_tag + + +class HtmlProcessor(MetadataProcessor): + """An example metadata processor for HTML tags.""" + + def __init__( + self, + cfg: DataConfig, + ): + """ + Args: + cfg: The data configuration to use. + """ + super().__init__(cfg) + attributes_to_keep = cfg.html_parser_config.all_tags_rules.attributes_to_keep + txt_max_chr_len = cfg.html_parser_config.all_tags_rules.txt_max_chr_len + txt_min_chr_len = cfg.html_parser_config.all_tags_rules.txt_min_chr_len + tags_exceptions = cfg.html_parser_config.all_tags_rules.tags_exceptions_to_txt_max_min_chr_len + tags_to_remove_alone = [ + TagToRemove(tag=tag, txt_max_chr_len=txt_max_chr_len, txt_min_chr_len=txt_min_chr_len) + for (tag, txt_max_chr_len, txt_min_chr_len) in zip( + cfg.html_parser_config.tags_to_remove_alone_tag_name, + cfg.html_parser_config.tags_to_remove_alone_txt_max_chr_len, + cfg.html_parser_config.tags_to_remove_alone_txt_min_chr_len, + ) + ] + + self._tag_filter = TagFilter( + tags_to_remove_alone=tags_to_remove_alone, + txt_min_chr_len=txt_min_chr_len, + txt_max_chr_len=txt_max_chr_len, + tags_exceptions=tags_exceptions, + ) + self._attributes_to_keep = attributes_to_keep + + def process_local(self, metadata_attrs: Dict[str, Any]) -> Optional[Tuple[str, str]]: + # We represent a html tag `T` by enclosing the corresponding text span with "<T>" and "</T>". + # Example: An <b>apple</b> is an edible fruit. 
+ if self._tag_filter.drop_tag( + Metadata( + char_start_idx=metadata_attrs["char_start_idx"], + value=HtmlTag(tag=metadata_attrs["value"]["tag"], attrs=metadata_attrs["value"]["attrs"]), + char_end_idx=metadata_attrs["char_end_idx"], + key=metadata_attrs["key"], + type=metadata_attrs["type"], + ) + ): + return None + + attributes = " ".join( + f'{attr}="{value}"' + for attr, value in zip(metadata_attrs["value"]["attrs"]["attr"], metadata_attrs["value"]["attrs"]["value"]) + if (self._attributes_to_keep is None or attr in self._attributes_to_keep) + ) + if attributes: + attributes = " " + attributes + return f"<{metadata_attrs['value']['tag']}{attributes}>", f"</{metadata_attrs['value']['tag']}>" diff --git a/experiments/html/start_training.py b/experiments/html/start_training.py new file mode 100644 index 00000000..801e1d92 --- /dev/null +++ b/experiments/html/start_training.py @@ -0,0 +1,62 @@ +import sys +from dataclasses import dataclass, field + +from html_processor import AllTagsRules, HTMLParserConfig, HtmlProcessor, TagToRemove +from hydra.core.config_store import ConfigStore + +from bsmetadata.input_pipeline import DataConfig +from bsmetadata.metadata_processors import PROCESSORS +from bsmetadata.train import CFG, main, show_help + + +tags_to_remove_alone = [ + TagToRemove("body"), + TagToRemove("div", txt_max_chr_len=0), + TagToRemove("a", txt_max_chr_len=0), +] +tags_table = ["table", "tr", "th", "td", "caption", "colgroup", "thead", "tfoot", "tbody"] +tags_list = [ + "li", + "ol", + "ul", +] +attributes_to_keep = ["class", "id"] +txt_max_chr_len = 128 +txt_min_chr_len = -float("inf") +tags_exceptions = [ + *tags_table, + *tags_list, + "span", +] + +PROCESSORS["html"] = HtmlProcessor + + +@dataclass +class DataConfigWithHTML(DataConfig): + html_parser_config: HTMLParserConfig = field(default_factory=lambda: HTMLParserConfig( # built per instance; Python 3.11+ rejects a dataclass instance as a field default + AllTagsRules( + attributes_to_keep=attributes_to_keep, + txt_max_chr_len=txt_max_chr_len, + txt_min_chr_len=txt_min_chr_len, + tags_exceptions_to_txt_max_min_chr_len=tags_exceptions, + ), + 
tags_to_remove_alone_tag_name=[tag_to_remove.tag for tag_to_remove in tags_to_remove_alone], + tags_to_remove_alone_txt_max_chr_len=[tag_to_remove.txt_max_chr_len for tag_to_remove in tags_to_remove_alone], + tags_to_remove_alone_txt_min_chr_len=[tag_to_remove.txt_min_chr_len for tag_to_remove in tags_to_remove_alone], + )) + + +@dataclass +class CFGAugmented(CFG): + data_config: DataConfigWithHTML = field(default_factory=DataConfigWithHTML) # same reason: no shared dataclass-instance default + + +cs = ConfigStore.instance() +cs.store(name="config", node=CFGAugmented) + +if __name__ == "__main__": + if "--help" in sys.argv or "-h" in sys.argv: + show_help() + sys.exit() + main() diff --git a/experiments/html/test_html_processor.py b/experiments/html/test_html_processor.py new file mode 100644 index 00000000..7fe710f6 --- /dev/null +++ b/experiments/html/test_html_processor.py @@ -0,0 +1,170 @@ +import unittest + +from html_processor import AllTagsRules, HTMLParserConfig, HtmlProcessor, TagToRemove +from start_training import DataConfigWithHTML +from transformers import GPT2TokenizerFast + +from bsmetadata.metadata_processors import PROCESSORS +from bsmetadata.metadata_utils import add_local_metadata_to_text + + +class MetadataUtilsTester(unittest.TestCase): + def setUp(self) -> None: + self.tokenizer = GPT2TokenizerFast.from_pretrained("gpt2-xl") + self.examples = [ + { + "id": "0004", + "text": "useless text The Walking Dead (season 8)\n", + "metadata": [ + { + "char_start_idx": 0, + "value": {"tag": "a", "attrs": {"attr": [], "value": []}}, + "char_end_idx": 12, + "key": "html", + "type": "local", + }, + { + "char_start_idx": 13, + "value": { + "tag": "div", + "attrs": {"attr": ["id", "class"], "value": ["mw-page-base", "noprint"]}, + }, + "char_end_idx": 13, + "key": "html", + "type": "local", + }, + { + "char_start_idx": 13, + "value": { + "tag": "div", + "attrs": {"attr": ["id", "class"], "value": ["mw-head-base", "noprint"]}, + }, + "char_end_idx": 13, + "key": "html", + "type": "local", + }, + { + "char_start_idx": 13, + "value": 
{"tag": "a", "attrs": {"attr": ["id"], "value": ["top"]}}, + "char_end_idx": 13, + "key": "html", + "type": "local", + }, + { + "char_start_idx": 13, + "value": { + "tag": "div", + "attrs": { + "attr": ["id", "class"], + "value": ["siteNotice centralNotice", "mw-body-content"], + }, + }, + "char_end_idx": 13, + "key": "html", + "type": "local", + }, + { + "char_start_idx": 13, + "value": {"tag": "i", "attrs": {"attr": [], "value": []}}, + "char_end_idx": 29, + "key": "html", + "type": "local", + }, + { + "char_start_idx": 13, + "value": { + "tag": "h1", + "attrs": { + "attr": ["id", "class", "lang"], + "value": ["firstHeading", "firstHeading", "en"], + }, + }, + "char_end_idx": 40, + "key": "html", + "type": "local", + }, + ], + }, + { + "id": "0004", + "text": ("this is a title that we keep\n" "blablabla\n" "tidi tidi2 this one keep his tag\n"), + "metadata": [ + { + "char_start_idx": 0, + "value": {"tag": "h1", "attrs": {"attr": ["id"], "value": ["title"]}}, + "char_end_idx": 28, + "key": "html", + "type": "local", + }, + { + "char_start_idx": 50, + "value": {"tag": "span", "attrs": {"attr": ["id"], "value": ["3"]}}, + "char_end_idx": 71, + "key": "html", + "type": "local", + }, + { + "char_start_idx": 29, + "value": { + "tag": "div", + "attrs": { + "attr": ["class", "id", "href"], + "value": ["div-level-1 div-level-2", "1", "http"], + }, + }, + "char_end_idx": 72, + "key": "html", + "type": "local", + }, + { + "char_start_idx": 0, + "value": {"tag": "body", "attrs": {"attr": [], "value": []}}, + "char_end_idx": 72, + "key": "html", + "type": "local", + }, + ], + }, + ] + + def test_add_html_tags(self): + cfg = DataConfigWithHTML( + html_parser_config=HTMLParserConfig( + all_tags_rules=AllTagsRules(attributes_to_keep=["class", "id", "href"]) + ) + ) + cfg.metadata_list = ["html"] + PROCESSORS["html"] = HtmlProcessor + + text1, mask1 = add_local_metadata_to_text(self.examples[0], cfg) + target_text = 'useless text

The Walking Dead (season 8)

\n' + + self.assertEqual(text1, target_text) + + def test_add_html_tags_remove_tag(self): + tags_to_remove_alone = [TagToRemove("span", txt_max_chr_len=5), TagToRemove("body")] + + cfg = DataConfigWithHTML( + html_parser_config=HTMLParserConfig( + all_tags_rules=AllTagsRules(attributes_to_keep=["class", "id", "href"]), + tags_to_remove_alone_tag_name=[tag_to_remove.tag for tag_to_remove in tags_to_remove_alone], + tags_to_remove_alone_txt_max_chr_len=[ + tag_to_remove.txt_max_chr_len for tag_to_remove in tags_to_remove_alone + ], + tags_to_remove_alone_txt_min_chr_len=[ + tag_to_remove.txt_min_chr_len for tag_to_remove in tags_to_remove_alone + ], + ) + ) + cfg.metadata_list = ["html"] + PROCESSORS["html"] = HtmlProcessor + + text1, mask1 = add_local_metadata_to_text(self.examples[1], cfg) + target_text = ( + '

this is a title that we keep

\n' + '
blablabla\ntidi tidi2 this one keep his tag\n
' + ) + + print(repr(text1)) + + self.assertEqual(text1, target_text) diff --git a/experiments/jz/README.md b/experiments/jz/README.md new file mode 100644 index 00000000..5e92174d --- /dev/null +++ b/experiments/jz/README.md @@ -0,0 +1,61 @@ +# JZ templates + +The purpose of this repo is to provide templates for people who don't have direct access to JZ but who have been working on experiments that we want to run on JZ. + +## How to design your experiment for JZ + +To design an experiment on JZ you have to think in stages: + +1. Uploading stage: stage where we have to download from the internet everything we need (dataset, model, tokenizer, dependencies, etc); +2. CPU stage: stage in which only CPU operations are performed (typically pre-processing); +3. GPU stage: stage during which only operations on the CPU and GPU are performed. This is typically training and evaluation; +4. Downloading stage: stage during which the outputs (checkpoints, datasets, metrics) are retrieved from JZ. + +What I propose is to put on the Hub the data that will be uploaded to JZ (dataset, initial model, tokenizer, etc). + +Concretely, to work on JZ, you have to prepare bash scripts (and more precisely SLURM files) which will be put in a job queue to be executed. You will find in the `experiments/jz/templates/SLURM/experiment_template` folder templates of scripts to run an end-to-end experiment. Each of these scripts is composed of 2 sections: + +1. A section indicating the characteristics of the job to run (typically its duration and the hardware to use); +2. A section which is a bash script in which you just have to list the terminal commands to run to launch a part of the experiment. + +You will also find in the `experiments/jz/templates/SLURM/experiment_example` folder an example of an experiment that could be launched on JZ.
+ +As you will certainly not be able to run these scripts yourself on JZ, what I suggest is that you write the bash instructions to be used for your experiments (keeping in mind the need to think of your experiment in steps with one script per type of step). Don't hesitate to write your doubts or questions while writing this script so that we can discuss them before the execution of the script on the cluster. + +As a tip, try to prepare a toy example to check that your scripts can be run on JZ. By toy example I mean a small enough dataset that we can run the jobs with very little time and compute. Indeed, as the jobs are put in a queue there is a priority system under which small jobs are executed more quickly. If ever there is a small bug in the code it can be very useful to be able to debug it quickly. + +In summary some interesting points to know about JZ: + +- the computational partitions **do not have access to the internet**. We use specific partitions for everything that needs the internet. +- we try to use only **2 conda environments**: a stable one which corresponds to the dependencies on master and a development one. If your experiment requires dependencies that are not on the master branch of our repository you will have to tell the person who will run your experiment. +- we have several storage partitions on JZ, if your code uses **paths** you will also have to talk to the person who will launch your experiment. For your information: + 1. The dataset clones are located at `$DATASETS_CUSTOM` + 2. The clone of the repo on the master branch is at `$WORK/repos/metadata/` + 3. The wandb logs are at `$SCRATCH/wandb` (deleted after 30 days if there is no access to the file in the meantime) + 4. The checkpoints are located at `$SCRATCH/metadata_outputs/{job_id}` (deleted after 30 days if the file has not been accessed in the meantime) + 5. For scripts requiring GPU computing we try to use one 16GB V100 (maximum 20h). + 6.
For scripts requiring CPU computation we try to use a maximum of 1 node (40 CPUs). + +If you ever get stuck designing your experiment for JZ, contact me. The instructions will most likely change according to your needs. + +It might be interesting to plan some pair-coding sessions to prepare experiments that would go beyond this very generic framework. But in any case, it will be useful to have a bash script base to visualize the operations to perform. + +## Downloading from JZ + +This is not yet ready: + +- downloading the checkpoints (do we want to send them to the HUB?) +- logging in tensorboard to be able to use [this feature](https://huggingface.co/bigscience/tr3d-1B3-more-warmup-tensorboard/tensorboard) of the HUB + +What is ready: + +- synchronization of wandb logs (on request) + +## Checklist + +Here is a small checklist of information that the person running your script will probably want to know: + +- What do you need to download (dataset, template, tokenizer)? +- Where are your scripts located in the repository, in what order should they be run? +- Are you using the master branch of modelling-metadata? If not, why not? +- Do your dependencies match the dependencies listed on master? diff --git a/experiments/jz/templates/SLURM/experiment_example/README.md b/experiments/jz/templates/SLURM/experiment_example/README.md new file mode 100644 index 00000000..cb38feb3 --- /dev/null +++ b/experiments/jz/templates/SLURM/experiment_example/README.md @@ -0,0 +1,3 @@ +# Experiment example + +This is a toy experiment example that can be run on JZ. This experiment is made up of sub-experiments, each corresponding to a run.
diff --git a/experiments/jz/templates/SLURM/experiment_example/subexperiment_1/01_load_tokenizer_and_model.slurm b/experiments/jz/templates/SLURM/experiment_example/subexperiment_1/01_load_tokenizer_and_model.slurm new file mode 100644 index 00000000..e0af2b19 --- /dev/null +++ b/experiments/jz/templates/SLURM/experiment_example/subexperiment_1/01_load_tokenizer_and_model.slurm @@ -0,0 +1,29 @@ +#!/bin/bash +#SBATCH --job-name=modelling-metadata-example-load-model-and-tokenizer # (change me!) job name +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! +#SBATCH --cpus-per-task=8 # (change me! between 0 and 48) number of cores per tasks +#SBATCH --hint=nomultithread # we get physical cores not logical +#SBATCH --gres=gpu:0 # (change me! between 0 and 1) number of gpus +#SBATCH --time 00:10:00 # (change me! between 0 and 20h) maximum execution time (HH:MM:SS) +#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name +#SBATCH --error=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.err # error file name +#SBATCH --account=six@gpu # account +#SBATCH -p compil # partition with internet + +set -x -e + +# Next line will: +# - load a conda environment with the dependencies on the master branch of github.com/bigscience-workshop/metadata/ +# - setup env vars ($HOME, $WORK, etc) +# - load several modules (git) +# Note: We can afford to have two conda environments: one stable for running experiments and one for development. +# If there are new dependencies to install, you have to tell me about them and not do it in this script +source $HOME/start-modelling-metadata-user + +# Folder for the clone of github.com/bigscience-workshop/metadata/ +cd $WORK/repos/metadata/ + +# Command to load the XXX model and tokenizer stored on https://huggingface.co/models +python experiments/jz/utils/loading_script_utils/load_tokenizer_and_model.py \ + model_name=gpt2 # (change me! e.g. 
gpt2) diff --git a/experiments/jz/templates/SLURM/experiment_example/subexperiment_1/02_load_dataset.slurm b/experiments/jz/templates/SLURM/experiment_example/subexperiment_1/02_load_dataset.slurm new file mode 100644 index 00000000..bd02527d --- /dev/null +++ b/experiments/jz/templates/SLURM/experiment_example/subexperiment_1/02_load_dataset.slurm @@ -0,0 +1,60 @@ +#!/bin/bash +#SBATCH --job-name=modelling-metadata-example-load-dataset # (change me!) job name +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! +#SBATCH --cpus-per-task=8 # (change me! between 0 and 48) number of cores per tasks +#SBATCH --hint=nomultithread # we get physical cores not logical +#SBATCH --gres=gpu:0 # (change me! between 0 and 1) number of gpus +#SBATCH --time 00:10:00 # (change me! between 0 and 20h) maximum execution time (HH:MM:SS) +#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name +#SBATCH --error=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.err # error file name +#SBATCH --account=six@gpu # account +#SBATCH -p compil # partition with internet + +set -x -e + +# Next line will: +# - load a conda environment with the dependencies on the master branch of github.com/bigscience-workshop/metadata/ +# - setup env vars ($HOME, $WORK, etc) +# - load several modules (git) +# Note: We can afford to have only two conda environments: one stable for running experiments and one for development. +# If there are new dependencies to install, you have to tell me about them and not do it in this script +source $HOME/start-modelling-metadata-user + +# For the moment we can't directly use the new dataset feature on JZ which would avoid having to clone the dataset +# repo from the HUB. So the first thing to do is to clone the repo of the XXX dataset if it does not already exist. +HUB_REPO_NAME='SaulLu/Natural_Questions_HTML_Toy' # (change me! e.g. 
SaulLu/Natural_Questions_HTML_Toy) + +# We define the name of the folder in which the clone will be made +#Define multi-character delimiter +delimiter="/" +#Concatenate the delimiter with the main string +string=$HUB_REPO_NAME$delimiter + +#Split the text based on the delimiter +myarray=() +while [[ $string ]]; do + myarray+=("${string%%"$delimiter"*}") + string=${string#*"$delimiter"} +done +REPO_DIR="${DATASETS_CUSTOM}/${myarray[-1]}" + +# We clone the repo if it doesn't exist +if [[ -d "${REPO_DIR}" ]]; then + echo "${REPO_DIR} already exists on your filesystem." +else + echo "${REPO_DIR} doesn't exists on your filesystem." + cd $DATASETS_CUSTOM/ + git clone "https://huggingface.co/datasets/${HUB_REPO_NAME}" + cd ${REPO_DIR} + git lfs install + git lfs pull origin master +fi + +cd $WORK/repos/sync/metadata/ + +# We check that the dataset can indeed be loaded +python experiments/jz/utils/loading_script_utils/load_dataset.py \ + dataset_name="${REPO_DIR}" \ + train_file="nq-train-*.jsonl.gz" \ + validation_file="nq-dev-*.jsonl.gz" diff --git a/experiments/jz/templates/SLURM/experiment_example/subexperiment_1/03_create_dataset.slurm b/experiments/jz/templates/SLURM/experiment_example/subexperiment_1/03_create_dataset.slurm new file mode 100644 index 00000000..8be44449 --- /dev/null +++ b/experiments/jz/templates/SLURM/experiment_example/subexperiment_1/03_create_dataset.slurm @@ -0,0 +1,57 @@ +#!/bin/bash +#SBATCH --job-name=modelling-metadata-html-metadata-create-dataset # (change me!) job name +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! +#SBATCH --cpus-per-task=20 # (change me! between 0 and 40) number of cores per tasks +#SBATCH --hint=nomultithread # we get physical cores not logical +#SBATCH --time 01:00:00 # (change me! 
between 0 and 20h) maximum execution time (HH:MM:SS) +#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name +#SBATCH --error=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.err # error file name +#SBATCH --account=six@cpu # account + +set -x -e + +# Next line will: +# - load a conda environment with the dependencies on the master branch of github.com/bigscience-workshop/metadata/ +# - setup env vars ($HOME, $WORK, etc) +# - load several modules (git) +# Note: We can afford to have two conda environments: one stable for running experiments and one for development. +# If there are new dependencies to install, you have to tell me about them and not do it in this script +source $HOME/start-modelling-metadata-user + +# We are on an offline partition +export HF_DATASETS_OFFLINE=1 +export TRANSFORMERS_OFFLINE=1 + +# Folder for the clone of github.com/bigscience-workshop/metadata/ +cd $WORK/repos/metadata/ + +HUB_REPO_NAME='SaulLu/Natural_Questions_HTML_Toy' # (change me! e.g. SaulLu/Natural_Questions_HTML_Toy) + +# We define the name of the folder in which the clone will be made +#Define multi-character delimiter +delimiter="/" +#Concatenate the delimiter with the main string +string=$HUB_REPO_NAME$delimiter + +#Split the text based on the delimiter +myarray=() +while [[ $string ]]; do + myarray+=("${string%%"$delimiter"*}") + string=${string#*"$delimiter"} +done +REPO_DIR="${DATASETS_CUSTOM}/${myarray[-1]}" + +# Now we launch the script that will perform the preprocessing of the dataset +# Feel free to add any arguments you like (change me!) 
+python bsmetadata/train.py \ + data_config.experiment="with_metadata" \ + data_config.metadata_list=["html"] \ + data_config.max_seq_len=1024 \ + data_config.dataset_name="${REPO_DIR}" \ + data_config.train_file="nq-train-*.jsonl.gz" \ + data_config.validation_file="nq-dev-*.jsonl.gz" \ + data_config.preprocessing_num_workers=80 \ + out_dir="${SCRATCH}/metadata_outputs" \ + do_train=False \ + do_eval=False diff --git a/experiments/jz/templates/SLURM/experiment_example/subexperiment_1/04_do_training.slurm b/experiments/jz/templates/SLURM/experiment_example/subexperiment_1/04_do_training.slurm new file mode 100644 index 00000000..22305b9c --- /dev/null +++ b/experiments/jz/templates/SLURM/experiment_example/subexperiment_1/04_do_training.slurm @@ -0,0 +1,66 @@ +#!/bin/bash +#SBATCH --job-name=modelling-metadata-html-XX # (change me!) job name +#SBATCH --ntasks=1 # number of MP tasks +#SBATCH --constraint=v100-16g +#SBATCH --gres=gpu:1 # number of GPUs per node +#SBATCH --cpus-per-task=8 # (change me! between 0 and 40) number of cores per tasks +#SBATCH --hint=nomultithread # we get physical cores not logical +#SBATCH --time 01:00:00 # (change me! between 0 and 20h) maximum execution time (HH:MM:SS) +#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name +#SBATCH --error=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.err # error file name # error file name +#SBATCH --account=six@gpu # account + +set -x -e + +# Next line will: +# - load a conda environment with the dependencies on the master branch of github.com/bigscience-workshop/metadata/ +# - setup env vars ($HOME, $WORK, etc) +# - load several modules (git) +# Note: We can afford to have two conda environments: one stable for running experiments and one for development. 
+# If there are new dependencies to install, you have to tell me about them and not do it in this script +source $HOME/start-modelling-metadata-user + +# We are on an offline partition +export HF_DATASETS_OFFLINE=1 +export TRANSFORMERS_OFFLINE=1 +# be careful about the cache folder for Wandb +export WANDB_MODE=offline +export WANDB_DIR=$SCRATCH + +# Folder for the clone of github.com/bigscience-workshop/metadata/ +cd $WORK/repos/metadata/ + +HUB_REPO_NAME='SaulLu/Natural_Questions_HTML_Toy' # (change me! e.g. SaulLu/Natural_Questions_HTML_Toy) + +# We define the name of the folder in which the clone will be made +#Define multi-character delimiter +delimiter="/" +#Concatenate the delimiter with the main string +string=$HUB_REPO_NAME$delimiter + +#Split the text based on the delimiter +myarray=() +while [[ $string ]]; do + myarray+=("${string%%"$delimiter"*}") + string=${string#*"$delimiter"} +done +REPO_DIR="${DATASETS_CUSTOM}/${myarray[-1]}" + +python bsmetadata/train.py \ + data_config.experiment="with_metadata" \ + data_config.metadata_list=["html"] \ + data_config.max_seq_len=1024 \ + data_config.dataset_name="${REPO_DIR}" \ + data_config.train_file="nq-train-*.jsonl.gz" \ + data_config.validation_file="nq-dev-*.jsonl.gz" \ + data_config.preprocessing_num_workers=80 \ + data_config.per_device_eval_batch_size=3 \ + data_config.per_device_train_batch_size=3 \ + out_dir="${SCRATCH}/metadata_outputs/${SLURM_JOB_ID}" \ + do_train=True \ + do_eval=True \ + evaluation_strategy=STEPS \ + eval_steps=10 \ + save_strategy=STEPS \ + save_steps=10 \ + gradient_accumulation_steps=50 diff --git a/experiments/jz/templates/SLURM/experiment_example/subexperiment_1/multi_steps.bash b/experiments/jz/templates/SLURM/experiment_example/subexperiment_1/multi_steps.bash new file mode 100644 index 00000000..ac06bb86 --- /dev/null +++ b/experiments/jz/templates/SLURM/experiment_example/subexperiment_1/multi_steps.bash @@ -0,0 +1,4 @@ +JID_JOB1=$(sbatch 01_load_tokenizer_and_model.slurm | 
cut -d " " -f 4) +JID_JOB2=$(sbatch --dependency=afterok:$JID_JOB1 02_load_dataset.slurm | cut -d " " -f 4) +JID_JOB3=$(sbatch --dependency=afterok:$JID_JOB2 03_create_dataset.slurm | cut -d " " -f 4) +sbatch --dependency=afterok:$JID_JOB3 04_do_training.slurm diff --git a/experiments/jz/templates/SLURM/experiment_template/01_load_tokenizer_and_model.slurm b/experiments/jz/templates/SLURM/experiment_template/01_load_tokenizer_and_model.slurm new file mode 100644 index 00000000..a1feb35d --- /dev/null +++ b/experiments/jz/templates/SLURM/experiment_template/01_load_tokenizer_and_model.slurm @@ -0,0 +1,29 @@ +#!/bin/bash +#SBATCH --job-name=modelling-metadata-XX # (change me!) job name +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! +#SBATCH --cpus-per-task=8 # (change me! between 0 and 48) number of cores per tasks +#SBATCH --hint=nomultithread # we get physical cores not logical +#SBATCH --gres=gpu:0 # (change me! between 0 and 1) number of gpus +#SBATCH --time 00:10:00 # (change me! between 0 and 20h) maximum execution time (HH:MM:SS) +#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name +#SBATCH --error=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.err # error file name +#SBATCH --account=six@gpu # account +#SBATCH -p compil # partition with internet + +set -x -e + +# Next line will: +# - load a conda environment with the dependencies on the master branch of github.com/bigscience-workshop/metadata/ +# - setup env vars ($HOME, $WORK, etc) +# - load several modules (git) +# Note: We can afford to have two conda environments: one stable for running experiments and one for development. 
+# If there are new dependencies to install, you have to tell me about them and not do it in this script +source $HOME/start-modelling-metadata-user + +# Folder for the clone of github.com/bigscience-workshop/metadata/ +cd $WORK/repos/metadata/ + +# Command to load the XXX model and tokenizer stored on https://huggingface.co/models +python experiments/jz/utils/loading_script_utils/load_tokenizer_and_model.py \ + model_name=XXX # (change me! e.g. gpt2) diff --git a/experiments/jz/templates/SLURM/experiment_template/02_load_dataset.slurm b/experiments/jz/templates/SLURM/experiment_template/02_load_dataset.slurm new file mode 100644 index 00000000..7c4b1ce9 --- /dev/null +++ b/experiments/jz/templates/SLURM/experiment_template/02_load_dataset.slurm @@ -0,0 +1,62 @@ +#!/bin/bash +#SBATCH --job-name=modelling-metadata-XX # (change me!) job name +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! +#SBATCH --cpus-per-task=8 # (change me! between 0 and 48) number of cores per tasks +#SBATCH --hint=nomultithread # we get physical cores not logical +#SBATCH --gres=gpu:0 # (change me! between 0 and 1) number of gpus +#SBATCH --time 00:10:00 # (change me! between 0 and 20h) maximum execution time (HH:MM:SS) +#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name +#SBATCH --error=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.err # error file name +#SBATCH --account=six@gpu # account +#SBATCH -p compil # partition with internet + + +set -x -e + +# Next line will: +# - load a conda environment with the dependencies on the master branch of github.com/bigscience-workshop/metadata/ +# - setup env vars ($HOME, $WORK, etc) +# - load several modules (git) +# Note: We can afford to have two conda environments: one stable for running experiments and one for development. 
+# If there are new dependencies to install, you have to tell me about them and not do it in this script +source $HOME/start-modelling-metadata-user + +# For the moment we can't directly use the new dataset feature on JZ which would avoid having to clone the dataset +# repo from the HUB. So the first thing to do is to clone the repo of the XXX dataset if it does not already exist. +HUB_REPO_NAME='XXX' # (change me! e.g. SaulLu/Natural_Questions_HTML_Toy) + +# We define the name of the folder in which the clone will be made +#Define multi-character delimiter +delimiter="/" +#Concatenate the delimiter with the main string +string=$HUB_REPO_NAME$delimiter + +#Split the text based on the delimiter +myarray=() +while [[ $string ]]; do + myarray+=( "${string%%"$delimiter"*}" ) + string=${string#*"$delimiter"} +done +REPO_DIR="${DATASETS_CUSTOM}/${myarray[-1]}" + +# We clone the repo if it doesn't exist +if [[ -d "${REPO_DIR}" ]] +then + echo "${REPO_DIR} already exists on your filesystem." +else + echo "${REPO_DIR} doesn't exists on your filesystem." + cd $DATASETS_CUSTOM/ + git clone "https://huggingface.co/datasets/${HUB_REPO_NAME}" + cd ${REPO_DIR} + git lfs install + git lfs pull origin master +fi + +cd $WORK/repos/sync/metadata/ + +# We check that the dataset can indeed be loaded +python experiments/jz/utils/loading_script_utils/load_dataset.py \ + dataset_name="${REPO_DIR}" \ + train_file="XXX" \ # (change me and remove the comment! e.g "nq-train-*.jsonl.gz" or remove arg) + validation_file="XXX" # (change me and remove the comment! e.g. "nq-dev-*.jsonl.gz" or remove arg) diff --git a/experiments/jz/templates/SLURM/experiment_template/03_create_dataset.slurm b/experiments/jz/templates/SLURM/experiment_template/03_create_dataset.slurm new file mode 100644 index 00000000..7579e57b --- /dev/null +++ b/experiments/jz/templates/SLURM/experiment_template/03_create_dataset.slurm @@ -0,0 +1,57 @@ +#!/bin/bash +#SBATCH --job-name=modelling-metadata-html-XX # (change me!) 
job name +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! +#SBATCH --cpus-per-task=20 # (change me! between 0 and 40) number of cores per tasks +#SBATCH --hint=nomultithread # we get physical cores not logical +#SBATCH --time 01:00:00 # (change me! between 0 and 20h) maximum execution time (HH:MM:SS) +#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name +#SBATCH --error=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.err # error file name +#SBATCH --account=six@cpu # account + +set -x -e + +# Next line will: +# - load a conda environment with the dependencies on the master branch of github.com/bigscience-workshop/metadata/ +# - setup env vars ($HOME, $WORK, etc) +# - load several modules (git) +# Note: We can afford to have two conda environments: one stable for running experiments and one for development. +# If there are new dependencies to install, you have to tell me about them and not do it in this script +source $HOME/start-modelling-metadata-user + +# We are on an offline partition +export HF_DATASETS_OFFLINE=1 +export TRANSFORMERS_OFFLINE=1 + +# Folder for the clone of github.com/bigscience-workshop/metadata/ +cd $WORK/repos/metadata/ + +HUB_REPO_NAME='XXX' # (change me! e.g. SaulLu/Natural_Questions_HTML_Toy) + +# We define the name of the folder in which the clone will be made +#Define multi-character delimiter +delimiter="/" +#Concatenate the delimiter with the main string +string=$HUB_REPO_NAME$delimiter + +#Split the text based on the delimiter +myarray=() +while [[ $string ]]; do + myarray+=( "${string%%"$delimiter"*}" ) + string=${string#*"$delimiter"} +done +REPO_DIR="${DATASETS_CUSTOM}/${myarray[-1]}" + +# Now we launch the script that will perform the preprocessing of the dataset +# Feel free to add any arguments you like (change me!) +python bsmetadata/train.py \ # (change me and remove the comment! 
if you have a specific script) + data_config.experiment="with_metadata" \ + data_config.metadata_list=["html"] \ # (change me and remove the comment!) + data_config.max_seq_len=1024 \ + data_config.dataset_name="${REPO_DIR}" \ + data_config.train_file="XXX" \ # (change me and remove the comment! e.g "nq-train-0\[0-2\].jsonl.gz" or remove arg) + data_config.validation_file="XXX" \ # (change me and remove the comment! e.g "nq-dev-00.jsonl.gz" or remove arg) + data_config.preprocessing_num_workers=80 \ + out_dir="${SCRATCH}/metadata_outputs" \ + do_train=False \ + do_eval=False diff --git a/experiments/jz/templates/SLURM/experiment_template/04_do_training.slurm b/experiments/jz/templates/SLURM/experiment_template/04_do_training.slurm new file mode 100644 index 00000000..a1d90ff8 --- /dev/null +++ b/experiments/jz/templates/SLURM/experiment_template/04_do_training.slurm @@ -0,0 +1,65 @@ +#!/bin/bash +#SBATCH --job-name=modelling-metadata-html-XX # (change me!) job name +#SBATCH --ntasks=1 # number of MP tasks +#SBATCH --constraint=v100-16g +#SBATCH --cpus-per-task=8 # (change me! between 0 and 40) number of cores per tasks +#SBATCH --hint=nomultithread # we get physical cores not logical +#SBATCH --time 01:00:00 # (change me! between 0 and 20h) maximum execution time (HH:MM:SS) +#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name +#SBATCH --error=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.err # error file name +#SBATCH --account=six@gpu # account + +set -x -e + +# Next line will: +# - load a conda environment with the dependencies on the master branch of github.com/bigscience-workshop/metadata/ +# - setup env vars ($HOME, $WORK, etc) +# - load several modules (git) +# Note: We can afford to have two conda environments: one stable for running experiments and one for development.
+# If there are new dependencies to install, you have to tell me about them and not do it in this script +source $HOME/start-modelling-metadata-user + +# We are on an offline partition +export HF_DATASETS_OFFLINE=1 +export TRANSFORMERS_OFFLINE=1 +# be careful about the cache folder for Wandb +export WANDB_MODE=offline +export WANDB_DIR=$SCRATCH + +# Folder for the clone of github.com/bigscience-workshop/metadata/ +cd $WORK/repos/metadata/ + +HUB_REPO_NAME='XXX' # (change me! e.g. SaulLu/Natural_Questions_HTML_Toy) + +# We define the name of the folder in which the clone will be made +#Define multi-character delimiter +delimiter="/" +#Concatenate the delimiter with the main string +string=$HUB_REPO_NAME$delimiter + +#Split the text based on the delimiter +myarray=() +while [[ $string ]]; do + myarray+=( "${string%%"$delimiter"*}" ) + string=${string#*"$delimiter"} +done +REPO_DIR="${DATASETS_CUSTOM}/${myarray[-1]}" + +python bsmetadata/train.py \ # (change me and remove the comment! if you have a specific script) + data_config.experiment="with_metadata" \ + data_config.metadata_list=["html"] \ + data_config.max_seq_len=1024 \ + data_config.dataset_name="${REPO_DIR}" \ + data_config.train_file="XXX" \ # (change me and remove the comment! e.g "nq-train-0\[0-2\].jsonl.gz" or remove arg) + data_config.validation_file="XXX" \ # (change me and remove the comment! e.g "nq-dev-00.jsonl.gz" or remove arg) + data_config.preprocessing_num_workers=80 \ + data_config.per_device_eval_batch_size=3 \ + data_config.per_device_train_batch_size=3 \ + out_dir="${SCRATCH}/metadata_outputs/${SLURM_JOB_ID}" \ + do_train=True \ + do_eval=True \ + evaluation_strategy=STEPS \ + eval_steps=10 \ + save_strategy=STEPS \ + save_steps=10 \ + gradient_accumulation_steps=50 # (change me!)
diff --git a/experiments/jz/templates/SLURM/experiment_template/README.md b/experiments/jz/templates/SLURM/experiment_template/README.md new file mode 100644 index 00000000..b483f392 --- /dev/null +++ b/experiments/jz/templates/SLURM/experiment_template/README.md @@ -0,0 +1,10 @@ +# Experiment template + +In this folder you will find script templates to run a "typical" experiment on JZ. + +These scripts are designed to be run sequentially for: + +1. Downloading the tokenizer and the model (`01_load_tokenizer_and_model.slurm`) +2. Downloading the dataset on a partition with internet ( `02_load_dataset.slurm`) +3. Preprocessing the dataset on a cpu-only partition (`03_create_dataset.slurm`) +4. Running the training on a gpu 16gb partition (`04_do_training.slurm`) diff --git a/experiments/jz/templates/SLURM/experiment_template/multi_steps.bash b/experiments/jz/templates/SLURM/experiment_template/multi_steps.bash new file mode 100644 index 00000000..ac06bb86 --- /dev/null +++ b/experiments/jz/templates/SLURM/experiment_template/multi_steps.bash @@ -0,0 +1,4 @@ +JID_JOB1=$(sbatch 01_load_tokenizer_and_model.slurm | cut -d " " -f 4) +JID_JOB2=$(sbatch --dependency=afterok:$JID_JOB1 02_load_dataset.slurm | cut -d " " -f 4) +JID_JOB3=$(sbatch --dependency=afterok:$JID_JOB2 03_create_dataset.slurm | cut -d " " -f 4) +sbatch --dependency=afterok:$JID_JOB3 04_do_training.slurm diff --git a/experiments/jz/utils/loading_script_utils/load_dataset.py b/experiments/jz/utils/loading_script_utils/load_dataset.py new file mode 100644 index 00000000..3ce34fd1 --- /dev/null +++ b/experiments/jz/utils/loading_script_utils/load_dataset.py @@ -0,0 +1,88 @@ +import logging +import sys + +import hydra +from datasets import config, load_dataset +from hydra.core.config_store import ConfigStore + +from bsmetadata.input_pipeline import DataConfig +from bsmetadata.train import show_help + + +logger = logging.getLogger(__name__) + +cs = ConfigStore.instance() +cs.store(name="data_config", 
node=DataConfig) + + +@hydra.main(config_path=None, config_name="data_config") +def main(args: DataConfig) -> None: # Download and cache the dataset described by args; the loaded object is not used afterwards. + data_files = {} + if args.train_file is not None: + data_files["train"] = args.train_file + if args.validation_file is not None: + data_files["validation"] = args.validation_file + + if not data_files: + data_files = None + + logger.info(config.HF_DATASETS_CACHE) + if args.dataset_name is not None: + logger.info( + "Downloading and loading a dataset from the hub: " + f"{args.dataset_name}, {args.dataset_config_name}, data_files={data_files}, cache_dir={args.cache_dir}," + ) + # Downloading and loading a dataset from the hub. + raw_datasets = load_dataset( + args.dataset_name, + args.dataset_config_name, + data_files=data_files, + cache_dir=args.cache_dir, + keep_in_memory=False, + ) + + if "validation" not in raw_datasets.keys(): + logger.info("validation not in raw_datasets.keys()") + raw_datasets["validation"] = load_dataset( + args.dataset_name, + args.dataset_config_name, + split=f"train[:{args.validation_split_percentage}%]", + cache_dir=args.cache_dir, + ) + raw_datasets["train"] = load_dataset( + args.dataset_name, + args.dataset_config_name, + split=f"train[{args.validation_split_percentage}%:]", + cache_dir=args.cache_dir, + ) + else: + extension = args.train_file.split(".")[-1] if not args.extension else args.extension + if extension == "txt": + raise ValueError( + "You have entered a text file for the train data, but this type of file cannot contain metadata " + "columns. Wouldn't you rather have a file in json/jsonl or pandas format?" 
+ ) + if extension == "jsonl": + extension = "json" + raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=args.cache_dir) + + if "validation" not in raw_datasets.keys(): + raw_datasets["validation"] = load_dataset( + extension, + data_files=data_files, + split=f"train[:{args.validation_split_percentage}%]", + cache_dir=args.cache_dir, + ) + raw_datasets["train"] = load_dataset( + extension, + data_files=data_files, + split=f"train[{args.validation_split_percentage}%:]", + cache_dir=args.cache_dir, + ) + + +if __name__ == "__main__": + if "--help" in sys.argv or "-h" in sys.argv: + show_help() + sys.exit() + main() diff --git a/experiments/jz/utils/loading_script_utils/load_tokenizer_and_model.py b/experiments/jz/utils/loading_script_utils/load_tokenizer_and_model.py new file mode 100644 index 00000000..0c4c42dd --- /dev/null +++ b/experiments/jz/utils/loading_script_utils/load_tokenizer_and_model.py @@ -0,0 +1,39 @@ +import logging +import sys + +import hydra +import transformers.utils.logging as logging_transformers +from hydra.core.config_store import ConfigStore +from transformers import AutoModelForCausalLM, AutoTokenizer + +from bsmetadata.train import CFG, show_help + + +# Setup logging +logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO +) +logging_transformers.set_verbosity_info() +logging_transformers.enable_default_handler() +logging_transformers.enable_explicit_format() + +logger = logging.getLogger(__name__) + +cs = ConfigStore.instance() +cs.store(name="config", node=CFG) + + +@hydra.main(config_path=None, config_name="config") +def main(args: CFG) -> None: + # get tokenizer (the returned object is discarded; the call is made only to fetch it) + _ = AutoTokenizer.from_pretrained(args.model_name) + + # get model (likewise discarded) + _ = AutoModelForCausalLM.from_pretrained(args.model_name) + + +if __name__ == "__main__": + if "--help" in sys.argv or "-h" in sys.argv: + show_help() + sys.exit() + main() diff --git 
a/experiments/jz/utils/sync_wandb.slurm b/experiments/jz/utils/sync_wandb.slurm new file mode 100644 index 00000000..1922e3a3 --- /dev/null +++ b/experiments/jz/utils/sync_wandb.slurm @@ -0,0 +1,24 @@ +#!/bin/bash +#SBATCH --job-name=modelling-metadata-sync-wandb # job name +#SBATCH --ntasks=1 # number of MP tasks +#SBATCH --nodes=1 # number of nodes +#SBATCH --cpus-per-task=1 # number of cores per task +#SBATCH --hint=nomultithread # we get physical cores not logical +#SBATCH --time=2:00:00 # maximum execution time (HH:MM:SS) +#SBATCH --output=/gpfsdswork/projects/rech/six/uue59kq/logs/%x-%j.out # output file name +#SBATCH --partition=compil +#SBATCH --account=six@cpu + +echo "START TIME: $(date)" + +source $HOME/start-user # NOTE(review): the training template sources $HOME/start-modelling-metadata-user instead; confirm this name is intended + +cd ${SCRATCH} + +while true # no exit condition: the loop runs until the job is cancelled or reaches the --time limit set above +do + wandb sync --sync-all # presumably pushes the offline runs stored under $SCRATCH (the training template sets WANDB_DIR=$SCRATCH); verify +
 sleep 30 +done + +echo "END TIME: $(date)" # NOTE(review): not reached as written, because the loop above never terminates \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 62be06ed..100123dd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,4 +3,4 @@ hydra_core>=1.1,<1.2 # pip will likely update it to 1.1.1, but it is probably o wandb>=0.10.32,<1 # pip will likely update it to 0.12.1, but it is probably ok and good for bugfixes. transformers>=4.6.0,<5 # pip will likely update it to 4.10.0, but it is probably ok and good for bugfixes. 
accelerate>=0.4.0,<1 # We may want to use 0.5.0 in the near future -datasets[streaming]>=1.11.0,<2 +git+https://github.com/huggingface/datasets.git diff --git a/setup.py b/setup.py index f71c6605..c26450f1 100644 --- a/setup.py +++ b/setup.py @@ -7,7 +7,7 @@ def req_file(filename): return [x.strip() for x in content] -install_requires = req_file("requirements.txt") +# install_requires = req_file("requirements.txt") setup( name="bsmetadata", @@ -18,5 +18,5 @@ def req_file(filename): author_email="xxx", description="Codebase for including metadata (e.g., URLs, timestamps, HTML tags) during language model pretraining.", packages=find_packages(), - install_requires=install_requires, + # install_requires=install_requires, ) diff --git a/tests/test_metadata_utils.py b/tests/test_metadata_utils.py index 022fdbc5..abf26d97 100644 --- a/tests/test_metadata_utils.py +++ b/tests/test_metadata_utils.py @@ -4,7 +4,7 @@ from datasets import Dataset from transformers import GPT2TokenizerFast -from bsmetadata.metadata_processors import PROCESSORS, MetadataProcessor +from bsmetadata.metadata_processors import PROCESSORS, HtmlProcessor, MetadataProcessor from bsmetadata.metadata_utils import ( MetadataConfig, add_local_metadata_to_text, @@ -57,6 +57,79 @@ def setUp(self) -> None: {"key": "url", "type": "global", "value": "callto:RickAndMorty/Year%202021/"}, ], }, + { + "id": "0004", + "text": "useless text The Walking Dead (season 8)\n", + "metadata": [ + { + "char_start_idx": 0, + "value": {"tag": "a", "attrs": {"attr": [], "value": []}}, + "char_end_idx": 12, + "key": "html", + "type": "local", + }, + { + "char_start_idx": 13, + "value": { + "tag": "div", + "attrs": {"attr": ["id", "class"], "value": ["mw-page-base", "noprint"]}, + }, + "char_end_idx": 13, + "key": "html", + "type": "local", + }, + { + "char_start_idx": 13, + "value": { + "tag": "div", + "attrs": {"attr": ["id", "class"], "value": ["mw-head-base", "noprint"]}, + }, + "char_end_idx": 13, + "key": "html", + "type": 
"local", + }, + { + "char_start_idx": 13, + "value": {"tag": "a", "attrs": {"attr": ["id"], "value": ["top"]}}, + "char_end_idx": 13, + "key": "html", + "type": "local", + }, + { + "char_start_idx": 13, + "value": { + "tag": "div", + "attrs": { + "attr": ["id", "class"], + "value": ["siteNotice centralNotice", "mw-body-content"], + }, + }, + "char_end_idx": 13, + "key": "html", + "type": "local", + }, + { + "char_start_idx": 13, + "value": {"tag": "i", "attrs": {"attr": [], "value": []}}, + "char_end_idx": 29, + "key": "html", + "type": "local", + }, + { + "char_start_idx": 13, + "value": { + "tag": "h1", + "attrs": { + "attr": ["id", "class", "lang"], + "value": ["firstHeading", "firstHeading", "en"], + }, + }, + "char_end_idx": 40, + "key": "html", + "type": "local", + }, + ], + }, ] def test_chunks(self): @@ -133,6 +206,16 @@ def test_add_no_metadata_and_chunk_examples(self): for example in mapped_ds: self.assertTrue(all(not x for x in example["metadata_mask"])) + def test_add_html_tags(self): + cfg = MetadataConfig() + cfg.metadata_list = ["html"] + PROCESSORS["html"] = HtmlProcessor + + text1, mask1 = add_local_metadata_to_text(self.examples[3], cfg) + target_text = 'useless text

The Walking Dead (season 8)

\n' + + self.assertEqual(text1, target_text) + def test_add_metadata_and_chunk_examples(self): cfg = MetadataConfig() cfg.metadata_list = ["url", "timestamp", "html", "entity"] diff --git a/tests/test_train.py b/tests/test_train.py index b6976564..04631c65 100644 --- a/tests/test_train.py +++ b/tests/test_train.py @@ -24,7 +24,7 @@ def test_toy_training_without_metadata(tmpdir): "data_config.experiment=without_metadata", f'data_config.train_file={os.path.join(path_test_folder,"data","train_toy_raw_wikitext.jsonl")}', f'data_config.validation_file={os.path.join(path_test_folder,"data","val_toy_raw_wikitext.jsonl")}', - "num_eval=2", + "eval_num_per_epoch=2", "data_config.block_size=20", f"out_dir={tmpdir}", "max_train_steps=4", @@ -55,7 +55,7 @@ def test_toy_training_with_metadata(tmpdir): "data_config.experiment=with_metadata", f'data_config.train_file={os.path.join(path_test_folder,"data","train_toy_wikitext_with_metadata.jsonl")}', f'data_config.validation_file={os.path.join(path_test_folder,"data","val_toy_wikitext_with_metadata.jsonl")}', - "num_eval=2", + "eval_num_per_epoch=2", f"out_dir={tmpdir}", "max_train_steps=4", ],