From 2dde85c21abd5f532574db33bdd53dfe4dd98857 Mon Sep 17 00:00:00 2001 From: jordiclive Date: Wed, 17 May 2023 13:42:36 +0100 Subject: [PATCH 01/15] add full evaluation into training loop. Other training changes for A100 node. --- bsmetadata/deepspeed_configs/v2.json | 22 +- bsmetadata/evaluation.py | 212 ++++++++++-------- .../experiments/with_metadata_datasetv2_tf.py | 27 ++- bsmetadata/hydra_configs/v2.yaml | 21 +- bsmetadata/train.py | 79 +++++-- 5 files changed, 216 insertions(+), 145 deletions(-) diff --git a/bsmetadata/deepspeed_configs/v2.json b/bsmetadata/deepspeed_configs/v2.json index 1d5c0311..35a24626 100644 --- a/bsmetadata/deepspeed_configs/v2.json +++ b/bsmetadata/deepspeed_configs/v2.json @@ -30,19 +30,19 @@ } }, "zero_optimization": { - "stage": 1, - "allgather_partitions": true, - "allgather_bucket_size": 500000000, - "overlap_comm": true, - "reduce_scatter": true, - "reduce_bucket_size": 500000000, - "contiguous_gradients": true, - "cpu_offload": true - }, - "gradient_accumulation_steps": 16, + "stage": 2, + "allgather_partitions": true, + "allgather_bucket_size": 2e8, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 2e8, + "contiguous_gradients": true, + "cpu_offload": false +}, + "gradient_accumulation_steps": 1, "gradient_clipping": "auto", "steps_per_print": 100, - "train_batch_size": 256, + "train_batch_size": 512, "train_micro_batch_size_per_gpu": "auto", "wall_clock_breakdown": false } \ No newline at end of file diff --git a/bsmetadata/evaluation.py b/bsmetadata/evaluation.py index b75c2aa5..235e7263 100644 --- a/bsmetadata/evaluation.py +++ b/bsmetadata/evaluation.py @@ -264,73 +264,24 @@ def create_metadata_prompt(example: Dict[str, Any], cfg: MetadataConfig) -> str: return cfg.metadata_sep.join(sorted_metadata) + cfg.metadata_prefix_sep if sorted_metadata else "" -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--repo_id", - type=str, - default="bs-modeling-metadata/checkpoints_all_04_23", - help="Repository ID for the model to compute perplexity for", - ) - parser.add_argument( - "--subfolder", - type=str, - default="checkpoint-2000step", - help="subfolder in the respository with the specific checkpoint to evaluate perplexity for", - ) - parser.add_argument( - "--config_file_path", - type=str, - help="The path actual_config.yaml if available, otherwise repo_id/actual_config.yaml or git clone's v2.yaml", - ) - parser.add_argument( - "--output_file", type=str, default="evaluation.txt", help="Path to the file the perplexity is written to" - ) - parser.add_argument("--no_cuda", action="store_true", help="If set to true, all computations are performed on CPU") - parser.add_argument( - "--save_data", - action="store_true", - help="If set to true, save tokens & losses", - ) - parser.add_argument( - "--test", - action="store_true", - help="If set to true, the script runs in test mode and only takes 10 examples per dataset", - ) - parser.add_argument( - "--max_n_examples", - type=int, - default=1500, - help="how many examples per metadata type to evaluate", - ) - parser.add_argument( - "--metadata_to_test", - type=str, - default="html,entity,entity_paragraph,website_desc,generation_datasource,timestamp,title,generation_length_sentence,generation_length_text,url,paragraph", - help="metadata types to test", - ) - parser.add_argument( - "--untrained", - action="store_true", - help="If set to true, will load gpt2-xl", - ) - parser.add_argument( - "--prompt", - action="store_true", - help="If set to true, 
the script evaluates metadata in prompt style", - ) - - args = parser.parse_args() - print(f"Parameters: {args}") - - # Load config - if args.config_file_path: - config_file_path = args.config_file_path - else: +def evaluate_main( + metadata_to_test: str = "title,html,entity_paragraph,website_desc,generation_datasource,timestamp", + output_file: str = "evaluation.txt", + repo_id: str = None, + subfolder: str = None, + test: bool = False, + max_n_examples: int = 1500, + prompt: bool = False, + no_cuda: bool = False, + save_data: bool = False, + untrained: bool = False, + config_file_path: str = None, + model: str = None, + tokenizer: str = None, +) -> dict: + if config_file_path is None: try: - config_file_path = hf_hub_download( - repo_id=args.repo_id, filename="actual_config.yaml", use_auth_token=True - ) + config_file_path = hf_hub_download(repo_id=repo_id, filename="actual_config.yaml", use_auth_token=True) except Exception: config_file_path = "bsmetadata/hydra_configs/v2.yaml" repo_args = OmegaConf.load(config_file_path) @@ -341,15 +292,17 @@ def create_metadata_prompt(example: Dict[str, Any], cfg: MetadataConfig) -> str: # Load model print("Loading model...") - if args.untrained: - model = AutoModelForCausalLM.from_pretrained("gpt2-xl") - else: - model = AutoModelForCausalLM.from_pretrained(args.repo_id, subfolder=args.subfolder, use_auth_token=True) - model.eval().cuda() if not args.no_cuda else model.eval() - - # Load tokenizer - tokenizer = AutoTokenizer.from_pretrained(repo_args.model_name) - tokenizer.pad_token = tokenizer.eos_token + if model is None or tokenizer is None: + if untrained: + model = AutoModelForCausalLM.from_pretrained("gpt2-xl") + tokenizer = AutoTokenizer.from_pretrained(repo_args.model_name) + tokenizer.pad_token = tokenizer.eos_token + else: + model = AutoModelForCausalLM.from_pretrained(repo_id, subfolder=subfolder, use_auth_token=True) + tokenizer = AutoTokenizer.from_pretrained( + "bs-modeling-metadata/checkpoints_all_04_23", subfolder="tokenizer", use_auth_token=True + ) + model.eval().cuda() if not no_cuda else model.eval() # Config preprocess function cfg = data_config.metadata_config @@ -358,7 +311,7 @@ def create_metadata_prompt(example: Dict[str, Any], cfg: MetadataConfig) -> str: cfg.metadata_list.append("entity") cfg.metadata_list.append("paragraph") - if args.prompt: + if prompt: cfg.metadata_sep = "; " # Instead of " | " cfg.metadata_prefix_sep = "" # Instead of " |||"; there's already an implicit " " DatasourceProcessor.process_global = datasource_process_global_for_prompt @@ -381,8 +334,8 @@ def create_metadata_prompt(example: Dict[str, Any], cfg: MetadataConfig) -> str: "bs-modeling-metadata/c4-en-html-with-validation_metadata_url", "bs-modeling-metadata/c4-en-html-with-validation_metadata_paragraph", ] - dataset_paths = [path for path in dataset_paths if path.split("_metadata_")[1] in args.metadata_to_test.split(",")] - + dataset_paths = [path for path in dataset_paths if path.split("_metadata_")[1] in metadata_to_test.split(",")] + results = {} for path in dataset_paths: n_examples = 0 total_normal_len = [] @@ -394,11 +347,11 @@ def create_metadata_prompt(example: Dict[str, Any], cfg: MetadataConfig) -> str: # Load validation dataset from hugging face metadata_type = path.split("_metadata_")[1] print(f"Loading {metadata_type} data...") - split = "validation" if not args.test else "validation[:10]" + split = "validation" if not test else "validation[:10]" validation_dataset = load_dataset(path, use_auth_token=True, split=split) data = [] - 
max_n_examples_ord = len(str(args.max_n_examples))
+        max_n_examples_ord = len(str(max_n_examples))
     for idx, example in tqdm(enumerate(validation_dataset), desc=f"Calculating perplexity for {metadata_type}..."):
         # for idx in [136,]:
         #     example = validation_dataset[idx]
@@ -409,7 +362,7 @@ def create_metadata_prompt(example: Dict[str, Any], cfg: MetadataConfig) -> str:
         except Exception as e:
             # Write error to output file and continue with next dataset
             print(e)
-            with open(args.output_file, "a", encoding="utf8") as f:
+            with open(output_file, "a", encoding="utf8") as f:
                 f.write(f"=== RESULT [{metadata_type}] ===\n")
                 f.write(f"{e}\n\n")
             exit_flag = True
@@ -445,7 +398,7 @@ def create_metadata_prompt(example: Dict[str, Any], cfg: MetadataConfig) -> str:
             normal_batch = default_data_collator([normal_example])
             metadata_example["labels"] = metadata_example["input_ids"]
             metadata_batch = default_data_collator([metadata_example])
-            if not args.no_cuda:
+            if not no_cuda:
                 normal_batch = {k: v.cuda() for k, v in normal_batch.items()}
                 metadata_batch = {k: v.cuda() for k, v in metadata_batch.items()}
             if n_examples == 1:
@@ -461,13 +414,11 @@ def create_metadata_prompt(example: Dict[str, Any], cfg: MetadataConfig) -> str:
             # rich.print(tokenizer.decode(metadata_batch["input_ids"][0]))
 
             # Calculate nll (natural-log loss)
-            normal_nll, normal_example_len = get_mean_loss(normal_batch, save_data=args.save_data, idx=idx)  # [0]
+            normal_nll, normal_example_len = get_mean_loss(normal_batch, save_data=save_data, idx=idx)  # [0]
             # print("PPL")
             # print(normal_ppl)
             total_normal_nll.append(normal_nll)  # * normal_example_len
-            metadata_nll, metadata_example_len = get_mean_loss(
-                metadata_batch, save_data=args.save_data, idx=idx
-            )  # [0]
+            metadata_nll, metadata_example_len = get_mean_loss(metadata_batch, save_data=save_data, idx=idx)  # [0]
             # print(metadata_ppl)
             total_metadata_nll.append(metadata_nll)  # * metadata_example_len
 
@@ -521,7 +472,7 @@ def create_metadata_prompt(example: Dict[str, Any], cfg: MetadataConfig) -> str:
 
             # sys.exit()
 
-            if n_examples > args.max_n_examples:
+            if n_examples > max_n_examples:
                 break
 
         if exit_flag:
@@ -554,9 +505,86 @@ def ppl(examples_mean_loss, examples_len):
     else:
         final_metadata_ppl = final_normal_ppl = 0
 
-    # Write results to output file
-    with open(args.output_file, "a", encoding="utf8") as f:
-        f.write(f"=== RESULT [{metadata_type}] ===\n")
-        f.write("Perplexity (metadata): {:>6,.3f}\n".format(final_metadata_ppl))
-        f.write("Perplexity (normal): {:>6,.3f}\n\n".format(final_normal_ppl))
+    results[metadata_type] = {"final_normal_ppl": final_normal_ppl, "final_metadata_ppl": final_metadata_ppl}
     torch.save(data, "eva.data")
+    return results
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--repo_id",
+        type=str,
+        default="bs-modeling-metadata/checkpoints_all_04_23",
+        help="Repository ID for the model to compute perplexity for",
+    )
+    parser.add_argument(
+        "--subfolder",
+        type=str,
+        default="checkpoint-2000step",
+        help="subfolder in the repository with the specific checkpoint to evaluate perplexity for",
+    )
+    parser.add_argument(
+        "--config_file_path",
+        type=str,
+        help="The path to actual_config.yaml if available, otherwise repo_id/actual_config.yaml or git clone's v2.yaml",
+    )
+    parser.add_argument(
+        "--output_file", type=str, default="evaluation.txt", help="Path to the file the perplexity is written to"
+    )
+    parser.add_argument("--no_cuda", action="store_true", help="If set to true, all computations are performed on CPU")
+    
parser.add_argument( + "--save_data", + action="store_true", + help="If set to true, save tokens & losses", + ) + parser.add_argument( + "--test", + action="store_true", + help="If set to true, the script runs in test mode and only takes 10 examples per dataset", + ) + parser.add_argument( + "--max_n_examples", + type=int, + default=1500, + help="how many examples per metadata type to evaluate", + ) + parser.add_argument( + "--metadata_to_test", + type=str, + default="html,entity,entity_paragraph,website_desc,generation_datasource,timestamp,title,generation_length_sentence,generation_length_text,url,paragraph", + help="metadata types to test", + ) + parser.add_argument( + "--untrained", + action="store_true", + help="If set to true, will load gpt2-xl", + ) + parser.add_argument( + "--prompt", + action="store_true", + help="If set to true, the script evaluates metadata in prompt style", + ) + + args = parser.parse_args() + print(f"Parameters: {args}") + results = evaluate_main( + args.repo_id, + args.subfolder, + args.config_file_path, + args.output_file, + args.save_data, + args.test, + args.max_n_examples, + args.metadata_to_test, + args.untrained, + args.prompt, + args.no_cuda, + ) + # Load config + # Write results to output file + with open(args.output_file, "a", encoding="utf8") as f: + for k, v in results.items(): + f.write(f"=== RESULT [{k}] ===\n") + f.write("Perplexity (metadata): {:>6,.3f}\n".format(v["final_metadata_ppl"])) + f.write("Perplexity (normal): {:>6,.3f}\n\n".format(v["final_normal_ppl"])) diff --git a/bsmetadata/experiments/with_metadata_datasetv2_tf.py b/bsmetadata/experiments/with_metadata_datasetv2_tf.py index e4ebdd28..779bcea4 100644 --- a/bsmetadata/experiments/with_metadata_datasetv2_tf.py +++ b/bsmetadata/experiments/with_metadata_datasetv2_tf.py @@ -54,11 +54,7 @@ def from_json_string(t): examples = {k: [v] for k, v in example.items()} metadata_type_sample_weights = data_config.metadata_config.random_sample_metadata_weights - examples = random_sample_metadata_v2( - examples, - metadata_type_sample_weights=metadata_type_sample_weights, - html_overall_sample_rate=data_config.metadata_config.html_overall_sample_rate, - ) + examples = random_sample_metadata_v2(examples, metadata_type_sample_weights=metadata_type_sample_weights) # example = {k: v[0] for k, v in examples.items()} result = add_metadata_and_chunk_examples(examples, tokenizer, data_config.metadata_config) @@ -87,7 +83,7 @@ def filter_empty(t): return data -def get_dataloader(*, tokenizer, args, num_gpus, gpu_id): +def get_dataloader(*, tokenizer, args, num_gpus, gpu_id,train=True): """returns a tensorflow dataloader""" data_config = args local_dir = Path(data_config.dataset_name) @@ -99,19 +95,28 @@ def get_dataloader(*, tokenizer, args, num_gpus, gpu_id): file_paths = list(Path(local_dir).glob(data_config.train_file)) assert len(file_paths) > 0, f"no files found for {data_config.train_file}" + files_with_entities = [x for x in file_paths if x.name in data_files_with_entities] files_without_entities = [x for x in file_paths if x.name not in data_files_with_entities] print(f"{len(files_with_entities)} files with entities") print(f"{len(files_without_entities)} files without entities") + if train: + files_with_entities = [x for x in files_with_entities if + 'c4-en-html_cc-main-2019-18_pq00-000.jsonl.gz' not in x.name] + else: + files_with_entities = [x for x in files_with_entities if + 'c4-en-html_cc-main-2019-18_pq00-000.jsonl.gz' in x.name] + data_with_entities = get_dataset(files_with_entities, 
num_gpus, gpu_id, data_config, tokenizer) - data_without_entities = get_dataset(files_without_entities, num_gpus, gpu_id, data_config, tokenizer) + + + data = tf.data.Dataset.sample_from_datasets( - [data_with_entities, data_without_entities], - weights=[float(len(files_with_entities)), float(len(files_without_entities))], + [data_with_entities], + weights=[float(len(files_with_entities))], seed=42, ) - data = data.shuffle(1000, reshuffle_each_iteration=True) data = data.batch(data_config.per_device_train_batch_size) data = data.prefetch(tf.data.AUTOTUNE) @@ -137,4 +142,4 @@ def get_dummy_dataloader(batch_size): shuffle=True, num_workers=0, pin_memory=True, - ) + ) \ No newline at end of file diff --git a/bsmetadata/hydra_configs/v2.yaml b/bsmetadata/hydra_configs/v2.yaml index 42a0044a..df1f27d6 100644 --- a/bsmetadata/hydra_configs/v2.yaml +++ b/bsmetadata/hydra_configs/v2.yaml @@ -75,11 +75,12 @@ data_config: local_metadata_special_token_end: entity_paragraph: " " local_metadata_special_token_state: true - html_overall_sample_rate: 0.25 + html_overall_sample_rate: 1 without_metadata_same_context: false + use_full_evaluation_for_val: false experiment: with_metadata_datasetv2_tf - per_device_eval_batch_size: 8 - per_device_train_batch_size: 8 + per_device_eval_batch_size: 64 # 32 for 40gb + per_device_train_batch_size: 64 dataset_name: bs-modeling-metadata/c4-en-html-with-training_metadata_all dataset_config_name: null train_file: c4-en-html_cc-main-2019-18_pq00-000.jsonl.gz @@ -87,12 +88,12 @@ data_config: overwrite_cache: false cache_dir: null extension: null - preprocessing_num_workers: 6 + preprocessing_num_workers: 40 validation_split_percentage: 5 block_size: null map_batch_size: 1 weight_decay: 0.01 -learning_rate: 5e-5 +learning_rate: 0.0001 num_train_epochs: 1 max_train_steps: 100000 lr_scheduler_type: linear @@ -103,16 +104,16 @@ model_name: gpt2 project_name: metadata_lm jobid: '' start_with_eval: false -extra_steps_to_eval_save_at: -- 2 +#extra_steps_to_eval_save_at: +#- 2 evaluation_strategy: STEPS eval_num_per_epoch: 3 -eval_steps: 2000 +eval_steps: 250 save_strategy: STEPS save_num_per_epoch: 3 -save_steps: 150 +save_steps: 250 do_train: true do_eval: true gradient_checkpointing: true resume_from_checkpoint_dir: null -gradient_accumulation_steps: 16 +gradient_accumulation_steps: 1 diff --git a/bsmetadata/train.py b/bsmetadata/train.py index d97853b6..c3c8552c 100644 --- a/bsmetadata/train.py +++ b/bsmetadata/train.py @@ -16,11 +16,11 @@ import wandb from accelerate import Accelerator from accelerate.utils import DistributedType, DummyOptim, DummyScheduler +from evaluation import evaluate_main from hydra.core.config_store import ConfigStore from omegaconf import OmegaConf -from torch.optim import AdamW from tqdm.auto import tqdm as original_tqdm -from transformers import AddedToken, AutoConfig, AutoModelForCausalLM, AutoTokenizer, get_scheduler, set_seed +from transformers import AdamW, AddedToken, AutoConfig, AutoModelForCausalLM, AutoTokenizer, get_scheduler, set_seed from transformers.trainer_utils import IntervalStrategy from bsmetadata.input_pipeline import DataConfig, get_dataloaders @@ -89,6 +89,9 @@ class CFG: gradient_checkpointing: bool = field( default=False, metadata={"help": "Whether to use gradient_checkpointing to save memory."} ) + use_full_evaluation_for_val: bool = field( + default=False, metadata={"help": "Whether to use full evaluation for val"} + ) cs = ConfigStore.instance() @@ -217,8 +220,8 @@ def main(args: CFG) -> None: is_local_main_process = 
accelerator.is_local_main_process tqdm = partial(original_tqdm, disable=not is_local_main_process, position=0) use_deepspeed = accelerator.state.deepspeed_plugin is not None - use_deepspeed_optimzer = use_deepspeed and "optimizer" in accelerator.state.deepspeed_plugin.deepspeed_config - use_deepspeed_scheduler = use_deepspeed and "scheduler" in accelerator.state.deepspeed_plugin.deepspeed_config + use_deepspeed_optimzer = use_deepspeed or "optimizer" in accelerator.state.deepspeed_plugin.deepspeed_config + use_deepspeed_scheduler = use_deepspeed or "scheduler" in accelerator.state.deepspeed_plugin.deepspeed_config if accelerator.distributed_type == DistributedType.DEEPSPEED and not use_deepspeed_scheduler: assert False, "Please set scheduler in DeepSpeed config file otherwise it may not be checkpointed properly" @@ -294,7 +297,13 @@ def main(args: CFG) -> None: gpu_id=accelerator.process_index, ) dummy_dataloader = get_dummy_dataloader(args.data_config.per_device_train_batch_size) - eval_dataloaders = dict() + eval_dataloader, format_fn_eval = get_dataloader( + tokenizer=tokenizer, + args=args.data_config, + num_gpus=accelerator.num_processes, + gpu_id=accelerator.process_index, + train=False, + ) model, optimizer, dummy_dataloader, scheduler = accelerator.prepare( model, optimizer, dummy_dataloader, scheduler ) @@ -348,7 +357,7 @@ def format_fn(x): save_per_n_step = args.max_train_steps + 1 # will never eval @torch.no_grad() - def evaluate(eval_dataloader): + def evaluate(eval_dataloader, only_first_n_steps=120): model.eval() losses = [] for step, batch in enumerate(tqdm(eval_dataloader, desc="eval")): # , leave=False) @@ -359,7 +368,8 @@ def evaluate(eval_dataloader): loss = loss_fn(batch, outputs, metadata_mask) losses.append(accelerator.gather(loss.repeat(args.data_config.per_device_eval_batch_size))) - + if step == only_first_n_steps: + break model.train() if not losses: # in case the dataloader is empty @@ -368,12 +378,21 @@ def evaluate(eval_dataloader): perplexity = math.exp(torch.mean(losses)) return {"perplexity": perplexity} - def evaluate_multiple_dateloaders(eval_dataloaders): - for key, eval_dataloader in eval_dataloaders.items(): - logger.info(f"Evaluating split {key}") - metrics = evaluate(eval_dataloader) - metrics_logger.log({key: metrics}) - logger.info("Evaluation finished") + def evaluate_multiple_dateloaders(eval_dataloaders, use_full_evaluation_for_val): + if use_full_evaluation_for_val: + results = evaluate_main( + output_file="eval.txt", + metadata_to_test="title,html,entity_paragraph,website_desc,generation_datasource,timestamp", + ) + for k, v in results.items(): + metrics_logger.log({k: v}) + logger.info("Evaluation finished") + else: + for key, eval_dataloader in eval_dataloaders.items(): + logger.info(f"Evaluating split {key}") + metrics = evaluate(eval_dataloader) + metrics_logger.log({key: metrics}) + logger.info("Evaluation finished") if not args.do_train and not args.do_eval: return @@ -384,7 +403,7 @@ def evaluate_multiple_dateloaders(eval_dataloaders): do_eval = args.do_eval and args.start_with_eval if do_eval: logger.info("Start with an evaluation") - evaluate_multiple_dateloaders(eval_dataloaders) + evaluate_multiple_dateloaders(eval_dataloaders, args.use_full_evaluation_for_val) if not args.do_train: return @@ -406,7 +425,7 @@ def save(path): model.save_checkpoint(path) else: accelerator.save_state(path) - save_model_and_tokenizer(accelerator, model, path) + save_model_and_tokenizer(accelerator, model, path, tokenizer=tokenizer) if 
is_local_main_process: train_state.save(path / "train_state.json") @@ -426,6 +445,17 @@ def get_data_iter(): batch = {k: v.to(accelerator.device) for k, v in batch.items()} yield batch + def get_eval_data_iter(): + while True: + for batch in eval_dataloader: + batch = format_fn_eval(batch) + if args.data_config.experiment == "with_metadata_datasetv2_tf": + batch = {k: v.to(accelerator.device) for k, v in batch.items()} + yield batch + + eval_iter = get_eval_data_iter() + eval_dataloaders = {"validation": eval_iter} + data_iter = get_data_iter() for _ in tqdm( @@ -461,11 +491,18 @@ def get_data_iter(): optimizer.zero_grad() step_loss_gathered = accelerator.gather(step_loss).mean().item() - metrics = { - "loss": step_loss_gathered, - "lr": max(scheduler.get_lr()), - "gradient_step": train_state.completed_steps, - } + if step < 20: + metrics = { + "loss": step_loss_gathered, + "lr": 0, + "gradient_step": train_state.completed_steps, + } + else: + metrics = { + "loss": step_loss_gathered, + "lr": max(scheduler.get_last_lr()), + "gradient_step": train_state.completed_steps, + } if not args.data_config.streaming: metrics["epoch"] = step / len(train_dataloader) @@ -488,7 +525,7 @@ def get_data_iter(): path = Path(args.out_dir).resolve() / f"checkpoint-{completed_steps}step" save(path) if do_eval: - evaluate_multiple_dateloaders(eval_dataloaders) + evaluate_multiple_dateloaders(eval_dataloaders, args.use_full_evaluation_for_val) if completed_steps >= args.max_train_steps: # finished = True From 6c4b6a7a7b19cc656546d03d4f8d8f0058c2930d Mon Sep 17 00:00:00 2001 From: jordiclive Date: Wed, 17 May 2023 13:44:52 +0100 Subject: [PATCH 02/15] no message --- bsmetadata/experiments/with_metadata_datasetv2_tf.py | 6 +++++- bsmetadata/train.py | 3 +-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/bsmetadata/experiments/with_metadata_datasetv2_tf.py b/bsmetadata/experiments/with_metadata_datasetv2_tf.py index 779bcea4..b635a4ee 100644 --- a/bsmetadata/experiments/with_metadata_datasetv2_tf.py +++ b/bsmetadata/experiments/with_metadata_datasetv2_tf.py @@ -54,7 +54,11 @@ def from_json_string(t): examples = {k: [v] for k, v in example.items()} metadata_type_sample_weights = data_config.metadata_config.random_sample_metadata_weights - examples = random_sample_metadata_v2(examples, metadata_type_sample_weights=metadata_type_sample_weights) + examples = random_sample_metadata_v2( + examples, + metadata_type_sample_weights=metadata_type_sample_weights, + html_overall_sample_rate=data_config.metadata_config.html_overall_sample_rate, + ) # example = {k: v[0] for k, v in examples.items()} result = add_metadata_and_chunk_examples(examples, tokenizer, data_config.metadata_config) diff --git a/bsmetadata/train.py b/bsmetadata/train.py index c3c8552c..4d00edec 100644 --- a/bsmetadata/train.py +++ b/bsmetadata/train.py @@ -386,13 +386,12 @@ def evaluate_multiple_dateloaders(eval_dataloaders, use_full_evaluation_for_val) ) for k, v in results.items(): metrics_logger.log({k: v}) - logger.info("Evaluation finished") else: for key, eval_dataloader in eval_dataloaders.items(): logger.info(f"Evaluating split {key}") metrics = evaluate(eval_dataloader) metrics_logger.log({key: metrics}) - logger.info("Evaluation finished") + logger.info("Evaluation finished") if not args.do_train and not args.do_eval: return From 98598ba6ea0d23d8ce6baf0d8a590c25c522ebfa Mon Sep 17 00:00:00 2001 From: jordiclive Date: Wed, 17 May 2023 13:51:24 +0100 Subject: [PATCH 03/15] fix format --- .../experiments/with_metadata_datasetv2_tf.py 
| 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/bsmetadata/experiments/with_metadata_datasetv2_tf.py b/bsmetadata/experiments/with_metadata_datasetv2_tf.py index b635a4ee..1f45ad4a 100644 --- a/bsmetadata/experiments/with_metadata_datasetv2_tf.py +++ b/bsmetadata/experiments/with_metadata_datasetv2_tf.py @@ -87,7 +87,7 @@ def filter_empty(t): return data -def get_dataloader(*, tokenizer, args, num_gpus, gpu_id,train=True): +def get_dataloader(*, tokenizer, args, num_gpus, gpu_id, train=True): """returns a tensorflow dataloader""" data_config = args local_dir = Path(data_config.dataset_name) @@ -99,23 +99,22 @@ def get_dataloader(*, tokenizer, args, num_gpus, gpu_id,train=True): file_paths = list(Path(local_dir).glob(data_config.train_file)) assert len(file_paths) > 0, f"no files found for {data_config.train_file}" - files_with_entities = [x for x in file_paths if x.name in data_files_with_entities] files_without_entities = [x for x in file_paths if x.name not in data_files_with_entities] print(f"{len(files_with_entities)} files with entities") print(f"{len(files_without_entities)} files without entities") if train: - files_with_entities = [x for x in files_with_entities if - 'c4-en-html_cc-main-2019-18_pq00-000.jsonl.gz' not in x.name] + files_with_entities = [ + x for x in files_with_entities if "c4-en-html_cc-main-2019-18_pq00-000.jsonl.gz" not in x.name + ] else: - files_with_entities = [x for x in files_with_entities if - 'c4-en-html_cc-main-2019-18_pq00-000.jsonl.gz' in x.name] + files_with_entities = [ + x for x in files_with_entities if "c4-en-html_cc-main-2019-18_pq00-000.jsonl.gz" in x.name + ] data_with_entities = get_dataset(files_with_entities, num_gpus, gpu_id, data_config, tokenizer) - - data = tf.data.Dataset.sample_from_datasets( [data_with_entities], weights=[float(len(files_with_entities))], @@ -146,4 +145,4 @@ def get_dummy_dataloader(batch_size): shuffle=True, num_workers=0, pin_memory=True, - ) \ No newline at end of file + ) From f9af120bf445aa8421151f8e064f50a3d6722843 Mon Sep 17 00:00:00 2001 From: jordiclive Date: Wed, 17 May 2023 17:40:22 +0100 Subject: [PATCH 04/15] fix evaluation and config for 40gb --- bsmetadata/deepspeed_configs/v2.json | 2 +- bsmetadata/evaluation.py | 18 ++++++++++++++---- bsmetadata/hydra_configs/v2.yaml | 14 +++++++------- bsmetadata/train.py | 12 +++++++----- 4 files changed, 29 insertions(+), 17 deletions(-) diff --git a/bsmetadata/deepspeed_configs/v2.json b/bsmetadata/deepspeed_configs/v2.json index 35a24626..dca70cc9 100644 --- a/bsmetadata/deepspeed_configs/v2.json +++ b/bsmetadata/deepspeed_configs/v2.json @@ -39,7 +39,7 @@ "contiguous_gradients": true, "cpu_offload": false }, - "gradient_accumulation_steps": 1, + "gradient_accumulation_steps": 2, "gradient_clipping": "auto", "steps_per_print": 100, "train_batch_size": 512, diff --git a/bsmetadata/evaluation.py b/bsmetadata/evaluation.py index 235e7263..86f2510c 100644 --- a/bsmetadata/evaluation.py +++ b/bsmetadata/evaluation.py @@ -180,6 +180,7 @@ def get_mean_loss( batch: Dict[str, torch.Tensor], save_data: bool = False, idx: int = None, + model=None, ) -> torch.Tensor: """Prepares the arguments for perplexity calculation and passes them to the perplexity function. 
@@ -272,18 +273,20 @@ def evaluate_main( test: bool = False, max_n_examples: int = 1500, prompt: bool = False, - no_cuda: bool = False, + no_cuda: bool = True, save_data: bool = False, untrained: bool = False, config_file_path: str = None, model: str = None, tokenizer: str = None, + accelerator=None, ) -> dict: if config_file_path is None: try: config_file_path = hf_hub_download(repo_id=repo_id, filename="actual_config.yaml", use_auth_token=True) except Exception: config_file_path = "bsmetadata/hydra_configs/v2.yaml" + config_file_path = "/fsx/home-jordiclive/metadata/bsmetadata/hydra_configs/v2.yaml" repo_args = OmegaConf.load(config_file_path) data_config = repo_args.data_config @@ -398,7 +401,10 @@ def evaluate_main( normal_batch = default_data_collator([normal_example]) metadata_example["labels"] = metadata_example["input_ids"] metadata_batch = default_data_collator([metadata_example]) - if not no_cuda: + if accelerator is not None: + normal_batch = {k: v.to(accelerator.device) for k, v in normal_batch.items()} + metadata_batch = {k: v.to(accelerator.device) for k, v in metadata_batch.items()} + elif not no_cuda: normal_batch = {k: v.cuda() for k, v in normal_batch.items()} metadata_batch = {k: v.cuda() for k, v in metadata_batch.items()} if n_examples == 1: @@ -414,11 +420,15 @@ def evaluate_main( # rich.print(tokenizer.decode(metadata_batch["input_ids"][0])) # Calculate nll (natural-log loss) - normal_nll, normal_example_len = get_mean_loss(normal_batch, save_data=save_data, idx=idx) # [0] + normal_nll, normal_example_len = get_mean_loss( + normal_batch, save_data=save_data, idx=idx, model=model + ) # [0] # print("PPL") # print(normal_ppl) total_normal_nll.append(normal_nll) # * normal_example_len - metadata_nll, metadata_example_len = get_mean_loss(metadata_batch, save_data=save_data, idx=idx) # [0] + metadata_nll, metadata_example_len = get_mean_loss( + metadata_batch, save_data=save_data, idx=idx, model=model + ) # [0] # print(metadata_ppl) total_metadata_nll.append(metadata_nll) # * metadata_example_len diff --git a/bsmetadata/hydra_configs/v2.yaml b/bsmetadata/hydra_configs/v2.yaml index df1f27d6..62786638 100644 --- a/bsmetadata/hydra_configs/v2.yaml +++ b/bsmetadata/hydra_configs/v2.yaml @@ -1,6 +1,7 @@ data_config: streaming: True validation_size_max: 1024 + use_full_evaluation_for_val: true metadata_config: random_sample_metadata: true random_sample_metadata_calculate_size: 16384 @@ -38,7 +39,7 @@ data_config: #- generation_length_sentence #- generation_length_text - entity_paragraph - local_metadata_special_tokens: + local_metadata_special_tokens: entity_paragraph: "entity" metadata_sep: ' | ' metadata_key_value_sep: ': ' @@ -77,10 +78,9 @@ data_config: local_metadata_special_token_state: true html_overall_sample_rate: 1 without_metadata_same_context: false - use_full_evaluation_for_val: false experiment: with_metadata_datasetv2_tf - per_device_eval_batch_size: 64 # 32 for 40gb - per_device_train_batch_size: 64 + per_device_eval_batch_size: 32 # 32 for 40gb + per_device_train_batch_size: 32 dataset_name: bs-modeling-metadata/c4-en-html-with-training_metadata_all dataset_config_name: null train_file: c4-en-html_cc-main-2019-18_pq00-000.jsonl.gz @@ -104,8 +104,8 @@ model_name: gpt2 project_name: metadata_lm jobid: '' start_with_eval: false -#extra_steps_to_eval_save_at: -#- 2 +extra_steps_to_eval_save_at: +- 2 evaluation_strategy: STEPS eval_num_per_epoch: 3 eval_steps: 250 @@ -116,4 +116,4 @@ do_train: true do_eval: true gradient_checkpointing: true 
resume_from_checkpoint_dir: null -gradient_accumulation_steps: 1 +gradient_accumulation_steps: 2 \ No newline at end of file diff --git a/bsmetadata/train.py b/bsmetadata/train.py index 4d00edec..9c5a928e 100644 --- a/bsmetadata/train.py +++ b/bsmetadata/train.py @@ -89,9 +89,6 @@ class CFG: gradient_checkpointing: bool = field( default=False, metadata={"help": "Whether to use gradient_checkpointing to save memory."} ) - use_full_evaluation_for_val: bool = field( - default=False, metadata={"help": "Whether to use full evaluation for val"} - ) cs = ConfigStore.instance() @@ -382,8 +379,13 @@ def evaluate_multiple_dateloaders(eval_dataloaders, use_full_evaluation_for_val) if use_full_evaluation_for_val: results = evaluate_main( output_file="eval.txt", + # metadata_to_test="entity_paragraph", metadata_to_test="title,html,entity_paragraph,website_desc,generation_datasource,timestamp", + model=model, + tokenizer=tokenizer, + accelerator=accelerator, ) + model.train() for k, v in results.items(): metrics_logger.log({k: v}) else: @@ -402,7 +404,7 @@ def evaluate_multiple_dateloaders(eval_dataloaders, use_full_evaluation_for_val) do_eval = args.do_eval and args.start_with_eval if do_eval: logger.info("Start with an evaluation") - evaluate_multiple_dateloaders(eval_dataloaders, args.use_full_evaluation_for_val) + evaluate_multiple_dateloaders(eval_dataloaders, args.data_config.use_full_evaluation_for_val) if not args.do_train: return @@ -524,7 +526,7 @@ def get_eval_data_iter(): path = Path(args.out_dir).resolve() / f"checkpoint-{completed_steps}step" save(path) if do_eval: - evaluate_multiple_dateloaders(eval_dataloaders, args.use_full_evaluation_for_val) + evaluate_multiple_dateloaders(eval_dataloaders, args.data_config.use_full_evaluation_for_val) if completed_steps >= args.max_train_steps: # finished = True From 1e181af5bd09cb64f657a025c59b3c84f2a7c950 Mon Sep 17 00:00:00 2001 From: Jordan Clive Date: Thu, 18 May 2023 09:25:46 +0100 Subject: [PATCH 05/15] Update evaluation.py remove hard coded --- bsmetadata/evaluation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bsmetadata/evaluation.py b/bsmetadata/evaluation.py index 86f2510c..e1a7f54b 100644 --- a/bsmetadata/evaluation.py +++ b/bsmetadata/evaluation.py @@ -286,7 +286,7 @@ def evaluate_main( config_file_path = hf_hub_download(repo_id=repo_id, filename="actual_config.yaml", use_auth_token=True) except Exception: config_file_path = "bsmetadata/hydra_configs/v2.yaml" - config_file_path = "/fsx/home-jordiclive/metadata/bsmetadata/hydra_configs/v2.yaml" +# config_file_path = "/fsx/home-jordiclive/metadata/bsmetadata/hydra_configs/v2.yaml" need to add this path to PYTHONPATH repo_args = OmegaConf.load(config_file_path) data_config = repo_args.data_config From efb6414314f66b2b0677fba0a610e2a28432bf6e Mon Sep 17 00:00:00 2001 From: jordiclive Date: Thu, 18 May 2023 13:34:08 +0100 Subject: [PATCH 06/15] specify kwargs for non training usage --- bsmetadata/evaluation.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/bsmetadata/evaluation.py b/bsmetadata/evaluation.py index e1a7f54b..88ecd73d 100644 --- a/bsmetadata/evaluation.py +++ b/bsmetadata/evaluation.py @@ -579,17 +579,17 @@ def ppl(examples_mean_loss, examples_len): args = parser.parse_args() print(f"Parameters: {args}") results = evaluate_main( - args.repo_id, - args.subfolder, - args.config_file_path, - args.output_file, - args.save_data, - args.test, - args.max_n_examples, - args.metadata_to_test, - args.untrained, - 
args.prompt, - args.no_cuda, + repo_id=args.repo_id, + subfolder=args.subfolder, + config_file_path=args.config_file_path, + output_file=args.output_file, + save_data=args.save_data, + test=args.test, + max_n_examples=args.max_n_examples, + metadata_to_test=args.metadata_to_test, + untrained=args.untrained, + prompt=args.prompt, + no_cuda=args.no_cuda, ) # Load config # Write results to output file From b8e569268ad9b13ad3035b56392278c05bd54515 Mon Sep 17 00:00:00 2001 From: jordiclive Date: Tue, 6 Jun 2023 14:19:27 +0100 Subject: [PATCH 07/15] update with changes --- bsmetadata/evaluation.py | 16 +++++++++------- bsmetadata/hydra_configs/v2.yaml | 9 ++++++--- bsmetadata/train.py | 2 +- tests/test_metadata_utils.py | 18 ++++++++++++++++++ 4 files changed, 34 insertions(+), 11 deletions(-) diff --git a/bsmetadata/evaluation.py b/bsmetadata/evaluation.py index 88ecd73d..d0cdf56f 100644 --- a/bsmetadata/evaluation.py +++ b/bsmetadata/evaluation.py @@ -46,6 +46,7 @@ def mean_loss_fn( metadata_mask: torch.Tensor = None, save_data: bool = False, idx: int = None, + tokenizer = None, ) -> torch.Tensor: """Calculates the perplexity for a given batch. @@ -62,15 +63,15 @@ def mean_loss_fn( b = outputs.logits.size(0) lm_logits = outputs.logits - lm_logits[:, :, 50257] = float("-inf") - lm_logits[:, :, 50258] = float("-inf") - labels = batch["labels"] attention_mask = batch["attention_mask"] shift_logits = lm_logits[..., :-1, :].contiguous() shift_labels = labels[..., 1:].contiguous() if metadata_mask is not None: + for special_tok in tokenizer.additional_special_tokens_ids: + shift_logits[:, :, special_tok] = torch.finfo(lm_logits.dtype).min + metadata_mask = metadata_mask.bool() nonmetadata_cumsum = torch.cumsum(~metadata_mask, dim=-1) first_nonmetadata = nonmetadata_cumsum == 1 @@ -133,7 +134,7 @@ def mean_loss_fn( shift_labels.view(-1), reduction="none", ).view(b, -1) - + loss = torch.nan_to_num(loss) if save_data: # Save the non-masked tokens & their loss suffix = "_meta" if metadata_mask is not None else "" @@ -181,6 +182,7 @@ def get_mean_loss( save_data: bool = False, idx: int = None, model=None, + tokenizer = None, ) -> torch.Tensor: """Prepares the arguments for perplexity calculation and passes them to the perplexity function. 
@@ -198,7 +200,7 @@ def get_mean_loss( metadata_mask = batch.pop("metadata_mask", None) outputs = model(**batch) batch["labels"] = labels - nll = mean_loss_fn(batch, outputs, metadata_mask, save_data=save_data, idx=idx) + nll = mean_loss_fn(batch, outputs, metadata_mask, save_data=save_data, idx=idx,tokenizer=tokenizer) return nll @@ -421,13 +423,13 @@ def evaluate_main( # Calculate nll (natural-log loss) normal_nll, normal_example_len = get_mean_loss( - normal_batch, save_data=save_data, idx=idx, model=model + normal_batch, save_data=save_data, idx=idx, model=model,tokenizer=tokenizer ) # [0] # print("PPL") # print(normal_ppl) total_normal_nll.append(normal_nll) # * normal_example_len metadata_nll, metadata_example_len = get_mean_loss( - metadata_batch, save_data=save_data, idx=idx, model=model + metadata_batch, save_data=save_data, idx=idx, model=model,tokenizer=tokenizer ) # [0] # print(metadata_ppl) total_metadata_nll.append(metadata_nll) # * metadata_example_len diff --git a/bsmetadata/hydra_configs/v2.yaml b/bsmetadata/hydra_configs/v2.yaml index 62786638..3eaefcf0 100644 --- a/bsmetadata/hydra_configs/v2.yaml +++ b/bsmetadata/hydra_configs/v2.yaml @@ -12,7 +12,7 @@ data_config: title: 1.0657717366883845 generation_datasource: 1.0 entity_paragraph: 1.028817740667444 - + generation_length_text: 1.0 #- url: 1.0 #- generation_length_sentence #- generation_length_text @@ -29,6 +29,7 @@ data_config: - datasource - length - entity_paragraph + - generation_length_text metadata_column_list: - html - timestamp @@ -37,7 +38,7 @@ data_config: #- url - generation_datasource #- generation_length_sentence - #- generation_length_text + - generation_length_text - entity_paragraph local_metadata_special_tokens: entity_paragraph: "entity" @@ -73,10 +74,12 @@ data_config: - 0.0 local_metadata_special_token_start: entity_paragraph: "" + html: "" local_metadata_special_token_end: entity_paragraph: " " + html: "" local_metadata_special_token_state: true - html_overall_sample_rate: 1 + html_overall_sample_rate: 0.5 without_metadata_same_context: false experiment: with_metadata_datasetv2_tf per_device_eval_batch_size: 32 # 32 for 40gb diff --git a/bsmetadata/train.py b/bsmetadata/train.py index 9c5a928e..2c458afc 100644 --- a/bsmetadata/train.py +++ b/bsmetadata/train.py @@ -380,7 +380,7 @@ def evaluate_multiple_dateloaders(eval_dataloaders, use_full_evaluation_for_val) results = evaluate_main( output_file="eval.txt", # metadata_to_test="entity_paragraph", - metadata_to_test="title,html,entity_paragraph,website_desc,generation_datasource,timestamp", + metadata_to_test="title,html,entity_paragraph,website_desc,generation_datasource,timestamp,generation_length_text", model=model, tokenizer=tokenizer, accelerator=accelerator, diff --git a/tests/test_metadata_utils.py b/tests/test_metadata_utils.py index 49408078..51ecdcb8 100644 --- a/tests/test_metadata_utils.py +++ b/tests/test_metadata_utils.py @@ -454,6 +454,24 @@ def test_entity_settings(self): "EntityOn |EntityParagraphOn ||| |United Kingdom| |Louis Vuitton| |Billy Connolly| |Something in Common| |Lembit Öpik| Hints and tips for media appearances, speaking and social media. 
This week; wall-to-wall politicians; Great Britain [[United Kingdom]]: Louis Vuitton [[Louis Vuitton]] condoms; Billy Connolly [[Billy Connolly]],; Lisa Dutton; Something in Common [[Something in Common]]; What was I saying?: We’re all publishers; An interview with Lembit Opik [[Lembit Öpik]]; Music from The Good Suns", ) + def test_html_special_token_settings(self): + # from transformers import AddedToken + + cfg = MetadataConfig() + PROCESSORS["html"] = HtmlProcessor + cfg.metadata_list = ["html"] + cfg.treat_local_metadata_as_regular_text = True + cfg.local_metadata_special_token_start = {"html": ""} + cfg.local_metadata_special_token_end = {"html": ""} + text, mask = add_local_metadata_to_text(self.examples[1], cfg) + self.assertEqual( + text, + "An apple is an edible fruit " + "produced by an " + "apple tree" + " (Malus domestica).", + ) + def test_add_local_metadata_to_text(self): cfg = MetadataConfig() cfg.metadata_list = ["html", "entity"] From ee61ded8c918903ae1931f5e263cef03e360c247 Mon Sep 17 00:00:00 2001 From: jordiclive Date: Tue, 6 Jun 2023 14:36:26 +0100 Subject: [PATCH 08/15] update configs --- bsmetadata/evaluation.py | 2 +- bsmetadata/hydra_configs/v2.yaml | 3 +- bsmetadata/train.py | 6 +++- slurm_40.sh | 47 ++++++++++++++++++++++++++++++++ 4 files changed, 55 insertions(+), 3 deletions(-) create mode 100644 slurm_40.sh diff --git a/bsmetadata/evaluation.py b/bsmetadata/evaluation.py index d0cdf56f..62ff1577 100644 --- a/bsmetadata/evaluation.py +++ b/bsmetadata/evaluation.py @@ -275,7 +275,7 @@ def evaluate_main( test: bool = False, max_n_examples: int = 1500, prompt: bool = False, - no_cuda: bool = True, + no_cuda: bool = False, save_data: bool = False, untrained: bool = False, config_file_path: str = None, diff --git a/bsmetadata/hydra_configs/v2.yaml b/bsmetadata/hydra_configs/v2.yaml index 3eaefcf0..44d9cddf 100644 --- a/bsmetadata/hydra_configs/v2.yaml +++ b/bsmetadata/hydra_configs/v2.yaml @@ -97,6 +97,7 @@ data_config: map_batch_size: 1 weight_decay: 0.01 learning_rate: 0.0001 +wb_name: "metadata_datasetv2_tf" num_train_epochs: 1 max_train_steps: 100000 lr_scheduler_type: linear @@ -114,7 +115,7 @@ eval_num_per_epoch: 3 eval_steps: 250 save_strategy: STEPS save_num_per_epoch: 3 -save_steps: 250 +save_steps: 1000 do_train: true do_eval: true gradient_checkpointing: true diff --git a/bsmetadata/train.py b/bsmetadata/train.py index 2c458afc..1e5c8648 100644 --- a/bsmetadata/train.py +++ b/bsmetadata/train.py @@ -34,6 +34,7 @@ class CFG: data_config: DataConfig = DataConfig() weight_decay: float = field(default=0.0, metadata={"help": "The weight decay to use for training."}) learning_rate: float = field(default=5e-5, metadata={"help": "The initial learning rate."}) + wb_name: str = field(default="bsmetadata", metadata={"help": "The name of the wandb project."}) gradient_accumulation_steps: int = field( default=1, metadata={"help": "The number of gradient accumulation steps to perform before updating model parameters."}, @@ -399,7 +400,10 @@ def evaluate_multiple_dateloaders(eval_dataloaders, use_full_evaluation_for_val) return progress_bar = tqdm(range(args.max_train_steps), desc="training", initial=train_state.completed_steps) - metrics_logger = Logger(is_local_main_process, project=args.project_name, config=config_dict) + t_bs = args.data_config.per_device_train_batch_size * args.gradient_accumulation_steps * 8 + os.environ['WANDB_API_KEY'] = 'd8216641d549f9bb3d0c5074baa39e15dfd55030' + metrics_logger = Logger(is_local_main_process, 
name=f"{args.wb_name}-{args.learning_rate}-{t_bs}", + entity='jordanclive', project='metadata', config=config_dict) do_eval = args.do_eval and args.start_with_eval if do_eval: diff --git a/slurm_40.sh b/slurm_40.sh new file mode 100644 index 00000000..6adf0cf2 --- /dev/null +++ b/slurm_40.sh @@ -0,0 +1,47 @@ +#!/bin/bash +#SBATCH --account laion +#SBATCH --partition="g40" +#SBATCH --job-name=flan +#SBATCH --gres=gpu:8 +#SBATCH --ntasks-per-node=8 +#SBATCH --cpus-per-task=12 +#SBATCH --output=%x_%j.out +source /fsx/home-jordiclive/miniconda3/bin/activate meta_conda +cd /fsx/home-jordiclive/metadata +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib/ +export TRANSFORMERS_CACHE=/fsx/home-jordiclive/transformers_cache + +#export HF_DATASETS_OFFLINE=1 +#export TRANSFORMERS_OFFLINE=1 +#export WANDB_MODE=offline +export HYDRA_FULL_ERROR=1 + + +export MODEL=gpt2-xl +export NUM_GPU=8 +export DEEPSPEED_CONFIG=$(realpath bsmetadata/deepspeed_configs/v2.json) +export DATA_DIR=$(realpath /fsx/home-jordiclive/metadata/local-data/datasets--bs-modeling-metadata--c4-en-html-with-training_metadata_all/snapshots/8f2615d8b8580e89533b90bc3931e0b99ef15aec) +echo "deepspeed_config_file: $DEEPSPEED_CONFIG" + +export WANDB_API_KEY= 'd8216641d549f9bb3d0c5074baa39e15dfd55030' + +echo "compute_environment: LOCAL_MACHINE +deepspeed_config: + deepspeed_config_file: $DEEPSPEED_CONFIG +distributed_type: DEEPSPEED +fp16: true +machine_rank: 0 +main_process_ip: null +main_process_port: null +main_training_function: main +num_machines: 1 +num_processes: -1 +mixed_precision: fp16 +" > accelerate_config.yaml +CUDA_LAUNCH_BLOCKING=1 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 accelerate launch --config_file accelerate_config.yaml bsmetadata/train.py --config-name v2 \ + model_name=$MODEL \ + data_config.dataset_name=$DATA_DIR \ + data_config.train_file='*.jsonl.gz' \ + data_config.validation_file='c4-en-html_cc-main-2019-18_pq00-000.jsonl.gz' \ + out_dir=/fsx/home-jordiclive/tmp/metadata-html-quarter \ + wb_name="full-metadata-with-generation-text-0.5-html" \ No newline at end of file From a4619dc7c41897ce38961e46c0371cf80b968047 Mon Sep 17 00:00:00 2001 From: jordiclive Date: Tue, 6 Jun 2023 15:49:40 +0100 Subject: [PATCH 09/15] no message --- bsmetadata/hydra_configs/v2.yaml | 2 +- slurm_40.sh | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/bsmetadata/hydra_configs/v2.yaml b/bsmetadata/hydra_configs/v2.yaml index 44d9cddf..9311028b 100644 --- a/bsmetadata/hydra_configs/v2.yaml +++ b/bsmetadata/hydra_configs/v2.yaml @@ -97,7 +97,7 @@ data_config: map_batch_size: 1 weight_decay: 0.01 learning_rate: 0.0001 -wb_name: "metadata_datasetv2_tf" +wb_name: "all_metadata" num_train_epochs: 1 max_train_steps: 100000 lr_scheduler_type: linear diff --git a/slurm_40.sh b/slurm_40.sh index 6adf0cf2..7f483bb7 100644 --- a/slurm_40.sh +++ b/slurm_40.sh @@ -43,5 +43,5 @@ CUDA_LAUNCH_BLOCKING=1 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 accelerate launch -- data_config.dataset_name=$DATA_DIR \ data_config.train_file='*.jsonl.gz' \ data_config.validation_file='c4-en-html_cc-main-2019-18_pq00-000.jsonl.gz' \ - out_dir=/fsx/home-jordiclive/tmp/metadata-html-quarter \ - wb_name="full-metadata-with-generation-text-0.5-html" \ No newline at end of file + out_dir=/fsx/home-jordiclive/tmp/metadata-html-half \ +# wb_name="full-metadata-with-generation-text-0.5-html" \ No newline at end of file From 09c4e7f74896a14c045250f0ef25e967756179cd Mon Sep 17 00:00:00 2001 From: jordiclive Date: Tue, 6 Jun 2023 17:50:43 +0100 Subject: [PATCH 
10/15] no message

---
 bsmetadata/hydra_configs/v2.yaml |  1 -
 bsmetadata/train.py              |  4 ++--
 script.sh                        | 39 ++++++++++++++++++++++++++++++++
 3 files changed, 41 insertions(+), 3 deletions(-)
 create mode 100644 script.sh

diff --git a/bsmetadata/hydra_configs/v2.yaml b/bsmetadata/hydra_configs/v2.yaml
index 9311028b..a83fa5d7 100644
--- a/bsmetadata/hydra_configs/v2.yaml
+++ b/bsmetadata/hydra_configs/v2.yaml
@@ -1,7 +1,6 @@
 data_config:
   streaming: True
   validation_size_max: 1024
-  use_full_evaluation_for_val: true
   metadata_config:
     random_sample_metadata: true
     random_sample_metadata_calculate_size: 16384
diff --git a/bsmetadata/train.py b/bsmetadata/train.py
index 1e5c8648..4dc55a2c 100644
--- a/bsmetadata/train.py
+++ b/bsmetadata/train.py
@@ -408,7 +408,7 @@ def evaluate_multiple_dateloaders(eval_dataloaders, use_full_evaluation_for_val)
     do_eval = args.do_eval and args.start_with_eval
     if do_eval:
         logger.info("Start with an evaluation")
-        evaluate_multiple_dateloaders(eval_dataloaders, args.data_config.use_full_evaluation_for_val)
+        evaluate_multiple_dateloaders(eval_dataloaders, use_full_evaluation_for_val=True)
 
     if not args.do_train:
         return
@@ -530,7 +530,7 @@ def get_eval_data_iter():
             path = Path(args.out_dir).resolve() / f"checkpoint-{completed_steps}step"
             save(path)
         if do_eval:
-            evaluate_multiple_dateloaders(eval_dataloaders, args.data_config.use_full_evaluation_for_val)
+            evaluate_multiple_dateloaders(eval_dataloaders, use_full_evaluation_for_val=True)
 
         if completed_steps >= args.max_train_steps:
             # finished = True
diff --git a/script.sh b/script.sh
new file mode 100644
index 00000000..ed7e4f83
--- /dev/null
+++ b/script.sh
@@ -0,0 +1,39 @@
+source /fsx/home-jordiclive/miniconda3/bin/activate meta_conda
+cd /fsx/home-jordiclive/metadata
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib/
+export TRANSFORMERS_CACHE=/fsx/home-jordiclive/transformers_cache
+
+#export HF_DATASETS_OFFLINE=1
+#export TRANSFORMERS_OFFLINE=1
+#export WANDB_MODE=offline
+export HYDRA_FULL_ERROR=1
+
+
+export MODEL=gpt2-xl
+export NUM_GPU=8
+export DEEPSPEED_CONFIG=$(realpath bsmetadata/deepspeed_configs/v2.json)
+export DATA_DIR=$(realpath /fsx/home-jordiclive/metadata/local-data/datasets--bs-modeling-metadata--c4-en-html-with-training_metadata_all/snapshots/8f2615d8b8580e89533b90bc3931e0b99ef15aec)
+echo "deepspeed_config_file: $DEEPSPEED_CONFIG"
+
+export WANDB_API_KEY='d8216641d549f9bb3d0c5074baa39e15dfd55030'
+
+echo "compute_environment: LOCAL_MACHINE
+deepspeed_config:
+  deepspeed_config_file: $DEEPSPEED_CONFIG
+distributed_type: DEEPSPEED
+fp16: true
+machine_rank: 0
+main_process_ip: null
+main_process_port: null
+main_training_function: main
+num_machines: 1
+num_processes: -1
+mixed_precision: fp16
+" > accelerate_config.yaml
+CUDA_LAUNCH_BLOCKING=1 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 accelerate launch --config_file accelerate_config.yaml bsmetadata/train.py --config-name v2 \
+    model_name=$MODEL \
+    data_config.dataset_name=$DATA_DIR \
+    data_config.train_file='*.jsonl.gz' \
+    data_config.validation_file='c4-en-html_cc-main-2019-18_pq00-000.jsonl.gz' \
+    out_dir=/fsx/home-jordiclive/tmp/metadata-html-half \
+# wb_name="full-metadata-with-generation-text-0.5-html"
\ No newline at end of file

From bc892cc0d24084812dfb980bfc8a4ff594b7e6e2 Mon Sep 17 00:00:00 2001
From: jordiclive
Date: Tue, 6 Jun 2023 20:13:32 +0100
Subject: [PATCH 11/15] no message

---
 bsmetadata/evaluation.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/bsmetadata/evaluation.py 
b/bsmetadata/evaluation.py index 62ff1577..7f94e2a9 100644 --- a/bsmetadata/evaluation.py +++ b/bsmetadata/evaluation.py @@ -283,12 +283,12 @@ def evaluate_main( tokenizer: str = None, accelerator=None, ) -> dict: - if config_file_path is None: - try: - config_file_path = hf_hub_download(repo_id=repo_id, filename="actual_config.yaml", use_auth_token=True) - except Exception: - config_file_path = "bsmetadata/hydra_configs/v2.yaml" -# config_file_path = "/fsx/home-jordiclive/metadata/bsmetadata/hydra_configs/v2.yaml" need to add this path to PYTHONPATH + # if config_file_path is None: + # try: + # config_file_path = hf_hub_download(repo_id=repo_id, filename="actual_config.yaml", use_auth_token=True) + # except Exception: + # config_file_path = "bsmetadata/hydra_configs/v2.yaml" + config_file_path = "/fsx/home-jordiclive/metadata/bsmetadata/hydra_configs/v2.yaml" #need to add this path to PYTHONPATH repo_args = OmegaConf.load(config_file_path) data_config = repo_args.data_config From 55c5e4ef3d78338e7fb554e0d5bd072406688d13 Mon Sep 17 00:00:00 2001 From: jordiclive Date: Tue, 6 Jun 2023 20:45:09 +0100 Subject: [PATCH 12/15] no message --- bsmetadata/evaluation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bsmetadata/evaluation.py b/bsmetadata/evaluation.py index 7f94e2a9..b14dca92 100644 --- a/bsmetadata/evaluation.py +++ b/bsmetadata/evaluation.py @@ -275,7 +275,7 @@ def evaluate_main( test: bool = False, max_n_examples: int = 1500, prompt: bool = False, - no_cuda: bool = False, + no_cuda: bool = True, save_data: bool = False, untrained: bool = False, config_file_path: str = None, From 35d693558b560f582f046af7b080faf0f80176e4 Mon Sep 17 00:00:00 2001 From: jordiclive Date: Tue, 6 Jun 2023 20:54:11 +0100 Subject: [PATCH 13/15] no message --- bsmetadata/hydra_configs/v2.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bsmetadata/hydra_configs/v2.yaml b/bsmetadata/hydra_configs/v2.yaml index a83fa5d7..5a869e16 100644 --- a/bsmetadata/hydra_configs/v2.yaml +++ b/bsmetadata/hydra_configs/v2.yaml @@ -111,7 +111,7 @@ extra_steps_to_eval_save_at: - 2 evaluation_strategy: STEPS eval_num_per_epoch: 3 -eval_steps: 250 +eval_steps: 500 save_strategy: STEPS save_num_per_epoch: 3 save_steps: 1000 From 1f3a1f13fe714e08b09da9c373b4bb8bfe65ecd4 Mon Sep 17 00:00:00 2001 From: jordiclive Date: Wed, 7 Jun 2023 10:10:08 +0100 Subject: [PATCH 14/15] no message --- bsmetadata/hydra_configs/v2.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bsmetadata/hydra_configs/v2.yaml b/bsmetadata/hydra_configs/v2.yaml index 5a869e16..3916ead4 100644 --- a/bsmetadata/hydra_configs/v2.yaml +++ b/bsmetadata/hydra_configs/v2.yaml @@ -111,10 +111,10 @@ extra_steps_to_eval_save_at: - 2 evaluation_strategy: STEPS eval_num_per_epoch: 3 -eval_steps: 500 +eval_steps: 1000 save_strategy: STEPS save_num_per_epoch: 3 -save_steps: 1000 +save_steps: 500 do_train: true do_eval: true gradient_checkpointing: true From 4e891027a138d51c564fbac25f1575d00fbdc251 Mon Sep 17 00:00:00 2001 From: jordiclive Date: Wed, 7 Jun 2023 10:10:31 +0100 Subject: [PATCH 15/15] no message --- bsmetadata/hydra_configs/v2.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bsmetadata/hydra_configs/v2.yaml b/bsmetadata/hydra_configs/v2.yaml index 3916ead4..805d593f 100644 --- a/bsmetadata/hydra_configs/v2.yaml +++ b/bsmetadata/hydra_configs/v2.yaml @@ -107,8 +107,8 @@ model_name: gpt2 project_name: metadata_lm jobid: '' start_with_eval: false -extra_steps_to_eval_save_at: 
-- 2 +#extra_steps_to_eval_save_at: +#- 2 evaluation_strategy: STEPS eval_num_per_epoch: 3 eval_steps: 1000
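
---

Note on the evaluation hook this series wires into the training loop: `evaluate_main`
(patches 01, 04 and 06) is now importable by bsmetadata/train.py and returns a dict keyed
by metadata type, e.g. {"html": {"final_normal_ppl": ..., "final_metadata_ppl": ...}}.
A minimal sketch of how the pieces fit together follows. The body of the token-weighted
`ppl` helper is an assumed reconstruction (the patches only show its signature), and the
logging snippet mirrors `evaluate_multiple_dateloaders` in bsmetadata/train.py:

import math

def ppl(examples_mean_loss, examples_len):
    # Weight each example's mean NLL by its token count before exponentiating,
    # so long documents are not under-counted. Assumed reconstruction, not verbatim.
    total_nll = sum(loss * n for loss, n in zip(examples_mean_loss, examples_len))
    return math.exp(total_nll / sum(examples_len))

# Inside the training loop, the full evaluation runs with the live model/tokenizer and
# the per-type perplexities are logged (this mirrors the code added in patch 04):
#
#     results = evaluate_main(
#         output_file="eval.txt",
#         metadata_to_test="title,html,entity_paragraph,website_desc,generation_datasource,timestamp",
#         model=model,
#         tokenizer=tokenizer,
#         accelerator=accelerator,
#     )
#     model.train()  # evaluate_main leaves the model in eval mode
#     for metadata_type, ppls in results.items():
#         metrics_logger.log({metadata_type: ppls})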