diff --git a/bsmetadata/deepspeed_configs/v2.json b/bsmetadata/deepspeed_configs/v2.json
index 1d5c0311..dca70cc9 100644
--- a/bsmetadata/deepspeed_configs/v2.json
+++ b/bsmetadata/deepspeed_configs/v2.json
@@ -30,19 +30,19 @@
}
},
"zero_optimization": {
- "stage": 1,
- "allgather_partitions": true,
- "allgather_bucket_size": 500000000,
- "overlap_comm": true,
- "reduce_scatter": true,
- "reduce_bucket_size": 500000000,
- "contiguous_gradients": true,
- "cpu_offload": true
- },
- "gradient_accumulation_steps": 16,
+ "stage": 2,
+ "allgather_partitions": true,
+ "allgather_bucket_size": 2e8,
+ "overlap_comm": true,
+ "reduce_scatter": true,
+ "reduce_bucket_size": 2e8,
+ "contiguous_gradients": true,
+ "cpu_offload": false
+  },
+ "gradient_accumulation_steps": 2,
"gradient_clipping": "auto",
"steps_per_print": 100,
- "train_batch_size": 256,
+ "train_batch_size": 512,
"train_micro_batch_size_per_gpu": "auto",
"wall_clock_breakdown": false
}
\ No newline at end of file
diff --git a/bsmetadata/evaluation.py b/bsmetadata/evaluation.py
index b75c2aa5..b14dca92 100644
--- a/bsmetadata/evaluation.py
+++ b/bsmetadata/evaluation.py
@@ -46,6 +46,7 @@ def mean_loss_fn(
metadata_mask: torch.Tensor = None,
save_data: bool = False,
idx: int = None,
+    tokenizer=None,
) -> torch.Tensor:
"""Calculates the perplexity for a given batch.
@@ -62,15 +63,15 @@ def mean_loss_fn(
b = outputs.logits.size(0)
lm_logits = outputs.logits
- lm_logits[:, :, 50257] = float("-inf")
- lm_logits[:, :, 50258] = float("-inf")
-
labels = batch["labels"]
attention_mask = batch["attention_mask"]
shift_logits = lm_logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous()
if metadata_mask is not None:
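+        # Give the additional special tokens the lowest representable logit so they are excluded from the predicted distribution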
+ for special_tok in tokenizer.additional_special_tokens_ids:
+ shift_logits[:, :, special_tok] = torch.finfo(lm_logits.dtype).min
+
metadata_mask = metadata_mask.bool()
nonmetadata_cumsum = torch.cumsum(~metadata_mask, dim=-1)
first_nonmetadata = nonmetadata_cumsum == 1
@@ -133,7 +134,7 @@ def mean_loss_fn(
shift_labels.view(-1),
reduction="none",
).view(b, -1)
-
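+    # Replace any NaN entries in the per-token loss with 0 so they do not corrupt the aggregated loss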
+ loss = torch.nan_to_num(loss)
if save_data:
# Save the non-masked tokens & their loss
suffix = "_meta" if metadata_mask is not None else ""
@@ -180,6 +181,8 @@ def get_mean_loss(
batch: Dict[str, torch.Tensor],
save_data: bool = False,
idx: int = None,
+ model=None,
+    tokenizer=None,
) -> torch.Tensor:
"""Prepares the arguments for perplexity calculation and passes them to the perplexity function.
@@ -197,7 +200,7 @@ def get_mean_loss(
metadata_mask = batch.pop("metadata_mask", None)
outputs = model(**batch)
batch["labels"] = labels
- nll = mean_loss_fn(batch, outputs, metadata_mask, save_data=save_data, idx=idx)
+    nll = mean_loss_fn(batch, outputs, metadata_mask, save_data=save_data, idx=idx, tokenizer=tokenizer)
return nll
@@ -264,75 +267,28 @@ def create_metadata_prompt(example: Dict[str, Any], cfg: MetadataConfig) -> str:
return cfg.metadata_sep.join(sorted_metadata) + cfg.metadata_prefix_sep if sorted_metadata else ""
-if __name__ == "__main__":
- parser = argparse.ArgumentParser()
- parser.add_argument(
- "--repo_id",
- type=str,
- default="bs-modeling-metadata/checkpoints_all_04_23",
- help="Repository ID for the model to compute perplexity for",
- )
- parser.add_argument(
- "--subfolder",
- type=str,
- default="checkpoint-2000step",
- help="subfolder in the respository with the specific checkpoint to evaluate perplexity for",
- )
- parser.add_argument(
- "--config_file_path",
- type=str,
- help="The path actual_config.yaml if available, otherwise repo_id/actual_config.yaml or git clone's v2.yaml",
- )
- parser.add_argument(
- "--output_file", type=str, default="evaluation.txt", help="Path to the file the perplexity is written to"
- )
- parser.add_argument("--no_cuda", action="store_true", help="If set to true, all computations are performed on CPU")
- parser.add_argument(
- "--save_data",
- action="store_true",
- help="If set to true, save tokens & losses",
- )
- parser.add_argument(
- "--test",
- action="store_true",
- help="If set to true, the script runs in test mode and only takes 10 examples per dataset",
- )
- parser.add_argument(
- "--max_n_examples",
- type=int,
- default=1500,
- help="how many examples per metadata type to evaluate",
- )
- parser.add_argument(
- "--metadata_to_test",
- type=str,
- default="html,entity,entity_paragraph,website_desc,generation_datasource,timestamp,title,generation_length_sentence,generation_length_text,url,paragraph",
- help="metadata types to test",
- )
- parser.add_argument(
- "--untrained",
- action="store_true",
- help="If set to true, will load gpt2-xl",
- )
- parser.add_argument(
- "--prompt",
- action="store_true",
- help="If set to true, the script evaluates metadata in prompt style",
- )
-
- args = parser.parse_args()
- print(f"Parameters: {args}")
-
- # Load config
- if args.config_file_path:
- config_file_path = args.config_file_path
- else:
- try:
- config_file_path = hf_hub_download(
- repo_id=args.repo_id, filename="actual_config.yaml", use_auth_token=True
- )
- except Exception:
- config_file_path = "bsmetadata/hydra_configs/v2.yaml"
+def evaluate_main(
+ metadata_to_test: str = "title,html,entity_paragraph,website_desc,generation_datasource,timestamp",
+ output_file: str = "evaluation.txt",
+ repo_id: str = None,
+ subfolder: str = None,
+ test: bool = False,
+ max_n_examples: int = 1500,
+ prompt: bool = False,
+ no_cuda: bool = True,
+ save_data: bool = False,
+ untrained: bool = False,
+ config_file_path: str = None,
+    model=None,
+    tokenizer=None,
+ accelerator=None,
+) -> dict:
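+    """Compute perplexity with and without metadata for each requested metadata type; returns a dict keyed by metadata type."""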
+    if config_file_path is None:
+        try:
+            config_file_path = hf_hub_download(repo_id=repo_id, filename="actual_config.yaml", use_auth_token=True)
+        except Exception:
+            config_file_path = "bsmetadata/hydra_configs/v2.yaml"
repo_args = OmegaConf.load(config_file_path)
data_config = repo_args.data_config
@@ -341,15 +297,17 @@ def create_metadata_prompt(example: Dict[str, Any], cfg: MetadataConfig) -> str:
# Load model
print("Loading model...")
- if args.untrained:
- model = AutoModelForCausalLM.from_pretrained("gpt2-xl")
- else:
- model = AutoModelForCausalLM.from_pretrained(args.repo_id, subfolder=args.subfolder, use_auth_token=True)
- model.eval().cuda() if not args.no_cuda else model.eval()
-
- # Load tokenizer
- tokenizer = AutoTokenizer.from_pretrained(repo_args.model_name)
- tokenizer.pad_token = tokenizer.eos_token
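+    # Reuse a model/tokenizer passed in by the caller (e.g. the training loop); otherwise load them from the Hub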
+ if model is None or tokenizer is None:
+ if untrained:
+ model = AutoModelForCausalLM.from_pretrained("gpt2-xl")
+ tokenizer = AutoTokenizer.from_pretrained(repo_args.model_name)
+ tokenizer.pad_token = tokenizer.eos_token
+ else:
+ model = AutoModelForCausalLM.from_pretrained(repo_id, subfolder=subfolder, use_auth_token=True)
+ tokenizer = AutoTokenizer.from_pretrained(
+ "bs-modeling-metadata/checkpoints_all_04_23", subfolder="tokenizer", use_auth_token=True
+ )
+ model.eval().cuda() if not no_cuda else model.eval()
# Config preprocess function
cfg = data_config.metadata_config
@@ -358,7 +316,7 @@ def create_metadata_prompt(example: Dict[str, Any], cfg: MetadataConfig) -> str:
cfg.metadata_list.append("entity")
cfg.metadata_list.append("paragraph")
- if args.prompt:
+ if prompt:
cfg.metadata_sep = "; " # Instead of " | "
cfg.metadata_prefix_sep = "" # Instead of " |||"; there's already an implicit " "
DatasourceProcessor.process_global = datasource_process_global_for_prompt
@@ -381,8 +339,8 @@ def create_metadata_prompt(example: Dict[str, Any], cfg: MetadataConfig) -> str:
"bs-modeling-metadata/c4-en-html-with-validation_metadata_url",
"bs-modeling-metadata/c4-en-html-with-validation_metadata_paragraph",
]
- dataset_paths = [path for path in dataset_paths if path.split("_metadata_")[1] in args.metadata_to_test.split(",")]
-
+ dataset_paths = [path for path in dataset_paths if path.split("_metadata_")[1] in metadata_to_test.split(",")]
+ results = {}
for path in dataset_paths:
n_examples = 0
total_normal_len = []
@@ -394,11 +352,11 @@ def create_metadata_prompt(example: Dict[str, Any], cfg: MetadataConfig) -> str:
# Load validation dataset from hugging face
metadata_type = path.split("_metadata_")[1]
print(f"Loading {metadata_type} data...")
- split = "validation" if not args.test else "validation[:10]"
+ split = "validation" if not test else "validation[:10]"
validation_dataset = load_dataset(path, use_auth_token=True, split=split)
data = []
- max_n_examples_ord = len(str(args.max_n_examples))
+ max_n_examples_ord = len(str(max_n_examples))
for idx, example in tqdm(enumerate(validation_dataset), desc=f"Calculating perplexity for {metadata_type}..."):
# for idx in [136,]:
example = validation_dataset[idx]
@@ -409,7 +367,7 @@ def create_metadata_prompt(example: Dict[str, Any], cfg: MetadataConfig) -> str:
except Exception as e:
# Write error to output file and continue with next dataset
print(e)
- with open(args.output_file, "a", encoding="utf8") as f:
+ with open(output_file, "a", encoding="utf8") as f:
f.write(f"=== RESULT [{metadata_type}] ===\n")
f.write(f"{e}\n\n")
exit_flag = True
@@ -445,7 +403,10 @@ def create_metadata_prompt(example: Dict[str, Any], cfg: MetadataConfig) -> str:
normal_batch = default_data_collator([normal_example])
metadata_example["labels"] = metadata_example["input_ids"]
metadata_batch = default_data_collator([metadata_example])
- if not args.no_cuda:
+ if accelerator is not None:
+ normal_batch = {k: v.to(accelerator.device) for k, v in normal_batch.items()}
+ metadata_batch = {k: v.to(accelerator.device) for k, v in metadata_batch.items()}
+ elif not no_cuda:
normal_batch = {k: v.cuda() for k, v in normal_batch.items()}
metadata_batch = {k: v.cuda() for k, v in metadata_batch.items()}
if n_examples == 1:
@@ -461,12 +422,14 @@ def create_metadata_prompt(example: Dict[str, Any], cfg: MetadataConfig) -> str:
# rich.print(tokenizer.decode(metadata_batch["input_ids"][0]))
# Calculate nll (natural-log loss)
- normal_nll, normal_example_len = get_mean_loss(normal_batch, save_data=args.save_data, idx=idx) # [0]
+ normal_nll, normal_example_len = get_mean_loss(
+                normal_batch, save_data=save_data, idx=idx, model=model, tokenizer=tokenizer
+ ) # [0]
# print("PPL")
# print(normal_ppl)
total_normal_nll.append(normal_nll) # * normal_example_len
metadata_nll, metadata_example_len = get_mean_loss(
- metadata_batch, save_data=args.save_data, idx=idx
+                metadata_batch, save_data=save_data, idx=idx, model=model, tokenizer=tokenizer
) # [0]
# print(metadata_ppl)
total_metadata_nll.append(metadata_nll) # * metadata_example_len
@@ -521,7 +484,7 @@ def create_metadata_prompt(example: Dict[str, Any], cfg: MetadataConfig) -> str:
# sys.exit()
- if n_examples > args.max_n_examples:
+ if n_examples > max_n_examples:
break
if exit_flag:
@@ -554,9 +517,86 @@ def ppl(examples_mean_loss, examples_len):
else:
final_metadata_ppl = final_normal_ppl = 0
- # Write results to output file
- with open(args.output_file, "a", encoding="utf8") as f:
- f.write(f"=== RESULT [{metadata_type}] ===\n")
- f.write("Perplexity (metadata): {:>6,.3f}\n".format(final_metadata_ppl))
- f.write("Perplexity (normal): {:>6,.3f}\n\n".format(final_normal_ppl))
+ results[metadata_type] = {"final_normal_ppl": final_normal_ppl, "final_metadata_ppl": final_metadata_ppl}
torch.save(data, "eva.data")
+ return results
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--repo_id",
+ type=str,
+ default="bs-modeling-metadata/checkpoints_all_04_23",
+ help="Repository ID for the model to compute perplexity for",
+ )
+ parser.add_argument(
+ "--subfolder",
+ type=str,
+ default="checkpoint-2000step",
+ help="subfolder in the respository with the specific checkpoint to evaluate perplexity for",
+ )
+ parser.add_argument(
+ "--config_file_path",
+ type=str,
+ help="The path actual_config.yaml if available, otherwise repo_id/actual_config.yaml or git clone's v2.yaml",
+ )
+ parser.add_argument(
+ "--output_file", type=str, default="evaluation.txt", help="Path to the file the perplexity is written to"
+ )
+ parser.add_argument("--no_cuda", action="store_true", help="If set to true, all computations are performed on CPU")
+ parser.add_argument(
+ "--save_data",
+ action="store_true",
+ help="If set to true, save tokens & losses",
+ )
+ parser.add_argument(
+ "--test",
+ action="store_true",
+ help="If set to true, the script runs in test mode and only takes 10 examples per dataset",
+ )
+ parser.add_argument(
+ "--max_n_examples",
+ type=int,
+ default=1500,
+ help="how many examples per metadata type to evaluate",
+ )
+ parser.add_argument(
+ "--metadata_to_test",
+ type=str,
+ default="html,entity,entity_paragraph,website_desc,generation_datasource,timestamp,title,generation_length_sentence,generation_length_text,url,paragraph",
+ help="metadata types to test",
+ )
+ parser.add_argument(
+ "--untrained",
+ action="store_true",
+ help="If set to true, will load gpt2-xl",
+ )
+ parser.add_argument(
+ "--prompt",
+ action="store_true",
+ help="If set to true, the script evaluates metadata in prompt style",
+ )
+
+ args = parser.parse_args()
+ print(f"Parameters: {args}")
+ results = evaluate_main(
+ repo_id=args.repo_id,
+ subfolder=args.subfolder,
+ config_file_path=args.config_file_path,
+ output_file=args.output_file,
+ save_data=args.save_data,
+ test=args.test,
+ max_n_examples=args.max_n_examples,
+ metadata_to_test=args.metadata_to_test,
+ untrained=args.untrained,
+ prompt=args.prompt,
+ no_cuda=args.no_cuda,
+ )
+ # Write results to output file
+ with open(args.output_file, "a", encoding="utf8") as f:
+ for k, v in results.items():
+ f.write(f"=== RESULT [{k}] ===\n")
+ f.write("Perplexity (metadata): {:>6,.3f}\n".format(v["final_metadata_ppl"]))
+ f.write("Perplexity (normal): {:>6,.3f}\n\n".format(v["final_normal_ppl"]))
diff --git a/bsmetadata/experiments/with_metadata_datasetv2_tf.py b/bsmetadata/experiments/with_metadata_datasetv2_tf.py
index e4ebdd28..1f45ad4a 100644
--- a/bsmetadata/experiments/with_metadata_datasetv2_tf.py
+++ b/bsmetadata/experiments/with_metadata_datasetv2_tf.py
@@ -87,7 +87,7 @@ def filter_empty(t):
return data
-def get_dataloader(*, tokenizer, args, num_gpus, gpu_id):
+def get_dataloader(*, tokenizer, args, num_gpus, gpu_id, train=True):
"""returns a tensorflow dataloader"""
data_config = args
local_dir = Path(data_config.dataset_name)
@@ -104,14 +104,22 @@ def get_dataloader(*, tokenizer, args, num_gpus, gpu_id):
print(f"{len(files_with_entities)} files with entities")
print(f"{len(files_without_entities)} files without entities")
+ if train:
+ files_with_entities = [
+ x for x in files_with_entities if "c4-en-html_cc-main-2019-18_pq00-000.jsonl.gz" not in x.name
+ ]
+ else:
+ files_with_entities = [
+ x for x in files_with_entities if "c4-en-html_cc-main-2019-18_pq00-000.jsonl.gz" in x.name
+ ]
+
data_with_entities = get_dataset(files_with_entities, num_gpus, gpu_id, data_config, tokenizer)
- data_without_entities = get_dataset(files_without_entities, num_gpus, gpu_id, data_config, tokenizer)
+
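+    # Only the files with entities remain in the mix; sampling from a single dataset simply yields its examples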
data = tf.data.Dataset.sample_from_datasets(
- [data_with_entities, data_without_entities],
- weights=[float(len(files_with_entities)), float(len(files_without_entities))],
+ [data_with_entities],
+ weights=[float(len(files_with_entities))],
seed=42,
)
-
data = data.shuffle(1000, reshuffle_each_iteration=True)
data = data.batch(data_config.per_device_train_batch_size)
data = data.prefetch(tf.data.AUTOTUNE)
diff --git a/bsmetadata/hydra_configs/v2.yaml b/bsmetadata/hydra_configs/v2.yaml
index 42a0044a..805d593f 100644
--- a/bsmetadata/hydra_configs/v2.yaml
+++ b/bsmetadata/hydra_configs/v2.yaml
@@ -11,7 +11,7 @@ data_config:
title: 1.0657717366883845
generation_datasource: 1.0
entity_paragraph: 1.028817740667444
-
+ generation_length_text: 1.0
#- url: 1.0
#- generation_length_sentence
#- generation_length_text
@@ -28,6 +28,7 @@ data_config:
- datasource
- length
- entity_paragraph
+ - generation_length_text
metadata_column_list:
- html
- timestamp
@@ -36,9 +37,9 @@ data_config:
#- url
- generation_datasource
#- generation_length_sentence
- #- generation_length_text
+ - generation_length_text
- entity_paragraph
- local_metadata_special_tokens:
+ local_metadata_special_tokens:
entity_paragraph: "entity"
metadata_sep: ' | '
metadata_key_value_sep: ': '
@@ -72,14 +73,16 @@ data_config:
- 0.0
local_metadata_special_token_start:
entity_paragraph: ""
+ html: ""
local_metadata_special_token_end:
entity_paragraph: " "
+ html: ""
local_metadata_special_token_state: true
- html_overall_sample_rate: 0.25
+ html_overall_sample_rate: 0.5
without_metadata_same_context: false
experiment: with_metadata_datasetv2_tf
- per_device_eval_batch_size: 8
- per_device_train_batch_size: 8
+ per_device_eval_batch_size: 32 # 32 for 40gb
+ per_device_train_batch_size: 32
dataset_name: bs-modeling-metadata/c4-en-html-with-training_metadata_all
dataset_config_name: null
train_file: c4-en-html_cc-main-2019-18_pq00-000.jsonl.gz
@@ -87,12 +90,13 @@ data_config:
overwrite_cache: false
cache_dir: null
extension: null
- preprocessing_num_workers: 6
+ preprocessing_num_workers: 40
validation_split_percentage: 5
block_size: null
map_batch_size: 1
weight_decay: 0.01
-learning_rate: 5e-5
+learning_rate: 0.0001
+wb_name: "all_metadata"
num_train_epochs: 1
max_train_steps: 100000
lr_scheduler_type: linear
@@ -103,16 +107,16 @@ model_name: gpt2
project_name: metadata_lm
jobid: ''
start_with_eval: false
-extra_steps_to_eval_save_at:
-- 2
+#extra_steps_to_eval_save_at:
+#- 2
evaluation_strategy: STEPS
eval_num_per_epoch: 3
-eval_steps: 2000
+eval_steps: 1000
save_strategy: STEPS
save_num_per_epoch: 3
-save_steps: 150
+save_steps: 500
do_train: true
do_eval: true
gradient_checkpointing: true
resume_from_checkpoint_dir: null
-gradient_accumulation_steps: 16
+gradient_accumulation_steps: 2
\ No newline at end of file
diff --git a/bsmetadata/train.py b/bsmetadata/train.py
index d97853b6..4dc55a2c 100644
--- a/bsmetadata/train.py
+++ b/bsmetadata/train.py
@@ -16,11 +16,11 @@
import wandb
from accelerate import Accelerator
from accelerate.utils import DistributedType, DummyOptim, DummyScheduler
+from bsmetadata.evaluation import evaluate_main
from hydra.core.config_store import ConfigStore
from omegaconf import OmegaConf
-from torch.optim import AdamW
from tqdm.auto import tqdm as original_tqdm
-from transformers import AddedToken, AutoConfig, AutoModelForCausalLM, AutoTokenizer, get_scheduler, set_seed
+from transformers import AdamW, AddedToken, AutoConfig, AutoModelForCausalLM, AutoTokenizer, get_scheduler, set_seed
from transformers.trainer_utils import IntervalStrategy
from bsmetadata.input_pipeline import DataConfig, get_dataloaders
@@ -34,6 +34,7 @@ class CFG:
data_config: DataConfig = DataConfig()
weight_decay: float = field(default=0.0, metadata={"help": "The weight decay to use for training."})
learning_rate: float = field(default=5e-5, metadata={"help": "The initial learning rate."})
+ wb_name: str = field(default="bsmetadata", metadata={"help": "The name of the wandb project."})
gradient_accumulation_steps: int = field(
default=1,
metadata={"help": "The number of gradient accumulation steps to perform before updating model parameters."},
@@ -217,8 +218,8 @@ def main(args: CFG) -> None:
is_local_main_process = accelerator.is_local_main_process
tqdm = partial(original_tqdm, disable=not is_local_main_process, position=0)
use_deepspeed = accelerator.state.deepspeed_plugin is not None
- use_deepspeed_optimzer = use_deepspeed and "optimizer" in accelerator.state.deepspeed_plugin.deepspeed_config
- use_deepspeed_scheduler = use_deepspeed and "scheduler" in accelerator.state.deepspeed_plugin.deepspeed_config
+ use_deepspeed_optimzer = use_deepspeed or "optimizer" in accelerator.state.deepspeed_plugin.deepspeed_config
+ use_deepspeed_scheduler = use_deepspeed or "scheduler" in accelerator.state.deepspeed_plugin.deepspeed_config
if accelerator.distributed_type == DistributedType.DEEPSPEED and not use_deepspeed_scheduler:
assert False, "Please set scheduler in DeepSpeed config file otherwise it may not be checkpointed properly"
@@ -294,7 +295,13 @@ def main(args: CFG) -> None:
gpu_id=accelerator.process_index,
)
dummy_dataloader = get_dummy_dataloader(args.data_config.per_device_train_batch_size)
- eval_dataloaders = dict()
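+    # Build a validation dataloader over the held-out shard (train=False selects only that file)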
+ eval_dataloader, format_fn_eval = get_dataloader(
+ tokenizer=tokenizer,
+ args=args.data_config,
+ num_gpus=accelerator.num_processes,
+ gpu_id=accelerator.process_index,
+ train=False,
+ )
model, optimizer, dummy_dataloader, scheduler = accelerator.prepare(
model, optimizer, dummy_dataloader, scheduler
)
@@ -348,7 +355,7 @@ def format_fn(x):
save_per_n_step = args.max_train_steps + 1 # will never eval
@torch.no_grad()
- def evaluate(eval_dataloader):
+ def evaluate(eval_dataloader, only_first_n_steps=120):
model.eval()
losses = []
for step, batch in enumerate(tqdm(eval_dataloader, desc="eval")): # , leave=False)
@@ -359,7 +366,8 @@ def evaluate(eval_dataloader):
loss = loss_fn(batch, outputs, metadata_mask)
losses.append(accelerator.gather(loss.repeat(args.data_config.per_device_eval_batch_size)))
-
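+            # Evaluate on at most only_first_n_steps batches before returning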
+ if step == only_first_n_steps:
+ break
model.train()
if not losses:
# in case the dataloader is empty
@@ -368,23 +376,39 @@ def evaluate(eval_dataloader):
perplexity = math.exp(torch.mean(losses))
return {"perplexity": perplexity}
- def evaluate_multiple_dateloaders(eval_dataloaders):
- for key, eval_dataloader in eval_dataloaders.items():
- logger.info(f"Evaluating split {key}")
- metrics = evaluate(eval_dataloader)
- metrics_logger.log({key: metrics})
+ def evaluate_multiple_dateloaders(eval_dataloaders, use_full_evaluation_for_val):
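+        # With use_full_evaluation_for_val, run the standalone metadata perplexity evaluation; otherwise loop over the streaming eval dataloaders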
+ if use_full_evaluation_for_val:
+ results = evaluate_main(
+ output_file="eval.txt",
+ # metadata_to_test="entity_paragraph",
+ metadata_to_test="title,html,entity_paragraph,website_desc,generation_datasource,timestamp,generation_length_text",
+ model=model,
+ tokenizer=tokenizer,
+ accelerator=accelerator,
+ )
+ model.train()
+ for k, v in results.items():
+ metrics_logger.log({k: v})
+ else:
+ for key, eval_dataloader in eval_dataloaders.items():
+ logger.info(f"Evaluating split {key}")
+ metrics = evaluate(eval_dataloader)
+ metrics_logger.log({key: metrics})
logger.info("Evaluation finished")
if not args.do_train and not args.do_eval:
return
progress_bar = tqdm(range(args.max_train_steps), desc="training", initial=train_state.completed_steps)
- metrics_logger = Logger(is_local_main_process, project=args.project_name, config=config_dict)
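+    # Effective global train batch size, used only in the wandb run name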
+    t_bs = args.data_config.per_device_train_batch_size * args.gradient_accumulation_steps * accelerator.num_processes
+ os.environ['WANDB_API_KEY'] = 'd8216641d549f9bb3d0c5074baa39e15dfd55030'
+ metrics_logger = Logger(is_local_main_process, name=f"{args.wb_name}-{args.learning_rate}-{t_bs}",
+ entity='jordanclive', project='metadata', config=config_dict)
do_eval = args.do_eval and args.start_with_eval
if do_eval:
logger.info("Start with an evaluation")
- evaluate_multiple_dateloaders(eval_dataloaders)
+ evaluate_multiple_dateloaders(eval_dataloaders, use_full_evaluation_for_val=True)
if not args.do_train:
return
@@ -406,7 +430,7 @@ def save(path):
model.save_checkpoint(path)
else:
accelerator.save_state(path)
- save_model_and_tokenizer(accelerator, model, path)
+ save_model_and_tokenizer(accelerator, model, path, tokenizer=tokenizer)
if is_local_main_process:
train_state.save(path / "train_state.json")
@@ -426,6 +450,17 @@ def get_data_iter():
batch = {k: v.to(accelerator.device) for k, v in batch.items()}
yield batch
+ def get_eval_data_iter():
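+        # Cycle the validation dataloader indefinitely so evaluation can always draw a fresh batch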
+ while True:
+ for batch in eval_dataloader:
+ batch = format_fn_eval(batch)
+ if args.data_config.experiment == "with_metadata_datasetv2_tf":
+ batch = {k: v.to(accelerator.device) for k, v in batch.items()}
+ yield batch
+
+ eval_iter = get_eval_data_iter()
+ eval_dataloaders = {"validation": eval_iter}
+
data_iter = get_data_iter()
for _ in tqdm(
@@ -461,11 +496,18 @@ def get_data_iter():
optimizer.zero_grad()
step_loss_gathered = accelerator.gather(step_loss).mean().item()
- metrics = {
- "loss": step_loss_gathered,
- "lr": max(scheduler.get_lr()),
- "gradient_step": train_state.completed_steps,
- }
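+            # For the first few steps, log a placeholder learning rate of 0 instead of querying the scheduler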
+ if step < 20:
+ metrics = {
+ "loss": step_loss_gathered,
+ "lr": 0,
+ "gradient_step": train_state.completed_steps,
+ }
+ else:
+ metrics = {
+ "loss": step_loss_gathered,
+ "lr": max(scheduler.get_last_lr()),
+ "gradient_step": train_state.completed_steps,
+ }
if not args.data_config.streaming:
metrics["epoch"] = step / len(train_dataloader)
@@ -488,7 +530,7 @@ def get_data_iter():
path = Path(args.out_dir).resolve() / f"checkpoint-{completed_steps}step"
save(path)
if do_eval:
- evaluate_multiple_dateloaders(eval_dataloaders)
+ evaluate_multiple_dateloaders(eval_dataloaders, use_full_evaluation_for_val=True)
if completed_steps >= args.max_train_steps:
# finished = True
diff --git a/script.sh b/script.sh
new file mode 100644
index 00000000..ed7e4f83
--- /dev/null
+++ b/script.sh
@@ -0,0 +1,39 @@
+source /fsx/home-jordiclive/miniconda3/bin/activate meta_conda
+cd /fsx/home-jordiclive/metadata
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib/
+export TRANSFORMERS_CACHE=/fsx/home-jordiclive/transformers_cache
+
+#export HF_DATASETS_OFFLINE=1
+#export TRANSFORMERS_OFFLINE=1
+#export WANDB_MODE=offline
+export HYDRA_FULL_ERROR=1
+
+
+export MODEL=gpt2-xl
+export NUM_GPU=8
+export DEEPSPEED_CONFIG=$(realpath bsmetadata/deepspeed_configs/v2.json)
+export DATA_DIR=$(realpath /fsx/home-jordiclive/metadata/local-data/datasets--bs-modeling-metadata--c4-en-html-with-training_metadata_all/snapshots/8f2615d8b8580e89533b90bc3931e0b99ef15aec)
+echo "deepspeed_config_file: $DEEPSPEED_CONFIG"
+
+export WANDB_API_KEY='d8216641d549f9bb3d0c5074baa39e15dfd55030'
+
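+# Write a single-node DeepSpeed accelerate config that points at the DeepSpeed JSON above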
+echo "compute_environment: LOCAL_MACHINE
+deepspeed_config:
+ deepspeed_config_file: $DEEPSPEED_CONFIG
+distributed_type: DEEPSPEED
+fp16: true
+machine_rank: 0
+main_process_ip: null
+main_process_port: null
+main_training_function: main
+num_machines: 1
+num_processes: -1
+mixed_precision: fp16
+" > accelerate_config.yaml
+CUDA_LAUNCH_BLOCKING=1 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 accelerate launch --config_file accelerate_config.yaml bsmetadata/train.py --config-name v2 \
+ model_name=$MODEL \
+ data_config.dataset_name=$DATA_DIR \
+ data_config.train_file='*.jsonl.gz' \
+ data_config.validation_file='c4-en-html_cc-main-2019-18_pq00-000.jsonl.gz' \
+ out_dir=/fsx/home-jordiclive/tmp/metadata-html-half \
+# wb_name="full-metadata-with-generation-text-0.5-html"
\ No newline at end of file
diff --git a/slurm_40.sh b/slurm_40.sh
new file mode 100644
index 00000000..7f483bb7
--- /dev/null
+++ b/slurm_40.sh
@@ -0,0 +1,47 @@
+#!/bin/bash
+#SBATCH --account laion
+#SBATCH --partition="g40"
+#SBATCH --job-name=flan
+#SBATCH --gres=gpu:8
+#SBATCH --ntasks-per-node=8
+#SBATCH --cpus-per-task=12
+#SBATCH --output=%x_%j.out
+source /fsx/home-jordiclive/miniconda3/bin/activate meta_conda
+cd /fsx/home-jordiclive/metadata
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib/
+export TRANSFORMERS_CACHE=/fsx/home-jordiclive/transformers_cache
+
+#export HF_DATASETS_OFFLINE=1
+#export TRANSFORMERS_OFFLINE=1
+#export WANDB_MODE=offline
+export HYDRA_FULL_ERROR=1
+
+
+export MODEL=gpt2-xl
+export NUM_GPU=8
+export DEEPSPEED_CONFIG=$(realpath bsmetadata/deepspeed_configs/v2.json)
+export DATA_DIR=$(realpath /fsx/home-jordiclive/metadata/local-data/datasets--bs-modeling-metadata--c4-en-html-with-training_metadata_all/snapshots/8f2615d8b8580e89533b90bc3931e0b99ef15aec)
+echo "deepspeed_config_file: $DEEPSPEED_CONFIG"
+
+export WANDB_API_KEY='d8216641d549f9bb3d0c5074baa39e15dfd55030'
+
+echo "compute_environment: LOCAL_MACHINE
+deepspeed_config:
+ deepspeed_config_file: $DEEPSPEED_CONFIG
+distributed_type: DEEPSPEED
+fp16: true
+machine_rank: 0
+main_process_ip: null
+main_process_port: null
+main_training_function: main
+num_machines: 1
+num_processes: -1
+mixed_precision: fp16
+" > accelerate_config.yaml
+CUDA_LAUNCH_BLOCKING=1 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 accelerate launch --config_file accelerate_config.yaml bsmetadata/train.py --config-name v2 \
+ model_name=$MODEL \
+ data_config.dataset_name=$DATA_DIR \
+ data_config.train_file='*.jsonl.gz' \
+ data_config.validation_file='c4-en-html_cc-main-2019-18_pq00-000.jsonl.gz' \
+ out_dir=/fsx/home-jordiclive/tmp/metadata-html-half \
+# wb_name="full-metadata-with-generation-text-0.5-html"
\ No newline at end of file
diff --git a/tests/test_metadata_utils.py b/tests/test_metadata_utils.py
index 49408078..51ecdcb8 100644
--- a/tests/test_metadata_utils.py
+++ b/tests/test_metadata_utils.py
@@ -454,6 +454,24 @@ def test_entity_settings(self):
"EntityOn |EntityParagraphOn ||| |United Kingdom| |Louis Vuitton| |Billy Connolly| |Something in Common| |Lembit Öpik| Hints and tips for media appearances, speaking and social media. This week; wall-to-wall politicians; Great Britain [[United Kingdom]]: Louis Vuitton [[Louis Vuitton]] condoms; Billy Connolly [[Billy Connolly]],; Lisa Dutton; Something in Common [[Something in Common]]; What was I saying?: We’re all publishers; An interview with Lembit Opik [[Lembit Öpik]]; Music from The Good Suns",
)
+ def test_html_special_token_settings(self):
+ cfg = MetadataConfig()
+ PROCESSORS["html"] = HtmlProcessor
+ cfg.metadata_list = ["html"]
+ cfg.treat_local_metadata_as_regular_text = True
+ cfg.local_metadata_special_token_start = {"html": ""}
+ cfg.local_metadata_special_token_end = {"html": ""}
+ text, mask = add_local_metadata_to_text(self.examples[1], cfg)
+ self.assertEqual(
+ text,
+ "An apple is an edible fruit "
+ "produced by an "
+ "apple tree"
+ " (Malus domestica).",
+ )
+
def test_add_local_metadata_to_text(self):
cfg = MetadataConfig()
cfg.metadata_list = ["html", "entity"]