From 2dde85c21abd5f532574db33bdd53dfe4dd98857 Mon Sep 17 00:00:00 2001 From: jordiclive Date: Wed, 17 May 2023 13:42:36 +0100 Subject: [PATCH 01/15] add full evaluation into training loop. Other training changes for A100 node. --- bsmetadata/deepspeed_configs/v2.json | 22 +- bsmetadata/evaluation.py | 212 ++++++++++-------- .../experiments/with_metadata_datasetv2_tf.py | 27 ++- bsmetadata/hydra_configs/v2.yaml | 21 +- bsmetadata/train.py | 79 +++++-- 5 files changed, 216 insertions(+), 145 deletions(-) diff --git a/bsmetadata/deepspeed_configs/v2.json b/bsmetadata/deepspeed_configs/v2.json index 1d5c0311..35a24626 100644 --- a/bsmetadata/deepspeed_configs/v2.json +++ b/bsmetadata/deepspeed_configs/v2.json @@ -30,19 +30,19 @@ } }, "zero_optimization": { - "stage": 1, - "allgather_partitions": true, - "allgather_bucket_size": 500000000, - "overlap_comm": true, - "reduce_scatter": true, - "reduce_bucket_size": 500000000, - "contiguous_gradients": true, - "cpu_offload": true - }, - "gradient_accumulation_steps": 16, + "stage": 2, + "allgather_partitions": true, + "allgather_bucket_size": 2e8, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 2e8, + "contiguous_gradients": true, + "cpu_offload": false +}, + "gradient_accumulation_steps": 1, "gradient_clipping": "auto", "steps_per_print": 100, - "train_batch_size": 256, + "train_batch_size": 512, "train_micro_batch_size_per_gpu": "auto", "wall_clock_breakdown": false } \ No newline at end of file diff --git a/bsmetadata/evaluation.py b/bsmetadata/evaluation.py index b75c2aa5..235e7263 100644 --- a/bsmetadata/evaluation.py +++ b/bsmetadata/evaluation.py @@ -264,73 +264,24 @@ def create_metadata_prompt(example: Dict[str, Any], cfg: MetadataConfig) -> str: return cfg.metadata_sep.join(sorted_metadata) + cfg.metadata_prefix_sep if sorted_metadata else "" -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--repo_id", - type=str, - default="bs-modeling-metadata/checkpoints_all_04_23", - help="Repository ID for the model to compute perplexity for", - ) - parser.add_argument( - "--subfolder", - type=str, - default="checkpoint-2000step", - help="subfolder in the respository with the specific checkpoint to evaluate perplexity for", - ) - parser.add_argument( - "--config_file_path", - type=str, - help="The path actual_config.yaml if available, otherwise repo_id/actual_config.yaml or git clone's v2.yaml", - ) - parser.add_argument( - "--output_file", type=str, default="evaluation.txt", help="Path to the file the perplexity is written to" - ) - parser.add_argument("--no_cuda", action="store_true", help="If set to true, all computations are performed on CPU") - parser.add_argument( - "--save_data", - action="store_true", - help="If set to true, save tokens & losses", - ) - parser.add_argument( - "--test", - action="store_true", - help="If set to true, the script runs in test mode and only takes 10 examples per dataset", - ) - parser.add_argument( - "--max_n_examples", - type=int, - default=1500, - help="how many examples per metadata type to evaluate", - ) - parser.add_argument( - "--metadata_to_test", - type=str, - default="html,entity,entity_paragraph,website_desc,generation_datasource,timestamp,title,generation_length_sentence,generation_length_text,url,paragraph", - help="metadata types to test", - ) - parser.add_argument( - "--untrained", - action="store_true", - help="If set to true, will load gpt2-xl", - ) - parser.add_argument( - "--prompt", - action="store_true", - help="If set to true, 
the script evaluates metadata in prompt style", - ) - - args = parser.parse_args() - print(f"Parameters: {args}") - - # Load config - if args.config_file_path: - config_file_path = args.config_file_path - else: +def evaluate_main( + metadata_to_test: str = "title,html,entity_paragraph,website_desc,generation_datasource,timestamp", + output_file: str = "evaluation.txt", + repo_id: str = None, + subfolder: str = None, + test: bool = False, + max_n_examples: int = 1500, + prompt: bool = False, + no_cuda: bool = False, + save_data: bool = False, + untrained: bool = False, + config_file_path: str = None, + model: str = None, + tokenizer: str = None, +) -> dict: + if config_file_path is None: try: - config_file_path = hf_hub_download( - repo_id=args.repo_id, filename="actual_config.yaml", use_auth_token=True - ) + config_file_path = hf_hub_download(repo_id=repo_id, filename="actual_config.yaml", use_auth_token=True) except Exception: config_file_path = "bsmetadata/hydra_configs/v2.yaml" repo_args = OmegaConf.load(config_file_path) @@ -341,15 +292,17 @@ def create_metadata_prompt(example: Dict[str, Any], cfg: MetadataConfig) -> str: # Load model print("Loading model...") - if args.untrained: - model = AutoModelForCausalLM.from_pretrained("gpt2-xl") - else: - model = AutoModelForCausalLM.from_pretrained(args.repo_id, subfolder=args.subfolder, use_auth_token=True) - model.eval().cuda() if not args.no_cuda else model.eval() - - # Load tokenizer - tokenizer = AutoTokenizer.from_pretrained(repo_args.model_name) - tokenizer.pad_token = tokenizer.eos_token + if model is None or tokenizer is None: + if untrained: + model = AutoModelForCausalLM.from_pretrained("gpt2-xl") + tokenizer = AutoTokenizer.from_pretrained(repo_args.model_name) + tokenizer.pad_token = tokenizer.eos_token + else: + model = AutoModelForCausalLM.from_pretrained(repo_id, subfolder=subfolder, use_auth_token=True) + tokenizer = AutoTokenizer.from_pretrained( + "bs-modeling-metadata/checkpoints_all_04_23", subfolder="tokenizer", use_auth_token=True + ) + model.eval().cuda() if not no_cuda else model.eval() # Config preprocess function cfg = data_config.metadata_config @@ -358,7 +311,7 @@ def create_metadata_prompt(example: Dict[str, Any], cfg: MetadataConfig) -> str: cfg.metadata_list.append("entity") cfg.metadata_list.append("paragraph") - if args.prompt: + if prompt: cfg.metadata_sep = "; " # Instead of " | " cfg.metadata_prefix_sep = "" # Instead of " |||"; there's already an implicit " " DatasourceProcessor.process_global = datasource_process_global_for_prompt @@ -381,8 +334,8 @@ def create_metadata_prompt(example: Dict[str, Any], cfg: MetadataConfig) -> str: "bs-modeling-metadata/c4-en-html-with-validation_metadata_url", "bs-modeling-metadata/c4-en-html-with-validation_metadata_paragraph", ] - dataset_paths = [path for path in dataset_paths if path.split("_metadata_")[1] in args.metadata_to_test.split(",")] - + dataset_paths = [path for path in dataset_paths if path.split("_metadata_")[1] in metadata_to_test.split(",")] + results = {} for path in dataset_paths: n_examples = 0 total_normal_len = [] @@ -394,11 +347,11 @@ def create_metadata_prompt(example: Dict[str, Any], cfg: MetadataConfig) -> str: # Load validation dataset from hugging face metadata_type = path.split("_metadata_")[1] print(f"Loading {metadata_type} data...") - split = "validation" if not args.test else "validation[:10]" + split = "validation" if not test else "validation[:10]" validation_dataset = load_dataset(path, use_auth_token=True, split=split) data = [] - 
max_n_examples_ord = len(str(args.max_n_examples))
+        max_n_examples_ord = len(str(max_n_examples))
     for idx, example in tqdm(enumerate(validation_dataset), desc=f"Calculating perplexity for {metadata_type}..."):
         # for idx in [136,]:
         #     example = validation_dataset[idx]
@@ -409,7 +362,7 @@ def create_metadata_prompt(example: Dict[str, Any], cfg: MetadataConfig) -> str:
         except Exception as e:
             # Write error to output file and continue with next dataset
             print(e)
-            with open(args.output_file, "a", encoding="utf8") as f:
+            with open(output_file, "a", encoding="utf8") as f:
                 f.write(f"=== RESULT [{metadata_type}] ===\n")
                 f.write(f"{e}\n\n")
             exit_flag = True
@@ -445,7 +398,7 @@ def create_metadata_prompt(example: Dict[str, Any], cfg: MetadataConfig) -> str:
             normal_batch = default_data_collator([normal_example])
             metadata_example["labels"] = metadata_example["input_ids"]
             metadata_batch = default_data_collator([metadata_example])
-            if not args.no_cuda:
+            if not no_cuda:
                 normal_batch = {k: v.cuda() for k, v in normal_batch.items()}
                 metadata_batch = {k: v.cuda() for k, v in metadata_batch.items()}
             if n_examples == 1:
@@ -461,13 +414,11 @@ def create_metadata_prompt(example: Dict[str, Any], cfg: MetadataConfig) -> str:
             # rich.print(tokenizer.decode(metadata_batch["input_ids"][0]))
 
             # Calculate nll (natural-log loss)
-            normal_nll, normal_example_len = get_mean_loss(normal_batch, save_data=args.save_data, idx=idx)  # [0]
+            normal_nll, normal_example_len = get_mean_loss(normal_batch, save_data=save_data, idx=idx)  # [0]
             # print("PPL")
             # print(normal_ppl)
             total_normal_nll.append(normal_nll)  # * normal_example_len
-            metadata_nll, metadata_example_len = get_mean_loss(
-                metadata_batch, save_data=args.save_data, idx=idx
-            )  # [0]
+            metadata_nll, metadata_example_len = get_mean_loss(metadata_batch, save_data=save_data, idx=idx)  # [0]
             # print(metadata_ppl)
             total_metadata_nll.append(metadata_nll)  # * metadata_example_len
 
@@ -521,7 +472,7 @@ def create_metadata_prompt(example: Dict[str, Any], cfg: MetadataConfig) -> str:
 
             # sys.exit()
 
-            if n_examples > args.max_n_examples:
+            if n_examples > max_n_examples:
                 break
 
         if exit_flag:
@@ -554,9 +505,86 @@ def ppl(examples_mean_loss, examples_len):
     else:
         final_metadata_ppl = final_normal_ppl = 0
 
-    # Write results to output file
-    with open(args.output_file, "a", encoding="utf8") as f:
-        f.write(f"=== RESULT [{metadata_type}] ===\n")
-        f.write("Perplexity (metadata): {:>6,.3f}\n".format(final_metadata_ppl))
-        f.write("Perplexity (normal): {:>6,.3f}\n\n".format(final_normal_ppl))
+    results[metadata_type] = {"final_normal_ppl": final_normal_ppl, "final_metadata_ppl": final_metadata_ppl}
     torch.save(data, "eva.data")
+    return results
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--repo_id",
+        type=str,
+        default="bs-modeling-metadata/checkpoints_all_04_23",
+        help="Repository ID for the model to compute perplexity for",
+    )
+    parser.add_argument(
+        "--subfolder",
+        type=str,
+        default="checkpoint-2000step",
+        help="subfolder in the repository with the specific checkpoint to evaluate perplexity for",
+    )
+    parser.add_argument(
+        "--config_file_path",
+        type=str,
+        help="The path to actual_config.yaml if available, otherwise repo_id/actual_config.yaml or git clone's v2.yaml",
+    )
+    parser.add_argument(
+        "--output_file", type=str, default="evaluation.txt", help="Path to the file the perplexity is written to"
+    )
+    parser.add_argument("--no_cuda", action="store_true", help="If set to true, all computations are performed on CPU")
+    
parser.add_argument( + "--save_data", + action="store_true", + help="If set to true, save tokens & losses", + ) + parser.add_argument( + "--test", + action="store_true", + help="If set to true, the script runs in test mode and only takes 10 examples per dataset", + ) + parser.add_argument( + "--max_n_examples", + type=int, + default=1500, + help="how many examples per metadata type to evaluate", + ) + parser.add_argument( + "--metadata_to_test", + type=str, + default="html,entity,entity_paragraph,website_desc,generation_datasource,timestamp,title,generation_length_sentence,generation_length_text,url,paragraph", + help="metadata types to test", + ) + parser.add_argument( + "--untrained", + action="store_true", + help="If set to true, will load gpt2-xl", + ) + parser.add_argument( + "--prompt", + action="store_true", + help="If set to true, the script evaluates metadata in prompt style", + ) + + args = parser.parse_args() + print(f"Parameters: {args}") + results = evaluate_main( + args.repo_id, + args.subfolder, + args.config_file_path, + args.output_file, + args.save_data, + args.test, + args.max_n_examples, + args.metadata_to_test, + args.untrained, + args.prompt, + args.no_cuda, + ) + # Load config + # Write results to output file + with open(args.output_file, "a", encoding="utf8") as f: + for k, v in results.items(): + f.write(f"=== RESULT [{k}] ===\n") + f.write("Perplexity (metadata): {:>6,.3f}\n".format(v["final_metadata_ppl"])) + f.write("Perplexity (normal): {:>6,.3f}\n\n".format(v["final_normal_ppl"])) diff --git a/bsmetadata/experiments/with_metadata_datasetv2_tf.py b/bsmetadata/experiments/with_metadata_datasetv2_tf.py index e4ebdd28..779bcea4 100644 --- a/bsmetadata/experiments/with_metadata_datasetv2_tf.py +++ b/bsmetadata/experiments/with_metadata_datasetv2_tf.py @@ -54,11 +54,7 @@ def from_json_string(t): examples = {k: [v] for k, v in example.items()} metadata_type_sample_weights = data_config.metadata_config.random_sample_metadata_weights - examples = random_sample_metadata_v2( - examples, - metadata_type_sample_weights=metadata_type_sample_weights, - html_overall_sample_rate=data_config.metadata_config.html_overall_sample_rate, - ) + examples = random_sample_metadata_v2(examples, metadata_type_sample_weights=metadata_type_sample_weights) # example = {k: v[0] for k, v in examples.items()} result = add_metadata_and_chunk_examples(examples, tokenizer, data_config.metadata_config) @@ -87,7 +83,7 @@ def filter_empty(t): return data -def get_dataloader(*, tokenizer, args, num_gpus, gpu_id): +def get_dataloader(*, tokenizer, args, num_gpus, gpu_id,train=True): """returns a tensorflow dataloader""" data_config = args local_dir = Path(data_config.dataset_name) @@ -99,19 +95,28 @@ def get_dataloader(*, tokenizer, args, num_gpus, gpu_id): file_paths = list(Path(local_dir).glob(data_config.train_file)) assert len(file_paths) > 0, f"no files found for {data_config.train_file}" + files_with_entities = [x for x in file_paths if x.name in data_files_with_entities] files_without_entities = [x for x in file_paths if x.name not in data_files_with_entities] print(f"{len(files_with_entities)} files with entities") print(f"{len(files_without_entities)} files without entities") + if train: + files_with_entities = [x for x in files_with_entities if + 'c4-en-html_cc-main-2019-18_pq00-000.jsonl.gz' not in x.name] + else: + files_with_entities = [x for x in files_with_entities if + 'c4-en-html_cc-main-2019-18_pq00-000.jsonl.gz' in x.name] + data_with_entities = get_dataset(files_with_entities, 
num_gpus, gpu_id, data_config, tokenizer) - data_without_entities = get_dataset(files_without_entities, num_gpus, gpu_id, data_config, tokenizer) + + + data = tf.data.Dataset.sample_from_datasets( - [data_with_entities, data_without_entities], - weights=[float(len(files_with_entities)), float(len(files_without_entities))], + [data_with_entities], + weights=[float(len(files_with_entities))], seed=42, ) - data = data.shuffle(1000, reshuffle_each_iteration=True) data = data.batch(data_config.per_device_train_batch_size) data = data.prefetch(tf.data.AUTOTUNE) @@ -137,4 +142,4 @@ def get_dummy_dataloader(batch_size): shuffle=True, num_workers=0, pin_memory=True, - ) + ) \ No newline at end of file diff --git a/bsmetadata/hydra_configs/v2.yaml b/bsmetadata/hydra_configs/v2.yaml index 42a0044a..df1f27d6 100644 --- a/bsmetadata/hydra_configs/v2.yaml +++ b/bsmetadata/hydra_configs/v2.yaml @@ -75,11 +75,12 @@ data_config: local_metadata_special_token_end: entity_paragraph: " " local_metadata_special_token_state: true - html_overall_sample_rate: 0.25 + html_overall_sample_rate: 1 without_metadata_same_context: false + use_full_evaluation_for_val: false experiment: with_metadata_datasetv2_tf - per_device_eval_batch_size: 8 - per_device_train_batch_size: 8 + per_device_eval_batch_size: 64 # 32 for 40gb + per_device_train_batch_size: 64 dataset_name: bs-modeling-metadata/c4-en-html-with-training_metadata_all dataset_config_name: null train_file: c4-en-html_cc-main-2019-18_pq00-000.jsonl.gz @@ -87,12 +88,12 @@ data_config: overwrite_cache: false cache_dir: null extension: null - preprocessing_num_workers: 6 + preprocessing_num_workers: 40 validation_split_percentage: 5 block_size: null map_batch_size: 1 weight_decay: 0.01 -learning_rate: 5e-5 +learning_rate: 0.0001 num_train_epochs: 1 max_train_steps: 100000 lr_scheduler_type: linear @@ -103,16 +104,16 @@ model_name: gpt2 project_name: metadata_lm jobid: '' start_with_eval: false -extra_steps_to_eval_save_at: -- 2 +#extra_steps_to_eval_save_at: +#- 2 evaluation_strategy: STEPS eval_num_per_epoch: 3 -eval_steps: 2000 +eval_steps: 250 save_strategy: STEPS save_num_per_epoch: 3 -save_steps: 150 +save_steps: 250 do_train: true do_eval: true gradient_checkpointing: true resume_from_checkpoint_dir: null -gradient_accumulation_steps: 16 +gradient_accumulation_steps: 1 diff --git a/bsmetadata/train.py b/bsmetadata/train.py index d97853b6..c3c8552c 100644 --- a/bsmetadata/train.py +++ b/bsmetadata/train.py @@ -16,11 +16,11 @@ import wandb from accelerate import Accelerator from accelerate.utils import DistributedType, DummyOptim, DummyScheduler +from evaluation import evaluate_main from hydra.core.config_store import ConfigStore from omegaconf import OmegaConf -from torch.optim import AdamW from tqdm.auto import tqdm as original_tqdm -from transformers import AddedToken, AutoConfig, AutoModelForCausalLM, AutoTokenizer, get_scheduler, set_seed +from transformers import AdamW, AddedToken, AutoConfig, AutoModelForCausalLM, AutoTokenizer, get_scheduler, set_seed from transformers.trainer_utils import IntervalStrategy from bsmetadata.input_pipeline import DataConfig, get_dataloaders @@ -89,6 +89,9 @@ class CFG: gradient_checkpointing: bool = field( default=False, metadata={"help": "Whether to use gradient_checkpointing to save memory."} ) + use_full_evaluation_for_val: bool = field( + default=False, metadata={"help": "Whether to use full evaluation for val"} + ) cs = ConfigStore.instance() @@ -217,8 +220,8 @@ def main(args: CFG) -> None: is_local_main_process = 
accelerator.is_local_main_process tqdm = partial(original_tqdm, disable=not is_local_main_process, position=0) use_deepspeed = accelerator.state.deepspeed_plugin is not None - use_deepspeed_optimzer = use_deepspeed and "optimizer" in accelerator.state.deepspeed_plugin.deepspeed_config - use_deepspeed_scheduler = use_deepspeed and "scheduler" in accelerator.state.deepspeed_plugin.deepspeed_config + use_deepspeed_optimzer = use_deepspeed or "optimizer" in accelerator.state.deepspeed_plugin.deepspeed_config + use_deepspeed_scheduler = use_deepspeed or "scheduler" in accelerator.state.deepspeed_plugin.deepspeed_config if accelerator.distributed_type == DistributedType.DEEPSPEED and not use_deepspeed_scheduler: assert False, "Please set scheduler in DeepSpeed config file otherwise it may not be checkpointed properly" @@ -294,7 +297,13 @@ def main(args: CFG) -> None: gpu_id=accelerator.process_index, ) dummy_dataloader = get_dummy_dataloader(args.data_config.per_device_train_batch_size) - eval_dataloaders = dict() + eval_dataloader, format_fn_eval = get_dataloader( + tokenizer=tokenizer, + args=args.data_config, + num_gpus=accelerator.num_processes, + gpu_id=accelerator.process_index, + train=False, + ) model, optimizer, dummy_dataloader, scheduler = accelerator.prepare( model, optimizer, dummy_dataloader, scheduler ) @@ -348,7 +357,7 @@ def format_fn(x): save_per_n_step = args.max_train_steps + 1 # will never eval @torch.no_grad() - def evaluate(eval_dataloader): + def evaluate(eval_dataloader, only_first_n_steps=120): model.eval() losses = [] for step, batch in enumerate(tqdm(eval_dataloader, desc="eval")): # , leave=False) @@ -359,7 +368,8 @@ def evaluate(eval_dataloader): loss = loss_fn(batch, outputs, metadata_mask) losses.append(accelerator.gather(loss.repeat(args.data_config.per_device_eval_batch_size))) - + if step == only_first_n_steps: + break model.train() if not losses: # in case the dataloader is empty @@ -368,12 +378,21 @@ def evaluate(eval_dataloader): perplexity = math.exp(torch.mean(losses)) return {"perplexity": perplexity} - def evaluate_multiple_dateloaders(eval_dataloaders): - for key, eval_dataloader in eval_dataloaders.items(): - logger.info(f"Evaluating split {key}") - metrics = evaluate(eval_dataloader) - metrics_logger.log({key: metrics}) - logger.info("Evaluation finished") + def evaluate_multiple_dateloaders(eval_dataloaders, use_full_evaluation_for_val): + if use_full_evaluation_for_val: + results = evaluate_main( + output_file="eval.txt", + metadata_to_test="title,html,entity_paragraph,website_desc,generation_datasource,timestamp", + ) + for k, v in results.items(): + metrics_logger.log({k: v}) + logger.info("Evaluation finished") + else: + for key, eval_dataloader in eval_dataloaders.items(): + logger.info(f"Evaluating split {key}") + metrics = evaluate(eval_dataloader) + metrics_logger.log({key: metrics}) + logger.info("Evaluation finished") if not args.do_train and not args.do_eval: return @@ -384,7 +403,7 @@ def evaluate_multiple_dateloaders(eval_dataloaders): do_eval = args.do_eval and args.start_with_eval if do_eval: logger.info("Start with an evaluation") - evaluate_multiple_dateloaders(eval_dataloaders) + evaluate_multiple_dateloaders(eval_dataloaders, args.use_full_evaluation_for_val) if not args.do_train: return @@ -406,7 +425,7 @@ def save(path): model.save_checkpoint(path) else: accelerator.save_state(path) - save_model_and_tokenizer(accelerator, model, path) + save_model_and_tokenizer(accelerator, model, path, tokenizer=tokenizer) if 
is_local_main_process: train_state.save(path / "train_state.json") @@ -426,6 +445,17 @@ def get_data_iter(): batch = {k: v.to(accelerator.device) for k, v in batch.items()} yield batch + def get_eval_data_iter(): + while True: + for batch in eval_dataloader: + batch = format_fn_eval(batch) + if args.data_config.experiment == "with_metadata_datasetv2_tf": + batch = {k: v.to(accelerator.device) for k, v in batch.items()} + yield batch + + eval_iter = get_eval_data_iter() + eval_dataloaders = {"validation": eval_iter} + data_iter = get_data_iter() for _ in tqdm( @@ -461,11 +491,18 @@ def get_data_iter(): optimizer.zero_grad() step_loss_gathered = accelerator.gather(step_loss).mean().item() - metrics = { - "loss": step_loss_gathered, - "lr": max(scheduler.get_lr()), - "gradient_step": train_state.completed_steps, - } + if step < 20: + metrics = { + "loss": step_loss_gathered, + "lr": 0, + "gradient_step": train_state.completed_steps, + } + else: + metrics = { + "loss": step_loss_gathered, + "lr": max(scheduler.get_last_lr()), + "gradient_step": train_state.completed_steps, + } if not args.data_config.streaming: metrics["epoch"] = step / len(train_dataloader) @@ -488,7 +525,7 @@ def get_data_iter(): path = Path(args.out_dir).resolve() / f"checkpoint-{completed_steps}step" save(path) if do_eval: - evaluate_multiple_dateloaders(eval_dataloaders) + evaluate_multiple_dateloaders(eval_dataloaders, args.use_full_evaluation_for_val) if completed_steps >= args.max_train_steps: # finished = True From 6c4b6a7a7b19cc656546d03d4f8d8f0058c2930d Mon Sep 17 00:00:00 2001 From: jordiclive Date: Wed, 17 May 2023 13:44:52 +0100 Subject: [PATCH 02/15] no message --- bsmetadata/experiments/with_metadata_datasetv2_tf.py | 6 +++++- bsmetadata/train.py | 3 +-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/bsmetadata/experiments/with_metadata_datasetv2_tf.py b/bsmetadata/experiments/with_metadata_datasetv2_tf.py index 779bcea4..b635a4ee 100644 --- a/bsmetadata/experiments/with_metadata_datasetv2_tf.py +++ b/bsmetadata/experiments/with_metadata_datasetv2_tf.py @@ -54,7 +54,11 @@ def from_json_string(t): examples = {k: [v] for k, v in example.items()} metadata_type_sample_weights = data_config.metadata_config.random_sample_metadata_weights - examples = random_sample_metadata_v2(examples, metadata_type_sample_weights=metadata_type_sample_weights) + examples = random_sample_metadata_v2( + examples, + metadata_type_sample_weights=metadata_type_sample_weights, + html_overall_sample_rate=data_config.metadata_config.html_overall_sample_rate, + ) # example = {k: v[0] for k, v in examples.items()} result = add_metadata_and_chunk_examples(examples, tokenizer, data_config.metadata_config) diff --git a/bsmetadata/train.py b/bsmetadata/train.py index c3c8552c..4d00edec 100644 --- a/bsmetadata/train.py +++ b/bsmetadata/train.py @@ -386,13 +386,12 @@ def evaluate_multiple_dateloaders(eval_dataloaders, use_full_evaluation_for_val) ) for k, v in results.items(): metrics_logger.log({k: v}) - logger.info("Evaluation finished") else: for key, eval_dataloader in eval_dataloaders.items(): logger.info(f"Evaluating split {key}") metrics = evaluate(eval_dataloader) metrics_logger.log({key: metrics}) - logger.info("Evaluation finished") + logger.info("Evaluation finished") if not args.do_train and not args.do_eval: return From 98598ba6ea0d23d8ce6baf0d8a590c25c522ebfa Mon Sep 17 00:00:00 2001 From: jordiclive Date: Wed, 17 May 2023 13:51:24 +0100 Subject: [PATCH 03/15] fix format --- .../experiments/with_metadata_datasetv2_tf.py 
| 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/bsmetadata/experiments/with_metadata_datasetv2_tf.py b/bsmetadata/experiments/with_metadata_datasetv2_tf.py index b635a4ee..1f45ad4a 100644 --- a/bsmetadata/experiments/with_metadata_datasetv2_tf.py +++ b/bsmetadata/experiments/with_metadata_datasetv2_tf.py @@ -87,7 +87,7 @@ def filter_empty(t): return data -def get_dataloader(*, tokenizer, args, num_gpus, gpu_id,train=True): +def get_dataloader(*, tokenizer, args, num_gpus, gpu_id, train=True): """returns a tensorflow dataloader""" data_config = args local_dir = Path(data_config.dataset_name) @@ -99,23 +99,22 @@ def get_dataloader(*, tokenizer, args, num_gpus, gpu_id,train=True): file_paths = list(Path(local_dir).glob(data_config.train_file)) assert len(file_paths) > 0, f"no files found for {data_config.train_file}" - files_with_entities = [x for x in file_paths if x.name in data_files_with_entities] files_without_entities = [x for x in file_paths if x.name not in data_files_with_entities] print(f"{len(files_with_entities)} files with entities") print(f"{len(files_without_entities)} files without entities") if train: - files_with_entities = [x for x in files_with_entities if - 'c4-en-html_cc-main-2019-18_pq00-000.jsonl.gz' not in x.name] + files_with_entities = [ + x for x in files_with_entities if "c4-en-html_cc-main-2019-18_pq00-000.jsonl.gz" not in x.name + ] else: - files_with_entities = [x for x in files_with_entities if - 'c4-en-html_cc-main-2019-18_pq00-000.jsonl.gz' in x.name] + files_with_entities = [ + x for x in files_with_entities if "c4-en-html_cc-main-2019-18_pq00-000.jsonl.gz" in x.name + ] data_with_entities = get_dataset(files_with_entities, num_gpus, gpu_id, data_config, tokenizer) - - data = tf.data.Dataset.sample_from_datasets( [data_with_entities], weights=[float(len(files_with_entities))], @@ -146,4 +145,4 @@ def get_dummy_dataloader(batch_size): shuffle=True, num_workers=0, pin_memory=True, - ) \ No newline at end of file + ) From f9af120bf445aa8421151f8e064f50a3d6722843 Mon Sep 17 00:00:00 2001 From: jordiclive Date: Wed, 17 May 2023 17:40:22 +0100 Subject: [PATCH 04/15] fix evaluation and config for 40gb --- bsmetadata/deepspeed_configs/v2.json | 2 +- bsmetadata/evaluation.py | 18 ++++++++++++++---- bsmetadata/hydra_configs/v2.yaml | 14 +++++++------- bsmetadata/train.py | 12 +++++++----- 4 files changed, 29 insertions(+), 17 deletions(-) diff --git a/bsmetadata/deepspeed_configs/v2.json b/bsmetadata/deepspeed_configs/v2.json index 35a24626..dca70cc9 100644 --- a/bsmetadata/deepspeed_configs/v2.json +++ b/bsmetadata/deepspeed_configs/v2.json @@ -39,7 +39,7 @@ "contiguous_gradients": true, "cpu_offload": false }, - "gradient_accumulation_steps": 1, + "gradient_accumulation_steps": 2, "gradient_clipping": "auto", "steps_per_print": 100, "train_batch_size": 512, diff --git a/bsmetadata/evaluation.py b/bsmetadata/evaluation.py index 235e7263..86f2510c 100644 --- a/bsmetadata/evaluation.py +++ b/bsmetadata/evaluation.py @@ -180,6 +180,7 @@ def get_mean_loss( batch: Dict[str, torch.Tensor], save_data: bool = False, idx: int = None, + model=None, ) -> torch.Tensor: """Prepares the arguments for perplexity calculation and passes them to the perplexity function. 
@@ -272,18 +273,20 @@ def evaluate_main( test: bool = False, max_n_examples: int = 1500, prompt: bool = False, - no_cuda: bool = False, + no_cuda: bool = True, save_data: bool = False, untrained: bool = False, config_file_path: str = None, model: str = None, tokenizer: str = None, + accelerator=None, ) -> dict: if config_file_path is None: try: config_file_path = hf_hub_download(repo_id=repo_id, filename="actual_config.yaml", use_auth_token=True) except Exception: config_file_path = "bsmetadata/hydra_configs/v2.yaml" + config_file_path = "/fsx/home-jordiclive/metadata/bsmetadata/hydra_configs/v2.yaml" repo_args = OmegaConf.load(config_file_path) data_config = repo_args.data_config @@ -398,7 +401,10 @@ def evaluate_main( normal_batch = default_data_collator([normal_example]) metadata_example["labels"] = metadata_example["input_ids"] metadata_batch = default_data_collator([metadata_example]) - if not no_cuda: + if accelerator is not None: + normal_batch = {k: v.to(accelerator.device) for k, v in normal_batch.items()} + metadata_batch = {k: v.to(accelerator.device) for k, v in metadata_batch.items()} + elif not no_cuda: normal_batch = {k: v.cuda() for k, v in normal_batch.items()} metadata_batch = {k: v.cuda() for k, v in metadata_batch.items()} if n_examples == 1: @@ -414,11 +420,15 @@ def evaluate_main( # rich.print(tokenizer.decode(metadata_batch["input_ids"][0])) # Calculate nll (natural-log loss) - normal_nll, normal_example_len = get_mean_loss(normal_batch, save_data=save_data, idx=idx) # [0] + normal_nll, normal_example_len = get_mean_loss( + normal_batch, save_data=save_data, idx=idx, model=model + ) # [0] # print("PPL") # print(normal_ppl) total_normal_nll.append(normal_nll) # * normal_example_len - metadata_nll, metadata_example_len = get_mean_loss(metadata_batch, save_data=save_data, idx=idx) # [0] + metadata_nll, metadata_example_len = get_mean_loss( + metadata_batch, save_data=save_data, idx=idx, model=model + ) # [0] # print(metadata_ppl) total_metadata_nll.append(metadata_nll) # * metadata_example_len diff --git a/bsmetadata/hydra_configs/v2.yaml b/bsmetadata/hydra_configs/v2.yaml index df1f27d6..62786638 100644 --- a/bsmetadata/hydra_configs/v2.yaml +++ b/bsmetadata/hydra_configs/v2.yaml @@ -1,6 +1,7 @@ data_config: streaming: True validation_size_max: 1024 + use_full_evaluation_for_val: true metadata_config: random_sample_metadata: true random_sample_metadata_calculate_size: 16384 @@ -38,7 +39,7 @@ data_config: #- generation_length_sentence #- generation_length_text - entity_paragraph - local_metadata_special_tokens: + local_metadata_special_tokens: entity_paragraph: "entity" metadata_sep: ' | ' metadata_key_value_sep: ': ' @@ -77,10 +78,9 @@ data_config: local_metadata_special_token_state: true html_overall_sample_rate: 1 without_metadata_same_context: false - use_full_evaluation_for_val: false experiment: with_metadata_datasetv2_tf - per_device_eval_batch_size: 64 # 32 for 40gb - per_device_train_batch_size: 64 + per_device_eval_batch_size: 32 # 32 for 40gb + per_device_train_batch_size: 32 dataset_name: bs-modeling-metadata/c4-en-html-with-training_metadata_all dataset_config_name: null train_file: c4-en-html_cc-main-2019-18_pq00-000.jsonl.gz @@ -104,8 +104,8 @@ model_name: gpt2 project_name: metadata_lm jobid: '' start_with_eval: false -#extra_steps_to_eval_save_at: -#- 2 +extra_steps_to_eval_save_at: +- 2 evaluation_strategy: STEPS eval_num_per_epoch: 3 eval_steps: 250 @@ -116,4 +116,4 @@ do_train: true do_eval: true gradient_checkpointing: true 
resume_from_checkpoint_dir: null -gradient_accumulation_steps: 1 +gradient_accumulation_steps: 2 \ No newline at end of file diff --git a/bsmetadata/train.py b/bsmetadata/train.py index 4d00edec..9c5a928e 100644 --- a/bsmetadata/train.py +++ b/bsmetadata/train.py @@ -89,9 +89,6 @@ class CFG: gradient_checkpointing: bool = field( default=False, metadata={"help": "Whether to use gradient_checkpointing to save memory."} ) - use_full_evaluation_for_val: bool = field( - default=False, metadata={"help": "Whether to use full evaluation for val"} - ) cs = ConfigStore.instance() @@ -382,8 +379,13 @@ def evaluate_multiple_dateloaders(eval_dataloaders, use_full_evaluation_for_val) if use_full_evaluation_for_val: results = evaluate_main( output_file="eval.txt", + # metadata_to_test="entity_paragraph", metadata_to_test="title,html,entity_paragraph,website_desc,generation_datasource,timestamp", + model=model, + tokenizer=tokenizer, + accelerator=accelerator, ) + model.train() for k, v in results.items(): metrics_logger.log({k: v}) else: @@ -402,7 +404,7 @@ def evaluate_multiple_dateloaders(eval_dataloaders, use_full_evaluation_for_val) do_eval = args.do_eval and args.start_with_eval if do_eval: logger.info("Start with an evaluation") - evaluate_multiple_dateloaders(eval_dataloaders, args.use_full_evaluation_for_val) + evaluate_multiple_dateloaders(eval_dataloaders, args.data_config.use_full_evaluation_for_val) if not args.do_train: return @@ -524,7 +526,7 @@ def get_eval_data_iter(): path = Path(args.out_dir).resolve() / f"checkpoint-{completed_steps}step" save(path) if do_eval: - evaluate_multiple_dateloaders(eval_dataloaders, args.use_full_evaluation_for_val) + evaluate_multiple_dateloaders(eval_dataloaders, args.data_config.use_full_evaluation_for_val) if completed_steps >= args.max_train_steps: # finished = True From 1e181af5bd09cb64f657a025c59b3c84f2a7c950 Mon Sep 17 00:00:00 2001 From: Jordan Clive Date: Thu, 18 May 2023 09:25:46 +0100 Subject: [PATCH 05/15] Update evaluation.py remove hard coded --- bsmetadata/evaluation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bsmetadata/evaluation.py b/bsmetadata/evaluation.py index 86f2510c..e1a7f54b 100644 --- a/bsmetadata/evaluation.py +++ b/bsmetadata/evaluation.py @@ -286,7 +286,7 @@ def evaluate_main( config_file_path = hf_hub_download(repo_id=repo_id, filename="actual_config.yaml", use_auth_token=True) except Exception: config_file_path = "bsmetadata/hydra_configs/v2.yaml" - config_file_path = "/fsx/home-jordiclive/metadata/bsmetadata/hydra_configs/v2.yaml" +# config_file_path = "/fsx/home-jordiclive/metadata/bsmetadata/hydra_configs/v2.yaml" need to add this path to PYTHONPATH repo_args = OmegaConf.load(config_file_path) data_config = repo_args.data_config From efb6414314f66b2b0677fba0a610e2a28432bf6e Mon Sep 17 00:00:00 2001 From: jordiclive Date: Thu, 18 May 2023 13:34:08 +0100 Subject: [PATCH 06/15] specify kwargs for non training usage --- bsmetadata/evaluation.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/bsmetadata/evaluation.py b/bsmetadata/evaluation.py index e1a7f54b..88ecd73d 100644 --- a/bsmetadata/evaluation.py +++ b/bsmetadata/evaluation.py @@ -579,17 +579,17 @@ def ppl(examples_mean_loss, examples_len): args = parser.parse_args() print(f"Parameters: {args}") results = evaluate_main( - args.repo_id, - args.subfolder, - args.config_file_path, - args.output_file, - args.save_data, - args.test, - args.max_n_examples, - args.metadata_to_test, - args.untrained, - 
args.prompt, - args.no_cuda, + repo_id=args.repo_id, + subfolder=args.subfolder, + config_file_path=args.config_file_path, + output_file=args.output_file, + save_data=args.save_data, + test=args.test, + max_n_examples=args.max_n_examples, + metadata_to_test=args.metadata_to_test, + untrained=args.untrained, + prompt=args.prompt, + no_cuda=args.no_cuda, ) # Load config # Write results to output file From b8e569268ad9b13ad3035b56392278c05bd54515 Mon Sep 17 00:00:00 2001 From: jordiclive Date: Tue, 6 Jun 2023 14:19:27 +0100 Subject: [PATCH 07/15] update with changes --- bsmetadata/evaluation.py | 16 +++++++++------- bsmetadata/hydra_configs/v2.yaml | 9 ++++++--- bsmetadata/train.py | 2 +- tests/test_metadata_utils.py | 18 ++++++++++++++++++ 4 files changed, 34 insertions(+), 11 deletions(-) diff --git a/bsmetadata/evaluation.py b/bsmetadata/evaluation.py index 88ecd73d..d0cdf56f 100644 --- a/bsmetadata/evaluation.py +++ b/bsmetadata/evaluation.py @@ -46,6 +46,7 @@ def mean_loss_fn( metadata_mask: torch.Tensor = None, save_data: bool = False, idx: int = None, + tokenizer = None, ) -> torch.Tensor: """Calculates the perplexity for a given batch. @@ -62,15 +63,15 @@ def mean_loss_fn( b = outputs.logits.size(0) lm_logits = outputs.logits - lm_logits[:, :, 50257] = float("-inf") - lm_logits[:, :, 50258] = float("-inf") - labels = batch["labels"] attention_mask = batch["attention_mask"] shift_logits = lm_logits[..., :-1, :].contiguous() shift_labels = labels[..., 1:].contiguous() if metadata_mask is not None: + for special_tok in tokenizer.additional_special_tokens_ids: + shift_logits[:, :, special_tok] = torch.finfo(lm_logits.dtype).min + metadata_mask = metadata_mask.bool() nonmetadata_cumsum = torch.cumsum(~metadata_mask, dim=-1) first_nonmetadata = nonmetadata_cumsum == 1 @@ -133,7 +134,7 @@ def mean_loss_fn( shift_labels.view(-1), reduction="none", ).view(b, -1) - + loss = torch.nan_to_num(loss) if save_data: # Save the non-masked tokens & their loss suffix = "_meta" if metadata_mask is not None else "" @@ -181,6 +182,7 @@ def get_mean_loss( save_data: bool = False, idx: int = None, model=None, + tokenizer = None, ) -> torch.Tensor: """Prepares the arguments for perplexity calculation and passes them to the perplexity function. 
@@ -198,7 +200,7 @@ def get_mean_loss( metadata_mask = batch.pop("metadata_mask", None) outputs = model(**batch) batch["labels"] = labels - nll = mean_loss_fn(batch, outputs, metadata_mask, save_data=save_data, idx=idx) + nll = mean_loss_fn(batch, outputs, metadata_mask, save_data=save_data, idx=idx,tokenizer=tokenizer) return nll @@ -421,13 +423,13 @@ def evaluate_main( # Calculate nll (natural-log loss) normal_nll, normal_example_len = get_mean_loss( - normal_batch, save_data=save_data, idx=idx, model=model + normal_batch, save_data=save_data, idx=idx, model=model,tokenizer=tokenizer ) # [0] # print("PPL") # print(normal_ppl) total_normal_nll.append(normal_nll) # * normal_example_len metadata_nll, metadata_example_len = get_mean_loss( - metadata_batch, save_data=save_data, idx=idx, model=model + metadata_batch, save_data=save_data, idx=idx, model=model,tokenizer=tokenizer ) # [0] # print(metadata_ppl) total_metadata_nll.append(metadata_nll) # * metadata_example_len diff --git a/bsmetadata/hydra_configs/v2.yaml b/bsmetadata/hydra_configs/v2.yaml index 62786638..3eaefcf0 100644 --- a/bsmetadata/hydra_configs/v2.yaml +++ b/bsmetadata/hydra_configs/v2.yaml @@ -12,7 +12,7 @@ data_config: title: 1.0657717366883845 generation_datasource: 1.0 entity_paragraph: 1.028817740667444 - + generation_length_text: 1.0 #- url: 1.0 #- generation_length_sentence #- generation_length_text @@ -29,6 +29,7 @@ data_config: - datasource - length - entity_paragraph + - generation_length_text metadata_column_list: - html - timestamp @@ -37,7 +38,7 @@ data_config: #- url - generation_datasource #- generation_length_sentence - #- generation_length_text + - generation_length_text - entity_paragraph local_metadata_special_tokens: entity_paragraph: "entity" @@ -73,10 +74,12 @@ data_config: - 0.0 local_metadata_special_token_start: entity_paragraph: "" + html: "" local_metadata_special_token_end: entity_paragraph: " " + html: "" local_metadata_special_token_state: true - html_overall_sample_rate: 1 + html_overall_sample_rate: 0.5 without_metadata_same_context: false experiment: with_metadata_datasetv2_tf per_device_eval_batch_size: 32 # 32 for 40gb diff --git a/bsmetadata/train.py b/bsmetadata/train.py index 9c5a928e..2c458afc 100644 --- a/bsmetadata/train.py +++ b/bsmetadata/train.py @@ -380,7 +380,7 @@ def evaluate_multiple_dateloaders(eval_dataloaders, use_full_evaluation_for_val) results = evaluate_main( output_file="eval.txt", # metadata_to_test="entity_paragraph", - metadata_to_test="title,html,entity_paragraph,website_desc,generation_datasource,timestamp", + metadata_to_test="title,html,entity_paragraph,website_desc,generation_datasource,timestamp,generation_length_text", model=model, tokenizer=tokenizer, accelerator=accelerator, diff --git a/tests/test_metadata_utils.py b/tests/test_metadata_utils.py index 49408078..51ecdcb8 100644 --- a/tests/test_metadata_utils.py +++ b/tests/test_metadata_utils.py @@ -454,6 +454,24 @@ def test_entity_settings(self): "EntityOn |EntityParagraphOn ||| |United Kingdom| |Louis Vuitton| |Billy Connolly| |Something in Common| |Lembit Öpik| Hints and tips for media appearances, speaking and social media. 
This week; wall-to-wall politicians; Great Britain [[United Kingdom]]: Louis Vuitton [[Louis Vuitton]] condoms; Billy Connolly [[Billy Connolly]],; Lisa Dutton; Something in Common [[Something in Common]]; What was I saying?: We’re all publishers; An interview with Lembit Opik [[Lembit Öpik]]; Music from The Good Suns", ) + def test_html_special_token_settings(self): + # from transformers import AddedToken + + cfg = MetadataConfig() + PROCESSORS["html"] = HtmlProcessor + cfg.metadata_list = ["html"] + cfg.treat_local_metadata_as_regular_text = True + cfg.local_metadata_special_token_start = {"html": ""} + cfg.local_metadata_special_token_end = {"html": ""} + text, mask = add_local_metadata_to_text(self.examples[1], cfg) + self.assertEqual( + text, + "An apple is an edible fruit " + "produced by an " + "apple tree" + " (Malus domestica).", + ) + def test_add_local_metadata_to_text(self): cfg = MetadataConfig() cfg.metadata_list = ["html", "entity"] From ee61ded8c918903ae1931f5e263cef03e360c247 Mon Sep 17 00:00:00 2001 From: jordiclive Date: Tue, 6 Jun 2023 14:36:26 +0100 Subject: [PATCH 08/15] update configs --- bsmetadata/evaluation.py | 2 +- bsmetadata/hydra_configs/v2.yaml | 3 +- bsmetadata/train.py | 6 +++- slurm_40.sh | 47 ++++++++++++++++++++++++++++++++ 4 files changed, 55 insertions(+), 3 deletions(-) create mode 100644 slurm_40.sh diff --git a/bsmetadata/evaluation.py b/bsmetadata/evaluation.py index d0cdf56f..62ff1577 100644 --- a/bsmetadata/evaluation.py +++ b/bsmetadata/evaluation.py @@ -275,7 +275,7 @@ def evaluate_main( test: bool = False, max_n_examples: int = 1500, prompt: bool = False, - no_cuda: bool = True, + no_cuda: bool = False, save_data: bool = False, untrained: bool = False, config_file_path: str = None, diff --git a/bsmetadata/hydra_configs/v2.yaml b/bsmetadata/hydra_configs/v2.yaml index 3eaefcf0..44d9cddf 100644 --- a/bsmetadata/hydra_configs/v2.yaml +++ b/bsmetadata/hydra_configs/v2.yaml @@ -97,6 +97,7 @@ data_config: map_batch_size: 1 weight_decay: 0.01 learning_rate: 0.0001 +wb_name: "metadata_datasetv2_tf" num_train_epochs: 1 max_train_steps: 100000 lr_scheduler_type: linear @@ -114,7 +115,7 @@ eval_num_per_epoch: 3 eval_steps: 250 save_strategy: STEPS save_num_per_epoch: 3 -save_steps: 250 +save_steps: 1000 do_train: true do_eval: true gradient_checkpointing: true diff --git a/bsmetadata/train.py b/bsmetadata/train.py index 2c458afc..1e5c8648 100644 --- a/bsmetadata/train.py +++ b/bsmetadata/train.py @@ -34,6 +34,7 @@ class CFG: data_config: DataConfig = DataConfig() weight_decay: float = field(default=0.0, metadata={"help": "The weight decay to use for training."}) learning_rate: float = field(default=5e-5, metadata={"help": "The initial learning rate."}) + wb_name: str = field(default="bsmetadata", metadata={"help": "The name of the wandb project."}) gradient_accumulation_steps: int = field( default=1, metadata={"help": "The number of gradient accumulation steps to perform before updating model parameters."}, @@ -399,7 +400,10 @@ def evaluate_multiple_dateloaders(eval_dataloaders, use_full_evaluation_for_val) return progress_bar = tqdm(range(args.max_train_steps), desc="training", initial=train_state.completed_steps) - metrics_logger = Logger(is_local_main_process, project=args.project_name, config=config_dict) + t_bs = args.data_config.per_device_train_batch_size * args.gradient_accumulation_steps * 8 + os.environ['WANDB_API_KEY'] = 'd8216641d549f9bb3d0c5074baa39e15dfd55030' + metrics_logger = Logger(is_local_main_process, 
name=f"{args.wb_name}-{args.learning_rate}-{t_bs}", + entity='jordanclive', project='metadata', config=config_dict) do_eval = args.do_eval and args.start_with_eval if do_eval: diff --git a/slurm_40.sh b/slurm_40.sh new file mode 100644 index 00000000..6adf0cf2 --- /dev/null +++ b/slurm_40.sh @@ -0,0 +1,47 @@ +#!/bin/bash +#SBATCH --account laion +#SBATCH --partition="g40" +#SBATCH --job-name=flan +#SBATCH --gres=gpu:8 +#SBATCH --ntasks-per-node=8 +#SBATCH --cpus-per-task=12 +#SBATCH --output=%x_%j.out +source /fsx/home-jordiclive/miniconda3/bin/activate meta_conda +cd /fsx/home-jordiclive/metadata +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib/ +export TRANSFORMERS_CACHE=/fsx/home-jordiclive/transformers_cache + +#export HF_DATASETS_OFFLINE=1 +#export TRANSFORMERS_OFFLINE=1 +#export WANDB_MODE=offline +export HYDRA_FULL_ERROR=1 + + +export MODEL=gpt2-xl +export NUM_GPU=8 +export DEEPSPEED_CONFIG=$(realpath bsmetadata/deepspeed_configs/v2.json) +export DATA_DIR=$(realpath /fsx/home-jordiclive/metadata/local-data/datasets--bs-modeling-metadata--c4-en-html-with-training_metadata_all/snapshots/8f2615d8b8580e89533b90bc3931e0b99ef15aec) +echo "deepspeed_config_file: $DEEPSPEED_CONFIG" + +export WANDB_API_KEY= 'd8216641d549f9bb3d0c5074baa39e15dfd55030' + +echo "compute_environment: LOCAL_MACHINE +deepspeed_config: + deepspeed_config_file: $DEEPSPEED_CONFIG +distributed_type: DEEPSPEED +fp16: true +machine_rank: 0 +main_process_ip: null +main_process_port: null +main_training_function: main +num_machines: 1 +num_processes: -1 +mixed_precision: fp16 +" > accelerate_config.yaml +CUDA_LAUNCH_BLOCKING=1 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 accelerate launch --config_file accelerate_config.yaml bsmetadata/train.py --config-name v2 \ + model_name=$MODEL \ + data_config.dataset_name=$DATA_DIR \ + data_config.train_file='*.jsonl.gz' \ + data_config.validation_file='c4-en-html_cc-main-2019-18_pq00-000.jsonl.gz' \ + out_dir=/fsx/home-jordiclive/tmp/metadata-html-quarter \ + wb_name="full-metadata-with-generation-text-0.5-html" \ No newline at end of file From a4619dc7c41897ce38961e46c0371cf80b968047 Mon Sep 17 00:00:00 2001 From: jordiclive Date: Tue, 6 Jun 2023 15:49:40 +0100 Subject: [PATCH 09/15] no message --- bsmetadata/hydra_configs/v2.yaml | 2 +- slurm_40.sh | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/bsmetadata/hydra_configs/v2.yaml b/bsmetadata/hydra_configs/v2.yaml index 44d9cddf..9311028b 100644 --- a/bsmetadata/hydra_configs/v2.yaml +++ b/bsmetadata/hydra_configs/v2.yaml @@ -97,7 +97,7 @@ data_config: map_batch_size: 1 weight_decay: 0.01 learning_rate: 0.0001 -wb_name: "metadata_datasetv2_tf" +wb_name: "all_metadata" num_train_epochs: 1 max_train_steps: 100000 lr_scheduler_type: linear diff --git a/slurm_40.sh b/slurm_40.sh index 6adf0cf2..7f483bb7 100644 --- a/slurm_40.sh +++ b/slurm_40.sh @@ -43,5 +43,5 @@ CUDA_LAUNCH_BLOCKING=1 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 accelerate launch -- data_config.dataset_name=$DATA_DIR \ data_config.train_file='*.jsonl.gz' \ data_config.validation_file='c4-en-html_cc-main-2019-18_pq00-000.jsonl.gz' \ - out_dir=/fsx/home-jordiclive/tmp/metadata-html-quarter \ - wb_name="full-metadata-with-generation-text-0.5-html" \ No newline at end of file + out_dir=/fsx/home-jordiclive/tmp/metadata-html-half \ +# wb_name="full-metadata-with-generation-text-0.5-html" \ No newline at end of file From 09c4e7f74896a14c045250f0ef25e967756179cd Mon Sep 17 00:00:00 2001 From: jordiclive Date: Tue, 6 Jun 2023 17:50:43 +0100 Subject: [PATCH 
10/15] no message

---
 bsmetadata/hydra_configs/v2.yaml |  1 -
 bsmetadata/train.py              |  4 ++--
 script.sh                        | 39 ++++++++++++++++++++++++++++++++
 3 files changed, 41 insertions(+), 3 deletions(-)
 create mode 100644 script.sh

diff --git a/bsmetadata/hydra_configs/v2.yaml b/bsmetadata/hydra_configs/v2.yaml
index 9311028b..a83fa5d7 100644
--- a/bsmetadata/hydra_configs/v2.yaml
+++ b/bsmetadata/hydra_configs/v2.yaml
@@ -1,7 +1,6 @@
 data_config:
   streaming: True
   validation_size_max: 1024
-  use_full_evaluation_for_val: true
   metadata_config:
     random_sample_metadata: true
     random_sample_metadata_calculate_size: 16384
diff --git a/bsmetadata/train.py b/bsmetadata/train.py
index 1e5c8648..4dc55a2c 100644
--- a/bsmetadata/train.py
+++ b/bsmetadata/train.py
@@ -408,7 +408,7 @@ def evaluate_multiple_dateloaders(eval_dataloaders, use_full_evaluation_for_val)
     do_eval = args.do_eval and args.start_with_eval
     if do_eval:
         logger.info("Start with an evaluation")
-        evaluate_multiple_dateloaders(eval_dataloaders, args.data_config.use_full_evaluation_for_val)
+        evaluate_multiple_dateloaders(eval_dataloaders, use_full_evaluation_for_val=True)
 
     if not args.do_train:
         return
@@ -530,7 +530,7 @@ def get_eval_data_iter():
             path = Path(args.out_dir).resolve() / f"checkpoint-{completed_steps}step"
             save(path)
         if do_eval:
-            evaluate_multiple_dateloaders(eval_dataloaders, args.data_config.use_full_evaluation_for_val)
+            evaluate_multiple_dateloaders(eval_dataloaders, use_full_evaluation_for_val=True)
 
         if completed_steps >= args.max_train_steps:
             # finished = True
diff --git a/script.sh b/script.sh
new file mode 100644
index 00000000..ed7e4f83
--- /dev/null
+++ b/script.sh
@@ -0,0 +1,39 @@
+source /fsx/home-jordiclive/miniconda3/bin/activate meta_conda
+cd /fsx/home-jordiclive/metadata
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib/
+export TRANSFORMERS_CACHE=/fsx/home-jordiclive/transformers_cache
+
+#export HF_DATASETS_OFFLINE=1
+#export TRANSFORMERS_OFFLINE=1
+#export WANDB_MODE=offline
+export HYDRA_FULL_ERROR=1
+
+
+export MODEL=gpt2-xl
+export NUM_GPU=8
+export DEEPSPEED_CONFIG=$(realpath bsmetadata/deepspeed_configs/v2.json)
+export DATA_DIR=$(realpath /fsx/home-jordiclive/metadata/local-data/datasets--bs-modeling-metadata--c4-en-html-with-training_metadata_all/snapshots/8f2615d8b8580e89533b90bc3931e0b99ef15aec)
+echo "deepspeed_config_file: $DEEPSPEED_CONFIG"
+
+export WANDB_API_KEY='d8216641d549f9bb3d0c5074baa39e15dfd55030'
+
+echo "compute_environment: LOCAL_MACHINE
+deepspeed_config:
+  deepspeed_config_file: $DEEPSPEED_CONFIG
+distributed_type: DEEPSPEED
+fp16: true
+machine_rank: 0
+main_process_ip: null
+main_process_port: null
+main_training_function: main
+num_machines: 1
+num_processes: -1
+mixed_precision: fp16
+" > accelerate_config.yaml
+CUDA_LAUNCH_BLOCKING=1 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 accelerate launch --config_file accelerate_config.yaml bsmetadata/train.py --config-name v2 \
+    model_name=$MODEL \
+    data_config.dataset_name=$DATA_DIR \
+    data_config.train_file='*.jsonl.gz' \
+    data_config.validation_file='c4-en-html_cc-main-2019-18_pq00-000.jsonl.gz' \
+    out_dir=/fsx/home-jordiclive/tmp/metadata-html-half \
+# wb_name="full-metadata-with-generation-text-0.5-html"
\ No newline at end of file

From bc892cc0d24084812dfb980bfc8a4ff594b7e6e2 Mon Sep 17 00:00:00 2001
From: jordiclive
Date: Tue, 6 Jun 2023 20:13:32 +0100
Subject: [PATCH 11/15] no message

---
 bsmetadata/evaluation.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/bsmetadata/evaluation.py 
b/bsmetadata/evaluation.py index 62ff1577..7f94e2a9 100644 --- a/bsmetadata/evaluation.py +++ b/bsmetadata/evaluation.py @@ -283,12 +283,12 @@ def evaluate_main( tokenizer: str = None, accelerator=None, ) -> dict: - if config_file_path is None: - try: - config_file_path = hf_hub_download(repo_id=repo_id, filename="actual_config.yaml", use_auth_token=True) - except Exception: - config_file_path = "bsmetadata/hydra_configs/v2.yaml" -# config_file_path = "/fsx/home-jordiclive/metadata/bsmetadata/hydra_configs/v2.yaml" need to add this path to PYTHONPATH + # if config_file_path is None: + # try: + # config_file_path = hf_hub_download(repo_id=repo_id, filename="actual_config.yaml", use_auth_token=True) + # except Exception: + # config_file_path = "bsmetadata/hydra_configs/v2.yaml" + config_file_path = "/fsx/home-jordiclive/metadata/bsmetadata/hydra_configs/v2.yaml" #need to add this path to PYTHONPATH repo_args = OmegaConf.load(config_file_path) data_config = repo_args.data_config From 55c5e4ef3d78338e7fb554e0d5bd072406688d13 Mon Sep 17 00:00:00 2001 From: jordiclive Date: Tue, 6 Jun 2023 20:45:09 +0100 Subject: [PATCH 12/15] no message --- bsmetadata/evaluation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bsmetadata/evaluation.py b/bsmetadata/evaluation.py index 7f94e2a9..b14dca92 100644 --- a/bsmetadata/evaluation.py +++ b/bsmetadata/evaluation.py @@ -275,7 +275,7 @@ def evaluate_main( test: bool = False, max_n_examples: int = 1500, prompt: bool = False, - no_cuda: bool = False, + no_cuda: bool = True, save_data: bool = False, untrained: bool = False, config_file_path: str = None, From 35d693558b560f582f046af7b080faf0f80176e4 Mon Sep 17 00:00:00 2001 From: jordiclive Date: Tue, 6 Jun 2023 20:54:11 +0100 Subject: [PATCH 13/15] no message --- bsmetadata/hydra_configs/v2.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bsmetadata/hydra_configs/v2.yaml b/bsmetadata/hydra_configs/v2.yaml index a83fa5d7..5a869e16 100644 --- a/bsmetadata/hydra_configs/v2.yaml +++ b/bsmetadata/hydra_configs/v2.yaml @@ -111,7 +111,7 @@ extra_steps_to_eval_save_at: - 2 evaluation_strategy: STEPS eval_num_per_epoch: 3 -eval_steps: 250 +eval_steps: 500 save_strategy: STEPS save_num_per_epoch: 3 save_steps: 1000 From 1f3a1f13fe714e08b09da9c373b4bb8bfe65ecd4 Mon Sep 17 00:00:00 2001 From: jordiclive Date: Wed, 7 Jun 2023 10:10:08 +0100 Subject: [PATCH 14/15] no message --- bsmetadata/hydra_configs/v2.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bsmetadata/hydra_configs/v2.yaml b/bsmetadata/hydra_configs/v2.yaml index 5a869e16..3916ead4 100644 --- a/bsmetadata/hydra_configs/v2.yaml +++ b/bsmetadata/hydra_configs/v2.yaml @@ -111,10 +111,10 @@ extra_steps_to_eval_save_at: - 2 evaluation_strategy: STEPS eval_num_per_epoch: 3 -eval_steps: 500 +eval_steps: 1000 save_strategy: STEPS save_num_per_epoch: 3 -save_steps: 1000 +save_steps: 500 do_train: true do_eval: true gradient_checkpointing: true From 4e891027a138d51c564fbac25f1575d00fbdc251 Mon Sep 17 00:00:00 2001 From: jordiclive Date: Wed, 7 Jun 2023 10:10:31 +0100 Subject: [PATCH 15/15] no message --- bsmetadata/hydra_configs/v2.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bsmetadata/hydra_configs/v2.yaml b/bsmetadata/hydra_configs/v2.yaml index 3916ead4..805d593f 100644 --- a/bsmetadata/hydra_configs/v2.yaml +++ b/bsmetadata/hydra_configs/v2.yaml @@ -107,8 +107,8 @@ model_name: gpt2 project_name: metadata_lm jobid: '' start_with_eval: false -extra_steps_to_eval_save_at: 
-- 2 +#extra_steps_to_eval_save_at: +#- 2 evaluation_strategy: STEPS eval_num_per_epoch: 3 eval_steps: 1000
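
---

Note on the evaluation hook this series wires into the training loop: `evaluate_main`
(patches 01, 04 and 06) is now importable by bsmetadata/train.py and returns a dict keyed
by metadata type, e.g. {"html": {"final_normal_ppl": ..., "final_metadata_ppl": ...}}.
A minimal sketch of how the pieces fit together follows. The body of the token-weighted
`ppl` helper is an assumed reconstruction (the patches only show its signature), and the
logging snippet mirrors `evaluate_multiple_dateloaders` in bsmetadata/train.py:

import math

def ppl(examples_mean_loss, examples_len):
    # Weight each example's mean NLL by its token count before exponentiating,
    # so long documents are not under-counted. Assumed reconstruction, not verbatim.
    total_nll = sum(loss * n for loss, n in zip(examples_mean_loss, examples_len))
    return math.exp(total_nll / sum(examples_len))

# Inside the training loop, the full evaluation runs with the live model/tokenizer and
# the per-type perplexities are logged (this mirrors the code added in patch 04):
#
#     results = evaluate_main(
#         output_file="eval.txt",
#         metadata_to_test="title,html,entity_paragraph,website_desc,generation_datasource,timestamp",
#         model=model,
#         tokenizer=tokenizer,
#         accelerator=accelerator,
#     )
#     model.train()  # evaluate_main leaves the model in eval mode
#     for metadata_type, ppls in results.items():
#         metrics_logger.log({metadata_type: ppls})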