sanchit-gandhi committed on May 30, 2024

Commit

6096f07

verified ·

1 Parent(s): 3cbe542

Saving train state of step 120000

Browse files

Files changed (23) hide show

checkpoint-120000-epoch-8/config.json +278 -0
checkpoint-120000-epoch-8/generation_config.json +12 -0
checkpoint-120000-epoch-8/optimizer.bin +3 -0
checkpoint-120000-epoch-8/pytorch_model.bin +3 -0
checkpoint-120000-epoch-8/random_states_0.pkl +3 -0
checkpoint-120000-epoch-8/random_states_1.pkl +3 -0
checkpoint-120000-epoch-8/random_states_2.pkl +3 -0
checkpoint-120000-epoch-8/random_states_3.pkl +3 -0
checkpoint-120000-epoch-8/random_states_4.pkl +3 -0
checkpoint-120000-epoch-8/random_states_5.pkl +3 -0
checkpoint-120000-epoch-8/random_states_6.pkl +3 -0
checkpoint-120000-epoch-8/random_states_7.pkl +3 -0
checkpoint-120000-epoch-8/scheduler.bin +3 -0
starting_point_0.01.json +2 -1
training/__pycache__/arguments.cpython-311.pyc +0 -0
training/__pycache__/data.cpython-311.pyc +0 -0
training/__pycache__/eval.cpython-311.pyc +0 -0
training/__pycache__/utils.cpython-311.pyc +0 -0
training/arguments.py +7 -1
training/data.py +6 -1
training/eval.py +1 -2
training/run_parler_tts_training.py +30 -10
training/utils.py +4 -2

checkpoint-120000-epoch-8/config.json ADDED Viewed

	@@ -0,0 +1,278 @@

+{
+  "architectures": [
+    "ParlerTTSForConditionalGeneration"
+  ],
+  "audio_encoder": {
+    "_name_or_path": "parler-tts/dac_44khZ_8kbps",
+    "add_cross_attention": false,
+    "architectures": [
+      "DACModel"
+    ],
+    "bad_words_ids": null,
+    "begin_suppress_tokens": null,
+    "bos_token_id": null,
+    "chunk_size_feed_forward": 0,
+    "codebook_size": 1024,
+    "cross_attention_hidden_size": null,
+    "decoder_start_token_id": null,
+    "diversity_penalty": 0.0,
+    "do_sample": false,
+    "early_stopping": false,
+    "encoder_no_repeat_ngram_size": 0,
+    "eos_token_id": null,
+    "exponential_decay_length_penalty": null,
+    "finetuning_task": null,
+    "forced_bos_token_id": null,
+    "forced_eos_token_id": null,
+    "frame_rate": 86,
+    "id2label": {
+      "0": "LABEL_0",
+      "1": "LABEL_1"
+    },
+    "is_decoder": false,
+    "is_encoder_decoder": false,
+    "label2id": {
+      "LABEL_0": 0,
+      "LABEL_1": 1
+    },
+    "latent_dim": 1024,
+    "length_penalty": 1.0,
+    "max_length": 20,
+    "min_length": 0,
+    "model_bitrate": 8,
+    "model_type": "dac",
+    "no_repeat_ngram_size": 0,
+    "num_beam_groups": 1,
+    "num_beams": 1,
+    "num_codebooks": 9,
+    "num_return_sequences": 1,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "output_scores": false,
+    "pad_token_id": null,
+    "prefix": null,
+    "problem_type": null,
+    "pruned_heads": {},
+    "remove_invalid_values": false,
+    "repetition_penalty": 1.0,
+    "return_dict": true,
+    "return_dict_in_generate": false,
+    "sampling_rate": 44100,
+    "sep_token_id": null,
+    "suppress_tokens": null,
+    "task_specific_params": null,
+    "temperature": 1.0,
+    "tf_legacy_loss": false,
+    "tie_encoder_decoder": false,
+    "tie_word_embeddings": true,
+    "tokenizer_class": null,
+    "top_k": 50,
+    "top_p": 1.0,
+    "torch_dtype": "float32",
+    "torchscript": false,
+    "typical_p": 1.0,
+    "use_bfloat16": false
+  },
+  "decoder": {
+    "_name_or_path": "./parler-tts-untrained-600M/decoder",
+    "activation_dropout": 0.0,
+    "activation_function": "gelu",
+    "add_cross_attention": true,
+    "architectures": [
+      "ParlerTTSForCausalLM"
+    ],
+    "attention_dropout": 0.0,
+    "bad_words_ids": null,
+    "begin_suppress_tokens": null,
+    "bos_token_id": 1025,
+    "chunk_size_feed_forward": 0,
+    "cross_attention_hidden_size": null,
+    "decoder_start_token_id": null,
+    "diversity_penalty": 0.0,
+    "do_sample": false,
+    "dropout": 0.1,
+    "early_stopping": false,
+    "encoder_no_repeat_ngram_size": 0,
+    "eos_token_id": 1024,
+    "exponential_decay_length_penalty": null,
+    "ffn_dim": 4096,
+    "finetuning_task": null,
+    "forced_bos_token_id": null,
+    "forced_eos_token_id": null,
+    "hidden_size": 1024,
+    "id2label": {
+      "0": "LABEL_0",
+      "1": "LABEL_1"
+    },
+    "initializer_factor": 0.02,
+    "is_decoder": true,
+    "is_encoder_decoder": false,
+    "label2id": {
+      "LABEL_0": 0,
+      "LABEL_1": 1
+    },
+    "layerdrop": 0.0,
+    "length_penalty": 1.0,
+    "max_length": 20,
+    "max_position_embeddings": 4096,
+    "min_length": 0,
+    "model_type": "parler_tts_decoder",
+    "no_repeat_ngram_size": 0,
+    "num_attention_heads": 16,
+    "num_beam_groups": 1,
+    "num_beams": 1,
+    "num_codebooks": 9,
+    "num_hidden_layers": 24,
+    "num_return_sequences": 1,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "output_scores": false,
+    "pad_token_id": 1024,
+    "prefix": null,
+    "problem_type": null,
+    "pruned_heads": {},
+    "remove_invalid_values": false,
+    "repetition_penalty": 1.0,
+    "return_dict": true,
+    "return_dict_in_generate": false,
+    "rope_embeddings": false,
+    "rope_theta": 10000.0,
+    "scale_embedding": false,
+    "sep_token_id": null,
+    "suppress_tokens": null,
+    "task_specific_params": null,
+    "temperature": 1.0,
+    "tf_legacy_loss": false,
+    "tie_encoder_decoder": false,
+    "tie_word_embeddings": false,
+    "tokenizer_class": null,
+    "top_k": 50,
+    "top_p": 1.0,
+    "torch_dtype": "float32",
+    "torchscript": false,
+    "typical_p": 1.0,
+    "use_bfloat16": false,
+    "use_cache": true,
+    "vocab_size": 1088
+  },
+  "decoder_start_token_id": 1025,
+  "is_encoder_decoder": true,
+  "model_type": "parler_tts",
+  "pad_token_id": 1024,
+  "prompt_cross_attention": true,
+  "text_encoder": {
+    "_name_or_path": "google/flan-t5-base",
+    "add_cross_attention": false,
+    "architectures": [
+      "T5ForConditionalGeneration"
+    ],
+    "bad_words_ids": null,
+    "begin_suppress_tokens": null,
+    "bos_token_id": null,
+    "chunk_size_feed_forward": 0,
+    "classifier_dropout": 0.0,
+    "cross_attention_hidden_size": null,
+    "d_ff": 2048,
+    "d_kv": 64,
+    "d_model": 768,
+    "decoder_start_token_id": 0,
+    "dense_act_fn": "gelu_new",
+    "diversity_penalty": 0.0,
+    "do_sample": false,
+    "dropout_rate": 0.1,
+    "early_stopping": false,
+    "encoder_no_repeat_ngram_size": 0,
+    "eos_token_id": 1,
+    "exponential_decay_length_penalty": null,
+    "feed_forward_proj": "gated-gelu",
+    "finetuning_task": null,
+    "forced_bos_token_id": null,
+    "forced_eos_token_id": null,
+    "id2label": {
+      "0": "LABEL_0",
+      "1": "LABEL_1"
+    },
+    "initializer_factor": 1.0,
+    "is_decoder": false,
+    "is_encoder_decoder": true,
+    "is_gated_act": true,
+    "label2id": {
+      "LABEL_0": 0,
+      "LABEL_1": 1
+    },
+    "layer_norm_epsilon": 1e-06,
+    "length_penalty": 1.0,
+    "max_length": 20,
+    "min_length": 0,
+    "model_type": "t5",
+    "n_positions": 512,
+    "no_repeat_ngram_size": 0,
+    "num_beam_groups": 1,
+    "num_beams": 1,
+    "num_decoder_layers": 12,
+    "num_heads": 12,
+    "num_layers": 12,
+    "num_return_sequences": 1,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "output_past": true,
+    "output_scores": false,
+    "pad_token_id": 0,
+    "prefix": null,
+    "problem_type": null,
+    "pruned_heads": {},
+    "relative_attention_max_distance": 128,
+    "relative_attention_num_buckets": 32,
+    "remove_invalid_values": false,
+    "repetition_penalty": 1.0,
+    "return_dict": true,
+    "return_dict_in_generate": false,
+    "sep_token_id": null,
+    "suppress_tokens": null,
+    "task_specific_params": {
+      "summarization": {
+        "early_stopping": true,
+        "length_penalty": 2.0,
+        "max_length": 200,
+        "min_length": 30,
+        "no_repeat_ngram_size": 3,
+        "num_beams": 4,
+        "prefix": "summarize: "
+      },
+      "translation_en_to_de": {
+        "early_stopping": true,
+        "max_length": 300,
+        "num_beams": 4,
+        "prefix": "translate English to German: "
+      },
+      "translation_en_to_fr": {
+        "early_stopping": true,
+        "max_length": 300,
+        "num_beams": 4,
+        "prefix": "translate English to French: "
+      },
+      "translation_en_to_ro": {
+        "early_stopping": true,
+        "max_length": 300,
+        "num_beams": 4,
+        "prefix": "translate English to Romanian: "
+      }
+    },
+    "temperature": 1.0,
+    "tf_legacy_loss": false,
+    "tie_encoder_decoder": false,
+    "tie_word_embeddings": false,
+    "tokenizer_class": null,
+    "top_k": 50,
+    "top_p": 1.0,
+    "torch_dtype": null,
+    "torchscript": false,
+    "typical_p": 1.0,
+    "use_bfloat16": false,
+    "use_cache": true,
+    "vocab_size": 32128
+  },
+  "torch_dtype": "float32",
+  "transformers_version": "4.40.2",
+  "vocab_size": 32128
+}

checkpoint-120000-epoch-8/generation_config.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 1025,
+  "decoder_start_token_id": 1025,
+  "do_sample": true,
+  "eos_token_id": 1024,
+  "guidance_scale": 1,
+  "key": 10,
+  "max_length": 2580,
+  "pad_token_id": 1024,
+  "transformers_version": "4.40.2"
+}

checkpoint-120000-epoch-8/optimizer.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:733b094e78f727fce8a0183cdcfc72f5e2b154ed2934959c320e2465693c9577
+size 3652769047

checkpoint-120000-epoch-8/pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ce7533373536c757b73a74a2ba6185974c2d35e466e38f37a6781da4031a98e8
+size 2605239710

checkpoint-120000-epoch-8/random_states_0.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a23edacfa0329be5ccd2f08651b598e5c8cd31d1fd8e33a3ed02c60c8a3654a6
+size 16036

checkpoint-120000-epoch-8/random_states_1.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:53c4ef324c16e8b4466ddbd1b26ea5df0eab4d6fe281391be521efad9f1c87f3
+size 16100

checkpoint-120000-epoch-8/random_states_2.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:44f53b0c4745162bed7c7899526b263264a4d5d5b4f4e2c1bc4b6af765c4b6e1
+size 16100

checkpoint-120000-epoch-8/random_states_3.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7e432fe468eb01fa59dcd2d3c7c8969f260b157c4395d35773b847f11a37b12b
+size 16100

checkpoint-120000-epoch-8/random_states_4.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5c41036f9a24b7f68c02bf44c037565e222ee8c40516670494732a1645fabb39
+size 16100

checkpoint-120000-epoch-8/random_states_5.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0ee6a0009db3cca6c35019eabafa38b2a46def11da55b2c5e140f70974e8ae50
+size 16100

checkpoint-120000-epoch-8/random_states_6.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:525f5729c247c1493b3cf5283900401c86948a7eae7b41979b3784e3efa6b2bb
+size 16100

checkpoint-120000-epoch-8/random_states_7.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3b31f40c509348d672e052602f82d0d37f85f9dcadbc107dc7217a8e3e6f3092
+size 16036

checkpoint-120000-epoch-8/scheduler.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2a46a1b5d12218eb49696470fb7337ce7c2ac2f6cb2f18a57ba0f1af48738171
+size 1000

starting_point_0.01.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
     "model_name_or_path": "parler-tts/parler-tts-untrained-600M-cross-attention",
-    "save_to_disk":  "/fsx/yoach/tmp/artefacts/10k_hours_processed_punctuated/",
     "temporary_save_to_disk": "/scratch/tmp_dataset_audio/",
     "push_to_hub": true,
@@ -10,6 +10,7 @@
     "prompt_tokenizer_name":"google/flan-t5-base",
     "report_to": ["wandb"],
     "overwrite_output_dir": false,
     "output_dir": "./",

 {
     "model_name_or_path": "parler-tts/parler-tts-untrained-600M-cross-attention",
+    "save_to_disk":  "/fsx/sanchit/10k_hours_processed_punctuated",
     "temporary_save_to_disk": "/scratch/tmp_dataset_audio/",
     "push_to_hub": true,
     "prompt_tokenizer_name":"google/flan-t5-base",
     "report_to": ["wandb"],
+    "wandb_run_name": "parler-tts-600M-cross-attention",
     "overwrite_output_dir": false,
     "output_dir": "./",

training/__pycache__/arguments.cpython-311.pyc CHANGED Viewed

Binary files a/training/__pycache__/arguments.cpython-311.pyc and b/training/__pycache__/arguments.cpython-311.pyc differ

training/__pycache__/data.cpython-311.pyc CHANGED Viewed

Binary files a/training/__pycache__/data.cpython-311.pyc and b/training/__pycache__/data.cpython-311.pyc differ

training/__pycache__/eval.cpython-311.pyc CHANGED Viewed

Binary files a/training/__pycache__/eval.cpython-311.pyc and b/training/__pycache__/eval.cpython-311.pyc differ

training/__pycache__/utils.cpython-311.pyc CHANGED Viewed

Binary files a/training/__pycache__/utils.cpython-311.pyc and b/training/__pycache__/utils.cpython-311.pyc differ

training/arguments.py CHANGED Viewed

@@ -218,7 +218,7 @@ class DataTrainingArguments:
         metadata={
             "help": (
                 "If set, filter samples with descriptions that are longer than `max_description_token_length` tokens."
-                "Also, used to set maximum desription token length if `pad_to_max_length=True`."
             )
         },
     )
@@ -277,6 +277,12 @@ class DataTrainingArguments:
         default="parler-speech",
         metadata={"help": "The name of the wandb project."},
     )
     save_to_disk: str = field(
         default=None,
         metadata={

         metadata={
             "help": (
                 "If set, filter samples with descriptions that are longer than `max_description_token_length` tokens."
+                "Also, used to set maximum description token length if `pad_to_max_length=True`."
             )
         },
     )
         default="parler-speech",
         metadata={"help": "The name of the wandb project."},
     )
+    wandb_run_name: str = field(
+        default=None,
+        metadata={
+            "help": "If specified, the name of the run. If not specified, wandb will give a random name to this run."
+        },
+    )
     save_to_disk: str = field(
         default=None,
         metadata={

training/data.py CHANGED Viewed

@@ -31,7 +31,12 @@ class DataCollatorEncodecWithPadding:
         audios = [feature[self.audio_column_name]["array"] for feature in features]
         len_audio = [len(audio) for audio in audios]
-        batch = self.feature_extractor(audios, return_tensors="pt", padding=self.padding, max_length=self.max_length)
         batch["len_audio"] = torch.tensor(len_audio).unsqueeze(1)
         return batch

         audios = [feature[self.audio_column_name]["array"] for feature in features]
         len_audio = [len(audio) for audio in audios]
+        # since resampling has already been performed in the 'load_multiple_datasets' function,
+        # a fixed sampling_rate(44100hz) is passed to the feature_extractor.
+        sampling_rate = self.feature_extractor.sampling_rate
+        batch = self.feature_extractor(
+            audios, sampling_rate=sampling_rate, return_tensors="pt", padding=self.padding, max_length=self.max_length
+        )
         batch["len_audio"] = torch.tensor(len_audio).unsqueeze(1)
         return batch

training/eval.py CHANGED Viewed

@@ -47,8 +47,7 @@ def wer(asr_model_name_or_path, prompts, audios, device, per_device_eval_batch_s
     normalized_references = []
     for pred, ref in zip(transcriptions, prompts):
-        normalizer = english_normalizer
         norm_ref = normalizer(ref)
         if len(norm_ref) > 0:
             norm_pred = normalizer(pred["text"])

     normalized_references = []
     for pred, ref in zip(transcriptions, prompts):
+        normalizer = english_normalizer if return_language and pred["chunks"][0]["language"] == "english" else basic_normalizer
         norm_ref = normalizer(ref)
         if len(norm_ref) > 0:
             norm_pred = normalizer(pred["text"])

training/run_parler_tts_training.py CHANGED Viewed

@@ -98,9 +98,6 @@ def main():
     ####### A. Preparation
     kwargs_handlers = [InitProcessGroupKwargs(timeout=timedelta(minutes=60))]
-    if training_args.torch_compile:
-        # TODO(YL): add more compile modes?
-        kwargs_handlers.append(TorchDynamoPlugin(backend="inductor", mode="default"))  # reduce-overhead
     accelerator = Accelerator(
         gradient_accumulation_steps=training_args.gradient_accumulation_steps,
@@ -129,6 +126,7 @@ def main():
             "adam_beta2": training_args.adam_beta2,
             "temperature": model_args.temperature,
         },
     )
     # Detecting last checkpoint and eventually continue from last checkpoint
@@ -136,7 +134,7 @@ def main():
     if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
         last_checkpoint = get_last_checkpoint(training_args.output_dir)
         if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
-            raise ValueError(
                 f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                 "Use --overwrite_output_dir to overcome."
             )
@@ -314,6 +312,7 @@ def main():
         token=data_args.token,
         trust_remote_code=data_args.trust_remote_code,
     )
     # enable gradient checkpointing if necessary
     if training_args.gradient_checkpointing:
@@ -334,8 +333,8 @@ def main():
     feature_extractor_input_name = feature_extractor.model_input_names[0]
     audio_encoder_pad_token_id = config.decoder.pad_token_id
     audio_encoder_eos_token_id = config.decoder.eos_token_id
-    audio_encoder_bos_token_id = model.generation_config.decoder_start_token_id
-    max_length = model.generation_config.max_length
     num_codebooks = model.decoder.config.num_codebooks
     bandwidth = model_args.bandwidth
@@ -538,7 +537,7 @@ def main():
         logger.info(f"Dataset saved at {data_args.save_to_disk}")
     audio_max_length = None
-    if training_args.torch_compile:
         audio_max_length = max(vectorized_datasets["train"]["target_length"])
         with accelerator.main_process_first():
             max_sample = vectorized_datasets["train"].filter(
@@ -548,6 +547,18 @@ def main():
             )
         audio_max_length = torch.tensor(max_sample[0]["labels"]).shape[1]
     # for large datasets it is advised to run the preprocessing on a
     # single machine first with ``args.preprocessing_only`` since there will mostly likely
     # be a timeout when running the script in distributed mode.
@@ -670,6 +681,8 @@ def main():
         checkpoint = last_checkpoint
     if accelerator.is_main_process:
         if training_args.push_to_hub:
             api = HfApi(token=training_args.hub_token)
@@ -682,8 +695,6 @@ def main():
             with open(os.path.join(training_args.output_dir, ".gitignore"), "w+") as gitignore:
                 if "wandb" not in gitignore:
                     gitignore.write("wandb\n")
-        elif training_args.output_dir is not None:
-            os.makedirs(training_args.output_dir, exist_ok=True)
     accelerator.wait_for_everyone()
     # Now save everything to be able to create a single processor later
@@ -740,7 +751,13 @@ def main():
         "do_sample": model_args.do_sample,
         "temperature": model_args.temperature,
         "max_length": model_args.max_length,
     }
     # Define gradient update step fn
     def train_step(
@@ -869,9 +886,11 @@ def main():
                     # safe_serialization=False to avoid shared tensors saving issue (TODO(YL): it's a temporary fix)
                     # https://github.com/huggingface/transformers/issues/27293#issuecomment-1872560074
                     accelerator.save_state(output_dir=intermediate_dir, safe_serialization=False)
                     accelerator.wait_for_everyone()
                     if accelerator.is_main_process:
-                        rotate_checkpoints(
                             training_args.save_total_limit, output_dir=training_args.output_dir, logger=logger
                         )
@@ -886,6 +905,7 @@ def main():
                                 folder_path=training_args.output_dir,
                                 commit_message=f"Saving train state of step {cur_step}",
                                 run_as_future=True,
                             )
                 if training_args.do_eval and (cur_step % eval_steps == 0 or cur_step == total_train_steps):

     ####### A. Preparation
     kwargs_handlers = [InitProcessGroupKwargs(timeout=timedelta(minutes=60))]
     accelerator = Accelerator(
         gradient_accumulation_steps=training_args.gradient_accumulation_steps,
             "adam_beta2": training_args.adam_beta2,
             "temperature": model_args.temperature,
         },
+        init_kwargs={"wandb": {"name": data_args.wandb_run_name}} if data_args.wandb_run_name else {},
     )
     # Detecting last checkpoint and eventually continue from last checkpoint
     if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
         last_checkpoint = get_last_checkpoint(training_args.output_dir)
         if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
+            logger.info(
                 f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                 "Use --overwrite_output_dir to overcome."
             )
         token=data_args.token,
         trust_remote_code=data_args.trust_remote_code,
     )
+    generation_config = model.generation_config
     # enable gradient checkpointing if necessary
     if training_args.gradient_checkpointing:
     feature_extractor_input_name = feature_extractor.model_input_names[0]
     audio_encoder_pad_token_id = config.decoder.pad_token_id
     audio_encoder_eos_token_id = config.decoder.eos_token_id
+    audio_encoder_bos_token_id = generation_config.decoder_start_token_id
+    max_length = generation_config.max_length
     num_codebooks = model.decoder.config.num_codebooks
     bandwidth = model_args.bandwidth
         logger.info(f"Dataset saved at {data_args.save_to_disk}")
     audio_max_length = None
+    if padding == "max_length":
         audio_max_length = max(vectorized_datasets["train"]["target_length"])
         with accelerator.main_process_first():
             max_sample = vectorized_datasets["train"].filter(
             )
         audio_max_length = torch.tensor(max_sample[0]["labels"]).shape[1]
+    if training_args.group_by_length:
+        # apply a simple heuristic to take into account audio and text lengths
+        def add_target_lengths(target_length, prompt, description):
+            return {"target_length": target_length + len(prompt) + len(description)}
+        with accelerator.main_process_first():
+            vectorized_datasets = vectorized_datasets.map(
+                add_target_lengths,
+                num_proc=num_workers,
+                input_columns=["target_length", "prompt_input_ids", "input_ids"],
+            )
     # for large datasets it is advised to run the preprocessing on a
     # single machine first with ``args.preprocessing_only`` since there will mostly likely
     # be a timeout when running the script in distributed mode.
         checkpoint = last_checkpoint
     if accelerator.is_main_process:
+        if training_args.output_dir is not None:
+            os.makedirs(training_args.output_dir, exist_ok=True)
         if training_args.push_to_hub:
             api = HfApi(token=training_args.hub_token)
             with open(os.path.join(training_args.output_dir, ".gitignore"), "w+") as gitignore:
                 if "wandb" not in gitignore:
                     gitignore.write("wandb\n")
     accelerator.wait_for_everyone()
     # Now save everything to be able to create a single processor later
         "do_sample": model_args.do_sample,
         "temperature": model_args.temperature,
         "max_length": model_args.max_length,
+        # Because of the delayed pattern mask, generation might stop earlier because of unexpected behaviour
+        # on the first tokens of the codebooks that are delayed.
+        # This fixes the issue.
+        "min_new_tokens": num_codebooks + 1,
     }
+    for key in gen_kwargs:
+        generation_config.key = gen_kwargs[key]
     # Define gradient update step fn
     def train_step(
                     # safe_serialization=False to avoid shared tensors saving issue (TODO(YL): it's a temporary fix)
                     # https://github.com/huggingface/transformers/issues/27293#issuecomment-1872560074
                     accelerator.save_state(output_dir=intermediate_dir, safe_serialization=False)
+                    config.save_pretrained(intermediate_dir)
+                    generation_config.save_pretrained(intermediate_dir)
                     accelerator.wait_for_everyone()
                     if accelerator.is_main_process:
+                        checkpoints_to_be_deleted = rotate_checkpoints(
                             training_args.save_total_limit, output_dir=training_args.output_dir, logger=logger
                         )
                                 folder_path=training_args.output_dir,
                                 commit_message=f"Saving train state of step {cur_step}",
                                 run_as_future=True,
+                                delete_patterns=checkpoints_to_be_deleted,
                             )
                 if training_args.do_eval and (cur_step % eval_steps == 0 or cur_step == total_train_steps):

training/utils.py CHANGED Viewed

@@ -3,7 +3,7 @@ import re
 import shutil
 from pathlib import Path
 from dataclasses import field
-from typing import Dict, List
 import torch
 from wandb import Audio
@@ -44,7 +44,7 @@ def sorted_checkpoints(output_dir=None, checkpoint_prefix="checkpoint") -> List[
     return checkpoints_sorted
-def rotate_checkpoints(save_total_limit=None, output_dir=None, checkpoint_prefix="checkpoint", logger=None) -> None:
     """Helper function to delete old checkpoints."""
     if save_total_limit is None or save_total_limit <= 0:
         return
@@ -58,6 +58,8 @@ def rotate_checkpoints(save_total_limit=None, output_dir=None, checkpoint_prefix
     for checkpoint in checkpoints_to_be_deleted:
         logger.info(f"Deleting older checkpoint [{checkpoint}] due to args.save_total_limit")
         shutil.rmtree(checkpoint, ignore_errors=True)
 def log_metric(

 import shutil
 from pathlib import Path
 from dataclasses import field
+from typing import Dict, List, Union
 import torch
 from wandb import Audio
     return checkpoints_sorted
+def rotate_checkpoints(save_total_limit=None, output_dir=None, checkpoint_prefix="checkpoint", logger=None) -> Union[List, None]:
     """Helper function to delete old checkpoints."""
     if save_total_limit is None or save_total_limit <= 0:
         return
     for checkpoint in checkpoints_to_be_deleted:
         logger.info(f"Deleting older checkpoint [{checkpoint}] due to args.save_total_limit")
         shutil.rmtree(checkpoint, ignore_errors=True)
+    checkpoints_to_be_deleted = [f"*{Path(checkpoint).absolute().name}*"  for checkpoint in checkpoints_to_be_deleted]
+    return checkpoints_to_be_deleted
 def log_metric(