{"output_dir": "/data2/assaf/mamba/outputs/models", "cache_dir": "/data2/hf_cache", "activate_logging": true, "wandb_dir": "/data2/assaf/wandb/mamba", "run_name_addon": "k=6, mindl=12, maxdl=20, 2 chunks, min_seq_len=20", "record_debug_params": false, "eval_mode": false, "mamba_arch": "deci", "model_type": "mamba-130m", "use_finetuned_model": false, "load_cp": null, "clip_grad": true, "clip_grad_max_norm": 1, "seed": 123, "lr_sched_type": "const", "sampling_temperature": 1.2, "save_steps": 100, "eval_steps": 10, "grad_flow_steps": 10, "max_step": 20000, "epochs": 10, "model_device": "cuda:3", "dataset": "ppl_test", "train_set_size": 6144, "eval_set_size": 20, "eval_samples_to_log": 30, "eval_max_len": 20, "max_train_input_len": 20000, "scrolls_evaluator_path": "/data1/assaf/datasets/scrolls/evaluator/dataset_evaluator.py", "niah_context_len_train": 2000, "niah_needle_depths_eval": [0.25], "niah_context_lens_eval": [1000, 2000, 4000, 8000], "ppl_test_context_len_train": 2000, "ppl_test_pred_len": 30, "ppl_test_context_lens_eval": [1000, 2000, 4000, 8000, 16000], "ppl_test_num_windows_per_context_len_eval": 10, "deci_num_chunks": 2, "activate_decimation": true, "decimation_type": "max_p", "decimation_k": 6, "min_decimating_layer": 12, "max_decimating_layer": 20, "decimation_min_seq_len": 20, "decimation_max_p_L_base": 2000, "lr": 0.0001, "weight_decay": 0.1, "grad_accum_steps": 250, "activate_profiling": false}