|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.1, |
|
"eval_steps": 500, |
|
"global_step": 500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 22.986753463745117, |
|
"learning_rate": 0.00198, |
|
"loss": 1.7003, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 8.073963165283203, |
|
"learning_rate": 0.00196, |
|
"loss": 1.9785, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 7.6197099685668945, |
|
"learning_rate": 0.0019399999999999999, |
|
"loss": 1.9353, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 6.2731146812438965, |
|
"learning_rate": 0.00192, |
|
"loss": 1.5962, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 5.425559043884277, |
|
"learning_rate": 0.0019, |
|
"loss": 1.389, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 4.872774600982666, |
|
"learning_rate": 0.00188, |
|
"loss": 1.4156, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 5.199490070343018, |
|
"learning_rate": 0.00186, |
|
"loss": 1.6583, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 4.413191318511963, |
|
"learning_rate": 0.00184, |
|
"loss": 1.4334, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 5.90674352645874, |
|
"learning_rate": 0.00182, |
|
"loss": 1.6046, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 6.410930633544922, |
|
"learning_rate": 0.0018000000000000002, |
|
"loss": 1.5504, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 3.529223680496216, |
|
"learning_rate": 0.0017800000000000001, |
|
"loss": 1.6463, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 4.781284332275391, |
|
"learning_rate": 0.00176, |
|
"loss": 1.6136, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 5.6382951736450195, |
|
"learning_rate": 0.00174, |
|
"loss": 1.5105, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 4.392839431762695, |
|
"learning_rate": 0.00172, |
|
"loss": 1.6061, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 3.9011926651000977, |
|
"learning_rate": 0.0017, |
|
"loss": 1.6188, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 4.002920627593994, |
|
"learning_rate": 0.00168, |
|
"loss": 1.4177, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 4.34838342666626, |
|
"learning_rate": 0.00166, |
|
"loss": 1.5689, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 8.142854690551758, |
|
"learning_rate": 0.00164, |
|
"loss": 1.5804, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 5.837989330291748, |
|
"learning_rate": 0.0016200000000000001, |
|
"loss": 1.5981, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 2.33852219581604, |
|
"learning_rate": 0.0016, |
|
"loss": 1.4347, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 3.069826602935791, |
|
"learning_rate": 0.00158, |
|
"loss": 1.4809, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 2.71095609664917, |
|
"learning_rate": 0.0015600000000000002, |
|
"loss": 1.388, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 4.450407981872559, |
|
"learning_rate": 0.0015400000000000001, |
|
"loss": 1.603, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 4.475738048553467, |
|
"learning_rate": 0.00152, |
|
"loss": 1.5731, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 3.051819086074829, |
|
"learning_rate": 0.0015, |
|
"loss": 1.5133, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 5.014269828796387, |
|
"learning_rate": 0.00148, |
|
"loss": 1.5458, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 2.558957815170288, |
|
"learning_rate": 0.00146, |
|
"loss": 1.4918, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 4.6234660148620605, |
|
"learning_rate": 0.0014399999999999999, |
|
"loss": 1.5247, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 2.9923095703125, |
|
"learning_rate": 0.00142, |
|
"loss": 1.6671, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 7.883978366851807, |
|
"learning_rate": 0.0014, |
|
"loss": 1.5732, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 3.3218066692352295, |
|
"learning_rate": 0.00138, |
|
"loss": 1.6297, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 9.045559883117676, |
|
"learning_rate": 0.00136, |
|
"loss": 1.6581, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 2.832301139831543, |
|
"learning_rate": 0.00134, |
|
"loss": 1.6966, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 3.6719107627868652, |
|
"learning_rate": 0.00132, |
|
"loss": 1.5904, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 5.4335455894470215, |
|
"learning_rate": 0.0013000000000000002, |
|
"loss": 1.6643, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 3.2848339080810547, |
|
"learning_rate": 0.00128, |
|
"loss": 1.4174, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 2.8206841945648193, |
|
"learning_rate": 0.00126, |
|
"loss": 1.7362, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 3.389599084854126, |
|
"learning_rate": 0.00124, |
|
"loss": 1.6058, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 4.887266159057617, |
|
"learning_rate": 0.00122, |
|
"loss": 1.4604, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 2.9653384685516357, |
|
"learning_rate": 0.0012, |
|
"loss": 1.5152, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 2.5362136363983154, |
|
"learning_rate": 0.00118, |
|
"loss": 1.469, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 2.7318670749664307, |
|
"learning_rate": 0.00116, |
|
"loss": 1.4136, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 3.6364078521728516, |
|
"learning_rate": 0.00114, |
|
"loss": 1.6937, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 1.9428081512451172, |
|
"learning_rate": 0.0011200000000000001, |
|
"loss": 1.4825, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 2.1813700199127197, |
|
"learning_rate": 0.0011, |
|
"loss": 1.4593, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 4.612652778625488, |
|
"learning_rate": 0.00108, |
|
"loss": 1.389, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 2.5145719051361084, |
|
"learning_rate": 0.0010600000000000002, |
|
"loss": 1.3896, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 2.4980382919311523, |
|
"learning_rate": 0.0010400000000000001, |
|
"loss": 1.3725, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 2.6995227336883545, |
|
"learning_rate": 0.00102, |
|
"loss": 1.4769, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 2.1483154296875, |
|
"learning_rate": 0.001, |
|
"loss": 1.5983, |
|
"step": 500 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 1000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"total_flos": 4.0788232684018176e+17, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|