|
import os |
|
from plotly import graph_objects as go |
|
import pandas as pd |
|
|
|
|
|
|
|
|
|
# Evaluation curves keyed by metric name. Each value is a dict mapping a
# dataset label to a pandas Series of scores, plus a "token" entry holding the
# matching x-axis values.
all_eval_results = {}

# os.listdir() returns entries in arbitrary order; sorting keeps the insertion
# order of ``all_eval_results`` reproducible across runs and platforms.
for fname in sorted(os.listdir("data/txt360_eval")):
    if not fname.endswith(".csv"):
        continue

    # The metric name is the file name with the "CKPT Eval - " marker and the
    # ".csv" extension stripped out.
    metric_name = fname.replace("CKPT Eval - ", "").replace(".csv", "")

    df = pd.read_csv(os.path.join("data/txt360_eval", fname))

    # Scores are taken from row position 2 onward, one dataset per column
    # position (1-4). NOTE(review): the first two rows are presumably extra
    # header/label rows in the exported sheet - confirm against the CSVs.
    # Missing values are back-filled from the next available row.
    # Series.bfill() is used instead of fillna(method="bfill"), which is
    # deprecated since pandas 2.1.
    fineweb_res = df.iloc[2:, 1].astype(float).bfill()
    txt360_base = df.iloc[2:, 2].astype(float).bfill()
    txt360_web_up = df.iloc[2:, 3].astype(float).bfill()
    txt360_all_up_stack = df.iloc[2:, 4].astype(float).bfill()

    all_eval_results[metric_name] = {
        "fineweb": fineweb_res,
        "txt360-dedup-only": txt360_base,
        "txt360-web-only-upsampled": txt360_web_up,
        "txt360-all-upsampled + stackv2": txt360_all_up_stack,
        # x-axis values 0, 20, 40, ... with one entry per score row.
        # NOTE(review): presumably one checkpoint every 20B tokens - confirm.
        "token": [20 * i for i in range(len(fineweb_res))],
    }
|
|
|
|
|
|
|
# One Plotly line chart per evaluation metric, keyed by metric name.
all_eval_res_figs = {}

# (key into the per-metric results dict, legend label), listed in the order
# the traces are added to each figure.
trace_specs = (
    ("fineweb", "FineWeb"),
    ("txt360-web-only-upsampled", "TxT360 - CC Data Upsampled"),
    ("txt360-dedup-only", "TxT360 - CC Data Dedup"),
    ("txt360-all-upsampled + stackv2", "TxT360 - Full Upsampled + Stack V2"),
)

for metric_name, res in all_eval_results.items():
    fig_res = go.Figure()

    # Every trace shares the same x-axis values and differs only in which
    # score series it plots and how it is labelled in the legend.
    for series_key, legend_label in trace_specs:
        fig_res.add_trace(
            go.Scatter(
                x=res["token"],
                y=res[series_key],
                mode="lines",
                name=legend_label,
            )
        )

    fig_res.update_layout(
        title=f"{metric_name} Performance",
        title_x=0.5,  # centre the title horizontally
        xaxis_title="Billion Tokens",
        yaxis_title=metric_name,
        legend_title="Dataset",
    )

    all_eval_res_figs[metric_name] = fig_res