hunterhector committed on
Commit
a4dc57a
1 Parent(s): e74bc72

add figure to early sections too

Browse files
Files changed (3) hide show
  1. eval_result_figures.py +67 -0
  2. main.py +7 -4
  3. results.py +1 -64
eval_result_figures.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import os
from plotly import graph_objects as go
import pandas as pd

## Evaluation Graphs
#
# Importing this module reads every "*.csv" file under EVAL_DATA_DIR and
# builds one plotly line chart per file. Two module-level dicts are exposed:
#   all_eval_results  -- metric name -> {series key -> values, "token" -> x axis}
#   all_eval_res_figs -- metric name -> plotly Figure

EVAL_DATA_DIR = "data/txt360_eval"

# Result key -> positional CSV column holding that series.
# (Column 2 used to be slimpajama; that series is no longer loaded.)
_SERIES_COLUMNS = {
    "fineweb": 1,
    "txt360-dedup-only": 2,
    "txt360-web-only-upsampled": 3,
    "txt360-all-upsampled + stackv2": 4,
}

# (result key, legend label) in the order the traces are drawn.
_TRACES = [
    ("fineweb", "FineWeb"),
    ("txt360-web-only-upsampled", "TxT360 - CC Data Upsampled"),
    ("txt360-dedup-only", "TxT360 - CC Data Dedup"),
    ("txt360-all-upsampled + stackv2", "TxT360 - Full Upsampled + Stack V2"),
]

# Load the data
all_eval_results = {}
# os.listdir() returns entries in arbitrary order; sort so the resulting
# dict (and anything iterating over it) is deterministic across machines.
for fname in sorted(os.listdir(EVAL_DATA_DIR)):
    if not fname.endswith(".csv"):
        continue

    # The metric name is the file name minus its fixed prefix and extension.
    metric_name = fname.replace("CKPT Eval - ", "").replace(".csv", "")
    df = pd.read_csv(os.path.join(EVAL_DATA_DIR, fname))

    metric_results = {}
    for series_key, column in _SERIES_COLUMNS.items():
        # Only rows from position 2 onward are used (presumably the first two
        # rows are not checkpoint results -- confirm against the CSVs).
        # Missing values are back-filled from the next available row.
        # Series.bfill() replaces the deprecated fillna(method="bfill").
        metric_results[series_key] = df.iloc[2:, column].astype(float).bfill()

    # each row is 20B tokens.
    metric_results["token"] = [
        20 * i for i in range(len(metric_results["fineweb"]))
    ]
    all_eval_results[metric_name] = metric_results


# Eval Result Plots
all_eval_res_figs = {}
for metric_name, metric_results in all_eval_results.items():
    fig_res = go.Figure()

    # Add lines
    for series_key, legend_label in _TRACES:
        fig_res.add_trace(go.Scatter(
            x=metric_results["token"],
            y=metric_results[series_key],
            mode='lines', name=legend_label
        ))

    # Update layout
    fig_res.update_layout(
        title=f"{metric_name} Performance",
        title_x=0.5,  # Centers the title
        xaxis_title="Billion Tokens",
        yaxis_title=metric_name,
        legend_title="Dataset",
    )
    all_eval_res_figs[metric_name] = fig_res
main.py CHANGED
@@ -23,6 +23,8 @@ import results
23
  from pybtex.database import parse_file
24
  import data_viewer
25
 
 
 
26
 
27
  app, rt = fast_app(
28
  debug=True,
@@ -200,7 +202,7 @@ def main():
200
  ),
201
  Li(
202
  A(
203
- "Motivation Behind TxT360",
204
  href="#section12",
205
  )
206
  ),
@@ -296,7 +298,7 @@ def main():
296
  ),
297
  Li(
298
  A(
299
- "Motivation Behind Global Deduplication",
300
  href="#section42",
301
  )
302
  ),
@@ -354,7 +356,7 @@ def main():
354
  ),
355
  Li(
356
  A(
357
- "Upsampling Experiment",
358
  href="#section52",
359
  )
360
  ),
@@ -852,9 +854,10 @@ def intro():
852
  Section(
853
  H2("About TxT360"),
854
  P( "TL;DR ",
855
- B("We introduce TxT360 (Trillion eXtracted Text), the first dataset to globally deduplicate 99 CommonCrawl snapshots and 14 high-quality data sources from diverse domains (e.g., FreeLaw, PG-19, etc.). The large-scale deduplication process and rich metadata stored enables precise control over data distribution. In addition to document selection, TxT360, along with its rich metadata, allows for the assignment of optimal data weights. We demonstrate a simple but effective upsampling recipe that creates a 15+ trillion-token corpus, outperforming FineWeb 15T. Furthermore, TxT360 empowers pre-trainers to explore more advanced weighting techniques, a capability not commonly available in previous pre-training datasets."
856
  )
857
  ),
 
858
  P(
859
  "Building on top of the prior studies on pre-training data",
860
  D_cite(bibtex_key="refinedweb"),
 
23
  from pybtex.database import parse_file
24
  import data_viewer
25
 
26
+ from eval_result_figures import all_eval_res_figs
27
+
28
 
29
  app, rt = fast_app(
30
  debug=True,
 
202
  ),
203
  Li(
204
  A(
205
+ "Why TxT360",
206
  href="#section12",
207
  )
208
  ),
 
298
  ),
299
  Li(
300
  A(
301
+ "Why Global Deduplication",
302
  href="#section42",
303
  )
304
  ),
 
356
  ),
357
  Li(
358
  A(
359
+ "A Simple Data Mix Creates a Good Learning Curve",
360
  href="#section52",
361
  )
362
  ),
 
854
  Section(
855
  H2("About TxT360"),
856
  P( "TL;DR ",
857
+ B("We introduce TxT360 (Trillion eXtracted Text), the first dataset to globally deduplicate 99 CommonCrawl snapshots and 14 high-quality data sources from diverse domains (e.g., FreeLaw, PG-19, etc.). The large-scale deduplication process and rich metadata stored enables precise control over data distribution. We demonstrate a simple but effective upsampling recipe that creates a 15+ trillion-token corpus, outperforming FineWeb 15T on several key metrics. With the information, TxT360 empowers pre-trainers to explore more advanced weighting techniques, a feature not commonly available in previous pre-training datasets. In line with our 360° open source spirit, we document all detailed steps, reasons of our decisions, detailed statistics and more, in additional to the dataset itself. We hope this can serve as a useful resource for future developers."
858
  )
859
  ),
860
+ plotly2fasthtml(all_eval_res_figs["MMLU"]),
861
  P(
862
  "Building on top of the prior studies on pre-training data",
863
  D_cite(bibtex_key="refinedweb"),
results.py CHANGED
@@ -11,70 +11,7 @@ from plotly import graph_objects as go
11
  import pandas as pd
12
  import plotly.express as px
13
 
14
-
15
- ## Evaluation Graphs
16
-
17
- # Load the data
18
- all_eval_results = {}
19
- for fname in os.listdir("data/txt360_eval"):
20
- if fname.endswith(".csv"):
21
- metric_name = fname.replace("CKPT Eval - ", "").replace(".csv", "")
22
- all_eval_results[metric_name] = {}
23
-
24
- # with open(os.path.join("data/txt360_eval", fname)) as f:
25
- df = pd.read_csv(os.path.join("data/txt360_eval", fname))
26
-
27
- # slimpajama_res = df.iloc[2:, 2].astype(float).fillna(0.0) # slimpajama
28
- fineweb_res = df.iloc[2:, 1].astype(float).fillna(method="bfill") # fineweb
29
- txt360_base = df.iloc[2:, 2].astype(float).fillna(method="bfill") # txt360-dedup-only
30
- txt360_web_up = df.iloc[2:, 3].astype(float).fillna(method="bfill") # txt360-web-only-upsampled
31
- txt360_all_up_stack = df.iloc[2:, 4].astype(float).fillna(method="bfill") # txt360-all-upsampled + stackv2
32
-
33
- # each row is 20B tokens.
34
- # all_eval_results[metric_name]["slimpajama"] = slimpajama_res
35
- all_eval_results[metric_name]["fineweb"] = fineweb_res
36
- all_eval_results[metric_name]["txt360-dedup-only"] = txt360_base
37
- all_eval_results[metric_name]["txt360-web-only-upsampled"] = txt360_web_up
38
- all_eval_results[metric_name]["txt360-all-upsampled + stackv2"] = txt360_all_up_stack
39
- all_eval_results[metric_name]["token"] = [20 * i for i in range(len(fineweb_res))]
40
-
41
-
42
- # Eval Result Plots
43
- all_eval_res_figs = {}
44
- for metric_name, res in all_eval_results.items():
45
- fig_res = go.Figure()
46
-
47
- # Add lines
48
- fig_res.add_trace(go.Scatter(
49
- x=all_eval_results[metric_name]["token"],
50
- y=all_eval_results[metric_name]["fineweb"],
51
- mode='lines', name='FineWeb'
52
- ))
53
- fig_res.add_trace(go.Scatter(
54
- x=all_eval_results[metric_name]["token"],
55
- y=all_eval_results[metric_name]["txt360-web-only-upsampled"],
56
- mode='lines', name='TxT360 - CC Data Upsampled'
57
- ))
58
- fig_res.add_trace(go.Scatter(
59
- x=all_eval_results[metric_name]["token"],
60
- y=all_eval_results[metric_name]["txt360-dedup-only"],
61
- mode='lines', name='TxT360 - CC Data Dedup'
62
- ))
63
- fig_res.add_trace(go.Scatter(
64
- x=all_eval_results[metric_name]["token"],
65
- y=all_eval_results[metric_name]["txt360-all-upsampled + stackv2"],
66
- mode='lines', name='TxT360 - Full Upsampled + Stack V2'
67
- ))
68
-
69
- # Update layout
70
- fig_res.update_layout(
71
- title=f"{metric_name} Performance",
72
- title_x=0.5, # Centers the title
73
- xaxis_title="Billion Tokens",
74
- yaxis_title=metric_name,
75
- legend_title="Dataset",
76
- )
77
- all_eval_res_figs[metric_name] = fig_res
78
 
79
  ##upsampling validation loss graph
80
 
 
11
  import pandas as pd
12
  import plotly.express as px
13
 
14
+ from eval_result_figures import all_eval_res_figs
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
  ##upsampling validation loss graph
17