Spaces:

LLM360
/

TxT360

Running

App Files Files Community

hunterhector committed on Oct 7

Commit

a4dc57a

•

1 Parent(s): e74bc72

add figure to early sections too

Browse files

Files changed (3) hide show

eval_result_figures.py +67 -0
main.py +7 -4
results.py +1 -64

eval_result_figures.py ADDED Viewed

	@@ -0,0 +1,67 @@

+import os
+from plotly import graph_objects as go
+import pandas as pd
+## Evaluation Graphs
+# Load the data
+all_eval_results = {}
+for fname in os.listdir("data/txt360_eval"):
+    if fname.endswith(".csv"):
+        metric_name = fname.replace("CKPT Eval - ", "").replace(".csv", "")
+        all_eval_results[metric_name] = {}
+        # with open(os.path.join("data/txt360_eval", fname)) as f:
+        df = pd.read_csv(os.path.join("data/txt360_eval", fname))
+        # slimpajama_res = df.iloc[2:, 2].astype(float).fillna(0.0) # slimpajama
+        fineweb_res = df.iloc[2:, 1].astype(float).fillna(method="bfill") # fineweb
+        txt360_base = df.iloc[2:, 2].astype(float).fillna(method="bfill") # txt360-dedup-only
+        txt360_web_up = df.iloc[2:, 3].astype(float).fillna(method="bfill") # txt360-web-only-upsampled
+        txt360_all_up_stack = df.iloc[2:, 4].astype(float).fillna(method="bfill") # txt360-all-upsampled + stackv2
+        # each row is 20B tokens.
+        # all_eval_results[metric_name]["slimpajama"] = slimpajama_res
+        all_eval_results[metric_name]["fineweb"] = fineweb_res
+        all_eval_results[metric_name]["txt360-dedup-only"] = txt360_base
+        all_eval_results[metric_name]["txt360-web-only-upsampled"] = txt360_web_up
+        all_eval_results[metric_name]["txt360-all-upsampled + stackv2"] = txt360_all_up_stack
+        all_eval_results[metric_name]["token"] = [20 * i for i in range(len(fineweb_res))]
+# Eval Result Plots
+all_eval_res_figs = {}
+for metric_name, res in all_eval_results.items():
+    fig_res = go.Figure()
+    # Add lines
+    fig_res.add_trace(go.Scatter(
+        x=all_eval_results[metric_name]["token"],
+        y=all_eval_results[metric_name]["fineweb"],
+        mode='lines', name='FineWeb'
+    ))
+    fig_res.add_trace(go.Scatter(
+        x=all_eval_results[metric_name]["token"],
+        y=all_eval_results[metric_name]["txt360-web-only-upsampled"],
+        mode='lines', name='TxT360 - CC Data Upsampled'
+    ))
+    fig_res.add_trace(go.Scatter(
+        x=all_eval_results[metric_name]["token"],
+        y=all_eval_results[metric_name]["txt360-dedup-only"],
+        mode='lines', name='TxT360 - CC Data Dedup'
+    ))
+    fig_res.add_trace(go.Scatter(
+        x=all_eval_results[metric_name]["token"],
+        y=all_eval_results[metric_name]["txt360-all-upsampled + stackv2"],
+        mode='lines', name='TxT360 - Full Upsampled + Stack V2'
+    ))
+    # Update layout
+    fig_res.update_layout(
+        title=f"{metric_name} Performance",
+        title_x=0.5,  # Centers the title
+        xaxis_title="Billion Tokens",
+        yaxis_title=metric_name,
+        legend_title="Dataset",
+    )
+    all_eval_res_figs[metric_name] = fig_res

main.py CHANGED Viewed

@@ -23,6 +23,8 @@ import results
 from pybtex.database import parse_file
 import data_viewer
 app, rt = fast_app(
     debug=True,
@@ -200,7 +202,7 @@ def main():
                             ),
                             Li(
                                 A(
-                                    "Motivation Behind TxT360",
                                     href="#section12",
                                 )
                             ),
@@ -296,7 +298,7 @@ def main():
                             ),
                             Li(
                                 A(
-                                    "Motivation Behind Global Deduplication",
                                     href="#section42",
                                 )
                             ),
@@ -354,7 +356,7 @@ def main():
                             ),
                             Li(
                                 A(
-                                    "Upsampling Experiment",
                                     href="#section52",
                                 )
                             ),
@@ -852,9 +854,10 @@ def intro():
         Section(
             H2("About TxT360"),
             P(  "TL;DR ",
-                B("We introduce TxT360 (Trillion eXtracted Text), the first dataset to globally deduplicate 99 CommonCrawl snapshots and 14 high-quality data sources from diverse domains (e.g., FreeLaw, PG-19, etc.). The large-scale deduplication process and rich metadata stored enables precise control over data distribution. In addition to document selection, TxT360, along with its rich metadata, allows for the assignment of optimal data weights. We demonstrate a simple but effective upsampling recipe that creates a 15+ trillion-token corpus, outperforming FineWeb 15T. Furthermore, TxT360 empowers pre-trainers to explore more advanced weighting techniques, a capability not commonly available in previous pre-training datasets."
                 )
             ),
             P(
                 "Building on top of the prior studies on pre-training data",
                 D_cite(bibtex_key="refinedweb"),

 from pybtex.database import parse_file
 import data_viewer
+from eval_result_figures import all_eval_res_figs
 app, rt = fast_app(
     debug=True,
                             ),
                             Li(
                                 A(
+                                    "Why TxT360",
                                     href="#section12",
                                 )
                             ),
                             ),
                             Li(
                                 A(
+                                    "Why Global Deduplication",
                                     href="#section42",
                                 )
                             ),
                             ),
                             Li(
                                 A(
+                                    "A Simple Data Mix Creates a Good Learning Curve",
                                     href="#section52",
                                 )
                             ),
         Section(
             H2("About TxT360"),
             P(  "TL;DR ",
+                B("We introduce TxT360 (Trillion eXtracted Text), the first dataset to globally deduplicate 99 CommonCrawl snapshots and 14 high-quality data sources from diverse domains (e.g., FreeLaw, PG-19, etc.). The large-scale deduplication process and rich metadata stored enables precise control over data distribution. We demonstrate a simple but effective upsampling recipe that creates a 15+ trillion-token corpus, outperforming FineWeb 15T on several key metrics. With the information, TxT360 empowers pre-trainers to explore more advanced weighting techniques, a feature not commonly available in previous pre-training datasets. In line with our 360° open source spirit, we document all detailed steps, reasons of our decisions, detailed statistics and more, in additional to the dataset itself. We hope this can serve as a useful resource for future developers."
                 )
             ),
+            plotly2fasthtml(all_eval_res_figs["MMLU"]),
             P(
                 "Building on top of the prior studies on pre-training data",
                 D_cite(bibtex_key="refinedweb"),

results.py CHANGED Viewed

@@ -11,70 +11,7 @@ from plotly import graph_objects as go
 import pandas as pd
 import plotly.express as px
-## Evaluation Graphs
-# Load the data
-all_eval_results = {}
-for fname in os.listdir("data/txt360_eval"):
-    if fname.endswith(".csv"):
-        metric_name = fname.replace("CKPT Eval - ", "").replace(".csv", "")
-        all_eval_results[metric_name] = {}
-        # with open(os.path.join("data/txt360_eval", fname)) as f:
-        df = pd.read_csv(os.path.join("data/txt360_eval", fname))
-        # slimpajama_res = df.iloc[2:, 2].astype(float).fillna(0.0) # slimpajama
-        fineweb_res = df.iloc[2:, 1].astype(float).fillna(method="bfill") # fineweb
-        txt360_base = df.iloc[2:, 2].astype(float).fillna(method="bfill") # txt360-dedup-only
-        txt360_web_up = df.iloc[2:, 3].astype(float).fillna(method="bfill") # txt360-web-only-upsampled
-        txt360_all_up_stack = df.iloc[2:, 4].astype(float).fillna(method="bfill") # txt360-all-upsampled + stackv2
-        # each row is 20B tokens.
-        # all_eval_results[metric_name]["slimpajama"] = slimpajama_res
-        all_eval_results[metric_name]["fineweb"] = fineweb_res
-        all_eval_results[metric_name]["txt360-dedup-only"] = txt360_base
-        all_eval_results[metric_name]["txt360-web-only-upsampled"] = txt360_web_up
-        all_eval_results[metric_name]["txt360-all-upsampled + stackv2"] = txt360_all_up_stack
-        all_eval_results[metric_name]["token"] = [20 * i for i in range(len(fineweb_res))]
-# Eval Result Plots
-all_eval_res_figs = {}
-for metric_name, res in all_eval_results.items():
-    fig_res = go.Figure()
-    # Add lines
-    fig_res.add_trace(go.Scatter(
-        x=all_eval_results[metric_name]["token"],
-        y=all_eval_results[metric_name]["fineweb"],
-        mode='lines', name='FineWeb'
-    ))
-    fig_res.add_trace(go.Scatter(
-        x=all_eval_results[metric_name]["token"],
-        y=all_eval_results[metric_name]["txt360-web-only-upsampled"],
-        mode='lines', name='TxT360 - CC Data Upsampled'
-    ))
-    fig_res.add_trace(go.Scatter(
-        x=all_eval_results[metric_name]["token"],
-        y=all_eval_results[metric_name]["txt360-dedup-only"],
-        mode='lines', name='TxT360 - CC Data Dedup'
-    ))
-    fig_res.add_trace(go.Scatter(
-        x=all_eval_results[metric_name]["token"],
-        y=all_eval_results[metric_name]["txt360-all-upsampled + stackv2"],
-        mode='lines', name='TxT360 - Full Upsampled + Stack V2'
-    ))
-    # Update layout
-    fig_res.update_layout(
-        title=f"{metric_name} Performance",
-        title_x=0.5,  # Centers the title
-        xaxis_title="Billion Tokens",
-        yaxis_title=metric_name,
-        legend_title="Dataset",
-    )
-    all_eval_res_figs[metric_name] = fig_res
 ##upsampling validation loss graph

 import pandas as pd
 import plotly.express as px
+from eval_result_figures import all_eval_res_figs
 ##upsampling validation loss graph