Commit bbe05a0 • committed by natolambert • 1 parent: c8a4819

Commit message: style

Files changed:
- app.py (+21, -4)
- src/css.py (+22, -0)
- src/md.py (+3, -6)
app.py
CHANGED

```diff
@@ -7,6 +7,7 @@ from src.utils import load_all_data
 from src.md import ABOUT_TEXT, TOP_TEXT
 from src.plt import plot_avg_correlation
 from src.constants import subset_mapping, length_categories, example_counts
+from src.css import custom_css
 import numpy as np
 
 api = HfApi()
@@ -185,18 +186,18 @@ def regex_table(dataframe, regex, filter_button):
     return dataframe[dataframe["model"].str.contains(combined_regex, case=False, na=False)]
 
 
-with gr.Blocks() as app:
+with gr.Blocks(css=custom_css) as app:
     # create tabs for the app, moving the current table to one titled "rewardbench" and the benchmark_text to a tab called "About"
     with gr.Row():
-        with gr.Column(scale=
-            gr.Markdown(TOP_TEXT)
-        with gr.Column(scale=2.2):
+        with gr.Column(scale=1.65):
             # search = gr.Textbox(label="Model Search (delimit with , )", placeholder="Regex search for a model")
             # filter_button = gr.Checkbox(label="Include AI2 training runs (or type ai2 above).", interactive=True)
             # img = gr.Image(value="https://private-user-images.githubusercontent.com/10695622/310698241-24ed272a-0844-451f-b414-fde57478703e.png", width=500)
             gr.Markdown("""
![](file/src/logo.png)
             """)
+        with gr.Column(scale=3):
+            gr.Markdown(TOP_TEXT)
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         with gr.TabItem("🏆 RewardBench Leaderboard"):
             with gr.Row():
@@ -321,6 +322,22 @@ with gr.Blocks() as app:
     model_types_2.change(regex_table, inputs=[rewardbench_table_detailed_hidden, search_2, model_types_2], outputs=rewardbench_table_detailed)
     model_types_3.change(regex_table, inputs=[pref_sets_table_hidden, search_3, model_types_3], outputs=pref_sets_table)
 
+    with gr.Row():
+        with gr.Accordion("📚 Citation", open=False):
+            citation_button = gr.Textbox(
+                value=r"""
+@misc{RewardBench,
+    title={RewardBench: Benchmarking Reward Models},
+    author={Lambert, Nathan and Pyatkin, Valentina and Morrison, Jacob and Miranda, LJ and Lin, Bill Yuchen and Chandu, Khyathi and Dziri, Nouha and Kumar, Sachin and Zick, Tom and Choi, Yejin and Smith, Noah A. and Hajishirzi, Hannaneh},
+    year={2024},
+    howpublished={\url{https://huggingface.co/spaces/allenai/reward-bench}
+}
+""",
+                height=15,
+                label="Copy the following to cite these results.",
+                elem_id="citation-button",
+                show_copy_button=True,
+            )
     # Load data when app starts, TODO make this used somewhere...
     # def load_data_on_start():
     #     data_rewardbench = load_all_data(repo_dir_rewardbench)
```
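For readers unfamiliar with Gradio layout, the following is a minimal, self-contained sketch of the structure this commit moves to: the logo goes in a narrower left column, the `TOP_TEXT` header moves to a wider right column sized with `scale`, and a collapsed accordion holds a citation box with a copy button. The `custom_css` and `CITATION` strings here are stand-ins (the Space imports the real ones from `src/css.py` and embeds the full BibTeX shown above), and integer scales are used for portability even though the Space itself passes 1.65 and 3.

```python
import gradio as gr

# Stand-in CSS string; the Space imports the real one from src/css.py.
custom_css = ".gradio-container { max-width: 95%; }"

# Stand-in citation text; the Space embeds the full BibTeX entry shown in the diff.
CITATION = "@misc{RewardBench, title={RewardBench: Benchmarking Reward Models}, year={2024}}"

with gr.Blocks(css=custom_css) as demo:
    # Header row: narrow logo column on the left, wider text column on the right.
    with gr.Row():
        with gr.Column(scale=2):
            gr.Markdown("![](file/src/logo.png)")
        with gr.Column(scale=3):
            gr.Markdown("# RewardBench: Benchmarking Reward Models")
    # Collapsed-by-default citation box with a one-click copy button.
    with gr.Row():
        with gr.Accordion("Citation", open=False):
            gr.Textbox(
                value=CITATION,
                label="Copy the following to cite these results.",
                elem_id="citation-button",
                show_copy_button=True,
            )

if __name__ == "__main__":
    demo.launch()
```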
src/css.py
ADDED

```diff
@@ -0,0 +1,22 @@
+custom_css = """
+
+/* Full width space */
+.gradio-container {
+    max-width: 95%;
+}
+
+/* Text style and margins */
+.markdown-text {
+    font-size: 17px !important;
+}
+
+.tab-buttons button {
+    font-size: 20px;
+}
+
+h1 {
+    font-size: 32px !important;
+    margin-top: 0px !important;
+}
+
+"""
```
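A quick sketch of how this stylesheet hooks into the app: the CSS string is passed once to `gr.Blocks(css=...)`, `.gradio-container` targets Gradio's top-level wrapper, `.tab-buttons button` matches the existing `gr.Tabs(elem_classes="tab-buttons")`, and `h1` styles the heading rendered from `TOP_TEXT`. Assumptions: this is run from the repo root so `src.css` is importable, and the `elem_classes` on `gr.Markdown` below is illustrative — the diff only shows `elem_classes` on `gr.Tabs`.

```python
import gradio as gr
from src.css import custom_css  # the string defined in the new module above

with gr.Blocks(css=custom_css) as demo:          # .gradio-container gets max-width: 95%
    with gr.Tabs(elem_classes="tab-buttons"):    # matched by the `.tab-buttons button` rule
        with gr.TabItem("Leaderboard"):
            # Markdown tagged with this class picks up the 17px `.markdown-text` rule;
            # a leading "# ..." heading renders as <h1> and picks up the h1 rule.
            gr.Markdown("# RewardBench\nresults table goes here", elem_classes="markdown-text")

if __name__ == "__main__":
    demo.launch()
```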
src/md.py
CHANGED

```diff
@@ -16,6 +16,7 @@ We include multiple types of reward models in this evaluation:
 3. **DPO**: Models trained with Direct Preference Optimization (DPO), with modifiers such as `-ref-free` or `-norm` changing how scores are computed.
 4. **Random**: Random choice baseline.
 
+All models are evaluated in fp16 except for Starling-7B, which is evaluated in fp32.
 Others, such as **Generative Judge** are coming soon.
 
 ### Subset Details
@@ -78,11 +79,7 @@ For more details, see the [dataset](https://huggingface.co/datasets/ai2-rlhf-col
 """
 
 TOP_TEXT = """
-# RewardBench
-
-Evaluating the capabilities, safety, and pitfalls of reward models.
-
+# RewardBench: Benchmarking Reward Models
+### Evaluating the capabilities, safety, and pitfalls of reward models
 [Code](https://github.com/allenai/herm) | [Eval. Dataset](https://huggingface.co/datasets/ai2-adapt-dev/rm-benchmark-dev) | [Existing Test Sets](https://huggingface.co/datasets/allenai/pref-test-sets) | [Results](https://huggingface.co/datasets/ai2-adapt-dev/HERM-Results) | Paper (coming soon)
-
-All models are evaluated in fp16 except for Starling-7B, which is evaluated in fp32.
 """
```
|