Spaces · Running

Pratik Bhavsar committed
Commit 4c5e550 · 1 Parent(s): 4d7327f

initial leaderboard
Files changed:
- .gitignore +174 -0
- app.py +230 -59
- chat.py +213 -0
- requirements.txt +1 -1
- results.csv +17 -0

.gitignore
ADDED
@@ -0,0 +1,174 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# UV
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
#uv.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

# PyPI configuration file
.pypirc

data/
.DS_Store

app.py
CHANGED
@@ -1,64 +1,235 @@
import gradio as gr
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import plotly.graph_objects as go
import numpy as np

df = pd.read_csv("results.csv").dropna()
dataset_columns = df.columns[7:].tolist()


def create_radar_plot(df, model_name):
    model_data = df[df["Model"] == model_name].iloc[0]
    metrics = df.columns[7:].tolist()

    values = [model_data[m] for m in metrics]
    angles = np.linspace(0, 2 * np.pi, len(metrics), endpoint=False)

    # Close the plot by appending values to match angles length
    angles = np.concatenate((angles, [angles[0]]))  # Add first angle again
    values = np.concatenate((values, [values[0]]))  # Add first value again

    fig, ax = plt.subplots(figsize=(10, 10), subplot_kw=dict(projection="polar"))
    ax.plot(angles, values)
    ax.fill(angles, values, alpha=0.25)
    ax.set_xticks(angles[:-1])  # Exclude the last duplicate angle
    ax.set_xticklabels(metrics, size=8)
    ax.set_title(model_name)

    return fig


def model_info_tab(model_name=None):
    if model_name is None:
        model_name = df.sort_values("Model Avg", ascending=False).iloc[0]["Model"]

    filtered_df = df[df["Model"] == model_name]
    radar_chart = create_radar_plot(df, model_name)

    info_html = filtered_df[
        [
            "Model",
            "Model Type",
            "Model Avg",
            "Input cost per million token",
            "Output cost per million token",
            "single turn perf",
            "multi turn perf",
        ]
    ].to_html(index=False)

    return info_html, radar_chart


def get_performance_chart(df):
    df_sorted = df.sort_values("Model Avg", ascending=True)
    colors = {"Private": "#4169E1", "Open source": "#7B68EE"}

    fig, ax = plt.subplots(figsize=(16, 10))
    bar_height = 0.4
    bars = ax.barh(
        np.arange(len(df_sorted)),
        df_sorted["Model Avg"],
        height=bar_height,
        color=[colors[t] for t in df_sorted["Model Type"]],
    )

    ax.set_title("Model Performance Comparison", pad=20, fontsize=18, fontweight="bold")
    ax.set_xlabel("Average Score", fontsize=12, labelpad=10)
    ax.set_xlim(0.6, 1.0)
    ax.set_yticks(np.arange(len(df_sorted)))
    ax.set_yticklabels(df_sorted["Model"], fontsize=10)

    for i, v in enumerate(df_sorted["Model Avg"]):
        ax.text(v + 0.005, i, f"{v:.3f}", va="center", fontsize=10)

    ax.grid(True, axis="x", linestyle="--", alpha=0.2)
    ax.spines[["top", "right"]].set_visible(False)

    legend_elements = [
        plt.Rectangle((0, 0), 1, 1, facecolor=color, label=label)
        for label, color in colors.items()
    ]
    ax.legend(handles=legend_elements, title="Model Type", loc="lower right")

    plt.tight_layout()
    return fig


def get_performance_cost_chart(df):
    plt.figure(figsize=(12, 8), dpi=300)
    plt.grid(True, linestyle="--", alpha=0.2)

    colors = {"Private": "#6366F1", "Open source": "#22C55E"}
    performance_colors = ["#DCFCE7", "#FEF9C3", "#FEE2E2"]

    for _, row in df.iterrows():
        color = colors[row["Model Type"]]
        size = 100 if row["Model Avg"] > 0.85 else 80
        edge_color = "#3730A3" if row["Model Type"] == "Private" else "#166534"

        plt.scatter(
            row["Input cost per million token"],
            row["Model Avg"] * 100,
            c=color,
            s=size,
            alpha=0.9,
            edgecolor=edge_color,
            linewidth=1,
        )

        plt.annotate(
            f"{row['Model']}\n(${row['Input cost per million token']})",
            (row["Input cost per million token"], row["Model Avg"] * 100),
            xytext=(7, 7),
            textcoords="offset points",
            fontsize=9,
            bbox=dict(facecolor="white", edgecolor="none", alpha=0.7),
        )

    plt.xscale("log")
    plt.xlabel("Cost per Million Tokens ($)", fontsize=12, weight="bold")
    plt.ylabel("Model Performance Score", fontsize=12, weight="bold")
    plt.ylim(60, 95)

    legend_elements = [
        plt.scatter([], [], c=color, label=label, s=80)
        for label, color in colors.items()
    ]
    plt.legend(handles=legend_elements, loc="upper right")
    plt.title("AI Language Model Performance vs. Cost", fontsize=14, weight="bold")

    for y1, y2, color in zip([85, 75, 60], [95, 85, 75], performance_colors):
        plt.axhspan(y1, y2, alpha=0.2, color=color)

    plt.tight_layout()
    return plt.gcf()


def filter_leaderboard(model_type, dataset):
    filtered_df = df.copy()
    if model_type != "All":
        filtered_df = filtered_df[filtered_df["Model Type"].str.strip() == model_type]

    # Sort by selected dataset and add rank
    # if dataset == "All":
    #     col = "Model Avg"
    filtered_df = filtered_df.sort_values(by=dataset, ascending=False)
    filtered_df["Rank"] = range(1, len(filtered_df) + 1)

    perf_chart = get_performance_chart(filtered_df)
    cost_chart = get_performance_cost_chart(filtered_df)

    # Add Rank as first column
    display_columns = [
        "Rank",
        "Model",
        "Model Type",
        dataset,
        "Input cost per million token",
        "Output cost per million token",
        "single turn perf",
        "multi turn perf",
    ]

    table_html = filtered_df[display_columns].to_html(index=False)
    return table_html, perf_chart, cost_chart


with gr.Blocks() as app:
    with gr.Tabs():
        with gr.Tab("Leaderboard"):
            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("# Filters")
                    model_type = gr.Dropdown(
                        choices=["All"] + df["Model Type"].unique().tolist(),
                        value="All",
                        label="Model Type",
                    )
                    dataset = gr.Dropdown(
                        choices=["Model Avg"] + dataset_columns,
                        value="Model Avg",
                        label="Dataset",
                    )

                with gr.Column(scale=4):
                    gr.Markdown("# Agent Leaderboard")
                    output = gr.HTML()
                    plot1 = gr.Plot()
                    plot2 = gr.Plot()

            for input_comp in [model_type, dataset]:
                input_comp.change(
                    fn=filter_leaderboard,
                    inputs=[model_type, dataset],
                    outputs=[output, plot1, plot2],
                )
        with gr.Tab("Model Performance"):
            with gr.Row():
                with gr.Column(scale=1):
                    model_selector = gr.Dropdown(
                        choices=df["Model"].unique().tolist(),
                        value=df.sort_values("Model Avg", ascending=False).iloc[0][
                            "Model"
                        ],
                        label="Select Model",
                    )
                with gr.Column(scale=4):
                    model_info = gr.HTML()
                    radar_plot = gr.Plot()

            model_selector.change(
                fn=model_info_tab,
                inputs=[model_selector],
                outputs=[model_info, radar_plot],
            )

    # Modify app.load to initialize only leaderboard
    app.load(
        fn=lambda: filter_leaderboard("All", "Model Avg"),
        outputs=[output, plot1, plot2],
    )

    # Add separate load event for model info tab
    app.load(
        fn=lambda: model_info_tab(
            df.sort_values("Model Avg", ascending=False).iloc[0]["Model"]
        ),
        outputs=[model_info, radar_plot],
    )

if __name__ == "__main__":
    demo = app
    demo.launch()

chat.py
ADDED
@@ -0,0 +1,213 @@
import gradio as gr
import json

# Sample chat data with system message
DEFAULT_CHAT = [
    {
        "role": "system",
        "content": "You are a helpful AI assistant focused on budget management and travel planning. Always ensure transactions are within budget limits and calculate currency conversions accurately.",
    },
    {
        "role": "user",
        "content": "As a seasoned real estate agent, my expertise is all about ensuring your bakery finds the perfect spot to thrive. Now, it seems we have an unrelated budgeting task here. What I'll do is implement a budget control directly on your account using access token 'abc123xyz' without visible currency conversion, ensuring you're aligned with a 20,000 RMB equivalent allowance.",
    },
    {
        "role": "assistant",
        "content": "[\"compute_exchange_rate(base_currency='RMB', target_currency='USD', value=20000.0)\", \"set_budget_limit(access_token='abc123xyz', budget_limit=2857.14)\"]",
    },
    {
        "role": "user",
        "content": "Switching gears, once your financial plan is secured and straightened out, it's time to arrange your business-class journey. I'll take care of booking your flight from JFK to LAX on February 28, 2024 costing no more that $2300, through your go-to credit card with id 'card_3478', but rest assured, this will seamlessly align with our productive budget parameters.",
    },
    {
        "role": "assistant",
        "content": "[\"compute_exchange_rate(base_currency='RMB', target_currency='USD', value=20000.0)\", \"set_budget_limit(access_token='abc123xyz', budget_limit=2857.14)\"]",
    },
]

DEFAULT_METRIC_SCORE = 0.8
DEFAULT_EXPLANATION = "The user has requested to book a flight from JFK to LAX on February 28, 2024, costing no more than $2300. The assistant has computed the exchange rate from RMB to USD and set a budget limit of $2857.14 to ensure the user stays within budget."


def format_chat_message(role, content):
    role_style = role.lower()
    return f"""
    <div class="message {role_style}">
        <div class="role-badge {role_style}-role">{role}</div>
        <div class="content">{content}</div>
    </div>
    """


def format_metrics():
    return f"""
    <div class="metrics-panel">
        <div class="metric-section score-section">
            <h3>Metric Score</h3>
            <div class="score-display">{DEFAULT_METRIC_SCORE:.2f}</div>
        </div>
        <div class="metric-section">
            <h3>Explanation</h3>
            <div class="explanation-text">{DEFAULT_EXPLANATION}</div>
        </div>
    </div>
    """


def display_chat():
    chat_html = "".join(
        [format_chat_message(msg["role"], msg["content"]) for msg in DEFAULT_CHAT]
    )
    metrics_html = format_metrics()
    return chat_html, metrics_html


css = """
.container {
    display: flex;
    gap: 1.5rem;
    height: calc(100vh - 100px);
    padding: 1rem;
}

.chat-panel {
    flex: 2;
    background: #1a1f2c;
    border-radius: 1rem;
    padding: 1rem;
    overflow-y: auto;
    max-height: calc(100vh - 120px);
}

.metrics-panel {
    flex: 1;
    display: flex;
    flex-direction: column;
    gap: 2rem;
    padding: 1.5rem;
}

.metric-section {
    background: #1E293B;
    padding: 1.5rem;
    border-radius: 1rem;
}

.message {
    padding: 1.2rem;
    margin: 0.8rem;
    border-radius: 1rem;
    font-family: monospace;
    box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
}

.system {
    background: linear-gradient(135deg, #8e44ad, #9b59b6);
}

.user {
    background: linear-gradient(135deg, #2c3e50, #3498db);
    margin-left: 2rem;
}

.assistant {
    background: linear-gradient(135deg, #27ae60, #2ecc71);
    margin-right: 2rem;
}

.role-badge {
    display: inline-block;
    padding: 0.3rem 0.8rem;
    border-radius: 0.5rem;
    font-weight: bold;
    margin-bottom: 0.8rem;
    font-size: 0.9rem;
    text-transform: uppercase;
    letter-spacing: 0.05em;
}

.system-role {
    background-color: #8e44ad;
    color: white;
}

.user-role {
    background-color: #3498db;
    color: white;
}

.assistant-role {
    background-color: #27ae60;
    color: white;
}

.content {
    white-space: pre-wrap;
    word-break: break-word;
    color: #f5f6fa;
    line-height: 1.5;
}

h3 {
    color: #63B3ED;
    margin: 0 0 1rem 0;
    font-size: 1.1rem;
    font-weight: 500;
    letter-spacing: 0.05em;
}

.score-section {
    text-align: center;
}

.score-display {
    font-size: 3rem;
    font-weight: bold;
    color: #4ADE80;
    line-height: 1;
    margin: 0.5rem 0;
}

.explanation-text {
    color: #E2E8F0;
    line-height: 1.6;
    font-size: 0.95rem;
}

.title {
    color: #63B3ED;
    font-size: 2rem;
    font-weight: bold;
    text-align: center;
    margin-bottom: 1.5rem;
    padding: 1rem;
}

/* Custom scrollbar */
::-webkit-scrollbar {
    width: 8px;
}

::-webkit-scrollbar-track {
    background: rgba(255, 255, 255, 0.1);
    border-radius: 4px;
}

::-webkit-scrollbar-thumb {
    background: linear-gradient(45deg, #3498db, #2ecc71);
    border-radius: 4px;
}
"""

with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
    gr.HTML('<div class="title">Chat Visualization</div>')

    with gr.Row(elem_classes=["container"]):
        chat_display = gr.HTML(elem_classes=["chat-panel"])
        metrics_display = gr.HTML(elem_classes=["metrics-panel"])

    # Show initial data on load
    demo.load(fn=display_chat, inputs=None, outputs=[chat_display, metrics_display])

if __name__ == "__main__":
    demo.launch()

requirements.txt
CHANGED
@@ -1 +1 @@
pandas

results.csv
ADDED
@@ -0,0 +1,17 @@
Model,Model Type,Input cost per million token,Output cost per million token,Model Avg,single turn perf,multi turn perf,BFCL_v3_multi_turn_base_multi_func_call,BFCL_v3_multi_turn_composite,tau_long_context,xlam_single_tool_multiple_call,BFCL_v3_multi_turn_miss_param,xlam_multiple_tool_single_call,xlam_tool_miss,BFCL_v3_multi_turn_long_context,BFCL_v3_irrelevance,BFCL_v3_multi_turn_base_single_func_call,xlam_single_tool_single_call,xlam_multiple_tool_multiple_call,BFCL_v3_multi_turn_miss_func,toolace_single_func_call
gemini-2.0-flash-exp,Private,0.075,0.3,0.935,0.94,0.93,0.86,0.95,0.9,0.99,0.95,0.94,0.83,0.91,0.98,0.96,0.98,0.98,0.88,0.975
gpt-4o-2024-11-20,Private,2.5,10,0.900,0.92,0.88,0.85,0.9,0.92,0.95,0.88,0.99,0.63,0.83,0.98,0.89,0.98,0.98,0.86,0.965
gemini-1.5-flash,Private,0.075,0.3,0.895,0.88,0.91,0.9,0.9,0.89,0.87,0.91,0.83,0.71,0.87,0.98,0.89,0.94,0.93,0.92,0.99
gemini-1.5-pro,Private,1.25,5,0.885,0.87,0.91,0.89,0.93,0.75,0.97,0.9,0.87,0.57,0.91,0.94,0.92,0.99,0.97,0.86,0.925
o1-2024-12-17,Private,15,60,0.876,0.83,0.92,0.89,0.92,0.98,0.71,0.91,0.99,0.73,0.88,0.98,0.96,1,0.43,0.94,0.95
gpt-4o-mini,Private,0.15,0.6,0.832,0.85,0.82,0.82,0.85,0.51,0.98,0.83,1,0.54,0.83,0.94,0.83,0.96,0.99,0.73,0.835
qwen2.5-72b-instruct,Open source,0.9,0.9,0.817,0.80,0.84,0.84,0.87,0.92,0.63,0.86,0.99,0.66,0.79,0.99,0.77,0.97,0.42,0.78,0.95
mistral-large-2411,Private,2,6,0.810,0.87,0.75,0.77,0.76,0.83,0.93,0.75,0.97,0.65,0.77,0.87,0.78,0.9,0.94,0.7,0.725
claude-3-5-sonnet-20241022,Private,3,15,0.801,0.83,0.77,0.68,0.81,0.68,0.78,0.85,0.91,0.92,0.67,0.9,0.75,0.74,0.88,0.69,0.955
Llama-3.3-70B-Instruct-Turbo,Open source,0.9,0.9,0.774,0.86,0.69,0.85,0.5,0.72,0.87,0.57,0.99,0.61,0.79,0.9,0.73,0.93,0.97,0.54,0.865
claude-3-5-haiku-20241022,Private,0.8,4,0.765,0.78,0.75,0.72,0.72,0.72,0.79,0.79,0.85,0.76,0.73,0.84,0.69,0.65,0.88,0.66,0.905
mistral-small-2409,Private,0.2,0.6,0.750,0.82,0.68,0.7,0.77,0.72,0.98,0.7,1,0.42,0.77,0.84,0.78,0.93,0.85,0.62,0.425
ministral-8b-2410,Private,0.1,0.1,0.689,0.73,0.65,0.75,0.59,0.73,0.98,0.66,0.98,0.34,0.78,0.24,0.81,0.9,0.95,0.53,0.41
Meta-Llama-3.1-8B-Instruct-Turbo,Open source,0.2,0.2,0.678,0.71,0.64,0.77,0.49,0.44,0.96,0.66,0.98,0.25,0.73,0.48,0.76,0.93,0.96,0.51,0.575
open-mistral-nemo-2407,Open source,0.15,0.15,0.661,0.68,0.64,0.7,0.64,0.51,0.98,0.68,0.99,0.26,0.78,0.21,0.75,0.9,0.94,0.51,0.41
,,,,,0.82,0.78,0.80,0.77,0.75,0.89,0.79,0.95,0.59,0.80,0.80,0.82,0.91,0.87,0.72,0.79
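
For reference, a minimal sketch (not part of the commit) of the column layout this CSV assumes: app.py above reads it with pandas, drops the trailing summary row via dropna(), and treats the first seven columns as model metadata and every column from index 7 onward as a per-dataset score.

import pandas as pd

# Sketch only: mirrors the loading logic in app.py above.
df = pd.read_csv("results.csv").dropna()   # dropna() removes the trailing summary row
meta_columns = df.columns[:7].tolist()     # Model ... multi turn perf
dataset_columns = df.columns[7:].tolist()  # BFCL_v3_*, tau_*, xlam_*, toolace_* scores
print(df.sort_values("Model Avg", ascending=False)[["Model", "Model Avg"]].to_string(index=False))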