Spaces:

galileo-ai
/

agent-leaderboard

Running

File size: 5,927 Bytes

# chat.py
import html
import json
from functools import lru_cache

import gradio as gr
import numpy as np
import pandas as pd
import promptquality as pq

# Name of the promptquality project that is looked up below.
project_name = "agent-lb-v1"
# Project id resolved once, when this module is imported; the run and row
# lookups in get_model_score_for_dataset reuse it.
PROJECT_ID = pq.get_project_from_name(project_name).id


@lru_cache(maxsize=1000)
def get_model_score_for_dataset(model, dataset):
    """Collect tool-selection-quality metrics for one model/dataset run.

    The run is looked up by the name ``"{model} {dataset}"`` inside
    ``PROJECT_ID`` and its rows are fetched with ``limit=1000``. Rows whose
    rationale is falsy are dropped from every list, so the three lists stay
    index-aligned. Results are memoised per ``(model, dataset)`` pair by
    ``lru_cache``, so callers share the returned dict.

    Returns:
        A dict with ``mean_score`` (mean of the kept scores, rounded to two
        decimals), ``scores`` (each rounded to two decimals), ``rationales``
        and ``explanations``.
    """
    print(f"Getting metrics for {model} {project_name} for dataset {dataset}")
    run = pq.get_run_from_name(f"{model} {dataset}", PROJECT_ID)
    rows = pq.get_rows(
        project_id=PROJECT_ID,
        run_id=run.id,
        task_type=None,
        config=None,
        starting_token=0,
        limit=1000,
    )

    scores = []
    rationales = []
    explanations = []
    for row in rows:
        metrics = row.metrics
        rationale = metrics.tool_selection_quality_rationale
        if not rationale:
            # Rows without a rationale are excluded from all three lists.
            continue
        scores.append(round(metrics.tool_selection_quality, 2))
        rationales.append(rationale)
        explanations.append(metrics.tool_selection_quality_explanation)

    return {
        "mean_score": round(np.mean(scores), 2),
        "scores": scores,
        "rationales": rationales,
        "explanations": explanations,
    }


def get_updated_df(df, data):
    """Attach the metric lists in ``data`` to ``df`` as columns.

    ``data["rationales"]``, ``data["explanations"]`` and ``data["scores"]``
    are written to the ``rationale``, ``explanation`` and ``score`` columns,
    in that order. ``df`` is modified in place and the same object is
    returned.
    """
    column_sources = (
        ("rationale", "rationales"),
        ("explanation", "explanations"),
        ("score", "scores"),
    )
    for column, key in column_sources:
        df[column] = data[key]
    return df


def get_chat_and_score_df(model, dataset):
    """Load a dataset's parquet file and add the model's metric columns.

    Metrics are fetched first via ``get_model_score_for_dataset``; the frame
    is then read from ``datasets/{dataset}.parquet`` and passed through
    ``get_updated_df``, which adds the rationale, explanation and score
    columns.
    """
    metrics = get_model_score_for_dataset(model, dataset)
    frame = pd.read_parquet(f"datasets/{dataset}.parquet")
    return get_updated_df(frame, metrics)


def format_chat_message(role, content):
    """Format individual chat messages with proper styling.

    Args:
        role: Speaker label; its lower-cased form is used as a CSS class.
        content: Message body. Converted with ``str`` before rendering.

    Returns:
        An HTML fragment for one message. Role and content are HTML-escaped
        so that text containing ``<``, ``>``, ``&`` or quotes is displayed
        literally instead of being interpreted as markup.
    """
    role_style = html.escape(role.lower())
    safe_role = html.escape(str(role))
    safe_content = html.escape(str(content))
    return f"""
    <div class="message {role_style}">
        <div class="role-badge {role_style}-role">{safe_role}</div>
        <div class="content">{safe_content}</div>
    </div>
    """


def format_tool_info(tools):
    """Format tool information with proper styling.

    Args:
        tools: A list of tool dicts, or a JSON string encoding one. Each
            tool's ``name``, ``description`` and ``parameters`` keys are
            read; all three are optional. A single dict is treated as a
            one-element list.

    Returns:
        HTML for the tool panel, or a "No tool information available"
        placeholder when ``tools`` is empty, is not valid JSON, or is a JSON
        string that decodes to a scalar.
    """
    unavailable = "<div>No tool information available</div>"

    if isinstance(tools, str):
        try:
            tools = json.loads(tools)
        except ValueError:
            # json.JSONDecodeError subclasses ValueError. Catching only that
            # lets unrelated errors (e.g. KeyboardInterrupt) propagate.
            return unavailable
        if not isinstance(tools, (list, dict)):
            # A JSON number/string/bool cannot be rendered as tools.
            return unavailable

    if not tools:
        return unavailable

    if isinstance(tools, dict):
        # Iterating a dict would yield its keys, not tool descriptions.
        tools = [tools]

    sections = []
    for tool in tools:
        name = html.escape(str(tool.get('name', 'Unnamed Tool')))
        description = html.escape(
            str(tool.get('description', 'No description available'))
        )
        # format_parameters returns ready-made HTML, so it is not escaped here.
        sections.append(f"""
        <div class="tool-section">
            <div class="tool-name">{name}</div>
            <div class="tool-description">{description}</div>
            <div class="tool-parameters">
                {format_parameters(tool.get('parameters', {}))}
            </div>
        </div>
        """)
    tool_html = "".join(sections)
    return f'<div class="tool-info-panel">{tool_html}</div>'


def format_parameters(parameters):
    """Render a tool's parameter mapping as HTML rows.

    Args:
        parameters: Mapping of parameter name to its description/value, or a
            falsy value when the tool has none.

    Returns:
        One ``<div class="parameter">`` fragment per entry, concatenated, or
        a "No parameters" placeholder for an empty/falsy mapping. Names and
        values are converted with ``str`` and HTML-escaped so they are shown
        literally.
    """
    if not parameters:
        return "<div>No parameters</div>"

    rows = []
    for name, desc in parameters.items():
        safe_name = html.escape(str(name))
        safe_desc = html.escape(str(desc))
        rows.append(f"""
        <div class="parameter">
            <span class="param-name">{safe_name}:</span> {safe_desc}
        </div>
        """)
    return "".join(rows)


def format_metrics(score, rationale, explanation):
    """Format metrics display with proper styling.

    Args:
        score: Numeric score, rendered with two decimal places.
        rationale: Free-text rationale for the score.
        explanation: Free-text explanation for the score.

    Returns:
        HTML for the metrics panel. Rationale and explanation are converted
        with ``str`` and HTML-escaped so characters such as ``<`` and ``&``
        are displayed literally.
    """
    safe_rationale = html.escape(str(rationale))
    safe_explanation = html.escape(str(explanation))
    return f"""
    <div class="metrics-panel">
        <div class="metric-section">
            <h3>Score</h3>
            <div class="score-display">{score:.2f}</div>
        </div>
        <div class="metric-section">
            <h3>Rationale</h3>
            <div class="explanation-text">{safe_rationale}</div>
        </div>
        <div class="metric-section">
            <h3>Explanation</h3>
            <div class="explanation-text">{safe_explanation}</div>
        </div>
    </div>
    """


def update_chat_display(df, index):
    """Update the chat visualization for a specific index.

    Args:
        df: DataFrame whose rows provide the ``conversation`` (JSON string of
            messages with ``role`` and ``content``), ``score``,
            ``rationale``, ``explanation`` and ``tools_langchain`` columns.
        index: Zero-based positional row index.

    Returns:
        A ``(chat_html, metrics_html, tool_html)`` tuple. Placeholder HTML is
        returned when ``df`` is ``None`` or empty, or when ``index`` falls
        outside ``0 .. len(df) - 1``.
    """
    # Negative positions are rejected too: ``iloc`` would otherwise wrap
    # around and silently show a row counted from the end.
    if df is None or df.empty or not 0 <= index < len(df):
        return (
            "<div>No data available</div>",
            "<div>No metrics available</div>",
            "<div>No tool information available</div>",
        )

    row = df.iloc[index]

    # Format chat messages
    messages = json.loads(row["conversation"])
    chat_html = f"""
    <div class="chat-panel">
        {"".join([format_chat_message(msg["role"], msg["content"]) 
                 for msg in messages])}
    </div>
    """

    # Format metrics
    metrics_html = format_metrics(row["score"], row["rationale"], row["explanation"])

    # Format tool info
    tool_html = format_tool_info(row["tools_langchain"])

    return chat_html, metrics_html, tool_html


def _blank_display(chat_html):
    """Build the five-output fallback used when no row can be displayed."""
    return (
        chat_html,
        "<div>No metrics available</div>",
        "<div>No tool information available</div>",
        gr.update(maximum=0, value=0),
        "0/0",
    )


def filter_and_update_display(model, dataset, selected_scores, current_index):
    """Load, filter and render the row at ``current_index``.

    Args:
        model: Model name passed to ``get_chat_and_score_df``.
        dataset: Dataset name passed to ``get_chat_and_score_df``.
        selected_scores: Score values to keep; a falsy value disables the
            filter.
        current_index: Requested zero-based row position within the filtered
            rows. It is clamped into ``0 .. len(rows) - 1``.

    Returns:
        A 5-tuple ``(chat_html, metrics_html, tool_html, index_update,
        position_label)`` where ``index_update`` is a ``gr.update`` carrying
        the new maximum and value, and ``position_label`` reads like
        ``"3/10"``. If nothing matches, or any exception is raised, a
        fallback tuple with ``"0/0"`` is returned instead of raising.
    """
    try:
        # Get data and filter by scores
        df_chat = get_chat_and_score_df(model, dataset)
        if selected_scores:
            df_chat = df_chat[df_chat["score"].isin(selected_scores)]

        if df_chat.empty:
            return _blank_display("<div>No data available for selected filters</div>")

        # Clamp on both sides: a negative position would otherwise reach
        # ``iloc`` and produce a mismatched row and position label.
        max_index = len(df_chat) - 1
        current_index = max(0, min(current_index, max_index))

        # Get displays for current index
        chat_html, metrics_html, tool_html = update_chat_display(df_chat, current_index)

        return (
            chat_html,
            metrics_html,
            tool_html,
            gr.update(maximum=max_index, value=current_index),
            f"{current_index + 1}/{len(df_chat)}",
        )
    except Exception as e:
        # UI boundary: report the failure in the panel rather than raising.
        print(f"Error in filter_and_update_display: {str(e)}")
        # The message is escaped because exception text may contain markup
        # characters (e.g. reprs with angle brackets).
        return _blank_display(f"<div>Error: {html.escape(str(e))}</div>")