File size: 5,927 Bytes
4a46abc
4c5e550
 
4a46abc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4c5e550
4a46abc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4c5e550
 
 
4a46abc
4c5e550
 
 
 
 
 
 
 
 
4a46abc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4c5e550
 
4a46abc
 
 
 
 
 
 
4c5e550
 
 
4a46abc
4c5e550
 
 
 
 
4a46abc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
# chat.py
import html
import json
from functools import lru_cache

import gradio as gr
import numpy as np
import pandas as pd
import promptquality as pq

# Name of the promptquality project whose runs are queried by this module.
project_name = "agent-lb-v1"
# Resolved once, at import time, by looking the project up by name.
# NOTE(review): presumably needs an authenticated promptquality session and
# network access when the module is imported -- confirm.
PROJECT_ID = pq.get_project_from_name(project_name).id


@lru_cache(maxsize=1000)
def get_model_score_for_dataset(model, dataset):
    """Collect tool-selection-quality metrics for one model/dataset run.

    The run is looked up by the name ``"<model> <dataset>"`` inside the
    module-level project, and up to 1000 rows are requested. Rows whose
    rationale is empty/falsy are left out of every returned list, so the
    three lists stay aligned with each other.

    Results are memoised per ``(model, dataset)`` pair by ``lru_cache``.

    Returns:
        A dict with keys ``"mean_score"`` (mean of the kept scores, rounded
        to 2 decimals), ``"scores"`` (each rounded to 2 decimals),
        ``"rationales"`` and ``"explanations"``.
    """
    print(f"Getting metrics for {model} {project_name} for dataset {dataset}")
    run_id = pq.get_run_from_name(f"{model} {dataset}", PROJECT_ID).id
    records = pq.get_rows(
        project_id=PROJECT_ID,
        run_id=run_id,
        task_type=None,
        config=None,
        starting_token=0,
        limit=1000,
    )

    scores = []
    rationales = []
    explanations = []
    for record in records:
        metrics = record.metrics
        rationale = metrics.tool_selection_quality_rationale
        if not rationale:
            # No rationale means the row is skipped in all three lists.
            continue
        rationales.append(rationale)
        scores.append(round(metrics.tool_selection_quality, 2))
        explanations.append(metrics.tool_selection_quality_explanation)

    return {
        "mean_score": round(np.mean(scores), 2),
        "scores": scores,
        "rationales": rationales,
        "explanations": explanations,
    }


def get_updated_df(df, data):
    """Attach the metric lists in *data* to *df* as new columns.

    The frame is modified in place and also returned. Columns are written in
    the order ``rationale``, ``explanation``, ``score`` from the keys
    ``rationales``, ``explanations`` and ``scores`` of *data*.
    """
    column_sources = (
        ("rationale", "rationales"),
        ("explanation", "explanations"),
        ("score", "scores"),
    )
    for column, source_key in column_sources:
        df[column] = data[source_key]
    return df


def get_chat_and_score_df(model, dataset):
    """Return the dataset's rows with the model's metric columns attached.

    Metrics are fetched first (via the cached lookup), then
    ``datasets/<dataset>.parquet`` is read and the ``rationale``,
    ``explanation`` and ``score`` columns are added to it.
    """
    metrics = get_model_score_for_dataset(model, dataset)
    conversations = pd.read_parquet(f"datasets/{dataset}.parquet")
    return get_updated_df(conversations, metrics)


def format_chat_message(role, content):
    """Format an individual chat message as a styled HTML block.

    Args:
        role: Speaker label (a string). Its lower-cased form is used for the
            CSS class names; the label itself is shown in the role badge.
        content: Message body. Non-string values are rendered via ``str()``.

    Returns:
        An HTML snippet for the message.
    """
    # Escape everything interpolated into the markup: message text that
    # contains "<", ">", "&" or quotes would otherwise break the surrounding
    # tags or inject markup of its own.
    role_style = html.escape(role.lower())
    role_label = html.escape(role)
    body = html.escape(str(content))
    return f"""
    <div class="message {role_style}">
        <div class="role-badge {role_style}-role">{role_label}</div>
        <div class="content">{body}</div>
    </div>
    """


def format_tool_info(tools):
    """Format tool information as a styled HTML panel.

    Args:
        tools: An iterable of tool mappings, or a JSON string encoding one.
            Each tool is read through ``.get`` for the keys ``name``,
            ``description`` and ``parameters``.

    Returns:
        An HTML panel with one section per tool, or a
        "No tool information available" placeholder when *tools* is falsy
        or is a string that cannot be parsed as JSON.
    """
    unavailable = "<div>No tool information available</div>"

    if isinstance(tools, str):
        try:
            tools = json.loads(tools)
        except ValueError:
            # json.JSONDecodeError subclasses ValueError. A bare ``except``
            # here would also swallow KeyboardInterrupt / SystemExit.
            return unavailable

    if not tools:
        return unavailable

    sections = []
    for tool in tools:
        # Escape free text so tool names/descriptions cannot break the markup.
        name = html.escape(str(tool.get("name", "Unnamed Tool")))
        description = html.escape(
            str(tool.get("description", "No description available"))
        )
        # format_parameters already returns HTML, so it is inserted as-is.
        parameters_html = format_parameters(tool.get("parameters", {}))
        sections.append(f"""
        <div class="tool-section">
            <div class="tool-name">{name}</div>
            <div class="tool-description">{description}</div>
            <div class="tool-parameters">
                {parameters_html}
            </div>
        </div>
        """)
    tool_html = "".join(sections)
    return f'<div class="tool-info-panel">{tool_html}</div>'


def format_parameters(parameters):
    """Render a tool's parameters as HTML rows.

    Args:
        parameters: Mapping of parameter name to its description (any value;
            it is rendered via ``str()``). May be empty or ``None``.

    Returns:
        One ``<div class="parameter">`` snippet per entry, concatenated, or
        a "No parameters" placeholder when *parameters* is falsy.
    """
    if not parameters:
        return "<div>No parameters</div>"

    rows = []
    for name, desc in parameters.items():
        # Escape both parts so descriptions containing "<", ">" or "&"
        # are shown literally instead of being parsed as markup.
        safe_name = html.escape(str(name))
        safe_desc = html.escape(str(desc))
        rows.append(f"""
        <div class="parameter">
            <span class="param-name">{safe_name}:</span> {safe_desc}
        </div>
        """)
    return "".join(rows)


def format_metrics(score, rationale, explanation):
    """Format the metrics display as a styled HTML panel.

    Args:
        score: Numeric score; shown with two decimal places.
        rationale: Rationale text for the score.
        explanation: Explanation text for the score.

    Returns:
        An HTML snippet with Score, Rationale and Explanation sections.
    """
    # Free-text fields are escaped so characters such as "<" or "&" in them
    # are displayed literally rather than interpreted as markup.
    rationale_text = html.escape(str(rationale))
    explanation_text = html.escape(str(explanation))
    return f"""
    <div class="metrics-panel">
        <div class="metric-section">
            <h3>Score</h3>
            <div class="score-display">{score:.2f}</div>
        </div>
        <div class="metric-section">
            <h3>Rationale</h3>
            <div class="explanation-text">{rationale_text}</div>
        </div>
        <div class="metric-section">
            <h3>Explanation</h3>
            <div class="explanation-text">{explanation_text}</div>
        </div>
    </div>
    """


def update_chat_display(df, index):
    """Build the chat, metrics and tool HTML for the row at *index*.

    Args:
        df: DataFrame with the columns ``conversation`` (a JSON string
            holding a list of messages with ``role`` and ``content``),
            ``score``, ``rationale``, ``explanation`` and
            ``tools_langchain``; may be ``None``.
        index: Zero-based row position within *df*.

    Returns:
        A ``(chat_html, metrics_html, tool_html)`` tuple. Placeholder
        snippets are returned when *df* is ``None`` or empty, or when
        *index* is outside ``0 .. len(df) - 1``.
    """
    # Reject negative positions too: ``iloc`` would otherwise count from the
    # end and silently display a different row than the one requested.
    if df is None or df.empty or not 0 <= index < len(df):
        return (
            "<div>No data available</div>",
            "<div>No metrics available</div>",
            "<div>No tool information available</div>",
        )

    row = df.iloc[index]

    # Format chat messages
    messages = json.loads(row["conversation"])
    messages_html = "".join(
        format_chat_message(msg["role"], msg["content"]) for msg in messages
    )
    chat_html = f"""
    <div class="chat-panel">
        {messages_html}
    </div>
    """

    # Format metrics
    metrics_html = format_metrics(row["score"], row["rationale"], row["explanation"])

    # Format tool info
    tool_html = format_tool_info(row["tools_langchain"])

    return chat_html, metrics_html, tool_html


def filter_and_update_display(model, dataset, selected_scores, current_index):
    """Refresh every display output for the chosen model, dataset and scores.

    Loads the scored rows, keeps only rows whose ``score`` is in
    *selected_scores* (when that is non-empty), caps *current_index* at the
    last remaining row, and renders that row.

    Returns:
        A 5-tuple ``(chat_html, metrics_html, tool_html, index_update,
        position_label)``, where ``index_update`` is a ``gr.update`` carrying
        the new maximum and value and ``position_label`` reads ``"i/N"``.
        When nothing matches, or when any exception is raised, placeholder
        panels are returned with the index reset to 0 and the label ``"0/0"``.
    """

    def placeholder(chat_panel):
        # Common "nothing to show" outputs; only the first panel's text varies.
        return (
            chat_panel,
            "<div>No metrics available</div>",
            "<div>No tool information available</div>",
            gr.update(maximum=0, value=0),
            "0/0",
        )

    try:
        frame = get_chat_and_score_df(model, dataset)
        if selected_scores:
            matches = frame["score"].isin(selected_scores)
            frame = frame[matches]

        if frame.empty:
            return placeholder("<div>No data available for selected filters</div>")

        # Cap the requested position at the last row left after filtering.
        total = len(frame)
        last_position = total - 1
        position = min(current_index, last_position)

        chat_html, metrics_html, tool_html = update_chat_display(frame, position)
        index_update = gr.update(maximum=last_position, value=position)
        return (
            chat_html,
            metrics_html,
            tool_html,
            index_update,
            f"{position + 1}/{total}",
        )
    except Exception as e:
        # UI boundary: report the failure in the chat panel instead of raising.
        print(f"Error in filter_and_update_display: {e!s}")
        return placeholder(f"<div>Error: {e!s}</div>")