Spaces:
Running
Running
add more llms
Browse files- app.py +73 -16
- gen_table.py +12 -4
- meta_data.py +12 -4
- preprocess.py +180 -0
- src/detail_math_score.json +1523 -203
- src/detail_results.csv +101 -0
- src/overall_filtered_results.csv +51 -0
- src/overall_math_score.json +657 -57
- src/overall_results.csv +51 -0
- src/record.csv +148 -0
app.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1 |
import abc
|
2 |
import gradio as gr
|
|
|
3 |
|
4 |
from gen_table import *
|
5 |
from meta_data import *
|
@@ -17,13 +18,25 @@ with gr.Blocks(title="Open Agent Leaderboard") as demo:
|
|
17 |
DATASETS.remove('META')
|
18 |
print(DATASETS)
|
19 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
|
21 |
gr.Markdown(LEADERBORAD_INTRODUCTION.format(EVAL_TIME))
|
22 |
with gr.Tabs(elem_classes='tab-buttons') as tabs:
|
23 |
with gr.Tab(label='🏅 Open Agent Overall Math Leaderboard'):
|
24 |
gr.Markdown(LEADERBOARD_MD['MATH_MAIN'])
|
25 |
-
check_box
|
26 |
-
|
|
|
27 |
|
28 |
type_map = check_box['type_map']
|
29 |
type_map['Rank'] = 'number'
|
@@ -35,6 +48,21 @@ with gr.Blocks(title="Open Agent Leaderboard") as demo:
|
|
35 |
interactive=True,
|
36 |
)
|
37 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
38 |
initial_headers = ['Rank'] + check_box['essential'] + checkbox_group.value
|
39 |
available_headers = [h for h in initial_headers if h in overall_table.columns]
|
40 |
|
@@ -46,33 +74,55 @@ with gr.Blocks(title="Open Agent Leaderboard") as demo:
|
|
46 |
wrap=True,
|
47 |
visible=True)
|
48 |
|
49 |
-
def filter_df(fields, *args):
|
50 |
headers = ['Rank'] + check_box['essential'] + fields
|
51 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
52 |
|
53 |
# Ensure all requested columns exist
|
54 |
-
available_headers = [h for h in headers if h in
|
55 |
|
56 |
-
original_columns =
|
57 |
available_headers = sorted(available_headers, key=lambda x: original_columns.index(x))
|
58 |
|
59 |
-
|
60 |
# If no columns are available, return an empty DataFrame with basic columns
|
61 |
if not available_headers:
|
62 |
available_headers = ['Rank'] + check_box['essential']
|
63 |
|
64 |
comp = gr.components.DataFrame(
|
65 |
-
value=
|
66 |
type='pandas',
|
67 |
datatype=[type_map[x] for x in available_headers],
|
68 |
interactive=False,
|
69 |
wrap=True,
|
70 |
visible=True)
|
|
|
71 |
return comp
|
72 |
|
|
|
73 |
checkbox_group.change(
|
74 |
fn=filter_df,
|
75 |
-
inputs=[checkbox_group],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
76 |
outputs=data_component
|
77 |
)
|
78 |
|
@@ -84,7 +134,12 @@ with gr.Blocks(title="Open Agent Leaderboard") as demo:
|
|
84 |
results_detail = struct_detail['results']
|
85 |
|
86 |
table, check_box = BUILD_L2_DF(results_detail, DEFAULT_MATH_BENCH)
|
87 |
-
|
|
|
|
|
|
|
|
|
|
|
88 |
type_map = check_box['type_map']
|
89 |
type_map['Rank'] = 'number'
|
90 |
|
@@ -112,12 +167,12 @@ with gr.Blocks(title="Open Agent Leaderboard") as demo:
|
|
112 |
interactive=True
|
113 |
)
|
114 |
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
|
122 |
data_component = gr.components.DataFrame(
|
123 |
value=table[headers],
|
@@ -159,6 +214,7 @@ with gr.Blocks(title="Open Agent Leaderboard") as demo:
|
|
159 |
interactive=False,
|
160 |
wrap=True,
|
161 |
visible=True)
|
|
|
162 |
return comp
|
163 |
|
164 |
# 为所有复选框组添加change事件
|
@@ -196,6 +252,7 @@ with gr.Blocks(title="Open Agent Leaderboard") as demo:
|
|
196 |
show_copy_button=True,
|
197 |
)
|
198 |
|
|
|
199 |
|
200 |
if __name__ == '__main__':
|
201 |
demo.launch(server_name='0.0.0.0')
|
|
|
1 |
import abc
|
2 |
import gradio as gr
|
3 |
+
import os
|
4 |
|
5 |
from gen_table import *
|
6 |
from meta_data import *
|
|
|
18 |
DATASETS.remove('META')
|
19 |
print(DATASETS)
|
20 |
|
21 |
+
# 确保在定义llm_options之前生成overall_table
|
22 |
+
check_box = BUILD_L1_DF(results, DEFAULT_MATH_BENCH)
|
23 |
+
overall_table = generate_table(results, DEFAULT_MATH_BENCH)
|
24 |
+
|
25 |
+
# 保存完整的overall_table为CSV文件
|
26 |
+
csv_path_overall = os.path.join(os.getcwd(), 'src/overall_results.csv')
|
27 |
+
overall_table.to_csv(csv_path_overall, index=False)
|
28 |
+
print(f"Overall results saved to {csv_path_overall}")
|
29 |
+
|
30 |
+
# 从overall_table中提取所有可能的LLM选项
|
31 |
+
llm_options = list(set(row.LLM for row in overall_table.itertuples() if hasattr(row, 'LLM')))
|
32 |
|
33 |
gr.Markdown(LEADERBORAD_INTRODUCTION.format(EVAL_TIME))
|
34 |
with gr.Tabs(elem_classes='tab-buttons') as tabs:
|
35 |
with gr.Tab(label='🏅 Open Agent Overall Math Leaderboard'):
|
36 |
gr.Markdown(LEADERBOARD_MD['MATH_MAIN'])
|
37 |
+
# 移动check_box和overall_table的定义到这里
|
38 |
+
# check_box = BUILD_L1_DF(results, DEFAULT_MATH_BENCH)
|
39 |
+
# overall_table = generate_table(results, DEFAULT_MATH_BENCH)
|
40 |
|
41 |
type_map = check_box['type_map']
|
42 |
type_map['Rank'] = 'number'
|
|
|
48 |
interactive=True,
|
49 |
)
|
50 |
|
51 |
+
# 新增的CheckboxGroup组件用于选择Algorithm和LLM
|
52 |
+
algo_name = gr.CheckboxGroup(
|
53 |
+
choices=ALGORITHMS,
|
54 |
+
value=ALGORITHMS,
|
55 |
+
label='Algorithm',
|
56 |
+
interactive=True
|
57 |
+
)
|
58 |
+
|
59 |
+
llm_name = gr.CheckboxGroup(
|
60 |
+
choices=llm_options, # 使用提取的llm_options
|
61 |
+
value=llm_options,
|
62 |
+
label='LLM',
|
63 |
+
interactive=True
|
64 |
+
)
|
65 |
+
|
66 |
initial_headers = ['Rank'] + check_box['essential'] + checkbox_group.value
|
67 |
available_headers = [h for h in initial_headers if h in overall_table.columns]
|
68 |
|
|
|
74 |
wrap=True,
|
75 |
visible=True)
|
76 |
|
77 |
+
def filter_df(fields, algos, llms, *args):
|
78 |
headers = ['Rank'] + check_box['essential'] + fields
|
79 |
+
df = overall_table.copy()
|
80 |
+
|
81 |
+
# 添加过滤逻辑
|
82 |
+
df['flag'] = df.apply(lambda row: (
|
83 |
+
row['Algorithm'] in algos and
|
84 |
+
row['LLM'] in llms
|
85 |
+
), axis=1)
|
86 |
+
|
87 |
+
df = df[df['flag']].copy()
|
88 |
+
df.pop('flag')
|
89 |
|
90 |
# Ensure all requested columns exist
|
91 |
+
available_headers = [h for h in headers if h in df.columns]
|
92 |
|
93 |
+
original_columns = df.columns.tolist()
|
94 |
available_headers = sorted(available_headers, key=lambda x: original_columns.index(x))
|
95 |
|
|
|
96 |
# If no columns are available, return an empty DataFrame with basic columns
|
97 |
if not available_headers:
|
98 |
available_headers = ['Rank'] + check_box['essential']
|
99 |
|
100 |
comp = gr.components.DataFrame(
|
101 |
+
value=df[available_headers],
|
102 |
type='pandas',
|
103 |
datatype=[type_map[x] for x in available_headers],
|
104 |
interactive=False,
|
105 |
wrap=True,
|
106 |
visible=True)
|
107 |
+
|
108 |
return comp
|
109 |
|
110 |
+
# 更新change事件以包含新的过滤条件
|
111 |
checkbox_group.change(
|
112 |
fn=filter_df,
|
113 |
+
inputs=[checkbox_group, algo_name, llm_name],
|
114 |
+
outputs=data_component
|
115 |
+
)
|
116 |
+
|
117 |
+
algo_name.change(
|
118 |
+
fn=filter_df,
|
119 |
+
inputs=[checkbox_group, algo_name, llm_name],
|
120 |
+
outputs=data_component
|
121 |
+
)
|
122 |
+
|
123 |
+
llm_name.change(
|
124 |
+
fn=filter_df,
|
125 |
+
inputs=[checkbox_group, algo_name, llm_name],
|
126 |
outputs=data_component
|
127 |
)
|
128 |
|
|
|
134 |
results_detail = struct_detail['results']
|
135 |
|
136 |
table, check_box = BUILD_L2_DF(results_detail, DEFAULT_MATH_BENCH)
|
137 |
+
|
138 |
+
# 保存完整的table为CSV文件
|
139 |
+
csv_path_detail = os.path.join(os.getcwd(), 'src/detail_results.csv')
|
140 |
+
table.to_csv(csv_path_detail, index=False)
|
141 |
+
print(f"Detail results saved to {csv_path_detail}")
|
142 |
+
|
143 |
type_map = check_box['type_map']
|
144 |
type_map['Rank'] = 'number'
|
145 |
|
|
|
167 |
interactive=True
|
168 |
)
|
169 |
|
170 |
+
llm_name = gr.CheckboxGroup(
|
171 |
+
choices=check_box['LLM_options'],
|
172 |
+
value=check_box['LLM_options'],
|
173 |
+
label='LLM',
|
174 |
+
interactive=True
|
175 |
+
)
|
176 |
|
177 |
data_component = gr.components.DataFrame(
|
178 |
value=table[headers],
|
|
|
214 |
interactive=False,
|
215 |
wrap=True,
|
216 |
visible=True)
|
217 |
+
|
218 |
return comp
|
219 |
|
220 |
# 为所有复选框组添加change事件
|
|
|
252 |
show_copy_button=True,
|
253 |
)
|
254 |
|
255 |
+
|
256 |
|
257 |
if __name__ == '__main__':
|
258 |
demo.launch(server_name='0.0.0.0')
|
gen_table.py
CHANGED
@@ -97,6 +97,15 @@ def BUILD_L2_DF(results, fields):
|
|
97 |
# Create DataFrame
|
98 |
df = pd.DataFrame(res)
|
99 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
100 |
# Sort by Dataset and Score in descending order
|
101 |
df = df.sort_values(['Dataset', 'Score'], ascending=[True, False])
|
102 |
|
@@ -109,10 +118,10 @@ def BUILD_L2_DF(results, fields):
|
|
109 |
df = df[columns + remaining_columns]
|
110 |
|
111 |
# Set checkbox configuration
|
112 |
-
check_box =
|
113 |
-
check_box['
|
114 |
-
check_box['required'] = check_box['essential'] + ['Score', 'Pass rate', 'X-shot', 'Samples', 'All tokens', 'Cost($)']
|
115 |
check_box['all'] = ['Score', 'Pass rate', 'X-shot', 'Samples', 'Total input tokens', 'Average input tokens', 'Total output tokens', 'Average output tokens', 'All tokens', 'Cost($)']
|
|
|
116 |
type_map = defaultdict(lambda: 'number')
|
117 |
type_map['Algorithm'] = 'html'
|
118 |
type_map['LLM'] = type_map['Vision Model'] = 'html'
|
@@ -122,7 +131,6 @@ def BUILD_L2_DF(results, fields):
|
|
122 |
type_map['Cost($)'] = 'number'
|
123 |
check_box['type_map'] = type_map
|
124 |
|
125 |
-
|
126 |
return df, check_box
|
127 |
|
128 |
|
|
|
97 |
# Create DataFrame
|
98 |
df = pd.DataFrame(res)
|
99 |
|
100 |
+
# 获取所有唯一的 Algorithm 和 LLM
|
101 |
+
unique_algorithms = df['Algorithm'].unique().tolist()
|
102 |
+
unique_llms = df['LLM'].unique().tolist()
|
103 |
+
|
104 |
+
# Set checkbox configuration
|
105 |
+
check_box = {}
|
106 |
+
check_box['Algorithm_options'] = unique_algorithms # 添加 Algorithm 可选项
|
107 |
+
check_box['LLM_options'] = unique_llms # 添加 LLM 可选项
|
108 |
+
|
109 |
# Sort by Dataset and Score in descending order
|
110 |
df = df.sort_values(['Dataset', 'Score'], ascending=[True, False])
|
111 |
|
|
|
118 |
df = df[columns + remaining_columns]
|
119 |
|
120 |
# Set checkbox configuration
|
121 |
+
check_box['essential'] = ['Algorithm', 'Dataset', 'LLM', 'X-shot', 'Eval Date']
|
122 |
+
check_box['required'] = check_box['essential'] + ['Score', 'Pass rate', 'Samples', 'All tokens', 'Cost($)']
|
|
|
123 |
check_box['all'] = ['Score', 'Pass rate', 'X-shot', 'Samples', 'Total input tokens', 'Average input tokens', 'Total output tokens', 'Average output tokens', 'All tokens', 'Cost($)']
|
124 |
+
|
125 |
type_map = defaultdict(lambda: 'number')
|
126 |
type_map['Algorithm'] = 'html'
|
127 |
type_map['LLM'] = type_map['Vision Model'] = 'html'
|
|
|
131 |
type_map['Cost($)'] = 'number'
|
132 |
check_box['type_map'] = type_map
|
133 |
|
|
|
134 |
return df, check_box
|
135 |
|
136 |
|
meta_data.py
CHANGED
@@ -45,11 +45,19 @@ LEADERBOARD_MD['MATH_DETAIL'] = f"""
|
|
45 |
- default parameters: temperature=0.0
|
46 |
- LLM prices:
|
47 |
- gpt-3.5-turbo:
|
48 |
-
|
49 |
-
-
|
50 |
- Doubao-lite-32k (1 USD = 7.3249 CNY):
|
51 |
-
|
52 |
-
- 0.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
53 |
|
54 |
- IO (Input-Output) is the baseline method that directly prompts the model with the question and expects an answer without any intermediate reasoning steps. It represents the most basic way of using language models and serves as a reference point for evaluating the effectiveness of other algorithms.
|
55 |
|
|
|
45 |
- default parameters: temperature=0.0
|
46 |
- LLM prices:
|
47 |
- gpt-3.5-turbo:
|
48 |
+
- 0.5$/1M tokens (input)
|
49 |
+
- 1.5$/1M tokens (output)
|
50 |
- Doubao-lite-32k (1 USD = 7.3249 CNY):
|
51 |
+
- 0.04096$/1M tokens (input)
|
52 |
+
- 0.08200$/1M tokens (output)
|
53 |
+
- gpt-4o-2024-08-06:
|
54 |
+
- 2.50$/1M tokens (input)
|
55 |
+
- 10$/1M tokens (output)
|
56 |
+
- Qwen2.5-7B-Instruct and Llama-3.3-70B-Instruct:
|
57 |
+
- Prices can be found at https://cloud.siliconflow.cn/.
|
58 |
+
- Other open source LLMs:
|
59 |
+
- Deployed locally, please check the [*OmAgent*](https://github.com/om-ai-lab/OmAgent) repository for more information.
|
60 |
+
- Cost is not considered in the leaderboard.
|
61 |
|
62 |
- IO (Input-Output) is the baseline method that directly prompts the model with the question and expects an answer without any intermediate reasoning steps. It represents the most basic way of using language models and serves as a reference point for evaluating the effectiveness of other algorithms.
|
63 |
|
preprocess.py
ADDED
@@ -0,0 +1,180 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import json
|
3 |
+
from datetime import datetime
|
4 |
+
|
5 |
+
def process_csv_to_json(csv_path='src/record.csv',
                        json_path='src/detail_math_score.json'):
    """Convert the raw evaluation record CSV into the detailed results JSON.

    The written JSON has a ``time`` stamp and a ``results`` mapping shaped as
    ``results[algorithm][llm]``. Each of those holds a ``META`` entry
    (algorithm, LLM and the eval date of the first matching row) plus one
    entry per dataset, built from the first CSV row matching that
    algorithm / LLM / dataset combination.

    Missing numeric cells are written as ``0`` / ``0.0`` so the output never
    contains the non-standard JSON token ``NaN``.

    Args:
        csv_path: CSV file to read. Defaults to the path this script has
            always used.
        json_path: JSON file to write. Defaults to the path this script has
            always used.
    """
    df = pd.read_csv(csv_path)

    # Drop fully empty rows and normalise the spreadsheet headers. Columns
    # whose name is already the target name need no entry here.
    df = df.dropna(how='all')
    df = df.rename(columns={
        'dataset': 'Dataset',
        'llm': 'LLM',
        'score\n(EM)': 'Score',
        'pass rate': 'Pass rate',
        'framework': 'Framework',
        'Nums': 'Samples',
    })

    def parse_number(value):
        """Parse a count such as ``"1,319"`` into an int; missing -> 0."""
        if pd.isna(value):
            return 0
        # Strip thousands separators, go through float to accept "12.0".
        return int(float(str(value).replace(',', '')))

    def parse_float(value, default=0.0):
        """Return ``float(value)``, or ``default`` when the cell is missing."""
        if pd.isna(value):
            return default
        return float(value)

    result = {
        "time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "results": {}
    }

    # Both lists are loop-invariant, so compute them once up front.
    llms = df['LLM'].dropna().unique()
    datasets = [d for d in df['Dataset'].dropna().unique()
                if isinstance(d, str)]

    for algorithm in df['Algorithm'].dropna().unique():
        if not isinstance(algorithm, str):
            continue

        algo_results = {}
        result['results'][algorithm] = algo_results
        algo_rows = df[df['Algorithm'] == algorithm]

        for llm in llms:
            llm_data = algo_rows[algo_rows['LLM'] == llm]
            if llm_data.empty:
                continue

            entry = {
                'META': {
                    'Algorithm': str(algorithm),
                    'LLM': str(llm),
                    'Eval Date': str(llm_data['Eval Date'].iloc[0])
                }
            }
            algo_results[llm] = entry

            for dataset in datasets:
                dataset_data = llm_data[llm_data['Dataset'] == dataset]
                if dataset_data.empty:
                    continue

                # Only the first matching row is used.
                data_row = dataset_data.iloc[0]
                entry[dataset] = {
                    # Score keeps two decimals.
                    'Score': round(parse_float(data_row['Score']), 2),
                    # Percentage -> fraction, kept to four decimals.
                    'Pass rate': round(parse_float(data_row['Pass rate']) / 100, 4),
                    'Cost($)': parse_float(data_row['Cost($)']),
                    'Framework': str(data_row['Framework']) if 'Framework' in data_row and pd.notnull(data_row['Framework']) else '',
                    # NOTE(review): str() of a float cell yields e.g. "8.0";
                    # left unchanged because consumers of the JSON may rely
                    # on it - confirm before normalising.
                    'X-shot': str(data_row['X-shot']) if pd.notnull(data_row['X-shot']) else '',
                    'Samples': parse_number(data_row['Samples']),
                    'All tokens': parse_number(data_row['All tokens']),
                    'Total input tokens': parse_number(data_row['Total input tokens']),
                    'Average input tokens': parse_number(data_row['Average input tokens']),
                    'Total output tokens': parse_number(data_row['Total output tokens']),
                    'Average output tokens': parse_number(data_row['Average output tokens'])
                }

    # Sanity check: report any entry that lacks an expected field.
    required_fields = ['Score', 'Pass rate', 'Cost($)', 'Framework', 'X-shot', 'Samples', 'All tokens', 'Total input tokens', 'Average input tokens', 'Total output tokens', 'Average output tokens']

    for key, value in result['results'].items():
        for llm, llm_entry in value.items():
            meta = llm_entry.get('META', {})
            if 'LLM' not in meta or 'Eval Date' not in meta:
                print(f"Missing META fields in algorithm '{key}' for LLM '{llm}'")

            for dataset, data in llm_entry.items():
                if dataset == 'META':
                    continue
                missing_fields = [field for field in required_fields if field not in data]
                if missing_fields:
                    print(f"Missing fields {missing_fields} in dataset '{dataset}' for LLM '{llm}' in algorithm '{key}'")

    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(result, f, indent=4, ensure_ascii=False)
|
109 |
+
|
110 |
+
def process_csv_to_overall_json(csv_path='src/record.csv',
                                json_path='src/overall_math_score.json',
                                datasets=('gsm8k', 'AQuA')):
    """Convert the raw evaluation record CSV into the overall results JSON.

    The written JSON has a ``time`` stamp and a flat ``results`` mapping with
    one entry per algorithm / LLM combination present in the CSV. The key is
    the bare algorithm name for ``gpt-3.5-turbo`` and ``"<algorithm>-<llm>"``
    for every other LLM. Each entry holds a ``META`` dict and, for every name
    in ``datasets``, a ``{"Score", "Cost($)"}`` dict taken from the first
    matching row (zeros when the combination has no row for that dataset or
    the cell is empty).

    Args:
        csv_path: CSV file to read. Defaults to the path this script has
            always used.
        json_path: JSON file to write. Defaults to the path this script has
            always used.
        datasets: Dataset names that every entry must contain.
    """
    df = pd.read_csv(csv_path)

    # Drop fully empty rows and normalise the spreadsheet headers.
    df = df.dropna(how='all')
    df = df.rename(columns={
        'dataset': 'Dataset',
        'llm': 'LLM',
        'score\n(EM)': 'Score',
    })

    def first_float(series):
        """Return the first cell of ``series`` as float, or 0.0 if missing."""
        value = series.iloc[0]
        return float(value) if pd.notnull(value) else 0.0

    result = {
        "time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "results": {}
    }

    algorithms = [a for a in df['Algorithm'].dropna().unique()
                  if isinstance(a, str)]

    for llm in df['LLM'].dropna().unique():
        llm_rows = df[df['LLM'] == llm]

        for algorithm in algorithms:
            # gpt-3.5-turbo keeps the bare algorithm name; every other LLM
            # gets the LLM name appended so keys stay unique.
            algo_key = algorithm if llm == 'gpt-3.5-turbo' else f"{algorithm}-{llm}"

            algo_data = llm_rows[llm_rows['Algorithm'] == algorithm]
            if algo_data.empty:
                print(f"No data found for algorithm '{algorithm}' and LLM '{llm}'")
                continue

            entry = {
                "META": {
                    "Algorithm": algorithm,
                    "LLM": llm,
                    "Eval Date": str(algo_data['Eval Date'].iloc[0])
                }
            }
            result['results'][algo_key] = entry

            for dataset in datasets:
                # Reuse the rows already filtered by algorithm and LLM.
                dataset_data = algo_data[algo_data['Dataset'] == dataset]
                if not dataset_data.empty:
                    entry[dataset] = {
                        "Score": first_float(dataset_data['Score']),
                        "Cost($)": first_float(dataset_data['Cost($)'])
                    }
                else:
                    # Keep the key present with default values so consumers
                    # can index every dataset unconditionally.
                    entry[dataset] = {
                        "Score": 0.0,
                        "Cost($)": 0.0
                    }

    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(result, f, indent=4, ensure_ascii=False)


if __name__ == "__main__":
    # Generate both JSON files: the detailed and the overall results.
    process_csv_to_json()
    process_csv_to_overall_json()
|
src/detail_math_score.json
CHANGED
@@ -1,207 +1,667 @@
|
|
1 |
{
|
2 |
-
"time": "2025-01-09
|
3 |
"results": {
|
4 |
"IO": {
|
5 |
"gpt-3.5-turbo": {
|
6 |
"META": {
|
7 |
"Algorithm": "IO",
|
8 |
"LLM": "gpt-3.5-turbo",
|
9 |
-
"Eval Date": "2025/
|
10 |
},
|
11 |
"gsm8k": {
|
12 |
"Score": 37.83,
|
13 |
-
"Pass rate":
|
14 |
-
"
|
15 |
-
"
|
|
|
16 |
"Samples": 1319,
|
|
|
17 |
"Total input tokens": 546990,
|
18 |
"Average input tokens": 415,
|
19 |
"Total output tokens": 39563,
|
20 |
-
"Average output tokens": 30
|
21 |
-
"All tokens": 586553,
|
22 |
-
"Cost($)": 0.3328
|
23 |
},
|
24 |
"AQuA": {
|
25 |
-
"Score": 38.
|
26 |
-
"Pass rate":
|
27 |
-
"
|
28 |
-
"
|
|
|
29 |
"Samples": 254,
|
|
|
30 |
"Total input tokens": 25701,
|
31 |
"Average input tokens": 101,
|
32 |
"Total output tokens": 16770,
|
33 |
-
"Average output tokens": 66
|
34 |
-
"All tokens": 42471,
|
35 |
-
"Cost($)": 0.0380
|
36 |
}
|
37 |
},
|
38 |
"Doubao-lite-32k": {
|
39 |
"META": {
|
40 |
"Algorithm": "IO",
|
41 |
"LLM": "Doubao-lite-32k",
|
42 |
-
"Eval Date": "2025/
|
43 |
},
|
44 |
"gsm8k": {
|
45 |
"Score": 72.02,
|
46 |
-
"Pass rate":
|
47 |
-
"
|
48 |
-
"
|
|
|
49 |
"Samples": 1319,
|
|
|
50 |
"Total input tokens": 617377,
|
51 |
"Average input tokens": 468,
|
52 |
"Total output tokens": 123106,
|
53 |
-
"Average output tokens": 93
|
54 |
-
"All tokens": 740483,
|
55 |
-
"Cost($)": 0.0354
|
56 |
},
|
57 |
"AQuA": {
|
58 |
"Score": 79.13,
|
59 |
-
"Pass rate":
|
60 |
-
"
|
61 |
-
"
|
|
|
62 |
"Samples": 254,
|
|
|
63 |
"Total input tokens": 33058,
|
64 |
"Average input tokens": 130,
|
65 |
"Total output tokens": 54684,
|
66 |
-
"Average output tokens": 215
|
67 |
-
"All tokens": 87742,
|
68 |
-
"Cost($)": 0.0058
|
69 |
}
|
70 |
-
}
|
71 |
-
|
72 |
-
"CoT": {
|
73 |
-
"gpt-3.5-turbo": {
|
74 |
"META": {
|
75 |
-
"Algorithm": "
|
76 |
-
"LLM": "gpt-
|
77 |
-
"Eval Date": "2025/
|
78 |
},
|
79 |
"gsm8k": {
|
80 |
-
"Score":
|
81 |
-
"Pass rate":
|
82 |
-
"
|
83 |
-
"
|
|
|
84 |
"Samples": 1319,
|
85 |
-
"
|
86 |
-
"
|
87 |
-
"
|
88 |
-
"
|
89 |
-
"
|
90 |
-
"Cost($)": 0.6788
|
91 |
},
|
92 |
"AQuA": {
|
93 |
-
"Score":
|
94 |
-
"Pass rate":
|
95 |
-
"
|
96 |
-
"
|
|
|
97 |
"Samples": 254,
|
98 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
99 |
"Average input tokens": 100,
|
100 |
-
"Total output tokens":
|
101 |
-
"Average output tokens":
|
102 |
-
"All tokens": 80793,
|
103 |
-
"Cost($)": 0.0957
|
104 |
}
|
105 |
},
|
106 |
-
"
|
107 |
"META": {
|
108 |
-
"Algorithm": "
|
109 |
-
"LLM": "
|
110 |
-
"Eval Date": "2025/
|
111 |
},
|
112 |
"gsm8k": {
|
113 |
-
"Score":
|
114 |
-
"Pass rate":
|
115 |
-
"
|
116 |
-
"
|
|
|
117 |
"Samples": 1319,
|
118 |
-
"
|
119 |
-
"
|
120 |
-
"
|
121 |
-
"
|
122 |
-
"
|
123 |
-
"Cost($)": 0.0557
|
124 |
},
|
125 |
"AQuA": {
|
126 |
-
"Score": 82.
|
127 |
-
"Pass rate":
|
128 |
-
"
|
129 |
-
"
|
|
|
130 |
"Samples": 254,
|
131 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
132 |
"Average input tokens": 110,
|
133 |
-
"Total output tokens":
|
134 |
-
"Average output tokens":
|
135 |
-
|
136 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
137 |
}
|
138 |
}
|
139 |
},
|
140 |
-
"
|
141 |
"gpt-3.5-turbo": {
|
142 |
"META": {
|
143 |
-
"Algorithm": "
|
144 |
"LLM": "gpt-3.5-turbo",
|
145 |
-
"Eval Date": "2025/
|
146 |
},
|
147 |
"gsm8k": {
|
148 |
-
"Score":
|
149 |
-
"Pass rate":
|
150 |
-
"
|
151 |
-
"
|
|
|
152 |
"Samples": 1319,
|
153 |
-
"
|
154 |
-
"
|
155 |
-
"
|
156 |
-
"
|
157 |
-
"
|
158 |
-
"Cost($)": 5.0227
|
159 |
},
|
160 |
"AQuA": {
|
161 |
-
"Score":
|
162 |
-
"Pass rate":
|
163 |
-
"
|
164 |
-
"
|
|
|
165 |
"Samples": 254,
|
166 |
-
"
|
167 |
-
"
|
168 |
-
"
|
169 |
-
"
|
170 |
-
"
|
171 |
-
"Cost($)": 0.6491
|
172 |
}
|
173 |
},
|
174 |
"Doubao-lite-32k": {
|
175 |
"META": {
|
176 |
-
"Algorithm": "
|
177 |
"LLM": "Doubao-lite-32k",
|
178 |
-
"Eval Date": "2025/
|
179 |
},
|
180 |
"gsm8k": {
|
181 |
-
"Score":
|
182 |
-
"Pass rate":
|
183 |
-
"
|
184 |
-
"
|
|
|
185 |
"Samples": 1319,
|
186 |
-
"
|
187 |
-
"
|
188 |
-
"
|
189 |
-
"
|
190 |
-
"
|
191 |
-
"Cost($)": 0.1533
|
192 |
},
|
193 |
"AQuA": {
|
194 |
-
"Score":
|
195 |
-
"Pass rate":
|
196 |
-
"
|
197 |
-
"
|
|
|
198 |
"Samples": 254,
|
199 |
-
"
|
200 |
-
"
|
201 |
-
"
|
202 |
-
"
|
203 |
-
"
|
204 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
205 |
}
|
206 |
}
|
207 |
},
|
@@ -210,136 +670,996 @@
|
|
210 |
"META": {
|
211 |
"Algorithm": "PoT",
|
212 |
"LLM": "gpt-3.5-turbo",
|
213 |
-
"Eval Date": "2025/
|
214 |
},
|
215 |
"gsm8k": {
|
216 |
-
"Score": 76.
|
217 |
-
"Pass rate":
|
218 |
-
"
|
219 |
-
"
|
|
|
220 |
"Samples": 1319,
|
|
|
221 |
"Total input tokens": 1090418,
|
222 |
"Average input tokens": 827,
|
223 |
"Total output tokens": 96662,
|
224 |
-
"Average output tokens": 73
|
225 |
-
"All tokens": 1187080,
|
226 |
-
"Cost($)": 0.6902
|
227 |
},
|
228 |
"AQuA": {
|
229 |
-
"Score": 59.
|
230 |
-
"Pass rate":
|
231 |
-
"
|
232 |
-
"
|
|
|
233 |
"Samples": 254,
|
|
|
234 |
"Total input tokens": 225162,
|
235 |
"Average input tokens": 886,
|
236 |
"Total output tokens": 41492,
|
237 |
-
"Average output tokens": 163
|
238 |
-
"All tokens": 266654,
|
239 |
-
"Cost($)": 0.1748
|
240 |
}
|
241 |
},
|
242 |
"Doubao-lite-32k": {
|
243 |
"META": {
|
244 |
"Algorithm": "PoT",
|
245 |
"LLM": "Doubao-lite-32k",
|
246 |
-
"Eval Date": "2025/
|
247 |
},
|
248 |
"gsm8k": {
|
249 |
-
"Score": 79.
|
250 |
-
"Pass rate":
|
251 |
-
"
|
252 |
-
"
|
|
|
253 |
"Samples": 1319,
|
|
|
254 |
"Total input tokens": 1170038,
|
255 |
"Average input tokens": 887,
|
256 |
"Total output tokens": 118017,
|
257 |
-
"Average output tokens": 89
|
258 |
-
"All tokens": 1288055,
|
259 |
-
"Cost($)": 0.0575
|
260 |
},
|
261 |
"AQuA": {
|
262 |
"Score": 71.65,
|
263 |
-
"Pass rate":
|
264 |
-
"
|
265 |
-
"
|
|
|
266 |
"Samples": 254,
|
|
|
267 |
"Total input tokens": 259863,
|
268 |
"Average input tokens": 1023,
|
269 |
"Total output tokens": 49573,
|
270 |
-
"Average output tokens": 195
|
271 |
-
"All tokens": 309436,
|
272 |
-
"Cost($)": 0.0147
|
273 |
}
|
274 |
-
}
|
275 |
-
|
276 |
-
"ReAct-Pro*": {
|
277 |
-
"gpt-3.5-turbo": {
|
278 |
"META": {
|
279 |
-
"Algorithm": "
|
280 |
-
"LLM": "gpt-
|
281 |
-
"Eval Date": "2025/
|
282 |
},
|
283 |
"gsm8k": {
|
284 |
-
"Score":
|
285 |
-
"Pass rate":
|
286 |
-
"
|
287 |
-
"
|
|
|
288 |
"Samples": 1319,
|
289 |
-
"
|
290 |
-
"
|
291 |
-
"
|
292 |
-
"
|
293 |
-
"
|
294 |
-
"Cost($)": 3.4633
|
295 |
},
|
296 |
"AQuA": {
|
297 |
-
"Score":
|
298 |
-
"Pass rate":
|
299 |
-
"
|
300 |
-
"
|
|
|
301 |
"Samples": 254,
|
302 |
-
"
|
303 |
-
"
|
304 |
-
"
|
305 |
-
"
|
306 |
-
"
|
307 |
-
"Cost($)": 0.4928
|
308 |
}
|
309 |
},
|
310 |
-
"
|
311 |
"META": {
|
312 |
-
"Algorithm": "
|
313 |
-
"LLM": "
|
314 |
-
"Eval Date": "2025/
|
315 |
},
|
316 |
"gsm8k": {
|
317 |
-
"Score":
|
318 |
-
"Pass rate":
|
319 |
-
"
|
320 |
-
"
|
|
|
321 |
"Samples": 1319,
|
322 |
-
"
|
323 |
-
"
|
324 |
-
"
|
325 |
-
"
|
326 |
-
"
|
327 |
-
"Cost($)": 0.2513
|
328 |
},
|
329 |
"AQuA": {
|
330 |
-
"Score":
|
331 |
-
"Pass rate":
|
332 |
-
"
|
333 |
-
"
|
|
|
334 |
"Samples": 254,
|
335 |
-
"
|
336 |
-
"
|
337 |
-
"
|
338 |
-
"
|
339 |
-
"
|
340 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
341 |
}
|
342 |
}
|
343 |
}
|
344 |
}
|
345 |
-
}
|
|
|
1 |
{
|
2 |
+
"time": "2025-01-23 09:27:24",
|
3 |
"results": {
|
4 |
"IO": {
|
5 |
"gpt-3.5-turbo": {
|
6 |
"META": {
|
7 |
"Algorithm": "IO",
|
8 |
"LLM": "gpt-3.5-turbo",
|
9 |
+
"Eval Date": "2025/1/7"
|
10 |
},
|
11 |
"gsm8k": {
|
12 |
"Score": 37.83,
|
13 |
+
"Pass rate": 0.9992,
|
14 |
+
"Cost($)": 0.3328,
|
15 |
+
"Framework": "",
|
16 |
+
"X-shot": "8.0",
|
17 |
"Samples": 1319,
|
18 |
+
"All tokens": 586553,
|
19 |
"Total input tokens": 546990,
|
20 |
"Average input tokens": 415,
|
21 |
"Total output tokens": 39563,
|
22 |
+
"Average output tokens": 30
|
|
|
|
|
23 |
},
|
24 |
"AQuA": {
|
25 |
+
"Score": 38.97,
|
26 |
+
"Pass rate": 1.0,
|
27 |
+
"Cost($)": 0.038,
|
28 |
+
"Framework": "",
|
29 |
+
"X-shot": "0.0",
|
30 |
"Samples": 254,
|
31 |
+
"All tokens": 42471,
|
32 |
"Total input tokens": 25701,
|
33 |
"Average input tokens": 101,
|
34 |
"Total output tokens": 16770,
|
35 |
+
"Average output tokens": 66
|
|
|
|
|
36 |
}
|
37 |
},
|
38 |
"Doubao-lite-32k": {
|
39 |
"META": {
|
40 |
"Algorithm": "IO",
|
41 |
"LLM": "Doubao-lite-32k",
|
42 |
+
"Eval Date": "2025/1/7"
|
43 |
},
|
44 |
"gsm8k": {
|
45 |
"Score": 72.02,
|
46 |
+
"Pass rate": 0.9992,
|
47 |
+
"Cost($)": 0.0354,
|
48 |
+
"Framework": "",
|
49 |
+
"X-shot": "8.0",
|
50 |
"Samples": 1319,
|
51 |
+
"All tokens": 740483,
|
52 |
"Total input tokens": 617377,
|
53 |
"Average input tokens": 468,
|
54 |
"Total output tokens": 123106,
|
55 |
+
"Average output tokens": 93
|
|
|
|
|
56 |
},
|
57 |
"AQuA": {
|
58 |
"Score": 79.13,
|
59 |
+
"Pass rate": 1.0,
|
60 |
+
"Cost($)": 0.0058,
|
61 |
+
"Framework": "",
|
62 |
+
"X-shot": "0.0",
|
63 |
"Samples": 254,
|
64 |
+
"All tokens": 87742,
|
65 |
"Total input tokens": 33058,
|
66 |
"Average input tokens": 130,
|
67 |
"Total output tokens": 54684,
|
68 |
+
"Average output tokens": 215
|
|
|
|
|
69 |
}
|
70 |
+
},
|
71 |
+
"gpt-4o": {
|
|
|
|
|
72 |
"META": {
|
73 |
+
"Algorithm": "IO",
|
74 |
+
"LLM": "gpt-4o",
|
75 |
+
"Eval Date": "2025/1/22"
|
76 |
},
|
77 |
"gsm8k": {
|
78 |
+
"Score": 88.4,
|
79 |
+
"Pass rate": 1.0,
|
80 |
+
"Cost($)": 3.3463,
|
81 |
+
"Framework": "",
|
82 |
+
"X-shot": "8.0",
|
83 |
"Samples": 1319,
|
84 |
+
"All tokens": 741446,
|
85 |
+
"Total input tokens": 542416,
|
86 |
+
"Average input tokens": 411,
|
87 |
+
"Total output tokens": 199030,
|
88 |
+
"Average output tokens": 151
|
|
|
89 |
},
|
90 |
"AQuA": {
|
91 |
+
"Score": 75.59,
|
92 |
+
"Pass rate": 0.9724,
|
93 |
+
"Cost($)": 1.1453,
|
94 |
+
"Framework": "",
|
95 |
+
"X-shot": "0.0",
|
96 |
"Samples": 254,
|
97 |
+
"All tokens": 133752,
|
98 |
+
"Total input tokens": 25631,
|
99 |
+
"Average input tokens": 101,
|
100 |
+
"Total output tokens": 108121,
|
101 |
+
"Average output tokens": 426
|
102 |
+
}
|
103 |
+
},
|
104 |
+
"Qwen2.5-72B-Instruct": {
|
105 |
+
"META": {
|
106 |
+
"Algorithm": "IO",
|
107 |
+
"LLM": "Qwen2.5-72B-Instruct",
|
108 |
+
"Eval Date": "2025/1/22"
|
109 |
+
},
|
110 |
+
"gsm8k": {
|
111 |
+
"Score": 86.58,
|
112 |
+
"Pass rate": 1.0,
|
113 |
+
"Cost($)": 0.4899,
|
114 |
+
"Framework": "",
|
115 |
+
"X-shot": "8.0",
|
116 |
+
"Samples": 1319,
|
117 |
+
"All tokens": 869060,
|
118 |
+
"Total input tokens": 555340,
|
119 |
+
"Average input tokens": 421,
|
120 |
+
"Total output tokens": 313720,
|
121 |
+
"Average output tokens": 238
|
122 |
+
},
|
123 |
+
"AQuA": {
|
124 |
+
"Score": 84.25,
|
125 |
+
"Pass rate": 0.996,
|
126 |
+
"Cost($)": 0.0742,
|
127 |
+
"Framework": "",
|
128 |
+
"X-shot": "0.0",
|
129 |
+
"Samples": 254,
|
130 |
+
"All tokens": 131604,
|
131 |
+
"Total input tokens": 25397,
|
132 |
"Average input tokens": 100,
|
133 |
+
"Total output tokens": 106207,
|
134 |
+
"Average output tokens": 418
|
|
|
|
|
135 |
}
|
136 |
},
|
137 |
+
"Llama-3.3-70B-Instruct": {
|
138 |
"META": {
|
139 |
+
"Algorithm": "IO",
|
140 |
+
"LLM": "Llama-3.3-70B-Instruct",
|
141 |
+
"Eval Date": "2025/1/22"
|
142 |
},
|
143 |
"gsm8k": {
|
144 |
+
"Score": 92.26,
|
145 |
+
"Pass rate": 1.0,
|
146 |
+
"Cost($)": 0.4709,
|
147 |
+
"Framework": "",
|
148 |
+
"X-shot": "8.0",
|
149 |
"Samples": 1319,
|
150 |
+
"All tokens": 835275,
|
151 |
+
"Total input tokens": 583916,
|
152 |
+
"Average input tokens": 443,
|
153 |
+
"Total output tokens": 251359,
|
154 |
+
"Average output tokens": 191
|
|
|
155 |
},
|
156 |
"AQuA": {
|
157 |
+
"Score": 82.67,
|
158 |
+
"Pass rate": 0.9921,
|
159 |
+
"Cost($)": 0.0798,
|
160 |
+
"Framework": "",
|
161 |
+
"X-shot": "0.0",
|
162 |
"Samples": 254,
|
163 |
+
"All tokens": 141567,
|
164 |
+
"Total input tokens": 32809,
|
165 |
+
"Average input tokens": 129,
|
166 |
+
"Total output tokens": 108758,
|
167 |
+
"Average output tokens": 428
|
168 |
+
}
|
169 |
+
},
|
170 |
+
"Qwen2.5-7B-Instruct": {
|
171 |
+
"META": {
|
172 |
+
"Algorithm": "IO",
|
173 |
+
"LLM": "Qwen2.5-7B-Instruct",
|
174 |
+
"Eval Date": "2025/1/22"
|
175 |
+
},
|
176 |
+
"gsm8k": {
|
177 |
+
"Score": 57.24,
|
178 |
+
"Pass rate": 1.0,
|
179 |
+
"Cost($)": 0.0,
|
180 |
+
"Framework": "",
|
181 |
+
"X-shot": "8.0",
|
182 |
+
"Samples": 1319,
|
183 |
+
"All tokens": 887913,
|
184 |
+
"Total input tokens": 596229,
|
185 |
+
"Average input tokens": 452,
|
186 |
+
"Total output tokens": 291684,
|
187 |
+
"Average output tokens": 221
|
188 |
+
},
|
189 |
+
"AQuA": {
|
190 |
+
"Score": 78.74,
|
191 |
+
"Pass rate": 0.9842,
|
192 |
+
"Cost($)": 0.0,
|
193 |
+
"Framework": "",
|
194 |
+
"X-shot": "0.0",
|
195 |
+
"Samples": 254,
|
196 |
+
"All tokens": 137771,
|
197 |
+
"Total input tokens": 33271,
|
198 |
+
"Average input tokens": 131,
|
199 |
+
"Total output tokens": 104500,
|
200 |
+
"Average output tokens": 411
|
201 |
+
}
|
202 |
+
},
|
203 |
+
"Llama-3.1-8B-Instruct": {
|
204 |
+
"META": {
|
205 |
+
"Algorithm": "IO",
|
206 |
+
"LLM": "Llama-3.1-8B-Instruct",
|
207 |
+
"Eval Date": "2025/1/22"
|
208 |
+
},
|
209 |
+
"gsm8k": {
|
210 |
+
"Score": 57.16,
|
211 |
+
"Pass rate": 0.9954,
|
212 |
+
"Cost($)": 0.0,
|
213 |
+
"Framework": "",
|
214 |
+
"X-shot": "8.0",
|
215 |
+
"Samples": 1319,
|
216 |
+
"All tokens": 1745429,
|
217 |
+
"Total input tokens": 550941,
|
218 |
+
"Average input tokens": 418,
|
219 |
+
"Total output tokens": 1194488,
|
220 |
+
"Average output tokens": 906
|
221 |
+
},
|
222 |
+
"AQuA": {
|
223 |
+
"Score": 51.18,
|
224 |
+
"Pass rate": 0.9881,
|
225 |
+
"Cost($)": 0.0,
|
226 |
+
"Framework": "",
|
227 |
+
"X-shot": "0.0",
|
228 |
+
"Samples": 254,
|
229 |
+
"All tokens": 133106,
|
230 |
+
"Total input tokens": 26459,
|
231 |
+
"Average input tokens": 104,
|
232 |
+
"Total output tokens": 106647,
|
233 |
+
"Average output tokens": 420
|
234 |
+
}
|
235 |
+
},
|
236 |
+
"Internllm2_5-7B": {
|
237 |
+
"META": {
|
238 |
+
"Algorithm": "IO",
|
239 |
+
"LLM": "Internllm2_5-7B",
|
240 |
+
"Eval Date": "2025/1/22"
|
241 |
+
},
|
242 |
+
"gsm8k": {
|
243 |
+
"Score": 11.59,
|
244 |
+
"Pass rate": 0.9795,
|
245 |
+
"Cost($)": 0.0,
|
246 |
+
"Framework": "",
|
247 |
+
"X-shot": "8.0",
|
248 |
+
"Samples": 1319,
|
249 |
+
"All tokens": 1113728,
|
250 |
+
"Total input tokens": 679302,
|
251 |
+
"Average input tokens": 515,
|
252 |
+
"Total output tokens": 434426,
|
253 |
+
"Average output tokens": 329
|
254 |
+
},
|
255 |
+
"AQuA": {
|
256 |
+
"Score": 47.63,
|
257 |
+
"Pass rate": 0.9094,
|
258 |
+
"Cost($)": 0.0,
|
259 |
+
"Framework": "",
|
260 |
+
"X-shot": "0.0",
|
261 |
+
"Samples": 254,
|
262 |
+
"All tokens": 185041,
|
263 |
+
"Total input tokens": 50232,
|
264 |
+
"Average input tokens": 198,
|
265 |
+
"Total output tokens": 134809,
|
266 |
+
"Average output tokens": 531
|
267 |
+
}
|
268 |
+
},
|
269 |
+
"Qwen2-1.5B-Instruct": {
|
270 |
+
"META": {
|
271 |
+
"Algorithm": "IO",
|
272 |
+
"LLM": "Qwen2-1.5B-Instruct",
|
273 |
+
"Eval Date": "2025/1/22"
|
274 |
+
},
|
275 |
+
"gsm8k": {
|
276 |
+
"Score": 16.67,
|
277 |
+
"Pass rate": 1.0,
|
278 |
+
"Cost($)": 0.0,
|
279 |
+
"Framework": "",
|
280 |
+
"X-shot": "8.0",
|
281 |
+
"Samples": 1319,
|
282 |
+
"All tokens": 736996,
|
283 |
+
"Total input tokens": 568530,
|
284 |
+
"Average input tokens": 431,
|
285 |
+
"Total output tokens": 168466,
|
286 |
+
"Average output tokens": 128
|
287 |
+
},
|
288 |
+
"AQuA": {
|
289 |
+
"Score": 29.13,
|
290 |
+
"Pass rate": 0.9763,
|
291 |
+
"Cost($)": 0.0,
|
292 |
+
"Framework": "",
|
293 |
+
"X-shot": "0.0",
|
294 |
+
"Samples": 254,
|
295 |
+
"All tokens": 71047,
|
296 |
+
"Total input tokens": 27937,
|
297 |
"Average input tokens": 110,
|
298 |
+
"Total output tokens": 43110,
|
299 |
+
"Average output tokens": 170
|
300 |
+
}
|
301 |
+
},
|
302 |
+
"Qwen2-0.5B-Instruct": {
|
303 |
+
"META": {
|
304 |
+
"Algorithm": "IO",
|
305 |
+
"LLM": "Qwen2-0.5B-Instruct",
|
306 |
+
"Eval Date": "2025/1/22"
|
307 |
+
},
|
308 |
+
"gsm8k": {
|
309 |
+
"Score": 14.7,
|
310 |
+
"Pass rate": 1.0,
|
311 |
+
"Cost($)": 0.0,
|
312 |
+
"Framework": "",
|
313 |
+
"X-shot": "8.0",
|
314 |
+
"Samples": 1319,
|
315 |
+
"All tokens": 834897,
|
316 |
+
"Total input tokens": 568116,
|
317 |
+
"Average input tokens": 431,
|
318 |
+
"Total output tokens": 266781,
|
319 |
+
"Average output tokens": 202
|
320 |
+
},
|
321 |
+
"AQuA": {
|
322 |
+
"Score": 27.16,
|
323 |
+
"Pass rate": 0.9881,
|
324 |
+
"Cost($)": 0.0,
|
325 |
+
"Framework": "",
|
326 |
+
"X-shot": "0.0",
|
327 |
+
"Samples": 254,
|
328 |
+
"All tokens": 110415,
|
329 |
+
"Total input tokens": 27937,
|
330 |
+
"Average input tokens": 110,
|
331 |
+
"Total output tokens": 82478,
|
332 |
+
"Average output tokens": 325
|
333 |
}
|
334 |
}
|
335 |
},
|
336 |
+
"ReAct-Pro*": {
|
337 |
"gpt-3.5-turbo": {
|
338 |
"META": {
|
339 |
+
"Algorithm": "ReAct-Pro*",
|
340 |
"LLM": "gpt-3.5-turbo",
|
341 |
+
"Eval Date": "2025/1/7"
|
342 |
},
|
343 |
"gsm8k": {
|
344 |
+
"Score": 74.9,
|
345 |
+
"Pass rate": 0.9939,
|
346 |
+
"Cost($)": 3.4633,
|
347 |
+
"Framework": "",
|
348 |
+
"X-shot": "8.0",
|
349 |
"Samples": 1319,
|
350 |
+
"All tokens": 6646286,
|
351 |
+
"Total input tokens": 6506164,
|
352 |
+
"Average input tokens": 4933,
|
353 |
+
"Total output tokens": 140122,
|
354 |
+
"Average output tokens": 106
|
|
|
355 |
},
|
356 |
"AQuA": {
|
357 |
+
"Score": 64.56,
|
358 |
+
"Pass rate": 0.9803,
|
359 |
+
"Cost($)": 0.4928,
|
360 |
+
"Framework": "",
|
361 |
+
"X-shot": "0.0",
|
362 |
"Samples": 254,
|
363 |
+
"All tokens": 903587,
|
364 |
+
"Total input tokens": 862614,
|
365 |
+
"Average input tokens": 3396,
|
366 |
+
"Total output tokens": 40973,
|
367 |
+
"Average output tokens": 161
|
|
|
368 |
}
|
369 |
},
|
370 |
"Doubao-lite-32k": {
|
371 |
"META": {
|
372 |
+
"Algorithm": "ReAct-Pro*",
|
373 |
"LLM": "Doubao-lite-32k",
|
374 |
+
"Eval Date": "2025/1/7"
|
375 |
},
|
376 |
"gsm8k": {
|
377 |
+
"Score": 85.59,
|
378 |
+
"Pass rate": 0.9962,
|
379 |
+
"Cost($)": 0.2512,
|
380 |
+
"Framework": "",
|
381 |
+
"X-shot": "8.0",
|
382 |
"Samples": 1319,
|
383 |
+
"All tokens": 5998639,
|
384 |
+
"Total input tokens": 5862016,
|
385 |
+
"Average input tokens": 4444,
|
386 |
+
"Total output tokens": 136623,
|
387 |
+
"Average output tokens": 104
|
|
|
388 |
},
|
389 |
"AQuA": {
|
390 |
+
"Score": 77.55,
|
391 |
+
"Pass rate": 0.9606,
|
392 |
+
"Cost($)": 0.0445,
|
393 |
+
"Framework": "",
|
394 |
+
"X-shot": "0.0",
|
395 |
"Samples": 254,
|
396 |
+
"All tokens": 1032841,
|
397 |
+
"Total input tokens": 977890,
|
398 |
+
"Average input tokens": 3850,
|
399 |
+
"Total output tokens": 54951,
|
400 |
+
"Average output tokens": 216
|
401 |
+
}
|
402 |
+
},
|
403 |
+
"gpt-4o": {
|
404 |
+
"META": {
|
405 |
+
"Algorithm": "ReAct-Pro*",
|
406 |
+
"LLM": "gpt-4o",
|
407 |
+
"Eval Date": "2025/1/22"
|
408 |
+
},
|
409 |
+
"gsm8k": {
|
410 |
+
"Score": 63.3,
|
411 |
+
"Pass rate": 0.9954,
|
412 |
+
"Cost($)": 39.0751,
|
413 |
+
"Framework": "",
|
414 |
+
"X-shot": "8.0",
|
415 |
+
"Samples": 1319,
|
416 |
+
"All tokens": 14715887,
|
417 |
+
"Total input tokens": 14411173,
|
418 |
+
"Average input tokens": 10926,
|
419 |
+
"Total output tokens": 304714,
|
420 |
+
"Average output tokens": 231
|
421 |
+
},
|
422 |
+
"AQuA": {
|
423 |
+
"Score": 57.48,
|
424 |
+
"Pass rate": 0.9724,
|
425 |
+
"Cost($)": 2.304,
|
426 |
+
"Framework": "",
|
427 |
+
"X-shot": "0.0",
|
428 |
+
"Samples": 254,
|
429 |
+
"All tokens": 692096,
|
430 |
+
"Total input tokens": 615589,
|
431 |
+
"Average input tokens": 2424,
|
432 |
+
"Total output tokens": 76507,
|
433 |
+
"Average output tokens": 301
|
434 |
+
}
|
435 |
+
},
|
436 |
+
"Qwen2.5-72B-Instruct": {
|
437 |
+
"META": {
|
438 |
+
"Algorithm": "ReAct-Pro*",
|
439 |
+
"LLM": "Qwen2.5-72B-Instruct",
|
440 |
+
"Eval Date": "2025/1/22"
|
441 |
+
},
|
442 |
+
"gsm8k": {
|
443 |
+
"Score": 87.26,
|
444 |
+
"Pass rate": 1.0,
|
445 |
+
"Cost($)": 10.5479,
|
446 |
+
"Framework": "",
|
447 |
+
"X-shot": "8.0",
|
448 |
+
"Samples": 1319,
|
449 |
+
"All tokens": 18710437,
|
450 |
+
"Total input tokens": 18160983,
|
451 |
+
"Average input tokens": 13769,
|
452 |
+
"Total output tokens": 549454,
|
453 |
+
"Average output tokens": 417
|
454 |
+
},
|
455 |
+
"AQuA": {
|
456 |
+
"Score": 73.22,
|
457 |
+
"Pass rate": 1.0,
|
458 |
+
"Cost($)": 0.3177,
|
459 |
+
"Framework": "",
|
460 |
+
"X-shot": "0.0",
|
461 |
+
"Samples": 254,
|
462 |
+
"All tokens": 563603,
|
463 |
+
"Total input tokens": 441765,
|
464 |
+
"Average input tokens": 1739,
|
465 |
+
"Total output tokens": 121838,
|
466 |
+
"Average output tokens": 480
|
467 |
+
}
|
468 |
+
},
|
469 |
+
"Llama-3.3-70B-Instruct": {
|
470 |
+
"META": {
|
471 |
+
"Algorithm": "ReAct-Pro*",
|
472 |
+
"LLM": "Llama-3.3-70B-Instruct",
|
473 |
+
"Eval Date": "2025/1/22"
|
474 |
+
},
|
475 |
+
"gsm8k": {
|
476 |
+
"Score": 87.64,
|
477 |
+
"Pass rate": 0.9992,
|
478 |
+
"Cost($)": 10.1124,
|
479 |
+
"Framework": "",
|
480 |
+
"X-shot": "",
|
481 |
+
"Samples": 1319,
|
482 |
+
"All tokens": 17937864,
|
483 |
+
"Total input tokens": 17038928,
|
484 |
+
"Average input tokens": 12918,
|
485 |
+
"Total output tokens": 898936,
|
486 |
+
"Average output tokens": 682
|
487 |
+
},
|
488 |
+
"AQuA": {
|
489 |
+
"Score": 79.13,
|
490 |
+
"Pass rate": 0.996,
|
491 |
+
"Cost($)": 0.768,
|
492 |
+
"Framework": "",
|
493 |
+
"X-shot": "0.0",
|
494 |
+
"Samples": 254,
|
495 |
+
"All tokens": 1362379,
|
496 |
+
"Total input tokens": 1119143,
|
497 |
+
"Average input tokens": 4406,
|
498 |
+
"Total output tokens": 243236,
|
499 |
+
"Average output tokens": 958
|
500 |
+
}
|
501 |
+
},
|
502 |
+
"Qwen2.5-7B-Instruct": {
|
503 |
+
"META": {
|
504 |
+
"Algorithm": "ReAct-Pro*",
|
505 |
+
"LLM": "Qwen2.5-7B-Instruct",
|
506 |
+
"Eval Date": "2025/1/22"
|
507 |
+
},
|
508 |
+
"gsm8k": {
|
509 |
+
"Score": 82.86,
|
510 |
+
"Pass rate": 1.0,
|
511 |
+
"Cost($)": 0.0,
|
512 |
+
"Framework": "",
|
513 |
+
"X-shot": "8.0",
|
514 |
+
"Samples": 1319,
|
515 |
+
"All tokens": 14850914,
|
516 |
+
"Total input tokens": 14355752,
|
517 |
+
"Average input tokens": 10884,
|
518 |
+
"Total output tokens": 495162,
|
519 |
+
"Average output tokens": 375
|
520 |
+
},
|
521 |
+
"AQuA": {
|
522 |
+
"Score": 74.4,
|
523 |
+
"Pass rate": 0.9921,
|
524 |
+
"Cost($)": 0.0,
|
525 |
+
"Framework": "",
|
526 |
+
"X-shot": "0.0",
|
527 |
+
"Samples": 254,
|
528 |
+
"All tokens": 695844,
|
529 |
+
"Total input tokens": 564165,
|
530 |
+
"Average input tokens": 2221,
|
531 |
+
"Total output tokens": 131679,
|
532 |
+
"Average output tokens": 518
|
533 |
+
}
|
534 |
+
},
|
535 |
+
"Llama-3.1-8B-Instruct": {
|
536 |
+
"META": {
|
537 |
+
"Algorithm": "ReAct-Pro*",
|
538 |
+
"LLM": "Llama-3.1-8B-Instruct",
|
539 |
+
"Eval Date": "2025/1/22"
|
540 |
+
},
|
541 |
+
"gsm8k": {
|
542 |
+
"Score": 67.77,
|
543 |
+
"Pass rate": 0.9855,
|
544 |
+
"Cost($)": 0.0,
|
545 |
+
"Framework": "",
|
546 |
+
"X-shot": "8.0",
|
547 |
+
"Samples": 1319,
|
548 |
+
"All tokens": 22835767,
|
549 |
+
"Total input tokens": 21044978,
|
550 |
+
"Average input tokens": 15955,
|
551 |
+
"Total output tokens": 1790789,
|
552 |
+
"Average output tokens": 1358
|
553 |
+
},
|
554 |
+
"AQuA": {
|
555 |
+
"Score": 55.51,
|
556 |
+
"Pass rate": 0.9685,
|
557 |
+
"Cost($)": 0.0,
|
558 |
+
"Framework": "",
|
559 |
+
"X-shot": "0.0",
|
560 |
+
"Samples": 254,
|
561 |
+
"All tokens": 4340821,
|
562 |
+
"Total input tokens": 3764723,
|
563 |
+
"Average input tokens": 14822,
|
564 |
+
"Total output tokens": 576098,
|
565 |
+
"Average output tokens": 2268
|
566 |
+
}
|
567 |
+
},
|
568 |
+
"Internllm2_5-7B": {
|
569 |
+
"META": {
|
570 |
+
"Algorithm": "ReAct-Pro*",
|
571 |
+
"LLM": "Internllm2_5-7B",
|
572 |
+
"Eval Date": "2025/1/22"
|
573 |
+
},
|
574 |
+
"gsm8k": {
|
575 |
+
"Score": 33.51,
|
576 |
+
"Pass rate": 0.9795,
|
577 |
+
"Cost($)": 0.0,
|
578 |
+
"Framework": "",
|
579 |
+
"X-shot": "",
|
580 |
+
"Samples": 1319,
|
581 |
+
"All tokens": 35669989,
|
582 |
+
"Total input tokens": 30120070,
|
583 |
+
"Average input tokens": 22836,
|
584 |
+
"Total output tokens": 5549919,
|
585 |
+
"Average output tokens": 4208
|
586 |
+
},
|
587 |
+
"AQuA": {
|
588 |
+
"Score": 40.94,
|
589 |
+
"Pass rate": 0.9685,
|
590 |
+
"Cost($)": 0.0,
|
591 |
+
"Framework": "",
|
592 |
+
"X-shot": "0.0",
|
593 |
+
"Samples": 254,
|
594 |
+
"All tokens": 4428801,
|
595 |
+
"Total input tokens": 3592039,
|
596 |
+
"Average input tokens": 14142,
|
597 |
+
"Total output tokens": 836762,
|
598 |
+
"Average output tokens": 3294
|
599 |
+
}
|
600 |
+
},
|
601 |
+
"Qwen2-1.5B-Instruct": {
|
602 |
+
"META": {
|
603 |
+
"Algorithm": "ReAct-Pro*",
|
604 |
+
"LLM": "Qwen2-1.5B-Instruct",
|
605 |
+
"Eval Date": "2025/1/22"
|
606 |
+
},
|
607 |
+
"gsm8k": {
|
608 |
+
"Score": 24.86,
|
609 |
+
"Pass rate": 0.8021,
|
610 |
+
"Cost($)": 0.0,
|
611 |
+
"Framework": "",
|
612 |
+
"X-shot": "8.0",
|
613 |
+
"Samples": 1319,
|
614 |
+
"All tokens": 9828001,
|
615 |
+
"Total input tokens": 9133603,
|
616 |
+
"Average input tokens": 6925,
|
617 |
+
"Total output tokens": 694398,
|
618 |
+
"Average output tokens": 526
|
619 |
+
},
|
620 |
+
"AQuA": {
|
621 |
+
"Score": 25.59,
|
622 |
+
"Pass rate": 0.9606,
|
623 |
+
"Cost($)": 0.0,
|
624 |
+
"Framework": "",
|
625 |
+
"X-shot": "0.0",
|
626 |
+
"Samples": 254,
|
627 |
+
"All tokens": 5072004,
|
628 |
+
"Total input tokens": 4555858,
|
629 |
+
"Average input tokens": 17936,
|
630 |
+
"Total output tokens": 516146,
|
631 |
+
"Average output tokens": 2032
|
632 |
+
}
|
633 |
+
},
|
634 |
+
"Qwen2-0.5B-Instruct": {
|
635 |
+
"META": {
|
636 |
+
"Algorithm": "ReAct-Pro*",
|
637 |
+
"LLM": "Qwen2-0.5B-Instruct",
|
638 |
+
"Eval Date": "2025/1/22"
|
639 |
+
},
|
640 |
+
"gsm8k": {
|
641 |
+
"Score": 7.65,
|
642 |
+
"Pass rate": 0.9522,
|
643 |
+
"Cost($)": 0.0,
|
644 |
+
"Framework": "",
|
645 |
+
"X-shot": "8.0",
|
646 |
+
"Samples": 1319,
|
647 |
+
"All tokens": 55392611,
|
648 |
+
"Total input tokens": 52431343,
|
649 |
+
"Average input tokens": 39751,
|
650 |
+
"Total output tokens": 2961268,
|
651 |
+
"Average output tokens": 2245
|
652 |
+
},
|
653 |
+
"AQuA": {
|
654 |
+
"Score": 24.01,
|
655 |
+
"Pass rate": 0.9685,
|
656 |
+
"Cost($)": 0.0,
|
657 |
+
"Framework": "",
|
658 |
+
"X-shot": "0.0",
|
659 |
+
"Samples": 254,
|
660 |
+
"All tokens": 7170087,
|
661 |
+
"Total input tokens": 6344167,
|
662 |
+
"Average input tokens": 24977,
|
663 |
+
"Total output tokens": 825920,
|
664 |
+
"Average output tokens": 3252
|
665 |
}
|
666 |
}
|
667 |
},
|
|
|
670 |
"META": {
|
671 |
"Algorithm": "PoT",
|
672 |
"LLM": "gpt-3.5-turbo",
|
673 |
+
"Eval Date": "2025/1/7"
|
674 |
},
|
675 |
"gsm8k": {
|
676 |
+
"Score": 76.87,
|
677 |
+
"Pass rate": 0.9924,
|
678 |
+
"Cost($)": 0.6902,
|
679 |
+
"Framework": "",
|
680 |
+
"X-shot": "8.0",
|
681 |
"Samples": 1319,
|
682 |
+
"All tokens": 1187080,
|
683 |
"Total input tokens": 1090418,
|
684 |
"Average input tokens": 827,
|
685 |
"Total output tokens": 96662,
|
686 |
+
"Average output tokens": 73
|
|
|
|
|
687 |
},
|
688 |
"AQuA": {
|
689 |
+
"Score": 59.44,
|
690 |
+
"Pass rate": 1.0,
|
691 |
+
"Cost($)": 0.1748,
|
692 |
+
"Framework": "",
|
693 |
+
"X-shot": "0.0",
|
694 |
"Samples": 254,
|
695 |
+
"All tokens": 266654,
|
696 |
"Total input tokens": 225162,
|
697 |
"Average input tokens": 886,
|
698 |
"Total output tokens": 41492,
|
699 |
+
"Average output tokens": 163
|
|
|
|
|
700 |
}
|
701 |
},
|
702 |
"Doubao-lite-32k": {
|
703 |
"META": {
|
704 |
"Algorithm": "PoT",
|
705 |
"LLM": "Doubao-lite-32k",
|
706 |
+
"Eval Date": "2025/1/7"
|
707 |
},
|
708 |
"gsm8k": {
|
709 |
+
"Score": 79.6,
|
710 |
+
"Pass rate": 0.9257,
|
711 |
+
"Cost($)": 0.0576,
|
712 |
+
"Framework": "",
|
713 |
+
"X-shot": "8.0",
|
714 |
"Samples": 1319,
|
715 |
+
"All tokens": 1288055,
|
716 |
"Total input tokens": 1170038,
|
717 |
"Average input tokens": 887,
|
718 |
"Total output tokens": 118017,
|
719 |
+
"Average output tokens": 89
|
|
|
|
|
720 |
},
|
721 |
"AQuA": {
|
722 |
"Score": 71.65,
|
723 |
+
"Pass rate": 0.9685,
|
724 |
+
"Cost($)": 0.0147,
|
725 |
+
"Framework": "",
|
726 |
+
"X-shot": "0.0",
|
727 |
"Samples": 254,
|
728 |
+
"All tokens": 309436,
|
729 |
"Total input tokens": 259863,
|
730 |
"Average input tokens": 1023,
|
731 |
"Total output tokens": 49573,
|
732 |
+
"Average output tokens": 195
|
|
|
|
|
733 |
}
|
734 |
+
},
|
735 |
+
"gpt-4o": {
|
|
|
|
|
736 |
"META": {
|
737 |
+
"Algorithm": "PoT",
|
738 |
+
"LLM": "gpt-4o",
|
739 |
+
"Eval Date": "2025/1/22"
|
740 |
},
|
741 |
"gsm8k": {
|
742 |
+
"Score": 93.1,
|
743 |
+
"Pass rate": 0.9977,
|
744 |
+
"Cost($)": 4.2166,
|
745 |
+
"Framework": "",
|
746 |
+
"X-shot": "8.0",
|
747 |
"Samples": 1319,
|
748 |
+
"All tokens": 1247912,
|
749 |
+
"Total input tokens": 1101672,
|
750 |
+
"Average input tokens": 835,
|
751 |
+
"Total output tokens": 146240,
|
752 |
+
"Average output tokens": 111
|
|
|
753 |
},
|
754 |
"AQuA": {
|
755 |
+
"Score": 75.19,
|
756 |
+
"Pass rate": 1.0,
|
757 |
+
"Cost($)": 1.6087,
|
758 |
+
"Framework": "",
|
759 |
+
"X-shot": "0.0",
|
760 |
"Samples": 254,
|
761 |
+
"All tokens": 327908,
|
762 |
+
"Total input tokens": 222717,
|
763 |
+
"Average input tokens": 877,
|
764 |
+
"Total output tokens": 105191,
|
765 |
+
"Average output tokens": 414
|
|
|
766 |
}
|
767 |
},
|
768 |
+
"Qwen2.5-72B-Instruct": {
|
769 |
"META": {
|
770 |
+
"Algorithm": "PoT",
|
771 |
+
"LLM": "Qwen2.5-72B-Instruct",
|
772 |
+
"Eval Date": "2025/1/22"
|
773 |
},
|
774 |
"gsm8k": {
|
775 |
+
"Score": 92.34,
|
776 |
+
"Pass rate": 0.9939,
|
777 |
+
"Cost($)": 0.7054,
|
778 |
+
"Framework": "",
|
779 |
+
"X-shot": "8.0",
|
780 |
"Samples": 1319,
|
781 |
+
"All tokens": 1251210,
|
782 |
+
"Total input tokens": 1106682,
|
783 |
+
"Average input tokens": 839,
|
784 |
+
"Total output tokens": 144528,
|
785 |
+
"Average output tokens": 110
|
|
|
786 |
},
|
787 |
"AQuA": {
|
788 |
+
"Score": 75.19,
|
789 |
+
"Pass rate": 1.0,
|
790 |
+
"Cost($)": 0.1645,
|
791 |
+
"Framework": "",
|
792 |
+
"X-shot": "0.0",
|
793 |
"Samples": 254,
|
794 |
+
"All tokens": 291764,
|
795 |
+
"Total input tokens": 249215,
|
796 |
+
"Average input tokens": 981,
|
797 |
+
"Total output tokens": 42549,
|
798 |
+
"Average output tokens": 168
|
799 |
+
}
|
800 |
+
},
|
801 |
+
"Llama-3.3-70B-Instruct": {
|
802 |
+
"META": {
|
803 |
+
"Algorithm": "PoT",
|
804 |
+
"LLM": "Llama-3.3-70B-Instruct",
|
805 |
+
"Eval Date": "2025/1/22"
|
806 |
+
},
|
807 |
+
"gsm8k": {
|
808 |
+
"Score": 73.08,
|
809 |
+
"Pass rate": 0.796,
|
810 |
+
"Cost($)": 0.9736,
|
811 |
+
"Framework": "",
|
812 |
+
"X-shot": "8.0",
|
813 |
+
"Samples": 1319,
|
814 |
+
"All tokens": 1727044,
|
815 |
+
"Total input tokens": 1126025,
|
816 |
+
"Average input tokens": 854,
|
817 |
+
"Total output tokens": 601019,
|
818 |
+
"Average output tokens": 456
|
819 |
+
},
|
820 |
+
"AQuA": {
|
821 |
+
"Score": 79.52,
|
822 |
+
"Pass rate": 0.9921,
|
823 |
+
"Cost($)": 0.1746,
|
824 |
+
"Framework": "",
|
825 |
+
"X-shot": "0.0",
|
826 |
+
"Samples": 254,
|
827 |
+
"All tokens": 309799,
|
828 |
+
"Total input tokens": 240735,
|
829 |
+
"Average input tokens": 948,
|
830 |
+
"Total output tokens": 69064,
|
831 |
+
"Average output tokens": 272
|
832 |
+
}
|
833 |
+
},
|
834 |
+
"Qwen2.5-7B-Instruct": {
|
835 |
+
"META": {
|
836 |
+
"Algorithm": "PoT",
|
837 |
+
"LLM": "Qwen2.5-7B-Instruct",
|
838 |
+
"Eval Date": "2025/1/22"
|
839 |
+
},
|
840 |
+
"gsm8k": {
|
841 |
+
"Score": 58.83,
|
842 |
+
"Pass rate": 0.705,
|
843 |
+
"Cost($)": 0.0,
|
844 |
+
"Framework": "",
|
845 |
+
"X-shot": "8.0",
|
846 |
+
"Samples": 1319,
|
847 |
+
"All tokens": 1362822,
|
848 |
+
"Total input tokens": 1145390,
|
849 |
+
"Average input tokens": 868,
|
850 |
+
"Total output tokens": 217432,
|
851 |
+
"Average output tokens": 165
|
852 |
+
},
|
853 |
+
"AQuA": {
|
854 |
+
"Score": 68.11,
|
855 |
+
"Pass rate": 1.0,
|
856 |
+
"Cost($)": 0.0,
|
857 |
+
"Framework": "",
|
858 |
+
"X-shot": "0.0",
|
859 |
+
"Samples": 254,
|
860 |
+
"All tokens": 313728,
|
861 |
+
"Total input tokens": 264517,
|
862 |
+
"Average input tokens": 1041,
|
863 |
+
"Total output tokens": 49211,
|
864 |
+
"Average output tokens": 194
|
865 |
+
}
|
866 |
+
},
|
867 |
+
"Llama-3.1-8B-Instruct": {
|
868 |
+
"META": {
|
869 |
+
"Algorithm": "PoT",
|
870 |
+
"LLM": "Llama-3.1-8B-Instruct",
|
871 |
+
"Eval Date": "2025/1/22"
|
872 |
+
},
|
873 |
+
"gsm8k": {
|
874 |
+
"Score": 38.66,
|
875 |
+
"Pass rate": 0.5542,
|
876 |
+
"Cost($)": 0.0,
|
877 |
+
"Framework": "",
|
878 |
+
"X-shot": "8.0",
|
879 |
+
"Samples": 1319,
|
880 |
+
"All tokens": 1391111,
|
881 |
+
"Total input tokens": 1147538,
|
882 |
+
"Average input tokens": 870,
|
883 |
+
"Total output tokens": 243573,
|
884 |
+
"Average output tokens": 185
|
885 |
+
},
|
886 |
+
"AQuA": {
|
887 |
+
"Score": 36.61,
|
888 |
+
"Pass rate": 0.9685,
|
889 |
+
"Cost($)": 0.0,
|
890 |
+
"Framework": "",
|
891 |
+
"X-shot": "0.0",
|
892 |
+
"Samples": 254,
|
893 |
+
"All tokens": 290914,
|
894 |
+
"Total input tokens": 240613,
|
895 |
+
"Average input tokens": 947,
|
896 |
+
"Total output tokens": 50301,
|
897 |
+
"Average output tokens": 198
|
898 |
+
}
|
899 |
+
},
|
900 |
+
"Internllm2_5-7B": {
|
901 |
+
"META": {
|
902 |
+
"Algorithm": "PoT",
|
903 |
+
"LLM": "Internllm2_5-7B",
|
904 |
+
"Eval Date": "2025/1/22"
|
905 |
+
},
|
906 |
+
"gsm8k": {
|
907 |
+
"Score": 38.21,
|
908 |
+
"Pass rate": 0.489,
|
909 |
+
"Cost($)": 0.0,
|
910 |
+
"Framework": "",
|
911 |
+
"X-shot": "8.0",
|
912 |
+
"Samples": 1319,
|
913 |
+
"All tokens": 1324949,
|
914 |
+
"Total input tokens": 1136843,
|
915 |
+
"Average input tokens": 862,
|
916 |
+
"Total output tokens": 188106,
|
917 |
+
"Average output tokens": 143
|
918 |
+
},
|
919 |
+
"AQuA": {
|
920 |
+
"Score": 36.61,
|
921 |
+
"Pass rate": 0.9881,
|
922 |
+
"Cost($)": 0.0,
|
923 |
+
"Framework": "",
|
924 |
+
"X-shot": "0.0",
|
925 |
+
"Samples": 254,
|
926 |
+
"All tokens": 301962,
|
927 |
+
"Total input tokens": 233505,
|
928 |
+
"Average input tokens": 919,
|
929 |
+
"Total output tokens": 68457,
|
930 |
+
"Average output tokens": 270
|
931 |
+
}
|
932 |
+
},
|
933 |
+
"Qwen2-1.5B-Instruct": {
|
934 |
+
"META": {
|
935 |
+
"Algorithm": "PoT",
|
936 |
+
"LLM": "Qwen2-1.5B-Instruct",
|
937 |
+
"Eval Date": "2025/1/22"
|
938 |
+
},
|
939 |
+
"gsm8k": {
|
940 |
+
"Score": 18.49,
|
941 |
+
"Pass rate": 0.31,
|
942 |
+
"Cost($)": 0.0,
|
943 |
+
"Framework": "",
|
944 |
+
"X-shot": "8.0",
|
945 |
+
"Samples": 1319,
|
946 |
+
"All tokens": 1327522,
|
947 |
+
"Total input tokens": 1151528,
|
948 |
+
"Average input tokens": 873,
|
949 |
+
"Total output tokens": 175994,
|
950 |
+
"Average output tokens": 133
|
951 |
+
},
|
952 |
+
"AQuA": {
|
953 |
+
"Score": 30.7,
|
954 |
+
"Pass rate": 0.9645,
|
955 |
+
"Cost($)": 0.0,
|
956 |
+
"Framework": "",
|
957 |
+
"X-shot": "0.0",
|
958 |
+
"Samples": 254,
|
959 |
+
"All tokens": 298475,
|
960 |
+
"Total input tokens": 246560,
|
961 |
+
"Average input tokens": 971,
|
962 |
+
"Total output tokens": 51915,
|
963 |
+
"Average output tokens": 204
|
964 |
+
}
|
965 |
+
},
|
966 |
+
"Qwen2-0.5B-Instruct": {
|
967 |
+
"META": {
|
968 |
+
"Algorithm": "PoT",
|
969 |
+
"LLM": "Qwen2-0.5B-Instruct",
|
970 |
+
"Eval Date": "2025/1/22"
|
971 |
+
},
|
972 |
+
"gsm8k": {
|
973 |
+
"Score": 9.62,
|
974 |
+
"Pass rate": 0.169,
|
975 |
+
"Cost($)": 0.0,
|
976 |
+
"Framework": "",
|
977 |
+
"X-shot": "8.0",
|
978 |
+
"Samples": 1319,
|
979 |
+
"All tokens": 1389135,
|
980 |
+
"Total input tokens": 1151528,
|
981 |
+
"Average input tokens": 873,
|
982 |
+
"Total output tokens": 237607,
|
983 |
+
"Average output tokens": 180
|
984 |
+
},
|
985 |
+
"AQuA": {
|
986 |
+
"Score": 17.32,
|
987 |
+
"Pass rate": 0.9212,
|
988 |
+
"Cost($)": 0.0,
|
989 |
+
"Framework": "",
|
990 |
+
"X-shot": "0.0",
|
991 |
+
"Samples": 254,
|
992 |
+
"All tokens": 322281,
|
993 |
+
"Total input tokens": 258867,
|
994 |
+
"Average input tokens": 1019,
|
995 |
+
"Total output tokens": 63414,
|
996 |
+
"Average output tokens": 250
|
997 |
+
}
|
998 |
+
}
|
999 |
+
},
|
1000 |
+
"CoT": {
|
1001 |
+
"gpt-3.5-turbo": {
|
1002 |
+
"META": {
|
1003 |
+
"Algorithm": "CoT",
|
1004 |
+
"LLM": "gpt-3.5-turbo",
|
1005 |
+
"Eval Date": "2025/1/7"
|
1006 |
+
},
|
1007 |
+
"gsm8k": {
|
1008 |
+
"Score": 78.69,
|
1009 |
+
"Pass rate": 1.0,
|
1010 |
+
"Cost($)": 0.6788,
|
1011 |
+
"Framework": "",
|
1012 |
+
"X-shot": "8.0",
|
1013 |
+
"Samples": 1319,
|
1014 |
+
"All tokens": 1088041,
|
1015 |
+
"Total input tokens": 953242,
|
1016 |
+
"Average input tokens": 723,
|
1017 |
+
"Total output tokens": 134799,
|
1018 |
+
"Average output tokens": 102
|
1019 |
+
},
|
1020 |
+
"AQuA": {
|
1021 |
+
"Score": 61.02,
|
1022 |
+
"Pass rate": 0.937,
|
1023 |
+
"Cost($)": 0.0957,
|
1024 |
+
"Framework": "",
|
1025 |
+
"X-shot": "0.0",
|
1026 |
+
"Samples": 254,
|
1027 |
+
"All tokens": 80793,
|
1028 |
+
"Total input tokens": 25447,
|
1029 |
+
"Average input tokens": 100,
|
1030 |
+
"Total output tokens": 55346,
|
1031 |
+
"Average output tokens": 218
|
1032 |
+
}
|
1033 |
+
},
|
1034 |
+
"Doubao-lite-32k": {
|
1035 |
+
"META": {
|
1036 |
+
"Algorithm": "CoT",
|
1037 |
+
"LLM": "Doubao-lite-32k",
|
1038 |
+
"Eval Date": "2025/1/7"
|
1039 |
+
},
|
1040 |
+
"gsm8k": {
|
1041 |
+
"Score": 89.31,
|
1042 |
+
"Pass rate": 1.0,
|
1043 |
+
"Cost($)": 0.0558,
|
1044 |
+
"Framework": "",
|
1045 |
+
"X-shot": "8.0",
|
1046 |
+
"Samples": 1319,
|
1047 |
+
"All tokens": 1201820,
|
1048 |
+
"Total input tokens": 1042095,
|
1049 |
+
"Average input tokens": 790,
|
1050 |
+
"Total output tokens": 159725,
|
1051 |
+
"Average output tokens": 121
|
1052 |
+
},
|
1053 |
+
"AQuA": {
|
1054 |
+
"Score": 82.67,
|
1055 |
+
"Pass rate": 0.9724,
|
1056 |
+
"Cost($)": 0.0066,
|
1057 |
+
"Framework": "",
|
1058 |
+
"X-shot": "0.0",
|
1059 |
+
"Samples": 254,
|
1060 |
+
"All tokens": 94577,
|
1061 |
+
"Total input tokens": 27978,
|
1062 |
+
"Average input tokens": 110,
|
1063 |
+
"Total output tokens": 66599,
|
1064 |
+
"Average output tokens": 262
|
1065 |
+
}
|
1066 |
+
},
|
1067 |
+
"gpt-4o": {
|
1068 |
+
"META": {
|
1069 |
+
"Algorithm": "CoT",
|
1070 |
+
"LLM": "gpt-4o",
|
1071 |
+
"Eval Date": "2025/1/22"
|
1072 |
+
},
|
1073 |
+
"gsm8k": {
|
1074 |
+
"Score": 94.08,
|
1075 |
+
"Pass rate": 1.0,
|
1076 |
+
"Cost($)": 4.5367,
|
1077 |
+
"Framework": "",
|
1078 |
+
"X-shot": "8.0",
|
1079 |
+
"Samples": 1319,
|
1080 |
+
"All tokens": 1165166,
|
1081 |
+
"Total input tokens": 948668,
|
1082 |
+
"Average input tokens": 719,
|
1083 |
+
"Total output tokens": 216498,
|
1084 |
+
"Average output tokens": 164
|
1085 |
+
},
|
1086 |
+
"AQuA": {
|
1087 |
+
"Score": 82.67,
|
1088 |
+
"Pass rate": 0.9803,
|
1089 |
+
"Cost($)": 1.0417,
|
1090 |
+
"Framework": "",
|
1091 |
+
"X-shot": "0.0",
|
1092 |
+
"Samples": 254,
|
1093 |
+
"All tokens": 123017,
|
1094 |
+
"Total input tokens": 25123,
|
1095 |
+
"Average input tokens": 99,
|
1096 |
+
"Total output tokens": 97894,
|
1097 |
+
"Average output tokens": 385
|
1098 |
+
}
|
1099 |
+
},
|
1100 |
+
"Qwen2.5-72B-Instruct": {
|
1101 |
+
"META": {
|
1102 |
+
"Algorithm": "CoT",
|
1103 |
+
"LLM": "Qwen2.5-72B-Instruct",
|
1104 |
+
"Eval Date": "2025/1/22"
|
1105 |
+
},
|
1106 |
+
"gsm8k": {
|
1107 |
+
"Score": 92.87,
|
1108 |
+
"Pass rate": 1.0,
|
1109 |
+
"Cost($)": 0.7195,
|
1110 |
+
"Framework": "",
|
1111 |
+
"X-shot": "8.0",
|
1112 |
+
"Samples": 1319,
|
1113 |
+
"All tokens": 1276252,
|
1114 |
+
"Total input tokens": 1005119,
|
1115 |
+
"Average input tokens": 762,
|
1116 |
+
"Total output tokens": 271133,
|
1117 |
+
"Average output tokens": 206
|
1118 |
+
},
|
1119 |
+
"AQuA": {
|
1120 |
+
"Score": 86.22,
|
1121 |
+
"Pass rate": 0.9921,
|
1122 |
+
"Cost($)": 0.0808,
|
1123 |
+
"Framework": "",
|
1124 |
+
"X-shot": "0.0",
|
1125 |
+
"Samples": 254,
|
1126 |
+
"All tokens": 143289,
|
1127 |
+
"Total input tokens": 25143,
|
1128 |
+
"Average input tokens": 99,
|
1129 |
+
"Total output tokens": 118146,
|
1130 |
+
"Average output tokens": 465
|
1131 |
+
}
|
1132 |
+
},
|
1133 |
+
"Llama-3.3-70B-Instruct": {
|
1134 |
+
"META": {
|
1135 |
+
"Algorithm": "CoT",
|
1136 |
+
"LLM": "Llama-3.3-70B-Instruct",
|
1137 |
+
"Eval Date": "2025/1/22"
|
1138 |
+
},
|
1139 |
+
"gsm8k": {
|
1140 |
+
"Score": 93.93,
|
1141 |
+
"Pass rate": 1.0,
|
1142 |
+
"Cost($)": 0.687,
|
1143 |
+
"Framework": "",
|
1144 |
+
"X-shot": "8.0",
|
1145 |
+
"Samples": 1319,
|
1146 |
+
"All tokens": 1218665,
|
1147 |
+
"Total input tokens": 990168,
|
1148 |
+
"Average input tokens": 751,
|
1149 |
+
"Total output tokens": 228497,
|
1150 |
+
"Average output tokens": 173
|
1151 |
+
},
|
1152 |
+
"AQuA": {
|
1153 |
+
"Score": 83.46,
|
1154 |
+
"Pass rate": 0.9842,
|
1155 |
+
"Cost($)": 0.0927,
|
1156 |
+
"Framework": "",
|
1157 |
+
"X-shot": "0.0",
|
1158 |
+
"Samples": 254,
|
1159 |
+
"All tokens": 164389,
|
1160 |
+
"Total input tokens": 32555,
|
1161 |
+
"Average input tokens": 128,
|
1162 |
+
"Total output tokens": 131834,
|
1163 |
+
"Average output tokens": 519
|
1164 |
+
}
|
1165 |
+
},
|
1166 |
+
"Qwen2.5-7B-Instruct": {
|
1167 |
+
"META": {
|
1168 |
+
"Algorithm": "CoT",
|
1169 |
+
"LLM": "Qwen2.5-7B-Instruct",
|
1170 |
+
"Eval Date": "2025/1/22"
|
1171 |
+
},
|
1172 |
+
"gsm8k": {
|
1173 |
+
"Score": 85.67,
|
1174 |
+
"Pass rate": 1.0,
|
1175 |
+
"Cost($)": 0.0,
|
1176 |
+
"Framework": "",
|
1177 |
+
"X-shot": "8.0",
|
1178 |
+
"Samples": 1319,
|
1179 |
+
"All tokens": 1290805,
|
1180 |
+
"Total input tokens": 1046008,
|
1181 |
+
"Average input tokens": 793,
|
1182 |
+
"Total output tokens": 244797,
|
1183 |
+
"Average output tokens": 186
|
1184 |
+
},
|
1185 |
+
"AQuA": {
|
1186 |
+
"Score": 80.7,
|
1187 |
+
"Pass rate": 0.996,
|
1188 |
+
"Cost($)": 0.0,
|
1189 |
+
"Framework": "",
|
1190 |
+
"X-shot": "0.0",
|
1191 |
+
"Samples": 254,
|
1192 |
+
"All tokens": 149736,
|
1193 |
+
"Total input tokens": 33017,
|
1194 |
+
"Average input tokens": 130,
|
1195 |
+
"Total output tokens": 116719,
|
1196 |
+
"Average output tokens": 460
|
1197 |
+
}
|
1198 |
+
},
|
1199 |
+
"Llama-3.1-8B-Instruct": {
|
1200 |
+
"META": {
|
1201 |
+
"Algorithm": "CoT",
|
1202 |
+
"LLM": "Llama-3.1-8B-Instruct",
|
1203 |
+
"Eval Date": "2025/1/22"
|
1204 |
+
},
|
1205 |
+
"gsm8k": {
|
1206 |
+
"Score": 75.43,
|
1207 |
+
"Pass rate": 0.9992,
|
1208 |
+
"Cost($)": 0.0,
|
1209 |
+
"Framework": "",
|
1210 |
+
"X-shot": "8.0",
|
1211 |
+
"Samples": 1319,
|
1212 |
+
"All tokens": 1248329,
|
1213 |
+
"Total input tokens": 990168,
|
1214 |
+
"Average input tokens": 751,
|
1215 |
+
"Total output tokens": 258161,
|
1216 |
+
"Average output tokens": 196
|
1217 |
+
},
|
1218 |
+
"AQuA": {
|
1219 |
+
"Score": 60.62,
|
1220 |
+
"Pass rate": 1.0,
|
1221 |
+
"Cost($)": 0.0,
|
1222 |
+
"Framework": "",
|
1223 |
+
"X-shot": "0.0",
|
1224 |
+
"Samples": 254,
|
1225 |
+
"All tokens": 144435,
|
1226 |
+
"Total input tokens": 32555,
|
1227 |
+
"Average input tokens": 128,
|
1228 |
+
"Total output tokens": 111880,
|
1229 |
+
"Average output tokens": 440
|
1230 |
+
}
|
1231 |
+
},
|
1232 |
+
"Internllm2_5-7B": {
|
1233 |
+
"META": {
|
1234 |
+
"Algorithm": "CoT",
|
1235 |
+
"LLM": "Internllm2_5-7B",
|
1236 |
+
"Eval Date": "2025/1/22"
|
1237 |
+
},
|
1238 |
+
"gsm8k": {
|
1239 |
+
"Score": 77.71,
|
1240 |
+
"Pass rate": 0.9969,
|
1241 |
+
"Cost($)": 0.0,
|
1242 |
+
"Framework": "",
|
1243 |
+
"X-shot": "8.0",
|
1244 |
+
"Samples": 1319,
|
1245 |
+
"All tokens": 1202163,
|
1246 |
+
"Total input tokens": 968163,
|
1247 |
+
"Average input tokens": 734,
|
1248 |
+
"Total output tokens": 234000,
|
1249 |
+
"Average output tokens": 177
|
1250 |
+
},
|
1251 |
+
"AQuA": {
|
1252 |
+
"Score": 52.75,
|
1253 |
+
"Pass rate": 0.8937,
|
1254 |
+
"Cost($)": 0.0,
|
1255 |
+
"Framework": "",
|
1256 |
+
"X-shot": "0.0",
|
1257 |
+
"Samples": 254,
|
1258 |
+
"All tokens": 127520,
|
1259 |
+
"Total input tokens": 26610,
|
1260 |
+
"Average input tokens": 105,
|
1261 |
+
"Total output tokens": 100910,
|
1262 |
+
"Average output tokens": 397
|
1263 |
+
}
|
1264 |
+
},
|
1265 |
+
"Qwen2-1.5B-Instruct": {
|
1266 |
+
"META": {
|
1267 |
+
"Algorithm": "CoT",
|
1268 |
+
"LLM": "Qwen2-1.5B-Instruct",
|
1269 |
+
"Eval Date": "2025/1/22"
|
1270 |
+
},
|
1271 |
+
"gsm8k": {
|
1272 |
+
"Score": 55.49,
|
1273 |
+
"Pass rate": 1.0,
|
1274 |
+
"Cost($)": 0.0,
|
1275 |
+
"Framework": "",
|
1276 |
+
"X-shot": "8.0",
|
1277 |
+
"Samples": 1319,
|
1278 |
+
"All tokens": 1218525,
|
1279 |
+
"Total input tokens": 1032818,
|
1280 |
+
"Average input tokens": 783,
|
1281 |
+
"Total output tokens": 185707,
|
1282 |
+
"Average output tokens": 141
|
1283 |
+
},
|
1284 |
+
"AQuA": {
|
1285 |
+
"Score": 40.55,
|
1286 |
+
"Pass rate": 0.9881,
|
1287 |
+
"Cost($)": 0.0,
|
1288 |
+
"Framework": "",
|
1289 |
+
"X-shot": "0.0",
|
1290 |
+
"Samples": 254,
|
1291 |
+
"All tokens": 110040,
|
1292 |
+
"Total input tokens": 30477,
|
1293 |
+
"Average input tokens": 120,
|
1294 |
+
"Total output tokens": 79563,
|
1295 |
+
"Average output tokens": 313
|
1296 |
+
}
|
1297 |
+
},
|
1298 |
+
"Qwen2-0.5B-Instruct": {
|
1299 |
+
"META": {
|
1300 |
+
"Algorithm": "CoT",
|
1301 |
+
"LLM": "Qwen2-0.5B-Instruct",
|
1302 |
+
"Eval Date": "2025/1/22"
|
1303 |
+
},
|
1304 |
+
"gsm8k": {
|
1305 |
+
"Score": 35.93,
|
1306 |
+
"Pass rate": 0.9992,
|
1307 |
+
"Cost($)": 0.0,
|
1308 |
+
"Framework": "",
|
1309 |
+
"X-shot": "8.0",
|
1310 |
+
"Samples": 1319,
|
1311 |
+
"All tokens": 1223459,
|
1312 |
+
"Total input tokens": 1032818,
|
1313 |
+
"Average input tokens": 783,
|
1314 |
+
"Total output tokens": 190641,
|
1315 |
+
"Average output tokens": 145
|
1316 |
+
},
|
1317 |
+
"AQuA": {
|
1318 |
+
"Score": 33.07,
|
1319 |
+
"Pass rate": 0.9881,
|
1320 |
+
"Cost($)": 0.0,
|
1321 |
+
"Framework": "",
|
1322 |
+
"X-shot": "0.0",
|
1323 |
+
"Samples": 254,
|
1324 |
+
"All tokens": 117339,
|
1325 |
+
"Total input tokens": 30477,
|
1326 |
+
"Average input tokens": 120,
|
1327 |
+
"Total output tokens": 86862,
|
1328 |
+
"Average output tokens": 342
|
1329 |
+
}
|
1330 |
+
}
|
1331 |
+
},
|
1332 |
+
"SC-CoT": {
|
1333 |
+
"gpt-3.5-turbo": {
|
1334 |
+
"META": {
|
1335 |
+
"Algorithm": "SC-CoT",
|
1336 |
+
"LLM": "gpt-3.5-turbo",
|
1337 |
+
"Eval Date": "2025/1/7"
|
1338 |
+
},
|
1339 |
+
"gsm8k": {
|
1340 |
+
"Score": 82.56,
|
1341 |
+
"Pass rate": 0.9985,
|
1342 |
+
"Cost($)": 2.6285,
|
1343 |
+
"Framework": "",
|
1344 |
+
"X-shot": "8.0",
|
1345 |
+
"Samples": 1319,
|
1346 |
+
"All tokens": 2560697,
|
1347 |
+
"Total input tokens": 1212520,
|
1348 |
+
"Average input tokens": 919,
|
1349 |
+
"Total output tokens": 1348177,
|
1350 |
+
"Average output tokens": 1022
|
1351 |
+
},
|
1352 |
+
"AQuA": {
|
1353 |
+
"Score": 70.47,
|
1354 |
+
"Pass rate": 0.9882,
|
1355 |
+
"Cost($)": 0.5578,
|
1356 |
+
"Framework": "",
|
1357 |
+
"X-shot": "0.0",
|
1358 |
+
"Samples": 254,
|
1359 |
+
"All tokens": 418617,
|
1360 |
+
"Total input tokens": 70157,
|
1361 |
+
"Average input tokens": 276,
|
1362 |
+
"Total output tokens": 348460,
|
1363 |
+
"Average output tokens": 1372
|
1364 |
+
}
|
1365 |
+
},
|
1366 |
+
"Doubao-lite-32k": {
|
1367 |
+
"META": {
|
1368 |
+
"Algorithm": "SC-CoT",
|
1369 |
+
"LLM": "Doubao-lite-32k",
|
1370 |
+
"Eval Date": "2025/1/7"
|
1371 |
+
},
|
1372 |
+
"gsm8k": {
|
1373 |
+
"Score": 83.7,
|
1374 |
+
"Pass rate": 0.997,
|
1375 |
+
"Cost($)": 0.155,
|
1376 |
+
"Framework": "",
|
1377 |
+
"X-shot": "8.0",
|
1378 |
+
"Samples": 1319,
|
1379 |
+
"All tokens": 2507687,
|
1380 |
+
"Total input tokens": 1230019,
|
1381 |
+
"Average input tokens": 933,
|
1382 |
+
"Total output tokens": 1277668,
|
1383 |
+
"Average output tokens": 969
|
1384 |
+
},
|
1385 |
+
"AQuA": {
|
1386 |
+
"Score": 81.5,
|
1387 |
+
"Pass rate": 0.9764,
|
1388 |
+
"Cost($)": 0.0347,
|
1389 |
+
"Framework": "",
|
1390 |
+
"X-shot": "0.0",
|
1391 |
+
"Samples": 254,
|
1392 |
+
"All tokens": 465846,
|
1393 |
+
"Total input tokens": 83830,
|
1394 |
+
"Average input tokens": 330,
|
1395 |
+
"Total output tokens": 382016,
|
1396 |
+
"Average output tokens": 1504
|
1397 |
+
}
|
1398 |
+
},
|
1399 |
+
"gpt-4o": {
|
1400 |
+
"META": {
|
1401 |
+
"Algorithm": "SC-CoT",
|
1402 |
+
"LLM": "gpt-4o",
|
1403 |
+
"Eval Date": "2025/1/22"
|
1404 |
+
},
|
1405 |
+
"gsm8k": {
|
1406 |
+
"Score": 90.75,
|
1407 |
+
"Pass rate": 1.0,
|
1408 |
+
"Cost($)": 24.2428,
|
1409 |
+
"Framework": "",
|
1410 |
+
"X-shot": "8.0",
|
1411 |
+
"Samples": 1319,
|
1412 |
+
"All tokens": 3300971,
|
1413 |
+
"Total input tokens": 1168927,
|
1414 |
+
"Average input tokens": 886,
|
1415 |
+
"Total output tokens": 2132044,
|
1416 |
+
"Average output tokens": 1616
|
1417 |
+
},
|
1418 |
+
"AQuA": {
|
1419 |
+
"Score": 88.19,
|
1420 |
+
"Pass rate": 1.0,
|
1421 |
+
"Cost($)": 6.2412,
|
1422 |
+
"Framework": "",
|
1423 |
+
"X-shot": "0.0",
|
1424 |
+
"Samples": 254,
|
1425 |
+
"All tokens": 678811,
|
1426 |
+
"Total input tokens": 72916,
|
1427 |
+
"Average input tokens": 287,
|
1428 |
+
"Total output tokens": 605895,
|
1429 |
+
"Average output tokens": 2385
|
1430 |
+
}
|
1431 |
+
},
|
1432 |
+
"Qwen2.5-72B-Instruct": {
|
1433 |
+
"META": {
|
1434 |
+
"Algorithm": "SC-CoT",
|
1435 |
+
"LLM": "Qwen2.5-72B-Instruct",
|
1436 |
+
"Eval Date": "2025/1/22"
|
1437 |
+
},
|
1438 |
+
"gsm8k": {
|
1439 |
+
"Score": 90.67,
|
1440 |
+
"Pass rate": 1.0,
|
1441 |
+
"Cost($)": 4.2651,
|
1442 |
+
"Framework": "",
|
1443 |
+
"X-shot": "8.0",
|
1444 |
+
"Samples": 1319,
|
1445 |
+
"All tokens": 7565637,
|
1446 |
+
"Total input tokens": 5292383,
|
1447 |
+
"Average input tokens": 4012,
|
1448 |
+
"Total output tokens": 2273254,
|
1449 |
+
"Average output tokens": 1723
|
1450 |
+
},
|
1451 |
+
"AQuA": {
|
1452 |
+
"Score": 85.82,
|
1453 |
+
"Pass rate": 0.9842,
|
1454 |
+
"Cost($)": 0.5576,
|
1455 |
+
"Framework": "",
|
1456 |
+
"X-shot": "0.0",
|
1457 |
+
"Samples": 254,
|
1458 |
+
"All tokens": 989058,
|
1459 |
+
"Total input tokens": 241149,
|
1460 |
+
"Average input tokens": 949,
|
1461 |
+
"Total output tokens": 747909,
|
1462 |
+
"Average output tokens": 2945
|
1463 |
+
}
|
1464 |
+
},
|
1465 |
+
"Llama-3.3-70B-Instruct": {
|
1466 |
+
"META": {
|
1467 |
+
"Algorithm": "SC-CoT",
|
1468 |
+
"LLM": "Llama-3.3-70B-Instruct",
|
1469 |
+
"Eval Date": "2025/1/22"
|
1470 |
+
},
|
1471 |
+
"gsm8k": {
|
1472 |
+
"Score": 95.45,
|
1473 |
+
"Pass rate": 1.0,
|
1474 |
+
"Cost($)": 4.5021,
|
1475 |
+
"Framework": "",
|
1476 |
+
"X-shot": "8.0",
|
1477 |
+
"Samples": 1319,
|
1478 |
+
"All tokens": 7985996,
|
1479 |
+
"Total input tokens": 5406763,
|
1480 |
+
"Average input tokens": 4099,
|
1481 |
+
"Total output tokens": 2579233,
|
1482 |
+
"Average output tokens": 1955
|
1483 |
+
},
|
1484 |
+
"AQuA": {
|
1485 |
+
"Score": 86.61,
|
1486 |
+
"Pass rate": 0.9921,
|
1487 |
+
"Cost($)": 0.5847,
|
1488 |
+
"Framework": "",
|
1489 |
+
"X-shot": "0.0",
|
1490 |
+
"Samples": 254,
|
1491 |
+
"All tokens": 1037124,
|
1492 |
+
"Total input tokens": 283248,
|
1493 |
+
"Average input tokens": 1115,
|
1494 |
+
"Total output tokens": 753876,
|
1495 |
+
"Average output tokens": 2968
|
1496 |
+
}
|
1497 |
+
},
|
1498 |
+
"Qwen2.5-7B-Instruct": {
|
1499 |
+
"META": {
|
1500 |
+
"Algorithm": "SC-CoT",
|
1501 |
+
"LLM": "Qwen2.5-7B-Instruct",
|
1502 |
+
"Eval Date": "2025/1/22"
|
1503 |
+
},
|
1504 |
+
"gsm8k": {
|
1505 |
+
"Score": 88.32,
|
1506 |
+
"Pass rate": 0.9984,
|
1507 |
+
"Cost($)": 0.0,
|
1508 |
+
"Framework": "",
|
1509 |
+
"X-shot": "8.0",
|
1510 |
+
"Samples": 1319,
|
1511 |
+
"All tokens": 8173818,
|
1512 |
+
"Total input tokens": 5668252,
|
1513 |
+
"Average input tokens": 4297,
|
1514 |
+
"Total output tokens": 2505566,
|
1515 |
+
"Average output tokens": 1900
|
1516 |
+
},
|
1517 |
+
"AQuA": {
|
1518 |
+
"Score": 81.49,
|
1519 |
+
"Pass rate": 1.0,
|
1520 |
+
"Cost($)": 0.0,
|
1521 |
+
"Framework": "",
|
1522 |
+
"X-shot": "0.0",
|
1523 |
+
"Samples": 254,
|
1524 |
+
"All tokens": 1015368,
|
1525 |
+
"Total input tokens": 278848,
|
1526 |
+
"Average input tokens": 1098,
|
1527 |
+
"Total output tokens": 736520,
|
1528 |
+
"Average output tokens": 2900
|
1529 |
+
}
|
1530 |
+
},
|
1531 |
+
"Llama-3.1-8B-Instruct": {
|
1532 |
+
"META": {
|
1533 |
+
"Algorithm": "SC-CoT",
|
1534 |
+
"LLM": "Llama-3.1-8B-Instruct",
|
1535 |
+
"Eval Date": "2025/1/22"
|
1536 |
+
},
|
1537 |
+
"gsm8k": {
|
1538 |
+
"Score": 75.2,
|
1539 |
+
"Pass rate": 0.9954,
|
1540 |
+
"Cost($)": 0.0,
|
1541 |
+
"Framework": "",
|
1542 |
+
"X-shot": "8.0",
|
1543 |
+
"Samples": 1319,
|
1544 |
+
"All tokens": 8444203,
|
1545 |
+
"Total input tokens": 5334657,
|
1546 |
+
"Average input tokens": 4044,
|
1547 |
+
"Total output tokens": 3109546,
|
1548 |
+
"Average output tokens": 2358
|
1549 |
+
},
|
1550 |
+
"AQuA": {
|
1551 |
+
"Score": 53.14,
|
1552 |
+
"Pass rate": 0.9606,
|
1553 |
+
"Cost($)": 0.0,
|
1554 |
+
"Framework": "",
|
1555 |
+
"X-shot": "0.0",
|
1556 |
+
"Samples": 254,
|
1557 |
+
"All tokens": 1041346,
|
1558 |
+
"Total input tokens": 372968,
|
1559 |
+
"Average input tokens": 1468,
|
1560 |
+
"Total output tokens": 668378,
|
1561 |
+
"Average output tokens": 2631
|
1562 |
+
}
|
1563 |
+
},
|
1564 |
+
"Internllm2_5-7B": {
|
1565 |
+
"META": {
|
1566 |
+
"Algorithm": "SC-CoT",
|
1567 |
+
"LLM": "Internllm2_5-7B",
|
1568 |
+
"Eval Date": "2025/1/22"
|
1569 |
+
},
|
1570 |
+
"gsm8k": {
|
1571 |
+
"Score": 41.39,
|
1572 |
+
"Pass rate": 0.9825,
|
1573 |
+
"Cost($)": 0.0,
|
1574 |
+
"Framework": "",
|
1575 |
+
"X-shot": "8.0",
|
1576 |
+
"Samples": 1319,
|
1577 |
+
"All tokens": 10024857,
|
1578 |
+
"Total input tokens": 6674518,
|
1579 |
+
"Average input tokens": 5060,
|
1580 |
+
"Total output tokens": 3350339,
|
1581 |
+
"Average output tokens": 2540
|
1582 |
+
},
|
1583 |
+
"AQuA": {
|
1584 |
+
"Score": 35.85,
|
1585 |
+
"Pass rate": 0.988,
|
1586 |
+
"Cost($)": 0.0,
|
1587 |
+
"Framework": "",
|
1588 |
+
"X-shot": "0.0",
|
1589 |
+
"Samples": 254,
|
1590 |
+
"All tokens": 1240388,
|
1591 |
+
"Total input tokens": 530701,
|
1592 |
+
"Average input tokens": 2089,
|
1593 |
+
"Total output tokens": 709687,
|
1594 |
+
"Average output tokens": 2794
|
1595 |
+
}
|
1596 |
+
},
|
1597 |
+
"Qwen2-1.5B-Instruct": {
|
1598 |
+
"META": {
|
1599 |
+
"Algorithm": "SC-CoT",
|
1600 |
+
"LLM": "Qwen2-1.5B-Instruct",
|
1601 |
+
"Eval Date": "2025/1/22"
|
1602 |
+
},
|
1603 |
+
"gsm8k": {
|
1604 |
+
"Score": 5.53,
|
1605 |
+
"Pass rate": 0.8673,
|
1606 |
+
"Cost($)": 0.0,
|
1607 |
+
"Framework": "",
|
1608 |
+
"X-shot": "8.0",
|
1609 |
+
"Samples": 1319,
|
1610 |
+
"All tokens": 8961768,
|
1611 |
+
"Total input tokens": 5844218,
|
1612 |
+
"Average input tokens": 4431,
|
1613 |
+
"Total output tokens": 3117550,
|
1614 |
+
"Average output tokens": 2364
|
1615 |
+
},
|
1616 |
+
"AQuA": {
|
1617 |
+
"Score": 30.31,
|
1618 |
+
"Pass rate": 0.9724,
|
1619 |
+
"Cost($)": 0.0,
|
1620 |
+
"Framework": "",
|
1621 |
+
"X-shot": "0.0",
|
1622 |
+
"Samples": 254,
|
1623 |
+
"All tokens": 1157076,
|
1624 |
+
"Total input tokens": 430703,
|
1625 |
+
"Average input tokens": 1696,
|
1626 |
+
"Total output tokens": 726373,
|
1627 |
+
"Average output tokens": 2860
|
1628 |
+
}
|
1629 |
+
},
|
1630 |
+
"Qwen2-0.5B-Instruct": {
|
1631 |
+
"META": {
|
1632 |
+
"Algorithm": "SC-CoT",
|
1633 |
+
"LLM": "Qwen2-0.5B-Instruct",
|
1634 |
+
"Eval Date": "2025/1/22"
|
1635 |
+
},
|
1636 |
+
"gsm8k": {
|
1637 |
+
"Score": 3.79,
|
1638 |
+
"Pass rate": 0.9484,
|
1639 |
+
"Cost($)": 0.0,
|
1640 |
+
"Framework": "",
|
1641 |
+
"X-shot": "8.0",
|
1642 |
+
"Samples": 1319,
|
1643 |
+
"All tokens": 10533815,
|
1644 |
+
"Total input tokens": 6529832,
|
1645 |
+
"Average input tokens": 4951,
|
1646 |
+
"Total output tokens": 4003983,
|
1647 |
+
"Average output tokens": 3036
|
1648 |
+
},
|
1649 |
+
"AQuA": {
|
1650 |
+
"Score": 30.7,
|
1651 |
+
"Pass rate": 0.9842,
|
1652 |
+
"Cost($)": 0.0,
|
1653 |
+
"Framework": "",
|
1654 |
+
"X-shot": "0.0",
|
1655 |
+
"Samples": 254,
|
1656 |
+
"All tokens": 1225539,
|
1657 |
+
"Total input tokens": 496206,
|
1658 |
+
"Average input tokens": 1954,
|
1659 |
+
"Total output tokens": 729333,
|
1660 |
+
"Average output tokens": 2871
|
1661 |
}
|
1662 |
}
|
1663 |
}
|
1664 |
}
|
1665 |
+
}
|
src/detail_results.csv
ADDED
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Rank,Algorithm,Dataset,LLM,Eval Date,Score,Pass rate,X-shot,Cost($),Framework,Samples,All tokens,Total input tokens,Average input tokens,Total output tokens,Average output tokens
|
2 |
+
1,SC-CoT,AQuA,gpt-4o,2025/1/22,88.19,1.0,0.0,6.2412,,254,678811,72916,287,605895,2385
|
3 |
+
2,SC-CoT,AQuA,Llama-3.3-70B-Instruct,2025/1/22,86.61,0.9921,0.0,0.5847,,254,1037124,283248,1115,753876,2968
|
4 |
+
3,CoT,AQuA,Qwen2.5-72B-Instruct,2025/1/22,86.22,0.9921,0.0,0.0808,,254,143289,25143,99,118146,465
|
5 |
+
4,SC-CoT,AQuA,Qwen2.5-72B-Instruct,2025/1/22,85.82,0.9842,0.0,0.5576,,254,989058,241149,949,747909,2945
|
6 |
+
5,IO,AQuA,Qwen2.5-72B-Instruct,2025/1/22,84.25,0.996,0.0,0.0742,,254,131604,25397,100,106207,418
|
7 |
+
6,CoT,AQuA,Llama-3.3-70B-Instruct,2025/1/22,83.46,0.9842,0.0,0.0927,,254,164389,32555,128,131834,519
|
8 |
+
7,IO,AQuA,Llama-3.3-70B-Instruct,2025/1/22,82.67,0.9921,0.0,0.0798,,254,141567,32809,129,108758,428
|
9 |
+
8,CoT,AQuA,Doubao-lite-32k,2025/1/7,82.67,0.9724,0.0,0.0066,,254,94577,27978,110,66599,262
|
10 |
+
9,CoT,AQuA,gpt-4o,2025/1/22,82.67,0.9803,0.0,1.0417,,254,123017,25123,99,97894,385
|
11 |
+
10,SC-CoT,AQuA,Doubao-lite-32k,2025/1/7,81.5,0.9764,0.0,0.0347,,254,465846,83830,330,382016,1504
|
12 |
+
11,SC-CoT,AQuA,Qwen2.5-7B-Instruct,2025/1/22,81.49,1.0,0.0,0.0,,254,1015368,278848,1098,736520,2900
|
13 |
+
12,CoT,AQuA,Qwen2.5-7B-Instruct,2025/1/22,80.7,0.996,0.0,0.0,,254,149736,33017,130,116719,460
|
14 |
+
13,PoT,AQuA,Llama-3.3-70B-Instruct,2025/1/22,79.52,0.9921,0.0,0.1746,,254,309799,240735,948,69064,272
|
15 |
+
14,IO,AQuA,Doubao-lite-32k,2025/1/7,79.13,1.0,0.0,0.0058,,254,87742,33058,130,54684,215
|
16 |
+
15,ReAct-Pro*,AQuA,Llama-3.3-70B-Instruct,2025/1/22,79.13,0.996,0.0,0.768,,254,1362379,1119143,4406,243236,958
|
17 |
+
16,IO,AQuA,Qwen2.5-7B-Instruct,2025/1/22,78.74,0.9842,0.0,0.0,,254,137771,33271,131,104500,411
|
18 |
+
17,ReAct-Pro*,AQuA,Doubao-lite-32k,2025/1/7,77.55,0.9606,0.0,0.0445,,254,1032841,977890,3850,54951,216
|
19 |
+
18,IO,AQuA,gpt-4o,2025/1/22,75.59,0.9724,0.0,1.1453,,254,133752,25631,101,108121,426
|
20 |
+
19,PoT,AQuA,gpt-4o,2025/1/22,75.19,1.0,0.0,1.6087,,254,327908,222717,877,105191,414
|
21 |
+
20,PoT,AQuA,Qwen2.5-72B-Instruct,2025/1/22,75.19,1.0,0.0,0.1645,,254,291764,249215,981,42549,168
|
22 |
+
21,ReAct-Pro*,AQuA,Qwen2.5-7B-Instruct,2025/1/22,74.4,0.9921,0.0,0.0,,254,695844,564165,2221,131679,518
|
23 |
+
22,ReAct-Pro*,AQuA,Qwen2.5-72B-Instruct,2025/1/22,73.22,1.0,0.0,0.3177,,254,563603,441765,1739,121838,480
|
24 |
+
23,PoT,AQuA,Doubao-lite-32k,2025/1/7,71.65,0.9685,0.0,0.0147,,254,309436,259863,1023,49573,195
|
25 |
+
24,SC-CoT,AQuA,gpt-3.5-turbo,2025/1/7,70.47,0.9882,0.0,0.5578,,254,418617,70157,276,348460,1372
|
26 |
+
25,PoT,AQuA,Qwen2.5-7B-Instruct,2025/1/22,68.11,1.0,0.0,0.0,,254,313728,264517,1041,49211,194
|
27 |
+
26,ReAct-Pro*,AQuA,gpt-3.5-turbo,2025/1/7,64.56,0.9803,0.0,0.4928,,254,903587,862614,3396,40973,161
|
28 |
+
27,CoT,AQuA,gpt-3.5-turbo,2025/1/7,61.02,0.937,0.0,0.0957,,254,80793,25447,100,55346,218
|
29 |
+
28,CoT,AQuA,Llama-3.1-8B-Instruct,2025/1/22,60.62,1.0,0.0,0.0,,254,144435,32555,128,111880,440
|
30 |
+
29,PoT,AQuA,gpt-3.5-turbo,2025/1/7,59.44,1.0,0.0,0.1748,,254,266654,225162,886,41492,163
|
31 |
+
30,ReAct-Pro*,AQuA,gpt-4o,2025/1/22,57.48,0.9724,0.0,2.304,,254,692096,615589,2424,76507,301
|
32 |
+
31,ReAct-Pro*,AQuA,Llama-3.1-8B-Instruct,2025/1/22,55.51,0.9685,0.0,0.0,,254,4340821,3764723,14822,576098,2268
|
33 |
+
32,SC-CoT,AQuA,Llama-3.1-8B-Instruct,2025/1/22,53.14,0.9606,0.0,0.0,,254,1041346,372968,1468,668378,2631
|
34 |
+
33,CoT,AQuA,Internllm2_5-7B,2025/1/22,52.75,0.8937,0.0,0.0,,254,127520,26610,105,100910,397
|
35 |
+
34,IO,AQuA,Llama-3.1-8B-Instruct,2025/1/22,51.18,0.9881,0.0,0.0,,254,133106,26459,104,106647,420
|
36 |
+
35,IO,AQuA,Internllm2_5-7B,2025/1/22,47.63,0.9094,0.0,0.0,,254,185041,50232,198,134809,531
|
37 |
+
36,ReAct-Pro*,AQuA,Internllm2_5-7B,2025/1/22,40.94,0.9685,0.0,0.0,,254,4428801,3592039,14142,836762,3294
|
38 |
+
37,CoT,AQuA,Qwen2-1.5B-Instruct,2025/1/22,40.55,0.9881,0.0,0.0,,254,110040,30477,120,79563,313
|
39 |
+
38,IO,AQuA,gpt-3.5-turbo,2025/1/7,38.97,1.0,0.0,0.038,,254,42471,25701,101,16770,66
|
40 |
+
39,PoT,AQuA,Llama-3.1-8B-Instruct,2025/1/22,36.61,0.9685,0.0,0.0,,254,290914,240613,947,50301,198
|
41 |
+
40,PoT,AQuA,Internllm2_5-7B,2025/1/22,36.61,0.9881,0.0,0.0,,254,301962,233505,919,68457,270
|
42 |
+
41,SC-CoT,AQuA,Internllm2_5-7B,2025/1/22,35.85,0.988,0.0,0.0,,254,1240388,530701,2089,709687,2794
|
43 |
+
42,CoT,AQuA,Qwen2-0.5B-Instruct,2025/1/22,33.07,0.9881,0.0,0.0,,254,117339,30477,120,86862,342
|
44 |
+
43,PoT,AQuA,Qwen2-1.5B-Instruct,2025/1/22,30.7,0.9645,0.0,0.0,,254,298475,246560,971,51915,204
|
45 |
+
44,SC-CoT,AQuA,Qwen2-0.5B-Instruct,2025/1/22,30.7,0.9842,0.0,0.0,,254,1225539,496206,1954,729333,2871
|
46 |
+
45,SC-CoT,AQuA,Qwen2-1.5B-Instruct,2025/1/22,30.31,0.9724,0.0,0.0,,254,1157076,430703,1696,726373,2860
|
47 |
+
46,IO,AQuA,Qwen2-1.5B-Instruct,2025/1/22,29.13,0.9763,0.0,0.0,,254,71047,27937,110,43110,170
|
48 |
+
47,IO,AQuA,Qwen2-0.5B-Instruct,2025/1/22,27.16,0.9881,0.0,0.0,,254,110415,27937,110,82478,325
|
49 |
+
48,ReAct-Pro*,AQuA,Qwen2-1.5B-Instruct,2025/1/22,25.59,0.9606,0.0,0.0,,254,5072004,4555858,17936,516146,2032
|
50 |
+
49,ReAct-Pro*,AQuA,Qwen2-0.5B-Instruct,2025/1/22,24.01,0.9685,0.0,0.0,,254,7170087,6344167,24977,825920,3252
|
51 |
+
50,PoT,AQuA,Qwen2-0.5B-Instruct,2025/1/22,17.32,0.9212,0.0,0.0,,254,322281,258867,1019,63414,250
|
52 |
+
1,SC-CoT,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,95.45,1.0,8.0,4.5021,,1319,7985996,5406763,4099,2579233,1955
|
53 |
+
2,CoT,gsm8k,gpt-4o,2025/1/22,94.08,1.0,8.0,4.5367,,1319,1165166,948668,719,216498,164
|
54 |
+
3,CoT,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,93.93,1.0,8.0,0.687,,1319,1218665,990168,751,228497,173
|
55 |
+
4,PoT,gsm8k,gpt-4o,2025/1/22,93.1,0.9977,8.0,4.2166,,1319,1247912,1101672,835,146240,111
|
56 |
+
5,CoT,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,92.87,1.0,8.0,0.7195,,1319,1276252,1005119,762,271133,206
|
57 |
+
6,PoT,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,92.34,0.9939,8.0,0.7054,,1319,1251210,1106682,839,144528,110
|
58 |
+
7,IO,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,92.26,1.0,8.0,0.4709,,1319,835275,583916,443,251359,191
|
59 |
+
8,SC-CoT,gsm8k,gpt-4o,2025/1/22,90.75,1.0,8.0,24.2428,,1319,3300971,1168927,886,2132044,1616
|
60 |
+
9,SC-CoT,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,90.67,1.0,8.0,4.2651,,1319,7565637,5292383,4012,2273254,1723
|
61 |
+
10,CoT,gsm8k,Doubao-lite-32k,2025/1/7,89.31,1.0,8.0,0.0558,,1319,1201820,1042095,790,159725,121
|
62 |
+
11,IO,gsm8k,gpt-4o,2025/1/22,88.4,1.0,8.0,3.3463,,1319,741446,542416,411,199030,151
|
63 |
+
12,SC-CoT,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,88.32,0.9984,8.0,0.0,,1319,8173818,5668252,4297,2505566,1900
|
64 |
+
13,ReAct-Pro*,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,87.64,0.9992,,10.1124,,1319,17937864,17038928,12918,898936,682
|
65 |
+
14,ReAct-Pro*,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,87.26,1.0,8.0,10.5479,,1319,18710437,18160983,13769,549454,417
|
66 |
+
15,IO,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,86.58,1.0,8.0,0.4899,,1319,869060,555340,421,313720,238
|
67 |
+
16,CoT,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,85.67,1.0,8.0,0.0,,1319,1290805,1046008,793,244797,186
|
68 |
+
17,ReAct-Pro*,gsm8k,Doubao-lite-32k,2025/1/7,85.59,0.9962,8.0,0.2512,,1319,5998639,5862016,4444,136623,104
|
69 |
+
18,SC-CoT,gsm8k,Doubao-lite-32k,2025/1/7,83.7,0.997,8.0,0.155,,1319,2507687,1230019,933,1277668,969
|
70 |
+
19,ReAct-Pro*,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,82.86,1.0,8.0,0.0,,1319,14850914,14355752,10884,495162,375
|
71 |
+
20,SC-CoT,gsm8k,gpt-3.5-turbo,2025/1/7,82.56,0.9985,8.0,2.6285,,1319,2560697,1212520,919,1348177,1022
|
72 |
+
21,PoT,gsm8k,Doubao-lite-32k,2025/1/7,79.6,0.9257,8.0,0.0576,,1319,1288055,1170038,887,118017,89
|
73 |
+
22,CoT,gsm8k,gpt-3.5-turbo,2025/1/7,78.69,1.0,8.0,0.6788,,1319,1088041,953242,723,134799,102
|
74 |
+
23,CoT,gsm8k,Internllm2_5-7B,2025/1/22,77.71,0.9969,8.0,0.0,,1319,1202163,968163,734,234000,177
|
75 |
+
24,PoT,gsm8k,gpt-3.5-turbo,2025/1/7,76.87,0.9924,8.0,0.6902,,1319,1187080,1090418,827,96662,73
|
76 |
+
25,CoT,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,75.43,0.9992,8.0,0.0,,1319,1248329,990168,751,258161,196
|
77 |
+
26,SC-CoT,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,75.2,0.9954,8.0,0.0,,1319,8444203,5334657,4044,3109546,2358
|
78 |
+
27,ReAct-Pro*,gsm8k,gpt-3.5-turbo,2025/1/7,74.9,0.9939,8.0,3.4633,,1319,6646286,6506164,4933,140122,106
|
79 |
+
28,PoT,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,73.08,0.796,8.0,0.9736,,1319,1727044,1126025,854,601019,456
|
80 |
+
29,IO,gsm8k,Doubao-lite-32k,2025/1/7,72.02,0.9992,8.0,0.0354,,1319,740483,617377,468,123106,93
|
81 |
+
30,ReAct-Pro*,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,67.77,0.9855,8.0,0.0,,1319,22835767,21044978,15955,1790789,1358
|
82 |
+
31,ReAct-Pro*,gsm8k,gpt-4o,2025/1/22,63.3,0.9954,8.0,39.0751,,1319,14715887,14411173,10926,304714,231
|
83 |
+
32,PoT,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,58.83,0.705,8.0,0.0,,1319,1362822,1145390,868,217432,165
|
84 |
+
33,IO,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,57.24,1.0,8.0,0.0,,1319,887913,596229,452,291684,221
|
85 |
+
34,IO,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,57.16,0.9954,8.0,0.0,,1319,1745429,550941,418,1194488,906
|
86 |
+
35,CoT,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,55.49,1.0,8.0,0.0,,1319,1218525,1032818,783,185707,141
|
87 |
+
36,SC-CoT,gsm8k,Internllm2_5-7B,2025/1/22,41.39,0.9825,8.0,0.0,,1319,10024857,6674518,5060,3350339,2540
|
88 |
+
37,PoT,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,38.66,0.5542,8.0,0.0,,1319,1391111,1147538,870,243573,185
|
89 |
+
38,PoT,gsm8k,Internllm2_5-7B,2025/1/22,38.21,0.489,8.0,0.0,,1319,1324949,1136843,862,188106,143
|
90 |
+
39,IO,gsm8k,gpt-3.5-turbo,2025/1/7,37.83,0.9992,8.0,0.3328,,1319,586553,546990,415,39563,30
|
91 |
+
40,CoT,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,35.93,0.9992,8.0,0.0,,1319,1223459,1032818,783,190641,145
|
92 |
+
41,ReAct-Pro*,gsm8k,Internllm2_5-7B,2025/1/22,33.51,0.9795,,0.0,,1319,35669989,30120070,22836,5549919,4208
|
93 |
+
42,ReAct-Pro*,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,24.86,0.8021,8.0,0.0,,1319,9828001,9133603,6925,694398,526
|
94 |
+
43,PoT,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,18.49,0.31,8.0,0.0,,1319,1327522,1151528,873,175994,133
|
95 |
+
44,IO,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,16.67,1.0,8.0,0.0,,1319,736996,568530,431,168466,128
|
96 |
+
45,IO,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,14.7,1.0,8.0,0.0,,1319,834897,568116,431,266781,202
|
97 |
+
46,IO,gsm8k,Internllm2_5-7B,2025/1/22,11.59,0.9795,8.0,0.0,,1319,1113728,679302,515,434426,329
|
98 |
+
47,PoT,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,9.62,0.169,8.0,0.0,,1319,1389135,1151528,873,237607,180
|
99 |
+
48,ReAct-Pro*,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,7.65,0.9522,8.0,0.0,,1319,55392611,52431343,39751,2961268,2245
|
100 |
+
49,SC-CoT,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,5.53,0.8673,8.0,0.0,,1319,8961768,5844218,4431,3117550,2364
|
101 |
+
50,SC-CoT,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,3.79,0.9484,8.0,0.0,,1319,10533815,6529832,4951,4003983,3036
|
src/overall_filtered_results.csv
ADDED
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Rank,Algorithm,LLM,Eval Date,Avg Score,gsm8k-Score,gsm8k-Cost($),AQuA-Score,AQuA-Cost($)
|
2 |
+
1.0,SC-CoT,Llama-3.3-70B-Instruct,2025/1/22,91.03,95.45,4.5021,86.61,0.5847
|
3 |
+
2.0,CoT,Qwen2.5-72B-Instruct,2025/1/22,89.55,92.87,0.7195,86.22,0.0808
|
4 |
+
3.0,CoT,Llama-3.3-70B-Instruct,2025/1/22,88.70,93.93,0.687,83.46,0.0927
|
5 |
+
4.0,CoT,gpt-4o,2025/1/22,88.38,94.08,4.5367,82.67,1.0417
|
6 |
+
5.0,SC-CoT,Qwen2.5-72B-Instruct,2025/1/22,88.25,90.67,4.2651,85.82,0.5576
|
7 |
+
6.0,SC-CoT,gpt-4o,2025/1/22,88.24,91.05,35.8006,85.43,6.3449
|
8 |
+
7.0,IO,Llama-3.3-70B-Instruct,2025/1/22,87.47,92.26,0.4709,82.67,0.0798
|
9 |
+
8.0,SC-CoT,Doubao-lite-32k,2025/1/7,86.04,88.62,0.1532,83.46,0.0409
|
10 |
+
9.0,CoT,Doubao-lite-32k,2025/1/7,85.99,89.31,0.0558,82.67,0.0066
|
11 |
+
10.0,IO,Qwen2.5-72B-Instruct,2025/1/22,85.42,86.58,0.4899,84.25,0.0742
|
12 |
+
11.0,SC-CoT,Qwen2.5-7B-Instruct,2025/1/22,84.91,88.32,0.0,81.49,0.0
|
13 |
+
12.0,PoT,gpt-4o,2025/1/22,84.15,93.1,4.2166,75.19,1.6087
|
14 |
+
13.0,PoT,Qwen2.5-72B-Instruct,2025/1/22,83.77,92.34,0.7054,75.19,0.1645
|
15 |
+
14.0,ReAct-Pro*,Llama-3.3-70B-Instruct,2025/1/22,83.39,87.64,10.1124,79.13,0.768
|
16 |
+
15.0,CoT,Qwen2.5-7B-Instruct,2025/1/22,83.19,85.67,0.0,80.7,0.0
|
17 |
+
16.0,IO,gpt-4o,2025/1/22,82.00,88.4,3.3463,75.59,1.1453
|
18 |
+
17.0,ReAct-Pro*,Doubao-lite-32k,2025/1/7,81.57,85.59,0.2512,77.55,0.0445
|
19 |
+
18.0,ReAct-Pro*,Qwen2.5-72B-Instruct,2025/1/22,80.24,87.26,10.5479,73.22,0.3177
|
20 |
+
19.0,ReAct-Pro*,Qwen2.5-7B-Instruct,2025/1/22,78.63,82.86,0.0,74.4,0.0
|
21 |
+
20.0,PoT,Llama-3.3-70B-Instruct,2025/1/22,76.30,73.08,0.9736,79.52,0.1746
|
22 |
+
21.0,PoT,Doubao-lite-32k,2025/1/7,75.63,79.6,0.0576,71.65,0.0147
|
23 |
+
22.0,IO,Doubao-lite-32k,2025/1/7,75.58,72.02,0.0354,79.13,0.0058
|
24 |
+
23.0,SC-CoT,gpt-3.5-turbo,2025/1/7,73.69,80.06,5.0227,67.32,0.6491
|
25 |
+
24.0,ReAct-Pro*,gpt-3.5-turbo,2025/1/7,69.73,74.9,3.4633,64.56,0.4928
|
26 |
+
25.0,PoT,gpt-3.5-turbo,2025/1/7,68.16,76.87,0.6902,59.44,0.1748
|
27 |
+
26.0,CoT,Llama-3.1-8B-Instruct,2025/1/22,68.03,75.43,0.0,60.62,0.0
|
28 |
+
27.0,IO,Qwen2.5-7B-Instruct,2025/1/22,67.99,57.24,0.0,78.74,0.0
|
29 |
+
28.0,CoT,Internllm2_5-7B,2025/1/22,65.23,77.71,0.0,52.75,0.0
|
30 |
+
29.0,SC-CoT,Llama-3.1-8B-Instruct,2025/1/22,64.17,75.2,0.0,53.14,0.0
|
31 |
+
30.0,PoT,Qwen2.5-7B-Instruct,2025/1/22,63.47,58.83,0.0,68.11,0.0
|
32 |
+
31.0,ReAct-Pro*,Llama-3.1-8B-Instruct,2025/1/22,61.64,67.77,0.0,55.51,0.0
|
33 |
+
32.0,ReAct-Pro*,gpt-4o,2025/1/22,60.39,63.3,39.0751,57.48,2.304
|
34 |
+
33.0,IO,Llama-3.1-8B-Instruct,2025/1/22,54.17,57.16,0.0,51.18,0.0
|
35 |
+
34.0,CoT,Qwen2-1.5B-Instruct,2025/1/22,48.02,55.49,0.0,40.55,0.0
|
36 |
+
35.0,CoT,gpt-3.5-turbo,2025/1/7,39.35,78.69,0.6788,0.0,0.0
|
37 |
+
36.0,SC-CoT,Internllm2_5-7B,2025/1/22,38.62,41.39,0.0,35.85,0.0
|
38 |
+
37.0,IO,gpt-3.5-turbo,2025/1/7,38.40,37.83,0.3328,38.97,0.038
|
39 |
+
38.0,PoT,Llama-3.1-8B-Instruct,2025/1/22,37.64,38.66,0.0,36.61,0.0
|
40 |
+
39.0,PoT,Internllm2_5-7B,2025/1/22,37.41,38.21,0.0,36.61,0.0
|
41 |
+
40.0,ReAct-Pro*,Internllm2_5-7B,2025/1/22,37.23,33.51,0.0,40.94,0.0
|
42 |
+
41.0,CoT,Qwen2-0.5B-Instruct,2025/1/22,34.50,35.93,0.0,33.07,0.0
|
43 |
+
42.0,IO,Internllm2_5-7B,2025/1/22,29.61,11.59,0.0,47.63,0.0
|
44 |
+
43.0,ReAct-Pro*,Qwen2-1.5B-Instruct,2025/1/22,25.23,24.86,0.0,25.59,0.0
|
45 |
+
44.0,PoT,Qwen2-1.5B-Instruct,2025/1/22,24.60,18.49,0.0,30.7,0.0
|
46 |
+
45.0,IO,Qwen2-1.5B-Instruct,2025/1/22,22.90,16.67,0.0,29.13,0.0
|
47 |
+
46.0,IO,Qwen2-0.5B-Instruct,2025/1/22,20.93,14.7,0.0,27.16,0.0
|
48 |
+
47.0,SC-CoT,Qwen2-1.5B-Instruct,2025/1/22,17.92,5.53,0.0,30.31,0.0
|
49 |
+
48.0,SC-CoT,Qwen2-0.5B-Instruct,2025/1/22,17.25,3.79,0.0,30.7,0.0
|
50 |
+
49.0,ReAct-Pro*,Qwen2-0.5B-Instruct,2025/1/22,15.83,7.65,0.0,24.01,0.0
|
51 |
+
50.0,PoT,Qwen2-0.5B-Instruct,2025/1/22,13.47,9.62,0.0,17.32,0.0
|
src/overall_math_score.json
CHANGED
@@ -1,29 +1,59 @@
|
|
1 |
{
|
2 |
-
"time": "2025-01-09
|
3 |
"results": {
|
4 |
"IO": {
|
5 |
"META": {
|
6 |
"Algorithm": "IO",
|
7 |
"LLM": "gpt-3.5-turbo",
|
8 |
-
"Eval Date": "2025/
|
9 |
},
|
10 |
"gsm8k": {
|
11 |
"Score": 37.83,
|
12 |
"Cost($)": 0.3328
|
13 |
},
|
14 |
"AQuA": {
|
15 |
-
"Score": 38.
|
16 |
-
"Cost($)": 0.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
}
|
18 |
},
|
19 |
"CoT": {
|
20 |
"META": {
|
21 |
"Algorithm": "CoT",
|
22 |
"LLM": "gpt-3.5-turbo",
|
23 |
-
"Eval Date": "2025/
|
24 |
},
|
25 |
"gsm8k": {
|
26 |
-
"Score": 78.
|
27 |
"Cost($)": 0.6788
|
28 |
},
|
29 |
"AQuA": {
|
@@ -35,121 +65,691 @@
|
|
35 |
"META": {
|
36 |
"Algorithm": "SC-CoT",
|
37 |
"LLM": "gpt-3.5-turbo",
|
38 |
-
"Eval Date": "2025/
|
39 |
},
|
40 |
"gsm8k": {
|
41 |
-
"Score":
|
42 |
-
"Cost($)":
|
43 |
},
|
44 |
"AQuA": {
|
45 |
-
"Score":
|
46 |
-
"Cost($)": 0.
|
47 |
}
|
48 |
},
|
49 |
-
"
|
50 |
"META": {
|
51 |
-
"Algorithm": "
|
52 |
-
"LLM": "
|
53 |
-
"Eval Date": "2025/
|
54 |
},
|
55 |
"gsm8k": {
|
56 |
-
"Score":
|
57 |
-
"Cost($)": 0.
|
58 |
},
|
59 |
"AQuA": {
|
60 |
-
"Score":
|
61 |
-
"Cost($)": 0.
|
62 |
}
|
63 |
},
|
64 |
-
"ReAct-Pro
|
65 |
"META": {
|
66 |
"Algorithm": "ReAct-Pro*",
|
67 |
-
"LLM": "
|
68 |
-
"Eval Date": "2025/
|
69 |
},
|
70 |
"gsm8k": {
|
71 |
-
"Score":
|
72 |
-
"Cost($)":
|
73 |
},
|
74 |
"AQuA": {
|
75 |
-
"Score":
|
76 |
-
"Cost($)": 0.
|
77 |
}
|
78 |
},
|
79 |
-
"
|
80 |
"META": {
|
81 |
-
"Algorithm": "
|
82 |
"LLM": "Doubao-lite-32k",
|
83 |
-
"Eval Date": "2025/
|
84 |
},
|
85 |
"gsm8k": {
|
86 |
-
"Score":
|
87 |
-
"Cost($)": 0.
|
88 |
},
|
89 |
"AQuA": {
|
90 |
-
"Score":
|
91 |
-
"Cost($)": 0.
|
92 |
}
|
93 |
},
|
94 |
-
"CoT-Doubao": {
|
95 |
"META": {
|
96 |
"Algorithm": "CoT",
|
97 |
"LLM": "Doubao-lite-32k",
|
98 |
-
"Eval Date": "2025/
|
99 |
},
|
100 |
"gsm8k": {
|
101 |
"Score": 89.31,
|
102 |
-
"Cost($)": 0.
|
103 |
},
|
104 |
"AQuA": {
|
105 |
-
"Score": 82.
|
106 |
"Cost($)": 0.0066
|
107 |
}
|
108 |
},
|
109 |
-
"SC-CoT-Doubao": {
|
110 |
"META": {
|
111 |
"Algorithm": "SC-CoT",
|
112 |
"LLM": "Doubao-lite-32k",
|
113 |
-
"Eval Date": "2025/
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
114 |
},
|
115 |
"gsm8k": {
|
116 |
-
"Score": 88.
|
117 |
-
"Cost($)":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
118 |
},
|
119 |
"AQuA": {
|
120 |
"Score": 83.46,
|
121 |
-
"Cost($)": 0.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
122 |
}
|
123 |
},
|
124 |
-
"PoT-
|
125 |
"META": {
|
126 |
"Algorithm": "PoT",
|
127 |
-
"LLM": "
|
128 |
-
"Eval Date": "2025/
|
129 |
},
|
130 |
"gsm8k": {
|
131 |
-
"Score":
|
132 |
-
"Cost($)": 0.
|
133 |
},
|
134 |
"AQuA": {
|
135 |
-
"Score":
|
136 |
-
"Cost($)": 0.
|
137 |
}
|
138 |
},
|
139 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
140 |
"META": {
|
141 |
"Algorithm": "ReAct-Pro*",
|
142 |
-
"LLM": "
|
143 |
-
"Eval Date": "2025/
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
144 |
},
|
145 |
"gsm8k": {
|
146 |
-
"Score":
|
147 |
-
"Cost($)": 0.
|
148 |
},
|
149 |
"AQuA": {
|
150 |
-
"Score":
|
151 |
-
"Cost($)": 0.
|
152 |
}
|
153 |
}
|
154 |
}
|
155 |
-
}
|
|
|
1 |
{
|
2 |
+
"time": "2025-01-23 09:27:24",
|
3 |
"results": {
|
4 |
"IO": {
|
5 |
"META": {
|
6 |
"Algorithm": "IO",
|
7 |
"LLM": "gpt-3.5-turbo",
|
8 |
+
"Eval Date": "2025/1/7"
|
9 |
},
|
10 |
"gsm8k": {
|
11 |
"Score": 37.83,
|
12 |
"Cost($)": 0.3328
|
13 |
},
|
14 |
"AQuA": {
|
15 |
+
"Score": 38.97,
|
16 |
+
"Cost($)": 0.038
|
17 |
+
}
|
18 |
+
},
|
19 |
+
"ReAct-Pro*": {
|
20 |
+
"META": {
|
21 |
+
"Algorithm": "ReAct-Pro*",
|
22 |
+
"LLM": "gpt-3.5-turbo",
|
23 |
+
"Eval Date": "2025/1/7"
|
24 |
+
},
|
25 |
+
"gsm8k": {
|
26 |
+
"Score": 74.9,
|
27 |
+
"Cost($)": 3.4633
|
28 |
+
},
|
29 |
+
"AQuA": {
|
30 |
+
"Score": 64.56,
|
31 |
+
"Cost($)": 0.4928
|
32 |
+
}
|
33 |
+
},
|
34 |
+
"PoT": {
|
35 |
+
"META": {
|
36 |
+
"Algorithm": "PoT",
|
37 |
+
"LLM": "gpt-3.5-turbo",
|
38 |
+
"Eval Date": "2025/1/7"
|
39 |
+
},
|
40 |
+
"gsm8k": {
|
41 |
+
"Score": 76.87,
|
42 |
+
"Cost($)": 0.6902
|
43 |
+
},
|
44 |
+
"AQuA": {
|
45 |
+
"Score": 59.44,
|
46 |
+
"Cost($)": 0.1748
|
47 |
}
|
48 |
},
|
49 |
"CoT": {
|
50 |
"META": {
|
51 |
"Algorithm": "CoT",
|
52 |
"LLM": "gpt-3.5-turbo",
|
53 |
+
"Eval Date": "2025/1/7"
|
54 |
},
|
55 |
"gsm8k": {
|
56 |
+
"Score": 78.69,
|
57 |
"Cost($)": 0.6788
|
58 |
},
|
59 |
"AQuA": {
|
|
|
65 |
"META": {
|
66 |
"Algorithm": "SC-CoT",
|
67 |
"LLM": "gpt-3.5-turbo",
|
68 |
+
"Eval Date": "2025/1/7"
|
69 |
},
|
70 |
"gsm8k": {
|
71 |
+
"Score": 82.56,
|
72 |
+
"Cost($)": 2.6285
|
73 |
},
|
74 |
"AQuA": {
|
75 |
+
"Score": 70.47,
|
76 |
+
"Cost($)": 0.5578
|
77 |
}
|
78 |
},
|
79 |
+
"IO-Doubao-lite-32k": {
|
80 |
"META": {
|
81 |
+
"Algorithm": "IO",
|
82 |
+
"LLM": "Doubao-lite-32k",
|
83 |
+
"Eval Date": "2025/1/7"
|
84 |
},
|
85 |
"gsm8k": {
|
86 |
+
"Score": 72.02,
|
87 |
+
"Cost($)": 0.0354
|
88 |
},
|
89 |
"AQuA": {
|
90 |
+
"Score": 79.13,
|
91 |
+
"Cost($)": 0.0058
|
92 |
}
|
93 |
},
|
94 |
+
"ReAct-Pro*-Doubao-lite-32k": {
|
95 |
"META": {
|
96 |
"Algorithm": "ReAct-Pro*",
|
97 |
+
"LLM": "Doubao-lite-32k",
|
98 |
+
"Eval Date": "2025/1/7"
|
99 |
},
|
100 |
"gsm8k": {
|
101 |
+
"Score": 85.59,
|
102 |
+
"Cost($)": 0.2512
|
103 |
},
|
104 |
"AQuA": {
|
105 |
+
"Score": 77.55,
|
106 |
+
"Cost($)": 0.0445
|
107 |
}
|
108 |
},
|
109 |
+
"PoT-Doubao-lite-32k": {
|
110 |
"META": {
|
111 |
+
"Algorithm": "PoT",
|
112 |
"LLM": "Doubao-lite-32k",
|
113 |
+
"Eval Date": "2025/1/7"
|
114 |
},
|
115 |
"gsm8k": {
|
116 |
+
"Score": 79.6,
|
117 |
+
"Cost($)": 0.0576
|
118 |
},
|
119 |
"AQuA": {
|
120 |
+
"Score": 71.65,
|
121 |
+
"Cost($)": 0.0147
|
122 |
}
|
123 |
},
|
124 |
+
"CoT-Doubao-lite-32k": {
|
125 |
"META": {
|
126 |
"Algorithm": "CoT",
|
127 |
"LLM": "Doubao-lite-32k",
|
128 |
+
"Eval Date": "2025/1/7"
|
129 |
},
|
130 |
"gsm8k": {
|
131 |
"Score": 89.31,
|
132 |
+
"Cost($)": 0.0558
|
133 |
},
|
134 |
"AQuA": {
|
135 |
+
"Score": 82.67,
|
136 |
"Cost($)": 0.0066
|
137 |
}
|
138 |
},
|
139 |
+
"SC-CoT-Doubao-lite-32k": {
|
140 |
"META": {
|
141 |
"Algorithm": "SC-CoT",
|
142 |
"LLM": "Doubao-lite-32k",
|
143 |
+
"Eval Date": "2025/1/7"
|
144 |
+
},
|
145 |
+
"gsm8k": {
|
146 |
+
"Score": 83.7,
|
147 |
+
"Cost($)": 0.155
|
148 |
+
},
|
149 |
+
"AQuA": {
|
150 |
+
"Score": 81.5,
|
151 |
+
"Cost($)": 0.0347
|
152 |
+
}
|
153 |
+
},
|
154 |
+
"IO-gpt-4o": {
|
155 |
+
"META": {
|
156 |
+
"Algorithm": "IO",
|
157 |
+
"LLM": "gpt-4o",
|
158 |
+
"Eval Date": "2025/1/22"
|
159 |
},
|
160 |
"gsm8k": {
|
161 |
+
"Score": 88.4,
|
162 |
+
"Cost($)": 3.3463
|
163 |
+
},
|
164 |
+
"AQuA": {
|
165 |
+
"Score": 75.59,
|
166 |
+
"Cost($)": 1.1453
|
167 |
+
}
|
168 |
+
},
|
169 |
+
"ReAct-Pro*-gpt-4o": {
|
170 |
+
"META": {
|
171 |
+
"Algorithm": "ReAct-Pro*",
|
172 |
+
"LLM": "gpt-4o",
|
173 |
+
"Eval Date": "2025/1/22"
|
174 |
+
},
|
175 |
+
"gsm8k": {
|
176 |
+
"Score": 63.3,
|
177 |
+
"Cost($)": 39.0751
|
178 |
+
},
|
179 |
+
"AQuA": {
|
180 |
+
"Score": 57.48,
|
181 |
+
"Cost($)": 2.304
|
182 |
+
}
|
183 |
+
},
|
184 |
+
"PoT-gpt-4o": {
|
185 |
+
"META": {
|
186 |
+
"Algorithm": "PoT",
|
187 |
+
"LLM": "gpt-4o",
|
188 |
+
"Eval Date": "2025/1/22"
|
189 |
+
},
|
190 |
+
"gsm8k": {
|
191 |
+
"Score": 93.1,
|
192 |
+
"Cost($)": 4.2166
|
193 |
+
},
|
194 |
+
"AQuA": {
|
195 |
+
"Score": 75.19,
|
196 |
+
"Cost($)": 1.6087
|
197 |
+
}
|
198 |
+
},
|
199 |
+
"CoT-gpt-4o": {
|
200 |
+
"META": {
|
201 |
+
"Algorithm": "CoT",
|
202 |
+
"LLM": "gpt-4o",
|
203 |
+
"Eval Date": "2025/1/22"
|
204 |
+
},
|
205 |
+
"gsm8k": {
|
206 |
+
"Score": 94.08,
|
207 |
+
"Cost($)": 4.5367
|
208 |
+
},
|
209 |
+
"AQuA": {
|
210 |
+
"Score": 82.67,
|
211 |
+
"Cost($)": 1.0417
|
212 |
+
}
|
213 |
+
},
|
214 |
+
"SC-CoT-gpt-4o": {
|
215 |
+
"META": {
|
216 |
+
"Algorithm": "SC-CoT",
|
217 |
+
"LLM": "gpt-4o",
|
218 |
+
"Eval Date": "2025/1/22"
|
219 |
+
},
|
220 |
+
"gsm8k": {
|
221 |
+
"Score": 90.75,
|
222 |
+
"Cost($)": 24.2428
|
223 |
+
},
|
224 |
+
"AQuA": {
|
225 |
+
"Score": 88.19,
|
226 |
+
"Cost($)": 6.2412
|
227 |
+
}
|
228 |
+
},
|
229 |
+
"IO-Qwen2.5-72B-Instruct": {
|
230 |
+
"META": {
|
231 |
+
"Algorithm": "IO",
|
232 |
+
"LLM": "Qwen2.5-72B-Instruct",
|
233 |
+
"Eval Date": "2025/1/22"
|
234 |
+
},
|
235 |
+
"gsm8k": {
|
236 |
+
"Score": 86.58,
|
237 |
+
"Cost($)": 0.4899
|
238 |
+
},
|
239 |
+
"AQuA": {
|
240 |
+
"Score": 84.25,
|
241 |
+
"Cost($)": 0.0742
|
242 |
+
}
|
243 |
+
},
|
244 |
+
"ReAct-Pro*-Qwen2.5-72B-Instruct": {
|
245 |
+
"META": {
|
246 |
+
"Algorithm": "ReAct-Pro*",
|
247 |
+
"LLM": "Qwen2.5-72B-Instruct",
|
248 |
+
"Eval Date": "2025/1/22"
|
249 |
+
},
|
250 |
+
"gsm8k": {
|
251 |
+
"Score": 87.26,
|
252 |
+
"Cost($)": 10.5479
|
253 |
+
},
|
254 |
+
"AQuA": {
|
255 |
+
"Score": 73.22,
|
256 |
+
"Cost($)": 0.3177
|
257 |
+
}
|
258 |
+
},
|
259 |
+
"PoT-Qwen2.5-72B-Instruct": {
|
260 |
+
"META": {
|
261 |
+
"Algorithm": "PoT",
|
262 |
+
"LLM": "Qwen2.5-72B-Instruct",
|
263 |
+
"Eval Date": "2025/1/22"
|
264 |
+
},
|
265 |
+
"gsm8k": {
|
266 |
+
"Score": 92.34,
|
267 |
+
"Cost($)": 0.7054
|
268 |
+
},
|
269 |
+
"AQuA": {
|
270 |
+
"Score": 75.19,
|
271 |
+
"Cost($)": 0.1645
|
272 |
+
}
|
273 |
+
},
|
274 |
+
"CoT-Qwen2.5-72B-Instruct": {
|
275 |
+
"META": {
|
276 |
+
"Algorithm": "CoT",
|
277 |
+
"LLM": "Qwen2.5-72B-Instruct",
|
278 |
+
"Eval Date": "2025/1/22"
|
279 |
+
},
|
280 |
+
"gsm8k": {
|
281 |
+
"Score": 92.87,
|
282 |
+
"Cost($)": 0.7195
|
283 |
+
},
|
284 |
+
"AQuA": {
|
285 |
+
"Score": 86.22,
|
286 |
+
"Cost($)": 0.0808
|
287 |
+
}
|
288 |
+
},
|
289 |
+
"SC-CoT-Qwen2.5-72B-Instruct": {
|
290 |
+
"META": {
|
291 |
+
"Algorithm": "SC-CoT",
|
292 |
+
"LLM": "Qwen2.5-72B-Instruct",
|
293 |
+
"Eval Date": "2025/1/22"
|
294 |
+
},
|
295 |
+
"gsm8k": {
|
296 |
+
"Score": 90.67,
|
297 |
+
"Cost($)": 4.2651
|
298 |
+
},
|
299 |
+
"AQuA": {
|
300 |
+
"Score": 85.82,
|
301 |
+
"Cost($)": 0.5576
|
302 |
+
}
|
303 |
+
},
|
304 |
+
"IO-Llama-3.3-70B-Instruct": {
|
305 |
+
"META": {
|
306 |
+
"Algorithm": "IO",
|
307 |
+
"LLM": "Llama-3.3-70B-Instruct",
|
308 |
+
"Eval Date": "2025/1/22"
|
309 |
+
},
|
310 |
+
"gsm8k": {
|
311 |
+
"Score": 92.26,
|
312 |
+
"Cost($)": 0.4709
|
313 |
+
},
|
314 |
+
"AQuA": {
|
315 |
+
"Score": 82.67,
|
316 |
+
"Cost($)": 0.0798
|
317 |
+
}
|
318 |
+
},
|
319 |
+
"ReAct-Pro*-Llama-3.3-70B-Instruct": {
|
320 |
+
"META": {
|
321 |
+
"Algorithm": "ReAct-Pro*",
|
322 |
+
"LLM": "Llama-3.3-70B-Instruct",
|
323 |
+
"Eval Date": "2025/1/22"
|
324 |
+
},
|
325 |
+
"gsm8k": {
|
326 |
+
"Score": 87.64,
|
327 |
+
"Cost($)": 10.1124
|
328 |
+
},
|
329 |
+
"AQuA": {
|
330 |
+
"Score": 79.13,
|
331 |
+
"Cost($)": 0.768
|
332 |
+
}
|
333 |
+
},
|
334 |
+
"PoT-Llama-3.3-70B-Instruct": {
|
335 |
+
"META": {
|
336 |
+
"Algorithm": "PoT",
|
337 |
+
"LLM": "Llama-3.3-70B-Instruct",
|
338 |
+
"Eval Date": "2025/1/22"
|
339 |
+
},
|
340 |
+
"gsm8k": {
|
341 |
+
"Score": 73.08,
|
342 |
+
"Cost($)": 0.9736
|
343 |
+
},
|
344 |
+
"AQuA": {
|
345 |
+
"Score": 79.52,
|
346 |
+
"Cost($)": 0.1746
|
347 |
+
}
|
348 |
+
},
|
349 |
+
"CoT-Llama-3.3-70B-Instruct": {
|
350 |
+
"META": {
|
351 |
+
"Algorithm": "CoT",
|
352 |
+
"LLM": "Llama-3.3-70B-Instruct",
|
353 |
+
"Eval Date": "2025/1/22"
|
354 |
+
},
|
355 |
+
"gsm8k": {
|
356 |
+
"Score": 93.93,
|
357 |
+
"Cost($)": 0.687
|
358 |
},
|
359 |
"AQuA": {
|
360 |
"Score": 83.46,
|
361 |
+
"Cost($)": 0.0927
|
362 |
+
}
|
363 |
+
},
|
364 |
+
"SC-CoT-Llama-3.3-70B-Instruct": {
|
365 |
+
"META": {
|
366 |
+
"Algorithm": "SC-CoT",
|
367 |
+
"LLM": "Llama-3.3-70B-Instruct",
|
368 |
+
"Eval Date": "2025/1/22"
|
369 |
+
},
|
370 |
+
"gsm8k": {
|
371 |
+
"Score": 95.45,
|
372 |
+
"Cost($)": 4.5021
|
373 |
+
},
|
374 |
+
"AQuA": {
|
375 |
+
"Score": 86.61,
|
376 |
+
"Cost($)": 0.5847
|
377 |
+
}
|
378 |
+
},
|
379 |
+
"IO-Qwen2.5-7B-Instruct": {
|
380 |
+
"META": {
|
381 |
+
"Algorithm": "IO",
|
382 |
+
"LLM": "Qwen2.5-7B-Instruct",
|
383 |
+
"Eval Date": "2025/1/22"
|
384 |
+
},
|
385 |
+
"gsm8k": {
|
386 |
+
"Score": 57.24,
|
387 |
+
"Cost($)": 0.0
|
388 |
+
},
|
389 |
+
"AQuA": {
|
390 |
+
"Score": 78.74,
|
391 |
+
"Cost($)": 0.0
|
392 |
+
}
|
393 |
+
},
|
394 |
+
"ReAct-Pro*-Qwen2.5-7B-Instruct": {
|
395 |
+
"META": {
|
396 |
+
"Algorithm": "ReAct-Pro*",
|
397 |
+
"LLM": "Qwen2.5-7B-Instruct",
|
398 |
+
"Eval Date": "2025/1/22"
|
399 |
+
},
|
400 |
+
"gsm8k": {
|
401 |
+
"Score": 82.86,
|
402 |
+
"Cost($)": 0.0
|
403 |
+
},
|
404 |
+
"AQuA": {
|
405 |
+
"Score": 74.4,
|
406 |
+
"Cost($)": 0.0
|
407 |
}
|
408 |
},
|
409 |
+
"PoT-Qwen2.5-7B-Instruct": {
|
410 |
"META": {
|
411 |
"Algorithm": "PoT",
|
412 |
+
"LLM": "Qwen2.5-7B-Instruct",
|
413 |
+
"Eval Date": "2025/1/22"
|
414 |
},
|
415 |
"gsm8k": {
|
416 |
+
"Score": 58.83,
|
417 |
+
"Cost($)": 0.0
|
418 |
},
|
419 |
"AQuA": {
|
420 |
+
"Score": 68.11,
|
421 |
+
"Cost($)": 0.0
|
422 |
}
|
423 |
},
|
424 |
+
"CoT-Qwen2.5-7B-Instruct": {
|
425 |
+
"META": {
|
426 |
+
"Algorithm": "CoT",
|
427 |
+
"LLM": "Qwen2.5-7B-Instruct",
|
428 |
+
"Eval Date": "2025/1/22"
|
429 |
+
},
|
430 |
+
"gsm8k": {
|
431 |
+
"Score": 85.67,
|
432 |
+
"Cost($)": 0.0
|
433 |
+
},
|
434 |
+
"AQuA": {
|
435 |
+
"Score": 80.7,
|
436 |
+
"Cost($)": 0.0
|
437 |
+
}
|
438 |
+
},
|
439 |
+
"SC-CoT-Qwen2.5-7B-Instruct": {
|
440 |
+
"META": {
|
441 |
+
"Algorithm": "SC-CoT",
|
442 |
+
"LLM": "Qwen2.5-7B-Instruct",
|
443 |
+
"Eval Date": "2025/1/22"
|
444 |
+
},
|
445 |
+
"gsm8k": {
|
446 |
+
"Score": 88.32,
|
447 |
+
"Cost($)": 0.0
|
448 |
+
},
|
449 |
+
"AQuA": {
|
450 |
+
"Score": 81.49,
|
451 |
+
"Cost($)": 0.0
|
452 |
+
}
|
453 |
+
},
|
454 |
+
"IO-Llama-3.1-8B-Instruct": {
|
455 |
+
"META": {
|
456 |
+
"Algorithm": "IO",
|
457 |
+
"LLM": "Llama-3.1-8B-Instruct",
|
458 |
+
"Eval Date": "2025/1/22"
|
459 |
+
},
|
460 |
+
"gsm8k": {
|
461 |
+
"Score": 57.16,
|
462 |
+
"Cost($)": 0.0
|
463 |
+
},
|
464 |
+
"AQuA": {
|
465 |
+
"Score": 51.18,
|
466 |
+
"Cost($)": 0.0
|
467 |
+
}
|
468 |
+
},
|
469 |
+
"ReAct-Pro*-Llama-3.1-8B-Instruct": {
|
470 |
"META": {
|
471 |
"Algorithm": "ReAct-Pro*",
|
472 |
+
"LLM": "Llama-3.1-8B-Instruct",
|
473 |
+
"Eval Date": "2025/1/22"
|
474 |
+
},
|
475 |
+
"gsm8k": {
|
476 |
+
"Score": 67.77,
|
477 |
+
"Cost($)": 0.0
|
478 |
+
},
|
479 |
+
"AQuA": {
|
480 |
+
"Score": 55.51,
|
481 |
+
"Cost($)": 0.0
|
482 |
+
}
|
483 |
+
},
|
484 |
+
"PoT-Llama-3.1-8B-Instruct": {
|
485 |
+
"META": {
|
486 |
+
"Algorithm": "PoT",
|
487 |
+
"LLM": "Llama-3.1-8B-Instruct",
|
488 |
+
"Eval Date": "2025/1/22"
|
489 |
+
},
|
490 |
+
"gsm8k": {
|
491 |
+
"Score": 38.66,
|
492 |
+
"Cost($)": 0.0
|
493 |
+
},
|
494 |
+
"AQuA": {
|
495 |
+
"Score": 36.61,
|
496 |
+
"Cost($)": 0.0
|
497 |
+
}
|
498 |
+
},
|
499 |
+
"CoT-Llama-3.1-8B-Instruct": {
|
500 |
+
"META": {
|
501 |
+
"Algorithm": "CoT",
|
502 |
+
"LLM": "Llama-3.1-8B-Instruct",
|
503 |
+
"Eval Date": "2025/1/22"
|
504 |
+
},
|
505 |
+
"gsm8k": {
|
506 |
+
"Score": 75.43,
|
507 |
+
"Cost($)": 0.0
|
508 |
+
},
|
509 |
+
"AQuA": {
|
510 |
+
"Score": 60.62,
|
511 |
+
"Cost($)": 0.0
|
512 |
+
}
|
513 |
+
},
|
514 |
+
"SC-CoT-Llama-3.1-8B-Instruct": {
|
515 |
+
"META": {
|
516 |
+
"Algorithm": "SC-CoT",
|
517 |
+
"LLM": "Llama-3.1-8B-Instruct",
|
518 |
+
"Eval Date": "2025/1/22"
|
519 |
+
},
|
520 |
+
"gsm8k": {
|
521 |
+
"Score": 75.2,
|
522 |
+
"Cost($)": 0.0
|
523 |
+
},
|
524 |
+
"AQuA": {
|
525 |
+
"Score": 53.14,
|
526 |
+
"Cost($)": 0.0
|
527 |
+
}
|
528 |
+
},
|
529 |
+
"IO-Internllm2_5-7B": {
|
530 |
+
"META": {
|
531 |
+
"Algorithm": "IO",
|
532 |
+
"LLM": "Internllm2_5-7B",
|
533 |
+
"Eval Date": "2025/1/22"
|
534 |
+
},
|
535 |
+
"gsm8k": {
|
536 |
+
"Score": 11.59,
|
537 |
+
"Cost($)": 0.0
|
538 |
+
},
|
539 |
+
"AQuA": {
|
540 |
+
"Score": 47.63,
|
541 |
+
"Cost($)": 0.0
|
542 |
+
}
|
543 |
+
},
|
544 |
+
"ReAct-Pro*-Internllm2_5-7B": {
|
545 |
+
"META": {
|
546 |
+
"Algorithm": "ReAct-Pro*",
|
547 |
+
"LLM": "Internllm2_5-7B",
|
548 |
+
"Eval Date": "2025/1/22"
|
549 |
+
},
|
550 |
+
"gsm8k": {
|
551 |
+
"Score": 33.51,
|
552 |
+
"Cost($)": 0.0
|
553 |
+
},
|
554 |
+
"AQuA": {
|
555 |
+
"Score": 40.94,
|
556 |
+
"Cost($)": 0.0
|
557 |
+
}
|
558 |
+
},
|
559 |
+
"PoT-Internllm2_5-7B": {
|
560 |
+
"META": {
|
561 |
+
"Algorithm": "PoT",
|
562 |
+
"LLM": "Internllm2_5-7B",
|
563 |
+
"Eval Date": "2025/1/22"
|
564 |
+
},
|
565 |
+
"gsm8k": {
|
566 |
+
"Score": 38.21,
|
567 |
+
"Cost($)": 0.0
|
568 |
+
},
|
569 |
+
"AQuA": {
|
570 |
+
"Score": 36.61,
|
571 |
+
"Cost($)": 0.0
|
572 |
+
}
|
573 |
+
},
|
574 |
+
"CoT-Internllm2_5-7B": {
|
575 |
+
"META": {
|
576 |
+
"Algorithm": "CoT",
|
577 |
+
"LLM": "Internllm2_5-7B",
|
578 |
+
"Eval Date": "2025/1/22"
|
579 |
+
},
|
580 |
+
"gsm8k": {
|
581 |
+
"Score": 77.71,
|
582 |
+
"Cost($)": 0.0
|
583 |
+
},
|
584 |
+
"AQuA": {
|
585 |
+
"Score": 52.75,
|
586 |
+
"Cost($)": 0.0
|
587 |
+
}
|
588 |
+
},
|
589 |
+
"SC-CoT-Internllm2_5-7B": {
|
590 |
+
"META": {
|
591 |
+
"Algorithm": "SC-CoT",
|
592 |
+
"LLM": "Internllm2_5-7B",
|
593 |
+
"Eval Date": "2025/1/22"
|
594 |
+
},
|
595 |
+
"gsm8k": {
|
596 |
+
"Score": 41.39,
|
597 |
+
"Cost($)": 0.0
|
598 |
+
},
|
599 |
+
"AQuA": {
|
600 |
+
"Score": 35.85,
|
601 |
+
"Cost($)": 0.0
|
602 |
+
}
|
603 |
+
},
|
604 |
+
"IO-Qwen2-1.5B-Instruct": {
|
605 |
+
"META": {
|
606 |
+
"Algorithm": "IO",
|
607 |
+
"LLM": "Qwen2-1.5B-Instruct",
|
608 |
+
"Eval Date": "2025/1/22"
|
609 |
+
},
|
610 |
+
"gsm8k": {
|
611 |
+
"Score": 16.67,
|
612 |
+
"Cost($)": 0.0
|
613 |
+
},
|
614 |
+
"AQuA": {
|
615 |
+
"Score": 29.13,
|
616 |
+
"Cost($)": 0.0
|
617 |
+
}
|
618 |
+
},
|
619 |
+
"ReAct-Pro*-Qwen2-1.5B-Instruct": {
|
620 |
+
"META": {
|
621 |
+
"Algorithm": "ReAct-Pro*",
|
622 |
+
"LLM": "Qwen2-1.5B-Instruct",
|
623 |
+
"Eval Date": "2025/1/22"
|
624 |
+
},
|
625 |
+
"gsm8k": {
|
626 |
+
"Score": 24.86,
|
627 |
+
"Cost($)": 0.0
|
628 |
+
},
|
629 |
+
"AQuA": {
|
630 |
+
"Score": 25.59,
|
631 |
+
"Cost($)": 0.0
|
632 |
+
}
|
633 |
+
},
|
634 |
+
"PoT-Qwen2-1.5B-Instruct": {
|
635 |
+
"META": {
|
636 |
+
"Algorithm": "PoT",
|
637 |
+
"LLM": "Qwen2-1.5B-Instruct",
|
638 |
+
"Eval Date": "2025/1/22"
|
639 |
+
},
|
640 |
+
"gsm8k": {
|
641 |
+
"Score": 18.49,
|
642 |
+
"Cost($)": 0.0
|
643 |
+
},
|
644 |
+
"AQuA": {
|
645 |
+
"Score": 30.7,
|
646 |
+
"Cost($)": 0.0
|
647 |
+
}
|
648 |
+
},
|
649 |
+
"CoT-Qwen2-1.5B-Instruct": {
|
650 |
+
"META": {
|
651 |
+
"Algorithm": "CoT",
|
652 |
+
"LLM": "Qwen2-1.5B-Instruct",
|
653 |
+
"Eval Date": "2025/1/22"
|
654 |
+
},
|
655 |
+
"gsm8k": {
|
656 |
+
"Score": 55.49,
|
657 |
+
"Cost($)": 0.0
|
658 |
+
},
|
659 |
+
"AQuA": {
|
660 |
+
"Score": 40.55,
|
661 |
+
"Cost($)": 0.0
|
662 |
+
}
|
663 |
+
},
|
664 |
+
"SC-CoT-Qwen2-1.5B-Instruct": {
|
665 |
+
"META": {
|
666 |
+
"Algorithm": "SC-CoT",
|
667 |
+
"LLM": "Qwen2-1.5B-Instruct",
|
668 |
+
"Eval Date": "2025/1/22"
|
669 |
+
},
|
670 |
+
"gsm8k": {
|
671 |
+
"Score": 5.53,
|
672 |
+
"Cost($)": 0.0
|
673 |
+
},
|
674 |
+
"AQuA": {
|
675 |
+
"Score": 30.31,
|
676 |
+
"Cost($)": 0.0
|
677 |
+
}
|
678 |
+
},
|
679 |
+
"IO-Qwen2-0.5B-Instruct": {
|
680 |
+
"META": {
|
681 |
+
"Algorithm": "IO",
|
682 |
+
"LLM": "Qwen2-0.5B-Instruct",
|
683 |
+
"Eval Date": "2025/1/22"
|
684 |
+
},
|
685 |
+
"gsm8k": {
|
686 |
+
"Score": 14.7,
|
687 |
+
"Cost($)": 0.0
|
688 |
+
},
|
689 |
+
"AQuA": {
|
690 |
+
"Score": 27.16,
|
691 |
+
"Cost($)": 0.0
|
692 |
+
}
|
693 |
+
},
|
694 |
+
"ReAct-Pro*-Qwen2-0.5B-Instruct": {
|
695 |
+
"META": {
|
696 |
+
"Algorithm": "ReAct-Pro*",
|
697 |
+
"LLM": "Qwen2-0.5B-Instruct",
|
698 |
+
"Eval Date": "2025/1/22"
|
699 |
+
},
|
700 |
+
"gsm8k": {
|
701 |
+
"Score": 7.65,
|
702 |
+
"Cost($)": 0.0
|
703 |
+
},
|
704 |
+
"AQuA": {
|
705 |
+
"Score": 24.01,
|
706 |
+
"Cost($)": 0.0
|
707 |
+
}
|
708 |
+
},
|
709 |
+
"PoT-Qwen2-0.5B-Instruct": {
|
710 |
+
"META": {
|
711 |
+
"Algorithm": "PoT",
|
712 |
+
"LLM": "Qwen2-0.5B-Instruct",
|
713 |
+
"Eval Date": "2025/1/22"
|
714 |
+
},
|
715 |
+
"gsm8k": {
|
716 |
+
"Score": 9.62,
|
717 |
+
"Cost($)": 0.0
|
718 |
+
},
|
719 |
+
"AQuA": {
|
720 |
+
"Score": 17.32,
|
721 |
+
"Cost($)": 0.0
|
722 |
+
}
|
723 |
+
},
|
724 |
+
"CoT-Qwen2-0.5B-Instruct": {
|
725 |
+
"META": {
|
726 |
+
"Algorithm": "CoT",
|
727 |
+
"LLM": "Qwen2-0.5B-Instruct",
|
728 |
+
"Eval Date": "2025/1/22"
|
729 |
+
},
|
730 |
+
"gsm8k": {
|
731 |
+
"Score": 35.93,
|
732 |
+
"Cost($)": 0.0
|
733 |
+
},
|
734 |
+
"AQuA": {
|
735 |
+
"Score": 33.07,
|
736 |
+
"Cost($)": 0.0
|
737 |
+
}
|
738 |
+
},
|
739 |
+
"SC-CoT-Qwen2-0.5B-Instruct": {
|
740 |
+
"META": {
|
741 |
+
"Algorithm": "SC-CoT",
|
742 |
+
"LLM": "Qwen2-0.5B-Instruct",
|
743 |
+
"Eval Date": "2025/1/22"
|
744 |
},
|
745 |
"gsm8k": {
|
746 |
+
"Score": 3.79,
|
747 |
+
"Cost($)": 0.0
|
748 |
},
|
749 |
"AQuA": {
|
750 |
+
"Score": 30.7,
|
751 |
+
"Cost($)": 0.0
|
752 |
}
|
753 |
}
|
754 |
}
|
755 |
+
}
|
src/overall_results.csv
ADDED
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Rank,Algorithm,LLM,Eval Date,Avg Score,gsm8k-Score,gsm8k-Cost($),AQuA-Score,AQuA-Cost($)
|
2 |
+
1.0,SC-CoT,Llama-3.3-70B-Instruct,2025/1/22,91.03,95.45,4.5021,86.61,0.5847
|
3 |
+
2.0,CoT,Qwen2.5-72B-Instruct,2025/1/22,89.55,92.87,0.7195,86.22,0.0808
|
4 |
+
3.0,SC-CoT,gpt-4o,2025/1/22,89.47,90.75,24.2428,88.19,6.2412
|
5 |
+
4.0,CoT,Llama-3.3-70B-Instruct,2025/1/22,88.70,93.93,0.687,83.46,0.0927
|
6 |
+
5.0,CoT,gpt-4o,2025/1/22,88.38,94.08,4.5367,82.67,1.0417
|
7 |
+
6.0,SC-CoT,Qwen2.5-72B-Instruct,2025/1/22,88.25,90.67,4.2651,85.82,0.5576
|
8 |
+
7.0,IO,Llama-3.3-70B-Instruct,2025/1/22,87.47,92.26,0.4709,82.67,0.0798
|
9 |
+
8.0,CoT,Doubao-lite-32k,2025/1/7,85.99,89.31,0.0558,82.67,0.0066
|
10 |
+
9.0,IO,Qwen2.5-72B-Instruct,2025/1/22,85.42,86.58,0.4899,84.25,0.0742
|
11 |
+
10.0,SC-CoT,Qwen2.5-7B-Instruct,2025/1/22,84.91,88.32,0.0,81.49,0.0
|
12 |
+
11.0,PoT,gpt-4o,2025/1/22,84.15,93.1,4.2166,75.19,1.6087
|
13 |
+
12.0,PoT,Qwen2.5-72B-Instruct,2025/1/22,83.77,92.34,0.7054,75.19,0.1645
|
14 |
+
13.0,ReAct-Pro*,Llama-3.3-70B-Instruct,2025/1/22,83.39,87.64,10.1124,79.13,0.768
|
15 |
+
14.0,CoT,Qwen2.5-7B-Instruct,2025/1/22,83.19,85.67,0.0,80.7,0.0
|
16 |
+
15.0,SC-CoT,Doubao-lite-32k,2025/1/7,82.60,83.7,0.155,81.5,0.0347
|
17 |
+
16.0,IO,gpt-4o,2025/1/22,82.00,88.4,3.3463,75.59,1.1453
|
18 |
+
17.0,ReAct-Pro*,Doubao-lite-32k,2025/1/7,81.57,85.59,0.2512,77.55,0.0445
|
19 |
+
18.0,ReAct-Pro*,Qwen2.5-72B-Instruct,2025/1/22,80.24,87.26,10.5479,73.22,0.3177
|
20 |
+
19.0,ReAct-Pro*,Qwen2.5-7B-Instruct,2025/1/22,78.63,82.86,0.0,74.4,0.0
|
21 |
+
20.0,SC-CoT,gpt-3.5-turbo,2025/1/7,76.52,82.56,2.6285,70.47,0.5578
|
22 |
+
21.0,PoT,Llama-3.3-70B-Instruct,2025/1/22,76.30,73.08,0.9736,79.52,0.1746
|
23 |
+
22.0,PoT,Doubao-lite-32k,2025/1/7,75.63,79.6,0.0576,71.65,0.0147
|
24 |
+
23.0,IO,Doubao-lite-32k,2025/1/7,75.58,72.02,0.0354,79.13,0.0058
|
25 |
+
24.0,CoT,gpt-3.5-turbo,2025/1/7,69.86,78.69,0.6788,61.02,0.0957
|
26 |
+
25.0,ReAct-Pro*,gpt-3.5-turbo,2025/1/7,69.73,74.9,3.4633,64.56,0.4928
|
27 |
+
26.0,PoT,gpt-3.5-turbo,2025/1/7,68.16,76.87,0.6902,59.44,0.1748
|
28 |
+
27.0,CoT,Llama-3.1-8B-Instruct,2025/1/22,68.03,75.43,0.0,60.62,0.0
|
29 |
+
28.0,IO,Qwen2.5-7B-Instruct,2025/1/22,67.99,57.24,0.0,78.74,0.0
|
30 |
+
29.0,CoT,Internllm2_5-7B,2025/1/22,65.23,77.71,0.0,52.75,0.0
|
31 |
+
30.0,SC-CoT,Llama-3.1-8B-Instruct,2025/1/22,64.17,75.2,0.0,53.14,0.0
|
32 |
+
31.0,PoT,Qwen2.5-7B-Instruct,2025/1/22,63.47,58.83,0.0,68.11,0.0
|
33 |
+
32.0,ReAct-Pro*,Llama-3.1-8B-Instruct,2025/1/22,61.64,67.77,0.0,55.51,0.0
|
34 |
+
33.0,ReAct-Pro*,gpt-4o,2025/1/22,60.39,63.3,39.0751,57.48,2.304
|
35 |
+
34.0,IO,Llama-3.1-8B-Instruct,2025/1/22,54.17,57.16,0.0,51.18,0.0
|
36 |
+
35.0,CoT,Qwen2-1.5B-Instruct,2025/1/22,48.02,55.49,0.0,40.55,0.0
|
37 |
+
36.0,SC-CoT,Internllm2_5-7B,2025/1/22,38.62,41.39,0.0,35.85,0.0
|
38 |
+
37.0,IO,gpt-3.5-turbo,2025/1/7,38.40,37.83,0.3328,38.97,0.038
|
39 |
+
38.0,PoT,Llama-3.1-8B-Instruct,2025/1/22,37.64,38.66,0.0,36.61,0.0
|
40 |
+
39.0,PoT,Internllm2_5-7B,2025/1/22,37.41,38.21,0.0,36.61,0.0
|
41 |
+
40.0,ReAct-Pro*,Internllm2_5-7B,2025/1/22,37.23,33.51,0.0,40.94,0.0
|
42 |
+
41.0,CoT,Qwen2-0.5B-Instruct,2025/1/22,34.50,35.93,0.0,33.07,0.0
|
43 |
+
42.0,IO,Internllm2_5-7B,2025/1/22,29.61,11.59,0.0,47.63,0.0
|
44 |
+
43.0,ReAct-Pro*,Qwen2-1.5B-Instruct,2025/1/22,25.23,24.86,0.0,25.59,0.0
|
45 |
+
44.0,PoT,Qwen2-1.5B-Instruct,2025/1/22,24.60,18.49,0.0,30.7,0.0
|
46 |
+
45.0,IO,Qwen2-1.5B-Instruct,2025/1/22,22.90,16.67,0.0,29.13,0.0
|
47 |
+
46.0,IO,Qwen2-0.5B-Instruct,2025/1/22,20.93,14.7,0.0,27.16,0.0
|
48 |
+
47.0,SC-CoT,Qwen2-1.5B-Instruct,2025/1/22,17.92,5.53,0.0,30.31,0.0
|
49 |
+
48.0,SC-CoT,Qwen2-0.5B-Instruct,2025/1/22,17.25,3.79,0.0,30.7,0.0
|
50 |
+
49.0,ReAct-Pro*,Qwen2-0.5B-Instruct,2025/1/22,15.83,7.65,0.0,24.01,0.0
|
51 |
+
50.0,PoT,Qwen2-0.5B-Instruct,2025/1/22,13.47,9.62,0.0,17.32,0.0
|
src/record.csv
ADDED
@@ -0,0 +1,148 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Algorithm,dataset,llm,Score,Pass rate,X-shot,X-shot,Parameters,Nums,Total input tokens,Average input tokens,Total output tokens,Average output tokens,All tokens,Cost($),Eval Date,Note,,,,,,,,,,,,,,,,,,,
|
2 |
+
IO,gsm8k,gpt-3.5-turbo,37.83,99.92,8,few_shot,,1319,"546,990",415,"39,563",30,"586,553",0.3328,2025/1/7,,,,,,,,,,,,,,,,,,,,
|
3 |
+
IO,gsm8k,Doubao-lite-32k,72.02,99.92,8,few_shot,,1319,"617,377",468,"123,106",93,"740,483",0.0354,2025/1/7,0.2590 (元),,,,,,,,,,,,,,,,,,,
|
4 |
+
IO,gsm8k,gpt-4o,88.4,100,8,few_shot,,1319,"542,416",411,"199,030",151,"741,446",3.3463,2025/1/22,,,,,,,,,,,,,,,,,,,,
|
5 |
+
IO,gsm8k,Qwen2.5-72B-Instruct,86.58,100,8,few_shot,,1319,"555,340",421,"313,720",238,"869,060",0.4899,2025/1/22,,,,,,,,,,,,,,,,,,,,
|
6 |
+
IO,gsm8k,Llama-3.3-70B-Instruct,92.26,100,8,few_shot,,1319,"583,916",443,"251,359",191,"835,275",0.4709,2025/1/22,,,,,,,,,,,,,,,,,,,,
|
7 |
+
IO,gsm8k,Qwen2.5-7B-Instruct,57.24,100,8,few_shot,,1319,"596,229",452,"291,684",221,"887,913",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
|
8 |
+
IO,gsm8k,Llama-3.1-8B-Instruct,57.16,99.54,8,few_shot,,1319,"550,941",418,"1,194,488",906,"1,745,429",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
|
9 |
+
IO,gsm8k,Internllm2_5-7B,11.59,97.95,8,few_shot,,1319,"679,302",515,"434,426",329,"1,113,728",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
|
10 |
+
IO,gsm8k,Qwen2-1.5B-Instruct,16.67,100,8,few_shot,,1319,"568,530",431,"168,466",128,"736,996",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
|
11 |
+
IO,gsm8k,Qwen2-0.5B-Instruct,14.7,100,8,few_shot,,1319,"568,116",431,"266,781",202,"834,897",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
|
12 |
+
ReAct-Pro*,gsm8k,gpt-3.5-turbo,74.9,99.39,8,few_shot,max_steps=10,1319,"6,506,164","4,933","140,122",106,"6,646,286",3.4633,2025/1/7,"think-action 单独返回,prompt v1",,,,,,,,,,,,,,,,,,,
|
13 |
+
ReAct-Pro*,gsm8k,Doubao-lite-32k,85.59,99.62,8,few_shot,max_steps=10,1319,"5,862,016","4,444","136,623",104,"5,998,639",0.2512,2025/1/7,"think-action 单独返回,prompt v1",,,,,,,,,,,,,,,,,,,
|
14 |
+
ReAct-Pro*,gsm8k,gpt-4o,63.3,99.54,8,few_shot,max_steps=10,1319,"14,411,173","10,926","304,714",231,"14,715,887",39.0751,2025/1/22,"think-action 单独返回,prompt v1",,,,,,,,,,,,,,,,,,,
|
15 |
+
ReAct-Pro*,gsm8k,Qwen2.5-72B-Instruct,87.26,100,8,few_shot,max_steps=10,1319,"18,160,983","13,769","549,454",417,"18,710,437",10.5479,2025/1/22,,,,,,,,,,,,,,,,,,,,
|
16 |
+
ReAct-Pro*,gsm8k,Llama-3.3-70B-Instruct,87.64,99.92,,few_shot,max_steps=10,1319,"17,038,928","12,918","898,936",682,"17,937,864",10.1124,2025/1/22,,,,,,,,,,,,,,,,,,,,
|
17 |
+
ReAct-Pro*,gsm8k,Qwen2.5-7B-Instruct,82.86,100,8,few_shot,max_steps=10,1319,"14,355,752","10,884","495,162",375,"14,850,914",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
|
18 |
+
ReAct-Pro*,gsm8k,Llama-3.1-8B-Instruct,67.77,98.55,8,few_shot,max_steps=10,1319,"21,044,978","15,955","1,790,789","1,358","22,835,767",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
|
19 |
+
ReAct-Pro*,gsm8k,Internllm2_5-7B,33.51,97.95,,few_shot,max_steps=10,1319,"30,120,070","22,836","5,549,919","4,208","35,669,989",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
|
20 |
+
ReAct-Pro*,gsm8k,Qwen2-1.5B-Instruct,24.86,80.21,8,few_shot,max_steps=10,1319,"9,133,603","6,925","694,398",526,"9,828,001",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
|
21 |
+
ReAct-Pro*,gsm8k,Qwen2-0.5B-Instruct,7.65,95.22,8,few_shot,max_steps=10,1319,"52,431,343","39,751","2,961,268","2,245","55,392,611",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
|
22 |
+
PoT,gsm8k,gpt-3.5-turbo,76.87,99.24,8,few_shot,,1319,"1,090,418",827,"96,662",73,"1,187,080",0.6902,2025/1/7,,,,,,,,,,,,,,,,,,,,
|
23 |
+
PoT,gsm8k,Doubao-lite-32k,79.6,92.57,8,few_shot,,1319,"1,170,038",887,"118,017",89,"1,288,055",0.0576,2025/1/7,,,,,,,,,,,,,,,,,,,,
|
24 |
+
PoT,gsm8k,gpt-4o,93.1,99.77,8,few_shot,,1319,"1,101,672",835,"146,240",111,"1,247,912",4.2166,2025/1/22,,,,,,,,,,,,,,,,,,,,
|
25 |
+
PoT,gsm8k,Qwen2.5-72B-Instruct,92.34,99.39,8,few_shot,,1319,"1,106,682",839,"144,528",110,"1,251,210",0.7054,2025/1/22,,,,,,,,,,,,,,,,,,,,
|
26 |
+
PoT,gsm8k,Llama-3.3-70B-Instruct,73.08,79.6,8,few_shot,,1319,"1,126,025",854,"601,019",456,"1,727,044",0.9736,2025/1/22,,,,,,,,,,,,,,,,,,,,
|
27 |
+
PoT,gsm8k,Qwen2.5-7B-Instruct,58.83,70.5,8,few_shot,,1319,"1,145,390",868,"217,432",165,"1,362,822",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
|
28 |
+
PoT,gsm8k,Llama-3.1-8B-Instruct,38.66,55.42,8,few_shot,,1319,"1,147,538",870,"243,573",185,"1,391,111",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
|
29 |
+
PoT,gsm8k,Internllm2_5-7B,38.21,48.9,8,few_shot,,1319,"1,136,843",862,"188,106",143,"1,324,949",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
|
30 |
+
PoT,gsm8k,Qwen2-1.5B-Instruct,18.49,31,8,few_shot,,1319,"1,151,528",873,"175,994",133,"1,327,522",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
|
31 |
+
PoT,gsm8k,Qwen2-0.5B-Instruct,9.62,16.9,8,few_shot,,1319,"1,151,528",873,"237,607",180,"1,389,135",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
|
32 |
+
CoT,gsm8k,gpt-3.5-turbo,78.69,100,8,few_shot,,1319,"953,242",723,"134,799",102,"1,088,041",0.6788,2025/1/7,,,,,,,,,,,,,,,,,,,,
|
33 |
+
CoT,gsm8k,Doubao-lite-32k,89.31,100,8,few_shot,,1319,"1,042,095",790,"159,725",121,"1,201,820",0.0558,2025/1/7,0.4084635 (元),,,,,,,,,,,,,,,,,,,
|
34 |
+
CoT,gsm8k,gpt-4o,94.08,100,8,few_shot,,1319,"948,668",719,"216,498",164,"1,165,166",4.5367,2025/1/22,,,,,,,,,,,,,,,,,,,,
|
35 |
+
CoT,gsm8k,Qwen2.5-72B-Instruct,92.87,100,8,few_shot,,1319,"1,005,119",762,"271,133",206,"1,276,252",0.7195,2025/1/22,,,,,,,,,,,,,,,,,,,,
|
36 |
+
CoT,gsm8k,Llama-3.3-70B-Instruct,93.93,100,8,few_shot,,1319,"990,168",751,"228,497",173,"1,218,665",0.6870,2025/1/22,,,,,,,,,,,,,,,,,,,,
|
37 |
+
CoT,gsm8k,Qwen2.5-7B-Instruct,85.67,100,8,few_shot,,1319,"1,046,008",793,"244,797",186,"1,290,805",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
|
38 |
+
CoT,gsm8k,Llama-3.1-8B-Instruct,75.43,99.92,8,few_shot,,1319,"990,168",751,"258,161",196,"1,248,329",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
|
39 |
+
CoT,gsm8k,Internllm2_5-7B,77.71,99.69,8,few_shot,,1319,"968,163",734,"234,000",177,"1,202,163",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
|
40 |
+
CoT,gsm8k,Qwen2-1.5B-Instruct,55.49,100,8,few_shot,,1319,"1,032,818",783,"185,707",141,"1,218,525",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
|
41 |
+
CoT,gsm8k,Qwen2-0.5B-Instruct,35.93,99.92,8,few_shot,,1319,"1,032,818",783,"190,641",145,"1,223,459",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
|
42 |
+
SC-CoT,gsm8k,gpt-3.5-turbo,82.56,99.85,8,few_shot,"temperature=1, path_num=5",1319,"1,212,520",919,"1,348,177","1,022","2,560,697",2.6285,2025/1/7,,,,,,,,,,,,,,,,,,,,
|
43 |
+
SC-CoT,gsm8k,Doubao-lite-32k,83.7,99.70,8,few_shot,"temperature=1, path_num=5",1319,"1,230,019",933,"1,277,668",969,"2,507,687",0.1550,2025/1/7,,,,,,,,,,,,,,,,,,,,
|
44 |
+
SC-CoT,gsm8k,gpt-4o,90.75,100,8,few_shot,"temperature=1, path_num=5",1319,"1,168,927",886,"2,132,044","1,616","3,300,971",24.2428,2025/1/22,,,,,,,,,,,,,,,,,,,,
|
45 |
+
SC-CoT,gsm8k,Qwen2.5-72B-Instruct,90.67,100,8,few_shot,"temperature=1, path_num=5",1319,"5,292,383","4,012","2,273,254","1,723","7,565,637",4.2651,2025/1/22,,,,,,,,,,,,,,,,,,,,
|
46 |
+
SC-CoT,gsm8k,Llama-3.3-70B-Instruct,95.45,100,8,few_shot,"temperature=1, path_num=5",1319,"5,406,763","4,099","2,579,233","1,955","7,985,996",4.5021,2025/1/22,,,,,,,,,,,,,,,,,,,,
|
47 |
+
SC-CoT,gsm8k,Qwen2.5-7B-Instruct,88.32,99.84,8,few_shot,"temperature=1, path_num=5",1319,"5,668,252","4,297","2,505,566","1,900","8,173,818",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
|
48 |
+
SC-CoT,gsm8k,Llama-3.1-8B-Instruct,75.2,99.54,8,few_shot,"temperature=1, path_num=5",1319,"5,334,657","4,044","3,109,546","2,358","8,444,203",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
|
49 |
+
SC-CoT,gsm8k,Internllm2_5-7B,41.39,98.25,8,few_shot,"temperature=1, path_num=5",1319,"6,674,518","5,060","3,350,339","2,540","10,024,857",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
|
50 |
+
SC-CoT,gsm8k,Qwen2-1.5B-Instruct,5.53,86.73,8,few_shot,"temperature=1, path_num=5",1319,"5,844,218","4,431","3,117,550","2,364","8,961,768",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
|
51 |
+
SC-CoT,gsm8k,Qwen2-0.5B-Instruct,3.79,94.84,8,few_shot,"temperature=1, path_num=5",1319,"6,529,832","4,951","4,003,983","3,036","10,533,815",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
|
52 |
+
IO,AQuA,gpt-3.5-turbo,38.97,100,0,zero_shot,,254,"25,701",101,"16,770",66,"42,471",0.0380,2025/1/7,,,,,,,,,,,,,,,,,,,,
|
53 |
+
IO,AQuA,Doubao-lite-32k,79.13,100,0,zero_shot,,254,"33,058",130,"54,684",215,"87,742",0.0058,2025/1/7,0.0427(元),,,,,,,,,,,,,,,,,,,
|
54 |
+
IO,AQuA,gpt-4o,75.59,97.24,0,zero_shot,,254,"25,631",101,"108,121",426,"133,752",1.1453,2025/1/22,,,,,,,,,,,,,,,,,,,,
|
55 |
+
IO,AQuA,Qwen2.5-72B-Instruct,84.25,99.6,0,zero_shot,,254,"25,397",100,"106,207",418,"131,604",0.0742,2025/1/22,,,,,,,,,,,,,,,,,,,,
|
56 |
+
IO,AQuA,Llama-3.3-70B-Instruct,82.67,99.21,0,zero_shot,,254,"32,809",129,"108,758",428,"141,567",0.0798,2025/1/22,,,,,,,,,,,,,,,,,,,,
|
57 |
+
IO,AQuA,Qwen2.5-7B-Instruct,78.74,98.42,0,zero_shot,,254,"33,271",131,"104,500",411,"137,771",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
|
58 |
+
IO,AQuA,Llama-3.1-8B-Instruct,51.18,98.81,0,zero_shot,,254,"26,459",104,"106,647",420,"133,106",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
|
59 |
+
IO,AQuA,Internllm2_5-7B,47.63,90.94,0,zero_shot,,254,"50,232",198,"134,809",531,"185,041",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
|
60 |
+
IO,AQuA,Qwen2-1.5B-Instruct,29.13,97.63,0,zero_shot,,254,"27,937",110,"43,110",170,"71,047",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
|
61 |
+
IO,AQuA,Qwen2-0.5B-Instruct,27.16,98.81,0,zero_shot,,254,"27,937",110,"82,478",325,"110,415",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
|
62 |
+
CoT,AQuA,gpt-3.5-turbo,61.02,93.7,0,zero_shot,,254,"25,447",100,"55,346",218,"80,793",0.0957,2025/1/22,,,,,,,,,,,,,,,,,,,,
|
63 |
+
CoT,AQuA,Doubao-lite-32k,82.67,97.24,0,zero_shot,,254,"27,978",110,"66,599",262,"94,577",0.0066,2025/1/7,0.0483 (元),,,,,,,,,,,,,,,,,,,
|
64 |
+
CoT,AQuA,gpt-4o,82.67,98.03,0,zero_shot,,254,"25,123",99,"97,894",385,"123,017",1.0417,2025/1/22,,,,,,,,,,,,,,,,,,,,
|
65 |
+
CoT,AQuA,Qwen2.5-72B-Instruct,86.22,99.21,0,zero_shot,,254,"25,143",99,"118,146",465,"143,289",0.0808,2025/1/22,,,,,,,,,,,,,,,,,,,,
|
66 |
+
CoT,AQuA,Llama-3.3-70B-Instruct,83.46,98.42,0,zero_shot,,254,"32,555",128,"131,834",519,"164,389",0.0927,2025/1/22,,,,,,,,,,,,,,,,,,,,
|
67 |
+
CoT,AQuA,Qwen2.5-7B-Instruct,80.7,99.6,0,zero_shot,,254,"33,017",130,"116,719",460,"149,736",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
|
68 |
+
CoT,AQuA,Llama-3.1-8B-Instruct,60.62,100,0,zero_shot,,254,"32,555",128,"111,880",440,"144,435",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
|
69 |
+
CoT,AQuA,Internllm2_5-7B,52.75,89.37,0,zero_shot,,254,"26,610",105,"100,910",397,"127,520",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
|
70 |
+
CoT,AQuA,Qwen2-1.5B-Instruct,40.55,98.81,0,zero_shot,,254,"30,477",120,"79,563",313,"110,040",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
|
71 |
+
CoT,AQuA,Qwen2-0.5B-Instruct,33.07,98.81,0,zero_shot,,254,"30,477",120,"86,862",342,"117,339",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
|
72 |
+
PoT,AQuA,gpt-3.5-turbo,59.44,100,0,zero_shot,,254,"225,162",886,"41,492",163,"266,654",0.1748,2025/1/7,,,,,,,,,,,,,,,,,,,,
|
73 |
+
PoT,AQuA,gpt-4o,75.19,100,0,zero_shot,,254,"222,717",877,"105,191",414,"327,908",1.6087,2025/1/22,,,,,,,,,,,,,,,,,,,,
|
74 |
+
PoT,AQuA,Doubao-lite-32k,71.65,96.85,0,zero_shot,,254,"259,863","1,023","49,573",195,"309,436",0.0147,2025/1/7,,,,,,,,,,,,,,,,,,,,
|
75 |
+
PoT,AQuA,Qwen2.5-72B-Instruct,75.19,100,0,zero_shot,,254,"249,215",981,"42,549",168,"291,764",0.1645,2025/1/22,,,,,,,,,,,,,,,,,,,,
|
76 |
+
PoT,AQuA,Llama-3.3-70B-Instruct,79.52,99.21,0,zero_shot,,254,"240,735",948,"69,064",272,"309,799",0.1746,2025/1/22,,,,,,,,,,,,,,,,,,,,
|
77 |
+
PoT,AQuA,Qwen2.5-7B-Instruct,68.11,100,0,zero_shot,,254,"264,517","1,041","49,211",194,"313,728",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
|
78 |
+
PoT,AQuA,Llama-3.1-8B-Instruct,36.61,96.85,0,zero_shot,,254,"240,613",947,"50,301",198,"290,914",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
|
79 |
+
PoT,AQuA,Internllm2_5-7B,36.61,98.81,0,zero_shot,,254,"233,505",919,"68,457",270,"301,962",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
|
80 |
+
PoT,AQuA,Qwen2-1.5B-Instruct,30.7,96.45,0,zero_shot,,254,"246,560",971,"51,915",204,"298,475",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
|
81 |
+
PoT,AQuA,Qwen2-0.5B-Instruct,17.32,92.12,0,zero_shot,,254,"258,867","1,019","63,414",250,"322,281",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
|
82 |
+
SC-CoT,AQuA,gpt-3.5-turbo,70.47,98.82,0,zero_shot,"temperature=1, path_num=5",254,"70,157",276,"348,460","1,372","418,617",0.5578,2025/1/7,,,,,,,,,,,,,,,,,,,,
|
83 |
+
SC-CoT,AQuA,Doubao-lite-32k,81.5,97.64,0,zero_shot,"temperature=1, path_num=5",254,"83,830",330,"382,016","1,504","465,846",0.0347,2025/1/7,,,,,,,,,,,,,,,,,,,,
|
84 |
+
SC-CoT,AQuA,gpt-4o,88.19,100,0,zero_shot,"temperature=1, path_num=5",254,"72,916",287,"605,895","2,385","678,811",6.2412,2025/1/22,,,,,,,,,,,,,,,,,,,,
|
85 |
+
SC-CoT,AQuA,Qwen2.5-72B-Instruct,85.82,98.42,0,zero_shot,"temperature=1, path_num=5",254,"241,149",949,"747,909","2,945","989,058",0.5576,2025/1/22,,,,,,,,,,,,,,,,,,,,
|
86 |
+
SC-CoT,AQuA,Llama-3.3-70B-Instruct,86.61,99.21,0,zero_shot,"temperature=1, path_num=5",254,"283,248","1,115","753,876","2,968","1,037,124",0.5847,2025/1/22,,,,,,,,,,,,,,,,,,,,
|
87 |
+
SC-CoT,AQuA,Qwen2.5-7B-Instruct,81.49,100,0,zero_shot,"temperature=1, path_num=5",254,"278,848","1,098","736,520","2,900","1,015,368",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
|
88 |
+
SC-CoT,AQuA,Llama-3.1-8B-Instruct,53.14,96.06,0,zero_shot,"temperature=1, path_num=5",254,"372,968","1,468","668,378","2,631","1,041,346",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
|
89 |
+
SC-CoT,AQuA,Internllm2_5-7B,35.85,98.8,0,zero_shot,"temperature=1, path_num=5",254,"530,701","2,089","709,687","2,794","1,240,388",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
|
90 |
+
SC-CoT,AQuA,Qwen2-1.5B-Instruct,30.31,97.24,0,zero_shot,"temperature=1, path_num=5",254,"430,703","1,696","726,373","2,860","1,157,076",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
|
91 |
+
SC-CoT,AQuA,Qwen2-0.5B-Instruct,30.7,98.42,0,zero_shot,"temperature=1, path_num=5",254,"496,206","1,954","729,333","2,871","1,225,539",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
|
92 |
+
ReAct-Pro*,AQuA,gpt-3.5-turbo,64.56,98.03,0,zero_shot,max_steps=10,254,"862,614","3,396","40,973",161,"903,587",0.4928,2025/1/7,"think-action 单独返回,prompt v1",,,,,,,,,,,,,,,,,,,
|
93 |
+
ReAct-Pro*,AQuA,Doubao-lite-32k,77.55,96.06,0,zero_shot,max_steps=10,254,"977,890","3,850","54,951",216,"1,032,841",0.0445,2025/1/7,"think-action 单独返回,prompt v1",,,,,,,,,,,,,,,,,,,
|
94 |
+
ReAct-Pro*,AQuA,gpt-4o,57.48,97.24,0,zero_shot,max_steps=10,254,"615,589","2,424","76,507",301,"692,096",2.3040,2025/1/22,"think-action 单独返回,prompt v1",,,,,,,,,,,,,,,,,,,
|
95 |
+
ReAct-Pro*,AQuA,Qwen2.5-72B-Instruct,73.22,100,0,zero_shot,max_steps=10,254,"441,765","1,739","121,838",480,"563,603",0.3177,2025/1/22,,,,,,,,,,,,,,,,,,,,
|
96 |
+
ReAct-Pro*,AQuA,Llama-3.3-70B-Instruct,79.13,99.6,0,zero_shot,max_steps=10,254,"1,119,143","4,406","243,236",958,"1,362,379",0.7680,2025/1/22,,,,,,,,,,,,,,,,,,,,
|
97 |
+
ReAct-Pro*,AQuA,Qwen2.5-7B-Instruct,74.4,99.21,0,zero_shot,max_steps=10,254,"564,165","2,221","131,679",518,"695,844",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
|
98 |
+
ReAct-Pro*,AQuA,Llama-3.1-8B-Instruct,55.51,96.85,0,zero_shot,max_steps=10,254,"3,764,723","14,822","576,098","2,268","4,340,821",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
|
99 |
+
ReAct-Pro*,AQuA,Internllm2_5-7B,40.94,96.85,0,zero_shot,max_steps=10,254,"3,592,039","14,142","836,762","3,294","4,428,801",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
|
100 |
+
ReAct-Pro*,AQuA,Qwen2-1.5B-Instruct,25.59,96.06,0,zero_shot,max_steps=10,254,"4,555,858","17,936","516,146","2,032","5,072,004",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
|
101 |
+
ReAct-Pro*,AQuA,Qwen2-0.5B-Instruct,24.01,96.85,0,zero_shot,max_steps=10,254,6344167,"24,977",825920,"3,252","7,170,087",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
|
102 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
103 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
104 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
105 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
106 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
107 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
108 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
109 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
110 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
111 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
112 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
113 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
114 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
115 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
116 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
117 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
118 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
119 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
120 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
121 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
122 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
123 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
124 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
125 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
126 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
127 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
128 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
129 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
130 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
131 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
132 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
133 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
134 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
135 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
136 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
137 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
138 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
139 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
140 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
141 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
142 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
143 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
144 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
145 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
146 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
147 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|
148 |
+
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
|