qq-hzlh committed on
Commit
c9a97c2
·
1 Parent(s): efbd6cf

add more llms

Browse files
app.py CHANGED
@@ -1,5 +1,6 @@
1
  import abc
2
  import gradio as gr
 
3
 
4
  from gen_table import *
5
  from meta_data import *
@@ -17,13 +18,25 @@ with gr.Blocks(title="Open Agent Leaderboard") as demo:
17
  DATASETS.remove('META')
18
  print(DATASETS)
19
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
  gr.Markdown(LEADERBORAD_INTRODUCTION.format(EVAL_TIME))
22
  with gr.Tabs(elem_classes='tab-buttons') as tabs:
23
  with gr.Tab(label='🏅 Open Agent Overall Math Leaderboard'):
24
  gr.Markdown(LEADERBOARD_MD['MATH_MAIN'])
25
- check_box = BUILD_L1_DF(results, DEFAULT_MATH_BENCH)
26
- overall_table = generate_table(results, DEFAULT_MATH_BENCH)
 
27
 
28
  type_map = check_box['type_map']
29
  type_map['Rank'] = 'number'
@@ -35,6 +48,21 @@ with gr.Blocks(title="Open Agent Leaderboard") as demo:
35
  interactive=True,
36
  )
37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  initial_headers = ['Rank'] + check_box['essential'] + checkbox_group.value
39
  available_headers = [h for h in initial_headers if h in overall_table.columns]
40
 
@@ -46,33 +74,55 @@ with gr.Blocks(title="Open Agent Leaderboard") as demo:
46
  wrap=True,
47
  visible=True)
48
 
49
- def filter_df(fields, *args):
50
  headers = ['Rank'] + check_box['essential'] + fields
51
- # df = overall_table.copy()
 
 
 
 
 
 
 
 
 
52
 
53
  # Ensure all requested columns exist
54
- available_headers = [h for h in headers if h in overall_table.columns]
55
 
56
- original_columns = overall_table.columns.tolist()
57
  available_headers = sorted(available_headers, key=lambda x: original_columns.index(x))
58
 
59
-
60
  # If no columns are available, return an empty DataFrame with basic columns
61
  if not available_headers:
62
  available_headers = ['Rank'] + check_box['essential']
63
 
64
  comp = gr.components.DataFrame(
65
- value=overall_table[available_headers],
66
  type='pandas',
67
  datatype=[type_map[x] for x in available_headers],
68
  interactive=False,
69
  wrap=True,
70
  visible=True)
 
71
  return comp
72
 
 
73
  checkbox_group.change(
74
  fn=filter_df,
75
- inputs=[checkbox_group],
 
 
 
 
 
 
 
 
 
 
 
 
76
  outputs=data_component
77
  )
78
 
@@ -84,7 +134,12 @@ with gr.Blocks(title="Open Agent Leaderboard") as demo:
84
  results_detail = struct_detail['results']
85
 
86
  table, check_box = BUILD_L2_DF(results_detail, DEFAULT_MATH_BENCH)
87
- # table = generate_table_detail(results_detail, DEFAULT_MATH_BENCH)
 
 
 
 
 
88
  type_map = check_box['type_map']
89
  type_map['Rank'] = 'number'
90
 
@@ -112,12 +167,12 @@ with gr.Blocks(title="Open Agent Leaderboard") as demo:
112
  interactive=True
113
  )
114
 
115
- llm_name = gr.CheckboxGroup(
116
- choices=LLM,
117
- value=LLM,
118
- label='LLM',
119
- interactive=True
120
- )
121
 
122
  data_component = gr.components.DataFrame(
123
  value=table[headers],
@@ -159,6 +214,7 @@ with gr.Blocks(title="Open Agent Leaderboard") as demo:
159
  interactive=False,
160
  wrap=True,
161
  visible=True)
 
162
  return comp
163
 
164
  # 为所有复选框组添加change事件
@@ -196,6 +252,7 @@ with gr.Blocks(title="Open Agent Leaderboard") as demo:
196
  show_copy_button=True,
197
  )
198
 
 
199
 
200
  if __name__ == '__main__':
201
  demo.launch(server_name='0.0.0.0')
 
1
  import abc
2
  import gradio as gr
3
+ import os
4
 
5
  from gen_table import *
6
  from meta_data import *
 
18
  DATASETS.remove('META')
19
  print(DATASETS)
20
 
21
+ # Make sure overall_table is generated before llm_options is defined
22
+ check_box = BUILD_L1_DF(results, DEFAULT_MATH_BENCH)
23
+ overall_table = generate_table(results, DEFAULT_MATH_BENCH)
24
+
25
+ # Save the complete overall_table as a CSV file
26
+ csv_path_overall = os.path.join(os.getcwd(), 'src/overall_results.csv')
27
+ overall_table.to_csv(csv_path_overall, index=False)
28
+ print(f"Overall results saved to {csv_path_overall}")
29
+
30
+ # Extract every available LLM option from overall_table
31
+ llm_options = list(set(row.LLM for row in overall_table.itertuples() if hasattr(row, 'LLM')))
32
 
33
  gr.Markdown(LEADERBORAD_INTRODUCTION.format(EVAL_TIME))
34
  with gr.Tabs(elem_classes='tab-buttons') as tabs:
35
  with gr.Tab(label='🏅 Open Agent Overall Math Leaderboard'):
36
  gr.Markdown(LEADERBOARD_MD['MATH_MAIN'])
37
+ # check_box and overall_table are now defined above (before llm_options); the old definitions are kept commented out below
38
+ # check_box = BUILD_L1_DF(results, DEFAULT_MATH_BENCH)
39
+ # overall_table = generate_table(results, DEFAULT_MATH_BENCH)
40
 
41
  type_map = check_box['type_map']
42
  type_map['Rank'] = 'number'
 
48
  interactive=True,
49
  )
50
 
51
+ # New CheckboxGroup components for selecting Algorithm and LLM
52
+ algo_name = gr.CheckboxGroup(
53
+ choices=ALGORITHMS,
54
+ value=ALGORITHMS,
55
+ label='Algorithm',
56
+ interactive=True
57
+ )
58
+
59
+ llm_name = gr.CheckboxGroup(
60
+ choices=llm_options, # 使用提取的llm_options
61
+ value=llm_options,
62
+ label='LLM',
63
+ interactive=True
64
+ )
65
+
66
  initial_headers = ['Rank'] + check_box['essential'] + checkbox_group.value
67
  available_headers = [h for h in initial_headers if h in overall_table.columns]
68
 
 
74
  wrap=True,
75
  visible=True)
76
 
77
def filter_df(fields, algos, llms, *args):
    """Rebuild the overall leaderboard table for the current UI selections.

    Args:
        fields: Optional column names selected in the column checkbox group.
        algos: Algorithm names whose rows should be kept.
        llms: LLM names whose rows should be kept.
        *args: Ignored; accepted so extra event inputs do not break the callback.

    Returns:
        A ``gr.components.DataFrame`` holding the rows of ``overall_table``
        whose 'Algorithm' is in ``algos`` and whose 'LLM' is in ``llms``,
        restricted to 'Rank', the essential columns and ``fields``, in the
        table's own column order.

    NOTE(review): ``check_box`` and ``type_map`` are read from the enclosing
    scope when the callback runs. Both names appear to be reassigned further
    down in this file for the detail tab -- confirm this callback sees the
    intended objects.
    """
    headers = ['Rank'] + check_box['essential'] + fields

    # Vectorised row filter. This replaces a row-wise apply() that wrote a
    # temporary 'flag' column and could fail on an empty table.
    keep = overall_table['Algorithm'].isin(algos) & overall_table['LLM'].isin(llms)
    df = overall_table[keep].copy()

    # Keep only requested columns that exist, ordered as in the table itself.
    original_columns = df.columns.tolist()
    available_headers = [h for h in headers if h in original_columns]
    available_headers = sorted(available_headers, key=original_columns.index)

    if not available_headers:
        # None of the requested columns exist, so plain indexing would raise
        # KeyError; reindex() yields the basic columns (empty) instead.
        available_headers = ['Rank'] + check_box['essential']
        df = df.reindex(columns=available_headers)

    comp = gr.components.DataFrame(
        value=df[available_headers],
        type='pandas',
        datatype=[type_map[x] for x in available_headers],
        interactive=False,
        wrap=True,
        visible=True)

    return comp
109
 
110
+ # Update the change events so they include the new filter inputs
111
  checkbox_group.change(
112
  fn=filter_df,
113
+ inputs=[checkbox_group, algo_name, llm_name],
114
+ outputs=data_component
115
+ )
116
+
117
+ algo_name.change(
118
+ fn=filter_df,
119
+ inputs=[checkbox_group, algo_name, llm_name],
120
+ outputs=data_component
121
+ )
122
+
123
+ llm_name.change(
124
+ fn=filter_df,
125
+ inputs=[checkbox_group, algo_name, llm_name],
126
  outputs=data_component
127
  )
128
 
 
134
  results_detail = struct_detail['results']
135
 
136
  table, check_box = BUILD_L2_DF(results_detail, DEFAULT_MATH_BENCH)
137
+
138
+ # Save the complete detail table as a CSV file
139
+ csv_path_detail = os.path.join(os.getcwd(), 'src/detail_results.csv')
140
+ table.to_csv(csv_path_detail, index=False)
141
+ print(f"Detail results saved to {csv_path_detail}")
142
+
143
  type_map = check_box['type_map']
144
  type_map['Rank'] = 'number'
145
 
 
167
  interactive=True
168
  )
169
 
170
+ llm_name = gr.CheckboxGroup(
171
+ choices=check_box['LLM_options'],
172
+ value=check_box['LLM_options'],
173
+ label='LLM',
174
+ interactive=True
175
+ )
176
 
177
  data_component = gr.components.DataFrame(
178
  value=table[headers],
 
214
  interactive=False,
215
  wrap=True,
216
  visible=True)
217
+
218
  return comp
219
 
220
  # Attach a change event to every checkbox group
 
252
  show_copy_button=True,
253
  )
254
 
255
+
256
 
257
  if __name__ == '__main__':
258
  demo.launch(server_name='0.0.0.0')
gen_table.py CHANGED
@@ -97,6 +97,15 @@ def BUILD_L2_DF(results, fields):
97
  # Create DataFrame
98
  df = pd.DataFrame(res)
99
 
 
 
 
 
 
 
 
 
 
100
  # Sort by Dataset and Score in descending order
101
  df = df.sort_values(['Dataset', 'Score'], ascending=[True, False])
102
 
@@ -109,10 +118,10 @@ def BUILD_L2_DF(results, fields):
109
  df = df[columns + remaining_columns]
110
 
111
  # Set checkbox configuration
112
- check_box = {}
113
- check_box['essential'] = ['Algorithm', 'Dataset', 'LLM', 'Eval Date']
114
- check_box['required'] = check_box['essential'] + ['Score', 'Pass rate', 'X-shot', 'Samples', 'All tokens', 'Cost($)']
115
  check_box['all'] = ['Score', 'Pass rate', 'X-shot', 'Samples', 'Total input tokens', 'Average input tokens', 'Total output tokens', 'Average output tokens', 'All tokens', 'Cost($)']
 
116
  type_map = defaultdict(lambda: 'number')
117
  type_map['Algorithm'] = 'html'
118
  type_map['LLM'] = type_map['Vision Model'] = 'html'
@@ -122,7 +131,6 @@ def BUILD_L2_DF(results, fields):
122
  type_map['Cost($)'] = 'number'
123
  check_box['type_map'] = type_map
124
 
125
-
126
  return df, check_box
127
 
128
 
 
97
  # Create DataFrame
98
  df = pd.DataFrame(res)
99
 
100
+ # Collect all unique Algorithm and LLM values
101
+ unique_algorithms = df['Algorithm'].unique().tolist()
102
+ unique_llms = df['LLM'].unique().tolist()
103
+
104
+ # Set checkbox configuration
105
+ check_box = {}
106
+ check_box['Algorithm_options'] = unique_algorithms # 添加 Algorithm 可选项
107
+ check_box['LLM_options'] = unique_llms # 添加 LLM 可选项
108
+
109
  # Sort by Dataset and Score in descending order
110
  df = df.sort_values(['Dataset', 'Score'], ascending=[True, False])
111
 
 
118
  df = df[columns + remaining_columns]
119
 
120
  # Set checkbox configuration
121
+ check_box['essential'] = ['Algorithm', 'Dataset', 'LLM', 'X-shot', 'Eval Date']
122
+ check_box['required'] = check_box['essential'] + ['Score', 'Pass rate', 'Samples', 'All tokens', 'Cost($)']
 
123
  check_box['all'] = ['Score', 'Pass rate', 'X-shot', 'Samples', 'Total input tokens', 'Average input tokens', 'Total output tokens', 'Average output tokens', 'All tokens', 'Cost($)']
124
+
125
  type_map = defaultdict(lambda: 'number')
126
  type_map['Algorithm'] = 'html'
127
  type_map['LLM'] = type_map['Vision Model'] = 'html'
 
131
  type_map['Cost($)'] = 'number'
132
  check_box['type_map'] = type_map
133
 
 
134
  return df, check_box
135
 
136
 
meta_data.py CHANGED
@@ -45,11 +45,19 @@ LEADERBOARD_MD['MATH_DETAIL'] = f"""
45
  - default parameters: temperature=0.0
46
  - LLM prices:
47
  - gpt-3.5-turbo:
48
- - 0.0005$/1M tokens (input)
49
- - 0.0015$/1M tokens (output)
50
  - Doubao-lite-32k (1 USD = 7.3249 CNY):
51
- - 0.00004096$/1M tokens (input)
52
- - 0.0001$/1M tokens (output)
 
 
 
 
 
 
 
 
53
 
54
  - IO (Input-Output) is the baseline method that directly prompts the model with the question and expects an answer without any intermediate reasoning steps. It represents the most basic way of using language models and serves as a reference point for evaluating the effectiveness of other algorithms.
55
 
 
45
  - default parameters: temperature=0.0
46
  - LLM prices:
47
  - gpt-3.5-turbo:
48
+ - 0.5$/1M tokens (input)
49
+ - 1.5$/1M tokens (output)
50
  - Doubao-lite-32k (1 USD = 7.3249 CNY):
51
+ - 0.04096$/1M tokens (input)
52
+ - 0.08200$/1M tokens (output)
53
+ - gpt-4o-2024-08-06:
54
+ - 2.5$/1M tokens (input)
55
+ - 10$/1M tokens (output)
56
+ - Qwen2.5-72B-Instruct and Llama-3.3-70B-Instruct:
57
+ - Prices can be found at https://cloud.siliconflow.cn/.
58
+ - Other open source LLMs:
59
+ - Deployed locally; please check the [*OmAgent*](https://github.com/om-ai-lab/OmAgent) repository for more information.
60
+ - Cost is not considered in the leaderboard.
61
 
62
  - IO (Input-Output) is the baseline method that directly prompts the model with the question and expects an answer without any intermediate reasoning steps. It represents the most basic way of using language models and serves as a reference point for evaluating the effectiveness of other algorithms.
63
 
preprocess.py ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import json
3
+ from datetime import datetime
4
+
5
def process_csv_to_json(csv_path='src/record.csv',
                        json_path='src/detail_math_score.json'):
    """Convert the evaluation record sheet into the detailed leaderboard JSON.

    The output has the shape
    ``{"time": ..., "results": {algorithm: {llm: {"META": {...}, dataset: {...}}}}}``.
    When an (algorithm, LLM, dataset) combination has several rows, only the
    first one is used.

    Args:
        csv_path: Path of the CSV record sheet to read.
        json_path: Path of the JSON file to write.
    """
    df = pd.read_csv(csv_path)

    # Drop fully empty rows and map the raw sheet headers to display names.
    # Columns already carrying their final name need no entry here.
    df = df.dropna(how='all')
    df = df.rename(columns={
        'dataset': 'Dataset',
        'llm': 'LLM',
        'score\n(EM)': 'Score',
        'pass rate': 'Pass rate',
        'framework': 'Framework',
        'Nums': 'Samples',
    })

    def parse_number(value):
        """Turn a count such as "1,319" or 1319.0 into an int; missing -> 0."""
        if pd.isna(value):
            return 0
        # Strip thousands separators, go through float to accept "30.0".
        return int(float(str(value).replace(',', '')))

    def parse_float(value):
        """Return the cell as a float, or 0.0 when it is missing.

        A missing value must not reach json.dump: it would be written as the
        bare token NaN, which is not valid JSON.
        """
        return 0.0 if pd.isna(value) else float(value)

    def format_shot(value):
        """Render the X-shot cell as text; integral floats lose the '.0'.

        pandas reads the column as float as soon as one cell is empty, so a
        plain str() would emit "8.0" instead of "8".
        """
        if pd.isna(value):
            return ''
        if isinstance(value, float) and value.is_integer():
            return str(int(value))
        return str(value)

    result = {
        "time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "results": {}
    }

    llms = df['LLM'].dropna().unique()
    # Loop-invariant: the dataset names present anywhere in the sheet.
    datasets = [d for d in df['Dataset'].dropna().unique() if isinstance(d, str)]

    for algorithm in df['Algorithm'].dropna().unique():
        if not isinstance(algorithm, str):
            continue

        result['results'][algorithm] = {}

        for llm in llms:
            llm_data = df[(df['Algorithm'] == algorithm) & (df['LLM'] == llm)]
            if llm_data.empty:
                continue

            entry = {
                'META': {
                    'Algorithm': str(algorithm),
                    'LLM': str(llm),
                    'Eval Date': str(llm_data['Eval Date'].iloc[0])
                }
            }
            result['results'][algorithm][llm] = entry

            for dataset in datasets:
                dataset_data = llm_data[llm_data['Dataset'] == dataset]
                if dataset_data.empty:
                    continue

                data_row = dataset_data.iloc[0]
                has_framework = 'Framework' in data_row and pd.notnull(data_row['Framework'])
                entry[dataset] = {
                    'Score': round(parse_float(data_row['Score']), 2),
                    # Percentage -> fraction, kept to four decimal places.
                    'Pass rate': round(parse_float(data_row['Pass rate']) / 100, 4),
                    'Cost($)': parse_float(data_row['Cost($)']),
                    'Framework': str(data_row['Framework']) if has_framework else '',
                    'X-shot': format_shot(data_row['X-shot']),
                    'Samples': parse_number(data_row['Samples']),
                    'All tokens': parse_number(data_row['All tokens']),
                    'Total input tokens': parse_number(data_row['Total input tokens']),
                    'Average input tokens': parse_number(data_row['Average input tokens']),
                    'Total output tokens': parse_number(data_row['Total output tokens']),
                    'Average output tokens': parse_number(data_row['Average output tokens'])
                }

    # Sanity pass: report any entry that lacks an expected field.
    required_fields = ['Score', 'Pass rate', 'Cost($)', 'Framework', 'X-shot', 'Samples', 'All tokens', 'Total input tokens', 'Average input tokens', 'Total output tokens', 'Average output tokens']

    for key, value in result['results'].items():
        for llm, datasets_of_llm in value.items():
            meta = datasets_of_llm.get('META', {})
            if 'LLM' not in meta or 'Eval Date' not in meta:
                print(f"Missing META fields in algorithm '{key}' for LLM '{llm}'")

            for dataset, data in datasets_of_llm.items():
                if dataset == 'META':
                    continue
                missing_fields = [field for field in required_fields if field not in data]
                if missing_fields:
                    print(f"Missing fields {missing_fields} in dataset '{dataset}' for LLM '{llm}' in algorithm '{key}'")

    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(result, f, indent=4, ensure_ascii=False)
109
+
110
def process_csv_to_overall_json(csv_path='src/record.csv',
                                json_path='src/overall_math_score.json',
                                datasets=('gsm8k', 'AQuA'),
                                baseline_llm='gpt-3.5-turbo'):
    """Convert the evaluation record sheet into the overall leaderboard JSON.

    Every (algorithm, LLM) pair found in the sheet becomes one entry under
    ``results``. The entry is keyed by the algorithm name alone for the
    baseline LLM and by ``"<algorithm>-<llm>"`` for every other model. Each
    entry holds a ``META`` record plus, for each name in ``datasets``, the
    first matching row's score and cost; a dataset with no row gets 0.0 for
    both so that the key always exists.

    Args:
        csv_path: Path of the CSV record sheet to read.
        json_path: Path of the JSON file to write.
        datasets: Dataset names that every entry must contain.
        baseline_llm: LLM whose entries keep the bare algorithm name as key.
    """
    df = pd.read_csv(csv_path)

    # Drop fully empty rows and map the raw sheet headers to display names.
    df = df.dropna(how='all')
    df = df.rename(columns={
        'dataset': 'Dataset',
        'llm': 'LLM',
        'score\n(EM)': 'Score',
    })

    def number_or_zero(value):
        """Return the cell as a float, or 0.0 when it is missing."""
        return float(value) if pd.notnull(value) else 0.0

    result = {
        "time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "results": {}
    }

    # Loop-invariant: the algorithm names present in the sheet.
    algorithms = [a for a in df['Algorithm'].dropna().unique() if isinstance(a, str)]

    for llm in df['LLM'].dropna().unique():
        for algorithm in algorithms:
            # Suffix the model name for everything but the baseline LLM so
            # that keys stay unique across models.
            algo_key = algorithm if llm == baseline_llm else f"{algorithm}-{llm}"

            algo_data = df[(df['Algorithm'] == algorithm) & (df['LLM'] == llm)]
            if algo_data.empty:
                print(f"No data found for algorithm '{algorithm}' and LLM '{llm}'")
                continue

            entry = {
                "META": {
                    # str() keeps the values JSON-serialisable, matching the
                    # detail exporter.
                    "Algorithm": str(algorithm),
                    "LLM": str(llm),
                    "Eval Date": str(algo_data['Eval Date'].iloc[0])
                }
            }
            result['results'][algo_key] = entry

            for dataset in datasets:
                # algo_data is already restricted to this algorithm and LLM.
                dataset_data = algo_data[algo_data['Dataset'] == dataset]
                if dataset_data.empty:
                    # No run recorded: keep the key with default values.
                    entry[dataset] = {"Score": 0.0, "Cost($)": 0.0}
                else:
                    entry[dataset] = {
                        "Score": number_or_zero(dataset_data['Score'].iloc[0]),
                        "Cost($)": number_or_zero(dataset_data['Cost($)'].iloc[0])
                    }

    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(result, f, indent=4, ensure_ascii=False)
176
+
177
+ if __name__ == "__main__":
178
+ # Generate both JSON files (detail and overall)
179
+ process_csv_to_json()
180
+ process_csv_to_overall_json()
src/detail_math_score.json CHANGED
@@ -1,207 +1,667 @@
1
  {
2
- "time": "2025-01-09 17:13:45",
3
  "results": {
4
  "IO": {
5
  "gpt-3.5-turbo": {
6
  "META": {
7
  "Algorithm": "IO",
8
  "LLM": "gpt-3.5-turbo",
9
- "Eval Date": "2025/01/07"
10
  },
11
  "gsm8k": {
12
  "Score": 37.83,
13
- "Pass rate": 99.92,
14
- "X-shot": 8,
15
- "Parameters": "",
 
16
  "Samples": 1319,
 
17
  "Total input tokens": 546990,
18
  "Average input tokens": 415,
19
  "Total output tokens": 39563,
20
- "Average output tokens": 30,
21
- "All tokens": 586553,
22
- "Cost($)": 0.3328
23
  },
24
  "AQuA": {
25
- "Score": 38.98,
26
- "Pass rate": 100.00,
27
- "X-shot": 0,
28
- "Parameters": "",
 
29
  "Samples": 254,
 
30
  "Total input tokens": 25701,
31
  "Average input tokens": 101,
32
  "Total output tokens": 16770,
33
- "Average output tokens": 66,
34
- "All tokens": 42471,
35
- "Cost($)": 0.0380
36
  }
37
  },
38
  "Doubao-lite-32k": {
39
  "META": {
40
  "Algorithm": "IO",
41
  "LLM": "Doubao-lite-32k",
42
- "Eval Date": "2025/01/07"
43
  },
44
  "gsm8k": {
45
  "Score": 72.02,
46
- "Pass rate": 99.92,
47
- "X-shot": 8,
48
- "Parameters": "",
 
49
  "Samples": 1319,
 
50
  "Total input tokens": 617377,
51
  "Average input tokens": 468,
52
  "Total output tokens": 123106,
53
- "Average output tokens": 93,
54
- "All tokens": 740483,
55
- "Cost($)": 0.0354
56
  },
57
  "AQuA": {
58
  "Score": 79.13,
59
- "Pass rate": 100.00,
60
- "X-shot": 0,
61
- "Parameters": "",
 
62
  "Samples": 254,
 
63
  "Total input tokens": 33058,
64
  "Average input tokens": 130,
65
  "Total output tokens": 54684,
66
- "Average output tokens": 215,
67
- "All tokens": 87742,
68
- "Cost($)": 0.0058
69
  }
70
- }
71
- },
72
- "CoT": {
73
- "gpt-3.5-turbo": {
74
  "META": {
75
- "Algorithm": "CoT",
76
- "LLM": "gpt-3.5-turbo",
77
- "Eval Date": "2025/01/07"
78
  },
79
  "gsm8k": {
80
- "Score": 78.70,
81
- "Pass rate": 100.00,
82
- "X-shot": 8,
83
- "Parameters": "",
 
84
  "Samples": 1319,
85
- "Total input tokens": 953242,
86
- "Average input tokens": 723,
87
- "Total output tokens": 134799,
88
- "Average output tokens": 102,
89
- "All tokens": 1088041,
90
- "Cost($)": 0.6788
91
  },
92
  "AQuA": {
93
- "Score": 61.02,
94
- "Pass rate": 93.70,
95
- "X-shot": 0,
96
- "Parameters": "",
 
97
  "Samples": 254,
98
- "Total input tokens": 25447,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99
  "Average input tokens": 100,
100
- "Total output tokens": 55346,
101
- "Average output tokens": 218,
102
- "All tokens": 80793,
103
- "Cost($)": 0.0957
104
  }
105
  },
106
- "Doubao-lite-32k": {
107
  "META": {
108
- "Algorithm": "CoT",
109
- "LLM": "Doubao-lite-32k",
110
- "Eval Date": "2025/01/07"
111
  },
112
  "gsm8k": {
113
- "Score": 89.31,
114
- "Pass rate": 100.00,
115
- "X-shot": 8,
116
- "Parameters": "",
 
117
  "Samples": 1319,
118
- "Total input tokens": 1042095,
119
- "Average input tokens": 790,
120
- "Total output tokens": 159725,
121
- "Average output tokens": 121,
122
- "All tokens": 1201820,
123
- "Cost($)": 0.0557
124
  },
125
  "AQuA": {
126
- "Score": 82.68,
127
- "Pass rate": 97.24,
128
- "X-shot": 0,
129
- "Parameters": "",
 
130
  "Samples": 254,
131
- "Total input tokens": 27978,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
132
  "Average input tokens": 110,
133
- "Total output tokens": 66599,
134
- "Average output tokens": 262,
135
- "All tokens": 94577,
136
- "Cost($)": 0.0066
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
137
  }
138
  }
139
  },
140
- "SC-COT": {
141
  "gpt-3.5-turbo": {
142
  "META": {
143
- "Algorithm": "SC-CoT",
144
  "LLM": "gpt-3.5-turbo",
145
- "Eval Date": "2025/01/07"
146
  },
147
  "gsm8k": {
148
- "Score": 80.06,
149
- "Pass rate": 99.62,
150
- "X-shot": 8,
151
- "Parameters": "temperature=1, num_path=5",
 
152
  "Samples": 1319,
153
- "Total input tokens": 5260319,
154
- "Average input tokens": 3988,
155
- "Total output tokens": 1595016,
156
- "Average output tokens": 1209,
157
- "All tokens": 6855335,
158
- "Cost($)": 5.0227
159
  },
160
  "AQuA": {
161
- "Score": 67.32,
162
- "Pass rate": 100.00,
163
- "X-shot": 0,
164
- "Parameters": "temperature=1, path_num=5",
 
165
  "Samples": 254,
166
- "Total input tokens": 219241,
167
- "Average input tokens": 863,
168
- "Total output tokens": 359629,
169
- "Average output tokens": 1416,
170
- "All tokens": 578870,
171
- "Cost($)": 0.6491
172
  }
173
  },
174
  "Doubao-lite-32k": {
175
  "META": {
176
- "Algorithm": "SC-CoT",
177
  "LLM": "Doubao-lite-32k",
178
- "Eval Date": "2025/01/07"
179
  },
180
  "gsm8k": {
181
- "Score": 88.63,
182
- "Pass rate": 99.77,
183
- "X-shot": 8,
184
- "Parameters": "temperature=1, num_path=5",
 
185
  "Samples": 1319,
186
- "Total input tokens": 1150443,
187
- "Average input tokens": 872,
188
- "Total output tokens": 1295750,
189
- "Average output tokens": 982,
190
- "All tokens": 2446193,
191
- "Cost($)": 0.1533
192
  },
193
  "AQuA": {
194
- "Score": 83.46,
195
- "Pass rate": 97.24,
196
- "X-shot": 0,
197
- "Parameters": "temperature=1, num_path=5",
 
198
  "Samples": 254,
199
- "Total input tokens": 259804,
200
- "Average input tokens": 1023,
201
- "Total output tokens": 369741,
202
- "Average output tokens": 1456,
203
- "All tokens": 629545,
204
- "Cost($)": 0.0409
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
205
  }
206
  }
207
  },
@@ -210,136 +670,996 @@
210
  "META": {
211
  "Algorithm": "PoT",
212
  "LLM": "gpt-3.5-turbo",
213
- "Eval Date": "2025/01/07"
214
  },
215
  "gsm8k": {
216
- "Score": 76.88,
217
- "Pass rate": 99.24,
218
- "X-shot": 8,
219
- "Parameters": "",
 
220
  "Samples": 1319,
 
221
  "Total input tokens": 1090418,
222
  "Average input tokens": 827,
223
  "Total output tokens": 96662,
224
- "Average output tokens": 73,
225
- "All tokens": 1187080,
226
- "Cost($)": 0.6902
227
  },
228
  "AQuA": {
229
- "Score": 59.45,
230
- "Pass rate": 100,
231
- "X-shot": 0,
232
- "Parameters": "",
 
233
  "Samples": 254,
 
234
  "Total input tokens": 225162,
235
  "Average input tokens": 886,
236
  "Total output tokens": 41492,
237
- "Average output tokens": 163,
238
- "All tokens": 266654,
239
- "Cost($)": 0.1748
240
  }
241
  },
242
  "Doubao-lite-32k": {
243
  "META": {
244
  "Algorithm": "PoT",
245
  "LLM": "Doubao-lite-32k",
246
- "Eval Date": "2025/01/07"
247
  },
248
  "gsm8k": {
249
- "Score": 79.61,
250
- "Pass rate": 92.57,
251
- "X-shot": 8,
252
- "Parameters": "",
 
253
  "Samples": 1319,
 
254
  "Total input tokens": 1170038,
255
  "Average input tokens": 887,
256
  "Total output tokens": 118017,
257
- "Average output tokens": 89,
258
- "All tokens": 1288055,
259
- "Cost($)": 0.0575
260
  },
261
  "AQuA": {
262
  "Score": 71.65,
263
- "Pass rate": 96.85,
264
- "X-shot": 0,
265
- "Parameters": "",
 
266
  "Samples": 254,
 
267
  "Total input tokens": 259863,
268
  "Average input tokens": 1023,
269
  "Total output tokens": 49573,
270
- "Average output tokens": 195,
271
- "All tokens": 309436,
272
- "Cost($)": 0.0147
273
  }
274
- }
275
- },
276
- "ReAct-Pro*": {
277
- "gpt-3.5-turbo": {
278
  "META": {
279
- "Algorithm": "ReAct-Pro*",
280
- "LLM": "gpt-3.5-turbo",
281
- "Eval Date": "2025/01/07"
282
  },
283
  "gsm8k": {
284
- "Score": 74.91,
285
- "Pass rate": 99.39,
286
- "X-shot": 8,
287
- "Parameters": "max_steps=10",
 
288
  "Samples": 1319,
289
- "Total input tokens": 6506164,
290
- "Average input tokens": 4933,
291
- "Total output tokens": 140122,
292
- "Average output tokens": 106,
293
- "All tokens": 6646286,
294
- "Cost($)": 3.4633
295
  },
296
  "AQuA": {
297
- "Score": 64.57,
298
- "Pass rate": 98.03,
299
- "X-shot": 0,
300
- "Parameters": "max_steps=10",
 
301
  "Samples": 254,
302
- "Total input tokens": 862614,
303
- "Average input tokens": 3396,
304
- "Total output tokens": 40973,
305
- "Average output tokens": 161,
306
- "All tokens": 903587,
307
- "Cost($)": 0.4928
308
  }
309
  },
310
- "Doubao-lite-32k": {
311
  "META": {
312
- "Algorithm": "ReAct-Pro*",
313
- "LLM": "Doubao-lite-32k",
314
- "Eval Date": "2025/01/07"
315
  },
316
  "gsm8k": {
317
- "Score": 85.60,
318
- "Pass rate": 99.62,
319
- "X-shot": 8,
320
- "Parameters": "max_steps=10",
 
321
  "Samples": 1319,
322
- "Total input tokens": 5862016,
323
- "Average input tokens": 4444,
324
- "Total output tokens": 136623,
325
- "Average output tokens": 104,
326
- "All tokens": 5998639,
327
- "Cost($)": 0.2513
328
  },
329
  "AQuA": {
330
- "Score": 77.56,
331
- "Pass rate": 96.06,
332
- "X-shot": 0,
333
- "Parameters": "max_steps=10",
 
334
  "Samples": 254,
335
- "Total input tokens": 977890,
336
- "Average input tokens": 3850,
337
- "Total output tokens": 54951,
338
- "Average output tokens": 216,
339
- "All tokens": 1032841,
340
- "Cost($)": 0.0446
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
341
  }
342
  }
343
  }
344
  }
345
- }
 
1
  {
2
+ "time": "2025-01-23 09:27:24",
3
  "results": {
4
  "IO": {
5
  "gpt-3.5-turbo": {
6
  "META": {
7
  "Algorithm": "IO",
8
  "LLM": "gpt-3.5-turbo",
9
+ "Eval Date": "2025/1/7"
10
  },
11
  "gsm8k": {
12
  "Score": 37.83,
13
+ "Pass rate": 0.9992,
14
+ "Cost($)": 0.3328,
15
+ "Framework": "",
16
+ "X-shot": "8.0",
17
  "Samples": 1319,
18
+ "All tokens": 586553,
19
  "Total input tokens": 546990,
20
  "Average input tokens": 415,
21
  "Total output tokens": 39563,
22
+ "Average output tokens": 30
 
 
23
  },
24
  "AQuA": {
25
+ "Score": 38.97,
26
+ "Pass rate": 1.0,
27
+ "Cost($)": 0.038,
28
+ "Framework": "",
29
+ "X-shot": "0.0",
30
  "Samples": 254,
31
+ "All tokens": 42471,
32
  "Total input tokens": 25701,
33
  "Average input tokens": 101,
34
  "Total output tokens": 16770,
35
+ "Average output tokens": 66
 
 
36
  }
37
  },
38
  "Doubao-lite-32k": {
39
  "META": {
40
  "Algorithm": "IO",
41
  "LLM": "Doubao-lite-32k",
42
+ "Eval Date": "2025/1/7"
43
  },
44
  "gsm8k": {
45
  "Score": 72.02,
46
+ "Pass rate": 0.9992,
47
+ "Cost($)": 0.0354,
48
+ "Framework": "",
49
+ "X-shot": "8.0",
50
  "Samples": 1319,
51
+ "All tokens": 740483,
52
  "Total input tokens": 617377,
53
  "Average input tokens": 468,
54
  "Total output tokens": 123106,
55
+ "Average output tokens": 93
 
 
56
  },
57
  "AQuA": {
58
  "Score": 79.13,
59
+ "Pass rate": 1.0,
60
+ "Cost($)": 0.0058,
61
+ "Framework": "",
62
+ "X-shot": "0.0",
63
  "Samples": 254,
64
+ "All tokens": 87742,
65
  "Total input tokens": 33058,
66
  "Average input tokens": 130,
67
  "Total output tokens": 54684,
68
+ "Average output tokens": 215
 
 
69
  }
70
+ },
71
+ "gpt-4o": {
 
 
72
  "META": {
73
+ "Algorithm": "IO",
74
+ "LLM": "gpt-4o",
75
+ "Eval Date": "2025/1/22"
76
  },
77
  "gsm8k": {
78
+ "Score": 88.4,
79
+ "Pass rate": 1.0,
80
+ "Cost($)": 3.3463,
81
+ "Framework": "",
82
+ "X-shot": "8.0",
83
  "Samples": 1319,
84
+ "All tokens": 741446,
85
+ "Total input tokens": 542416,
86
+ "Average input tokens": 411,
87
+ "Total output tokens": 199030,
88
+ "Average output tokens": 151
 
89
  },
90
  "AQuA": {
91
+ "Score": 75.59,
92
+ "Pass rate": 0.9724,
93
+ "Cost($)": 1.1453,
94
+ "Framework": "",
95
+ "X-shot": "0.0",
96
  "Samples": 254,
97
+ "All tokens": 133752,
98
+ "Total input tokens": 25631,
99
+ "Average input tokens": 101,
100
+ "Total output tokens": 108121,
101
+ "Average output tokens": 426
102
+ }
103
+ },
104
+ "Qwen2.5-72B-Instruct": {
105
+ "META": {
106
+ "Algorithm": "IO",
107
+ "LLM": "Qwen2.5-72B-Instruct",
108
+ "Eval Date": "2025/1/22"
109
+ },
110
+ "gsm8k": {
111
+ "Score": 86.58,
112
+ "Pass rate": 1.0,
113
+ "Cost($)": 0.4899,
114
+ "Framework": "",
115
+ "X-shot": "8.0",
116
+ "Samples": 1319,
117
+ "All tokens": 869060,
118
+ "Total input tokens": 555340,
119
+ "Average input tokens": 421,
120
+ "Total output tokens": 313720,
121
+ "Average output tokens": 238
122
+ },
123
+ "AQuA": {
124
+ "Score": 84.25,
125
+ "Pass rate": 0.996,
126
+ "Cost($)": 0.0742,
127
+ "Framework": "",
128
+ "X-shot": "0.0",
129
+ "Samples": 254,
130
+ "All tokens": 131604,
131
+ "Total input tokens": 25397,
132
  "Average input tokens": 100,
133
+ "Total output tokens": 106207,
134
+ "Average output tokens": 418
 
 
135
  }
136
  },
137
+ "Llama-3.3-70B-Instruct": {
138
  "META": {
139
+ "Algorithm": "IO",
140
+ "LLM": "Llama-3.3-70B-Instruct",
141
+ "Eval Date": "2025/1/22"
142
  },
143
  "gsm8k": {
144
+ "Score": 92.26,
145
+ "Pass rate": 1.0,
146
+ "Cost($)": 0.4709,
147
+ "Framework": "",
148
+ "X-shot": "8.0",
149
  "Samples": 1319,
150
+ "All tokens": 835275,
151
+ "Total input tokens": 583916,
152
+ "Average input tokens": 443,
153
+ "Total output tokens": 251359,
154
+ "Average output tokens": 191
 
155
  },
156
  "AQuA": {
157
+ "Score": 82.67,
158
+ "Pass rate": 0.9921,
159
+ "Cost($)": 0.0798,
160
+ "Framework": "",
161
+ "X-shot": "0.0",
162
  "Samples": 254,
163
+ "All tokens": 141567,
164
+ "Total input tokens": 32809,
165
+ "Average input tokens": 129,
166
+ "Total output tokens": 108758,
167
+ "Average output tokens": 428
168
+ }
169
+ },
170
+ "Qwen2.5-7B-Instruct": {
171
+ "META": {
172
+ "Algorithm": "IO",
173
+ "LLM": "Qwen2.5-7B-Instruct",
174
+ "Eval Date": "2025/1/22"
175
+ },
176
+ "gsm8k": {
177
+ "Score": 57.24,
178
+ "Pass rate": 1.0,
179
+ "Cost($)": 0.0,
180
+ "Framework": "",
181
+ "X-shot": "8.0",
182
+ "Samples": 1319,
183
+ "All tokens": 887913,
184
+ "Total input tokens": 596229,
185
+ "Average input tokens": 452,
186
+ "Total output tokens": 291684,
187
+ "Average output tokens": 221
188
+ },
189
+ "AQuA": {
190
+ "Score": 78.74,
191
+ "Pass rate": 0.9842,
192
+ "Cost($)": 0.0,
193
+ "Framework": "",
194
+ "X-shot": "0.0",
195
+ "Samples": 254,
196
+ "All tokens": 137771,
197
+ "Total input tokens": 33271,
198
+ "Average input tokens": 131,
199
+ "Total output tokens": 104500,
200
+ "Average output tokens": 411
201
+ }
202
+ },
203
+ "Llama-3.1-8B-Instruct": {
204
+ "META": {
205
+ "Algorithm": "IO",
206
+ "LLM": "Llama-3.1-8B-Instruct",
207
+ "Eval Date": "2025/1/22"
208
+ },
209
+ "gsm8k": {
210
+ "Score": 57.16,
211
+ "Pass rate": 0.9954,
212
+ "Cost($)": 0.0,
213
+ "Framework": "",
214
+ "X-shot": "8.0",
215
+ "Samples": 1319,
216
+ "All tokens": 1745429,
217
+ "Total input tokens": 550941,
218
+ "Average input tokens": 418,
219
+ "Total output tokens": 1194488,
220
+ "Average output tokens": 906
221
+ },
222
+ "AQuA": {
223
+ "Score": 51.18,
224
+ "Pass rate": 0.9881,
225
+ "Cost($)": 0.0,
226
+ "Framework": "",
227
+ "X-shot": "0.0",
228
+ "Samples": 254,
229
+ "All tokens": 133106,
230
+ "Total input tokens": 26459,
231
+ "Average input tokens": 104,
232
+ "Total output tokens": 106647,
233
+ "Average output tokens": 420
234
+ }
235
+ },
236
+ "Internllm2_5-7B": {
237
+ "META": {
238
+ "Algorithm": "IO",
239
+ "LLM": "Internllm2_5-7B",
240
+ "Eval Date": "2025/1/22"
241
+ },
242
+ "gsm8k": {
243
+ "Score": 11.59,
244
+ "Pass rate": 0.9795,
245
+ "Cost($)": 0.0,
246
+ "Framework": "",
247
+ "X-shot": "8.0",
248
+ "Samples": 1319,
249
+ "All tokens": 1113728,
250
+ "Total input tokens": 679302,
251
+ "Average input tokens": 515,
252
+ "Total output tokens": 434426,
253
+ "Average output tokens": 329
254
+ },
255
+ "AQuA": {
256
+ "Score": 47.63,
257
+ "Pass rate": 0.9094,
258
+ "Cost($)": 0.0,
259
+ "Framework": "",
260
+ "X-shot": "0.0",
261
+ "Samples": 254,
262
+ "All tokens": 185041,
263
+ "Total input tokens": 50232,
264
+ "Average input tokens": 198,
265
+ "Total output tokens": 134809,
266
+ "Average output tokens": 531
267
+ }
268
+ },
269
+ "Qwen2-1.5B-Instruct": {
270
+ "META": {
271
+ "Algorithm": "IO",
272
+ "LLM": "Qwen2-1.5B-Instruct",
273
+ "Eval Date": "2025/1/22"
274
+ },
275
+ "gsm8k": {
276
+ "Score": 16.67,
277
+ "Pass rate": 1.0,
278
+ "Cost($)": 0.0,
279
+ "Framework": "",
280
+ "X-shot": "8.0",
281
+ "Samples": 1319,
282
+ "All tokens": 736996,
283
+ "Total input tokens": 568530,
284
+ "Average input tokens": 431,
285
+ "Total output tokens": 168466,
286
+ "Average output tokens": 128
287
+ },
288
+ "AQuA": {
289
+ "Score": 29.13,
290
+ "Pass rate": 0.9763,
291
+ "Cost($)": 0.0,
292
+ "Framework": "",
293
+ "X-shot": "0.0",
294
+ "Samples": 254,
295
+ "All tokens": 71047,
296
+ "Total input tokens": 27937,
297
  "Average input tokens": 110,
298
+ "Total output tokens": 43110,
299
+ "Average output tokens": 170
300
+ }
301
+ },
302
+ "Qwen2-0.5B-Instruct": {
303
+ "META": {
304
+ "Algorithm": "IO",
305
+ "LLM": "Qwen2-0.5B-Instruct",
306
+ "Eval Date": "2025/1/22"
307
+ },
308
+ "gsm8k": {
309
+ "Score": 14.7,
310
+ "Pass rate": 1.0,
311
+ "Cost($)": 0.0,
312
+ "Framework": "",
313
+ "X-shot": "8.0",
314
+ "Samples": 1319,
315
+ "All tokens": 834897,
316
+ "Total input tokens": 568116,
317
+ "Average input tokens": 431,
318
+ "Total output tokens": 266781,
319
+ "Average output tokens": 202
320
+ },
321
+ "AQuA": {
322
+ "Score": 27.16,
323
+ "Pass rate": 0.9881,
324
+ "Cost($)": 0.0,
325
+ "Framework": "",
326
+ "X-shot": "0.0",
327
+ "Samples": 254,
328
+ "All tokens": 110415,
329
+ "Total input tokens": 27937,
330
+ "Average input tokens": 110,
331
+ "Total output tokens": 82478,
332
+ "Average output tokens": 325
333
  }
334
  }
335
  },
336
+ "ReAct-Pro*": {
337
  "gpt-3.5-turbo": {
338
  "META": {
339
+ "Algorithm": "ReAct-Pro*",
340
  "LLM": "gpt-3.5-turbo",
341
+ "Eval Date": "2025/1/7"
342
  },
343
  "gsm8k": {
344
+ "Score": 74.9,
345
+ "Pass rate": 0.9939,
346
+ "Cost($)": 3.4633,
347
+ "Framework": "",
348
+ "X-shot": "8.0",
349
  "Samples": 1319,
350
+ "All tokens": 6646286,
351
+ "Total input tokens": 6506164,
352
+ "Average input tokens": 4933,
353
+ "Total output tokens": 140122,
354
+ "Average output tokens": 106
 
355
  },
356
  "AQuA": {
357
+ "Score": 64.56,
358
+ "Pass rate": 0.9803,
359
+ "Cost($)": 0.4928,
360
+ "Framework": "",
361
+ "X-shot": "0.0",
362
  "Samples": 254,
363
+ "All tokens": 903587,
364
+ "Total input tokens": 862614,
365
+ "Average input tokens": 3396,
366
+ "Total output tokens": 40973,
367
+ "Average output tokens": 161
 
368
  }
369
  },
370
  "Doubao-lite-32k": {
371
  "META": {
372
+ "Algorithm": "ReAct-Pro*",
373
  "LLM": "Doubao-lite-32k",
374
+ "Eval Date": "2025/1/7"
375
  },
376
  "gsm8k": {
377
+ "Score": 85.59,
378
+ "Pass rate": 0.9962,
379
+ "Cost($)": 0.2512,
380
+ "Framework": "",
381
+ "X-shot": "8.0",
382
  "Samples": 1319,
383
+ "All tokens": 5998639,
384
+ "Total input tokens": 5862016,
385
+ "Average input tokens": 4444,
386
+ "Total output tokens": 136623,
387
+ "Average output tokens": 104
 
388
  },
389
  "AQuA": {
390
+ "Score": 77.55,
391
+ "Pass rate": 0.9606,
392
+ "Cost($)": 0.0445,
393
+ "Framework": "",
394
+ "X-shot": "0.0",
395
  "Samples": 254,
396
+ "All tokens": 1032841,
397
+ "Total input tokens": 977890,
398
+ "Average input tokens": 3850,
399
+ "Total output tokens": 54951,
400
+ "Average output tokens": 216
401
+ }
402
+ },
403
+ "gpt-4o": {
404
+ "META": {
405
+ "Algorithm": "ReAct-Pro*",
406
+ "LLM": "gpt-4o",
407
+ "Eval Date": "2025/1/22"
408
+ },
409
+ "gsm8k": {
410
+ "Score": 63.3,
411
+ "Pass rate": 0.9954,
412
+ "Cost($)": 39.0751,
413
+ "Framework": "",
414
+ "X-shot": "8.0",
415
+ "Samples": 1319,
416
+ "All tokens": 14715887,
417
+ "Total input tokens": 14411173,
418
+ "Average input tokens": 10926,
419
+ "Total output tokens": 304714,
420
+ "Average output tokens": 231
421
+ },
422
+ "AQuA": {
423
+ "Score": 57.48,
424
+ "Pass rate": 0.9724,
425
+ "Cost($)": 2.304,
426
+ "Framework": "",
427
+ "X-shot": "0.0",
428
+ "Samples": 254,
429
+ "All tokens": 692096,
430
+ "Total input tokens": 615589,
431
+ "Average input tokens": 2424,
432
+ "Total output tokens": 76507,
433
+ "Average output tokens": 301
434
+ }
435
+ },
436
+ "Qwen2.5-72B-Instruct": {
437
+ "META": {
438
+ "Algorithm": "ReAct-Pro*",
439
+ "LLM": "Qwen2.5-72B-Instruct",
440
+ "Eval Date": "2025/1/22"
441
+ },
442
+ "gsm8k": {
443
+ "Score": 87.26,
444
+ "Pass rate": 1.0,
445
+ "Cost($)": 10.5479,
446
+ "Framework": "",
447
+ "X-shot": "8.0",
448
+ "Samples": 1319,
449
+ "All tokens": 18710437,
450
+ "Total input tokens": 18160983,
451
+ "Average input tokens": 13769,
452
+ "Total output tokens": 549454,
453
+ "Average output tokens": 417
454
+ },
455
+ "AQuA": {
456
+ "Score": 73.22,
457
+ "Pass rate": 1.0,
458
+ "Cost($)": 0.3177,
459
+ "Framework": "",
460
+ "X-shot": "0.0",
461
+ "Samples": 254,
462
+ "All tokens": 563603,
463
+ "Total input tokens": 441765,
464
+ "Average input tokens": 1739,
465
+ "Total output tokens": 121838,
466
+ "Average output tokens": 480
467
+ }
468
+ },
469
+ "Llama-3.3-70B-Instruct": {
470
+ "META": {
471
+ "Algorithm": "ReAct-Pro*",
472
+ "LLM": "Llama-3.3-70B-Instruct",
473
+ "Eval Date": "2025/1/22"
474
+ },
475
+ "gsm8k": {
476
+ "Score": 87.64,
477
+ "Pass rate": 0.9992,
478
+ "Cost($)": 10.1124,
479
+ "Framework": "",
480
+ "X-shot": "",
481
+ "Samples": 1319,
482
+ "All tokens": 17937864,
483
+ "Total input tokens": 17038928,
484
+ "Average input tokens": 12918,
485
+ "Total output tokens": 898936,
486
+ "Average output tokens": 682
487
+ },
488
+ "AQuA": {
489
+ "Score": 79.13,
490
+ "Pass rate": 0.996,
491
+ "Cost($)": 0.768,
492
+ "Framework": "",
493
+ "X-shot": "0.0",
494
+ "Samples": 254,
495
+ "All tokens": 1362379,
496
+ "Total input tokens": 1119143,
497
+ "Average input tokens": 4406,
498
+ "Total output tokens": 243236,
499
+ "Average output tokens": 958
500
+ }
501
+ },
502
+ "Qwen2.5-7B-Instruct": {
503
+ "META": {
504
+ "Algorithm": "ReAct-Pro*",
505
+ "LLM": "Qwen2.5-7B-Instruct",
506
+ "Eval Date": "2025/1/22"
507
+ },
508
+ "gsm8k": {
509
+ "Score": 82.86,
510
+ "Pass rate": 1.0,
511
+ "Cost($)": 0.0,
512
+ "Framework": "",
513
+ "X-shot": "8.0",
514
+ "Samples": 1319,
515
+ "All tokens": 14850914,
516
+ "Total input tokens": 14355752,
517
+ "Average input tokens": 10884,
518
+ "Total output tokens": 495162,
519
+ "Average output tokens": 375
520
+ },
521
+ "AQuA": {
522
+ "Score": 74.4,
523
+ "Pass rate": 0.9921,
524
+ "Cost($)": 0.0,
525
+ "Framework": "",
526
+ "X-shot": "0.0",
527
+ "Samples": 254,
528
+ "All tokens": 695844,
529
+ "Total input tokens": 564165,
530
+ "Average input tokens": 2221,
531
+ "Total output tokens": 131679,
532
+ "Average output tokens": 518
533
+ }
534
+ },
535
+ "Llama-3.1-8B-Instruct": {
536
+ "META": {
537
+ "Algorithm": "ReAct-Pro*",
538
+ "LLM": "Llama-3.1-8B-Instruct",
539
+ "Eval Date": "2025/1/22"
540
+ },
541
+ "gsm8k": {
542
+ "Score": 67.77,
543
+ "Pass rate": 0.9855,
544
+ "Cost($)": 0.0,
545
+ "Framework": "",
546
+ "X-shot": "8.0",
547
+ "Samples": 1319,
548
+ "All tokens": 22835767,
549
+ "Total input tokens": 21044978,
550
+ "Average input tokens": 15955,
551
+ "Total output tokens": 1790789,
552
+ "Average output tokens": 1358
553
+ },
554
+ "AQuA": {
555
+ "Score": 55.51,
556
+ "Pass rate": 0.9685,
557
+ "Cost($)": 0.0,
558
+ "Framework": "",
559
+ "X-shot": "0.0",
560
+ "Samples": 254,
561
+ "All tokens": 4340821,
562
+ "Total input tokens": 3764723,
563
+ "Average input tokens": 14822,
564
+ "Total output tokens": 576098,
565
+ "Average output tokens": 2268
566
+ }
567
+ },
568
+ "Internllm2_5-7B": {
569
+ "META": {
570
+ "Algorithm": "ReAct-Pro*",
571
+ "LLM": "Internllm2_5-7B",
572
+ "Eval Date": "2025/1/22"
573
+ },
574
+ "gsm8k": {
575
+ "Score": 33.51,
576
+ "Pass rate": 0.9795,
577
+ "Cost($)": 0.0,
578
+ "Framework": "",
579
+ "X-shot": "",
580
+ "Samples": 1319,
581
+ "All tokens": 35669989,
582
+ "Total input tokens": 30120070,
583
+ "Average input tokens": 22836,
584
+ "Total output tokens": 5549919,
585
+ "Average output tokens": 4208
586
+ },
587
+ "AQuA": {
588
+ "Score": 40.94,
589
+ "Pass rate": 0.9685,
590
+ "Cost($)": 0.0,
591
+ "Framework": "",
592
+ "X-shot": "0.0",
593
+ "Samples": 254,
594
+ "All tokens": 4428801,
595
+ "Total input tokens": 3592039,
596
+ "Average input tokens": 14142,
597
+ "Total output tokens": 836762,
598
+ "Average output tokens": 3294
599
+ }
600
+ },
601
+ "Qwen2-1.5B-Instruct": {
602
+ "META": {
603
+ "Algorithm": "ReAct-Pro*",
604
+ "LLM": "Qwen2-1.5B-Instruct",
605
+ "Eval Date": "2025/1/22"
606
+ },
607
+ "gsm8k": {
608
+ "Score": 24.86,
609
+ "Pass rate": 0.8021,
610
+ "Cost($)": 0.0,
611
+ "Framework": "",
612
+ "X-shot": "8.0",
613
+ "Samples": 1319,
614
+ "All tokens": 9828001,
615
+ "Total input tokens": 9133603,
616
+ "Average input tokens": 6925,
617
+ "Total output tokens": 694398,
618
+ "Average output tokens": 526
619
+ },
620
+ "AQuA": {
621
+ "Score": 25.59,
622
+ "Pass rate": 0.9606,
623
+ "Cost($)": 0.0,
624
+ "Framework": "",
625
+ "X-shot": "0.0",
626
+ "Samples": 254,
627
+ "All tokens": 5072004,
628
+ "Total input tokens": 4555858,
629
+ "Average input tokens": 17936,
630
+ "Total output tokens": 516146,
631
+ "Average output tokens": 2032
632
+ }
633
+ },
634
+ "Qwen2-0.5B-Instruct": {
635
+ "META": {
636
+ "Algorithm": "ReAct-Pro*",
637
+ "LLM": "Qwen2-0.5B-Instruct",
638
+ "Eval Date": "2025/1/22"
639
+ },
640
+ "gsm8k": {
641
+ "Score": 7.65,
642
+ "Pass rate": 0.9522,
643
+ "Cost($)": 0.0,
644
+ "Framework": "",
645
+ "X-shot": "8.0",
646
+ "Samples": 1319,
647
+ "All tokens": 55392611,
648
+ "Total input tokens": 52431343,
649
+ "Average input tokens": 39751,
650
+ "Total output tokens": 2961268,
651
+ "Average output tokens": 2245
652
+ },
653
+ "AQuA": {
654
+ "Score": 24.01,
655
+ "Pass rate": 0.9685,
656
+ "Cost($)": 0.0,
657
+ "Framework": "",
658
+ "X-shot": "0.0",
659
+ "Samples": 254,
660
+ "All tokens": 7170087,
661
+ "Total input tokens": 6344167,
662
+ "Average input tokens": 24977,
663
+ "Total output tokens": 825920,
664
+ "Average output tokens": 3252
665
  }
666
  }
667
  },
 
670
  "META": {
671
  "Algorithm": "PoT",
672
  "LLM": "gpt-3.5-turbo",
673
+ "Eval Date": "2025/1/7"
674
  },
675
  "gsm8k": {
676
+ "Score": 76.87,
677
+ "Pass rate": 0.9924,
678
+ "Cost($)": 0.6902,
679
+ "Framework": "",
680
+ "X-shot": "8.0",
681
  "Samples": 1319,
682
+ "All tokens": 1187080,
683
  "Total input tokens": 1090418,
684
  "Average input tokens": 827,
685
  "Total output tokens": 96662,
686
+ "Average output tokens": 73
 
 
687
  },
688
  "AQuA": {
689
+ "Score": 59.44,
690
+ "Pass rate": 1.0,
691
+ "Cost($)": 0.1748,
692
+ "Framework": "",
693
+ "X-shot": "0.0",
694
  "Samples": 254,
695
+ "All tokens": 266654,
696
  "Total input tokens": 225162,
697
  "Average input tokens": 886,
698
  "Total output tokens": 41492,
699
+ "Average output tokens": 163
 
 
700
  }
701
  },
702
  "Doubao-lite-32k": {
703
  "META": {
704
  "Algorithm": "PoT",
705
  "LLM": "Doubao-lite-32k",
706
+ "Eval Date": "2025/1/7"
707
  },
708
  "gsm8k": {
709
+ "Score": 79.6,
710
+ "Pass rate": 0.9257,
711
+ "Cost($)": 0.0576,
712
+ "Framework": "",
713
+ "X-shot": "8.0",
714
  "Samples": 1319,
715
+ "All tokens": 1288055,
716
  "Total input tokens": 1170038,
717
  "Average input tokens": 887,
718
  "Total output tokens": 118017,
719
+ "Average output tokens": 89
 
 
720
  },
721
  "AQuA": {
722
  "Score": 71.65,
723
+ "Pass rate": 0.9685,
724
+ "Cost($)": 0.0147,
725
+ "Framework": "",
726
+ "X-shot": "0.0",
727
  "Samples": 254,
728
+ "All tokens": 309436,
729
  "Total input tokens": 259863,
730
  "Average input tokens": 1023,
731
  "Total output tokens": 49573,
732
+ "Average output tokens": 195
 
 
733
  }
734
+ },
735
+ "gpt-4o": {
 
 
736
  "META": {
737
+ "Algorithm": "PoT",
738
+ "LLM": "gpt-4o",
739
+ "Eval Date": "2025/1/22"
740
  },
741
  "gsm8k": {
742
+ "Score": 93.1,
743
+ "Pass rate": 0.9977,
744
+ "Cost($)": 4.2166,
745
+ "Framework": "",
746
+ "X-shot": "8.0",
747
  "Samples": 1319,
748
+ "All tokens": 1247912,
749
+ "Total input tokens": 1101672,
750
+ "Average input tokens": 835,
751
+ "Total output tokens": 146240,
752
+ "Average output tokens": 111
 
753
  },
754
  "AQuA": {
755
+ "Score": 75.19,
756
+ "Pass rate": 1.0,
757
+ "Cost($)": 1.6087,
758
+ "Framework": "",
759
+ "X-shot": "0.0",
760
  "Samples": 254,
761
+ "All tokens": 327908,
762
+ "Total input tokens": 222717,
763
+ "Average input tokens": 877,
764
+ "Total output tokens": 105191,
765
+ "Average output tokens": 414
 
766
  }
767
  },
768
+ "Qwen2.5-72B-Instruct": {
769
  "META": {
770
+ "Algorithm": "PoT",
771
+ "LLM": "Qwen2.5-72B-Instruct",
772
+ "Eval Date": "2025/1/22"
773
  },
774
  "gsm8k": {
775
+ "Score": 92.34,
776
+ "Pass rate": 0.9939,
777
+ "Cost($)": 0.7054,
778
+ "Framework": "",
779
+ "X-shot": "8.0",
780
  "Samples": 1319,
781
+ "All tokens": 1251210,
782
+ "Total input tokens": 1106682,
783
+ "Average input tokens": 839,
784
+ "Total output tokens": 144528,
785
+ "Average output tokens": 110
 
786
  },
787
  "AQuA": {
788
+ "Score": 75.19,
789
+ "Pass rate": 1.0,
790
+ "Cost($)": 0.1645,
791
+ "Framework": "",
792
+ "X-shot": "0.0",
793
  "Samples": 254,
794
+ "All tokens": 291764,
795
+ "Total input tokens": 249215,
796
+ "Average input tokens": 981,
797
+ "Total output tokens": 42549,
798
+ "Average output tokens": 168
799
+ }
800
+ },
801
+ "Llama-3.3-70B-Instruct": {
802
+ "META": {
803
+ "Algorithm": "PoT",
804
+ "LLM": "Llama-3.3-70B-Instruct",
805
+ "Eval Date": "2025/1/22"
806
+ },
807
+ "gsm8k": {
808
+ "Score": 73.08,
809
+ "Pass rate": 0.796,
810
+ "Cost($)": 0.9736,
811
+ "Framework": "",
812
+ "X-shot": "8.0",
813
+ "Samples": 1319,
814
+ "All tokens": 1727044,
815
+ "Total input tokens": 1126025,
816
+ "Average input tokens": 854,
817
+ "Total output tokens": 601019,
818
+ "Average output tokens": 456
819
+ },
820
+ "AQuA": {
821
+ "Score": 79.52,
822
+ "Pass rate": 0.9921,
823
+ "Cost($)": 0.1746,
824
+ "Framework": "",
825
+ "X-shot": "0.0",
826
+ "Samples": 254,
827
+ "All tokens": 309799,
828
+ "Total input tokens": 240735,
829
+ "Average input tokens": 948,
830
+ "Total output tokens": 69064,
831
+ "Average output tokens": 272
832
+ }
833
+ },
834
+ "Qwen2.5-7B-Instruct": {
835
+ "META": {
836
+ "Algorithm": "PoT",
837
+ "LLM": "Qwen2.5-7B-Instruct",
838
+ "Eval Date": "2025/1/22"
839
+ },
840
+ "gsm8k": {
841
+ "Score": 58.83,
842
+ "Pass rate": 0.705,
843
+ "Cost($)": 0.0,
844
+ "Framework": "",
845
+ "X-shot": "8.0",
846
+ "Samples": 1319,
847
+ "All tokens": 1362822,
848
+ "Total input tokens": 1145390,
849
+ "Average input tokens": 868,
850
+ "Total output tokens": 217432,
851
+ "Average output tokens": 165
852
+ },
853
+ "AQuA": {
854
+ "Score": 68.11,
855
+ "Pass rate": 1.0,
856
+ "Cost($)": 0.0,
857
+ "Framework": "",
858
+ "X-shot": "0.0",
859
+ "Samples": 254,
860
+ "All tokens": 313728,
861
+ "Total input tokens": 264517,
862
+ "Average input tokens": 1041,
863
+ "Total output tokens": 49211,
864
+ "Average output tokens": 194
865
+ }
866
+ },
867
+ "Llama-3.1-8B-Instruct": {
868
+ "META": {
869
+ "Algorithm": "PoT",
870
+ "LLM": "Llama-3.1-8B-Instruct",
871
+ "Eval Date": "2025/1/22"
872
+ },
873
+ "gsm8k": {
874
+ "Score": 38.66,
875
+ "Pass rate": 0.5542,
876
+ "Cost($)": 0.0,
877
+ "Framework": "",
878
+ "X-shot": "8.0",
879
+ "Samples": 1319,
880
+ "All tokens": 1391111,
881
+ "Total input tokens": 1147538,
882
+ "Average input tokens": 870,
883
+ "Total output tokens": 243573,
884
+ "Average output tokens": 185
885
+ },
886
+ "AQuA": {
887
+ "Score": 36.61,
888
+ "Pass rate": 0.9685,
889
+ "Cost($)": 0.0,
890
+ "Framework": "",
891
+ "X-shot": "0.0",
892
+ "Samples": 254,
893
+ "All tokens": 290914,
894
+ "Total input tokens": 240613,
895
+ "Average input tokens": 947,
896
+ "Total output tokens": 50301,
897
+ "Average output tokens": 198
898
+ }
899
+ },
900
+ "Internllm2_5-7B": {
901
+ "META": {
902
+ "Algorithm": "PoT",
903
+ "LLM": "Internllm2_5-7B",
904
+ "Eval Date": "2025/1/22"
905
+ },
906
+ "gsm8k": {
907
+ "Score": 38.21,
908
+ "Pass rate": 0.489,
909
+ "Cost($)": 0.0,
910
+ "Framework": "",
911
+ "X-shot": "8.0",
912
+ "Samples": 1319,
913
+ "All tokens": 1324949,
914
+ "Total input tokens": 1136843,
915
+ "Average input tokens": 862,
916
+ "Total output tokens": 188106,
917
+ "Average output tokens": 143
918
+ },
919
+ "AQuA": {
920
+ "Score": 36.61,
921
+ "Pass rate": 0.9881,
922
+ "Cost($)": 0.0,
923
+ "Framework": "",
924
+ "X-shot": "0.0",
925
+ "Samples": 254,
926
+ "All tokens": 301962,
927
+ "Total input tokens": 233505,
928
+ "Average input tokens": 919,
929
+ "Total output tokens": 68457,
930
+ "Average output tokens": 270
931
+ }
932
+ },
933
+ "Qwen2-1.5B-Instruct": {
934
+ "META": {
935
+ "Algorithm": "PoT",
936
+ "LLM": "Qwen2-1.5B-Instruct",
937
+ "Eval Date": "2025/1/22"
938
+ },
939
+ "gsm8k": {
940
+ "Score": 18.49,
941
+ "Pass rate": 0.31,
942
+ "Cost($)": 0.0,
943
+ "Framework": "",
944
+ "X-shot": "8.0",
945
+ "Samples": 1319,
946
+ "All tokens": 1327522,
947
+ "Total input tokens": 1151528,
948
+ "Average input tokens": 873,
949
+ "Total output tokens": 175994,
950
+ "Average output tokens": 133
951
+ },
952
+ "AQuA": {
953
+ "Score": 30.7,
954
+ "Pass rate": 0.9645,
955
+ "Cost($)": 0.0,
956
+ "Framework": "",
957
+ "X-shot": "0.0",
958
+ "Samples": 254,
959
+ "All tokens": 298475,
960
+ "Total input tokens": 246560,
961
+ "Average input tokens": 971,
962
+ "Total output tokens": 51915,
963
+ "Average output tokens": 204
964
+ }
965
+ },
966
+ "Qwen2-0.5B-Instruct": {
967
+ "META": {
968
+ "Algorithm": "PoT",
969
+ "LLM": "Qwen2-0.5B-Instruct",
970
+ "Eval Date": "2025/1/22"
971
+ },
972
+ "gsm8k": {
973
+ "Score": 9.62,
974
+ "Pass rate": 0.169,
975
+ "Cost($)": 0.0,
976
+ "Framework": "",
977
+ "X-shot": "8.0",
978
+ "Samples": 1319,
979
+ "All tokens": 1389135,
980
+ "Total input tokens": 1151528,
981
+ "Average input tokens": 873,
982
+ "Total output tokens": 237607,
983
+ "Average output tokens": 180
984
+ },
985
+ "AQuA": {
986
+ "Score": 17.32,
987
+ "Pass rate": 0.9212,
988
+ "Cost($)": 0.0,
989
+ "Framework": "",
990
+ "X-shot": "0.0",
991
+ "Samples": 254,
992
+ "All tokens": 322281,
993
+ "Total input tokens": 258867,
994
+ "Average input tokens": 1019,
995
+ "Total output tokens": 63414,
996
+ "Average output tokens": 250
997
+ }
998
+ }
999
+ },
1000
+ "CoT": {
1001
+ "gpt-3.5-turbo": {
1002
+ "META": {
1003
+ "Algorithm": "CoT",
1004
+ "LLM": "gpt-3.5-turbo",
1005
+ "Eval Date": "2025/1/7"
1006
+ },
1007
+ "gsm8k": {
1008
+ "Score": 78.69,
1009
+ "Pass rate": 1.0,
1010
+ "Cost($)": 0.6788,
1011
+ "Framework": "",
1012
+ "X-shot": "8.0",
1013
+ "Samples": 1319,
1014
+ "All tokens": 1088041,
1015
+ "Total input tokens": 953242,
1016
+ "Average input tokens": 723,
1017
+ "Total output tokens": 134799,
1018
+ "Average output tokens": 102
1019
+ },
1020
+ "AQuA": {
1021
+ "Score": 61.02,
1022
+ "Pass rate": 0.937,
1023
+ "Cost($)": 0.0957,
1024
+ "Framework": "",
1025
+ "X-shot": "0.0",
1026
+ "Samples": 254,
1027
+ "All tokens": 80793,
1028
+ "Total input tokens": 25447,
1029
+ "Average input tokens": 100,
1030
+ "Total output tokens": 55346,
1031
+ "Average output tokens": 218
1032
+ }
1033
+ },
1034
+ "Doubao-lite-32k": {
1035
+ "META": {
1036
+ "Algorithm": "CoT",
1037
+ "LLM": "Doubao-lite-32k",
1038
+ "Eval Date": "2025/1/7"
1039
+ },
1040
+ "gsm8k": {
1041
+ "Score": 89.31,
1042
+ "Pass rate": 1.0,
1043
+ "Cost($)": 0.0558,
1044
+ "Framework": "",
1045
+ "X-shot": "8.0",
1046
+ "Samples": 1319,
1047
+ "All tokens": 1201820,
1048
+ "Total input tokens": 1042095,
1049
+ "Average input tokens": 790,
1050
+ "Total output tokens": 159725,
1051
+ "Average output tokens": 121
1052
+ },
1053
+ "AQuA": {
1054
+ "Score": 82.67,
1055
+ "Pass rate": 0.9724,
1056
+ "Cost($)": 0.0066,
1057
+ "Framework": "",
1058
+ "X-shot": "0.0",
1059
+ "Samples": 254,
1060
+ "All tokens": 94577,
1061
+ "Total input tokens": 27978,
1062
+ "Average input tokens": 110,
1063
+ "Total output tokens": 66599,
1064
+ "Average output tokens": 262
1065
+ }
1066
+ },
1067
+ "gpt-4o": {
1068
+ "META": {
1069
+ "Algorithm": "CoT",
1070
+ "LLM": "gpt-4o",
1071
+ "Eval Date": "2025/1/22"
1072
+ },
1073
+ "gsm8k": {
1074
+ "Score": 94.08,
1075
+ "Pass rate": 1.0,
1076
+ "Cost($)": 4.5367,
1077
+ "Framework": "",
1078
+ "X-shot": "8.0",
1079
+ "Samples": 1319,
1080
+ "All tokens": 1165166,
1081
+ "Total input tokens": 948668,
1082
+ "Average input tokens": 719,
1083
+ "Total output tokens": 216498,
1084
+ "Average output tokens": 164
1085
+ },
1086
+ "AQuA": {
1087
+ "Score": 82.67,
1088
+ "Pass rate": 0.9803,
1089
+ "Cost($)": 1.0417,
1090
+ "Framework": "",
1091
+ "X-shot": "0.0",
1092
+ "Samples": 254,
1093
+ "All tokens": 123017,
1094
+ "Total input tokens": 25123,
1095
+ "Average input tokens": 99,
1096
+ "Total output tokens": 97894,
1097
+ "Average output tokens": 385
1098
+ }
1099
+ },
1100
+ "Qwen2.5-72B-Instruct": {
1101
+ "META": {
1102
+ "Algorithm": "CoT",
1103
+ "LLM": "Qwen2.5-72B-Instruct",
1104
+ "Eval Date": "2025/1/22"
1105
+ },
1106
+ "gsm8k": {
1107
+ "Score": 92.87,
1108
+ "Pass rate": 1.0,
1109
+ "Cost($)": 0.7195,
1110
+ "Framework": "",
1111
+ "X-shot": "8.0",
1112
+ "Samples": 1319,
1113
+ "All tokens": 1276252,
1114
+ "Total input tokens": 1005119,
1115
+ "Average input tokens": 762,
1116
+ "Total output tokens": 271133,
1117
+ "Average output tokens": 206
1118
+ },
1119
+ "AQuA": {
1120
+ "Score": 86.22,
1121
+ "Pass rate": 0.9921,
1122
+ "Cost($)": 0.0808,
1123
+ "Framework": "",
1124
+ "X-shot": "0.0",
1125
+ "Samples": 254,
1126
+ "All tokens": 143289,
1127
+ "Total input tokens": 25143,
1128
+ "Average input tokens": 99,
1129
+ "Total output tokens": 118146,
1130
+ "Average output tokens": 465
1131
+ }
1132
+ },
1133
+ "Llama-3.3-70B-Instruct": {
1134
+ "META": {
1135
+ "Algorithm": "CoT",
1136
+ "LLM": "Llama-3.3-70B-Instruct",
1137
+ "Eval Date": "2025/1/22"
1138
+ },
1139
+ "gsm8k": {
1140
+ "Score": 93.93,
1141
+ "Pass rate": 1.0,
1142
+ "Cost($)": 0.687,
1143
+ "Framework": "",
1144
+ "X-shot": "8.0",
1145
+ "Samples": 1319,
1146
+ "All tokens": 1218665,
1147
+ "Total input tokens": 990168,
1148
+ "Average input tokens": 751,
1149
+ "Total output tokens": 228497,
1150
+ "Average output tokens": 173
1151
+ },
1152
+ "AQuA": {
1153
+ "Score": 83.46,
1154
+ "Pass rate": 0.9842,
1155
+ "Cost($)": 0.0927,
1156
+ "Framework": "",
1157
+ "X-shot": "0.0",
1158
+ "Samples": 254,
1159
+ "All tokens": 164389,
1160
+ "Total input tokens": 32555,
1161
+ "Average input tokens": 128,
1162
+ "Total output tokens": 131834,
1163
+ "Average output tokens": 519
1164
+ }
1165
+ },
1166
+ "Qwen2.5-7B-Instruct": {
1167
+ "META": {
1168
+ "Algorithm": "CoT",
1169
+ "LLM": "Qwen2.5-7B-Instruct",
1170
+ "Eval Date": "2025/1/22"
1171
+ },
1172
+ "gsm8k": {
1173
+ "Score": 85.67,
1174
+ "Pass rate": 1.0,
1175
+ "Cost($)": 0.0,
1176
+ "Framework": "",
1177
+ "X-shot": "8.0",
1178
+ "Samples": 1319,
1179
+ "All tokens": 1290805,
1180
+ "Total input tokens": 1046008,
1181
+ "Average input tokens": 793,
1182
+ "Total output tokens": 244797,
1183
+ "Average output tokens": 186
1184
+ },
1185
+ "AQuA": {
1186
+ "Score": 80.7,
1187
+ "Pass rate": 0.996,
1188
+ "Cost($)": 0.0,
1189
+ "Framework": "",
1190
+ "X-shot": "0.0",
1191
+ "Samples": 254,
1192
+ "All tokens": 149736,
1193
+ "Total input tokens": 33017,
1194
+ "Average input tokens": 130,
1195
+ "Total output tokens": 116719,
1196
+ "Average output tokens": 460
1197
+ }
1198
+ },
1199
+ "Llama-3.1-8B-Instruct": {
1200
+ "META": {
1201
+ "Algorithm": "CoT",
1202
+ "LLM": "Llama-3.1-8B-Instruct",
1203
+ "Eval Date": "2025/1/22"
1204
+ },
1205
+ "gsm8k": {
1206
+ "Score": 75.43,
1207
+ "Pass rate": 0.9992,
1208
+ "Cost($)": 0.0,
1209
+ "Framework": "",
1210
+ "X-shot": "8.0",
1211
+ "Samples": 1319,
1212
+ "All tokens": 1248329,
1213
+ "Total input tokens": 990168,
1214
+ "Average input tokens": 751,
1215
+ "Total output tokens": 258161,
1216
+ "Average output tokens": 196
1217
+ },
1218
+ "AQuA": {
1219
+ "Score": 60.62,
1220
+ "Pass rate": 1.0,
1221
+ "Cost($)": 0.0,
1222
+ "Framework": "",
1223
+ "X-shot": "0.0",
1224
+ "Samples": 254,
1225
+ "All tokens": 144435,
1226
+ "Total input tokens": 32555,
1227
+ "Average input tokens": 128,
1228
+ "Total output tokens": 111880,
1229
+ "Average output tokens": 440
1230
+ }
1231
+ },
1232
+ "Internllm2_5-7B": {
1233
+ "META": {
1234
+ "Algorithm": "CoT",
1235
+ "LLM": "Internllm2_5-7B",
1236
+ "Eval Date": "2025/1/22"
1237
+ },
1238
+ "gsm8k": {
1239
+ "Score": 77.71,
1240
+ "Pass rate": 0.9969,
1241
+ "Cost($)": 0.0,
1242
+ "Framework": "",
1243
+ "X-shot": "8.0",
1244
+ "Samples": 1319,
1245
+ "All tokens": 1202163,
1246
+ "Total input tokens": 968163,
1247
+ "Average input tokens": 734,
1248
+ "Total output tokens": 234000,
1249
+ "Average output tokens": 177
1250
+ },
1251
+ "AQuA": {
1252
+ "Score": 52.75,
1253
+ "Pass rate": 0.8937,
1254
+ "Cost($)": 0.0,
1255
+ "Framework": "",
1256
+ "X-shot": "0.0",
1257
+ "Samples": 254,
1258
+ "All tokens": 127520,
1259
+ "Total input tokens": 26610,
1260
+ "Average input tokens": 105,
1261
+ "Total output tokens": 100910,
1262
+ "Average output tokens": 397
1263
+ }
1264
+ },
1265
+ "Qwen2-1.5B-Instruct": {
1266
+ "META": {
1267
+ "Algorithm": "CoT",
1268
+ "LLM": "Qwen2-1.5B-Instruct",
1269
+ "Eval Date": "2025/1/22"
1270
+ },
1271
+ "gsm8k": {
1272
+ "Score": 55.49,
1273
+ "Pass rate": 1.0,
1274
+ "Cost($)": 0.0,
1275
+ "Framework": "",
1276
+ "X-shot": "8.0",
1277
+ "Samples": 1319,
1278
+ "All tokens": 1218525,
1279
+ "Total input tokens": 1032818,
1280
+ "Average input tokens": 783,
1281
+ "Total output tokens": 185707,
1282
+ "Average output tokens": 141
1283
+ },
1284
+ "AQuA": {
1285
+ "Score": 40.55,
1286
+ "Pass rate": 0.9881,
1287
+ "Cost($)": 0.0,
1288
+ "Framework": "",
1289
+ "X-shot": "0.0",
1290
+ "Samples": 254,
1291
+ "All tokens": 110040,
1292
+ "Total input tokens": 30477,
1293
+ "Average input tokens": 120,
1294
+ "Total output tokens": 79563,
1295
+ "Average output tokens": 313
1296
+ }
1297
+ },
1298
+ "Qwen2-0.5B-Instruct": {
1299
+ "META": {
1300
+ "Algorithm": "CoT",
1301
+ "LLM": "Qwen2-0.5B-Instruct",
1302
+ "Eval Date": "2025/1/22"
1303
+ },
1304
+ "gsm8k": {
1305
+ "Score": 35.93,
1306
+ "Pass rate": 0.9992,
1307
+ "Cost($)": 0.0,
1308
+ "Framework": "",
1309
+ "X-shot": "8.0",
1310
+ "Samples": 1319,
1311
+ "All tokens": 1223459,
1312
+ "Total input tokens": 1032818,
1313
+ "Average input tokens": 783,
1314
+ "Total output tokens": 190641,
1315
+ "Average output tokens": 145
1316
+ },
1317
+ "AQuA": {
1318
+ "Score": 33.07,
1319
+ "Pass rate": 0.9881,
1320
+ "Cost($)": 0.0,
1321
+ "Framework": "",
1322
+ "X-shot": "0.0",
1323
+ "Samples": 254,
1324
+ "All tokens": 117339,
1325
+ "Total input tokens": 30477,
1326
+ "Average input tokens": 120,
1327
+ "Total output tokens": 86862,
1328
+ "Average output tokens": 342
1329
+ }
1330
+ }
1331
+ },
1332
+ "SC-CoT": {
1333
+ "gpt-3.5-turbo": {
1334
+ "META": {
1335
+ "Algorithm": "SC-CoT",
1336
+ "LLM": "gpt-3.5-turbo",
1337
+ "Eval Date": "2025/1/7"
1338
+ },
1339
+ "gsm8k": {
1340
+ "Score": 82.56,
1341
+ "Pass rate": 0.9985,
1342
+ "Cost($)": 2.6285,
1343
+ "Framework": "",
1344
+ "X-shot": "8.0",
1345
+ "Samples": 1319,
1346
+ "All tokens": 2560697,
1347
+ "Total input tokens": 1212520,
1348
+ "Average input tokens": 919,
1349
+ "Total output tokens": 1348177,
1350
+ "Average output tokens": 1022
1351
+ },
1352
+ "AQuA": {
1353
+ "Score": 70.47,
1354
+ "Pass rate": 0.9882,
1355
+ "Cost($)": 0.5578,
1356
+ "Framework": "",
1357
+ "X-shot": "0.0",
1358
+ "Samples": 254,
1359
+ "All tokens": 418617,
1360
+ "Total input tokens": 70157,
1361
+ "Average input tokens": 276,
1362
+ "Total output tokens": 348460,
1363
+ "Average output tokens": 1372
1364
+ }
1365
+ },
1366
+ "Doubao-lite-32k": {
1367
+ "META": {
1368
+ "Algorithm": "SC-CoT",
1369
+ "LLM": "Doubao-lite-32k",
1370
+ "Eval Date": "2025/1/7"
1371
+ },
1372
+ "gsm8k": {
1373
+ "Score": 83.7,
1374
+ "Pass rate": 0.997,
1375
+ "Cost($)": 0.155,
1376
+ "Framework": "",
1377
+ "X-shot": "8.0",
1378
+ "Samples": 1319,
1379
+ "All tokens": 2507687,
1380
+ "Total input tokens": 1230019,
1381
+ "Average input tokens": 933,
1382
+ "Total output tokens": 1277668,
1383
+ "Average output tokens": 969
1384
+ },
1385
+ "AQuA": {
1386
+ "Score": 81.5,
1387
+ "Pass rate": 0.9764,
1388
+ "Cost($)": 0.0347,
1389
+ "Framework": "",
1390
+ "X-shot": "0.0",
1391
+ "Samples": 254,
1392
+ "All tokens": 465846,
1393
+ "Total input tokens": 83830,
1394
+ "Average input tokens": 330,
1395
+ "Total output tokens": 382016,
1396
+ "Average output tokens": 1504
1397
+ }
1398
+ },
1399
+ "gpt-4o": {
1400
+ "META": {
1401
+ "Algorithm": "SC-CoT",
1402
+ "LLM": "gpt-4o",
1403
+ "Eval Date": "2025/1/22"
1404
+ },
1405
+ "gsm8k": {
1406
+ "Score": 90.75,
1407
+ "Pass rate": 1.0,
1408
+ "Cost($)": 24.2428,
1409
+ "Framework": "",
1410
+ "X-shot": "8.0",
1411
+ "Samples": 1319,
1412
+ "All tokens": 3300971,
1413
+ "Total input tokens": 1168927,
1414
+ "Average input tokens": 886,
1415
+ "Total output tokens": 2132044,
1416
+ "Average output tokens": 1616
1417
+ },
1418
+ "AQuA": {
1419
+ "Score": 88.19,
1420
+ "Pass rate": 1.0,
1421
+ "Cost($)": 6.2412,
1422
+ "Framework": "",
1423
+ "X-shot": "0.0",
1424
+ "Samples": 254,
1425
+ "All tokens": 678811,
1426
+ "Total input tokens": 72916,
1427
+ "Average input tokens": 287,
1428
+ "Total output tokens": 605895,
1429
+ "Average output tokens": 2385
1430
+ }
1431
+ },
1432
+ "Qwen2.5-72B-Instruct": {
1433
+ "META": {
1434
+ "Algorithm": "SC-CoT",
1435
+ "LLM": "Qwen2.5-72B-Instruct",
1436
+ "Eval Date": "2025/1/22"
1437
+ },
1438
+ "gsm8k": {
1439
+ "Score": 90.67,
1440
+ "Pass rate": 1.0,
1441
+ "Cost($)": 4.2651,
1442
+ "Framework": "",
1443
+ "X-shot": "8.0",
1444
+ "Samples": 1319,
1445
+ "All tokens": 7565637,
1446
+ "Total input tokens": 5292383,
1447
+ "Average input tokens": 4012,
1448
+ "Total output tokens": 2273254,
1449
+ "Average output tokens": 1723
1450
+ },
1451
+ "AQuA": {
1452
+ "Score": 85.82,
1453
+ "Pass rate": 0.9842,
1454
+ "Cost($)": 0.5576,
1455
+ "Framework": "",
1456
+ "X-shot": "0.0",
1457
+ "Samples": 254,
1458
+ "All tokens": 989058,
1459
+ "Total input tokens": 241149,
1460
+ "Average input tokens": 949,
1461
+ "Total output tokens": 747909,
1462
+ "Average output tokens": 2945
1463
+ }
1464
+ },
1465
+ "Llama-3.3-70B-Instruct": {
1466
+ "META": {
1467
+ "Algorithm": "SC-CoT",
1468
+ "LLM": "Llama-3.3-70B-Instruct",
1469
+ "Eval Date": "2025/1/22"
1470
+ },
1471
+ "gsm8k": {
1472
+ "Score": 95.45,
1473
+ "Pass rate": 1.0,
1474
+ "Cost($)": 4.5021,
1475
+ "Framework": "",
1476
+ "X-shot": "8.0",
1477
+ "Samples": 1319,
1478
+ "All tokens": 7985996,
1479
+ "Total input tokens": 5406763,
1480
+ "Average input tokens": 4099,
1481
+ "Total output tokens": 2579233,
1482
+ "Average output tokens": 1955
1483
+ },
1484
+ "AQuA": {
1485
+ "Score": 86.61,
1486
+ "Pass rate": 0.9921,
1487
+ "Cost($)": 0.5847,
1488
+ "Framework": "",
1489
+ "X-shot": "0.0",
1490
+ "Samples": 254,
1491
+ "All tokens": 1037124,
1492
+ "Total input tokens": 283248,
1493
+ "Average input tokens": 1115,
1494
+ "Total output tokens": 753876,
1495
+ "Average output tokens": 2968
1496
+ }
1497
+ },
1498
+ "Qwen2.5-7B-Instruct": {
1499
+ "META": {
1500
+ "Algorithm": "SC-CoT",
1501
+ "LLM": "Qwen2.5-7B-Instruct",
1502
+ "Eval Date": "2025/1/22"
1503
+ },
1504
+ "gsm8k": {
1505
+ "Score": 88.32,
1506
+ "Pass rate": 0.9984,
1507
+ "Cost($)": 0.0,
1508
+ "Framework": "",
1509
+ "X-shot": "8.0",
1510
+ "Samples": 1319,
1511
+ "All tokens": 8173818,
1512
+ "Total input tokens": 5668252,
1513
+ "Average input tokens": 4297,
1514
+ "Total output tokens": 2505566,
1515
+ "Average output tokens": 1900
1516
+ },
1517
+ "AQuA": {
1518
+ "Score": 81.49,
1519
+ "Pass rate": 1.0,
1520
+ "Cost($)": 0.0,
1521
+ "Framework": "",
1522
+ "X-shot": "0.0",
1523
+ "Samples": 254,
1524
+ "All tokens": 1015368,
1525
+ "Total input tokens": 278848,
1526
+ "Average input tokens": 1098,
1527
+ "Total output tokens": 736520,
1528
+ "Average output tokens": 2900
1529
+ }
1530
+ },
1531
+ "Llama-3.1-8B-Instruct": {
1532
+ "META": {
1533
+ "Algorithm": "SC-CoT",
1534
+ "LLM": "Llama-3.1-8B-Instruct",
1535
+ "Eval Date": "2025/1/22"
1536
+ },
1537
+ "gsm8k": {
1538
+ "Score": 75.2,
1539
+ "Pass rate": 0.9954,
1540
+ "Cost($)": 0.0,
1541
+ "Framework": "",
1542
+ "X-shot": "8.0",
1543
+ "Samples": 1319,
1544
+ "All tokens": 8444203,
1545
+ "Total input tokens": 5334657,
1546
+ "Average input tokens": 4044,
1547
+ "Total output tokens": 3109546,
1548
+ "Average output tokens": 2358
1549
+ },
1550
+ "AQuA": {
1551
+ "Score": 53.14,
1552
+ "Pass rate": 0.9606,
1553
+ "Cost($)": 0.0,
1554
+ "Framework": "",
1555
+ "X-shot": "0.0",
1556
+ "Samples": 254,
1557
+ "All tokens": 1041346,
1558
+ "Total input tokens": 372968,
1559
+ "Average input tokens": 1468,
1560
+ "Total output tokens": 668378,
1561
+ "Average output tokens": 2631
1562
+ }
1563
+ },
1564
+ "Internllm2_5-7B": {
1565
+ "META": {
1566
+ "Algorithm": "SC-CoT",
1567
+ "LLM": "Internllm2_5-7B",
1568
+ "Eval Date": "2025/1/22"
1569
+ },
1570
+ "gsm8k": {
1571
+ "Score": 41.39,
1572
+ "Pass rate": 0.9825,
1573
+ "Cost($)": 0.0,
1574
+ "Framework": "",
1575
+ "X-shot": "8.0",
1576
+ "Samples": 1319,
1577
+ "All tokens": 10024857,
1578
+ "Total input tokens": 6674518,
1579
+ "Average input tokens": 5060,
1580
+ "Total output tokens": 3350339,
1581
+ "Average output tokens": 2540
1582
+ },
1583
+ "AQuA": {
1584
+ "Score": 35.85,
1585
+ "Pass rate": 0.988,
1586
+ "Cost($)": 0.0,
1587
+ "Framework": "",
1588
+ "X-shot": "0.0",
1589
+ "Samples": 254,
1590
+ "All tokens": 1240388,
1591
+ "Total input tokens": 530701,
1592
+ "Average input tokens": 2089,
1593
+ "Total output tokens": 709687,
1594
+ "Average output tokens": 2794
1595
+ }
1596
+ },
1597
+ "Qwen2-1.5B-Instruct": {
1598
+ "META": {
1599
+ "Algorithm": "SC-CoT",
1600
+ "LLM": "Qwen2-1.5B-Instruct",
1601
+ "Eval Date": "2025/1/22"
1602
+ },
1603
+ "gsm8k": {
1604
+ "Score": 5.53,
1605
+ "Pass rate": 0.8673,
1606
+ "Cost($)": 0.0,
1607
+ "Framework": "",
1608
+ "X-shot": "8.0",
1609
+ "Samples": 1319,
1610
+ "All tokens": 8961768,
1611
+ "Total input tokens": 5844218,
1612
+ "Average input tokens": 4431,
1613
+ "Total output tokens": 3117550,
1614
+ "Average output tokens": 2364
1615
+ },
1616
+ "AQuA": {
1617
+ "Score": 30.31,
1618
+ "Pass rate": 0.9724,
1619
+ "Cost($)": 0.0,
1620
+ "Framework": "",
1621
+ "X-shot": "0.0",
1622
+ "Samples": 254,
1623
+ "All tokens": 1157076,
1624
+ "Total input tokens": 430703,
1625
+ "Average input tokens": 1696,
1626
+ "Total output tokens": 726373,
1627
+ "Average output tokens": 2860
1628
+ }
1629
+ },
1630
+ "Qwen2-0.5B-Instruct": {
1631
+ "META": {
1632
+ "Algorithm": "SC-CoT",
1633
+ "LLM": "Qwen2-0.5B-Instruct",
1634
+ "Eval Date": "2025/1/22"
1635
+ },
1636
+ "gsm8k": {
1637
+ "Score": 3.79,
1638
+ "Pass rate": 0.9484,
1639
+ "Cost($)": 0.0,
1640
+ "Framework": "",
1641
+ "X-shot": "8.0",
1642
+ "Samples": 1319,
1643
+ "All tokens": 10533815,
1644
+ "Total input tokens": 6529832,
1645
+ "Average input tokens": 4951,
1646
+ "Total output tokens": 4003983,
1647
+ "Average output tokens": 3036
1648
+ },
1649
+ "AQuA": {
1650
+ "Score": 30.7,
1651
+ "Pass rate": 0.9842,
1652
+ "Cost($)": 0.0,
1653
+ "Framework": "",
1654
+ "X-shot": "0.0",
1655
+ "Samples": 254,
1656
+ "All tokens": 1225539,
1657
+ "Total input tokens": 496206,
1658
+ "Average input tokens": 1954,
1659
+ "Total output tokens": 729333,
1660
+ "Average output tokens": 2871
1661
  }
1662
  }
1663
  }
1664
  }
1665
+ }
src/detail_results.csv ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Rank,Algorithm,Dataset,LLM,Eval Date,Score,Pass rate,X-shot,Cost($),Framework,Samples,All tokens,Total input tokens,Average input tokens,Total output tokens,Average output tokens
2
+ 1,SC-CoT,AQuA,gpt-4o,2025/1/22,88.19,1.0,0.0,6.2412,,254,678811,72916,287,605895,2385
3
+ 2,SC-CoT,AQuA,Llama-3.3-70B-Instruct,2025/1/22,86.61,0.9921,0.0,0.5847,,254,1037124,283248,1115,753876,2968
4
+ 3,CoT,AQuA,Qwen2.5-72B-Instruct,2025/1/22,86.22,0.9921,0.0,0.0808,,254,143289,25143,99,118146,465
5
+ 4,SC-CoT,AQuA,Qwen2.5-72B-Instruct,2025/1/22,85.82,0.9842,0.0,0.5576,,254,989058,241149,949,747909,2945
6
+ 5,IO,AQuA,Qwen2.5-72B-Instruct,2025/1/22,84.25,0.996,0.0,0.0742,,254,131604,25397,100,106207,418
7
+ 6,CoT,AQuA,Llama-3.3-70B-Instruct,2025/1/22,83.46,0.9842,0.0,0.0927,,254,164389,32555,128,131834,519
8
+ 7,IO,AQuA,Llama-3.3-70B-Instruct,2025/1/22,82.67,0.9921,0.0,0.0798,,254,141567,32809,129,108758,428
9
+ 8,CoT,AQuA,Doubao-lite-32k,2025/1/7,82.67,0.9724,0.0,0.0066,,254,94577,27978,110,66599,262
10
+ 9,CoT,AQuA,gpt-4o,2025/1/22,82.67,0.9803,0.0,1.0417,,254,123017,25123,99,97894,385
11
+ 10,SC-CoT,AQuA,Doubao-lite-32k,2025/1/7,81.5,0.9764,0.0,0.0347,,254,465846,83830,330,382016,1504
12
+ 11,SC-CoT,AQuA,Qwen2.5-7B-Instruct,2025/1/22,81.49,1.0,0.0,0.0,,254,1015368,278848,1098,736520,2900
13
+ 12,CoT,AQuA,Qwen2.5-7B-Instruct,2025/1/22,80.7,0.996,0.0,0.0,,254,149736,33017,130,116719,460
14
+ 13,PoT,AQuA,Llama-3.3-70B-Instruct,2025/1/22,79.52,0.9921,0.0,0.1746,,254,309799,240735,948,69064,272
15
+ 14,IO,AQuA,Doubao-lite-32k,2025/1/7,79.13,1.0,0.0,0.0058,,254,87742,33058,130,54684,215
16
+ 15,ReAct-Pro*,AQuA,Llama-3.3-70B-Instruct,2025/1/22,79.13,0.996,0.0,0.768,,254,1362379,1119143,4406,243236,958
17
+ 16,IO,AQuA,Qwen2.5-7B-Instruct,2025/1/22,78.74,0.9842,0.0,0.0,,254,137771,33271,131,104500,411
18
+ 17,ReAct-Pro*,AQuA,Doubao-lite-32k,2025/1/7,77.55,0.9606,0.0,0.0445,,254,1032841,977890,3850,54951,216
19
+ 18,IO,AQuA,gpt-4o,2025/1/22,75.59,0.9724,0.0,1.1453,,254,133752,25631,101,108121,426
20
+ 19,PoT,AQuA,gpt-4o,2025/1/22,75.19,1.0,0.0,1.6087,,254,327908,222717,877,105191,414
21
+ 20,PoT,AQuA,Qwen2.5-72B-Instruct,2025/1/22,75.19,1.0,0.0,0.1645,,254,291764,249215,981,42549,168
22
+ 21,ReAct-Pro*,AQuA,Qwen2.5-7B-Instruct,2025/1/22,74.4,0.9921,0.0,0.0,,254,695844,564165,2221,131679,518
23
+ 22,ReAct-Pro*,AQuA,Qwen2.5-72B-Instruct,2025/1/22,73.22,1.0,0.0,0.3177,,254,563603,441765,1739,121838,480
24
+ 23,PoT,AQuA,Doubao-lite-32k,2025/1/7,71.65,0.9685,0.0,0.0147,,254,309436,259863,1023,49573,195
25
+ 24,SC-CoT,AQuA,gpt-3.5-turbo,2025/1/7,70.47,0.9882,0.0,0.5578,,254,418617,70157,276,348460,1372
26
+ 25,PoT,AQuA,Qwen2.5-7B-Instruct,2025/1/22,68.11,1.0,0.0,0.0,,254,313728,264517,1041,49211,194
27
+ 26,ReAct-Pro*,AQuA,gpt-3.5-turbo,2025/1/7,64.56,0.9803,0.0,0.4928,,254,903587,862614,3396,40973,161
28
+ 27,CoT,AQuA,gpt-3.5-turbo,2025/1/7,61.02,0.937,0.0,0.0957,,254,80793,25447,100,55346,218
29
+ 28,CoT,AQuA,Llama-3.1-8B-Instruct,2025/1/22,60.62,1.0,0.0,0.0,,254,144435,32555,128,111880,440
30
+ 29,PoT,AQuA,gpt-3.5-turbo,2025/1/7,59.44,1.0,0.0,0.1748,,254,266654,225162,886,41492,163
31
+ 30,ReAct-Pro*,AQuA,gpt-4o,2025/1/22,57.48,0.9724,0.0,2.304,,254,692096,615589,2424,76507,301
32
+ 31,ReAct-Pro*,AQuA,Llama-3.1-8B-Instruct,2025/1/22,55.51,0.9685,0.0,0.0,,254,4340821,3764723,14822,576098,2268
33
+ 32,SC-CoT,AQuA,Llama-3.1-8B-Instruct,2025/1/22,53.14,0.9606,0.0,0.0,,254,1041346,372968,1468,668378,2631
34
+ 33,CoT,AQuA,Internllm2_5-7B,2025/1/22,52.75,0.8937,0.0,0.0,,254,127520,26610,105,100910,397
35
+ 34,IO,AQuA,Llama-3.1-8B-Instruct,2025/1/22,51.18,0.9881,0.0,0.0,,254,133106,26459,104,106647,420
36
+ 35,IO,AQuA,Internllm2_5-7B,2025/1/22,47.63,0.9094,0.0,0.0,,254,185041,50232,198,134809,531
37
+ 36,ReAct-Pro*,AQuA,Internllm2_5-7B,2025/1/22,40.94,0.9685,0.0,0.0,,254,4428801,3592039,14142,836762,3294
38
+ 37,CoT,AQuA,Qwen2-1.5B-Instruct,2025/1/22,40.55,0.9881,0.0,0.0,,254,110040,30477,120,79563,313
39
+ 38,IO,AQuA,gpt-3.5-turbo,2025/1/7,38.97,1.0,0.0,0.038,,254,42471,25701,101,16770,66
40
+ 39,PoT,AQuA,Llama-3.1-8B-Instruct,2025/1/22,36.61,0.9685,0.0,0.0,,254,290914,240613,947,50301,198
41
+ 40,PoT,AQuA,Internllm2_5-7B,2025/1/22,36.61,0.9881,0.0,0.0,,254,301962,233505,919,68457,270
42
+ 41,SC-CoT,AQuA,Internllm2_5-7B,2025/1/22,35.85,0.988,0.0,0.0,,254,1240388,530701,2089,709687,2794
43
+ 42,CoT,AQuA,Qwen2-0.5B-Instruct,2025/1/22,33.07,0.9881,0.0,0.0,,254,117339,30477,120,86862,342
44
+ 43,PoT,AQuA,Qwen2-1.5B-Instruct,2025/1/22,30.7,0.9645,0.0,0.0,,254,298475,246560,971,51915,204
45
+ 44,SC-CoT,AQuA,Qwen2-0.5B-Instruct,2025/1/22,30.7,0.9842,0.0,0.0,,254,1225539,496206,1954,729333,2871
46
+ 45,SC-CoT,AQuA,Qwen2-1.5B-Instruct,2025/1/22,30.31,0.9724,0.0,0.0,,254,1157076,430703,1696,726373,2860
47
+ 46,IO,AQuA,Qwen2-1.5B-Instruct,2025/1/22,29.13,0.9763,0.0,0.0,,254,71047,27937,110,43110,170
48
+ 47,IO,AQuA,Qwen2-0.5B-Instruct,2025/1/22,27.16,0.9881,0.0,0.0,,254,110415,27937,110,82478,325
49
+ 48,ReAct-Pro*,AQuA,Qwen2-1.5B-Instruct,2025/1/22,25.59,0.9606,0.0,0.0,,254,5072004,4555858,17936,516146,2032
50
+ 49,ReAct-Pro*,AQuA,Qwen2-0.5B-Instruct,2025/1/22,24.01,0.9685,0.0,0.0,,254,7170087,6344167,24977,825920,3252
51
+ 50,PoT,AQuA,Qwen2-0.5B-Instruct,2025/1/22,17.32,0.9212,0.0,0.0,,254,322281,258867,1019,63414,250
52
+ 1,SC-CoT,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,95.45,1.0,8.0,4.5021,,1319,7985996,5406763,4099,2579233,1955
53
+ 2,CoT,gsm8k,gpt-4o,2025/1/22,94.08,1.0,8.0,4.5367,,1319,1165166,948668,719,216498,164
54
+ 3,CoT,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,93.93,1.0,8.0,0.687,,1319,1218665,990168,751,228497,173
55
+ 4,PoT,gsm8k,gpt-4o,2025/1/22,93.1,0.9977,8.0,4.2166,,1319,1247912,1101672,835,146240,111
56
+ 5,CoT,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,92.87,1.0,8.0,0.7195,,1319,1276252,1005119,762,271133,206
57
+ 6,PoT,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,92.34,0.9939,8.0,0.7054,,1319,1251210,1106682,839,144528,110
58
+ 7,IO,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,92.26,1.0,8.0,0.4709,,1319,835275,583916,443,251359,191
59
+ 8,SC-CoT,gsm8k,gpt-4o,2025/1/22,90.75,1.0,8.0,24.2428,,1319,3300971,1168927,886,2132044,1616
60
+ 9,SC-CoT,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,90.67,1.0,8.0,4.2651,,1319,7565637,5292383,4012,2273254,1723
61
+ 10,CoT,gsm8k,Doubao-lite-32k,2025/1/7,89.31,1.0,8.0,0.0558,,1319,1201820,1042095,790,159725,121
62
+ 11,IO,gsm8k,gpt-4o,2025/1/22,88.4,1.0,8.0,3.3463,,1319,741446,542416,411,199030,151
63
+ 12,SC-CoT,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,88.32,0.9984,8.0,0.0,,1319,8173818,5668252,4297,2505566,1900
64
+ 13,ReAct-Pro*,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,87.64,0.9992,,10.1124,,1319,17937864,17038928,12918,898936,682
65
+ 14,ReAct-Pro*,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,87.26,1.0,8.0,10.5479,,1319,18710437,18160983,13769,549454,417
66
+ 15,IO,gsm8k,Qwen2.5-72B-Instruct,2025/1/22,86.58,1.0,8.0,0.4899,,1319,869060,555340,421,313720,238
67
+ 16,CoT,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,85.67,1.0,8.0,0.0,,1319,1290805,1046008,793,244797,186
68
+ 17,ReAct-Pro*,gsm8k,Doubao-lite-32k,2025/1/7,85.59,0.9962,8.0,0.2512,,1319,5998639,5862016,4444,136623,104
69
+ 18,SC-CoT,gsm8k,Doubao-lite-32k,2025/1/7,83.7,0.997,8.0,0.155,,1319,2507687,1230019,933,1277668,969
70
+ 19,ReAct-Pro*,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,82.86,1.0,8.0,0.0,,1319,14850914,14355752,10884,495162,375
71
+ 20,SC-CoT,gsm8k,gpt-3.5-turbo,2025/1/7,82.56,0.9985,8.0,2.6285,,1319,2560697,1212520,919,1348177,1022
72
+ 21,PoT,gsm8k,Doubao-lite-32k,2025/1/7,79.6,0.9257,8.0,0.0576,,1319,1288055,1170038,887,118017,89
73
+ 22,CoT,gsm8k,gpt-3.5-turbo,2025/1/7,78.69,1.0,8.0,0.6788,,1319,1088041,953242,723,134799,102
74
+ 23,CoT,gsm8k,Internllm2_5-7B,2025/1/22,77.71,0.9969,8.0,0.0,,1319,1202163,968163,734,234000,177
75
+ 24,PoT,gsm8k,gpt-3.5-turbo,2025/1/7,76.87,0.9924,8.0,0.6902,,1319,1187080,1090418,827,96662,73
76
+ 25,CoT,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,75.43,0.9992,8.0,0.0,,1319,1248329,990168,751,258161,196
77
+ 26,SC-CoT,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,75.2,0.9954,8.0,0.0,,1319,8444203,5334657,4044,3109546,2358
78
+ 27,ReAct-Pro*,gsm8k,gpt-3.5-turbo,2025/1/7,74.9,0.9939,8.0,3.4633,,1319,6646286,6506164,4933,140122,106
79
+ 28,PoT,gsm8k,Llama-3.3-70B-Instruct,2025/1/22,73.08,0.796,8.0,0.9736,,1319,1727044,1126025,854,601019,456
80
+ 29,IO,gsm8k,Doubao-lite-32k,2025/1/7,72.02,0.9992,8.0,0.0354,,1319,740483,617377,468,123106,93
81
+ 30,ReAct-Pro*,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,67.77,0.9855,8.0,0.0,,1319,22835767,21044978,15955,1790789,1358
82
+ 31,ReAct-Pro*,gsm8k,gpt-4o,2025/1/22,63.3,0.9954,8.0,39.0751,,1319,14715887,14411173,10926,304714,231
83
+ 32,PoT,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,58.83,0.705,8.0,0.0,,1319,1362822,1145390,868,217432,165
84
+ 33,IO,gsm8k,Qwen2.5-7B-Instruct,2025/1/22,57.24,1.0,8.0,0.0,,1319,887913,596229,452,291684,221
85
+ 34,IO,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,57.16,0.9954,8.0,0.0,,1319,1745429,550941,418,1194488,906
86
+ 35,CoT,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,55.49,1.0,8.0,0.0,,1319,1218525,1032818,783,185707,141
87
+ 36,SC-CoT,gsm8k,Internllm2_5-7B,2025/1/22,41.39,0.9825,8.0,0.0,,1319,10024857,6674518,5060,3350339,2540
88
+ 37,PoT,gsm8k,Llama-3.1-8B-Instruct,2025/1/22,38.66,0.5542,8.0,0.0,,1319,1391111,1147538,870,243573,185
89
+ 38,PoT,gsm8k,Internllm2_5-7B,2025/1/22,38.21,0.489,8.0,0.0,,1319,1324949,1136843,862,188106,143
90
+ 39,IO,gsm8k,gpt-3.5-turbo,2025/1/7,37.83,0.9992,8.0,0.3328,,1319,586553,546990,415,39563,30
91
+ 40,CoT,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,35.93,0.9992,8.0,0.0,,1319,1223459,1032818,783,190641,145
92
+ 41,ReAct-Pro*,gsm8k,Internllm2_5-7B,2025/1/22,33.51,0.9795,,0.0,,1319,35669989,30120070,22836,5549919,4208
93
+ 42,ReAct-Pro*,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,24.86,0.8021,8.0,0.0,,1319,9828001,9133603,6925,694398,526
94
+ 43,PoT,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,18.49,0.31,8.0,0.0,,1319,1327522,1151528,873,175994,133
95
+ 44,IO,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,16.67,1.0,8.0,0.0,,1319,736996,568530,431,168466,128
96
+ 45,IO,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,14.7,1.0,8.0,0.0,,1319,834897,568116,431,266781,202
97
+ 46,IO,gsm8k,Internllm2_5-7B,2025/1/22,11.59,0.9795,8.0,0.0,,1319,1113728,679302,515,434426,329
98
+ 47,PoT,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,9.62,0.169,8.0,0.0,,1319,1389135,1151528,873,237607,180
99
+ 48,ReAct-Pro*,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,7.65,0.9522,8.0,0.0,,1319,55392611,52431343,39751,2961268,2245
100
+ 49,SC-CoT,gsm8k,Qwen2-1.5B-Instruct,2025/1/22,5.53,0.8673,8.0,0.0,,1319,8961768,5844218,4431,3117550,2364
101
+ 50,SC-CoT,gsm8k,Qwen2-0.5B-Instruct,2025/1/22,3.79,0.9484,8.0,0.0,,1319,10533815,6529832,4951,4003983,3036
src/overall_filtered_results.csv ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Rank,Algorithm,LLM,Eval Date,Avg Score,gsm8k-Score,gsm8k-Cost($),AQuA-Score,AQuA-Cost($)
2
+ 1.0,SC-CoT,Llama-3.3-70B-Instruct,2025/1/22,91.03,95.45,4.5021,86.61,0.5847
3
+ 2.0,CoT,Qwen2.5-72B-Instruct,2025/1/22,89.55,92.87,0.7195,86.22,0.0808
4
+ 3.0,CoT,Llama-3.3-70B-Instruct,2025/1/22,88.70,93.93,0.687,83.46,0.0927
5
+ 4.0,CoT,gpt-4o,2025/1/22,88.38,94.08,4.5367,82.67,1.0417
6
+ 5.0,SC-CoT,Qwen2.5-72B-Instruct,2025/1/22,88.25,90.67,4.2651,85.82,0.5576
7
+ 6.0,SC-CoT,gpt-4o,2025/1/22,88.24,91.05,35.8006,85.43,6.3449
8
+ 7.0,IO,Llama-3.3-70B-Instruct,2025/1/22,87.47,92.26,0.4709,82.67,0.0798
9
+ 8.0,SC-CoT,Doubao-lite-32k,2025/1/7,86.04,88.62,0.1532,83.46,0.0409
10
+ 9.0,CoT,Doubao-lite-32k,2025/1/7,85.99,89.31,0.0558,82.67,0.0066
11
+ 10.0,IO,Qwen2.5-72B-Instruct,2025/1/22,85.42,86.58,0.4899,84.25,0.0742
12
+ 11.0,SC-CoT,Qwen2.5-7B-Instruct,2025/1/22,84.91,88.32,0.0,81.49,0.0
13
+ 12.0,PoT,gpt-4o,2025/1/22,84.15,93.1,4.2166,75.19,1.6087
14
+ 13.0,PoT,Qwen2.5-72B-Instruct,2025/1/22,83.77,92.34,0.7054,75.19,0.1645
15
+ 14.0,ReAct-Pro*,Llama-3.3-70B-Instruct,2025/1/22,83.39,87.64,10.1124,79.13,0.768
16
+ 15.0,CoT,Qwen2.5-7B-Instruct,2025/1/22,83.19,85.67,0.0,80.7,0.0
17
+ 16.0,IO,gpt-4o,2025/1/22,82.00,88.4,3.3463,75.59,1.1453
18
+ 17.0,ReAct-Pro*,Doubao-lite-32k,2025/1/7,81.57,85.59,0.2512,77.55,0.0445
19
+ 18.0,ReAct-Pro*,Qwen2.5-72B-Instruct,2025/1/22,80.24,87.26,10.5479,73.22,0.3177
20
+ 19.0,ReAct-Pro*,Qwen2.5-7B-Instruct,2025/1/22,78.63,82.86,0.0,74.4,0.0
21
+ 20.0,PoT,Llama-3.3-70B-Instruct,2025/1/22,76.30,73.08,0.9736,79.52,0.1746
22
+ 21.0,PoT,Doubao-lite-32k,2025/1/7,75.63,79.6,0.0576,71.65,0.0147
23
+ 22.0,IO,Doubao-lite-32k,2025/1/7,75.58,72.02,0.0354,79.13,0.0058
24
+ 23.0,SC-CoT,gpt-3.5-turbo,2025/1/7,73.69,80.06,5.0227,67.32,0.6491
25
+ 24.0,ReAct-Pro*,gpt-3.5-turbo,2025/1/7,69.73,74.9,3.4633,64.56,0.4928
26
+ 25.0,PoT,gpt-3.5-turbo,2025/1/7,68.16,76.87,0.6902,59.44,0.1748
27
+ 26.0,CoT,Llama-3.1-8B-Instruct,2025/1/22,68.03,75.43,0.0,60.62,0.0
28
+ 27.0,IO,Qwen2.5-7B-Instruct,2025/1/22,67.99,57.24,0.0,78.74,0.0
29
+ 28.0,CoT,Internllm2_5-7B,2025/1/22,65.23,77.71,0.0,52.75,0.0
30
+ 29.0,SC-CoT,Llama-3.1-8B-Instruct,2025/1/22,64.17,75.2,0.0,53.14,0.0
31
+ 30.0,PoT,Qwen2.5-7B-Instruct,2025/1/22,63.47,58.83,0.0,68.11,0.0
32
+ 31.0,ReAct-Pro*,Llama-3.1-8B-Instruct,2025/1/22,61.64,67.77,0.0,55.51,0.0
33
+ 32.0,ReAct-Pro*,gpt-4o,2025/1/22,60.39,63.3,39.0751,57.48,2.304
34
+ 33.0,IO,Llama-3.1-8B-Instruct,2025/1/22,54.17,57.16,0.0,51.18,0.0
35
+ 34.0,CoT,Qwen2-1.5B-Instruct,2025/1/22,48.02,55.49,0.0,40.55,0.0
36
+ 35.0,CoT,gpt-3.5-turbo,2025/1/7,39.35,78.69,0.6788,0.0,0.0
37
+ 36.0,SC-CoT,Internllm2_5-7B,2025/1/22,38.62,41.39,0.0,35.85,0.0
38
+ 37.0,IO,gpt-3.5-turbo,2025/1/7,38.40,37.83,0.3328,38.97,0.038
39
+ 38.0,PoT,Llama-3.1-8B-Instruct,2025/1/22,37.64,38.66,0.0,36.61,0.0
40
+ 39.0,PoT,Internllm2_5-7B,2025/1/22,37.41,38.21,0.0,36.61,0.0
41
+ 40.0,ReAct-Pro*,Internllm2_5-7B,2025/1/22,37.23,33.51,0.0,40.94,0.0
42
+ 41.0,CoT,Qwen2-0.5B-Instruct,2025/1/22,34.50,35.93,0.0,33.07,0.0
43
+ 42.0,IO,Internllm2_5-7B,2025/1/22,29.61,11.59,0.0,47.63,0.0
44
+ 43.0,ReAct-Pro*,Qwen2-1.5B-Instruct,2025/1/22,25.23,24.86,0.0,25.59,0.0
45
+ 44.0,PoT,Qwen2-1.5B-Instruct,2025/1/22,24.60,18.49,0.0,30.7,0.0
46
+ 45.0,IO,Qwen2-1.5B-Instruct,2025/1/22,22.90,16.67,0.0,29.13,0.0
47
+ 46.0,IO,Qwen2-0.5B-Instruct,2025/1/22,20.93,14.7,0.0,27.16,0.0
48
+ 47.0,SC-CoT,Qwen2-1.5B-Instruct,2025/1/22,17.92,5.53,0.0,30.31,0.0
49
+ 48.0,SC-CoT,Qwen2-0.5B-Instruct,2025/1/22,17.25,3.79,0.0,30.7,0.0
50
+ 49.0,ReAct-Pro*,Qwen2-0.5B-Instruct,2025/1/22,15.83,7.65,0.0,24.01,0.0
51
+ 50.0,PoT,Qwen2-0.5B-Instruct,2025/1/22,13.47,9.62,0.0,17.32,0.0
src/overall_math_score.json CHANGED
@@ -1,29 +1,59 @@
1
  {
2
- "time": "2025-01-09 17:13:45",
3
  "results": {
4
  "IO": {
5
  "META": {
6
  "Algorithm": "IO",
7
  "LLM": "gpt-3.5-turbo",
8
- "Eval Date": "2025/01/07"
9
  },
10
  "gsm8k": {
11
  "Score": 37.83,
12
  "Cost($)": 0.3328
13
  },
14
  "AQuA": {
15
- "Score": 38.98,
16
- "Cost($)": 0.0380
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  }
18
  },
19
  "CoT": {
20
  "META": {
21
  "Algorithm": "CoT",
22
  "LLM": "gpt-3.5-turbo",
23
- "Eval Date": "2025/01/07"
24
  },
25
  "gsm8k": {
26
- "Score": 78.70,
27
  "Cost($)": 0.6788
28
  },
29
  "AQuA": {
@@ -35,121 +65,691 @@
35
  "META": {
36
  "Algorithm": "SC-CoT",
37
  "LLM": "gpt-3.5-turbo",
38
- "Eval Date": "2025/01/07"
39
  },
40
  "gsm8k": {
41
- "Score": 80.06,
42
- "Cost($)": 5.0227
43
  },
44
  "AQuA": {
45
- "Score": 67.32,
46
- "Cost($)": 0.6491
47
  }
48
  },
49
- "PoT": {
50
  "META": {
51
- "Algorithm": "PoT",
52
- "LLM": "gpt-3.5-turbo",
53
- "Eval Date": "2025/01/07"
54
  },
55
  "gsm8k": {
56
- "Score": 76.88,
57
- "Cost($)": 0.6902
58
  },
59
  "AQuA": {
60
- "Score": 59.45,
61
- "Cost($)": 0.1748
62
  }
63
  },
64
- "ReAct-Pro*": {
65
  "META": {
66
  "Algorithm": "ReAct-Pro*",
67
- "LLM": "gpt-3.5-turbo",
68
- "Eval Date": "2025/01/07"
69
  },
70
  "gsm8k": {
71
- "Score": 74.91,
72
- "Cost($)": 3.4633
73
  },
74
  "AQuA": {
75
- "Score": 64.57,
76
- "Cost($)": 0.4928
77
  }
78
  },
79
- "IO-Doubao": {
80
  "META": {
81
- "Algorithm": "IO",
82
  "LLM": "Doubao-lite-32k",
83
- "Eval Date": "2025/01/07"
84
  },
85
  "gsm8k": {
86
- "Score": 72.02,
87
- "Cost($)": 0.0354
88
  },
89
  "AQuA": {
90
- "Score": 79.13,
91
- "Cost($)": 0.0058
92
  }
93
  },
94
- "CoT-Doubao": {
95
  "META": {
96
  "Algorithm": "CoT",
97
  "LLM": "Doubao-lite-32k",
98
- "Eval Date": "2025/01/07"
99
  },
100
  "gsm8k": {
101
  "Score": 89.31,
102
- "Cost($)": 0.0557
103
  },
104
  "AQuA": {
105
- "Score": 82.68,
106
  "Cost($)": 0.0066
107
  }
108
  },
109
- "SC-CoT-Doubao": {
110
  "META": {
111
  "Algorithm": "SC-CoT",
112
  "LLM": "Doubao-lite-32k",
113
- "Eval Date": "2025/01/07"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
114
  },
115
  "gsm8k": {
116
- "Score": 88.63,
117
- "Cost($)": 0.1533
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
118
  },
119
  "AQuA": {
120
  "Score": 83.46,
121
- "Cost($)": 0.0409
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
  }
123
  },
124
- "PoT-Doubao": {
125
  "META": {
126
  "Algorithm": "PoT",
127
- "LLM": "Doubao-lite-32k",
128
- "Eval Date": "2025/01/07"
129
  },
130
  "gsm8k": {
131
- "Score": 79.61,
132
- "Cost($)": 0.0576
133
  },
134
  "AQuA": {
135
- "Score": 71.65,
136
- "Cost($)": 0.0147
137
  }
138
  },
139
- "ReAct-Pro-Doubao": {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
140
  "META": {
141
  "Algorithm": "ReAct-Pro*",
142
- "LLM": "Doubao-lite-32k",
143
- "Eval Date": "2025/01/07"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
144
  },
145
  "gsm8k": {
146
- "Score": 85.60,
147
- "Cost($)": 0.2513
148
  },
149
  "AQuA": {
150
- "Score": 77.56,
151
- "Cost($)": 0.0446
152
  }
153
  }
154
  }
155
- }
 
1
  {
2
+ "time": "2025-01-23 09:27:24",
3
  "results": {
4
  "IO": {
5
  "META": {
6
  "Algorithm": "IO",
7
  "LLM": "gpt-3.5-turbo",
8
+ "Eval Date": "2025/1/7"
9
  },
10
  "gsm8k": {
11
  "Score": 37.83,
12
  "Cost($)": 0.3328
13
  },
14
  "AQuA": {
15
+ "Score": 38.97,
16
+ "Cost($)": 0.038
17
+ }
18
+ },
19
+ "ReAct-Pro*": {
20
+ "META": {
21
+ "Algorithm": "ReAct-Pro*",
22
+ "LLM": "gpt-3.5-turbo",
23
+ "Eval Date": "2025/1/7"
24
+ },
25
+ "gsm8k": {
26
+ "Score": 74.9,
27
+ "Cost($)": 3.4633
28
+ },
29
+ "AQuA": {
30
+ "Score": 64.56,
31
+ "Cost($)": 0.4928
32
+ }
33
+ },
34
+ "PoT": {
35
+ "META": {
36
+ "Algorithm": "PoT",
37
+ "LLM": "gpt-3.5-turbo",
38
+ "Eval Date": "2025/1/7"
39
+ },
40
+ "gsm8k": {
41
+ "Score": 76.87,
42
+ "Cost($)": 0.6902
43
+ },
44
+ "AQuA": {
45
+ "Score": 59.44,
46
+ "Cost($)": 0.1748
47
  }
48
  },
49
  "CoT": {
50
  "META": {
51
  "Algorithm": "CoT",
52
  "LLM": "gpt-3.5-turbo",
53
+ "Eval Date": "2025/1/7"
54
  },
55
  "gsm8k": {
56
+ "Score": 78.69,
57
  "Cost($)": 0.6788
58
  },
59
  "AQuA": {
 
65
  "META": {
66
  "Algorithm": "SC-CoT",
67
  "LLM": "gpt-3.5-turbo",
68
+ "Eval Date": "2025/1/7"
69
  },
70
  "gsm8k": {
71
+ "Score": 82.56,
72
+ "Cost($)": 2.6285
73
  },
74
  "AQuA": {
75
+ "Score": 70.47,
76
+ "Cost($)": 0.5578
77
  }
78
  },
79
+ "IO-Doubao-lite-32k": {
80
  "META": {
81
+ "Algorithm": "IO",
82
+ "LLM": "Doubao-lite-32k",
83
+ "Eval Date": "2025/1/7"
84
  },
85
  "gsm8k": {
86
+ "Score": 72.02,
87
+ "Cost($)": 0.0354
88
  },
89
  "AQuA": {
90
+ "Score": 79.13,
91
+ "Cost($)": 0.0058
92
  }
93
  },
94
+ "ReAct-Pro*-Doubao-lite-32k": {
95
  "META": {
96
  "Algorithm": "ReAct-Pro*",
97
+ "LLM": "Doubao-lite-32k",
98
+ "Eval Date": "2025/1/7"
99
  },
100
  "gsm8k": {
101
+ "Score": 85.59,
102
+ "Cost($)": 0.2512
103
  },
104
  "AQuA": {
105
+ "Score": 77.55,
106
+ "Cost($)": 0.0445
107
  }
108
  },
109
+ "PoT-Doubao-lite-32k": {
110
  "META": {
111
+ "Algorithm": "PoT",
112
  "LLM": "Doubao-lite-32k",
113
+ "Eval Date": "2025/1/7"
114
  },
115
  "gsm8k": {
116
+ "Score": 79.6,
117
+ "Cost($)": 0.0576
118
  },
119
  "AQuA": {
120
+ "Score": 71.65,
121
+ "Cost($)": 0.0147
122
  }
123
  },
124
+ "CoT-Doubao-lite-32k": {
125
  "META": {
126
  "Algorithm": "CoT",
127
  "LLM": "Doubao-lite-32k",
128
+ "Eval Date": "2025/1/7"
129
  },
130
  "gsm8k": {
131
  "Score": 89.31,
132
+ "Cost($)": 0.0558
133
  },
134
  "AQuA": {
135
+ "Score": 82.67,
136
  "Cost($)": 0.0066
137
  }
138
  },
139
+ "SC-CoT-Doubao-lite-32k": {
140
  "META": {
141
  "Algorithm": "SC-CoT",
142
  "LLM": "Doubao-lite-32k",
143
+ "Eval Date": "2025/1/7"
144
+ },
145
+ "gsm8k": {
146
+ "Score": 83.7,
147
+ "Cost($)": 0.155
148
+ },
149
+ "AQuA": {
150
+ "Score": 81.5,
151
+ "Cost($)": 0.0347
152
+ }
153
+ },
154
+ "IO-gpt-4o": {
155
+ "META": {
156
+ "Algorithm": "IO",
157
+ "LLM": "gpt-4o",
158
+ "Eval Date": "2025/1/22"
159
  },
160
  "gsm8k": {
161
+ "Score": 88.4,
162
+ "Cost($)": 3.3463
163
+ },
164
+ "AQuA": {
165
+ "Score": 75.59,
166
+ "Cost($)": 1.1453
167
+ }
168
+ },
169
+ "ReAct-Pro*-gpt-4o": {
170
+ "META": {
171
+ "Algorithm": "ReAct-Pro*",
172
+ "LLM": "gpt-4o",
173
+ "Eval Date": "2025/1/22"
174
+ },
175
+ "gsm8k": {
176
+ "Score": 63.3,
177
+ "Cost($)": 39.0751
178
+ },
179
+ "AQuA": {
180
+ "Score": 57.48,
181
+ "Cost($)": 2.304
182
+ }
183
+ },
184
+ "PoT-gpt-4o": {
185
+ "META": {
186
+ "Algorithm": "PoT",
187
+ "LLM": "gpt-4o",
188
+ "Eval Date": "2025/1/22"
189
+ },
190
+ "gsm8k": {
191
+ "Score": 93.1,
192
+ "Cost($)": 4.2166
193
+ },
194
+ "AQuA": {
195
+ "Score": 75.19,
196
+ "Cost($)": 1.6087
197
+ }
198
+ },
199
+ "CoT-gpt-4o": {
200
+ "META": {
201
+ "Algorithm": "CoT",
202
+ "LLM": "gpt-4o",
203
+ "Eval Date": "2025/1/22"
204
+ },
205
+ "gsm8k": {
206
+ "Score": 94.08,
207
+ "Cost($)": 4.5367
208
+ },
209
+ "AQuA": {
210
+ "Score": 82.67,
211
+ "Cost($)": 1.0417
212
+ }
213
+ },
214
+ "SC-CoT-gpt-4o": {
215
+ "META": {
216
+ "Algorithm": "SC-CoT",
217
+ "LLM": "gpt-4o",
218
+ "Eval Date": "2025/1/22"
219
+ },
220
+ "gsm8k": {
221
+ "Score": 90.75,
222
+ "Cost($)": 24.2428
223
+ },
224
+ "AQuA": {
225
+ "Score": 88.19,
226
+ "Cost($)": 6.2412
227
+ }
228
+ },
229
+ "IO-Qwen2.5-72B-Instruct": {
230
+ "META": {
231
+ "Algorithm": "IO",
232
+ "LLM": "Qwen2.5-72B-Instruct",
233
+ "Eval Date": "2025/1/22"
234
+ },
235
+ "gsm8k": {
236
+ "Score": 86.58,
237
+ "Cost($)": 0.4899
238
+ },
239
+ "AQuA": {
240
+ "Score": 84.25,
241
+ "Cost($)": 0.0742
242
+ }
243
+ },
244
+ "ReAct-Pro*-Qwen2.5-72B-Instruct": {
245
+ "META": {
246
+ "Algorithm": "ReAct-Pro*",
247
+ "LLM": "Qwen2.5-72B-Instruct",
248
+ "Eval Date": "2025/1/22"
249
+ },
250
+ "gsm8k": {
251
+ "Score": 87.26,
252
+ "Cost($)": 10.5479
253
+ },
254
+ "AQuA": {
255
+ "Score": 73.22,
256
+ "Cost($)": 0.3177
257
+ }
258
+ },
259
+ "PoT-Qwen2.5-72B-Instruct": {
260
+ "META": {
261
+ "Algorithm": "PoT",
262
+ "LLM": "Qwen2.5-72B-Instruct",
263
+ "Eval Date": "2025/1/22"
264
+ },
265
+ "gsm8k": {
266
+ "Score": 92.34,
267
+ "Cost($)": 0.7054
268
+ },
269
+ "AQuA": {
270
+ "Score": 75.19,
271
+ "Cost($)": 0.1645
272
+ }
273
+ },
274
+ "CoT-Qwen2.5-72B-Instruct": {
275
+ "META": {
276
+ "Algorithm": "CoT",
277
+ "LLM": "Qwen2.5-72B-Instruct",
278
+ "Eval Date": "2025/1/22"
279
+ },
280
+ "gsm8k": {
281
+ "Score": 92.87,
282
+ "Cost($)": 0.7195
283
+ },
284
+ "AQuA": {
285
+ "Score": 86.22,
286
+ "Cost($)": 0.0808
287
+ }
288
+ },
289
+ "SC-CoT-Qwen2.5-72B-Instruct": {
290
+ "META": {
291
+ "Algorithm": "SC-CoT",
292
+ "LLM": "Qwen2.5-72B-Instruct",
293
+ "Eval Date": "2025/1/22"
294
+ },
295
+ "gsm8k": {
296
+ "Score": 90.67,
297
+ "Cost($)": 4.2651
298
+ },
299
+ "AQuA": {
300
+ "Score": 85.82,
301
+ "Cost($)": 0.5576
302
+ }
303
+ },
304
+ "IO-Llama-3.3-70B-Instruct": {
305
+ "META": {
306
+ "Algorithm": "IO",
307
+ "LLM": "Llama-3.3-70B-Instruct",
308
+ "Eval Date": "2025/1/22"
309
+ },
310
+ "gsm8k": {
311
+ "Score": 92.26,
312
+ "Cost($)": 0.4709
313
+ },
314
+ "AQuA": {
315
+ "Score": 82.67,
316
+ "Cost($)": 0.0798
317
+ }
318
+ },
319
+ "ReAct-Pro*-Llama-3.3-70B-Instruct": {
320
+ "META": {
321
+ "Algorithm": "ReAct-Pro*",
322
+ "LLM": "Llama-3.3-70B-Instruct",
323
+ "Eval Date": "2025/1/22"
324
+ },
325
+ "gsm8k": {
326
+ "Score": 87.64,
327
+ "Cost($)": 10.1124
328
+ },
329
+ "AQuA": {
330
+ "Score": 79.13,
331
+ "Cost($)": 0.768
332
+ }
333
+ },
334
+ "PoT-Llama-3.3-70B-Instruct": {
335
+ "META": {
336
+ "Algorithm": "PoT",
337
+ "LLM": "Llama-3.3-70B-Instruct",
338
+ "Eval Date": "2025/1/22"
339
+ },
340
+ "gsm8k": {
341
+ "Score": 73.08,
342
+ "Cost($)": 0.9736
343
+ },
344
+ "AQuA": {
345
+ "Score": 79.52,
346
+ "Cost($)": 0.1746
347
+ }
348
+ },
349
+ "CoT-Llama-3.3-70B-Instruct": {
350
+ "META": {
351
+ "Algorithm": "CoT",
352
+ "LLM": "Llama-3.3-70B-Instruct",
353
+ "Eval Date": "2025/1/22"
354
+ },
355
+ "gsm8k": {
356
+ "Score": 93.93,
357
+ "Cost($)": 0.687
358
  },
359
  "AQuA": {
360
  "Score": 83.46,
361
+ "Cost($)": 0.0927
362
+ }
363
+ },
364
+ "SC-CoT-Llama-3.3-70B-Instruct": {
365
+ "META": {
366
+ "Algorithm": "SC-CoT",
367
+ "LLM": "Llama-3.3-70B-Instruct",
368
+ "Eval Date": "2025/1/22"
369
+ },
370
+ "gsm8k": {
371
+ "Score": 95.45,
372
+ "Cost($)": 4.5021
373
+ },
374
+ "AQuA": {
375
+ "Score": 86.61,
376
+ "Cost($)": 0.5847
377
+ }
378
+ },
379
+ "IO-Qwen2.5-7B-Instruct": {
380
+ "META": {
381
+ "Algorithm": "IO",
382
+ "LLM": "Qwen2.5-7B-Instruct",
383
+ "Eval Date": "2025/1/22"
384
+ },
385
+ "gsm8k": {
386
+ "Score": 57.24,
387
+ "Cost($)": 0.0
388
+ },
389
+ "AQuA": {
390
+ "Score": 78.74,
391
+ "Cost($)": 0.0
392
+ }
393
+ },
394
+ "ReAct-Pro*-Qwen2.5-7B-Instruct": {
395
+ "META": {
396
+ "Algorithm": "ReAct-Pro*",
397
+ "LLM": "Qwen2.5-7B-Instruct",
398
+ "Eval Date": "2025/1/22"
399
+ },
400
+ "gsm8k": {
401
+ "Score": 82.86,
402
+ "Cost($)": 0.0
403
+ },
404
+ "AQuA": {
405
+ "Score": 74.4,
406
+ "Cost($)": 0.0
407
  }
408
  },
409
+ "PoT-Qwen2.5-7B-Instruct": {
410
  "META": {
411
  "Algorithm": "PoT",
412
+ "LLM": "Qwen2.5-7B-Instruct",
413
+ "Eval Date": "2025/1/22"
414
  },
415
  "gsm8k": {
416
+ "Score": 58.83,
417
+ "Cost($)": 0.0
418
  },
419
  "AQuA": {
420
+ "Score": 68.11,
421
+ "Cost($)": 0.0
422
  }
423
  },
424
+ "CoT-Qwen2.5-7B-Instruct": {
425
+ "META": {
426
+ "Algorithm": "CoT",
427
+ "LLM": "Qwen2.5-7B-Instruct",
428
+ "Eval Date": "2025/1/22"
429
+ },
430
+ "gsm8k": {
431
+ "Score": 85.67,
432
+ "Cost($)": 0.0
433
+ },
434
+ "AQuA": {
435
+ "Score": 80.7,
436
+ "Cost($)": 0.0
437
+ }
438
+ },
439
+ "SC-CoT-Qwen2.5-7B-Instruct": {
440
+ "META": {
441
+ "Algorithm": "SC-CoT",
442
+ "LLM": "Qwen2.5-7B-Instruct",
443
+ "Eval Date": "2025/1/22"
444
+ },
445
+ "gsm8k": {
446
+ "Score": 88.32,
447
+ "Cost($)": 0.0
448
+ },
449
+ "AQuA": {
450
+ "Score": 81.49,
451
+ "Cost($)": 0.0
452
+ }
453
+ },
454
+ "IO-Llama-3.1-8B-Instruct": {
455
+ "META": {
456
+ "Algorithm": "IO",
457
+ "LLM": "Llama-3.1-8B-Instruct",
458
+ "Eval Date": "2025/1/22"
459
+ },
460
+ "gsm8k": {
461
+ "Score": 57.16,
462
+ "Cost($)": 0.0
463
+ },
464
+ "AQuA": {
465
+ "Score": 51.18,
466
+ "Cost($)": 0.0
467
+ }
468
+ },
469
+ "ReAct-Pro*-Llama-3.1-8B-Instruct": {
470
  "META": {
471
  "Algorithm": "ReAct-Pro*",
472
+ "LLM": "Llama-3.1-8B-Instruct",
473
+ "Eval Date": "2025/1/22"
474
+ },
475
+ "gsm8k": {
476
+ "Score": 67.77,
477
+ "Cost($)": 0.0
478
+ },
479
+ "AQuA": {
480
+ "Score": 55.51,
481
+ "Cost($)": 0.0
482
+ }
483
+ },
484
+ "PoT-Llama-3.1-8B-Instruct": {
485
+ "META": {
486
+ "Algorithm": "PoT",
487
+ "LLM": "Llama-3.1-8B-Instruct",
488
+ "Eval Date": "2025/1/22"
489
+ },
490
+ "gsm8k": {
491
+ "Score": 38.66,
492
+ "Cost($)": 0.0
493
+ },
494
+ "AQuA": {
495
+ "Score": 36.61,
496
+ "Cost($)": 0.0
497
+ }
498
+ },
499
+ "CoT-Llama-3.1-8B-Instruct": {
500
+ "META": {
501
+ "Algorithm": "CoT",
502
+ "LLM": "Llama-3.1-8B-Instruct",
503
+ "Eval Date": "2025/1/22"
504
+ },
505
+ "gsm8k": {
506
+ "Score": 75.43,
507
+ "Cost($)": 0.0
508
+ },
509
+ "AQuA": {
510
+ "Score": 60.62,
511
+ "Cost($)": 0.0
512
+ }
513
+ },
514
+ "SC-CoT-Llama-3.1-8B-Instruct": {
515
+ "META": {
516
+ "Algorithm": "SC-CoT",
517
+ "LLM": "Llama-3.1-8B-Instruct",
518
+ "Eval Date": "2025/1/22"
519
+ },
520
+ "gsm8k": {
521
+ "Score": 75.2,
522
+ "Cost($)": 0.0
523
+ },
524
+ "AQuA": {
525
+ "Score": 53.14,
526
+ "Cost($)": 0.0
527
+ }
528
+ },
529
+ "IO-Internllm2_5-7B": {
530
+ "META": {
531
+ "Algorithm": "IO",
532
+ "LLM": "Internllm2_5-7B",
533
+ "Eval Date": "2025/1/22"
534
+ },
535
+ "gsm8k": {
536
+ "Score": 11.59,
537
+ "Cost($)": 0.0
538
+ },
539
+ "AQuA": {
540
+ "Score": 47.63,
541
+ "Cost($)": 0.0
542
+ }
543
+ },
544
+ "ReAct-Pro*-Internllm2_5-7B": {
545
+ "META": {
546
+ "Algorithm": "ReAct-Pro*",
547
+ "LLM": "Internllm2_5-7B",
548
+ "Eval Date": "2025/1/22"
549
+ },
550
+ "gsm8k": {
551
+ "Score": 33.51,
552
+ "Cost($)": 0.0
553
+ },
554
+ "AQuA": {
555
+ "Score": 40.94,
556
+ "Cost($)": 0.0
557
+ }
558
+ },
559
+ "PoT-Internllm2_5-7B": {
560
+ "META": {
561
+ "Algorithm": "PoT",
562
+ "LLM": "Internllm2_5-7B",
563
+ "Eval Date": "2025/1/22"
564
+ },
565
+ "gsm8k": {
566
+ "Score": 38.21,
567
+ "Cost($)": 0.0
568
+ },
569
+ "AQuA": {
570
+ "Score": 36.61,
571
+ "Cost($)": 0.0
572
+ }
573
+ },
574
+ "CoT-Internllm2_5-7B": {
575
+ "META": {
576
+ "Algorithm": "CoT",
577
+ "LLM": "Internllm2_5-7B",
578
+ "Eval Date": "2025/1/22"
579
+ },
580
+ "gsm8k": {
581
+ "Score": 77.71,
582
+ "Cost($)": 0.0
583
+ },
584
+ "AQuA": {
585
+ "Score": 52.75,
586
+ "Cost($)": 0.0
587
+ }
588
+ },
589
+ "SC-CoT-Internllm2_5-7B": {
590
+ "META": {
591
+ "Algorithm": "SC-CoT",
592
+ "LLM": "Internllm2_5-7B",
593
+ "Eval Date": "2025/1/22"
594
+ },
595
+ "gsm8k": {
596
+ "Score": 41.39,
597
+ "Cost($)": 0.0
598
+ },
599
+ "AQuA": {
600
+ "Score": 35.85,
601
+ "Cost($)": 0.0
602
+ }
603
+ },
604
+ "IO-Qwen2-1.5B-Instruct": {
605
+ "META": {
606
+ "Algorithm": "IO",
607
+ "LLM": "Qwen2-1.5B-Instruct",
608
+ "Eval Date": "2025/1/22"
609
+ },
610
+ "gsm8k": {
611
+ "Score": 16.67,
612
+ "Cost($)": 0.0
613
+ },
614
+ "AQuA": {
615
+ "Score": 29.13,
616
+ "Cost($)": 0.0
617
+ }
618
+ },
619
+ "ReAct-Pro*-Qwen2-1.5B-Instruct": {
620
+ "META": {
621
+ "Algorithm": "ReAct-Pro*",
622
+ "LLM": "Qwen2-1.5B-Instruct",
623
+ "Eval Date": "2025/1/22"
624
+ },
625
+ "gsm8k": {
626
+ "Score": 24.86,
627
+ "Cost($)": 0.0
628
+ },
629
+ "AQuA": {
630
+ "Score": 25.59,
631
+ "Cost($)": 0.0
632
+ }
633
+ },
634
+ "PoT-Qwen2-1.5B-Instruct": {
635
+ "META": {
636
+ "Algorithm": "PoT",
637
+ "LLM": "Qwen2-1.5B-Instruct",
638
+ "Eval Date": "2025/1/22"
639
+ },
640
+ "gsm8k": {
641
+ "Score": 18.49,
642
+ "Cost($)": 0.0
643
+ },
644
+ "AQuA": {
645
+ "Score": 30.7,
646
+ "Cost($)": 0.0
647
+ }
648
+ },
649
+ "CoT-Qwen2-1.5B-Instruct": {
650
+ "META": {
651
+ "Algorithm": "CoT",
652
+ "LLM": "Qwen2-1.5B-Instruct",
653
+ "Eval Date": "2025/1/22"
654
+ },
655
+ "gsm8k": {
656
+ "Score": 55.49,
657
+ "Cost($)": 0.0
658
+ },
659
+ "AQuA": {
660
+ "Score": 40.55,
661
+ "Cost($)": 0.0
662
+ }
663
+ },
664
+ "SC-CoT-Qwen2-1.5B-Instruct": {
665
+ "META": {
666
+ "Algorithm": "SC-CoT",
667
+ "LLM": "Qwen2-1.5B-Instruct",
668
+ "Eval Date": "2025/1/22"
669
+ },
670
+ "gsm8k": {
671
+ "Score": 5.53,
672
+ "Cost($)": 0.0
673
+ },
674
+ "AQuA": {
675
+ "Score": 30.31,
676
+ "Cost($)": 0.0
677
+ }
678
+ },
679
+ "IO-Qwen2-0.5B-Instruct": {
680
+ "META": {
681
+ "Algorithm": "IO",
682
+ "LLM": "Qwen2-0.5B-Instruct",
683
+ "Eval Date": "2025/1/22"
684
+ },
685
+ "gsm8k": {
686
+ "Score": 14.7,
687
+ "Cost($)": 0.0
688
+ },
689
+ "AQuA": {
690
+ "Score": 27.16,
691
+ "Cost($)": 0.0
692
+ }
693
+ },
694
+ "ReAct-Pro*-Qwen2-0.5B-Instruct": {
695
+ "META": {
696
+ "Algorithm": "ReAct-Pro*",
697
+ "LLM": "Qwen2-0.5B-Instruct",
698
+ "Eval Date": "2025/1/22"
699
+ },
700
+ "gsm8k": {
701
+ "Score": 7.65,
702
+ "Cost($)": 0.0
703
+ },
704
+ "AQuA": {
705
+ "Score": 24.01,
706
+ "Cost($)": 0.0
707
+ }
708
+ },
709
+ "PoT-Qwen2-0.5B-Instruct": {
710
+ "META": {
711
+ "Algorithm": "PoT",
712
+ "LLM": "Qwen2-0.5B-Instruct",
713
+ "Eval Date": "2025/1/22"
714
+ },
715
+ "gsm8k": {
716
+ "Score": 9.62,
717
+ "Cost($)": 0.0
718
+ },
719
+ "AQuA": {
720
+ "Score": 17.32,
721
+ "Cost($)": 0.0
722
+ }
723
+ },
724
+ "CoT-Qwen2-0.5B-Instruct": {
725
+ "META": {
726
+ "Algorithm": "CoT",
727
+ "LLM": "Qwen2-0.5B-Instruct",
728
+ "Eval Date": "2025/1/22"
729
+ },
730
+ "gsm8k": {
731
+ "Score": 35.93,
732
+ "Cost($)": 0.0
733
+ },
734
+ "AQuA": {
735
+ "Score": 33.07,
736
+ "Cost($)": 0.0
737
+ }
738
+ },
739
+ "SC-CoT-Qwen2-0.5B-Instruct": {
740
+ "META": {
741
+ "Algorithm": "SC-CoT",
742
+ "LLM": "Qwen2-0.5B-Instruct",
743
+ "Eval Date": "2025/1/22"
744
  },
745
  "gsm8k": {
746
+ "Score": 3.79,
747
+ "Cost($)": 0.0
748
  },
749
  "AQuA": {
750
+ "Score": 30.7,
751
+ "Cost($)": 0.0
752
  }
753
  }
754
  }
755
+ }
src/overall_results.csv ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Rank,Algorithm,LLM,Eval Date,Avg Score,gsm8k-Score,gsm8k-Cost($),AQuA-Score,AQuA-Cost($)
2
+ 1.0,SC-CoT,Llama-3.3-70B-Instruct,2025/1/22,91.03,95.45,4.5021,86.61,0.5847
3
+ 2.0,CoT,Qwen2.5-72B-Instruct,2025/1/22,89.55,92.87,0.7195,86.22,0.0808
4
+ 3.0,SC-CoT,gpt-4o,2025/1/22,89.47,90.75,24.2428,88.19,6.2412
5
+ 4.0,CoT,Llama-3.3-70B-Instruct,2025/1/22,88.70,93.93,0.687,83.46,0.0927
6
+ 5.0,CoT,gpt-4o,2025/1/22,88.38,94.08,4.5367,82.67,1.0417
7
+ 6.0,SC-CoT,Qwen2.5-72B-Instruct,2025/1/22,88.25,90.67,4.2651,85.82,0.5576
8
+ 7.0,IO,Llama-3.3-70B-Instruct,2025/1/22,87.47,92.26,0.4709,82.67,0.0798
9
+ 8.0,CoT,Doubao-lite-32k,2025/1/7,85.99,89.31,0.0558,82.67,0.0066
10
+ 9.0,IO,Qwen2.5-72B-Instruct,2025/1/22,85.42,86.58,0.4899,84.25,0.0742
11
+ 10.0,SC-CoT,Qwen2.5-7B-Instruct,2025/1/22,84.91,88.32,0.0,81.49,0.0
12
+ 11.0,PoT,gpt-4o,2025/1/22,84.15,93.1,4.2166,75.19,1.6087
13
+ 12.0,PoT,Qwen2.5-72B-Instruct,2025/1/22,83.77,92.34,0.7054,75.19,0.1645
14
+ 13.0,ReAct-Pro*,Llama-3.3-70B-Instruct,2025/1/22,83.39,87.64,10.1124,79.13,0.768
15
+ 14.0,CoT,Qwen2.5-7B-Instruct,2025/1/22,83.19,85.67,0.0,80.7,0.0
16
+ 15.0,SC-CoT,Doubao-lite-32k,2025/1/7,82.60,83.7,0.155,81.5,0.0347
17
+ 16.0,IO,gpt-4o,2025/1/22,82.00,88.4,3.3463,75.59,1.1453
18
+ 17.0,ReAct-Pro*,Doubao-lite-32k,2025/1/7,81.57,85.59,0.2512,77.55,0.0445
19
+ 18.0,ReAct-Pro*,Qwen2.5-72B-Instruct,2025/1/22,80.24,87.26,10.5479,73.22,0.3177
20
+ 19.0,ReAct-Pro*,Qwen2.5-7B-Instruct,2025/1/22,78.63,82.86,0.0,74.4,0.0
21
+ 20.0,SC-CoT,gpt-3.5-turbo,2025/1/7,76.52,82.56,2.6285,70.47,0.5578
22
+ 21.0,PoT,Llama-3.3-70B-Instruct,2025/1/22,76.30,73.08,0.9736,79.52,0.1746
23
+ 22.0,PoT,Doubao-lite-32k,2025/1/7,75.63,79.6,0.0576,71.65,0.0147
24
+ 23.0,IO,Doubao-lite-32k,2025/1/7,75.58,72.02,0.0354,79.13,0.0058
25
+ 24.0,CoT,gpt-3.5-turbo,2025/1/7,69.86,78.69,0.6788,61.02,0.0957
26
+ 25.0,ReAct-Pro*,gpt-3.5-turbo,2025/1/7,69.73,74.9,3.4633,64.56,0.4928
27
+ 26.0,PoT,gpt-3.5-turbo,2025/1/7,68.16,76.87,0.6902,59.44,0.1748
28
+ 27.0,CoT,Llama-3.1-8B-Instruct,2025/1/22,68.03,75.43,0.0,60.62,0.0
29
+ 28.0,IO,Qwen2.5-7B-Instruct,2025/1/22,67.99,57.24,0.0,78.74,0.0
30
+ 29.0,CoT,Internllm2_5-7B,2025/1/22,65.23,77.71,0.0,52.75,0.0
31
+ 30.0,SC-CoT,Llama-3.1-8B-Instruct,2025/1/22,64.17,75.2,0.0,53.14,0.0
32
+ 31.0,PoT,Qwen2.5-7B-Instruct,2025/1/22,63.47,58.83,0.0,68.11,0.0
33
+ 32.0,ReAct-Pro*,Llama-3.1-8B-Instruct,2025/1/22,61.64,67.77,0.0,55.51,0.0
34
+ 33.0,ReAct-Pro*,gpt-4o,2025/1/22,60.39,63.3,39.0751,57.48,2.304
35
+ 34.0,IO,Llama-3.1-8B-Instruct,2025/1/22,54.17,57.16,0.0,51.18,0.0
36
+ 35.0,CoT,Qwen2-1.5B-Instruct,2025/1/22,48.02,55.49,0.0,40.55,0.0
37
+ 36.0,SC-CoT,Internllm2_5-7B,2025/1/22,38.62,41.39,0.0,35.85,0.0
38
+ 37.0,IO,gpt-3.5-turbo,2025/1/7,38.40,37.83,0.3328,38.97,0.038
39
+ 38.0,PoT,Llama-3.1-8B-Instruct,2025/1/22,37.64,38.66,0.0,36.61,0.0
40
+ 39.0,PoT,Internllm2_5-7B,2025/1/22,37.41,38.21,0.0,36.61,0.0
41
+ 40.0,ReAct-Pro*,Internllm2_5-7B,2025/1/22,37.23,33.51,0.0,40.94,0.0
42
+ 41.0,CoT,Qwen2-0.5B-Instruct,2025/1/22,34.50,35.93,0.0,33.07,0.0
43
+ 42.0,IO,Internllm2_5-7B,2025/1/22,29.61,11.59,0.0,47.63,0.0
44
+ 43.0,ReAct-Pro*,Qwen2-1.5B-Instruct,2025/1/22,25.23,24.86,0.0,25.59,0.0
45
+ 44.0,PoT,Qwen2-1.5B-Instruct,2025/1/22,24.60,18.49,0.0,30.7,0.0
46
+ 45.0,IO,Qwen2-1.5B-Instruct,2025/1/22,22.90,16.67,0.0,29.13,0.0
47
+ 46.0,IO,Qwen2-0.5B-Instruct,2025/1/22,20.93,14.7,0.0,27.16,0.0
48
+ 47.0,SC-CoT,Qwen2-1.5B-Instruct,2025/1/22,17.92,5.53,0.0,30.31,0.0
49
+ 48.0,SC-CoT,Qwen2-0.5B-Instruct,2025/1/22,17.25,3.79,0.0,30.7,0.0
50
+ 49.0,ReAct-Pro*,Qwen2-0.5B-Instruct,2025/1/22,15.83,7.65,0.0,24.01,0.0
51
+ 50.0,PoT,Qwen2-0.5B-Instruct,2025/1/22,13.47,9.62,0.0,17.32,0.0
src/record.csv ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Algorithm,dataset,llm,Score,Pass rate,X-shot,X-shot,Parameters,Nums,Total input tokens,Average input tokens,Total output tokens,Average output tokens,All tokens,Cost($),Eval Date,Note,,,,,,,,,,,,,,,,,,,
2
+ IO,gsm8k,gpt-3.5-turbo,37.83,99.92,8,few_shot,,1319,"546,990",415,"39,563",30,"586,553",0.3328,2025/1/7,,,,,,,,,,,,,,,,,,,,
3
+ IO,gsm8k,Doubao-lite-32k,72.02,99.92,8,few_shot,,1319,"617,377",468,"123,106",93,"740,483",0.0354,2025/1/7,0.2590 (元),,,,,,,,,,,,,,,,,,,
4
+ IO,gsm8k,gpt-4o,88.4,100,8,few_shot,,1319,"542,416",411,"199,030",151,"741,446",3.3463,2025/1/22,,,,,,,,,,,,,,,,,,,,
5
+ IO,gsm8k,Qwen2.5-72B-Instruct,86.58,100,8,few_shot,,1319,"555,340",421,"313,720",238,"869,060",0.4899,2025/1/22,,,,,,,,,,,,,,,,,,,,
6
+ IO,gsm8k,Llama-3.3-70B-Instruct,92.26,100,8,few_shot,,1319,"583,916",443,"251,359",191,"835,275",0.4709,2025/1/22,,,,,,,,,,,,,,,,,,,,
7
+ IO,gsm8k,Qwen2.5-7B-Instruct,57.24,100,8,few_shot,,1319,"596,229",452,"291,684",221,"887,913",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
8
+ IO,gsm8k,Llama-3.1-8B-Instruct,57.16,99.54,8,few_shot,,1319,"550,941",418,"1,194,488",906,"1,745,429",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
9
+ IO,gsm8k,Internllm2_5-7B,11.59,97.95,8,few_shot,,1319,"679,302",515,"434,426",329,"1,113,728",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
10
+ IO,gsm8k,Qwen2-1.5B-Instruct,16.67,100,8,few_shot,,1319,"568,530",431,"168,466",128,"736,996",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
11
+ IO,gsm8k,Qwen2-0.5B-Instruct,14.7,100,8,few_shot,,1319,"568,116",431,"266,781",202,"834,897",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
12
+ ReAct-Pro*,gsm8k,gpt-3.5-turbo,74.9,99.39,8,few_shot,max_steps=10,1319,"6,506,164","4,933","140,122",106,"6,646,286",3.4633,2025/1/7,"think-action 单独返回,prompt v1",,,,,,,,,,,,,,,,,,,
13
+ ReAct-Pro*,gsm8k,Doubao-lite-32k,85.59,99.62,8,few_shot,max_steps=10,1319,"5,862,016","4,444","136,623",104,"5,998,639",0.2512,2025/1/7,"think-action 单独返回,prompt v1",,,,,,,,,,,,,,,,,,,
14
+ ReAct-Pro*,gsm8k,gpt-4o,63.3,99.54,8,few_shot,max_steps=10,1319,"14,411,173","10,926","304,714",231,"14,715,887",39.0751,2025/1/22,"think-action 单独返回,prompt v1",,,,,,,,,,,,,,,,,,,
15
+ ReAct-Pro*,gsm8k,Qwen2.5-72B-Instruct,87.26,100,8,few_shot,max_steps=10,1319,"18,160,983","13,769","549,454",417,"18,710,437",10.5479,2025/1/22,,,,,,,,,,,,,,,,,,,,
16
+ ReAct-Pro*,gsm8k,Llama-3.3-70B-Instruct,87.64,99.92,,few_shot,max_steps=10,1319,"17,038,928","12,918","898,936",682,"17,937,864",10.1124,2025/1/22,,,,,,,,,,,,,,,,,,,,
17
+ ReAct-Pro*,gsm8k,Qwen2.5-7B-Instruct,82.86,100,8,few_shot,max_steps=10,1319,"14,355,752","10,884","495,162",375,"14,850,914",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
18
+ ReAct-Pro*,gsm8k,Llama-3.1-8B-Instruct,67.77,98.55,8,few_shot,max_steps=10,1319,"21,044,978","15,955","1,790,789","1,358","22,835,767",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
19
+ ReAct-Pro*,gsm8k,Internllm2_5-7B,33.51,97.95,,few_shot,max_steps=10,1319,"30,120,070","22,836","5,549,919","4,208","35,669,989",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
20
+ ReAct-Pro*,gsm8k,Qwen2-1.5B-Instruct,24.86,80.21,8,few_shot,max_steps=10,1319,"9,133,603","6,925","694,398",526,"9,828,001",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
21
+ ReAct-Pro*,gsm8k,Qwen2-0.5B-Instruct,7.65,95.22,8,few_shot,max_steps=10,1319,"52,431,343","39,751","2,961,268","2,245","55,392,611",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
22
+ PoT,gsm8k,gpt-3.5-turbo,76.87,99.24,8,few_shot,,1319,"1,090,418",827,"96,662",73,"1,187,080",0.6902,2025/1/7,,,,,,,,,,,,,,,,,,,,
23
+ PoT,gsm8k,Doubao-lite-32k,79.6,92.57,8,few_shot,,1319,"1,170,038",887,"118,017",89,"1,288,055",0.0576,2025/1/7,,,,,,,,,,,,,,,,,,,,
24
+ PoT,gsm8k,gpt-4o,93.1,99.77,8,few_shot,,1319,"1,101,672",835,"146,240",111,"1,247,912",4.2166,2025/1/22,,,,,,,,,,,,,,,,,,,,
25
+ PoT,gsm8k,Qwen2.5-72B-Instruct,92.34,99.39,8,few_shot,,1319,"1,106,682",839,"144,528",110,"1,251,210",0.7054,2025/1/22,,,,,,,,,,,,,,,,,,,,
26
+ PoT,gsm8k,Llama-3.3-70B-Instruct,73.08,79.6,8,few_shot,,1319,"1,126,025",854,"601,019",456,"1,727,044",0.9736,2025/1/22,,,,,,,,,,,,,,,,,,,,
27
+ PoT,gsm8k,Qwen2.5-7B-Instruct,58.83,70.5,8,few_shot,,1319,"1,145,390",868,"217,432",165,"1,362,822",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
28
+ PoT,gsm8k,Llama-3.1-8B-Instruct,38.66,55.42,8,few_shot,,1319,"1,147,538",870,"243,573",185,"1,391,111",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
29
+ PoT,gsm8k,Internllm2_5-7B,38.21,48.9,8,few_shot,,1319,"1,136,843",862,"188,106",143,"1,324,949",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
30
+ PoT,gsm8k,Qwen2-1.5B-Instruct,18.49,31,8,few_shot,,1319,"1,151,528",873,"175,994",133,"1,327,522",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
31
+ PoT,gsm8k,Qwen2-0.5B-Instruct,9.62,16.9,8,few_shot,,1319,"1,151,528",873,"237,607",180,"1,389,135",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
32
+ CoT,gsm8k,gpt-3.5-turbo,78.69,100,8,few_shot,,1319,"953,242",723,"134,799",102,"1,088,041",0.6788,2025/1/7,,,,,,,,,,,,,,,,,,,,
33
+ CoT,gsm8k,Doubao-lite-32k,89.31,100,8,few_shot,,1319,"1,042,095",790,"159,725",121,"1,201,820",0.0558,2025/1/7,0.4084635 (元),,,,,,,,,,,,,,,,,,,
34
+ CoT,gsm8k,gpt-4o,94.08,100,8,few_shot,,1319,"948,668",719,"216,498",164,"1,165,166",4.5367,2025/1/22,,,,,,,,,,,,,,,,,,,,
35
+ CoT,gsm8k,Qwen2.5-72B-Instruct,92.87,100,8,few_shot,,1319,"1,005,119",762,"271,133",206,"1,276,252",0.7195,2025/1/22,,,,,,,,,,,,,,,,,,,,
36
+ CoT,gsm8k,Llama-3.3-70B-Instruct,93.93,100,8,few_shot,,1319,"990,168",751,"228,497",173,"1,218,665",0.6870,2025/1/22,,,,,,,,,,,,,,,,,,,,
37
+ CoT,gsm8k,Qwen2.5-7B-Instruct,85.67,100,8,few_shot,,1319,"1,046,008",793,"244,797",186,"1,290,805",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
38
+ CoT,gsm8k,Llama-3.1-8B-Instruct,75.43,99.92,8,few_shot,,1319,"990,168",751,"258,161",196,"1,248,329",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
39
+ CoT,gsm8k,Internllm2_5-7B,77.71,99.69,8,few_shot,,1319,"968,163",734,"234,000",177,"1,202,163",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
40
+ CoT,gsm8k,Qwen2-1.5B-Instruct,55.49,100,8,few_shot,,1319,"1,032,818",783,"185,707",141,"1,218,525",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
41
+ CoT,gsm8k,Qwen2-0.5B-Instruct,35.93,99.92,8,few_shot,,1319,"1,032,818",783,"190,641",145,"1,223,459",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
42
+ SC-CoT,gsm8k,gpt-3.5-turbo,82.56,99.85,8,few_shot,"temperature=1, path_num=5",1319,"1,212,520",919,"1,348,177","1,022","2,560,697",2.6285,2025/1/7,,,,,,,,,,,,,,,,,,,,
43
+ SC-CoT,gsm8k,Doubao-lite-32k,83.7,99.70,8,few_shot,"temperature=1, path_num=5",1319,"1,230,019",933,"1,277,668",969,"2,507,687",0.1550,2025/1/7,,,,,,,,,,,,,,,,,,,,
44
+ SC-CoT,gsm8k,gpt-4o,90.75,100,8,few_shot,"temperature=1, path_num=5",1319,"1,168,927",886,"2,132,044","1,616","3,300,971",24.2428,2025/1/22,,,,,,,,,,,,,,,,,,,,
45
+ SC-CoT,gsm8k,Qwen2.5-72B-Instruct,90.67,100,8,few_shot,"temperature=1, path_num=5",1319,"5,292,383","4,012","2,273,254","1,723","7,565,637",4.2651,2025/1/22,,,,,,,,,,,,,,,,,,,,
46
+ SC-CoT,gsm8k,Llama-3.3-70B-Instruct,95.45,100,8,few_shot,"temperature=1, path_num=5",1319,"5,406,763","4,099","2,579,233","1,955","7,985,996",4.5021,2025/1/22,,,,,,,,,,,,,,,,,,,,
47
+ SC-CoT,gsm8k,Qwen2.5-7B-Instruct,88.32,99.84,8,few_shot,"temperature=1, path_num=5",1319,"5,668,252","4,297","2,505,566","1,900","8,173,818",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
48
+ SC-CoT,gsm8k,Llama-3.1-8B-Instruct,75.2,99.54,8,few_shot,"temperature=1, path_num=5",1319,"5,334,657","4,044","3,109,546","2,358","8,444,203",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
49
+ SC-CoT,gsm8k,Internllm2_5-7B,41.39,98.25,8,few_shot,"temperature=1, path_num=5",1319,"6,674,518","5,060","3,350,339","2,540","10,024,857",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
50
+ SC-CoT,gsm8k,Qwen2-1.5B-Instruct,5.53,86.73,8,few_shot,"temperature=1, path_num=5",1319,"5,844,218","4,431","3,117,550","2,364","8,961,768",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
51
+ SC-CoT,gsm8k,Qwen2-0.5B-Instruct,3.79,94.84,8,few_shot,"temperature=1, path_num=5",1319,"6,529,832","4,951","4,003,983","3,036","10,533,815",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
52
+ IO,AQuA,gpt-3.5-turbo,38.97,100,0,zero_shot,,254,"25,701",101,"16,770",66,"42,471",0.0380,2025/1/7,,,,,,,,,,,,,,,,,,,,
53
+ IO,AQuA,Doubao-lite-32k,79.13,100,0,zero_shot,,254,"33,058",130,"54,684",215,"87,742",0.0058,2025/1/7,0.0427(元),,,,,,,,,,,,,,,,,,,
54
+ IO,AQuA,gpt-4o,75.59,97.24,0,zero_shot,,254,"25,631",101,"108,121",426,"133,752",1.1453,2025/1/22,,,,,,,,,,,,,,,,,,,,
55
+ IO,AQuA,Qwen2.5-72B-Instruct,84.25,99.6,0,zero_shot,,254,"25,397",100,"106,207",418,"131,604",0.0742,2025/1/22,,,,,,,,,,,,,,,,,,,,
56
+ IO,AQuA,Llama-3.3-70B-Instruct,82.67,99.21,0,zero_shot,,254,"32,809",129,"108,758",428,"141,567",0.0798,2025/1/22,,,,,,,,,,,,,,,,,,,,
57
+ IO,AQuA,Qwen2.5-7B-Instruct,78.74,98.42,0,zero_shot,,254,"33,271",131,"104,500",411,"137,771",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
58
+ IO,AQuA,Llama-3.1-8B-Instruct,51.18,98.81,0,zero_shot,,254,"26,459",104,"106,647",420,"133,106",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
59
+ IO,AQuA,Internllm2_5-7B,47.63,90.94,0,zero_shot,,254,"50,232",198,"134,809",531,"185,041",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
60
+ IO,AQuA,Qwen2-1.5B-Instruct,29.13,97.63,0,zero_shot,,254,"27,937",110,"43,110",170,"71,047",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
61
+ IO,AQuA,Qwen2-0.5B-Instruct,27.16,98.81,0,zero_shot,,254,"27,937",110,"82,478",325,"110,415",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
62
+ CoT,AQuA,gpt-3.5-turbo,61.02,93.7,0,zero_shot,,254,"25,447",100,"55,346",218,"80,793",0.0957,2025/1/22,,,,,,,,,,,,,,,,,,,,
63
+ CoT,AQuA,Doubao-lite-32k,82.67,97.24,0,zero_shot,,254,"27,978",110,"66,599",262,"94,577",0.0066,2025/1/7,0.0483 (元),,,,,,,,,,,,,,,,,,,
64
+ CoT,AQuA,gpt-4o,82.67,98.03,0,zero_shot,,254,"25,123",99,"97,894",385,"123,017",1.0417,2025/1/22,,,,,,,,,,,,,,,,,,,,
65
+ CoT,AQuA,Qwen2.5-72B-Instruct,86.22,99.21,0,zero_shot,,254,"25,143",99,"118,146",465,"143,289",0.0808,2025/1/22,,,,,,,,,,,,,,,,,,,,
66
+ CoT,AQuA,Llama-3.3-70B-Instruct,83.46,98.42,0,zero_shot,,254,"32,555",128,"131,834",519,"164,389",0.0927,2025/1/22,,,,,,,,,,,,,,,,,,,,
67
+ CoT,AQuA,Qwen2.5-7B-Instruct,80.7,99.6,0,zero_shot,,254,"33,017",130,"116,719",460,"149,736",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
68
+ CoT,AQuA,Llama-3.1-8B-Instruct,60.62,100,0,zero_shot,,254,"32,555",128,"111,880",440,"144,435",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
69
+ CoT,AQuA,Internllm2_5-7B,52.75,89.37,0,zero_shot,,254,"26,610",105,"100,910",397,"127,520",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
70
+ CoT,AQuA,Qwen2-1.5B-Instruct,40.55,98.81,0,zero_shot,,254,"30,477",120,"79,563",313,"110,040",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
71
+ CoT,AQuA,Qwen2-0.5B-Instruct,33.07,98.81,0,zero_shot,,254,"30,477",120,"86,862",342,"117,339",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
72
+ PoT,AQuA,gpt-3.5-turbo,59.44,100,0,zero_shot,,254,"225,162",886,"41,492",163,"266,654",0.1748,2025/1/7,,,,,,,,,,,,,,,,,,,,
73
+ PoT,AQuA,gpt-4o,75.19,100,0,zero_shot,,254,"222,717",877,"105,191",414,"327,908",1.6087,2025/1/22,,,,,,,,,,,,,,,,,,,,
74
+ PoT,AQuA,Doubao-lite-32k,71.65,96.85,0,zero_shot,,254,"259,863","1,023","49,573",195,"309,436",0.0147,2025/1/7,,,,,,,,,,,,,,,,,,,,
75
+ PoT,AQuA,Qwen2.5-72B-Instruct,75.19,100,0,zero_shot,,254,"249,215",981,"42,549",168,"291,764",0.1645,2025/1/22,,,,,,,,,,,,,,,,,,,,
76
+ PoT,AQuA,Llama-3.3-70B-Instruct,79.52,99.21,0,zero_shot,,254,"240,735",948,"69,064",272,"309,799",0.1746,2025/1/22,,,,,,,,,,,,,,,,,,,,
77
+ PoT,AQuA,Qwen2.5-7B-Instruct,68.11,100,0,zero_shot,,254,"264,517","1,041","49,211",194,"313,728",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
78
+ PoT,AQuA,Llama-3.1-8B-Instruct,36.61,96.85,0,zero_shot,,254,"240,613",947,"50,301",198,"290,914",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
79
+ PoT,AQuA,Internllm2_5-7B,36.61,98.81,0,zero_shot,,254,"233,505",919,"68,457",270,"301,962",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
80
+ PoT,AQuA,Qwen2-1.5B-Instruct,30.7,96.45,0,zero_shot,,254,"246,560",971,"51,915",204,"298,475",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
81
+ PoT,AQuA,Qwen2-0.5B-Instruct,17.32,92.12,0,zero_shot,,254,"258,867","1,019","63,414",250,"322,281",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
82
+ SC-CoT,AQuA,gpt-3.5-turbo,70.47,98.82,0,zero_shot,"temperature=1, path_num=5",254,"70,157",276,"348,460","1,372","418,617",0.5578,2025/1/7,,,,,,,,,,,,,,,,,,,,
83
+ SC-CoT,AQuA,Doubao-lite-32k,81.5,97.64,0,zero_shot,"temperature=1, path_num=5",254,"83,830",330,"382,016","1,504","465,846",0.0347,2025/1/7,,,,,,,,,,,,,,,,,,,,
84
+ SC-CoT,AQuA,gpt-4o,88.19,100,0,zero_shot,"temperature=1, path_num=5",254,"72,916",287,"605,895","2,385","678,811",6.2412,2025/1/22,,,,,,,,,,,,,,,,,,,,
85
+ SC-CoT,AQuA,Qwen2.5-72B-Instruct,85.82,98.42,0,zero_shot,"temperature=1, path_num=5",254,"241,149",949,"747,909","2,945","989,058",0.5576,2025/1/22,,,,,,,,,,,,,,,,,,,,
86
+ SC-CoT,AQuA,Llama-3.3-70B-Instruct,86.61,99.21,0,zero_shot,"temperature=1, path_num=5",254,"283,248","1,115","753,876","2,968","1,037,124",0.5847,2025/1/22,,,,,,,,,,,,,,,,,,,,
87
+ SC-CoT,AQuA,Qwen2.5-7B-Instruct,81.49,100,0,zero_shot,"temperature=1, path_num=5",254,"278,848","1,098","736,520","2,900","1,015,368",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
88
+ SC-CoT,AQuA,Llama-3.1-8B-Instruct,53.14,96.06,0,zero_shot,"temperature=1, path_num=5",254,"372,968","1,468","668,378","2,631","1,041,346",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
89
+ SC-CoT,AQuA,Internllm2_5-7B,35.85,98.8,0,zero_shot,"temperature=1, path_num=5",254,"530,701","2,089","709,687","2,794","1,240,388",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
90
+ SC-CoT,AQuA,Qwen2-1.5B-Instruct,30.31,97.24,0,zero_shot,"temperature=1, path_num=5",254,"430,703","1,696","726,373","2,860","1,157,076",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
91
+ SC-CoT,AQuA,Qwen2-0.5B-Instruct,30.7,98.42,0,zero_shot,"temperature=1, path_num=5",254,"496,206","1,954","729,333","2,871","1,225,539",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
92
+ ReAct-Pro*,AQuA,gpt-3.5-turbo,64.56,98.03,0,zero_shot,max_steps=10,254,"862,614","3,396","40,973",161,"903,587",0.4928,2025/1/7,"think-action 单独返回,prompt v1",,,,,,,,,,,,,,,,,,,
93
+ ReAct-Pro*,AQuA,Doubao-lite-32k,77.55,96.06,0,zero_shot,max_steps=10,254,"977,890","3,850","54,951",216,"1,032,841",0.0445,2025/1/7,"think-action 单独返回,prompt v1",,,,,,,,,,,,,,,,,,,
94
+ ReAct-Pro*,AQuA,gpt-4o,57.48,97.24,0,zero_shot,max_steps=10,254,"615,589","2,424","76,507",301,"692,096",2.3040,2025/1/22,"think-action 单独返回,prompt v1",,,,,,,,,,,,,,,,,,,
95
+ ReAct-Pro*,AQuA,Qwen2.5-72B-Instruct,73.22,100,0,zero_shot,max_steps=10,254,"441,765","1,739","121,838",480,"563,603",0.3177,2025/1/22,,,,,,,,,,,,,,,,,,,,
96
+ ReAct-Pro*,AQuA,Llama-3.3-70B-Instruct,79.13,99.6,0,zero_shot,max_steps=10,254,"1,119,143","4,406","243,236",958,"1,362,379",0.7680,2025/1/22,,,,,,,,,,,,,,,,,,,,
97
+ ReAct-Pro*,AQuA,Qwen2.5-7B-Instruct,74.4,99.21,0,zero_shot,max_steps=10,254,"564,165","2,221","131,679",518,"695,844",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
98
+ ReAct-Pro*,AQuA,Llama-3.1-8B-Instruct,55.51,96.85,0,zero_shot,max_steps=10,254,"3,764,723","14,822","576,098","2,268","4,340,821",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
99
+ ReAct-Pro*,AQuA,Internllm2_5-7B,40.94,96.85,0,zero_shot,max_steps=10,254,"3,592,039","14,142","836,762","3,294","4,428,801",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
100
+ ReAct-Pro*,AQuA,Qwen2-1.5B-Instruct,25.59,96.06,0,zero_shot,max_steps=10,254,"4,555,858","17,936","516,146","2,032","5,072,004",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
101
+ ReAct-Pro*,AQuA,Qwen2-0.5B-Instruct,24.01,96.85,0,zero_shot,max_steps=10,254,6344167,"24,977",825920,"3,252","7,170,087",0.0000,2025/1/22,,,,,,,,,,,,,,,,,,,,
102
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
103
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
104
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
105
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
106
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
107
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
108
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
109
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
110
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
111
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
112
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
113
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
114
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
115
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
116
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
117
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
118
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
119
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
120
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
121
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
122
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
123
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
124
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
125
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
126
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
127
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
128
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
129
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
130
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
131
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
132
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
133
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
134
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
135
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
136
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
137
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
138
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
139
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
140
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
141
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
142
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
143
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
144
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
145
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
146
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
147
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
148
+ ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,