Miaoran000 committed on
Commit
5c4aa1e
·
1 Parent(s): 2aa9a75

minor updates

Browse files
.gitignore CHANGED
@@ -18,3 +18,7 @@ src/assets/model_counts.html
18
 
19
  generation_results/
20
  Hallucination Leaderboard Results
 
 
 
 
 
18
 
19
  generation_results/
20
  Hallucination Leaderboard Results
21
+ dataset_stats.py
22
+
23
+ get_comparison.py
24
+ GPT-4-Turbo_v.s._GPT-4o.csv
requirements.txt CHANGED
@@ -14,4 +14,9 @@ requests==2.28.2
14
  tqdm==4.65.0
15
  transformers==4.35.2
16
  tokenizers>=0.15.0
17
- sentence-transformers==2.2.2
 
 
 
 
 
 
14
  tqdm==4.65.0
15
  transformers==4.35.2
16
  tokenizers>=0.15.0
17
+ sentence-transformers==2.2.2
18
+ google-generativeai
19
+ replicate
20
+ anthropic
21
+ openai
22
+ cohere
src/backend/evaluate_model.py CHANGED
@@ -110,11 +110,14 @@ class Evaluator:
110
 
111
  source_summary_df = self.generated_summaries_df[["source", "summary"]]
112
 
113
- # #update leaderboard_summaries.csv
114
- # #first remove previous results for the current model
115
- # existing_df = pd.read_csv(os.path.join(working_path, 'leaderboard_summaries.csv'), encoding='utf-8', sep="\t")
116
  # mask = existing_df['model'] == self.model
117
  # existing_df = existing_df[~mask]
 
 
 
118
  # # get new result
119
  leaderboard_summaries_df = source_summary_df
120
  leaderboard_summaries_df.insert(2, "model", [self.model]*leaderboard_summaries_df.shape[0])
@@ -124,12 +127,22 @@ class Evaluator:
124
  # update leaderboard_summaries_with_scores.csv
125
  # BUG: get error when opening the file
126
  # existing_df = pd.read_csv(os.path.join(working_path, 'leaderboard_summaries_with_scores.csv'),
127
- # encoding='utf-8', sep=",", on_bad_lines='warn', quotechar='"', quoting=2)
128
  # print(existing_df.shape)
 
 
129
  # mask = existing_df['model'] == self.model
130
  # existing_df = existing_df[~mask]
131
- # get new result
132
  leaderboard_summaries_with_scores_df = pd.DataFrame.from_dict(self.eval_results)
133
  leaderboard_summaries_with_scores_df.insert(3, "model", [self.model]*leaderboard_summaries_with_scores_df.shape[0])
134
  leaderboard_summaries_with_scores_df.to_csv(os.path.join(working_path, 'leaderboard_summaries_with_scores.csv'), mode='a', index=False, header=False)
135
- print('leaderboard_summaries_with_scores.csv has been updated')
 
 
 
 
 
 
 
 
 
110
 
111
  source_summary_df = self.generated_summaries_df[["source", "summary"]]
112
 
113
+ #update leaderboard_summaries.csv
114
+ #first remove previous results for the current model
115
+ # existing_df = pd.read_csv(os.path.join(working_path, 'leaderboard_summaries.csv'), encoding='utf-8')
116
  # mask = existing_df['model'] == self.model
117
  # existing_df = existing_df[~mask]
118
+ # print(existing_df.shape)
119
+ # summary_doc = set(existing_df['model'].values.tolist())
120
+ # print(summary_doc)
121
  # # get new result
122
  leaderboard_summaries_df = source_summary_df
123
  leaderboard_summaries_df.insert(2, "model", [self.model]*leaderboard_summaries_df.shape[0])
 
127
  # update leaderboard_summaries_with_scores.csv
128
  # BUG: get error when opening the file
129
  # existing_df = pd.read_csv(os.path.join(working_path, 'leaderboard_summaries_with_scores.csv'),
130
+ # encoding='utf-8', sep=",", quotechar='"', quoting=2)
131
  # print(existing_df.shape)
132
+ # score_doc = set(existing_df['model'].values.tolist())
133
+ # print(score_doc)
134
  # mask = existing_df['model'] == self.model
135
  # existing_df = existing_df[~mask]
136
+ # # get new result
137
  leaderboard_summaries_with_scores_df = pd.DataFrame.from_dict(self.eval_results)
138
  leaderboard_summaries_with_scores_df.insert(3, "model", [self.model]*leaderboard_summaries_with_scores_df.shape[0])
139
  leaderboard_summaries_with_scores_df.to_csv(os.path.join(working_path, 'leaderboard_summaries_with_scores.csv'), mode='a', index=False, header=False)
140
+ print('leaderboard_summaries_with_scores.csv has been updated')
141
+
142
+ # for model in summary_doc:
143
+ # if model not in score_doc:
144
+ # print(f"{model} records missing in leaderboard_summaries_with_scores.csv")
145
+
146
+ # for model in score_doc:
147
+ # if model not in summary_doc:
148
+ # print(f"{model} records missing in leaderboard_summaries.csv")
src/backend/model_operations.py CHANGED
@@ -13,18 +13,21 @@ from sentence_transformers import CrossEncoder
13
  import litellm
14
  # from litellm import completion
15
  from tqdm import tqdm
16
- from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, AutoConfig
17
  # from accelerate import PartialState
18
  # from accelerate.inference import prepare_pippy
19
  import torch
20
  import cohere
21
  from openai import OpenAI
 
 
 
22
  import google.generativeai as genai
23
 
24
  import src.backend.util as util
25
  import src.envs as envs
26
 
27
- litellm.set_verbose=False
28
 
29
  # Set up basic configuration for logging
30
  logging.basicConfig(level=logging.INFO,
@@ -123,15 +126,15 @@ class SummaryGenerator:
123
  break
124
  except Exception as e:
125
  if 'Rate limit reached' in str(e):
126
- wait_time = 3660
127
  current_time = datetime.now().strftime('%H:%M:%S')
128
- print(f"Rate limit hit at {current_time}. Waiting for 1 hour before retrying...")
129
  time.sleep(wait_time)
130
  elif 'is currently loading' in str(e):
131
  wait_time = 200
132
  print(f"Model is loading, wait for {wait_time}")
133
  time.sleep(wait_time)
134
- elif '429 Resource has been exhausted' in str(e): # for gemini models
135
  wait_time = 60
136
  print(f"Quota has reached, wait for {wait_time}")
137
  time.sleep(wait_time)
@@ -166,13 +169,14 @@ class SummaryGenerator:
166
  def generate_summary(self, system_prompt: str, user_prompt: str):
167
  # Using Together AI API
168
  using_together_api = False
169
- together_ai_api_models = ['mixtral', 'dbrx', 'wizardlm', 'llama-3']
170
  for together_ai_api_model in together_ai_api_models:
171
  if together_ai_api_model in self.model_id.lower():
172
  using_together_api = True
173
  break
174
  # if 'mixtral' in self.model_id.lower() or 'dbrx' in self.model_id.lower() or 'wizardlm' in self.model_id.lower(): # For mixtral and dbrx models, use Together AI API
175
  if using_together_api:
 
176
  # suffix = "completions" if ('mixtral' in self.model_id.lower() or 'base' in self.model_id.lower()) else "chat/completions"
177
  suffix = "chat/completions"
178
  url = f"https://api.together.xyz/v1/{suffix}"
@@ -184,14 +188,6 @@ class SummaryGenerator:
184
  "temperature": 0.0,
185
  # 'repetition_penalty': 1.1 if 'mixtral' in self.model_id.lower() else 1
186
  }
187
- # if 'mixtral' in self.model_id.lower():
188
- # # payload['prompt'] = user_prompt
189
- # # payload['prompt'] = "Write a summary of the following passage:\nPassage:\n" + user_prompt.split('Passage:\n')[-1] + '\n\nSummary:'
190
- # payload['prompt'] = 'You must stick to the passage provided. Provide a concise summary of the following passage, covering the core pieces of information described:\nPassage:\n' + user_prompt.split('Passage:\n')[-1] + '\n\nSummary:'
191
- # print(payload)
192
- # else:
193
- # payload['messages'] = [{"role": "system", "content": system_prompt},
194
- # {"role": "user", "content": user_prompt}]
195
  payload['messages'] = [{"role": "system", "content": system_prompt},
196
  {"role": "user", "content": user_prompt}]
197
  headers = {
@@ -201,6 +197,7 @@ class SummaryGenerator:
201
  }
202
 
203
  response = requests.post(url, json=payload, headers=headers)
 
204
  try:
205
  result = json.loads(response.text)
206
  # print(result)
@@ -219,14 +216,16 @@ class SummaryGenerator:
219
 
220
  # Using OpenAI API
221
  elif 'gpt' in self.model_id.lower():
222
- response = litellm.completion(
 
223
  model=self.model_id.replace('openai/',''),
224
  messages=[{"role": "system", "content": system_prompt},
225
  {"role": "user", "content": user_prompt}],
226
  temperature=0.0,
227
  max_tokens=250,
228
  )
229
- result = response['choices'][0]['message']['content']
 
230
  print(result)
231
  return result
232
 
@@ -258,10 +257,11 @@ class SummaryGenerator:
258
  "threshold": "BLOCK_NONE"
259
  },
260
  ]
261
- model = genai.GenerativeModel(model_name="gemini-1.5-pro-latest" if "gemini-1.5-pro" in self.model_id.lower() else self.model_id.lower().split('google/')[-1],
262
  generation_config=generation_config,
263
  system_instruction=system_prompt,
264
  safety_settings=safety_settings)
 
265
  convo = model.start_chat(history=[])
266
  convo.send_message(user_prompt)
267
  # print(convo.last)
@@ -269,39 +269,116 @@ class SummaryGenerator:
269
  print(result)
270
  return result
271
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
272
  # Using HF API or download checkpoints
273
  elif self.local_model is None:
 
 
 
 
 
 
 
 
 
 
 
274
  try: # try use HuggingFace API
275
-
276
  response = litellm.completion(
277
  model='command-r-plus' if 'command' in self.model else self.model,
278
  messages=[{"role": "system", "content": system_prompt},
279
  {"role": "user", "content": user_prompt}],
280
  temperature=0.0,
281
- max_tokens=1024,
282
  api_base=self.api_base,
283
  )
284
  result = response['choices'][0]['message']['content']
 
285
  return result
286
- except: # fail to call api. run it locally.
287
- self.tokenizer = AutoTokenizer.from_pretrained(self.model_id, trust_remote_code=True)
288
- print("Tokenizer loaded")
289
- self.local_model = AutoModelForCausalLM.from_pretrained(self.model_id, trust_remote_code=True, device_map="auto", torch_dtype="auto")
290
- print("Local model loaded")
 
 
 
 
 
 
291
 
292
  # Using local model
293
  if self.local_model: # cannot call API. using local model
294
- messages=[
295
- {"role": "system", "content": system_prompt}, # gemma-1.1 does not accept system role
296
- {"role": "user", "content": user_prompt}
297
- ],
298
- prompt = self.tokenizer.apply_chat_template(messages,add_generation_prompt=True, tokenize=False)
 
 
 
 
 
 
 
 
 
 
 
299
  print(prompt)
 
300
  input_ids = self.tokenizer(prompt, return_tensors="pt").to('cuda')
301
  with torch.no_grad():
302
  outputs = self.local_model.generate(**input_ids, max_new_tokens=250, do_sample=True, temperature=0.01, pad_token_id=self.tokenizer.eos_token_id)
303
  result = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
304
- result = result.replace(prompt[0], '')
 
 
 
 
305
  print(result)
306
  return result
307
 
@@ -371,14 +448,12 @@ class EvaluationModel:
371
  summaries = []
372
  source_summary_pairs = util.create_pairs(summaries_df)
373
 
374
- for doc, summary in tqdm(source_summary_pairs, desc="Evaluating hallucinations"):
375
  if util.is_summary_valid(summary):
376
  try:
377
  # summary_pieces = summary.split('\n')
378
  # summary = summary_pieces[0] if len(summary_pieces[0].strip()) > 0 else summary_pieces[1]
379
  summary = summary.replace('<bos>','').replace('<eos>','')
380
- # print([doc, summary])
381
- # print(self.model.predict([doc, summary]))
382
  score = self.model.predict([doc, summary])# [0]
383
  if not isinstance(score, float):
384
  try:
@@ -386,6 +461,12 @@ class EvaluationModel:
386
  except:
387
  logging.warning(f"Score type mismatch: Expected float, got {type(score)}.")
388
  continue
 
 
 
 
 
 
389
  hem_scores.append(score)
390
  sources.append(doc)
391
  summaries.append(summary)
 
13
  import litellm
14
  # from litellm import completion
15
  from tqdm import tqdm
16
+ from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, AutoConfig, pipeline
17
  # from accelerate import PartialState
18
  # from accelerate.inference import prepare_pippy
19
  import torch
20
  import cohere
21
  from openai import OpenAI
22
+ import anthropic
23
+ import replicate
24
+ # pip install -U google-generativeai
25
  import google.generativeai as genai
26
 
27
  import src.backend.util as util
28
  import src.envs as envs
29
 
30
+ litellm.set_verbose=True
31
 
32
  # Set up basic configuration for logging
33
  logging.basicConfig(level=logging.INFO,
 
126
  break
127
  except Exception as e:
128
  if 'Rate limit reached' in str(e):
129
+ wait_time = 300
130
  current_time = datetime.now().strftime('%H:%M:%S')
131
+ print(f"Rate limit hit at {current_time}. Waiting for 5 minutes before retrying...")
132
  time.sleep(wait_time)
133
  elif 'is currently loading' in str(e):
134
  wait_time = 200
135
  print(f"Model is loading, wait for {wait_time}")
136
  time.sleep(wait_time)
137
+ elif '429' in str(e): # for gemini models
138
  wait_time = 60
139
  print(f"Quota has reached, wait for {wait_time}")
140
  time.sleep(wait_time)
 
169
  def generate_summary(self, system_prompt: str, user_prompt: str):
170
  # Using Together AI API
171
  using_together_api = False
172
+ together_ai_api_models = ['mixtral', 'dbrx', 'wizardlm', 'llama-3', 'qwen'] #, 'mistralai'
173
  for together_ai_api_model in together_ai_api_models:
174
  if together_ai_api_model in self.model_id.lower():
175
  using_together_api = True
176
  break
177
  # if 'mixtral' in self.model_id.lower() or 'dbrx' in self.model_id.lower() or 'wizardlm' in self.model_id.lower(): # For mixtral and dbrx models, use Together AI API
178
  if using_together_api:
179
+ # print('using together api')
180
  # suffix = "completions" if ('mixtral' in self.model_id.lower() or 'base' in self.model_id.lower()) else "chat/completions"
181
  suffix = "chat/completions"
182
  url = f"https://api.together.xyz/v1/{suffix}"
 
188
  "temperature": 0.0,
189
  # 'repetition_penalty': 1.1 if 'mixtral' in self.model_id.lower() else 1
190
  }
 
 
 
 
 
 
 
 
191
  payload['messages'] = [{"role": "system", "content": system_prompt},
192
  {"role": "user", "content": user_prompt}]
193
  headers = {
 
197
  }
198
 
199
  response = requests.post(url, json=payload, headers=headers)
200
+ print(response)
201
  try:
202
  result = json.loads(response.text)
203
  # print(result)
 
216
 
217
  # Using OpenAI API
218
  elif 'gpt' in self.model_id.lower():
219
+ client = OpenAI()
220
+ response = client.chat.completions.create(
221
  model=self.model_id.replace('openai/',''),
222
  messages=[{"role": "system", "content": system_prompt},
223
  {"role": "user", "content": user_prompt}],
224
  temperature=0.0,
225
  max_tokens=250,
226
  )
227
+ # print(response)
228
+ result = response.choices[0].message.content
229
  print(result)
230
  return result
231
 
 
257
  "threshold": "BLOCK_NONE"
258
  },
259
  ]
260
+ model = genai.GenerativeModel(model_name=self.model_id.lower().split('google/')[-1],
261
  generation_config=generation_config,
262
  system_instruction=system_prompt,
263
  safety_settings=safety_settings)
264
+ # print(model)
265
  convo = model.start_chat(history=[])
266
  convo.send_message(user_prompt)
267
  # print(convo.last)
 
269
  print(result)
270
  return result
271
 
272
+ elif 'snowflake' in self.model_id.lower():
273
+ print("using replicate")
274
+ input = {
275
+ "prompt": user_prompt,
276
+ "temperature": 0,
277
+ "max_new_tokens": 250,
278
+ "stop_sequences": "<|im_end|>",
279
+ "prompt_template": f"<|im_start|>system\n{system_prompt}<|im_end|>\n" + "<|im_start|>user\n{prompt}<|im_end|>\n\n<|im_start|>assistant\n",
280
+ }
281
+ response = replicate.run(
282
+ self.model_id.lower(),
283
+ input=input
284
+ )
285
+ if isinstance(response, list):
286
+ response = ''.join(response)
287
+ print(response)
288
+ print()
289
+
290
+ return response
291
+
292
+ elif 'claude' in self.model_id.lower(): # using anthropic api
293
+ client = anthropic.Anthropic()
294
+ message = client.messages.create(
295
+ model=self.model_id.split('/')[-1],
296
+ max_tokens=250,
297
+ temperature=0,
298
+ system=system_prompt,
299
+ messages=[
300
+ {
301
+ "role": "user",
302
+ "content": [
303
+ {
304
+ "type": "text",
305
+ "text": user_prompt
306
+ }
307
+ ]
308
+ }
309
+ ]
310
+ )
311
+ result = message.content[0].text
312
+ print(result)
313
+ return result
314
+
315
  # Using HF API or download checkpoints
316
  elif self.local_model is None:
317
+ # response = litellm.completion(
318
+ # model='command-r-plus' if 'command' in self.model else self.model,
319
+ # messages=[{"role": "system", "content": system_prompt},
320
+ # {"role": "user", "content": user_prompt}],
321
+ # temperature=0.0,
322
+ # max_tokens=256,
323
+ # api_base=self.api_base,
324
+ # )
325
+ # result = response['choices'][0]['message']['content']
326
+ # print(result)
327
+ # return result
328
  try: # try use HuggingFace API
329
+ print('using huggingface api')
330
  response = litellm.completion(
331
  model='command-r-plus' if 'command' in self.model else self.model,
332
  messages=[{"role": "system", "content": system_prompt},
333
  {"role": "user", "content": user_prompt}],
334
  temperature=0.0,
335
+ max_tokens=250,
336
  api_base=self.api_base,
337
  )
338
  result = response['choices'][0]['message']['content']
339
+ print(result)
340
  return result
341
+ except Exception as e:
342
+ if 'Rate limit reached' in str(e):
343
+ wait_time = 300
344
+ current_time = datetime.now().strftime('%H:%M:%S')
345
+ print(f"Rate limit hit at {current_time}. Waiting for 5 minutes before retrying...")
346
+ time.sleep(wait_time)
347
+ else:
348
+ self.tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf" if 'openelm' in self.model_id.lower() else self.model_id, trust_remote_code=True)
349
+ print("Tokenizer loaded")
350
+ self.local_model = AutoModelForCausalLM.from_pretrained(self.model_id, trust_remote_code=True, device_map="auto", torch_dtype="auto")
351
+ print("Local model loaded")
352
 
353
  # Using local model
354
  if self.local_model: # cannot call API. using local model
355
+ if 'gemma' in self.model_id.lower() or 'mistral-7b' in self.model_id.lower():
356
+ messages=[
357
+ # gemma-1.1, mistral-7b does not accept system role
358
+ {"role": "user", "content": system_prompt + ' ' + user_prompt}
359
+ ]
360
+ prompt = self.tokenizer.apply_chat_template(messages,add_generation_prompt=True, tokenize=False)
361
+
362
+ elif 'phi-2' in self.model_id.lower():
363
+ prompt = system_prompt + '\n' + user_prompt
364
+
365
+ else:
366
+ messages=[
367
+ {"role": "system", "content": system_prompt}, # gemma-1.1, mistral-7b does not accept system role
368
+ {"role": "user", "content": user_prompt}
369
+ ]
370
+ prompt = self.tokenizer.apply_chat_template(messages,add_generation_prompt=True, tokenize=False)
371
  print(prompt)
372
+ print('-'*50)
373
  input_ids = self.tokenizer(prompt, return_tensors="pt").to('cuda')
374
  with torch.no_grad():
375
  outputs = self.local_model.generate(**input_ids, max_new_tokens=250, do_sample=True, temperature=0.01, pad_token_id=self.tokenizer.eos_token_id)
376
  result = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
377
+ if 'gemma-2' in self.model_id.lower():
378
+ result = result.split(user_prompt + '\nmodel')[-1].strip()
379
+ else:
380
+ result = result.replace(prompt.strip(), '')
381
+
382
  print(result)
383
  return result
384
 
 
448
  summaries = []
449
  source_summary_pairs = util.create_pairs(summaries_df)
450
 
451
+ for doc, summary in source_summary_pairs:
452
  if util.is_summary_valid(summary):
453
  try:
454
  # summary_pieces = summary.split('\n')
455
  # summary = summary_pieces[0] if len(summary_pieces[0].strip()) > 0 else summary_pieces[1]
456
  summary = summary.replace('<bos>','').replace('<eos>','')
 
 
457
  score = self.model.predict([doc, summary])# [0]
458
  if not isinstance(score, float):
459
  try:
 
461
  except:
462
  logging.warning(f"Score type mismatch: Expected float, got {type(score)}.")
463
  continue
464
+ # print inconsistent summaries for checking
465
+ if score < 0.5:
466
+ print(doc)
467
+ print('-'*10)
468
+ print(summary)
469
+ print('='*20)
470
  hem_scores.append(score)
471
  sources.append(doc)
472
  summaries.append(summary)