Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Miaoran000
committed on
Commit
·
e071b26
1
Parent(s):
5a86006
update for HF HHEM2.1
Browse files- .gitignore +1 -0
- src/backend/evaluate_model.py +3 -20
- src/backend/model_operations.py +75 -71
- src/envs.py +2 -2
.gitignore
CHANGED
@@ -16,6 +16,7 @@ eval-results-bk/
|
|
16 |
eval-results-bk_hhem21/
|
17 |
eval-results_hhem21/
|
18 |
hhem21_server/
|
|
|
19 |
|
20 |
src/assets/model_counts.html
|
21 |
|
|
|
16 |
eval-results-bk_hhem21/
|
17 |
eval-results_hhem21/
|
18 |
hhem21_server/
|
19 |
+
leaderboard_results/
|
20 |
|
21 |
src/assets/model_counts.html
|
22 |
|
src/backend/evaluate_model.py
CHANGED
@@ -56,8 +56,8 @@ class Evaluator:
|
|
56 |
self.write_out = write_out
|
57 |
self.output_base_path = output_base_path
|
58 |
try:
|
59 |
-
self.summary_generator = SummaryGenerator(model, revision)
|
60 |
-
self.eval_model = EvaluationModel(envs.HEM_PATH)
|
61 |
except Exception as e:
|
62 |
logging.error(f"Error initializing Evaluator: {e}")
|
63 |
raise
|
@@ -72,9 +72,6 @@ class Evaluator:
|
|
72 |
"""
|
73 |
try:
|
74 |
df = pd.read_csv(envs.DATASET_PATH)
|
75 |
-
# print(envs.DATASET_PATH)
|
76 |
-
# print(df.shape)
|
77 |
-
# print(df.iloc[-1])
|
78 |
self.generated_summaries_df = self.summary_generator.generate_summaries(df, save_path=f"generation_results/{self.model}.csv")
|
79 |
|
80 |
avg_summary_len = self.summary_generator.avg_length
|
@@ -103,7 +100,7 @@ class Evaluator:
|
|
103 |
print('Updating result files')
|
104 |
leaderboard_path = os.getcwd() # the path of leaderboard folder
|
105 |
print(leaderboard_path)
|
106 |
-
working_path = os.path.join(leaderboard_path, '
|
107 |
if not os.path.exists(working_path):
|
108 |
logging.error(f"Need to first download the results from google drive to the learderboard folder")
|
109 |
raise
|
@@ -124,19 +121,5 @@ class Evaluator:
|
|
124 |
leaderboard_summaries_df.to_csv(os.path.join(working_path, 'leaderboard_summaries.csv'), mode='a', index=False, header=False)
|
125 |
print('leaderboard_summaries.csv has been updated')
|
126 |
|
127 |
-
# update leaderboard_summaries_with_scores.csv
|
128 |
-
# BUG: get error when opening the file
|
129 |
-
existing_df = pd.read_csv(os.path.join(working_path, 'leaderboard_summaries_with_scores.csv'),
|
130 |
-
encoding='utf-8', sep=",", quotechar='"', quoting=2)
|
131 |
-
print(existing_df.shape)
|
132 |
-
score_doc = set(existing_df['model'].values.tolist())
|
133 |
-
print(score_doc)
|
134 |
-
mask = existing_df['model'] == self.model
|
135 |
-
existing_df = existing_df[~mask]
|
136 |
-
# get new result
|
137 |
-
leaderboard_summaries_with_scores_df = pd.DataFrame.from_dict(self.eval_results)
|
138 |
-
leaderboard_summaries_with_scores_df.insert(3, "model", [self.model]*leaderboard_summaries_with_scores_df.shape[0])
|
139 |
-
leaderboard_summaries_with_scores_df.to_csv(os.path.join(working_path, 'leaderboard_summaries_with_scores.csv'), mode='a', index=False, header=False)
|
140 |
-
print('leaderboard_summaries_with_scores.csv has been updated')
|
141 |
|
142 |
|
|
|
56 |
self.write_out = write_out
|
57 |
self.output_base_path = output_base_path
|
58 |
try:
|
59 |
+
self.summary_generator = SummaryGenerator(model, revision, self.device)
|
60 |
+
self.eval_model = EvaluationModel(envs.HEM_PATH, self.device)
|
61 |
except Exception as e:
|
62 |
logging.error(f"Error initializing Evaluator: {e}")
|
63 |
raise
|
|
|
72 |
"""
|
73 |
try:
|
74 |
df = pd.read_csv(envs.DATASET_PATH)
|
|
|
|
|
|
|
75 |
self.generated_summaries_df = self.summary_generator.generate_summaries(df, save_path=f"generation_results/{self.model}.csv")
|
76 |
|
77 |
avg_summary_len = self.summary_generator.avg_length
|
|
|
100 |
print('Updating result files')
|
101 |
leaderboard_path = os.getcwd() # the path of leaderboard folder
|
102 |
print(leaderboard_path)
|
103 |
+
working_path = os.path.join(leaderboard_path, 'leaderboard_results')
|
104 |
if not os.path.exists(working_path):
|
105 |
logging.error(f"Need to first download the results from google drive to the learderboard folder")
|
106 |
raise
|
|
|
121 |
leaderboard_summaries_df.to_csv(os.path.join(working_path, 'leaderboard_summaries.csv'), mode='a', index=False, header=False)
|
122 |
print('leaderboard_summaries.csv has been updated')
|
123 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
124 |
|
125 |
|
src/backend/model_operations.py
CHANGED
@@ -11,11 +11,8 @@ import pandas as pd
|
|
11 |
import spacy
|
12 |
from sentence_transformers import CrossEncoder
|
13 |
import litellm
|
14 |
-
# from litellm import completion
|
15 |
from tqdm import tqdm
|
16 |
-
from transformers import AutoTokenizer, AutoModelForCausalLM,
|
17 |
-
# from accelerate import PartialState
|
18 |
-
# from accelerate.inference import prepare_pippy
|
19 |
import torch
|
20 |
import cohere
|
21 |
from openai import OpenAI
|
@@ -41,20 +38,6 @@ nlp = spacy.load("en_core_web_sm")
|
|
41 |
|
42 |
os.environ["HUGGINGFACE_API_KEY"] = envs.TOKEN
|
43 |
|
44 |
-
|
45 |
-
def load_evaluation_model(model_path):
|
46 |
-
"""Load the evaluation model from the given path
|
47 |
-
|
48 |
-
Args:
|
49 |
-
model_path (str): Path to the evaluation model
|
50 |
-
|
51 |
-
Returns:
|
52 |
-
CrossEncoder: The evaluation model
|
53 |
-
"""
|
54 |
-
model = CrossEncoder(model_path)
|
55 |
-
return model
|
56 |
-
|
57 |
-
|
58 |
class ModelLoadingException(Exception):
|
59 |
"""Exception raised for errors in loading a model.
|
60 |
|
@@ -81,7 +64,7 @@ class SummaryGenerator:
|
|
81 |
answer_rate (float): Rate of non-empty summaries.
|
82 |
"""
|
83 |
|
84 |
-
def __init__(self, model_id, revision):
|
85 |
"""
|
86 |
Initializes the SummaryGenerator with a model.
|
87 |
|
@@ -94,6 +77,7 @@ class SummaryGenerator:
|
|
94 |
self.api_base = f"https://api-inference.huggingface.co/models/{model_id}"
|
95 |
self.summaries_df = pd.DataFrame()
|
96 |
self.revision = revision
|
|
|
97 |
self.avg_length = None
|
98 |
self.answer_rate = None
|
99 |
self.exceptions = None
|
@@ -206,10 +190,9 @@ class SummaryGenerator:
|
|
206 |
|
207 |
payload = {
|
208 |
"model": self.model_id,
|
209 |
-
# "max_tokens": 4096,
|
210 |
'max_new_tokens': 250,
|
211 |
"temperature": 0.0,
|
212 |
-
|
213 |
}
|
214 |
payload['messages'] = [{"role": "system", "content": system_prompt},
|
215 |
{"role": "user", "content": user_prompt}]
|
@@ -365,39 +348,40 @@ class SummaryGenerator:
|
|
365 |
|
366 |
# Using HF API or download checkpoints
|
367 |
elif self.local_model is None and self.local_pipeline is None:
|
368 |
-
try: # try use HuggingFace API
|
369 |
-
|
370 |
-
|
371 |
-
|
372 |
-
|
373 |
-
|
374 |
-
|
375 |
-
|
376 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
377 |
)
|
378 |
-
|
379 |
-
|
380 |
-
print(
|
381 |
-
|
382 |
-
|
383 |
-
|
384 |
-
wait_time = 300
|
385 |
-
current_time = datetime.now().strftime('%H:%M:%S')
|
386 |
-
print(f"Rate limit hit at {current_time}. Waiting for 5 minutes before retrying...")
|
387 |
-
time.sleep(wait_time)
|
388 |
-
else:
|
389 |
-
if using_pipeline:
|
390 |
-
self.local_pipeline = pipeline(
|
391 |
-
"text-generation",
|
392 |
-
model=self.model_id,
|
393 |
-
model_kwargs={"torch_dtype": torch.bfloat16},
|
394 |
-
device_map="auto",
|
395 |
-
)
|
396 |
-
else:
|
397 |
-
self.tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf" if 'openelm' in self.model_id.lower() else self.model_id, trust_remote_code=True)
|
398 |
-
print("Tokenizer loaded")
|
399 |
-
self.local_model = AutoModelForCausalLM.from_pretrained(self.model_id, trust_remote_code=True, device_map="auto", torch_dtype="auto")
|
400 |
-
print("Local model loaded")
|
401 |
|
402 |
|
403 |
# Using local model/pipeline
|
@@ -438,7 +422,7 @@ class SummaryGenerator:
|
|
438 |
prompt = self.tokenizer.apply_chat_template(messages,add_generation_prompt=True, tokenize=False)
|
439 |
# print(prompt)
|
440 |
# print('-'*50)
|
441 |
-
input_ids = self.tokenizer(prompt, return_tensors="pt").to(
|
442 |
with torch.no_grad():
|
443 |
outputs = self.local_model.generate(**input_ids, max_new_tokens=250, do_sample=True, temperature=0.01, pad_token_id=self.tokenizer.eos_token_id)
|
444 |
if 'glm' in self.model_id.lower():
|
@@ -451,6 +435,8 @@ class SummaryGenerator:
|
|
451 |
result = result.split("### Assistant:\n")[-1]
|
452 |
|
453 |
else:
|
|
|
|
|
454 |
result = result.replace(prompt.strip(), '')
|
455 |
|
456 |
print(result)
|
@@ -494,17 +480,43 @@ class EvaluationModel:
|
|
494 |
hallucination_rate (float): Rate of hallucination in summaries.
|
495 |
"""
|
496 |
|
497 |
-
def __init__(self, model_path):
|
498 |
"""
|
499 |
Initializes the EvaluationModel with a CrossEncoder model.
|
500 |
|
501 |
Args:
|
502 |
model_path (str): Path to the CrossEncoder model.
|
503 |
"""
|
504 |
-
self.model =
|
|
|
|
|
505 |
self.scores = []
|
506 |
self.factual_consistency_rate = None
|
507 |
self.hallucination_rate = None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
508 |
|
509 |
def evaluate_hallucination(self, summaries_df):
|
510 |
"""
|
@@ -525,22 +537,14 @@ class EvaluationModel:
|
|
525 |
for doc, summary in source_summary_pairs:
|
526 |
if util.is_summary_valid(summary):
|
527 |
try:
|
528 |
-
# summary_pieces = summary.split('\n')
|
529 |
-
# summary = summary_pieces[0] if len(summary_pieces[0].strip()) > 0 else summary_pieces[1]
|
530 |
summary = summary.replace('<bos>','').replace('<eos>','').strip()
|
531 |
-
score = self.
|
532 |
-
|
533 |
-
|
534 |
-
|
535 |
-
|
536 |
-
|
537 |
-
|
538 |
-
# print inconsistent summaries for checking
|
539 |
-
if score < 0.5:
|
540 |
-
print(doc)
|
541 |
-
print('-'*10)
|
542 |
-
print(summary)
|
543 |
-
print('='*20)
|
544 |
hem_scores.append(score)
|
545 |
sources.append(doc)
|
546 |
summaries.append(summary)
|
|
|
11 |
import spacy
|
12 |
from sentence_transformers import CrossEncoder
|
13 |
import litellm
|
|
|
14 |
from tqdm import tqdm
|
15 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, AutoModelForTokenClassification
|
|
|
|
|
16 |
import torch
|
17 |
import cohere
|
18 |
from openai import OpenAI
|
|
|
38 |
|
39 |
os.environ["HUGGINGFACE_API_KEY"] = envs.TOKEN
|
40 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
41 |
class ModelLoadingException(Exception):
|
42 |
"""Exception raised for errors in loading a model.
|
43 |
|
|
|
64 |
answer_rate (float): Rate of non-empty summaries.
|
65 |
"""
|
66 |
|
67 |
+
def __init__(self, model_id, revision, device):
|
68 |
"""
|
69 |
Initializes the SummaryGenerator with a model.
|
70 |
|
|
|
77 |
self.api_base = f"https://api-inference.huggingface.co/models/{model_id}"
|
78 |
self.summaries_df = pd.DataFrame()
|
79 |
self.revision = revision
|
80 |
+
self.device = device
|
81 |
self.avg_length = None
|
82 |
self.answer_rate = None
|
83 |
self.exceptions = None
|
|
|
190 |
|
191 |
payload = {
|
192 |
"model": self.model_id,
|
|
|
193 |
'max_new_tokens': 250,
|
194 |
"temperature": 0.0,
|
195 |
+
|
196 |
}
|
197 |
payload['messages'] = [{"role": "system", "content": system_prompt},
|
198 |
{"role": "user", "content": user_prompt}]
|
|
|
348 |
|
349 |
# Using HF API or download checkpoints
|
350 |
elif self.local_model is None and self.local_pipeline is None:
|
351 |
+
# try: # try use HuggingFace API
|
352 |
+
# print('** using huggingface api')
|
353 |
+
# response = litellm.completion(
|
354 |
+
# model=self.model,
|
355 |
+
# messages=[{"role": "system", "content": system_prompt},
|
356 |
+
# {"role": "user", "content": user_prompt}],
|
357 |
+
# temperature=0.0,
|
358 |
+
# max_tokens=250,
|
359 |
+
# api_base=self.api_base,
|
360 |
+
# )
|
361 |
+
# result = response['choices'][0]['message']['content']
|
362 |
+
# result = result.split('<|im_end|>')[0]
|
363 |
+
# print(result)
|
364 |
+
# return result
|
365 |
+
# except Exception as e:
|
366 |
+
# if 'Rate limit reached' in str(e) :
|
367 |
+
# wait_time = 300
|
368 |
+
# current_time = datetime.now().strftime('%H:%M:%S')
|
369 |
+
# print(f"Rate limit hit at {current_time}. Waiting for 5 minutes before retrying...")
|
370 |
+
# time.sleep(wait_time)
|
371 |
+
# else:
|
372 |
+
if using_pipeline:
|
373 |
+
self.local_pipeline = pipeline(
|
374 |
+
"text-generation",
|
375 |
+
model=self.model_id,
|
376 |
+
model_kwargs={"torch_dtype": torch.bfloat16},
|
377 |
+
device_map="auto",
|
378 |
)
|
379 |
+
else:
|
380 |
+
self.tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf" if 'openelm' in self.model_id.lower() else self.model_id, trust_remote_code=True)
|
381 |
+
print("Tokenizer loaded")
|
382 |
+
self.local_model = AutoModelForCausalLM.from_pretrained(self.model_id, trust_remote_code=True, device_map="auto", torch_dtype="auto")
|
383 |
+
print(self.local_model.device)
|
384 |
+
print("Local model loaded")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
385 |
|
386 |
|
387 |
# Using local model/pipeline
|
|
|
422 |
prompt = self.tokenizer.apply_chat_template(messages,add_generation_prompt=True, tokenize=False)
|
423 |
# print(prompt)
|
424 |
# print('-'*50)
|
425 |
+
input_ids = self.tokenizer(prompt, return_tensors="pt").to(self.device)
|
426 |
with torch.no_grad():
|
427 |
outputs = self.local_model.generate(**input_ids, max_new_tokens=250, do_sample=True, temperature=0.01, pad_token_id=self.tokenizer.eos_token_id)
|
428 |
if 'glm' in self.model_id.lower():
|
|
|
435 |
result = result.split("### Assistant:\n")[-1]
|
436 |
|
437 |
else:
|
438 |
+
print(prompt)
|
439 |
+
print('-'*50)
|
440 |
result = result.replace(prompt.strip(), '')
|
441 |
|
442 |
print(result)
|
|
|
480 |
hallucination_rate (float): Rate of hallucination in summaries.
|
481 |
"""
|
482 |
|
483 |
+
def __init__(self, model_path, device):
|
484 |
"""
|
485 |
Initializes the EvaluationModel with a CrossEncoder model.
|
486 |
|
487 |
Args:
|
488 |
model_path (str): Path to the CrossEncoder model.
|
489 |
"""
|
490 |
+
self.model = AutoModelForTokenClassification.from_pretrained(model_path)
|
491 |
+
self.device = device
|
492 |
+
self.model.to(self.device)
|
493 |
self.scores = []
|
494 |
self.factual_consistency_rate = None
|
495 |
self.hallucination_rate = None
|
496 |
+
|
497 |
+
def predict(self, text_pairs):
|
498 |
+
"""Load LoRA adapters of HHEM and make predictions
|
499 |
+
All HHEM 2.1 settings, e.g., prompt template, are hardcoded in this function.
|
500 |
+
Args:
|
501 |
+
text_pairs: list of tuples, each tuple contains two strings (premise, hypothesis)
|
502 |
+
checkpoint: model ID on Hugging Face
|
503 |
+
"""
|
504 |
+
|
505 |
+
prompt = "<pad> Determine if the hypothesis is true given the premise?\n\nPremise: {text1}\n\nHypothesis: {text2}"
|
506 |
+
|
507 |
+
tokenizer = AutoTokenizer.from_pretrained('t5-base')
|
508 |
+
inputs = tokenizer(
|
509 |
+
[prompt.format(text1=pair[0], text2=pair[1]) for pair in text_pairs],
|
510 |
+
return_tensors='pt', padding='longest').to(self.device)
|
511 |
+
|
512 |
+
self.model.eval()
|
513 |
+
with torch.no_grad():
|
514 |
+
output = self.model(**inputs)
|
515 |
+
logits = output.logits
|
516 |
+
logits = logits[:,0,:] # get the logits on the first token
|
517 |
+
logits = torch.softmax(logits, dim=-1)
|
518 |
+
scores = [round(x, 5) for x in logits[:, 1].tolist()] # list of float
|
519 |
+
return scores
|
520 |
|
521 |
def evaluate_hallucination(self, summaries_df):
|
522 |
"""
|
|
|
537 |
for doc, summary in source_summary_pairs:
|
538 |
if util.is_summary_valid(summary):
|
539 |
try:
|
|
|
|
|
540 |
summary = summary.replace('<bos>','').replace('<eos>','').strip()
|
541 |
+
score = self.predict([(doc, summary)])[0]
|
542 |
+
# print(score)
|
543 |
+
# if score < 0.5:
|
544 |
+
# print(doc)
|
545 |
+
# print('-'*10)
|
546 |
+
# print(summary)
|
547 |
+
# print('='*20)
|
|
|
|
|
|
|
|
|
|
|
|
|
548 |
hem_scores.append(score)
|
549 |
sources.append(doc)
|
550 |
summaries.append(summary)
|
src/envs.py
CHANGED
@@ -23,10 +23,10 @@ EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
|
|
23 |
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') #"cpu"
|
24 |
API = HfApi(token=TOKEN)
|
25 |
|
26 |
-
LEADERBOARD_DATASET_PATH = "
|
27 |
DATASET_PATH = "src/datasets/leaderboard_dataset.csv"
|
28 |
SAMPLE_DATASET_PATH = "src/datasets/sample_dataset.csv"
|
29 |
-
HEM_PATH = 'vectara/
|
30 |
|
31 |
SYSTEM_PROMPT = "You are a chat bot answering questions using data. You must stick to the answers provided solely by the text in the passage provided."
|
32 |
USER_PROMPT = "You are asked the question 'Provide a concise summary of the following passage, covering the core pieces of information described': "
|
|
|
23 |
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') #"cpu"
|
24 |
API = HfApi(token=TOKEN)
|
25 |
|
26 |
+
LEADERBOARD_DATASET_PATH = "leaderboard_results/leaderboard_summaries.csv"
|
27 |
DATASET_PATH = "src/datasets/leaderboard_dataset.csv"
|
28 |
SAMPLE_DATASET_PATH = "src/datasets/sample_dataset.csv"
|
29 |
+
HEM_PATH = 'vectara/HHEM-2.1'
|
30 |
|
31 |
SYSTEM_PROMPT = "You are a chat bot answering questions using data. You must stick to the answers provided solely by the text in the passage provided."
|
32 |
USER_PROMPT = "You are asked the question 'Provide a concise summary of the following passage, covering the core pieces of information described': "
|