Minseok Bae committed on
Commit
2c24f05
·
1 Parent(s): b46b972

modified the evaluation pipelines.

Browse files
src/backend/model_operations.py CHANGED
@@ -6,10 +6,9 @@ import logging
6
  import numpy as np
7
  import pandas as pd
8
  import spacy
9
- # from transformers import AutoModelForCausalLM, AutoTokenizer
10
  from sentence_transformers import CrossEncoder
11
- import litellm
12
  from litellm import completion
 
13
 
14
  import src.backend.util as util
15
  import src.envs as envs
@@ -23,8 +22,6 @@ nlp = spacy.load("en_core_web_sm")
23
 
24
  os.environ["HUGGINGFACE_API_KEY"] = envs.TOKEN
25
 
26
- litellm.set_verbose=True
27
-
28
 
29
  def load_evaluation_model(model_path):
30
  """Load the evaluation model from the given path
@@ -105,7 +102,7 @@ class SummaryGenerator:
105
  source, summary, dataset = [], [], []
106
  exceptions = []
107
 
108
- for index, row in df.iterrows():
109
  _source = row['text']
110
  _dataset = row['dataset']
111
 
@@ -129,11 +126,12 @@ class SummaryGenerator:
129
  exceptions.append(index)
130
  break
131
 
132
- summary.append(_summary)
133
- source.append(_source)
134
- dataset.append(_dataset)
135
 
136
- time.sleep(1)
 
137
 
138
  self.summaries_df = pd.DataFrame(list(zip(source, summary, dataset)),
139
  columns=["source", "summary", "dataset"])
@@ -147,26 +145,28 @@ class SummaryGenerator:
147
  """
148
  Compute the average length of non-empty summaries using SpaCy.
149
  """
150
- total_words = 0
151
- count = 0
152
 
153
  for summary in self.summaries_df['summary']:
154
- if summary != "":
155
  doc = nlp(summary)
156
  words = [token.text for token in doc if token.is_alpha]
157
- total_words += len(words)
158
- count += 1
159
 
160
- self.avg_length = 0 if count == 0 else total_words / count
161
 
162
  def _compute_answer_rate(self):
163
  """
164
  Compute the rate of non-empty summaries.
165
  """
166
- non_empty_count = sum(1 for summary in self.summaries_df['summary'] if summary)
167
- total_rows = len(self.summaries_df)
 
 
168
 
169
- self.answer_rate = 0 if total_rows == 0 else non_empty_count / total_rows
170
 
171
 
172
  class EvaluationModel:
@@ -193,7 +193,7 @@ class EvaluationModel:
193
 
194
  def evaluate_hallucination(self, summaries_df):
195
  """
196
- Evaluate the hallucination rate in summaries. This method updates the 'scores' attribute
197
  of the instance with the computed scores.
198
 
199
  Args:
@@ -202,14 +202,24 @@ class EvaluationModel:
202
  Returns:
203
  list: List of hallucination scores. Also updates the 'scores' attribute of the instance.
204
  """
 
205
  source_summary_pairs = util.create_pairs(summaries_df)
206
- try:
207
- scores = self.model.predict(source_summary_pairs)
208
- self.scores = scores
209
- return self.scores
210
- except Exception as e:
211
- logging.error(f"Error evaluating hallucination: {e}")
212
- raise
 
 
 
 
 
 
 
 
 
213
 
214
  def compute_factual_consistency_rate(self, threshold=0.5):
215
  """
@@ -240,4 +250,3 @@ class EvaluationModel:
240
  self.hallucination_rate = 100 - self.factual_consistency_rate
241
 
242
  return self.factual_consistency_rate
243
-
 
6
  import numpy as np
7
  import pandas as pd
8
  import spacy
 
9
  from sentence_transformers import CrossEncoder
 
10
  from litellm import completion
11
+ from tqdm import tqdm
12
 
13
  import src.backend.util as util
14
  import src.envs as envs
 
22
 
23
  os.environ["HUGGINGFACE_API_KEY"] = envs.TOKEN
24
 
 
 
25
 
26
  def load_evaluation_model(model_path):
27
  """Load the evaluation model from the given path
 
102
  source, summary, dataset = [], [], []
103
  exceptions = []
104
 
105
+ for index, row in tqdm(df.iterrows(), total=df.shape[0]):
106
  _source = row['text']
107
  _dataset = row['dataset']
108
 
 
126
  exceptions.append(index)
127
  break
128
 
129
+ summary.append(_summary)
130
+ source.append(_source)
131
+ dataset.append(_dataset)
132
 
133
+ # Sleep to prevent hitting rate limits too frequently
134
+ time.sleep(1)
135
 
136
  self.summaries_df = pd.DataFrame(list(zip(source, summary, dataset)),
137
  columns=["source", "summary", "dataset"])
 
145
  """
146
  Compute the average number of alphabetic tokens in valid summaries (per util.is_summary_valid) using spaCy.
147
  """
148
+ total_word_count = 0
149
+ total_count = 0
150
 
151
  for summary in self.summaries_df['summary']:
152
+ if util.is_summary_valid(summary):
153
  doc = nlp(summary)
154
  words = [token.text for token in doc if token.is_alpha]
155
+ total_word_count += len(words)
156
+ total_count += 1
157
 
158
+ self.avg_length = 0 if total_count == 0 else total_word_count / total_count
159
 
160
  def _compute_answer_rate(self):
161
  """
162
  Compute the rate of non-empty summaries.
163
  """
164
+ valid_count = sum(1 for summary in self.summaries_df['summary']
165
+ if util.is_summary_valid(summary))
166
+
167
+ total_count = len(self.summaries_df)
168
 
169
+ self.answer_rate = 0 if total_count == 0 else valid_count / total_count
170
 
171
 
172
  class EvaluationModel:
 
193
 
194
  def evaluate_hallucination(self, summaries_df):
195
  """
196
+ Evaluate the hallucination rate in summaries. Updates the 'scores' attribute
197
  of the instance with the computed scores.
198
 
199
  Args:
 
202
  Returns:
203
  list: List of hallucination scores. Also updates the 'scores' attribute of the instance.
204
  """
205
+ hem_scores = []
206
  source_summary_pairs = util.create_pairs(summaries_df)
207
+
208
+ for doc, summary in tqdm(source_summary_pairs, desc="Evaluating hallucinations"):
209
+ if util.is_summary_valid(summary):
210
+ try:
211
+ score = self.model.predict([doc, summary])[0]
212
+ if not isinstance(score, float):
213
+ logging.warning(f"Score type mismatch: Expected float, got {type(score)}.")
214
+ continue
215
+ hem_scores.append(score)
216
+ except Exception as e:
217
+ logging.error(f"Error while running HEM: {e}")
218
+ raise
219
+
220
+ self.scores = hem_scores
221
+ return hem_scores
222
+
223
 
224
  def compute_factual_consistency_rate(self, threshold=0.5):
225
  """
 
250
  self.hallucination_rate = 100 - self.factual_consistency_rate
251
 
252
  return self.factual_consistency_rate
 
src/backend/util.py CHANGED
@@ -1,3 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  def create_pairs(df):
2
  """
3
  Creates pairs of source and summary from the dataframe.
 
1
def is_summary_valid(summary: str, min_words: int = 5) -> bool:
    """
    Checks if the summary is valid.

    A summary is valid if it is a string containing at least ``min_words``
    whitespace-separated words. Non-string values (for example ``None`` or
    NaN) are never valid.

    Args:
        summary (str): The summary to check.
        min_words (int): Minimum number of whitespace-separated words
            required. Defaults to 5.

    Returns:
        bool: True if the summary is valid, False otherwise.
    """
    # Guard clause: only strings can be split into words.
    if not isinstance(summary, str):
        return False
    return len(summary.split()) >= min_words
18
+
19
+
20
  def create_pairs(df):
21
  """
22
  Creates pairs of source and summary from the dataframe.