abdalrahmanshahrour committed
Commit 779bd31
1 Parent(s): d4ecab0
Files changed (1)
  1. summarize.py +0 -65
summarize.py CHANGED
@@ -99,71 +99,6 @@ def get_results(text, model_selected, num_beams, length_penalty,number_of_senten
             length_penalty=length_penalty,
             no_repeat_ngram_size = 3)[0]['generated_text']
         logger.info('auto-arabic-summarization')
-
-    elif model_selected == 'BERT2BERT':
-
-        model_name="malmarjeh/bert2bert"
-        preprocessor = ArabertPreprocessor(model_name="")
-
-        tokenizer = AutoTokenizer.from_pretrained(model_name)
-        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
-        pipeline1 = pipeline("text2text-generation",model=model,tokenizer=tokenizer)
-        result = pipeline1(text,
-            pad_token_id= tokenizer.eos_token_id,
-            num_beams=num_beams,
-            repetition_penalty=3.0,
-            max_length=200,
-            length_penalty=length_penalty,
-            no_repeat_ngram_size = 3)[0]['generated_text']
-        logger.info('BERT2BERT')
-
-    elif model_selected == "xlmroberta2xlmroberta":
-        model_name="ahmeddbahaa/xlmroberta2xlmroberta-finetune-summarization-ar"
-        preprocessor = ArabertPreprocessor(model_name="")
-
-        tokenizer = AutoTokenizer.from_pretrained(model_name)
-        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
-        pipeline1 = pipeline("text2text-generation",model=model,tokenizer=tokenizer)
-        result = pipeline1(text,
-            pad_token_id= tokenizer.eos_token_id,
-            num_beams=num_beams,
-            repetition_penalty=3.0,
-            max_length=200,
-            length_penalty=length_penalty,
-            no_repeat_ngram_size = 3)[0]['generated_text']
-        logger.info('xlmroberta2xlmroberta')
-
-    elif model_selected == "nltk_summarizer":
-        # number_of_sentence = 3
-        stopWords = set(nltk.corpus.stopwords.words("arabic") + nltk.corpus.stopwords.words("english"))
-        word_frequencies = {}
-        for word in nltk.word_tokenize(text):
-            if word not in stopWords:
-                if word not in punctuation:
-                    if word not in word_frequencies.keys():
-                        word_frequencies[word] = 1
-                    else:
-                        word_frequencies[word] += 1
-
-        maximum_frequncy = max(list(word_frequencies.values()),default=3)
-
-        for word in word_frequencies.keys():
-            word_frequencies[word] = (word_frequencies[word]/maximum_frequncy)
-
-        sentence_list = nltk.sent_tokenize(text)
-        sentence_scores = {}
-        for sent in sentence_list:
-            for word in nltk.word_tokenize(sent.lower()):
-                if word in word_frequencies.keys():
-                    if len(sent.split(' ')) < 30:
-                        if sent not in sentence_scores.keys():
-                            sentence_scores[sent] = word_frequencies[word]
-                        else:
-                            sentence_scores[sent] += word_frequencies[word]
-
-        summary_sentences = heapq.nlargest(number_of_sentence, sentence_scores, key=sentence_scores.get)
-
-        result = ' '.join(summary_sentences)
     else:
         result = "الرجاء اختيار نموذج"
 
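For reference, the removed "nltk_summarizer" branch can be reproduced as a small standalone function. The sketch below mirrors the deleted logic (word-frequency scoring, sentence ranking via heapq.nlargest) under a few assumptions: punctuation comes from the string module, the NLTK punkt and stopwords data are available, and the function name extractive_summary is illustrative rather than taken from the repository, where the same logic lives inline in get_results().

# Standalone sketch of the removed "nltk_summarizer" branch (assumptions noted above).
import heapq
from string import punctuation

import nltk

nltk.download("punkt", quiet=True)       # sentence/word tokenizer models
nltk.download("stopwords", quiet=True)   # Arabic + English stop word lists

def extractive_summary(text, number_of_sentence=3):
    # Illustrative helper; the app keeps this logic inline inside get_results().
    stop_words = set(
        nltk.corpus.stopwords.words("arabic") + nltk.corpus.stopwords.words("english")
    )

    # 1. Raw frequency of every non-stopword, non-punctuation token.
    word_frequencies = {}
    for word in nltk.word_tokenize(text):
        if word not in stop_words and word not in punctuation:
            word_frequencies[word] = word_frequencies.get(word, 0) + 1

    # 2. Normalise by the most frequent token (default=3 guards empty input,
    #    as in the removed code).
    maximum_frequency = max(word_frequencies.values(), default=3)
    for word in word_frequencies:
        word_frequencies[word] = word_frequencies[word] / maximum_frequency

    # 3. Score sentences shorter than 30 words by summing their word weights
    #    (the removed code lower-cases tokens at this step; kept as-is).
    sentence_scores = {}
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent.lower()):
            if word in word_frequencies and len(sent.split(' ')) < 30:
                sentence_scores[sent] = sentence_scores.get(sent, 0) + word_frequencies[word]

    # 4. Keep the highest-scoring sentences and join them into the summary.
    summary_sentences = heapq.nlargest(
        number_of_sentence, sentence_scores, key=sentence_scores.get
    )
    return ' '.join(summary_sentences)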