raphael825 committed on
Commit
a5d3b8f
·
1 Parent(s): 9d6f9cb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +138 -185
app.py CHANGED
@@ -1,8 +1,8 @@
 
1
  import requests
2
  import json
3
  import re
4
  import gradio as gr
5
- import Model
6
  from pytube import YouTube
7
  import whisper
8
  import time
@@ -18,22 +18,19 @@ from langchain.chains.combine_documents.stuff import StuffDocumentsChain
18
  from PIL import Image
19
  from io import BytesIO
20
 
21
-
22
  openai_api_key = ""
23
 
24
 
25
-
26
  # for API
27
 
28
  # # ==
29
 
30
  def youtube_text(link):
31
  yt = YouTube(link)
32
- yt.streams.filter(only_audio=True).first().download \
33
- (output_path=".", filename="test.mp3")
34
 
35
  start = time.time()
36
- model = whisper.load_model("small")
37
  text = model.transcribe("test.mp3")
38
  end = time.time()
39
 
@@ -50,102 +47,62 @@ def youtube_text(link):
50
 
51
  split_docs = text_splitter.split_documents(docs)
52
 
53
- with open("split_example_small.pkl", "wb") as f:
54
  pickle.dump(split_docs, f)
55
 
56
  return split_docs, full_docs
57
 
58
 
59
- #
60
- # def youtube_summary(full_docs, openai_key):
61
- #
62
- # prompt = """The following is a documents
63
- # You need to output two things from the above Video.
64
- # 1. Write an executive summary
65
- # Read the following documents and write a summary that integrates them to quickly identify the main topics of the Video.
66
- # Your summary should.
67
- # - Must be written in Korean
68
- # - Be a single paragraph
69
- # - Be descriptive and detailed so that you can tell at a glance what is being said without having to look at the original Video.
70
- # 2. Choose your keywords
71
- # The keywords have the following conditions
72
- # - Must be written in Korean
73
- # - Must be a single word
74
- # - Must be a word that appears in the Video
75
- # - Must be a word that is not a stopword
76
- # - Must be a word that is not a proper noun
77
- # - Must be a word that is not a preposition
78
- # - Must be a word that is not a conjunction
79
- # - Must be a word that is not an interjection
80
- # - Must be a word that is not an adjective
81
- # - Must be a word that is not an adverb
82
- # - Output as a Python array (ex: [keyword1,keyword2,keyword3] )
83
- # - Output a total of 3 keywords
84
- # - Choose words you might use to search for a book title !
85
- # Here is an example of the final output
86
- # ์š”์•ฝ: Document_summary
87
- # ํ‚ค์›Œ๋“œ: [ Keyword1,Keyword2,Keyword3]
88
- # """
89
- #
90
- #
91
- # try:
92
- #
93
- # response = client.chat.completions.create(
94
- # messages={
95
- # "role": "system", "content": "You are a helpful assistant."
96
- # "role": "user", "content": prompt
97
- # },
98
- # temperature=0.7)
99
- #
100
- # with open ("data/result_new.json", "w") as f:
101
- # json.dump(response.choices[0].message['content'], f, indent=4)
102
- # return response.choices[0].message['content']
103
- # except Exception as e:
104
- # print(e)
105
- # return "Error"
106
-
107
- #
108
- # 1. Write an executive summary
109
- # Read the following documents and write a summary that integrates them to quickly identify the main topics of the Video.
110
- # Your summary should.
111
- # - Must be written in Korean
112
- # - Be a single paragraph
113
- # - Be descriptive and detailed so that you can tell at a glance what is being said without having to look at the original Video.
114
- # 2.
115
-
116
-
117
- def youtube_sum(split_docs, full_docs):
118
- llm = ChatOpenAI(temperature=0.7, openai_api_key=openai_api_key)
119
 
120
  # Map prompt
121
  map_template = """The following is a set of documents
122
  {docs}
123
- Based on this list of docs, please identify the main themes
124
  Helpful Answer:"""
125
 
126
  map_prompt = PromptTemplate.from_template(map_template)
127
 
128
  # Reduce prompt
129
  reduce_template = """The following is set of summaries:
130
- {doc_summaries}
131
- You need to output Keyword from the above Video.
132
- Choose your keywords
133
- The keywords have the following conditions
134
- - Must be written in Korean
135
- - Must be a single word
136
- - Must be a word that appears in the Video
137
- - Must be a word that is not a stopword
138
- - Must be a word that is not a proper noun
139
- - Must be a word that is not a preposition
140
- - Must be a word that is not a conjunction
141
- - Must be a word that is not an interjection
142
- - Must be a word that is not an adjective
143
- - Must be a word that is not an adverb
144
- - Output a total of 3 keywords
145
- - Choose words you might use to search for a book title !
146
- Here is an example of the final output
147
- Keyword: Keyword1,Keyword2,Keyword3
148
- Helpful Answer:"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
149
 
150
  reduce_prompt = PromptTemplate.from_template(reduce_template)
151
 
@@ -181,79 +138,71 @@ def youtube_sum(split_docs, full_docs):
181
  # Run
182
  result = map_reduce_chain.run(split_docs)
183
  print(result)
184
- with open("result.txt", "w") as f:
185
  f.write(result)
186
  return result
187
 
188
 
189
  def text_to_arr(result):
190
- parts = re.split(r'Keyword:', result, flags=re.IGNORECASE)
191
- # Take the last part (the actual keywords), strip whitespace, and split by commas
192
- keywords = parts[-1].strip().split(", ")
193
- # Now 'keywords' is an array (list in Python) containing the extracted keywords
194
- print(keywords)
195
 
196
- return keywords
 
197
 
 
 
 
 
 
 
 
198
 
199
- def aladin_api(keywords, selected_option):
 
 
 
200
  aladin_key = 'ttbkangmj08250027001'
 
201
  all_data = []
202
- title = []
203
- keyword = keywords
204
  if selected_option == "์‚ฌํšŒ":
205
- for key in keyword:
206
- print(key)
207
- url = f"http://www.aladin.co.kr/ttb/api/ItemSearch.aspx?ttbkey={aladin_key}&Query={key}&QueryType=Keyword&Cover=Big&MaxResults=5" \
208
- "&start=1&SearchTarget=Book&output=js&Sort=SalesPoint&Version=20131101&CategoryId=90853&outofStockFilter=1"
209
- response = requests.get(url)
210
- response_json = json.loads(response.text)
211
- all_data.append(response_json)
212
- # request ๋ณด๋‚ด๊ธฐ
213
- all_data = json.dumps(all_data, ensure_ascii=False, indent=4)
214
- with open("book.json", "wb") as f:
215
- f.write(all_data.encode("utf-8"))
216
-
217
 
218
  elif selected_option == "๊ณผํ•™":
219
- for key in keyword:
220
- print(key)
221
- url = f"http://www.aladin.co.kr/ttb/api/ItemSearch.aspx?ttbkey={aladin_key}&Query={key}&QueryType=Keyword&Cover=Big&MaxResults=5" \
222
- "&start=1&SearchTarget=Book&output=js&Sort=SalesPoint&Version=20131101&CategoryId=987&outofStockFilter=1"
223
- response = requests.get(url)
224
- response_json = json.loads(response.text)
225
- all_data.append(response_json)
226
- # request ๋ณด๋‚ด๊ธฐ
227
- all_data = json.dumps(all_data, ensure_ascii=False, indent=4)
228
- with open("book.json", "wb") as f:
229
- f.write(all_data.encode("utf-8"))
230
 
231
  elif selected_option == "์†Œ์„ค":
232
- for key in keyword:
233
- print(key)
234
- url = f"http://www.aladin.co.kr/ttb/api/ItemSearch.aspx?ttbkey={aladin_key}&Query={key}&QueryType=Keyword&Cover=Big&MaxResults=5" \
235
- "&start=1&SearchTarget=Book&output=js&Sort=SalesPoint&Version=20131101&CategoryId=1&outofStockFilter=1"
236
- response = requests.get(url)
237
- response_json = json.loads(response.text)
238
- all_data.append(response_json)
 
 
 
 
 
 
 
 
239
  # request ๋ณด๋‚ด๊ธฐ
240
- all_data = json.dumps(all_data, ensure_ascii=False, indent=4)
241
- with open("book.json", "wb") as f:
242
- f.write(all_data.encode("utf-8"))
243
-
244
- elif selected_option == "๊ฒฝ์ œ๊ฒฝ์˜":
245
- for key in keyword:
246
- print(key)
247
- url = f"http://www.aladin.co.kr/ttb/api/ItemSearch.aspx?ttbkey={aladin_key}&Query={key}&QueryType=Keyword&Cover=Big&MaxResults=5" \
248
- "&start=1&SearchTarget=Book&output=js&Sort=SalesPoint&Version=20131101&CategoryId=170&outofStockFilter=1"
249
- response = requests.get(url)
250
- response_json = json.loads(response.text)
251
- all_data.append(response_json)
252
- # request ๋ณด๋‚ด๊ธฐ
253
- all_data = json.dumps(all_data, ensure_ascii=False, indent=4)
254
- with open("book.json", "wb") as f:
255
- f.write(all_data.encode("utf-8"))
256
-
257
  print(all_data)
258
  return all_data
259
 
@@ -261,7 +210,7 @@ def aladin_api(keywords, selected_option):
261
  def book_output(book_json):
262
  data = json.loads(book_json)
263
 
264
- if len(data[0]['item']) != 0:
265
  title1 = data[0]['item'][0]['title']
266
  book_link1 = data[0]['item'][0]['link']
267
  cover_link1 = data[0]['item'][0]['cover']
@@ -270,65 +219,69 @@ def book_output(book_json):
270
  else:
271
  title1 = "No Data"
272
  book_link1 = "No Data"
273
- image1 = "No Data"
274
 
275
- if len(data[1]['item']) != 0:
276
- title2 = data[1]['item'][0]['title']
277
- book_link2 = data[1]['item'][0]['link']
278
- cover_link2 = data[1]['item'][0]['cover']
279
  response2 = requests.get(cover_link2)
280
  image2 = Image.open(BytesIO(response2.content))
281
  else:
282
  title2 = "No Data"
283
  book_link2 = "No Data"
284
- image2 = "No Data"
285
 
286
- if len(data[2]['item']) != 0:
287
- title3 = data[2]['item'][0]['title']
288
- book_link3 = data[2]['item'][0]['link']
289
- cover_link3 = data[2]['item'][0]['cover']
290
  response3 = requests.get(cover_link3)
291
  image3 = Image.open(BytesIO(response3.content))
292
  else:
293
  title3 = "No Data"
294
  book_link3 = "No Data"
295
- image3 = "No Data"
296
 
297
- return title1, image1, title2, image2, title3, image3
298
-
299
-
300
- def process_selection(input_list):
301
- # Your processing logic here for the selected option
302
- API_KEY = input_list[0]
303
- link = input_list[1]
304
- selected_option = input_list[2]
305
- result = f"You selected: {selected_option}"
306
- print(result)
307
- return API_KEY, link, selected_option
308
 
309
 
310
  def get_title(API_KEY, link, selected_option):
311
  docs, split_docs = youtube_text(link)
312
- result = youtube_sum(docs, split_docs)
313
  keywords = text_to_arr(result)
314
  all_data = aladin_api(keywords, selected_option)
315
- title1, image1, title2, image2, title3, image3 = book_output(all_data)
316
- return result, title1, image1, title2, image2, title3, image3
317
 
318
 
319
  # Define the list of options for the Dropdown
320
- options_list = ["์‚ฌํšŒ", "๊ณผํ•™", "์†Œ์„ค", "๊ฒฝ์ œ๊ฒฝ์˜"]
321
-
322
- iface = gr.Interface(fn=get_title, inputs=[gr.Textbox(label="Your OpenAI KEY"),
323
- gr.Textbox(label="Input Link"),
324
- gr.Dropdown(choices=options_list, label="Select a category")],
325
- outputs=[
326
- gr.Textbox(label="Keywords"),
327
- gr.Textbox(label="Title1"),
328
- gr.Image(label="Image1"),
329
- gr.Textbox(label="Title2"),
330
- gr.Image(label="Image2"),
331
- gr.Textbox(label="Title3"),
332
- gr.Image(label="Image3"),
333
- ])
334
- iface.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
  import requests
3
  import json
4
  import re
5
  import gradio as gr
 
6
  from pytube import YouTube
7
  import whisper
8
  import time
 
18
  from PIL import Image
19
  from io import BytesIO
20
 
 
21
  openai_api_key = ""
22
 
23
 
 
24
  # for API
25
 
26
  # # ==
27
 
28
  def youtube_text(link):
29
  yt = YouTube(link)
30
+ yt.streams.filter(only_audio=True).first().download(output_path=".", filename="test.mp3")
 
31
 
32
  start = time.time()
33
+ model = whisper.load_model("base")
34
  text = model.transcribe("test.mp3")
35
  end = time.time()
36
 
 
47
 
48
  split_docs = text_splitter.split_documents(docs)
49
 
50
+ with open("temp/split_example_small.pkl", "wb") as f:
51
  pickle.dump(split_docs, f)
52
 
53
  return split_docs, full_docs
54
 
55
 
56
+ def youtube_sum(split_docs, full_docs, API_KEY):
57
+ openai_key = API_KEY
58
+ llm = ChatOpenAI(temperature=0.7, openai_api_key=openai_key)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
 
60
  # Map prompt
61
  map_template = """The following is a set of documents
62
  {docs}
63
+ Based on this list of Video subtitles , please identify the main themes
64
  Helpful Answer:"""
65
 
66
  map_prompt = PromptTemplate.from_template(map_template)
67
 
68
  # Reduce prompt
69
  reduce_template = """The following is set of summaries:
70
+ {doc_summaries}
71
+ You need to output two things from the above Video Subtitles.
72
+ 1. Write an executive summary
73
+ Read the following subtitles and write a summary that integrates them to quickly identify the main topics of the Video.
74
+ Your summary should.
75
+ - Must be written in Korean
76
+ - Be a 1~2 paragraph
77
+ - Be descriptive and detailed so that you can tell at a glance what is being said without having to look at the original Video.
78
+ - There are no more than three main topics in the video.
79
+ - Please also briefly describe the overall content of the video
80
+ 2. Choose your keyword
81
+ The keywords have the following conditions
82
+ - Must be written in Korean
83
+ - Must be a single word
84
+ - Must be a noun
85
+ - Must be a word that appears in the Video
86
+ - Must be a word that is not a stopword
87
+ - Must be a word that is not a proper noun
88
+ - Must be a word that is not a number
89
+ - Must be a word that is not a verb
90
+ - Must be a word that is not a pronoun
91
+ - Must be a word that is not a preposition
92
+ - Must be a word that is not a conjunction
93
+ - Must be a word that is not an interjection
94
+ - Must be a word that is not an adjective
95
+ - Must be a word that is not an adverb
96
+ - Must be a word that is not a determiner
97
+ - Must be a word that is not a particle
98
+ - Must be a word that is not a numeral
99
+ - Output only one keyword
100
+
101
+ Here is an example of the final output
102
+ Summary: Summary of The video
103
+ Keyword: keyword
104
+ Don't output any other text outside of the given format
105
+ Helpful Answer:"""
106
 
107
  reduce_prompt = PromptTemplate.from_template(reduce_template)
108
 
 
138
  # Run
139
  result = map_reduce_chain.run(split_docs)
140
  print(result)
141
+ with open("temp/result.txt", "w") as f:
142
  f.write(result)
143
  return result
144
 
145
 
146
  def text_to_arr(result):
147
+ text = result
 
 
 
 
148
 
149
+ # Regular expression to find the keyword
150
+ match = re.search(r"Keyword:\s*(\w+)", text)
151
 
152
+ if match:
153
+ keyword = match.group(1)
154
+ print("Keyword:", keyword) # The keyword is in the first capturing group
155
+ else:
156
+ match = re.search(r"ํ‚ค์›Œ๋“œ:\s*(\w+)", text)
157
+ keyword = match.group(1) # No keyword found
158
+ print("Keyword:", keyword)
159
 
160
+ return keyword
161
+
162
+
163
def aladin_api(keyword, selected_option):
    """Search the Aladin ItemSearch API for books matching one keyword.

    Args:
        keyword: Search term, sent as the ``Query`` parameter.
        selected_option: Category label; it is mapped to an Aladin
            ``CategoryId`` through the lookup table below.

    Returns:
        A JSON string (``ensure_ascii=False``, 4-space indent) encoding a
        list that holds the parsed API response. When ``selected_option`` is
        not a known category no request is sent and the list is empty, so
        the result is ``"[]"``.

    Side effects:
        Writes the same JSON, UTF-8 encoded, to ``temp/book.json`` and prints
        the keyword and the JSON to stdout.
    """
    # NOTE(review): the API key is hard-coded in source; consider loading it
    # from configuration or the environment instead.
    aladin_key = 'ttbkangmj08250027001'

    # Category label -> Aladin CategoryId. The four original branches were
    # identical apart from this id.
    # NOTE(review): the labels look like mis-encoded (mojibake) Korean
    # category names; they are kept byte-for-byte so they keep matching the
    # dropdown choices defined elsewhere in this file.
    category_ids = {
        "์‚ฌํšŒ": "798",
        "๊ณผํ•™": "987",
        "์†Œ์„ค": "1",
        "๊ธˆ์œต": "170",
    }

    all_data = []
    category_id = category_ids.get(selected_option)
    if category_id is not None:
        print(keyword)
        # Passing the query through ``params`` lets requests percent-encode
        # the keyword, so characters such as '&' or '#' cannot corrupt the
        # query string. Parameter order matches the original URL.
        response = requests.get(
            "http://www.aladin.co.kr/ttb/api/ItemSearch.aspx",
            params={
                "ttbkey": aladin_key,
                "Query": keyword,
                "QueryType": "Keyword",
                "Cover": "Big",
                "MaxResults": "5",
                "start": "1",
                "SearchTarget": "Book",
                "output": "js",
                "Sort": "SalesPoint",
                "Version": "20131101",
                "CategoryId": category_id,
                "outofStockFilter": "1",
            },
            # Without a timeout a stalled connection would block the caller
            # indefinitely.
            timeout=30,
        )
        all_data.append(json.loads(response.text))

    # Callers receive the serialised form (book_output json.loads it again).
    payload = json.dumps(all_data, ensure_ascii=False, indent=4)
    with open("temp/book.json", "wb") as f:
        f.write(payload.encode("utf-8"))

    print(payload)
    return payload
208
 
 
210
  def book_output(book_json):
211
  data = json.loads(book_json)
212
 
213
+ if len(data[0]['item'][0]) != 0:
214
  title1 = data[0]['item'][0]['title']
215
  book_link1 = data[0]['item'][0]['link']
216
  cover_link1 = data[0]['item'][0]['cover']
 
219
  else:
220
  title1 = "No Data"
221
  book_link1 = "No Data"
222
+ image1 = Image.open("NO DATA.jpeg")
223
 
224
+ if len(data[0]['item'][1]) != 0:
225
+ title2 = data[0]['item'][1]['title']
226
+ book_link2 = data[0]['item'][1]['link']
227
+ cover_link2 = data[0]['item'][1]['cover']
228
  response2 = requests.get(cover_link2)
229
  image2 = Image.open(BytesIO(response2.content))
230
  else:
231
  title2 = "No Data"
232
  book_link2 = "No Data"
233
+ image2 = Image.open("NO DATA.jpeg")
234
 
235
+ if len(data[0]['item'][2]) != 0:
236
+ title3 = data[0]['item'][2]['title']
237
+ book_link3 = data[0]['item'][2]['link']
238
+ cover_link3 = data[0]['item'][2]['cover']
239
  response3 = requests.get(cover_link3)
240
  image3 = Image.open(BytesIO(response3.content))
241
  else:
242
  title3 = "No Data"
243
  book_link3 = "No Data"
244
+ image3 = Image.open("NO DATA.jpeg")
245
 
246
+ return title1, image1, title2, image2, title3, image3, book_link1, book_link2, book_link3
 
 
 
 
 
 
 
 
 
 
247
 
248
 
249
  def get_title(API_KEY, link, selected_option):
250
  docs, split_docs = youtube_text(link)
251
+ result = youtube_sum(docs, split_docs, API_KEY)
252
  keywords = text_to_arr(result)
253
  all_data = aladin_api(keywords, selected_option)
254
+ title1, image1, title2, image2, title3, image3, link1, link2, link3 = book_output(all_data)
255
+ return result, title1, image1, title2, image2, title3, image3, link1, link2, link3
256
 
257
 
258
  # Define the list of options for the Dropdown
259
+ options_list = ["์‚ฌํšŒ", "๊ณผํ•™", "์†Œ์„ค", "๊ธˆ์œต"]
260
+
261
+ with gr.Blocks() as demo:
262
+ gr.Markdown("Paste your Youtube Link and get the book recommandation")
263
+ with gr.Column():
264
+ with gr.Row():
265
+ inp1 = gr.Textbox(label="Your OpenAI KEY")
266
+ inp2 = gr.Textbox(label="Input Link")
267
+ inp3 = gr.Dropdown(choices=options_list, label="Select a category")
268
+ btn = gr.Button("Find the book")
269
+
270
+ with gr.Column():
271
+ out1 = gr.Textbox(label="Summary")
272
+ with gr.Row():
273
+ out2 = gr.Textbox(label="Title1")
274
+ out4 = gr.Textbox(label="Title2")
275
+ out6 = gr.Textbox(label="Title3")
276
+ with gr.Row():
277
+ out3 = gr.Image(label="Image1")
278
+ out5 = gr.Image(label="Image2")
279
+ out7 = gr.Image(label="Image3")
280
+ with gr.Row():
281
+ out8 = gr.HTML(label="Book Link1")
282
+ out9 = gr.HTML(label="Book Link2")
283
+ out10 = gr.HTML(label="Book Link3")
284
+ btn.click(fn=get_title, inputs=[inp1, inp2, inp3],
285
+ outputs=[out1, out2, out3, out4, out5, out6, out7, out8, out9, out10])
286
+
287
+ demo.launch(share=True)