Spaces:

raphael825
/

read_the_youtube

Sleeping

App Files Files Community

raphael825 committed on Dec 10, 2023

Commit

a5d3b8f

1 Parent(s): 9d6f9cb

Update app.py

Browse files

Files changed (1) hide show

app.py +138 -185

app.py CHANGED Viewed

@@ -1,8 +1,8 @@
 import requests
 import json
 import re
 import gradio as gr
-import Model
 from pytube import YouTube
 import whisper
 import time
@@ -18,22 +18,19 @@ from langchain.chains.combine_documents.stuff import StuffDocumentsChain
 from PIL import Image
 from io import BytesIO
 openai_api_key = ""
 # for API
 # # ==
 def youtube_text(link):
     yt = YouTube(link)
-    yt.streams.filter(only_audio=True).first().download \
-        (output_path=".", filename="test.mp3")
     start = time.time()
-    model = whisper.load_model("small")
     text = model.transcribe("test.mp3")
     end = time.time()
@@ -50,102 +47,62 @@ def youtube_text(link):
     split_docs = text_splitter.split_documents(docs)
-    with open("split_example_small.pkl", "wb") as f:
         pickle.dump(split_docs, f)
     return split_docs, full_docs
-#
-# def youtube_summary(full_docs, openai_key):
-#
-#     prompt = """The following is a documents
-#     You need to output two things from the above Video.
-#     1. Write an executive summary
-#     Read the following documents and write a summary that integrates them to quickly identify the main topics of the Video.
-#     Your summary should.
-#     - Must be written in Korean
-#     - Be a single paragraph
-#     - Be descriptive and detailed so that you can tell at a glance what is being said without having to look at the original Video.
-#     2. Choose your keywords
-#     The keywords have the following conditions
-#     - Must be written in Korean
-#     - Must be a single word
-#     - Must be a word that appears in the Video
-#     - Must be a word that is not a stopword
-#     - Must be a word that is not a proper noun
-#     - Must be a word that is not a preposition
-#     - Must be a word that is not a conjunction
-#     - Must be a word that is not an interjection
-#     - Must be a word that is not an adjective
-#     - Must be a word that is not an adverb
-#     - Output as a Python array (ex: [keyword1,keyword2,keyword3] )
-#     - Output a total of 3 keywords
-#     - Choose words you might use to search for a book title !
-#     Here is an example of the final output
-#     요약: Document_summary
-#     키워드: [ Keyword1,Keyword2,Keyword3]
-#     """
-#
-#
-#     try:
-#
-#         response = client.chat.completions.create(
-#             messages={
-#                 "role": "system", "content": "You are a helpful assistant."
-#                 "role": "user", "content": prompt
-#             },
-#             temperature=0.7)
-#
-#         with open ("data/result_new.json", "w") as f:
-#             json.dump(response.choices[0].message['content'], f, indent=4)
-#         return response.choices[0].message['content']
-#     except Exception as e:
-#         print(e)
-#         return "Error"
-#
-# 1. Write an executive summary
-#         Read the following documents and write a summary that integrates them to quickly identify the main topics of the Video.
-#         Your summary should.
-#         - Must be written in Korean
-#         - Be a single paragraph
-#         - Be descriptive and detailed so that you can tell at a glance what is being said without having to look at the original Video.
-#         2.
-def youtube_sum(split_docs, full_docs):
-    llm = ChatOpenAI(temperature=0.7, openai_api_key=openai_api_key)
     # Map prompt
     map_template = """The following is a set of documents
     {docs}
-    Based on this list of docs, please identify the main themes
     Helpful Answer:"""
     map_prompt = PromptTemplate.from_template(map_template)
     # Reduce prompt
     reduce_template = """The following is set of summaries:
-        {doc_summaries}
-        You need to output Keyword from the above Video.
-         Choose your keywords
-        The keywords have the following conditions
-        - Must be written in Korean
-        - Must be a single word
-        - Must be a word that appears in the Video
-        - Must be a word that is not a stopword
-        - Must be a word that is not a proper noun
-        - Must be a word that is not a preposition
-        - Must be a word that is not a conjunction
-        - Must be a word that is not an interjection
-        - Must be a word that is not an adjective
-        - Must be a word that is not an adverb
-        - Output a total of 3 keywords
-        - Choose words you might use to search for a book title !
-        Here is an example of the final output
-        Keyword: Keyword1,Keyword2,Keyword3
-        Helpful Answer:"""
     reduce_prompt = PromptTemplate.from_template(reduce_template)
@@ -181,79 +138,71 @@ def youtube_sum(split_docs, full_docs):
     # Run
     result = map_reduce_chain.run(split_docs)
     print(result)
-    with open("result.txt", "w") as f:
         f.write(result)
     return result
 def text_to_arr(result):
-    parts = re.split(r'Keyword:', result, flags=re.IGNORECASE)
-    # Take the last part (the actual keywords), strip whitespace, and split by commas
-    keywords = parts[-1].strip().split(", ")
-    # Now 'keywords' is an array (list in Python) containing the extracted keywords
-    print(keywords)
-    return keywords
-def aladin_api(keywords, selected_option):
     aladin_key = 'ttbkangmj08250027001'
     all_data = []
-    title = []
-    keyword = keywords
     if selected_option == "사회":
-        for key in keyword:
-            print(key)
-            url = f"http://www.aladin.co.kr/ttb/api/ItemSearch.aspx?ttbkey={aladin_key}&Query={key}&QueryType=Keyword&Cover=Big&MaxResults=5" \
-                  "&start=1&SearchTarget=Book&output=js&Sort=SalesPoint&Version=20131101&CategoryId=90853&outofStockFilter=1"
-            response = requests.get(url)
-            response_json = json.loads(response.text)
-            all_data.append(response_json)
-        # request 보내기
-        all_data = json.dumps(all_data, ensure_ascii=False, indent=4)
-        with open("book.json", "wb") as f:
-            f.write(all_data.encode("utf-8"))
     elif selected_option == "과학":
-        for key in keyword:
-            print(key)
-            url = f"http://www.aladin.co.kr/ttb/api/ItemSearch.aspx?ttbkey={aladin_key}&Query={key}&QueryType=Keyword&Cover=Big&MaxResults=5" \
-                  "&start=1&SearchTarget=Book&output=js&Sort=SalesPoint&Version=20131101&CategoryId=987&outofStockFilter=1"
-            response = requests.get(url)
-            response_json = json.loads(response.text)
-            all_data.append(response_json)
-        # request 보내기
-        all_data = json.dumps(all_data, ensure_ascii=False, indent=4)
-        with open("book.json", "wb") as f:
-            f.write(all_data.encode("utf-8"))
     elif selected_option == "소설":
-        for key in keyword:
-            print(key)
-            url = f"http://www.aladin.co.kr/ttb/api/ItemSearch.aspx?ttbkey={aladin_key}&Query={key}&QueryType=Keyword&Cover=Big&MaxResults=5" \
-                  "&start=1&SearchTarget=Book&output=js&Sort=SalesPoint&Version=20131101&CategoryId=1&outofStockFilter=1"
-            response = requests.get(url)
-            response_json = json.loads(response.text)
-            all_data.append(response_json)
         # request 보내기
-        all_data = json.dumps(all_data, ensure_ascii=False, indent=4)
-        with open("book.json", "wb") as f:
-            f.write(all_data.encode("utf-8"))
-    elif selected_option == "경제경영":
-        for key in keyword:
-            print(key)
-            url = f"http://www.aladin.co.kr/ttb/api/ItemSearch.aspx?ttbkey={aladin_key}&Query={key}&QueryType=Keyword&Cover=Big&MaxResults=5" \
-                  "&start=1&SearchTarget=Book&output=js&Sort=SalesPoint&Version=20131101&CategoryId=170&outofStockFilter=1"
-            response = requests.get(url)
-            response_json = json.loads(response.text)
-            all_data.append(response_json)
-        # request 보내기
-        all_data = json.dumps(all_data, ensure_ascii=False, indent=4)
-        with open("book.json", "wb") as f:
-            f.write(all_data.encode("utf-8"))
     print(all_data)
     return all_data
@@ -261,7 +210,7 @@ def aladin_api(keywords, selected_option):
 def book_output(book_json):
     data = json.loads(book_json)
-    if len(data[0]['item']) != 0:
         title1 = data[0]['item'][0]['title']
         book_link1 = data[0]['item'][0]['link']
         cover_link1 = data[0]['item'][0]['cover']
@@ -270,65 +219,69 @@ def book_output(book_json):
     else:
         title1 = "No Data"
         book_link1 = "No Data"
-        image1 = "No Data"
-    if len(data[1]['item']) != 0:
-        title2 = data[1]['item'][0]['title']
-        book_link2 = data[1]['item'][0]['link']
-        cover_link2 = data[1]['item'][0]['cover']
         response2 = requests.get(cover_link2)
         image2 = Image.open(BytesIO(response2.content))
     else:
         title2 = "No Data"
         book_link2 = "No Data"
-        image2 = "No Data"
-    if len(data[2]['item']) != 0:
-        title3 = data[2]['item'][0]['title']
-        book_link3 = data[2]['item'][0]['link']
-        cover_link3 = data[2]['item'][0]['cover']
         response3 = requests.get(cover_link3)
         image3 = Image.open(BytesIO(response3.content))
     else:
         title3 = "No Data"
         book_link3 = "No Data"
-        image3 = "No Data"
-    return title1, image1, title2, image2, title3, image3
-def process_selection(input_list):
-    # Your processing logic here for the selected option
-    API_KEY = input_list[0]
-    link = input_list[1]
-    selected_option = input_list[2]
-    result = f"You selected: {selected_option}"
-    print(result)
-    return API_KEY, link, selected_option
 def get_title(API_KEY, link, selected_option):
     docs, split_docs = youtube_text(link)
-    result = youtube_sum(docs, split_docs)
     keywords = text_to_arr(result)
     all_data = aladin_api(keywords, selected_option)
-    title1, image1, title2, image2, title3, image3 = book_output(all_data)
-    return result, title1, image1, title2, image2, title3, image3
 # Define the list of options for the Dropdown
-options_list = ["사회", "과학", "소설", "경제경영"]
-iface = gr.Interface(fn=get_title, inputs=[gr.Textbox(label="Your OpenAI KEY"),
-                                           gr.Textbox(label="Input Link"),
-                                           gr.Dropdown(choices=options_list, label="Select a category")],
-                     outputs=[
-                         gr.Textbox(label="Keywords"),
-                         gr.Textbox(label="Title1"),
-                         gr.Image(label="Image1"),
-                         gr.Textbox(label="Title2"),
-                         gr.Image(label="Image2"),
-                         gr.Textbox(label="Title3"),
-                         gr.Image(label="Image3"),
-                     ])
-iface.launch()

+import os
 import requests
 import json
 import re
 import gradio as gr
 from pytube import YouTube
 import whisper
 import time
 from PIL import Image
 from io import BytesIO
 openai_api_key = ""
 # for API
 # # ==
 def youtube_text(link):
     yt = YouTube(link)
+    yt.streams.filter(only_audio=True).first().download(output_path=".", filename="test.mp3")
     start = time.time()
+    model = whisper.load_model("base")
     text = model.transcribe("test.mp3")
     end = time.time()
     split_docs = text_splitter.split_documents(docs)
+    with open("temp/split_example_small.pkl", "wb") as f:
         pickle.dump(split_docs, f)
     return split_docs, full_docs
+def youtube_sum(split_docs, full_docs, API_KEY):
+    openai_key = API_KEY
+    llm = ChatOpenAI(temperature=0.7, openai_api_key=openai_key)
     # Map prompt
     map_template = """The following is a set of documents
     {docs}
+    Based on this list of Video subtitles , please identify the main themes
     Helpful Answer:"""
     map_prompt = PromptTemplate.from_template(map_template)
     # Reduce prompt
     reduce_template = """The following is set of summaries:
+    {doc_summaries}
+    You need to output two things from the above Video Subtitles.
+    1. Write an executive summary
+    Read the following subtitles and write a summary that integrates them to quickly identify the main topics of the Video.
+    Your summary should.
+    - Must be written in Korean
+    - Be a 1~2 paragraph
+    - Be descriptive and detailed so that you can tell at a glance what is being said without having to look at the original Video.
+    - There are no more than three main topics in the video.
+    - Please also briefly describe the overall content of the video
+    2. Choose your keyword
+    The keywords have the following conditions
+    - Must be written in Korean
+    - Must be a single word
+    - Must be a noun
+    - Must be a word that appears in the Video
+    - Must be a word that is not a stopword
+    - Must be a word that is not a proper noun
+    - Must be a word that is not a number
+    - Must be a word that is not a verb
+    - Must be a word that is not a pronoun
+    - Must be a word that is not a preposition
+    - Must be a word that is not a conjunction
+    - Must be a word that is not an interjection
+    - Must be a word that is not an adjective
+    - Must be a word that is not an adverb
+    - Must be a word that is not a determiner
+    - Must be a word that is not a particle
+    - Must be a word that is not a numeral
+    - Output only one keyword
+    Here is an example of the final output
+    Summary: Summary of The video
+    Keyword: keyword
+    Don't output any other text outside of the given format
+    Helpful Answer:"""
     reduce_prompt = PromptTemplate.from_template(reduce_template)
     # Run
     result = map_reduce_chain.run(split_docs)
     print(result)
+    with open("temp/result.txt", "w") as f:
         f.write(result)
     return result
 def text_to_arr(result):
     """Extract the single keyword from the formatted LLM answer.

     The reduce prompt asks the model to finish with a line of the form
     ``Keyword: <word>``; because the answer is requested in Korean, the
     label may come back as ``키워드:`` instead.

     Args:
         result: Text returned by the summarisation step.

     Returns:
         The first run of word characters following the label, as a ``str``
         (despite the function name, this is not a list).

     Raises:
         ValueError: If neither label is present in ``result``.
     """
     # Labels are tried in this order so that an English label, when present,
     # takes precedence over a Korean one appearing earlier in the text.
     for label in ("Keyword", "키워드"):
         match = re.search(label + r"\s*:\s*(\w+)", result, flags=re.IGNORECASE)
         if match:
             keyword = match.group(1)
             print("Keyword:", keyword)
             return keyword
     # Previously this path dereferenced a None match and died with an
     # opaque AttributeError; fail with a message that names the real problem.
     raise ValueError("No 'Keyword:' / '키워드:' label found in the model output")
# Aladin CategoryId for each dropdown label. The per-category requests are
# otherwise identical, so the category is the only thing that needs a lookup.
_ALADIN_CATEGORY_IDS = {
    "사회": 798,
    "과학": 987,
    "소설": 1,
    "금융": 170,
}


def aladin_api(keyword, selected_option):
    """Search Aladin for books matching ``keyword`` in the chosen category.

    Args:
        keyword: Search term sent as the ``Query`` parameter.
        selected_option: Category label; one of the keys of
            ``_ALADIN_CATEGORY_IDS``. Any other value (including ``None``)
            sends no request and yields an empty result list.

    Returns:
        A JSON string (UTF-8 safe, ``indent=4``) encoding a list that holds
        the decoded API response, or ``"[]"`` when no request was made. The
        same text is also written to ``temp/book.json``.
    """
    import os  # local import keeps this block self-contained

    # NOTE(review): a credential is hard-coded in source. The environment
    # variable lets deployments override it without a code change; the
    # literal stays as the fallback so existing behaviour is unchanged.
    aladin_key = os.environ.get("ALADIN_TTB_KEY", 'ttbkangmj08250027001')

    responses = []
    category_id = _ALADIN_CATEGORY_IDS.get(selected_option)
    if category_id is not None:
        print(keyword)
        # Passing the query via ``params`` lets requests escape the keyword,
        # so characters such as '&' or spaces cannot corrupt the query string.
        response = requests.get(
            "http://www.aladin.co.kr/ttb/api/ItemSearch.aspx",
            params={
                "ttbkey": aladin_key,
                "Query": keyword,
                "QueryType": "Keyword",
                "Cover": "Big",
                "MaxResults": 5,
                "start": 1,
                "SearchTarget": "Book",
                "output": "js",
                "Sort": "SalesPoint",
                "Version": "20131101",
                "CategoryId": category_id,
                "outofStockFilter": 1,
            },
            timeout=30,  # do not let a stalled connection hang the UI forever
        )
        responses.append(json.loads(response.text))

    payload = json.dumps(responses, ensure_ascii=False, indent=4)

    # The dump directory may not exist on a fresh checkout.
    os.makedirs("temp", exist_ok=True)
    with open("temp/book.json", "wb") as f:
        f.write(payload.encode("utf-8"))

    print(payload)
    return payload
 def book_output(book_json):
     data = json.loads(book_json)
+    if len(data[0]['item'][0]) != 0:
         title1 = data[0]['item'][0]['title']
         book_link1 = data[0]['item'][0]['link']
         cover_link1 = data[0]['item'][0]['cover']
     else:
         title1 = "No Data"
         book_link1 = "No Data"
+        image1 = Image.open("NO DATA.jpeg")
+    if len(data[0]['item'][1]) != 0:
+        title2 = data[0]['item'][1]['title']
+        book_link2 = data[0]['item'][1]['link']
+        cover_link2 = data[0]['item'][1]['cover']
         response2 = requests.get(cover_link2)
         image2 = Image.open(BytesIO(response2.content))
     else:
         title2 = "No Data"
         book_link2 = "No Data"
+        image2 = Image.open("NO DATA.jpeg")
+    if len(data[0]['item'][2]) != 0:
+        title3 = data[0]['item'][2]['title']
+        book_link3 = data[0]['item'][2]['link']
+        cover_link3 = data[0]['item'][2]['cover']
         response3 = requests.get(cover_link3)
         image3 = Image.open(BytesIO(response3.content))
     else:
         title3 = "No Data"
         book_link3 = "No Data"
+        image3 = Image.open("NO DATA.jpeg")
+    return title1, image1, title2, image2, title3, image3, book_link1, book_link2, book_link3
 def get_title(API_KEY, link, selected_option):
     """Run the whole pipeline for one video and return the values shown in the UI.

     Args:
         API_KEY: OpenAI key, forwarded to ``youtube_sum``.
         link: YouTube URL, forwarded to ``youtube_text``.
         selected_option: Category label, forwarded to ``aladin_api``.

     Returns:
         A 10-tuple: the summarisation text, then title/image pairs for three
         books (title1, image1, title2, image2, title3, image3), then the
         three book links.
     """
     # youtube_text returns (split_docs, full_docs); both are handed to
     # youtube_sum in that same positional order.
     chunks, transcript = youtube_text(link)
     summary = youtube_sum(chunks, transcript, API_KEY)
     keyword = text_to_arr(summary)
     books_json = aladin_api(keyword, selected_option)
     (
         title1, image1,
         title2, image2,
         title3, image3,
         link1, link2, link3,
     ) = book_output(books_json)
     return (
         summary,
         title1, image1,
         title2, image2,
         title3, image3,
         link1, link2, link3,
     )
 # Define the list of options for the Dropdown
# Category labels offered in the dropdown; aladin_api branches on these exact strings.
options_list = ["사회", "과학", "소설", "금융"]

with gr.Blocks() as demo:
    gr.Markdown("Paste your Youtube Link and get the book recommendation")

    # Inputs: API key and video link side by side, then category and the trigger button.
    with gr.Column():
        with gr.Row():
            # Masked so the secret is not displayed on screen while typing.
            inp1 = gr.Textbox(label="Your OpenAI KEY", type="password")
            inp2 = gr.Textbox(label="Input Link")
        inp3 = gr.Dropdown(choices=options_list, label="Select a category")
        btn = gr.Button("Find the book")

    # Outputs: summary on top, then one row each for titles, covers and links.
    # The out2..out7 numbering follows the order get_title returns its values
    # (title1, image1, title2, image2, ...), not the on-screen order.
    with gr.Column():
        out1 = gr.Textbox(label="Summary")
        with gr.Row():
            out2 = gr.Textbox(label="Title1")
            out4 = gr.Textbox(label="Title2")
            out6 = gr.Textbox(label="Title3")
        with gr.Row():
            out3 = gr.Image(label="Image1")
            out5 = gr.Image(label="Image2")
            out7 = gr.Image(label="Image3")
        with gr.Row():
            out8 = gr.HTML(label="Book Link1")
            out9 = gr.HTML(label="Book Link2")
            out10 = gr.HTML(label="Book Link3")

    # The outputs list must stay in the same order as get_title's return tuple.
    btn.click(fn=get_title, inputs=[inp1, inp2, inp3],
              outputs=[out1, out2, out3, out4, out5, out6, out7, out8, out9, out10])

demo.launch(share=True)