"""Gradio app: summarise a YouTube video and recommend related books.

Pipeline: download the audio track, transcribe it with Whisper, summarise the
transcript with a LangChain map-reduce chain (which also emits one keyword),
search the Aladin book API for that keyword, and show the top three results.
"""

import json
import os
import pickle
import re
import time
from io import BytesIO

import gradio as gr
import requests
import whisper
from PIL import Image
from pytube import YouTube

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema.document import Document
from langchain.chains.mapreduce import MapReduceChain
from langchain.chains import ReduceDocumentsChain, MapReduceDocumentsChain
from langchain.chat_models import ChatOpenAI
from langchain.chains.llm import LLMChain
from langchain.prompts import PromptTemplate
from langchain.chains.combine_documents.stuff import StuffDocumentsChain

openai_api_key = ""  # for API

# Directory for intermediate artefacts (pickled chunks, summary, book JSON).
TEMP_DIR = "temp"
AUDIO_FILENAME = "test.mp3"

# Aladin ItemSearch endpoint and the category id used for each dropdown option.
ALADIN_SEARCH_URL = "http://www.aladin.co.kr/ttb/api/ItemSearch.aspx"
ALADIN_CATEGORY_IDS = {
    "사회": 798,
    "과학": 987,
    "소설": 1,
    "금융": 170,
}

# Seconds to wait for any outbound HTTP request before giving up.
REQUEST_TIMEOUT = 10

NO_DATA_TEXT = "No Data"
NO_DATA_IMAGE = "NO DATA.jpeg"

# Patterns tried in order to pull the keyword out of the LLM answer.
KEYWORD_PATTERNS = (r"Keyword:\s*(\w+)", r"키워드:\s*(\w+)")

MAP_TEMPLATE = """The following is a set of documents
{docs}
Based on this list of Video subtitles , please identify the main themes
Helpful Answer:"""

REDUCE_TEMPLATE = """The following is set of summaries:
{doc_summaries}

You need to output two things from the above Video Subtitles.
1. Write an executive summary
Read the following subtitles and write a summary that integrates them to quickly identify the main topics of the Video.
Your summary should.
- Must be written in Korean
- Be a 1~2 paragraph
- Be descriptive and detailed so that you can tell at a glance what is being said without having to look at the original Video.
- There are no more than three main topics in the video.
- Please also briefly describe the overall content of the video
2. Choose your keyword
The keywords have the following conditions
- Must be written in Korean
- Must be a single word
- Must be a noun
- Must be a word that appears in the Video
- Must be a word that is not a stopword
- Must be a word that is not a proper noun
- Must be a word that is not a number
- Must be a word that is not a verb
- Must be a word that is not a pronoun
- Must be a word that is not a preposition
- Must be a word that is not a conjunction
- Must be a word that is not an interjection
- Must be a word that is not an adjective
- Must be a word that is not an adverb
- Must be a word that is not a determiner
- Must be a word that is not a particle
- Must be a word that is not a numeral
- Output only one keyword

Here is an example of the final output
Summary: Summary of The video
Keyword: keyword

Don't output any other text outside of the given format
Helpful Answer:"""


def _temp_path(filename):
    """Return a path inside TEMP_DIR, creating the directory if needed."""
    os.makedirs(TEMP_DIR, exist_ok=True)
    return os.path.join(TEMP_DIR, filename)


def youtube_text(link):
    """Download a video's audio track, transcribe it and split it into chunks.

    Args:
        link: URL of the YouTube video.

    Returns:
        A ``(split_docs, full_docs)`` tuple: the list of chunked ``Document``
        objects and the full transcript string.

    Raises:
        ValueError: If the video exposes no audio-only stream.
    """
    yt = YouTube(link)
    audio_stream = yt.streams.filter(only_audio=True).first()
    if audio_stream is None:
        raise ValueError(f"No audio stream available for {link!r}")
    audio_stream.download(output_path=".", filename=AUDIO_FILENAME)

    start = time.time()
    model = whisper.load_model("base")
    text = model.transcribe(AUDIO_FILENAME)
    end = time.time()
    print(text["text"])
    print(f"{end - start:.2f}sec")

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=2000,
        chunk_overlap=50,
        length_function=len,
    )
    full_docs = text["text"]
    docs = [Document(page_content=x) for x in text_splitter.split_text(full_docs)]
    split_docs = text_splitter.split_documents(docs)

    # Keep a copy of the chunks on disk for offline inspection.
    with open(_temp_path("split_example_small.pkl"), "wb") as f:
        pickle.dump(split_docs, f)
    return split_docs, full_docs


def youtube_sum(split_docs, full_docs, API_KEY):
    """Summarise transcript chunks with a map-reduce LLM chain.

    Args:
        split_docs: Chunked ``Document`` objects to summarise.
        full_docs: Full transcript; accepted for interface compatibility but
            not used by the chain.
        API_KEY: OpenAI API key used for the chat model.

    Returns:
        The raw LLM answer, expected to contain ``Summary:`` and ``Keyword:``
        lines as requested by the reduce prompt.
    """
    llm = ChatOpenAI(temperature=0.7, openai_api_key=API_KEY)

    map_prompt = PromptTemplate.from_template(MAP_TEMPLATE)
    reduce_prompt = PromptTemplate.from_template(REDUCE_TEMPLATE)

    reduce_chain = LLMChain(llm=llm, prompt=reduce_prompt)
    combine_documents_chain = StuffDocumentsChain(
        llm_chain=reduce_chain, document_variable_name="doc_summaries"
    )

    # Combines and iteratively reduces the mapped documents.
    reduce_documents_chain = ReduceDocumentsChain(
        # This is the final chain that is called.
        combine_documents_chain=combine_documents_chain,
        # Used if documents exceed the context for `StuffDocumentsChain`.
        collapse_documents_chain=combine_documents_chain,
        # The maximum number of tokens to group documents into.
        token_max=4000,
    )

    map_chain = LLMChain(llm=llm, prompt=map_prompt)

    # Map the chain over every chunk, then combine the per-chunk results.
    map_reduce_chain = MapReduceDocumentsChain(
        llm_chain=map_chain,
        reduce_documents_chain=reduce_documents_chain,
        # The variable name in the llm_chain to put the documents in.
        document_variable_name="docs",
        # Do not return the results of the map steps in the output.
        return_intermediate_steps=False,
    )

    result = map_reduce_chain.run(split_docs)
    print(result)
    # The summary is Korean text, so pin the encoding instead of relying on
    # the platform default.
    with open(_temp_path("result.txt"), "w", encoding="utf-8") as f:
        f.write(result)
    return result


def text_to_arr(result):
    """Extract the keyword from the LLM answer.

    Looks for ``Keyword: <word>`` first and falls back to ``키워드: <word>``.

    Args:
        result: Raw text returned by the summarisation chain.

    Returns:
        The keyword (first run of word characters after the label).

    Raises:
        ValueError: If neither label is present in ``result``.
    """
    for pattern in KEYWORD_PATTERNS:
        match = re.search(pattern, result)
        if match:
            keyword = match.group(1)
            print("Keyword:", keyword)
            return keyword
    raise ValueError("No 'Keyword:' or '키워드:' entry found in the summary")


def aladin_api(keyword, selected_option):
    """Search the Aladin book API for ``keyword`` within a category.

    Args:
        keyword: Search term.
        selected_option: One of the keys of ``ALADIN_CATEGORY_IDS``. Any other
            value (including ``None``) performs no request.

    Returns:
        A JSON string encoding a list with the API response (or an empty list
        when no request was made). The same JSON is written to
        ``temp/book.json``.
    """
    # NOTE(review): credential is hard-coded in source; it can be overridden
    # with the ALADIN_TTB_KEY environment variable.
    aladin_key = os.environ.get("ALADIN_TTB_KEY", "ttbkangmj08250027001")

    all_data = []
    category_id = ALADIN_CATEGORY_IDS.get(selected_option)
    if category_id is not None:
        print(keyword)
        # Send the request; `params` lets requests URL-encode the keyword.
        response = requests.get(
            ALADIN_SEARCH_URL,
            params={
                "ttbkey": aladin_key,
                "Query": keyword,
                "QueryType": "Keyword",
                "Cover": "Big",
                "MaxResults": 5,
                "start": 1,
                "SearchTarget": "Book",
                "output": "js",
                "Sort": "SalesPoint",
                "Version": "20131101",
                "CategoryId": category_id,
                "outofStockFilter": 1,
            },
            timeout=REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        all_data.append(json.loads(response.text))

    all_data = json.dumps(all_data, ensure_ascii=False, indent=4)
    with open(_temp_path("book.json"), "wb") as f:
        f.write(all_data.encode("utf-8"))
    print(all_data)
    return all_data


def _placeholder_image():
    """Return the "no data" placeholder image, or None if the file is absent."""
    try:
        return Image.open(NO_DATA_IMAGE)
    except FileNotFoundError:
        # NOTE(review): assumes the gr.Image output accepts None (empty
        # image) - confirm against the installed Gradio version.
        return None


def _book_fields(items, index):
    """Return ``(title, cover_image, link)`` for ``items[index]``.

    Falls back to "No Data" placeholders when the search returned fewer
    results than ``index + 1`` or the entry is empty.
    """
    if index >= len(items) or not items[index]:
        return NO_DATA_TEXT, _placeholder_image(), NO_DATA_TEXT

    book = items[index]
    cover_response = requests.get(book["cover"], timeout=REQUEST_TIMEOUT)
    cover_image = Image.open(BytesIO(cover_response.content))
    return book["title"], cover_image, book["link"]


def book_output(book_json):
    """Turn the Aladin JSON into titles, cover images and links for 3 books.

    Args:
        book_json: JSON string as returned by ``aladin_api``.

    Returns:
        ``(title1, image1, title2, image2, title3, image3,
        book_link1, book_link2, book_link3)``. Missing books are filled with
        "No Data" placeholders.
    """
    data = json.loads(book_json)
    # An empty list (no request made) or a response without "item" (e.g. an
    # API error object) both mean there are no books to show.
    items = data[0].get("item", []) if data else []

    (title1, image1, book_link1), (title2, image2, book_link2), (
        title3,
        image3,
        book_link3,
    ) = (_book_fields(items, index) for index in range(3))

    return (
        title1,
        image1,
        title2,
        image2,
        title3,
        image3,
        book_link1,
        book_link2,
        book_link3,
    )


def get_title(API_KEY, link, selected_option):
    """Run the full pipeline for the Gradio button.

    Args:
        API_KEY: OpenAI API key.
        link: YouTube video URL.
        selected_option: Book category chosen in the dropdown.

    Returns:
        The summary text followed by the nine values of ``book_output``.
    """
    split_docs, full_docs = youtube_text(link)
    result = youtube_sum(split_docs, full_docs, API_KEY)
    try:
        keyword = text_to_arr(result)
    except ValueError:
        # Still show the summary when the model omitted the keyword line;
        # an empty result list renders as "No Data" placeholders.
        all_data = "[]"
    else:
        all_data = aladin_api(keyword, selected_option)
    title1, image1, title2, image2, title3, image3, link1, link2, link3 = book_output(all_data)
    return result, title1, image1, title2, image2, title3, image3, link1, link2, link3


# Define the list of options for the Dropdown
options_list = ["사회", "과학", "소설", "금융"]

with gr.Blocks() as demo:
    gr.Markdown("Paste your Youtube Link and get the book recommandation")
    with gr.Column():
        with gr.Row():
            inp1 = gr.Textbox(label="Your OpenAI KEY")
            inp2 = gr.Textbox(label="Input Link")
            inp3 = gr.Dropdown(choices=options_list, label="Select a category")
        btn = gr.Button("Find the book")

    with gr.Column():
        out1 = gr.Textbox(label="Summary")
        with gr.Row():
            out2 = gr.Textbox(label="Title1")
            out4 = gr.Textbox(label="Title2")
            out6 = gr.Textbox(label="Title3")
        with gr.Row():
            out3 = gr.Image(label="Image1")
            out5 = gr.Image(label="Image2")
            out7 = gr.Image(label="Image3")
        with gr.Row():
            out8 = gr.HTML(label="Book Link1")
            out9 = gr.HTML(label="Book Link2")
            out10 = gr.HTML(label="Book Link3")
    btn.click(
        fn=get_title,
        inputs=[inp1, inp2, inp3],
        outputs=[out1, out2, out3, out4, out5, out6, out7, out8, out9, out10],
    )

demo.launch(share=True)