wower99 committed
Commit a46fd4b · 1 Parent(s): 125913a

video generation feature v1 is functional

Files changed (6)
  1. .streamlit/config.toml +2 -0
  2. app.py +102 -106
  3. constants.py +5 -1
  4. requirements.txt +83 -1
  5. structured_output_extractor.py +101 -0
  6. utils.py +148 -4
.streamlit/config.toml ADDED
@@ -0,0 +1,2 @@
+ [server]
+ maxUploadSize = 20
app.py CHANGED
@@ -1,26 +1,19 @@
  import streamlit as st
- import requests
- import io
- from gradio_client import Client, handle_file
- import tempfile
- import os
- from utils import clean_response, get_translation, get_image_prompts, generate_images, generate_video
  import constants


- # Initialize the client only once
- if 'client' not in st.session_state:
-     st.session_state.client = Client("habib926653/openai-whisper-large-v3-turbo", hf_token=constants.HF_TOKEN)
-
- # Initialize state variables
  if 'transcript_visible' not in st.session_state:
      st.session_state.transcript_visible = False
  if 'translation_visible' not in st.session_state:
      st.session_state.translation_visible = False
  if 'uploaded_file_name' not in st.session_state:
      st.session_state.uploaded_file_name = None
- if 'converted_audio' not in st.session_state:
-     st.session_state.converted_audio = None
  if 'was_converted' not in st.session_state:
      st.session_state.was_converted = False
  if 'transcript' not in st.session_state:
@@ -34,43 +27,34 @@ if 'image_prompts' not in st.session_state:
  if 'generated_images' not in st.session_state:
      st.session_state.generated_images = None

- # Function to convert the audio to MP3 using the external API
- def convert_to_mp3(audio_file):
-     if audio_file.name.endswith(".mp3"):
-         return audio_file, False  # File is already MP3
-     else:
-         # Send to the external converter API
-         url = constants.AUDIO_CONVERTER_ENDPOINT
-         files = {"file": (audio_file.name, audio_file, "audio/mp3")}
-
-         with st.spinner("Converting audio to MP3... Please wait."):
-             response = requests.post(url, files=files)
-
-         if response.status_code == 200:
-             # If conversion is successful, save and return the MP3 file
-             converted_file = io.BytesIO(response.content)
-             converted_file.name = "converted.mp3"
-             st.success("✅ File successfully converted to MP3!")
-             return converted_file, True  # File was converted
-         else:
-             st.error("❌ Conversion failed. Please try another format.")
-             return None, None

  # Streamlit UI
  st.markdown(
      "<h1 style='text-align: center;'>AI Video Generator</h1>",
      unsafe_allow_html=True
  )
- st.info("Video Generation Feature Currently Under Development")

  # Upload audio file
  audio_file = st.file_uploader("🔼 Upload your audio file:", type=constants.SUPPORTED_FORMATS)

  if audio_file:
      # Reset states only when a new file is uploaded
      if st.session_state.uploaded_file_name != audio_file.name:
          st.session_state.uploaded_file_name = audio_file.name
-         st.session_state.converted_audio, st.session_state.was_converted = convert_to_mp3(audio_file)
          st.session_state.transcript = None
          st.session_state.translation = None
          st.session_state.image_prompts = None
@@ -78,78 +62,90 @@ if audio_file:

      st.info(f"Uploaded file: **{audio_file.name}**")

-     if st.session_state.converted_audio:
-         if not st.session_state.was_converted:
-             st.success("🎧 The uploaded file is already in MP3 format.")
-         else:
-             st.success("✅ File successfully converted to MP3!")
-
-     # Transcription logic
-     if st.session_state.transcript is None:
-         with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
-             tmp_file.write(st.session_state.converted_audio.read())
-             tmp_file_path = tmp_file.name
-
-         with st.spinner("Transcribing audio... Please wait."):
-             result = st.session_state.client.predict(
-                 param_0=handle_file(tmp_file_path),
-                 api_name="/predict"
              )
-         st.session_state.transcript = clean_response(result)
-         os.remove(tmp_file_path)
-
-     # Translation logic
-     if st.session_state.transcript and st.session_state.translation is None:
-         with st.spinner("Generating translation... Please wait."):
-             st.session_state.translation = get_translation(st.session_state.transcript)
-
-     st.audio(st.session_state.converted_audio, format="audio/mp3")
-
-     # Toggle transcript visibility
-     toggle_transcript = st.checkbox("Show Transcript", value=st.session_state.transcript_visible)
-     st.session_state.transcript_visible = toggle_transcript
-
-     if st.session_state.transcript_visible:
-         st.write("### Transcription:")
-         st.write(st.session_state.transcript)
-
-     # Toggle translation visibility
-     toggle_translation = st.checkbox("Show Translation", value=st.session_state.translation_visible)
-     st.session_state.translation_visible = toggle_translation
-
-     if st.session_state.translation_visible:
-         st.write("### Translation:")
-         st.write(st.session_state.translation)
-
-     # Image generation logic
-     if st.session_state.translation and st.session_state.image_prompts is None:
-         with st.spinner("Generating image prompts... Please wait."):
-             if 'Already in English' in st.session_state.translation:
-                 st.info("Audio is Already in English. Using Transcription to generate Image Prompts")
-                 st.session_state.image_prompts = get_image_prompts(st.session_state.transcript)['image_prompts']
-             else:
-                 st.session_state.image_prompts = get_image_prompts(st.session_state.translation)['image_prompts']
-
-
-     # Ensure that generated_images is always a list
-     if 'generated_images' not in st.session_state or st.session_state.generated_images is None:
-         st.session_state.generated_images = []
-
-     # Generate images only if they have not been generated already
-     if st.session_state.image_prompts and not st.session_state.generated_images:
-         with st.spinner("Generating images... Please wait."):
-             for prompt, image_path in generate_images(st.session_state.image_prompts):
-                 # Display each image as soon as it's generated
-                 st.image(image_path, caption=f"{prompt}", use_container_width=True)
-                 # Append the generated image to the session state
-                 st.session_state.generated_images.append((prompt, image_path))
-
-     # Display all previously generated images (including newly generated ones)
-     else:
-         for prompt, image_path in st.session_state.generated_images:
-             # Display each image
-             st.image(image_path, caption=f"{prompt}", use_container_width=True)

  else:
      st.warning("Please upload an audio file to proceed.")

  import streamlit as st
+ from utils import get_translation, get_image_prompts, segments_to_chunks, generate_images, generate_video
  import constants
+ from groq import Groq

+ client = Groq()

+ # Initialize state variables if not already set
  if 'transcript_visible' not in st.session_state:
      st.session_state.transcript_visible = False
  if 'translation_visible' not in st.session_state:
      st.session_state.translation_visible = False
  if 'uploaded_file_name' not in st.session_state:
      st.session_state.uploaded_file_name = None
+ if 'audio' not in st.session_state:
+     st.session_state.audio = None
  if 'was_converted' not in st.session_state:
      st.session_state.was_converted = False
  if 'transcript' not in st.session_state:

  if 'generated_images' not in st.session_state:
      st.session_state.generated_images = None


  # Streamlit UI
  st.markdown(
      "<h1 style='text-align: center;'>AI Video Generator</h1>",
      unsafe_allow_html=True
  )
+ st.info("Video Generation Feature - Functional But Can be Buggy")

  # Upload audio file
  audio_file = st.file_uploader("🔼 Upload your audio file:", type=constants.SUPPORTED_FORMATS)

+ print(audio_file,'is the upload')
+
+ # if audio_file is not None:
+ #     # Check the duration of the uploaded audio file
+ #     duration = get_audio_duration(audio_file)
+
+ #     # Allow only files up to 5 minutes (300 seconds)
+ #     if duration > 300:
+ #         st.error("The uploaded audio file exceeds the 5-minute limit. Please upload a shorter file.")
+ #     else:
+ #         st.success(f"Audio file uploaded successfully! Duration: {duration/60:.2f} minutes")
+
  if audio_file:
      # Reset states only when a new file is uploaded
      if st.session_state.uploaded_file_name != audio_file.name:
          st.session_state.uploaded_file_name = audio_file.name
+         st.session_state.audio = audio_file
          st.session_state.transcript = None
          st.session_state.translation = None
          st.session_state.image_prompts = None

      st.info(f"Uploaded file: **{audio_file.name}**")

+     # Read the uploaded file's bytes and send to Groq API for transcription
+     file_bytes = audio_file.read()
+
+     # Create a transcription of the audio file using Groq API
+     result = client.audio.transcriptions.create(
+         file=(audio_file.name, file_bytes), # Send the audio file content directly to the API
+         model="whisper-large-v3-turbo", # Model to use for transcription
+         prompt="Specify context or spelling", # Optional context for better transcription accuracy
+         response_format="verbose_json", # Return detailed JSON response
+         temperature=0.0, # Control randomness in the transcription output
+     )
+     st.session_state.transcript = result.text
+     st.session_state.segments = result.segments
+
+     # Translation logic
+     if st.session_state.transcript and st.session_state.translation is None:
+         with st.spinner("Generating translation... Please wait."):
+             st.session_state.translation = get_translation(st.session_state.transcript)
+
+     st.audio(st.session_state.audio, format=f"audio/{audio_file.type}")
+
+     # Toggle transcript visibility
+     toggle_transcript = st.checkbox("Show Transcript", value=st.session_state.transcript_visible, key="toggle_transcript")
+     st.session_state.transcript_visible = toggle_transcript
+
+     if st.session_state.transcript_visible:
+         st.write("### Transcription:")
+         st.write(st.session_state.transcript)
+
+     # Toggle translation visibility
+     toggle_translation = st.checkbox("Show Translation", value=st.session_state.translation_visible, key="toggle_translation")
+     st.session_state.translation_visible = toggle_translation
+
+     if st.session_state.translation_visible:
+         st.write("### Translation:")
+         st.write(st.session_state.translation)
+
+     # Image generation logic
+     if st.session_state.translation and st.session_state.image_prompts is None:
+         with st.spinner("Generating image prompts... Please wait."):
+             if 'Already in English' in st.session_state.translation:
+                 st.info("Audio is Already in English. Using Transcription to generate Image Prompts")
+                 st.session_state.image_prompts = get_image_prompts(segments_to_chunks(st.session_state.segments))['image_prompts']
+             else:
+                 st.session_state.image_prompts = get_image_prompts(segments_to_chunks(st.session_state.segments))['image_prompts']
+
+     print(st.session_state.image_prompts)
+     # Ensure that generated_images is always a list
+     if 'generated_images' not in st.session_state or st.session_state.generated_images is None:
+         st.session_state.generated_images = []
+
+     # Generate images only if they have not been generated already
+     if st.session_state.image_prompts and not st.session_state.generated_images:
+         with st.spinner("Generating images... Please wait."):
+             for prompt, image_path in generate_images(st.session_state.image_prompts):
+                 # # Display each image as soon as it's generated
+                 # st.image(image_path, caption=f"{prompt}", use_container_width=True)
+                 # Append the generated image to the session state
+                 st.session_state.generated_images.append((prompt, image_path))
+
+     # # Display all previously generated images (including newly generated ones)
+     # else:
+     #     for prompt, image_path in st.session_state.generated_images:
+     #         st.image(image_path, caption=f"{prompt}", use_container_width=True)
+
+     # Generate video when all images are generated
+     if st.session_state.generated_images and st.session_state.audio:
+         if st.button("Generate Video"):
+             with st.spinner("Generating video... Please wait."):
+                 # Map images to segments
+                 image_paths = [img[1] for img in st.session_state.generated_images]
+                 generated_video_path = generate_video(
+                     audio_file=st.session_state.audio,
+                     images=image_paths,
+                     segments=st.session_state.segments
                  )
+                 st.session_state.generated_video = generated_video_path
+                 st.success("Video generated successfully!")
+
+     # Display the generated video
+     if st.session_state.generated_video:
+         st.video(st.session_state.generated_video)

  else:
      st.warning("Please upload an audio file to proceed.")

+
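Downstream, segments_to_chunks and generate_video in utils.py only rely on each Whisper segment exposing text, start and end fields. A minimal sketch of the segment shape this code assumes from the verbose_json response (values are illustrative placeholders, not real output):

segments = [
    {"text": "First sentence of the audio.", "start": 0.0, "end": 4.2},
    {"text": "Second sentence of the audio.", "start": 4.2, "end": 9.8},
]
chunks = segments_to_chunks(segments)  # -> ["First sentence of the audio.", "Second sentence of the audio."]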
constants.py CHANGED
@@ -4,6 +4,8 @@ import os
  load_dotenv()

  HF_TOKEN = os.getenv("HF_TOKEN", None)
  AUDIO_CONVERTER_ENDPOINT="https://audio-converter-api-587c.onrender.com/convert/mp3"


@@ -12,4 +14,6 @@ PROMPT_GENERATION_ENDPOINT="https://habib926653-text-translator-agent-api.hf.spa
  IMAGE_GENERATION_SPACE_NAME="habib926653/stabilityai-stable-diffusion-3.5-large-turbo"

  # Supported formats
- SUPPORTED_FORMATS = ["mp3", "wav", "ogg", "flac", "aac", "m4a"]

  load_dotenv()

  HF_TOKEN = os.getenv("HF_TOKEN", None)
+ GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
+
  AUDIO_CONVERTER_ENDPOINT="https://audio-converter-api-587c.onrender.com/convert/mp3"


  IMAGE_GENERATION_SPACE_NAME="habib926653/stabilityai-stable-diffusion-3.5-large-turbo"

  # Supported formats
+ SUPPORTED_FORMATS = ["mp3", "wav", "ogg", "flac", "aac", "m4a"]
+
+
requirements.txt CHANGED
@@ -1,3 +1,85 @@
  python-dotenv==1.0.1
  streamlit==1.41.1
- gradio_client==1.5.2

+ altair==5.5.0
+ annotated-types==0.7.0
+ anyio==4.8.0
+ attrs==24.3.0
+ audeer==2.2.1
+ audiofile==1.5.1
+ audmath==1.4.1
+ blinker==1.9.0
+ cachetools==5.5.0
+ certifi==2024.12.14
+ cffi==1.17.1
+ charset-normalizer==3.4.1
+ click==8.1.8
+ decorator==4.4.2
+ distro==1.9.0
+ exceptiongroup==1.2.2
+ filelock==3.16.1
+ fsspec==2024.12.0
+ gitdb==4.0.12
+ GitPython==3.1.44
+ gradio_client==1.5.4
+ groq==0.15.0
+ h11==0.14.0
+ httpcore==1.0.7
+ httpx==0.28.1
+ huggingface-hub==0.27.1
+ idna==3.10
+ imageio==2.36.1
+ imageio-ffmpeg==0.5.1
+ Jinja2==3.1.5
+ jsonpatch==1.33
+ jsonpointer==3.0.0
+ jsonschema==4.23.0
+ jsonschema-specifications==2024.10.1
+ langchain-core==0.3.29
+ langchain-groq==0.2.3
+ langgraph==0.2.62
+ langgraph-checkpoint==2.0.9
+ langgraph-sdk==0.1.51
+ langsmith==0.2.10
+ markdown-it-py==3.0.0
+ MarkupSafe==3.0.2
+ mdurl==0.1.2
+ moviepy==1.0.3
+ msgpack==1.1.0
+ narwhals==1.21.1
+ numpy==2.2.1
+ opencv-python==4.10.0.84
+ orjson==3.10.14
+ packaging==24.2
+ pandas==2.2.3
+ pillow==11.1.0
+ proglog==0.1.10
+ protobuf==5.29.3
+ pyarrow==18.1.0
+ pycparser==2.22
+ pydantic==2.10.5
+ pydantic_core==2.27.2
+ pydeck==0.9.1
+ pydub==0.25.1
+ Pygments==2.19.1
+ python-dateutil==2.9.0.post0
  python-dotenv==1.0.1
+ pytz==2024.2
+ PyYAML==6.0.2
+ referencing==0.35.1
+ requests==2.32.3
+ requests-toolbelt==1.0.0
+ rich==13.9.4
+ rpds-py==0.22.3
+ scipy==1.15.1
+ six==1.17.0
+ smmap==5.0.2
+ sniffio==1.3.1
+ soundfile==0.13.0
  streamlit==1.41.1
+ tenacity==9.0.0
+ toml==0.10.2
+ tornado==6.4.2
+ tqdm==4.67.1
+ typing_extensions==4.12.2
+ tzdata==2024.2
+ urllib3==2.3.0
+ watchdog==6.0.0
+ websockets==14.1
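One note on the pins: the video helpers added to utils.py below use the MoviePy 1.x API (the moviepy.editor module and the set_audio / set_start / resize methods), which MoviePy 2.x renamed or removed, so the moviepy==1.0.3 pin matters. A quick sanity check (a sketch, not part of this commit):

import moviepy.editor  # resolves on moviepy 1.0.3; this module no longer exists in MoviePy 2.x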
structured_output_extractor.py ADDED
@@ -0,0 +1,101 @@
+ from typing import Type, Optional
+ from pydantic import BaseModel
+ from langgraph.graph import StateGraph, START, END
+ from typing import TypedDict
+ import constants  # Assuming constants.py holds LLM provider configurations
+ from langchain_groq import ChatGroq
+
+
+ # Define the State structure (similar to previous definition)
+ class State(TypedDict):
+     messages: list
+     output: Optional[BaseModel]
+
+
+ # Generic Pydantic model-based structured output extractor
+ class StructuredOutputExtractor:
+     def __init__(self, response_schema: Type[BaseModel]):
+         """
+         Initializes the extractor for any given structured output model.
+
+         :param response_schema: Pydantic model class used for structured output extraction
+         """
+         self.response_schema = response_schema
+
+         # Initialize language model (provider and API keys come from constants.py)
+         self.llm = ChatGroq(model="llama-3.3-70b-versatile")
+
+         # Bind the model with structured output capability
+         self.structured_llm = self.llm.with_structured_output(response_schema)
+
+         # Build the graph for structured output
+         self._build_graph()
+
+     def _build_graph(self):
+         """
+         Build the LangGraph computational graph for structured extraction.
+         """
+         graph_builder = StateGraph(State)
+
+         # Add nodes and edges for structured output
+         graph_builder.add_node("extract", self._extract_structured_info)
+         graph_builder.add_edge(START, "extract")
+         graph_builder.add_edge("extract", END)
+
+         self.graph = graph_builder.compile()
+
+     def _extract_structured_info(self, state: dict):
+         """
+         Extract structured information using the specified response model.
+
+         :param state: Current graph state
+         :return: Updated state with structured output
+         """
+         query = state['messages'][-1].content
+         print(f"Processing query: {query}")
+         try:
+             # Extract details using the structured model
+             output = self.structured_llm.invoke(query)
+             # Return the structured response
+             return {"output": output}
+         except Exception as e:
+             print(f"Error during extraction: {e}")
+             return {"output": None}
+
+     def extract(self, query: str) -> Optional[BaseModel]:
+         """
+         Public method to extract structured information.
+
+         :param query: Input query for structured output extraction
+         :return: Structured model object or None
+         """
+         from langchain_core.messages import SystemMessage
+
+         result = self.graph.invoke({
+             "messages": [SystemMessage(content=query)]
+         })
+         # Return the structured model response, if available
+         result = result.get('output')
+         return result
+
+
+ if __name__ == '__main__':
+
+     # Example Pydantic model (e.g., Movie)
+     class Movie(BaseModel):
+         title: str
+         year: int
+         genre: str
+         rating: Optional[float] = None
+         actors: list[str] = []
+
+
+     # Example usage with a generic structured extractor
+     extractor = StructuredOutputExtractor(response_schema=Movie)
+
+     query = "Tell me about the movie Inception. Provide details about its title, year, genre, rating, and main actors."
+
+     result = extractor.extract(query)
+     print(type(result))
+     if result:
+         print(result)
utils.py CHANGED
@@ -4,6 +4,14 @@ import constants
  import os
  from PIL import Image
  from gradio_client import Client


  def clean_response(result):
@@ -48,7 +56,7 @@ def get_translation(text: str):



- def get_image_prompts(text_input):
      headers = {
          "Authorization": f"Bearer {constants.HF_TOKEN}", # Replace with your token
          "Content-Type": "application/json" # Optional, ensures JSON payload
@@ -73,6 +81,29 @@ def get_image_prompts(text_input):
      print(f"Error during request: {e}")
      return {"error": str(e)}




@@ -126,11 +157,124 @@ def tmp_folder(folder_name: str) -> str:
      return folder_path


- def generate_video(image_folder, audio):
-     return os.path.join(os.getcwd(), "test.mp4")


  # Example usage:
  if __name__ == "__main__":
      result = generate_images(["a guy in jungle", "a waterfall","greenery"])
-     print(result,'is the result')
  import os
  from PIL import Image
  from gradio_client import Client
+ import moviepy.editor as mp
+ from moviepy.video.VideoClip import ImageClip
+ from moviepy.editor import AudioFileClip
+ from structured_output_extractor import StructuredOutputExtractor
+ from pydantic import BaseModel, Field
+ from typing import List
+ import tempfile
+ import os


  def clean_response(result):




+ def old_get_image_prompts(text_input):
      headers = {
          "Authorization": f"Bearer {constants.HF_TOKEN}", # Replace with your token
          "Content-Type": "application/json" # Optional, ensures JSON payload

      print(f"Error during request: {e}")
      return {"error": str(e)}

+ def segments_to_chunks(segments):
+     chunks = []
+     for segment in segments:
+         chunks.append(segment.get("text"))
+     return chunks
+
+
+ def get_image_prompts(text_input : List):
+     # Example Pydantic model (e.g., Movie)
+     class ImagePromptResponseSchema(BaseModel):
+         image_prompts: List[str] = Field(
+             description="List of detailed image prompts, Each Image Prompt Per Chunk"
+         )
+
+     extractor = StructuredOutputExtractor(response_schema=ImagePromptResponseSchema)
+     chunks_count = len(text_input)
+     chunks = "chunk: " + "\nchunk: ".join(text_input)
+     prompt = f"""ROLE: You are a Highly Experienced Image Prompt Sythesizer
+     TASK: Generate {chunks_count} image prompts, Each per chunk\n\n {chunks}"""
+     result = extractor.extract(prompt)
+     return result.model_dump() # returns dictionary version pydantic model
+
+




      return folder_path


+
+ def old_generate_video(audio_file, images, segments):
+     print(f"images: {images}")
+     print(f"segments: {segments}")
+     print(f"audio file: {audio_file.name}")
+     try:
+         # Save the uploaded audio file to a temporary location
+         file_extension = os.path.splitext(audio_file.name)[1]
+         temp_audio_path = tempfile.NamedTemporaryFile(delete=False, suffix=f"{file_extension}")
+         temp_audio_path.write(audio_file.read())
+         temp_audio_path.close()
+
+         # Load the audio file using MoviePy
+         audio = mp.AudioFileClip(temp_audio_path.name)
+         audio_duration = audio.duration
+
+         # Create video clips for each segment using the corresponding image
+         video_clips = []
+         for i, segment in enumerate(segments):
+             start_time = segment["start"]
+             end_time = segment["end"]
+
+             # Ensure the image index is within bounds
+             image_path = images[min(i, len(images) - 1)]
+
+             # Create an ImageClip for the current segment
+             image_clip = ImageClip(image_path, duration=end_time - start_time)
+             image_clip = image_clip.set_start(start_time).set_end(end_time)
+             video_clips.append(image_clip)
+
+         # Concatenate all the image clips to form the video
+         video = mp.concatenate_videoclips(video_clips, method="compose")
+
+         # Add the audio to the video
+         video = video.set_audio(audio)
+
+         # Save the video to a temporary file
+         temp_dir = tempfile.gettempdir()
+         video_path = os.path.join(temp_dir, "generated_video.mp4")
+         video.write_videofile(video_path, fps=24, codec="libx264", audio_codec="aac")
+
+         # Clean up the temporary audio file
+         os.remove(temp_audio_path.name)
+
+         return video_path
+
+     except Exception as e:
+         print(f"Error generating video: {e}")
+         return
+
+
+ from moviepy.editor import *
+
+ def generate_video(audio_file, images, segments):
+     print(f"images: {images}")
+     print(f"segments: {segments}")
+     print(f"audio file: {audio_file.name}")
+     try:
+         # Save the uploaded audio file to a temporary location
+         file_extension = os.path.splitext(audio_file.name)[1]
+         temp_audio_path = tempfile.NamedTemporaryFile(delete=False, suffix=f"{file_extension}")
+         temp_audio_path.write(audio_file.read())
+         temp_audio_path.close()
+
+         # Load the audio file using MoviePy
+         audio = AudioFileClip(temp_audio_path.name)
+         audio_duration = audio.duration
+
+         # Define YouTube-like dimensions (16:9 aspect ratio, e.g., 1920x1080)
+         frame_width = 1920
+         frame_height = 1080
+
+         # Create video clips for each segment using the corresponding image
+         video_clips = []
+         for i, segment in enumerate(segments):
+             start_time = segment["start"]
+             end_time = segment["end"]
+
+             # Ensure the image index is within bounds
+             image_path = images[min(i, len(images) - 1)]
+
+             # Create an ImageClip for the current segment
+             image_clip = ImageClip(image_path, duration=end_time - start_time)
+
+             # Resize and pad the image to fit a 16:9 aspect ratio
+             image_clip = image_clip.resize(height=frame_height).on_color(
+                 size=(frame_width, frame_height),
+                 color=(0, 0, 0), # Black background
+                 pos="center" # Center the image
+             )
+
+             # Set the timing of the clip
+             image_clip = image_clip.set_start(start_time).set_end(end_time)
+             video_clips.append(image_clip)
+
+         # Concatenate all the image clips to form the video
+         video = concatenate_videoclips(video_clips, method="compose")
+
+         # Add the audio to the video
+         video = video.set_audio(audio)
+
+         # Save the video to a temporary file
+         temp_dir = tempfile.gettempdir()
+         video_path = os.path.join(temp_dir, "generated_video.mp4")
+         video.write_videofile(video_path, fps=24, codec="libx264", audio_codec="aac")
+
+         # Clean up the temporary audio file
+         os.remove(temp_audio_path.name)
+
+         return video_path
+
+     except Exception as e:
+         print(f"Error generating video: {e}")
+         return


  # Example usage:
  if __name__ == "__main__":
      result = generate_images(["a guy in jungle", "a waterfall","greenery"])
+
+
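For quick manual testing outside Streamlit, the new helpers can be chained directly. A rough sketch, assuming GROQ_API_KEY and HF_TOKEN are set in the environment and that a local sample.mp3 exists; the file name and segment timings are placeholders, not part of this commit:

from utils import get_image_prompts, generate_images, generate_video

segments = [
    {"text": "A hiker walks through a jungle.", "start": 0.0, "end": 5.0},
    {"text": "A waterfall appears behind the trees.", "start": 5.0, "end": 10.0},
]
prompts = get_image_prompts([s["text"] for s in segments])["image_prompts"]
images = [path for _, path in generate_images(prompts)]  # generate_images yields (prompt, image_path) pairs
print(generate_video(audio_file=open("sample.mp3", "rb"), images=images, segments=segments))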