video generation feature v1 is functional

Changed files:
- .streamlit/config.toml +2 -0
- app.py +102 -106
- constants.py +5 -1
- requirements.txt +83 -1
- structured_output_extractor.py +101 -0
- utils.py +148 -4
.streamlit/config.toml
ADDED
@@ -0,0 +1,2 @@
[server]
maxUploadSize = 20
app.py
CHANGED
@@ -1,26 +1,19 @@
 import streamlit as st
-import requests
-import io
-from gradio_client import Client, handle_file
-import tempfile
-import os
-from utils import clean_response, get_translation, get_image_prompts, generate_images, generate_video
+from utils import get_translation, get_image_prompts, segments_to_chunks, generate_images, generate_video
 import constants
+from groq import Groq
 
+client = Groq()
 
-# Initialize
-if 'client' not in st.session_state:
-    st.session_state.client = Client("habib926653/openai-whisper-large-v3-turbo", hf_token=constants.HF_TOKEN)
-
-# Initialize state variables
+# Initialize state variables if not already set
 if 'transcript_visible' not in st.session_state:
     st.session_state.transcript_visible = False
 if 'translation_visible' not in st.session_state:
     st.session_state.translation_visible = False
 if 'uploaded_file_name' not in st.session_state:
     st.session_state.uploaded_file_name = None
-if 'converted_audio' not in st.session_state:
-    st.session_state.converted_audio = None
+if 'audio' not in st.session_state:
+    st.session_state.audio = None
 if 'was_converted' not in st.session_state:
     st.session_state.was_converted = False
 if 'transcript' not in st.session_state:
@@ -34,43 +27,34 @@ if 'image_prompts' not in st.session_state:
 if 'generated_images' not in st.session_state:
     st.session_state.generated_images = None
 
-# Function to convert the audio to MP3 using the external API
-def convert_to_mp3(audio_file):
-    if audio_file.name.endswith(".mp3"):
-        return audio_file, False  # File is already MP3
-    else:
-        # Send to the external converter API
-        url = constants.AUDIO_CONVERTER_ENDPOINT
-        files = {"file": (audio_file.name, audio_file, "audio/mp3")}
-
-        with st.spinner("Converting audio to MP3... Please wait."):
-            response = requests.post(url, files=files)
-
-        if response.status_code == 200:
-            # If conversion is successful, save and return the MP3 file
-            converted_file = io.BytesIO(response.content)
-            converted_file.name = "converted.mp3"
-            st.success("✅ File successfully converted to MP3!")
-            return converted_file, True  # File was converted
-        else:
-            st.error("❌ Conversion failed. Please try another format.")
-            return None, None
 
 # Streamlit UI
 st.markdown(
     "<h1 style='text-align: center;'>AI Video Generator</h1>",
     unsafe_allow_html=True
 )
-st.info("Video Generation Feature
+st.info("Video Generation Feature - Functional But Can be Buggy")
 
 # Upload audio file
 audio_file = st.file_uploader("🔼 Upload your audio file:", type=constants.SUPPORTED_FORMATS)
 
+print(audio_file,'is the upload')
+
+# if audio_file is not None:
+#     # Check the duration of the uploaded audio file
+#     duration = get_audio_duration(audio_file)
+
+#     # Allow only files up to 5 minutes (300 seconds)
+#     if duration > 300:
+#         st.error("The uploaded audio file exceeds the 5-minute limit. Please upload a shorter file.")
+#     else:
+#         st.success(f"Audio file uploaded successfully! Duration: {duration/60:.2f} minutes")
+
 if audio_file:
     # Reset states only when a new file is uploaded
     if st.session_state.uploaded_file_name != audio_file.name:
         st.session_state.uploaded_file_name = audio_file.name
-        st.session_state.
+        st.session_state.audio = audio_file
         st.session_state.transcript = None
         st.session_state.translation = None
         st.session_state.image_prompts = None
@@ -78,78 +62,90 @@ if audio_file:
 
     st.info(f"Uploaded file: **{audio_file.name}**")
 
-    st.session_state.translation = get_translation(st.session_state.transcript)
-
-    st.audio(st.session_state.converted_audio, format="audio/mp3")
-
-    # Toggle transcript visibility
-    toggle_transcript = st.checkbox("Show Transcript", value=st.session_state.transcript_visible)
-    st.session_state.transcript_visible = toggle_transcript
-
-    if st.session_state.transcript_visible:
-        st.write("### Transcription:")
-        st.write(st.session_state.transcript)
-
-    # Toggle translation visibility
-    toggle_translation = st.checkbox("Show Translation", value=st.session_state.translation_visible)
-    st.session_state.translation_visible = toggle_translation
-
-    if st.session_state.translation_visible:
-        st.write("### Translation:")
-        st.write(st.session_state.translation)
-
-    # Image generation logic
-    if st.session_state.translation and st.session_state.image_prompts is None:
-        with st.spinner("Generating image prompts... Please wait."):
-            if 'Already in English' in st.session_state.translation:
-                st.info("Audio is Already in English. Using Transcription to generate Image Prompts")
-                st.session_state.image_prompts = get_image_prompts(st.session_state.transcript)['image_prompts']
-            else:
-                st.session_state.image_prompts = get_image_prompts(st.session_state.translation)['image_prompts']
-
-    # Ensure that generated_images is always a list
-    if 'generated_images' not in st.session_state or st.session_state.generated_images is None:
-        st.session_state.generated_images = []
-
-    # Generate images only if they have not been generated already
-    if st.session_state.image_prompts and not st.session_state.generated_images:
-        with st.spinner("Generating images... Please wait."):
-            for prompt, image_path in generate_images(st.session_state.image_prompts):
-                # Display each image as soon as it's generated
-                st.image(image_path, caption=f"{prompt}", use_container_width=True)
-                # Append the generated image to the session state
-                st.session_state.generated_images.append((prompt, image_path))
-
-    # Display all previously generated images (including newly generated ones)
-    else:
-        for prompt, image_path in st.session_state.generated_images:
-            # Display each image
-            st.image(image_path, caption=f"{prompt}", use_container_width=True)
+    # Read the uploaded file's bytes and send to Groq API for transcription
+    file_bytes = audio_file.read()
+
+    # Create a transcription of the audio file using Groq API
+    result = client.audio.transcriptions.create(
+        file=(audio_file.name, file_bytes),  # Send the audio file content directly to the API
+        model="whisper-large-v3-turbo",  # Model to use for transcription
+        prompt="Specify context or spelling",  # Optional context for better transcription accuracy
+        response_format="verbose_json",  # Return detailed JSON response
+        temperature=0.0,  # Control randomness in the transcription output
+    )
+    st.session_state.transcript = result.text
+    st.session_state.segments = result.segments
+
+    # Translation logic
+    if st.session_state.transcript and st.session_state.translation is None:
+        with st.spinner("Generating translation... Please wait."):
+            st.session_state.translation = get_translation(st.session_state.transcript)
+
+    st.audio(st.session_state.audio, format=f"audio/{audio_file.type}")
+
+    # Toggle transcript visibility
+    toggle_transcript = st.checkbox("Show Transcript", value=st.session_state.transcript_visible, key="toggle_transcript")
+    st.session_state.transcript_visible = toggle_transcript
+
+    if st.session_state.transcript_visible:
+        st.write("### Transcription:")
+        st.write(st.session_state.transcript)
+
+    # Toggle translation visibility
+    toggle_translation = st.checkbox("Show Translation", value=st.session_state.translation_visible, key="toggle_translation")
+    st.session_state.translation_visible = toggle_translation
+
+    if st.session_state.translation_visible:
+        st.write("### Translation:")
+        st.write(st.session_state.translation)
+
+    # Image generation logic
+    if st.session_state.translation and st.session_state.image_prompts is None:
+        with st.spinner("Generating image prompts... Please wait."):
+            if 'Already in English' in st.session_state.translation:
+                st.info("Audio is Already in English. Using Transcription to generate Image Prompts")
+                st.session_state.image_prompts = get_image_prompts(segments_to_chunks(st.session_state.segments))['image_prompts']
+            else:
+                st.session_state.image_prompts = get_image_prompts(segments_to_chunks(st.session_state.segments))['image_prompts']
+
+    print(st.session_state.image_prompts)
+    # Ensure that generated_images is always a list
+    if 'generated_images' not in st.session_state or st.session_state.generated_images is None:
+        st.session_state.generated_images = []
+
+    # Generate images only if they have not been generated already
+    if st.session_state.image_prompts and not st.session_state.generated_images:
+        with st.spinner("Generating images... Please wait."):
+            for prompt, image_path in generate_images(st.session_state.image_prompts):
+                # # Display each image as soon as it's generated
+                # st.image(image_path, caption=f"{prompt}", use_container_width=True)
+                # Append the generated image to the session state
+                st.session_state.generated_images.append((prompt, image_path))
+
+    # # Display all previously generated images (including newly generated ones)
+    # else:
+    #     for prompt, image_path in st.session_state.generated_images:
+    #         st.image(image_path, caption=f"{prompt}", use_container_width=True)
+
+    # Generate video when all images are generated
+    if st.session_state.generated_images and st.session_state.audio:
+        if st.button("Generate Video"):
+            with st.spinner("Generating video... Please wait."):
+                # Map images to segments
+                image_paths = [img[1] for img in st.session_state.generated_images]
+                generated_video_path = generate_video(
+                    audio_file=st.session_state.audio,
+                    images=image_paths,
+                    segments=st.session_state.segments
+                )
+                st.session_state.generated_video = generated_video_path
+                st.success("Video generated successfully!")
+
+    # Display the generated video
+    if st.session_state.generated_video:
+        st.video(st.session_state.generated_video)
 
 else:
     st.warning("Please upload an audio file to proceed.")
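Note on the transcription result: the rest of the app only depends on each entry in result.segments exposing "start", "end", and "text". A minimal sketch of how the new segments_to_chunks helper consumes them, using hand-written segments as stand-ins for real Groq verbose_json output:

# Illustrative only: hand-written stand-ins for result.segments, not real Groq output.
example_segments = [
    {"start": 0.0, "end": 4.2, "text": "A man walks through a dense jungle."},
    {"start": 4.2, "end": 9.8, "text": "He reaches a waterfall surrounded by greenery."},
]

def segments_to_chunks(segments):
    # Mirrors the helper added in utils.py: one text chunk per segment.
    return [segment.get("text") for segment in segments]

print(segments_to_chunks(example_segments))
# ['A man walks through a dense jungle.', 'He reaches a waterfall surrounded by greenery.']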
constants.py
CHANGED
@@ -4,6 +4,8 @@ import os
 load_dotenv()
 
 HF_TOKEN = os.getenv("HF_TOKEN", None)
+GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
+
 AUDIO_CONVERTER_ENDPOINT="https://audio-converter-api-587c.onrender.com/convert/mp3"
 
 
@@ -12,4 +14,6 @@ PROMPT_GENERATION_ENDPOINT="https://habib926653-text-translator-agent-api.hf.spa
 IMAGE_GENERATION_SPACE_NAME="habib926653/stabilityai-stable-diffusion-3.5-large-turbo"
 
 # Supported formats
-SUPPORTED_FORMATS = ["mp3", "wav", "ogg", "flac", "aac", "m4a"]
+SUPPORTED_FORMATS = ["mp3", "wav", "ogg", "flac", "aac", "m4a"]
+
+
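Since constants.py calls load_dotenv(), local runs typically keep these keys in a .env file next to the code. A hypothetical example with placeholder values follows; GROQ_API_KEY is an assumption here, read from the environment by the Groq and ChatGroq clients used elsewhere in this commit rather than by constants.py itself:

# Hypothetical .env for local development; all values are placeholders
HF_TOKEN=hf_xxxxxxxxxxxxxxxx
GEMINI_API_KEY=your-gemini-key-here
GROQ_API_KEY=gsk_xxxxxxxxxxxxxxxx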
requirements.txt
CHANGED
@@ -1,3 +1,85 @@
+altair==5.5.0
+annotated-types==0.7.0
+anyio==4.8.0
+attrs==24.3.0
+audeer==2.2.1
+audiofile==1.5.1
+audmath==1.4.1
+blinker==1.9.0
+cachetools==5.5.0
+certifi==2024.12.14
+cffi==1.17.1
+charset-normalizer==3.4.1
+click==8.1.8
+decorator==4.4.2
+distro==1.9.0
+exceptiongroup==1.2.2
+filelock==3.16.1
+fsspec==2024.12.0
+gitdb==4.0.12
+GitPython==3.1.44
+gradio_client==1.5.4
+groq==0.15.0
+h11==0.14.0
+httpcore==1.0.7
+httpx==0.28.1
+huggingface-hub==0.27.1
+idna==3.10
+imageio==2.36.1
+imageio-ffmpeg==0.5.1
+Jinja2==3.1.5
+jsonpatch==1.33
+jsonpointer==3.0.0
+jsonschema==4.23.0
+jsonschema-specifications==2024.10.1
+langchain-core==0.3.29
+langchain-groq==0.2.3
+langgraph==0.2.62
+langgraph-checkpoint==2.0.9
+langgraph-sdk==0.1.51
+langsmith==0.2.10
+markdown-it-py==3.0.0
+MarkupSafe==3.0.2
+mdurl==0.1.2
+moviepy==1.0.3
+msgpack==1.1.0
+narwhals==1.21.1
+numpy==2.2.1
+opencv-python==4.10.0.84
+orjson==3.10.14
+packaging==24.2
+pandas==2.2.3
+pillow==11.1.0
+proglog==0.1.10
+protobuf==5.29.3
+pyarrow==18.1.0
+pycparser==2.22
+pydantic==2.10.5
+pydantic_core==2.27.2
+pydeck==0.9.1
+pydub==0.25.1
+Pygments==2.19.1
+python-dateutil==2.9.0.post0
 python-dotenv==1.0.1
+pytz==2024.2
+PyYAML==6.0.2
+referencing==0.35.1
+requests==2.32.3
+requests-toolbelt==1.0.0
+rich==13.9.4
+rpds-py==0.22.3
+scipy==1.15.1
+six==1.17.0
+smmap==5.0.2
+sniffio==1.3.1
+soundfile==0.13.0
 streamlit==1.41.1
+tenacity==9.0.0
+toml==0.10.2
+tornado==6.4.2
+tqdm==4.67.1
+typing_extensions==4.12.2
+tzdata==2024.2
+urllib3==2.3.0
+watchdog==6.0.0
+websockets==14.1
structured_output_extractor.py
ADDED
@@ -0,0 +1,101 @@
from typing import Type, Optional
from pydantic import BaseModel
from langgraph.graph import StateGraph, START, END
from typing import TypedDict
import constants  # Assuming constants.py holds LLM provider configurations
from langchain_groq import ChatGroq


# Define the State structure (similar to previous definition)
class State(TypedDict):
    messages: list
    output: Optional[BaseModel]


# Generic Pydantic model-based structured output extractor
class StructuredOutputExtractor:
    def __init__(self, response_schema: Type[BaseModel]):
        """
        Initializes the extractor for any given structured output model.

        :param response_schema: Pydantic model class used for structured output extraction
        """
        self.response_schema = response_schema

        # Initialize language model (provider and API keys come from constants.py)
        self.llm = ChatGroq(model="llama-3.3-70b-versatile")

        # Bind the model with structured output capability
        self.structured_llm = self.llm.with_structured_output(response_schema)

        # Build the graph for structured output
        self._build_graph()

    def _build_graph(self):
        """
        Build the LangGraph computational graph for structured extraction.
        """
        graph_builder = StateGraph(State)

        # Add nodes and edges for structured output
        graph_builder.add_node("extract", self._extract_structured_info)
        graph_builder.add_edge(START, "extract")
        graph_builder.add_edge("extract", END)

        self.graph = graph_builder.compile()

    def _extract_structured_info(self, state: dict):
        """
        Extract structured information using the specified response model.

        :param state: Current graph state
        :return: Updated state with structured output
        """
        query = state['messages'][-1].content
        print(f"Processing query: {query}")
        try:
            # Extract details using the structured model
            output = self.structured_llm.invoke(query)
            # Return the structured response
            return {"output": output}
        except Exception as e:
            print(f"Error during extraction: {e}")
            return {"output": None}

    def extract(self, query: str) -> Optional[BaseModel]:
        """
        Public method to extract structured information.

        :param query: Input query for structured output extraction
        :return: Structured model object or None
        """
        from langchain_core.messages import SystemMessage

        result = self.graph.invoke({
            "messages": [SystemMessage(content=query)]
        })
        # Return the structured model response, if available
        result = result.get('output')
        return result


if __name__ == '__main__':

    # Example Pydantic model (e.g., Movie)
    class Movie(BaseModel):
        title: str
        year: int
        genre: str
        rating: Optional[float] = None
        actors: list[str] = []


    # Example usage with a generic structured extractor
    extractor = StructuredOutputExtractor(response_schema=Movie)

    query = "Tell me about the movie Inception. Provide details about its title, year, genre, rating, and main actors."

    result = extractor.extract(query)
    print(type(result))
    if result:
        print(result)
utils.py
CHANGED
@@ -4,6 +4,14 @@ import constants
 import os
 from PIL import Image
 from gradio_client import Client
+import moviepy.editor as mp
+from moviepy.video.VideoClip import ImageClip
+from moviepy.editor import AudioFileClip
+from structured_output_extractor import StructuredOutputExtractor
+from pydantic import BaseModel, Field
+from typing import List
+import tempfile
+import os
 
 
 def clean_response(result):
@@ -48,7 +56,7 @@ def get_translation(text: str):
 
 
 
-def get_image_prompts(text_input):
+def old_get_image_prompts(text_input):
     headers = {
         "Authorization": f"Bearer {constants.HF_TOKEN}",  # Replace with your token
         "Content-Type": "application/json"  # Optional, ensures JSON payload
@@ -73,6 +81,29 @@ def get_image_prompts(text_input)
         print(f"Error during request: {e}")
         return {"error": str(e)}
 
+def segments_to_chunks(segments):
+    chunks = []
+    for segment in segments:
+        chunks.append(segment.get("text"))
+    return chunks
+
+
+def get_image_prompts(text_input : List):
+    # Example Pydantic model (e.g., Movie)
+    class ImagePromptResponseSchema(BaseModel):
+        image_prompts: List[str] = Field(
+            description="List of detailed image prompts, Each Image Prompt Per Chunk"
+        )
+
+    extractor = StructuredOutputExtractor(response_schema=ImagePromptResponseSchema)
+    chunks_count = len(text_input)
+    chunks = "chunk: " + "\nchunk: ".join(text_input)
+    prompt = f"""ROLE: You are a Highly Experienced Image Prompt Sythesizer
+    TASK: Generate {chunks_count} image prompts, Each per chunk\n\n {chunks}"""
+    result = extractor.extract(prompt)
+    return result.model_dump()  # returns dictionary version pydantic model
+
+
 
 
 
@@ -126,11 +157,124 @@ def tmp_folder(folder_name: str) -> str:
     return folder_path
 
 
+
+def old_generate_video(audio_file, images, segments):
+    print(f"images: {images}")
+    print(f"segments: {segments}")
+    print(f"audio file: {audio_file.name}")
+    try:
+        # Save the uploaded audio file to a temporary location
+        file_extension = os.path.splitext(audio_file.name)[1]
+        temp_audio_path = tempfile.NamedTemporaryFile(delete=False, suffix=f"{file_extension}")
+        temp_audio_path.write(audio_file.read())
+        temp_audio_path.close()
+
+        # Load the audio file using MoviePy
+        audio = mp.AudioFileClip(temp_audio_path.name)
+        audio_duration = audio.duration
+
+        # Create video clips for each segment using the corresponding image
+        video_clips = []
+        for i, segment in enumerate(segments):
+            start_time = segment["start"]
+            end_time = segment["end"]
+
+            # Ensure the image index is within bounds
+            image_path = images[min(i, len(images) - 1)]
+
+            # Create an ImageClip for the current segment
+            image_clip = ImageClip(image_path, duration=end_time - start_time)
+            image_clip = image_clip.set_start(start_time).set_end(end_time)
+            video_clips.append(image_clip)
+
+        # Concatenate all the image clips to form the video
+        video = mp.concatenate_videoclips(video_clips, method="compose")
+
+        # Add the audio to the video
+        video = video.set_audio(audio)
+
+        # Save the video to a temporary file
+        temp_dir = tempfile.gettempdir()
+        video_path = os.path.join(temp_dir, "generated_video.mp4")
+        video.write_videofile(video_path, fps=24, codec="libx264", audio_codec="aac")
+
+        # Clean up the temporary audio file
+        os.remove(temp_audio_path.name)
+
+        return video_path
+
+    except Exception as e:
+        print(f"Error generating video: {e}")
+        return
+
+
+from moviepy.editor import *
+
+def generate_video(audio_file, images, segments):
+    print(f"images: {images}")
+    print(f"segments: {segments}")
+    print(f"audio file: {audio_file.name}")
+    try:
+        # Save the uploaded audio file to a temporary location
+        file_extension = os.path.splitext(audio_file.name)[1]
+        temp_audio_path = tempfile.NamedTemporaryFile(delete=False, suffix=f"{file_extension}")
+        temp_audio_path.write(audio_file.read())
+        temp_audio_path.close()
+
+        # Load the audio file using MoviePy
+        audio = AudioFileClip(temp_audio_path.name)
+        audio_duration = audio.duration
+
+        # Define YouTube-like dimensions (16:9 aspect ratio, e.g., 1920x1080)
+        frame_width = 1920
+        frame_height = 1080
+
+        # Create video clips for each segment using the corresponding image
+        video_clips = []
+        for i, segment in enumerate(segments):
+            start_time = segment["start"]
+            end_time = segment["end"]
+
+            # Ensure the image index is within bounds
+            image_path = images[min(i, len(images) - 1)]
+
+            # Create an ImageClip for the current segment
+            image_clip = ImageClip(image_path, duration=end_time - start_time)
+
+            # Resize and pad the image to fit a 16:9 aspect ratio
+            image_clip = image_clip.resize(height=frame_height).on_color(
+                size=(frame_width, frame_height),
+                color=(0, 0, 0),  # Black background
+                pos="center"  # Center the image
+            )
+
+            # Set the timing of the clip
+            image_clip = image_clip.set_start(start_time).set_end(end_time)
+            video_clips.append(image_clip)
+
+        # Concatenate all the image clips to form the video
+        video = concatenate_videoclips(video_clips, method="compose")
+
+        # Add the audio to the video
+        video = video.set_audio(audio)
+
+        # Save the video to a temporary file
+        temp_dir = tempfile.gettempdir()
+        video_path = os.path.join(temp_dir, "generated_video.mp4")
+        video.write_videofile(video_path, fps=24, codec="libx264", audio_codec="aac")
+
+        # Clean up the temporary audio file
+        os.remove(temp_audio_path.name)
+
+        return video_path
+
+    except Exception as e:
+        print(f"Error generating video: {e}")
+        return
 
 
 # Example usage:
 if __name__ == "__main__":
     result = generate_images(["a guy in jungle", "a waterfall","greenery"])
+
+
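Taken together, the new helpers form the pipeline that app.py drives. A rough end-to-end sketch, illustrative only: it assumes a local narration.mp3, valid HF_TOKEN and GROQ_API_KEY values, and hand-written segments in place of the Groq transcription:

# Illustrative pipeline sketch, not part of the commit.
from utils import segments_to_chunks, get_image_prompts, generate_images, generate_video

# Hand-written stand-ins for the Whisper verbose_json segments.
segments = [
    {"start": 0.0, "end": 5.0, "text": "A man walks through a dense jungle."},
    {"start": 5.0, "end": 10.0, "text": "He reaches a waterfall surrounded by greenery."},
]

prompts = get_image_prompts(segments_to_chunks(segments))["image_prompts"]
images = [image_path for _prompt, image_path in generate_images(prompts)]

# generate_video reads audio_file.name and audio_file.read(), so an open file handle works.
with open("narration.mp3", "rb") as audio_file:
    video_path = generate_video(audio_file=audio_file, images=images, segments=segments)

print(video_path)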