beingcognitive committed
Commit 53fe2ef
1 Parent(s): e1d8b7d

streamlit app

Files changed (2)
  1. app.py +141 -0
  2. requirements.txt +5 -0
app.py ADDED
@@ -0,0 +1,141 @@
+ import io
+ import os
+ import uuid
+
+ import streamlit as st
+ from transformers import AutoProcessor, BlipForConditionalGeneration, pipeline, AutoModelForCausalLM, AutoTokenizer
+ from PIL import Image as PILImage
+ import scipy.io.wavfile as wavfile
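+
+ # Flow: image -> caption (BLIP) -> refined prompt (Phi-3.5-mini-instruct) -> music (MusicGen)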
+
+ # Set page config at the very beginning
+ st.set_page_config(page_title="Image to Music", layout="wide")
+
+ # Load models outside of functions; st.cache_resource keeps them in memory across reruns
+ @st.cache_resource
+ def load_models():
+     model_id = "Salesforce/blip-image-captioning-large"
+     processor = AutoProcessor.from_pretrained(model_id)
+     blip_model = BlipForConditionalGeneration.from_pretrained(model_id)
+     synthesiser = pipeline("text-to-audio", model="facebook/musicgen-small")
+     phi_model = AutoModelForCausalLM.from_pretrained(
+         "microsoft/Phi-3.5-mini-instruct",
+         device_map="auto",
+         torch_dtype="auto",
+         trust_remote_code=True,
+     )
+     phi_tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3.5-mini-instruct")
+     return processor, blip_model, synthesiser, phi_model, phi_tokenizer
+
+ processor, blip_model, synthesiser, phi_model, phi_tokenizer = load_models()
+
+ @st.cache_data
+ def image_to_text(image_bytes: bytes) -> str:
+     # Take raw bytes (hashable) so st.cache_data keys the cache on the image
+     # content; an unhashable PIL argument would have to be underscore-prefixed
+     # and excluded from the cache key, returning stale captions.
+     try:
+         image = PILImage.open(io.BytesIO(image_bytes))
+
+         # Prepare the image for the model
+         inputs = processor(images=image, return_tensors="pt")
+
+         # Generate caption
+         output = blip_model.generate(**inputs, max_new_tokens=100)
+
+         # Decode the output
+         caption = processor.decode(output[0], skip_special_tokens=True)
+
+         return caption
+     except Exception as e:
+         return f"Error in image_to_text: {str(e)}"
+
+ @st.cache_data
+ def refine_prompt(caption: str) -> str:
+     try:
+         messages = [
+             {"role": "system", "content": "You are a helpful AI assistant for generating music prompts."},
+             {"role": "user", "content": f"Generate a detailed music prompt based on this scene: {caption}. Consider elements like tempo, instrumentation, genre, and emotions."},
+         ]
+         pipe = pipeline(
+             "text-generation",
+             model=phi_model,
+             tokenizer=phi_tokenizer,
+         )
+         generation_args = {
+             "max_new_tokens": 500,
+             "return_full_text": False,
+             "temperature": 0.7,
+             "do_sample": True,
+         }
+         # With return_full_text=False the pipeline returns only the assistant
+         # reply: [{"generated_text": "..."}]
+         output = pipe(messages, **generation_args)
+         refined_prompt = output[0]["generated_text"]
+         return refined_prompt
+     except Exception as e:
+         return f"Error in refine_prompt: {str(e)}"
+
+ def text_to_music(response: str) -> str:
+     try:
+         music = synthesiser(response, forward_params={"do_sample": True})
+         # Unique filename so concurrent sessions don't overwrite each other's output
+         output_path = f"musicgen_out_{uuid.uuid4()}.wav"
+         wavfile.write(output_path, rate=music["sampling_rate"], data=music["audio"])
+         return output_path
+     except Exception as e:
+         return f"Error in text_to_music: {str(e)}"
+
+ def cleanup_old_files():
+     # Delete generated WAV files left over from earlier runs
+     for file in os.listdir():
+         if file.startswith("musicgen_out_") and file.endswith(".wav"):
+             os.remove(file)
+
+ def main():
+     st.title("Image to Music")
+     st.write("""
+ Generate music inspired by an image.
+
+ This app turns the inspiration drawn from an image into music by chaining several AI models.
+
+ ## How It Works
+
+ 1. **Image to Text Description**
+    - Use Salesforce BLIP to convert the image into a caption.
+ 2. **Text to Refined Music Prompt**
+    - Use Microsoft Phi-3.5-mini-instruct to generate a detailed music prompt based on the caption.
+ 3. **Music Prompt to Music**
+    - Use Facebook MusicGen to generate music from the refined prompt.
+
+ ## Steps
+
+ 1. **Image -> [ Salesforce BLIP ] -> Caption**
+ 2. **Caption -> [ Microsoft Phi-3.5-mini ] -> Refined Music Prompt**
+ 3. **Refined Music Prompt -> [ Facebook MusicGen ] -> Music**
+
+ Let's turn your visual inspirations into beautiful melodies!
+
+ **Please note:** the music generation process may take several minutes to complete,
+ because large AI models are working behind the scenes to create unique music from your image.
+ Thank you for your patience!
+ """)
+
+     uploaded_file = st.file_uploader("Choose an image...", type=["jpg", "jpeg", "png"])
+
+     if uploaded_file is not None:
+         image = PILImage.open(uploaded_file)
+         st.image(image, caption="Uploaded Image", use_column_width=True)
+
+         if st.button("Generate Music"):
+             # Clear WAV files from previous runs before generating a new one,
+             # so the file handed to st.audio below is never deleted
+             cleanup_old_files()
+
+             with st.spinner("Processing image..."):
+                 # Pass raw bytes so the cached caption is keyed on the image content
+                 caption = image_to_text(uploaded_file.getvalue())
+             st.text_area("Generated Caption", caption, height=100)
+
+             with st.spinner("Refining music prompt..."):
+                 refined_prompt = refine_prompt(caption)
+             st.text_area("Refined Music Prompt", refined_prompt, height=150)
+
+             with st.spinner("Generating music..."):
+                 music_file = text_to_music(refined_prompt)
+
+             # text_to_music returns an error message rather than a path on failure
+             if music_file.endswith(".wav"):
+                 st.audio(music_file)
+             else:
+                 st.error(music_file)
+
+ if __name__ == "__main__":
+     main()
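
For reference, the same caption -> music chain can be exercised outside Streamlit. A minimal sketch, assuming the same model ids as app.py and a hypothetical local image `scene.jpg`; the Phi-3.5 refinement step is skipped and the caption is fed to MusicGen directly:

```python
# Standalone sketch of the caption -> music chain (no Streamlit, no Phi step).
# "scene.jpg" is a hypothetical input path.
from transformers import AutoProcessor, BlipForConditionalGeneration, pipeline
from PIL import Image
import scipy.io.wavfile as wavfile

model_id = "Salesforce/blip-image-captioning-large"
processor = AutoProcessor.from_pretrained(model_id)
blip = BlipForConditionalGeneration.from_pretrained(model_id)

image = Image.open("scene.jpg")
inputs = processor(images=image, return_tensors="pt")
caption = processor.decode(
    blip.generate(**inputs, max_new_tokens=100)[0], skip_special_tokens=True
)
print("Caption:", caption)

# The text-to-audio pipeline returns a dict with "audio" and "sampling_rate"
synthesiser = pipeline("text-to-audio", model="facebook/musicgen-small")
music = synthesiser(f"Music inspired by this scene: {caption}",
                    forward_params={"do_sample": True})
wavfile.write("musicgen_out.wav", rate=music["sampling_rate"], data=music["audio"])
```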
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ scipy
+ torch
+ torchvision
+ transformers
+ accelerate