fffiloni committed on
Commit
3740288
·
verified ·
1 Parent(s): b20b9d1

update APIs

Browse files
Files changed (1) hide show
  1. app.py +42 -49
app.py CHANGED
@@ -23,46 +23,29 @@ def extract_audio(video_in):
23
  return 'audio.wav'
24
 
25
  def get_caption_from_kosmos(image_in):
26
- kosmos2_client = Client("https://ydshieh-kosmos-2.hf.space/")
27
-
28
  kosmos2_result = kosmos2_client.predict(
29
- image_in, # str (filepath or URL to image) in 'Test Image' Image component
30
- "Detailed", # str in 'Description Type' Radio component
31
- fn_index=4
32
  )
33
-
34
  print(f"KOSMOS2 RETURNS: {kosmos2_result}")
35
 
36
- with open(kosmos2_result[1], 'r') as f:
37
- data = json.load(f)
38
-
39
- reconstructed_sentence = []
40
- for sublist in data:
41
- reconstructed_sentence.append(sublist[0])
42
-
43
- full_sentence = ' '.join(reconstructed_sentence)
44
- #print(full_sentence)
45
-
46
- # Find the pattern matching the expected format ("Describe this image in detail:" followed by optional space and then the rest)...
47
- pattern = r'^Describe this image in detail:\s*(.*)$'
48
- # Apply the regex pattern to extract the description text.
49
- match = re.search(pattern, full_sentence)
50
- if match:
51
- description = match.group(1)
52
- print(description)
53
- else:
54
- print("Unable to locate valid description.")
55
 
56
  # Find the last occurrence of "."
57
- last_period_index = description.rfind('.')
58
 
59
  # Truncate the string up to the last period
60
- truncated_caption = description[:last_period_index + 1]
61
 
62
  # print(truncated_caption)
63
- print(f"\n—\nIMAGE CAPTION: {truncated_caption}")
64
 
65
- return truncated_caption
66
 
67
  def get_caption(image_in):
68
  client = Client("fffiloni/moondream1", hf_token=hf_token)
@@ -101,19 +84,20 @@ def get_magnet(prompt):
101
 
102
  def get_audioldm(prompt):
103
  try:
104
- client = Client("https://haoheliu-audioldm2-text2audio-text2music.hf.space/")
 
105
  result = client.predict(
106
- prompt, # str in 'Input text' Textbox component
107
- "Low quality. Music.", # str in 'Negative prompt' Textbox component
108
- 10, # int | float (numeric value between 5 and 15) in 'Duration (seconds)' Slider component
109
- 3.5, # int | float (numeric value between 0 and 7) in 'Guidance scale' Slider component
110
- 45, # int | float in 'Seed' Number component
111
- 3, # int | float (numeric value between 1 and 5) in 'Number waveforms to generate' Slider component
112
- fn_index=1
113
  )
114
  print(result)
115
- audio_result = extract_audio(result)
116
- return audio_result
117
  except:
118
  raise gr.Error("AudioLDM space API is not ready, please try again in few minutes ")
119
 
@@ -133,10 +117,10 @@ def get_tango(prompt):
133
  try:
134
  client = Client("fffiloni/tango")
135
  result = client.predict(
136
- prompt, # str representing string value in 'Prompt' Textbox component
137
- 100, # int | float representing numeric value between 100 and 200 in 'Steps' Slider component
138
- 4, # int | float representing numeric value between 1 and 10 in 'Guidance Scale' Slider component
139
- api_name="/predict"
140
  )
141
  print(result)
142
  return result
@@ -149,10 +133,11 @@ def get_tango2(prompt):
149
  try:
150
  client = Client("declare-lab/tango2")
151
  result = client.predict(
152
- prompt,
153
- 100,
154
- 4,
155
- api_name="/predict"
 
156
  )
157
  print(result)
158
  return result
@@ -196,7 +181,7 @@ def get_ezaudio(prompt):
196
  raise gr.Error("EzAudio space API is not ready, please try again in few minutes ")
197
 
198
  def infer(image_in, chosen_model):
199
- caption = get_caption(image_in)
200
  if chosen_model == "MAGNet" :
201
  magnet_result = get_magnet(caption)
202
  return magnet_result
@@ -240,7 +225,15 @@ with gr.Blocks(css=css) as demo:
240
  with gr.Column():
241
  image_in = gr.Image(sources=["upload"], type="filepath", label="Image input", value="oiseau.png")
242
  with gr.Row():
243
- chosen_model = gr.Dropdown(label="Choose a model", choices=["MAGNet", "AudioLDM-2", "AudioGen", "Tango", "Tango 2", "Stable Audio Open", "EzAudio"], value="AudioLDM-2")
 
 
 
 
 
 
 
 
244
  submit_btn = gr.Button("Submit")
245
  with gr.Column():
246
  audio_o = gr.Audio(label="Audio output")
 
23
  return 'audio.wav'
24
 
25
  def get_caption_from_kosmos(image_in):
26
+ kosmos2_client = Client("fffiloni/Kosmos-2-API", hf_token=hf_token)
 
27
  kosmos2_result = kosmos2_client.predict(
28
+ image_input=handle_file(image_in),
29
+ text_input="Detailed",
30
+ api_name="/generate_predictions"
31
  )
 
32
  print(f"KOSMOS2 RETURNS: {kosmos2_result}")
33
 
34
+ data = kosmos2_result[1]
35
+
36
+ # Extract and combine tokens starting from the second element
37
+ sentence = ''.join(item['token'] for item in data[1:])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
  # Find the last occurrence of "."
40
+ #last_period_index = full_sentence.rfind('.')
41
 
42
  # Truncate the string up to the last period
43
+ #truncated_caption = full_sentence[:last_period_index + 1]
44
 
45
  # print(truncated_caption)
46
+ #print(f"\n—\nIMAGE CAPTION: {truncated_caption}")
47
 
48
+ return sentence
49
 
50
  def get_caption(image_in):
51
  client = Client("fffiloni/moondream1", hf_token=hf_token)
 
84
 
85
  def get_audioldm(prompt):
86
  try:
87
+ client = Client("fffiloni/audioldm2-text2audio-text2music-API", hf_token=hf_token)
88
+ seed = random.randint(0, MAX_SEED)
89
  result = client.predict(
90
+ text=prompt, # str in 'Input text' Textbox component
91
+ negative_prompt="Low quality. Music.", # str in 'Negative prompt' Textbox component
92
+ duration=10, # int | float (numeric value between 5 and 15) in 'Duration (seconds)' Slider component
93
+ guidance_scale=6.5, # int | float (numeric value between 0 and 7) in 'Guidance scale' Slider component
94
+ random_seed=seed, # int | float in 'Seed' Number component
95
+ n_candidates=3, # int | float (numeric value between 1 and 5) in 'Number waveforms to generate' Slider component
96
+ api_name="/text2audio"
97
  )
98
  print(result)
99
+
100
+ return result
101
  except:
102
  raise gr.Error("AudioLDM space API is not ready, please try again in few minutes ")
103
 
 
117
  try:
118
  client = Client("fffiloni/tango")
119
  result = client.predict(
120
+ prompt=prompt,
121
+ steps=100,
122
+ guidance=3,
123
+ api_name="/predict"
124
  )
125
  print(result)
126
  return result
 
133
  try:
134
  client = Client("declare-lab/tango2")
135
  result = client.predict(
136
+ prompt=prompt,
137
+ output_format="wav",
138
+ steps=100,
139
+ guidance=3,
140
+ api_name="/predict"
141
  )
142
  print(result)
143
  return result
 
181
  raise gr.Error("EzAudio space API is not ready, please try again in few minutes ")
182
 
183
  def infer(image_in, chosen_model):
184
+ caption = get_caption_from_kosmos(image_in)
185
  if chosen_model == "MAGNet" :
186
  magnet_result = get_magnet(caption)
187
  return magnet_result
 
225
  with gr.Column():
226
  image_in = gr.Image(sources=["upload"], type="filepath", label="Image input", value="oiseau.png")
227
  with gr.Row():
228
+ chosen_model = gr.Dropdown(label="Choose a model", choices=[
229
+ #"MAGNet",
230
+ "AudioLDM-2",
231
+ #"AudioGen",
232
+ "Tango",
233
+ "Tango 2",
234
+ "Stable Audio Open",
235
+ "EzAudio"
236
+ ], value="AudioLDM-2")
237
  submit_btn = gr.Button("Submit")
238
  with gr.Column():
239
  audio_o = gr.Audio(label="Audio output")