# speech2speech / app.py
# hu-po: release 0.3 (commit a86e62a)
import asyncio
import html
import logging
import os
import random
from typing import Dict, List, Tuple

import gradio as gr
import yaml

from src.elevenlabs import (Speaker, check_voice_exists, get_make_voice,
                            play_history, save_history, set_elevenlabs_key)
from src.openailib import top_response, speech_to_text, set_openai_key
from src.tube import extract_audio
# Configure the root logger at import time so the INFO-level progress
# messages emitted throughout this module are shown.
logging.basicConfig(level=logging.INFO)
# Module-level logger shared by the state class and every callback below.
log = logging.getLogger(__name__)
class ConversationState:
    """State of one generated conversation: characters, settings and history.

    Construction reads the character definitions from ``YAML_FILEPATH``,
    creates a ``Speaker`` for every selected character whose voice exists,
    and assembles the system prompt that conditions the language model.
    """

    # Bubble background colors, assigned to speakers round-robin.
    COLORS: list = ['#FFA07A', '#F08080', '#AFEEEE', '#B0E0E6', '#DDA0DD',
                    '#FFFFE0', '#F0E68C', '#90EE90', '#87CEFA', '#FFB6C1']
    # Character definitions live next to this file.
    YAML_FILEPATH: str = os.path.join(os.path.dirname(__file__), 'voices.yaml')
    # Directory that receives exported/extracted audio.
    AUDIO_SAVEDIR: str = os.path.join(
        os.path.dirname(__file__), 'audio_export')

    def __init__(self,
                 names: list = None,
                 iam: str = None,
                 model: str = "gpt-3.5-turbo",
                 max_tokens: int = 30,
                 temperature: float = 0.5,
                 history: list = None):
        """Build a fresh conversation.

        Args:
            names: Character names taking part. When empty or None, up to two
                distinct characters are picked at random from the YAML file.
            iam: Name of the character the user speaks as. Defaults to a
                random member of ``names``.
            model: Model name passed on when generating responses.
            max_tokens: Token limit passed on when generating responses.
            temperature: Sampling temperature passed on when generating
                responses.
            history: Optional initial list of ``(speaker, text)`` tuples. It
                is copied, so the caller's list is never mutated.

        Raises:
            AssertionError: If ``iam`` is not one of ``names``.
        """
        self.model = model
        self.max_tokens = max_tokens
        self.temperature = temperature

        # Make sure save dir exists, make any necessary directories
        os.makedirs(self.AUDIO_SAVEDIR, exist_ok=True)
        self.audio_savepath = os.path.join(
            self.AUDIO_SAVEDIR, 'conversation.wav')

        log.info("Resetting conversation")
        # Read the file once: the raw text is kept for the YAML editor in the
        # UI, and the parsed mapping is derived from that same text.
        with open(self.YAML_FILEPATH, 'r') as file:
            self.characters_yaml = file.read()
        self.characters_dict = yaml.safe_load(self.characters_yaml)
        self.all_characters = list(self.characters_dict)

        # random.sample draws without replacement, so the same character can
        # never be selected twice (random.choices could return duplicates).
        self.names = names or random.sample(
            self.all_characters, k=min(2, len(self.all_characters)))
        self.iam = iam or random.choice(self.names)
        assert self.iam in self.names, f"{self.iam} not in {self.names}"

        log.info("Loading voices")
        self.speakers: Dict[str, Speaker] = {}
        self.speakers_descriptions: str = ''
        for i, name in enumerate(self.names):
            if check_voice_exists(name) is None:
                # Characters without a voice are left out of self.speakers.
                log.warning(f"Voice {name} does not exist")
                continue
            _speaker = Speaker(
                name=name,
                voice=get_make_voice(name),
                color=self.COLORS[i % len(self.COLORS)],
                description=self.characters_dict[name].get(
                    "description", None),
            )
            self.speakers[name] = _speaker
            if _speaker.description is not None:
                self.speakers_descriptions += f"{_speaker.name}: {_speaker.description}.\n"

        # System is fed into OpenAI to condition the prompt
        self.system = "You create funny conversation dialogues."
        self.system += f"This conversation is between {', '.join(self.names)}."
        self.system += "Do not introduce new characters."
        self.system += "Descriptions for each of the characters are:\n"
        for speaker in self.speakers.values():
            # Skip characters without a description instead of telling the
            # model that their description is the literal text "None".
            if speaker.description is not None:
                self.system += f"{speaker.name}: {speaker.description}\n"
        self.system += "Only return one person's response at a time."
        self.system += "Each response must start with the character name, then a colon, then their response in a single line."
        self.system += "Keep the responses short and witty."
        self.system += "Make sure the responses are only one sentence long."
        self.system += "Do not continue a previous response. Always start a new response."

        # History is fed in at every step
        self.step = 0
        # Always define self.history; previously a caller-supplied history
        # left the attribute unset and later calls raised AttributeError.
        self.history: List[Tuple[Speaker, str]] = (
            [] if history is None else list(history))

    def add_to_history(self, text: str, speaker: "Speaker" = None):
        """Append one line to the conversation.

        Args:
            text: What was said.
            speaker: Who said it. Defaults to the speaker registered under
                ``self.iam``.

        Raises:
            KeyError: If ``speaker`` is omitted and ``self.iam`` has no entry
                in ``self.speakers`` (its voice was skipped at construction).
        """
        if speaker is None:
            speaker = self.speakers[self.iam]
        self.history.append((speaker, text))

    def history_to_prompt(self) -> str:
        """Return the history as ``name:text`` lines, one per entry."""
        return ''.join(
            f"{speaker.name}:{text}\n" for speaker, text in self.history)

    def html_history(self) -> str:
        """Return the history as HTML, one colored bubble per entry.

        Speaker names and texts are HTML-escaped because they originate from
        transcribed speech and model output and must not be rendered as
        markup.
        """
        bubbles = []
        for speaker, text in self.history:
            bubbles.append(
                f"<div style='background-color: {speaker.color}; border-radius: 5px; padding: 5px; margin: 5px;'>"
                f"{html.escape(speaker.name)}: {html.escape(text)}</div>")
        return ''.join(bubbles)
# Conversation state shared by every Gradio callback below. Keeping it in a
# module-level global is a deliberate shortcut: it keeps the callbacks simple
# at the cost of one state object for the whole process.
# NOTE: constructing it here runs at import time, which reads voices.yaml and
# looks up the voices of the selected characters.
STATE = ConversationState()
def reset(names, iam, model, max_tokens, temperature):
    """Replace the global conversation with a new one built from the UI values.

    Returns the HTML rendering of the new conversation's history.
    """
    global STATE
    # Build the replacement first, then publish it to the global scope.
    fresh_state = ConversationState(
        names=names,
        iam=iam,
        model=model,
        max_tokens=max_tokens,
        temperature=temperature,
    )
    STATE = fresh_state
    return fresh_state.html_history()
def step_mic(audio):
    """Transcribe a recording and add it to the conversation as the user.

    A TypeError raised while transcribing or appending is logged as a warning
    and the history is left as it was (presumably this covers the microphone
    widget being cleared -- verify against speech_to_text). The current
    history is returned as HTML either way.
    """
    try:
        STATE.add_to_history(speech_to_text(audio))
    except TypeError as err:
        log.warning(err)
    return STATE.html_history()
def step_continue():
    """Ask the model for the next line(s) and append the valid ones.

    Each non-empty line of the response must look like ``name: text`` where
    ``name`` is one of the loaded speakers. Lines that do not match are
    logged as warnings and skipped. Returns the updated history as HTML.
    """
    response = top_response(STATE.history_to_prompt(),
                            system=STATE.system,
                            model=STATE.model,
                            max_tokens=STATE.max_tokens,
                            temperature=STATE.temperature,
                            )
    for line in response.splitlines():
        # TODO: Add any further filters here as warn-and-skip checks
        if not line:
            continue
        # Split on the first colon only: the spoken text may itself contain
        # colons (e.g. "It's 5:30"), which previously raised ValueError.
        name, colon, text = line.partition(":")
        if not colon:
            log.warning(f"Line {line} does not have a colon")
            continue
        name = name.strip()
        # Look the name up where it is actually used: a character can be
        # listed in the YAML file yet have no loaded speaker.
        speaker = STATE.speakers.get(name)
        if speaker is None:
            log.warning(f"Name {name} is not in {list(STATE.speakers)}")
            continue
        if not text.strip():
            log.warning(f"Text {text} is empty")
            continue
        STATE.add_to_history(text, speaker=speaker)
    return STATE.html_history()
def save_audio():
    """Pass the conversation history to save_history and return the file path.

    The path is the state's ``audio_savepath``; the coroutine is run to
    completion before returning.
    """
    log.info("Saving audio")
    export_job = save_history(STATE.history, STATE.audio_savepath)
    asyncio.run(export_job)
    return STATE.audio_savepath
def play_audio():
    """Pass the conversation history to play_history, then return it as HTML.

    The coroutine is run to completion before returning.
    """
    log.info("Playing audio")
    playback_job = play_history(STATE.history)
    asyncio.run(playback_job)
    return STATE.html_history()
def make_voices(voices_yaml: str) -> str:
    """Create voices for every character in the YAML text that lacks one.

    Args:
        voices_yaml: YAML mapping of character name to metadata. Each
            character needs a ``references`` list whose items are mappings
            with a ``url`` and optional ``start_minute`` (default 0) and
            ``duration_seconds`` (default 120).

    Returns:
        The string ``"Success"`` once all missing voices have been handled.

    Raises:
        ValueError: If the YAML does not have the structure described above.
        Errors from YAML parsing, audio extraction or voice creation
        propagate unchanged.
    """
    characters = yaml.safe_load(voices_yaml)
    if not isinstance(characters, dict):
        raise ValueError(
            "Character YAML must map character names to their metadata")

    # Check the top-level structure before touching the shared state, so a
    # malformed document cannot replace a working configuration.
    for name, metadata in characters.items():
        if not isinstance(name, str):
            raise ValueError(f"Name {name} is not a string")
        if not isinstance(metadata, dict) or 'references' not in metadata:
            raise ValueError(f"Character {name} does not have references")
        if not isinstance(metadata['references'], list):
            raise ValueError(
                f"Videos {metadata['references']} is not a list")
    STATE.characters_dict = characters

    for name, metadata in characters.items():
        if check_voice_exists(name):
            # Voice already exists; nothing to extract or create.
            continue
        audio_paths = []
        for i, video in enumerate(metadata['references']):
            if not isinstance(video, dict):
                raise ValueError(f"Video {video} is not a dict")
            if 'url' not in video:
                raise ValueError(f"Video {video} does not have a url")
            url = video['url']
            start_minute = video.get('start_minute', 0)
            duration = video.get('duration_seconds', 120)
            # One audio clip per reference video, labelled by character/index.
            label = os.path.join(STATE.AUDIO_SAVEDIR, f"audio.{name}.{i}")
            output_path = extract_audio(url, label, start_minute, duration)
            audio_paths.append(output_path)
        get_make_voice(name, audio_paths)
    return "Success"
# Define the main GradIO UI
# NOTE(review): the source had lost its indentation, so the nesting of the
# layout containers below is reconstructed -- confirm against the rendered UI.
with gr.Blocks() as demo:
    gr.HTML('''
<center>
<h1>Speech2Speech</h1>
Make a private copy of this space to paste your API keys.
<br>
<a href="https://huggingface.co./spaces/hu-po/speech2speech?duplicate=true"><img src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
</center>''')
    # API key boxes shown at the top of the page. They get their own names:
    # previously they shared names with the boxes in the Settings accordion,
    # were overwritten by them, and therefore never had a handler attached.
    with gr.Row():
        gr_header_openai_key = gr.Textbox(
            placeholder="Paste your OpenAI API key here",
            show_label=False,
            lines=1,
            type="password",
        )
        gr_header_elevenlabs_key = gr.Textbox(
            placeholder="Paste your ElevenLabs API key here",
            show_label=False,
            lines=1,
            type="password",
        )
    with gr.Tab("Conversation"):
        gr_convo_output = gr.HTML()
        with gr.Row():
            with gr.Column():
                gr_mic = gr.Audio(
                    label="Record audio into conversation",
                    source="microphone",
                    type="filepath",
                )
                gr_add_button = gr.Button(value="Add to conversation")
                gr_playaudio_button = gr.Button(value="Play audio")
                gr_saveaudio_button = gr.Button(value="Export audio")
                gr_outputaudio = gr.Audio(
                    label="Audio output",
                    source="upload",
                    type="filepath",
                )
            with gr.Column():
                gr_iam = gr.Dropdown(
                    choices=STATE.all_characters, label="I am", value=STATE.iam)
                gr_chars = gr.CheckboxGroup(
                    STATE.all_characters, label="Characters", value=STATE.names)
                gr_reset_button = gr.Button(value="Reset conversation")
                with gr.Accordion("Settings", open=False):
                    openai_api_key_textbox = gr.Textbox(
                        placeholder="Paste your OpenAI API key here",
                        show_label=False,
                        lines=1,
                        type="password",
                    )
                    elevenlabs_api_key_textbox = gr.Textbox(
                        placeholder="Paste your ElevenLabs API key here",
                        show_label=False,
                        lines=1,
                        type="password",
                    )
                    gr_model = gr.Dropdown(choices=["gpt-3.5-turbo", "gpt-4"],
                                           label='GPT Model behind conversation', value=STATE.model)
                    gr_max_tokens = gr.Slider(minimum=1, maximum=500, value=STATE.max_tokens,
                                              label="Max tokens", step=1)
                    gr_temperature = gr.Slider(
                        minimum=0.0, maximum=1.0, value=STATE.temperature, label="Temperature (randomness in conversation)")
    with gr.Tab("New Characters"):
        gr_make_voice_button = gr.Button(value="Update Characters")
        gr_voice_data = gr.Textbox(
            lines=25, label="Character YAML config", value=STATE.characters_yaml)
        gr_make_voice_output = gr.Textbox(
            lines=2, label="Character creation logs...")
    gr.HTML('''<center>
Created by <a href="https://youtube.com/@hu-po">Hu Po</a> GitHub: <a href="https://github.com/hu-po/speech2speech">speech2speech</a>
</center>
''')

    # Buttons and actions
    gr_mic.change(step_mic, gr_mic, gr_convo_output)
    # Both the header boxes and the Settings boxes update the API keys.
    gr_header_openai_key.change(set_openai_key, gr_header_openai_key, None)
    openai_api_key_textbox.change(set_openai_key, openai_api_key_textbox, None)
    gr_header_elevenlabs_key.change(
        set_elevenlabs_key, gr_header_elevenlabs_key, None)
    elevenlabs_api_key_textbox.change(
        set_elevenlabs_key, elevenlabs_api_key_textbox, None)
    gr_add_button.click(step_continue, None, gr_convo_output)
    gr_reset_button.click(
        reset,
        inputs=[gr_chars, gr_iam, gr_model, gr_max_tokens, gr_temperature],
        outputs=[gr_convo_output],
    )
    gr_saveaudio_button.click(save_audio, None, gr_outputaudio)
    gr_playaudio_button.click(play_audio, None, None)
    gr_make_voice_button.click(
        make_voices, inputs=gr_voice_data, outputs=gr_make_voice_output,
    )

# Launch the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()