Skip to content

Commit

Permalink
add elevenlabs
Browse files Browse the repository at this point in the history
  • Loading branch information
bigsk1 committed Jun 22, 2024
1 parent c0635f6 commit 95e2c24
Show file tree
Hide file tree
Showing 11 changed files with 295 additions and 47 deletions.
7 changes: 6 additions & 1 deletion .env.sample
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ CHARACTER_NAME=wizard


# Text-to-Speech Provider - Options: xtts (local uses the custom character .wav) or openai (uses OpenAI TTS voice) - once set if run webui can't change in ui until you stop server and restart
# openai or xtts
# openai or xtts or elevenlabs
TTS_PROVIDER=xtts

# OpenAI TTS Voice - When TTS_PROVIDER is set to openai above, it will use the chosen voice.
Expand All @@ -23,6 +23,11 @@ OPENAI_BASE_URL=https://api.openai.com/v1/chat/completions
OPENAI_TTS_URL=https://api.openai.com/v1/audio/speech
OLLAMA_BASE_URL=http://localhost:11434

ELEVENLABS_API_KEY=49b111111111

# Default voice ID
ELEVENLABS_TTS_VOICE=VgPpppppppp

# Models to use - llama3 works well for local usage.
# OPTIONAL: For screen analysis, if MODEL_PROVIDER is ollama, llava will be used by default.
# Ensure you have llava downloaded with Ollama. If OpenAI is used, gpt-4o works well.
Expand Down
8 changes: 7 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -210,4 +210,10 @@ downloads/
XTTS-v2
checkpoints
XTTS-v2/*
checkpoints/*
checkpoints/*

temp_output.wav
temp_output.mp3

#elevenlabs_voices.json
elevenlabs_voices.json.bak
36 changes: 34 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ Voice Chat AI is a project that allows you to interact with different AI charact
## Features

- **Supports both OpenAI and Ollama language models**: Choose the model that best fits your needs.
- **Provides text-to-speech synthesis using XTTS or OpenAI TTS**: Enjoy natural and expressive voices.
- **Provides text-to-speech synthesis using XTTS, OpenAI TTS, or ElevenLabs**: Enjoy natural and expressive voices.
- **No typing needed, just speak**: Hands-free interaction makes conversations smooth and effortless.
- **Analyzes user mood and adjusts AI responses accordingly**: Get personalized responses based on your mood.
- **You can, just by speaking, have the AI analyze your screen and chat about it**: Seamlessly integrate visual context into your conversations.
Expand All @@ -24,6 +24,8 @@ Voice Chat AI is a project that allows you to interact with different AI charact

- Python 3.10
- CUDA-enabled GPU
- Ollama models or OpenAI API for chat
- XTTS, OpenAI API, or ElevenLabs API for speech
- Microsoft C++ Build Tools on windows
- Microphone
- A sense of humor
Expand Down Expand Up @@ -154,7 +156,7 @@ MODEL_PROVIDER=ollama
# Character to use - Options: samantha, wizard, pirate, valleygirl, newscaster1920s, alien_scientist, cyberpunk, detective
CHARACTER_NAME=wizard
# Text-to-Speech Provider - Options: xtts (local uses the custom character .wav) or openai (uses OpenAI TTS voice)
# Text-to-Speech Provider - Options: xtts (local uses the custom character .wav) or openai (uses OpenAI TTS voice) or elevenlabs (add voice names and id's to elevenlabs_voices.json)
TTS_PROVIDER=xtts
# OpenAI TTS Voice - When TTS_PROVIDER is set to openai above, it will use the chosen voice.
Expand All @@ -170,6 +172,11 @@ OLLAMA_BASE_URL=http://localhost:11434
# OpenAI API Key for models and speech (replace with your actual API key)
OPENAI_API_KEY=sk-proj-1111111111
ELEVENLABS_API_KEY=49b111111111
# Default voice ID
ELEVENLABS_TTS_VOICE=VgPpppppppp
# Models to use - llama3 works well for local usage.
# OPTIONAL: For screen analysis, if MODEL_PROVIDER is ollama, llava will be used by default.
# Ensure you have llava downloaded with Ollama. If OpenAI is used, gpt-4o works well.
Expand Down Expand Up @@ -215,6 +222,31 @@ python cli.py
"screenshot" to have the AI explain what it is seeing in detail.
- To stop the conversation, say "Quit", "Exit", or "Leave". ( ctl+c always works also)

### ElevenLabs

Add voice names and voice IDs — in the web UI you can select them from the dropdown menu.

```json
{
"voices": [
{
"id": "8qUUChaaaaaaaaa",
"name": "Joe - cool, calm, deep"
},
{
"id": "JqseNaaaaaaaaaa",
"name": "Joanne - pensive, introspective"
},
{
"id": "L5iaaaaaaaaa",
"name": "Victoria - Classy British Mature"
}
]
}
```
For the CLI, the voice ID set in the `.env` file will be used.


## Adding New Characters

1. Create a new folder for the character in the project's characters directory.
Expand Down
103 changes: 96 additions & 7 deletions app/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from pathlib import Path
import re
import io
from pydub import AudioSegment

# Load environment variables
load_dotenv()
Expand All @@ -32,6 +33,10 @@
OPENAI_BASE_URL = os.getenv('OPENAI_BASE_URL')
OLLAMA_MODEL = os.getenv('OLLAMA_MODEL')
OLLAMA_BASE_URL = os.getenv('OLLAMA_BASE_URL')
ELEVENLABS_API_KEY = os.getenv('ELEVENLABS_API_KEY')
ELEVENLABS_TTS_VOICE = os.getenv('ELEVENLABS_TTS_VOICE')
XTTS_SPEED = os.getenv('XTTS_SPEED', '1.1')


# Initialize OpenAI API key
OpenAI.api_key = OPENAI_API_KEY
Expand Down Expand Up @@ -83,15 +88,27 @@ def init_openai_tts_voice(voice_name):
OPENAI_TTS_VOICE = voice_name
print(f"Switched to OpenAI TTS voice: {voice_name}")

def init_elevenlabs_tts_voice(voice_name):
    """Make `voice_name` (an ElevenLabs voice ID) the active TTS voice.

    Updates the module-level ELEVENLABS_TTS_VOICE so subsequent speech
    requests use this voice, and logs the switch.
    """
    global ELEVENLABS_TTS_VOICE
    ELEVENLABS_TTS_VOICE = voice_name
    print("Switched to ElevenLabs TTS voice: " + voice_name)

def init_xtts_speed(speed_value):
    """Record the playback speed used by the local XTTS synthesizer.

    Stores the value in the module-level XTTS_SPEED (consumed as a float
    when synthesizing) and logs the change.
    """
    global XTTS_SPEED
    XTTS_SPEED = speed_value
    print("Switched to XTTS speed: " + str(speed_value))


# Initial model and TTS voice setup.
# The chat-model initializer is chosen by MODEL_PROVIDER; only the OpenAI
# path also primes its TTS voice here, since OpenAI serves both roles.
if MODEL_PROVIDER == "openai":
    init_openai_model(OPENAI_MODEL)
    init_openai_tts_voice(OPENAI_TTS_VOICE)
elif MODEL_PROVIDER == "ollama":
    init_ollama_model(OLLAMA_MODEL)

# ElevenLabs is keyed off TTS_PROVIDER, independent of the chat-model
# provider selected above.
if TTS_PROVIDER == "elevenlabs":
    init_elevenlabs_tts_voice(ELEVENLABS_TTS_VOICE)


# Function to open a file and return its contents as a string
def open_file(filepath):
Expand All @@ -100,6 +117,15 @@ def open_file(filepath):

# Function to play audio using PyAudio
def play_audio(file_path):
file_extension = Path(file_path).suffix.lstrip('.').lower()

temp_wav_path = os.path.join(output_dir, 'temp_output.wav')

if file_extension == 'mp3':
audio = AudioSegment.from_mp3(file_path)
audio.export(temp_wav_path, format="wav")
file_path = temp_wav_path

wf = wave.open(file_path, 'rb')
p = pyaudio.PyAudio()
stream = p.open(format=p.get_format_from_width(wf.getsampwidth()),
Expand Down Expand Up @@ -136,6 +162,15 @@ def process_and_play(prompt, audio_file_pth):
play_audio(output_path)
else:
print("Error: Audio file not found.")
elif TTS_PROVIDER == 'elevenlabs':
output_path = os.path.join(output_dir, 'output.mp3')
elevenlabs_text_to_speech(prompt, output_path)
print(f"Generated audio file at: {output_path}")
if os.path.exists(output_path):
print("Playing generated audio...")
play_audio(output_path)
else:
print("Error: Audio file not found.")
else:
tts_model = xtts_model
try:
Expand All @@ -146,7 +181,7 @@ def process_and_play(prompt, audio_file_pth):
gpt_cond_len=24,
temperature=0.2,
language='en',
speed=1.1
speed=float(XTTS_SPEED)
)
synthesized_audio = outputs['wav']
src_path = os.path.join(output_dir, 'output.wav')
Expand Down Expand Up @@ -211,6 +246,36 @@ def openai_text_to_speech(prompt, output_path):
except requests.HTTPError as e:
print(f"Error during OpenAI TTS: {e}")

def elevenlabs_text_to_speech(text, output_path):
    """Synthesize `text` via the ElevenLabs streaming TTS API into an MP3 file.

    Uses the module-level ELEVENLABS_TTS_VOICE voice ID and
    ELEVENLABS_API_KEY. Failures are reported to stdout instead of being
    raised, matching the other TTS helpers in this module.
    """
    CHUNK_SIZE = 1024
    tts_url = f"https://api.elevenlabs.io/v1/text-to-speech/{ELEVENLABS_TTS_VOICE}/stream"

    headers = {
        "Accept": "application/json",
        "xi-api-key": ELEVENLABS_API_KEY
    }

    data = {
        "text": text,
        "model_id": "eleven_multilingual_v2",
        "voice_settings": {
            "stability": 0.5,
            "similarity_boost": 0.8,
            "style": 0.0,
            "use_speaker_boost": True
        }
    }

    try:
        response = requests.post(tts_url, headers=headers, json=data, stream=True, timeout=30)
    except requests.exceptions.RequestException as e:
        # Network-level failure (DNS, connect timeout, reset, ...). Without
        # this guard the exception would propagate and crash the caller,
        # unlike the other TTS providers which just log and continue.
        print("Error generating speech:", e)
        return

    if response.ok:
        with open(output_path, "wb") as f:
            for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
                if chunk:  # skip keep-alive chunks so no empty writes occur
                    f.write(chunk)
        print("Audio stream saved successfully.")
    else:
        print("Error generating speech:", response.text)

def sanitize_response(response):
response = re.sub(r'\*.*?\*', '', response)
response = re.sub(r'[^\w\s,.\'!?]', '', response)
Expand Down Expand Up @@ -324,6 +389,7 @@ def chatgpt_streamed(user_input, system_message, mood_prompt, conversation_histo
response.raise_for_status()

print("Starting OpenAI stream...")
line_buffer = ""
for line in response.iter_lines(decode_unicode=True):
if line.startswith("data:"):
line = line[5:].strip()
Expand All @@ -332,18 +398,25 @@ def chatgpt_streamed(user_input, system_message, mood_prompt, conversation_histo
chunk = json.loads(line)
delta_content = chunk['choices'][0]['delta'].get('content', '')
if delta_content:
print(NEON_GREEN + delta_content + RESET_COLOR, end='', flush=True)
full_response += delta_content
line_buffer += delta_content
if '\n' in line_buffer:
lines = line_buffer.split('\n')
for line in lines[:-1]:
print(NEON_GREEN + line + RESET_COLOR)
full_response += line + '\n'
line_buffer = lines[-1]
except json.JSONDecodeError:
continue
if line_buffer:
print(NEON_GREEN + line_buffer + RESET_COLOR)
full_response += line_buffer
print("\nOpenAI stream complete.")

except requests.exceptions.RequestException as e:
full_response = f"Error connecting to OpenAI model: {e}"

return full_response


def transcribe_with_whisper(audio_file):
segments, info = whisper_model.transcribe(audio_file, beam_size=5)
transcription = ""
Expand Down Expand Up @@ -387,7 +460,12 @@ def record_audio(file_path, silence_threshold=512, silence_duration=4.0, chunk_s

def execute_once(question_prompt):
temp_image_path = os.path.join(output_dir, 'temp_img.jpg')
temp_audio_path = os.path.join(output_dir, 'temp_audio.wav')

# Determine the audio file format based on the TTS provider
if TTS_PROVIDER == 'elevenlabs':
temp_audio_path = os.path.join(output_dir, 'temp_audio.mp3') # Use mp3 for ElevenLabs
else:
temp_audio_path = os.path.join(output_dir, 'temp_audio.wav') # Use wav for others

image_path = take_screenshot(temp_image_path)
response = analyze_image(image_path, question_prompt)
Expand All @@ -400,10 +478,19 @@ def execute_once(question_prompt):
print(text_response)

generate_speech(text_response, temp_audio_path)
play_audio(temp_audio_path)

if TTS_PROVIDER == 'elevenlabs':
# Convert MP3 to WAV if ElevenLabs is used
temp_wav_path = os.path.join(output_dir, 'temp_output.wav')
audio = AudioSegment.from_mp3(temp_audio_path)
audio.export(temp_wav_path, format="wav")
play_audio(temp_wav_path)
else:
play_audio(temp_audio_path)

os.remove(image_path)


def execute_screenshot_and_analyze():
question_prompt = "What do you see in this image? Keep it short but detailed and answer any follow up questions about it"
print("Taking screenshot and analyzing...")
Expand Down Expand Up @@ -472,6 +559,8 @@ def generate_speech(text, temp_audio_path):
audio_file.write(response.content)
else:
print(f"Failed to generate speech: {response.status_code} - {response.text}")
elif TTS_PROVIDER == 'elevenlabs':
elevenlabs_text_to_speech(text, temp_audio_path)
else:
tts_model = xtts_model
try:
Expand Down
9 changes: 6 additions & 3 deletions app/app_logic.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,10 @@
init_ollama_model,
init_openai_model,
init_openai_tts_voice,
init_elevenlabs_tts_voice,
init_xtts_speed,
)


router = APIRouter()

continue_conversation = False
Expand Down Expand Up @@ -111,7 +112,6 @@ async def conversation_loop():
await send_message_to_clients(f"{current_character.capitalize()}: {chatbot_response}")
print(f"{current_character.capitalize()}: {chatbot_response}")


def set_env_variable(key: str, value: str):
os.environ[key] = value
if key == "OLLAMA_MODEL":
Expand All @@ -120,4 +120,7 @@ def set_env_variable(key: str, value: str):
init_openai_model(value) # Reinitialize OpenAI model
if key == "OPENAI_TTS_VOICE":
init_openai_tts_voice(value) # Reinitialize OpenAI TTS voice

if key == "ELEVENLABS_TTS_VOICE":
init_elevenlabs_tts_voice(value) # Reinitialize Elevenlabs TTS voice
if key == "XTTS_SPEED":
init_xtts_speed(value) # Reinitialize XTTS speed
19 changes: 18 additions & 1 deletion app/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ async def get(request: Request):
openai_model = os.getenv("OPENAI_MODEL")
ollama_model = os.getenv("OLLAMA_MODEL")
xtts_speed = os.getenv("XTTS_SPEED")
elevenlabs_voice = os.getenv("ELEVENLABS_TTS_VOICE")

return templates.TemplateResponse("index.html", {
"request": request,
Expand All @@ -49,6 +50,7 @@ async def get(request: Request):
"openai_model": openai_model,
"ollama_model": ollama_model,
"xtts_speed": xtts_speed,
"elevenlabs_voice": elevenlabs_voice,
})

@app.get("/characters")
Expand All @@ -58,6 +60,19 @@ async def get_characters():
characters = [name for name in os.listdir(characters_folder) if os.path.isdir(os.path.join(characters_folder, name))]
return {"characters": characters}

@app.get("/elevenlabs_voices")
async def get_elevenlabs_voices():
    """Serve the voice list from elevenlabs_voices.json at the project root.

    Returns {"voices": []} when the file is absent, and {"error": ...} for
    any other failure, so the frontend always receives a JSON body.
    """
    app_dir = os.path.dirname(os.path.abspath(__file__))
    voices_path = os.path.join(os.path.dirname(app_dir), 'elevenlabs_voices.json')
    try:
        with open(voices_path, 'r', encoding='utf-8') as handle:
            return json.load(handle)
    except FileNotFoundError:
        return {"voices": []}
    except Exception as exc:
        return {"error": str(exc)}

@app.websocket("/ws")
async def websocket_endpoint(websocket: WebSocket):
await websocket.accept()
Expand Down Expand Up @@ -85,6 +100,8 @@ async def websocket_endpoint(websocket: WebSocket):
set_env_variable("OLLAMA_MODEL", message["model"])
elif message["action"] == "set_xtts_speed":
set_env_variable("XTTS_SPEED", message["speed"])
elif message["action"] == "set_elevenlabs_voice":
set_env_variable("ELEVENLABS_TTS_VOICE", message["voice"])
except WebSocketDisconnect:
if websocket in clients:
clients.remove(websocket)
Expand All @@ -103,4 +120,4 @@ def signal_handler(sig, frame):
try:
uvicorn.run(app, host="0.0.0.0", port=8000)
except KeyboardInterrupt:
print("Server stopped by user.")
print("Server stopped by user.")
Loading

0 comments on commit 95e2c24

Please sign in to comment.