Skip to content

Commit

Permalink
add elevenlabs
Browse files Browse the repository at this point in the history
  • Loading branch information
bigsk1 committed Jun 22, 2024
1 parent c0635f6 commit 95e2c24
Show file tree
Hide file tree
Showing 11 changed files with 295 additions and 47 deletions.
7 changes: 6 additions & 1 deletion .env.sample
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ CHARACTER_NAME=wizard


# Text-to-Speech Provider - Options: xtts (local uses the custom character .wav) or openai (uses OpenAI TTS voice) - once set if run webui can't change in ui until you stop server and restart
# openai or xtts
# openai or xtts or elevenlabs
TTS_PROVIDER=xtts

# OpenAI TTS Voice - When TTS_PROVIDER is set to openai above, it will use the chosen voice.
Expand All @@ -23,6 +23,11 @@ OPENAI_BASE_URL=https://api.openai.com/v1/chat/completions
OPENAI_TTS_URL=https://api.openai.com/v1/audio/speech
OLLAMA_BASE_URL=http://localhost:11434

ELEVENLABS_API_KEY=49b111111111

# Default voice ID
ELEVENLABS_TTS_VOICE=VgPpppppppp

# Models to use - llama3 works well for local usage.
# OPTIONAL: For screen analysis, if MODEL_PROVIDER is ollama, llava will be used by default.
# Ensure you have llava downloaded with Ollama. If OpenAI is used, gpt-4o works well.
Expand Down
8 changes: 7 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -210,4 +210,10 @@ downloads/
XTTS-v2
checkpoints
XTTS-v2/*
checkpoints/*
checkpoints/*

temp_output.wav
temp_output.mp3

#elevenlabs_voices.json
elevenlabs_voices.json.bak
36 changes: 34 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ Voice Chat AI is a project that allows you to interact with different AI charact
## Features

- **Supports both OpenAI and Ollama language models**: Choose the model that best fits your needs.
- **Provides text-to-speech synthesis using XTTS or OpenAI TTS**: Enjoy natural and expressive voices.
- **Provides text-to-speech synthesis using XTTS, OpenAI TTS, or ElevenLabs**: Enjoy natural and expressive voices.
- **No typing needed, just speak**: Hands-free interaction makes conversations smooth and effortless.
- **Analyzes user mood and adjusts AI responses accordingly**: Get personalized responses based on your mood.
- **You can, just by speaking, have the AI analyze your screen and chat about it**: Seamlessly integrate visual context into your conversations.
Expand All @@ -24,6 +24,8 @@ Voice Chat AI is a project that allows you to interact with different AI charact

- Python 3.10
- CUDA-enabled GPU
- Ollama models or OpenAI API for chat
- XTTS, OpenAI API, or ElevenLabs API for speech
- Microsoft C++ Build Tools on windows
- Microphone
- A sense of humor
Expand Down Expand Up @@ -154,7 +156,7 @@ MODEL_PROVIDER=ollama
# Character to use - Options: samantha, wizard, pirate, valleygirl, newscaster1920s, alien_scientist, cyberpunk, detective
CHARACTER_NAME=wizard
# Text-to-Speech Provider - Options: xtts (local uses the custom character .wav) or openai (uses OpenAI TTS voice)
# Text-to-Speech Provider - Options: xtts (local uses the custom character .wav) or openai (uses OpenAI TTS voice) or elevenlabs (add voice names and id's to elevenlabs_voices.json)
TTS_PROVIDER=xtts
# OpenAI TTS Voice - When TTS_PROVIDER is set to openai above, it will use the chosen voice.
Expand All @@ -170,6 +172,11 @@ OLLAMA_BASE_URL=http://localhost:11434
# OpenAI API Key for models and speech (replace with your actual API key)
OPENAI_API_KEY=sk-proj-1111111111
ELEVENLABS_API_KEY=49b111111111
# Default voice ID
ELEVENLABS_TTS_VOICE=VgPpppppppp
# Models to use - llama3 works well for local usage.
# OPTIONAL: For screen analysis, if MODEL_PROVIDER is ollama, llava will be used by default.
# Ensure you have llava downloaded with Ollama. If OpenAI is used, gpt-4o works well.
Expand Down Expand Up @@ -215,6 +222,31 @@ python cli.py
"screenshot" to have the AI explain what it is seeing in detail.
- To stop the conversation, say "Quit", "Exit", or "Leave". ( ctl+c always works also)

### ElevenLabs

Add voice names and voice IDs — in the web UI you can select them from the dropdown menu.

```json
{
"voices": [
{
"id": "8qUUChaaaaaaaaa",
"name": "Joe - cool, calm, deep"
},
{
"id": "JqseNaaaaaaaaaa",
"name": "Joanne - pensive, introspective"
},
{
"id": "L5iaaaaaaaaa",
"name": "Victoria - Classy British Mature"
}
]
}
```
For the CLI, the voice ID set in the `.env` file will be used.


## Adding New Characters

1. Create a new folder for the character in the project's characters directory.
Expand Down
103 changes: 96 additions & 7 deletions app/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from pathlib import Path
import re
import io
from pydub import AudioSegment

# Load environment variables
load_dotenv()
Expand All @@ -32,6 +33,10 @@
OPENAI_BASE_URL = os.getenv('OPENAI_BASE_URL')
OLLAMA_MODEL = os.getenv('OLLAMA_MODEL')
OLLAMA_BASE_URL = os.getenv('OLLAMA_BASE_URL')
ELEVENLABS_API_KEY = os.getenv('ELEVENLABS_API_KEY')
ELEVENLABS_TTS_VOICE = os.getenv('ELEVENLABS_TTS_VOICE')
XTTS_SPEED = os.getenv('XTTS_SPEED', '1.1')


# Initialize OpenAI API key
OpenAI.api_key = OPENAI_API_KEY
Expand Down Expand Up @@ -83,15 +88,27 @@ def init_openai_tts_voice(voice_name):
OPENAI_TTS_VOICE = voice_name
print(f"Switched to OpenAI TTS voice: {voice_name}")

def init_elevenlabs_tts_voice(voice_name):
    """Make `voice_name` (an ElevenLabs voice ID) the active TTS voice.

    Updates the module-level ELEVENLABS_TTS_VOICE so subsequent speech
    requests use this voice, and logs the switch.
    """
    global ELEVENLABS_TTS_VOICE
    ELEVENLABS_TTS_VOICE = voice_name
    print("Switched to ElevenLabs TTS voice: " + voice_name)

def init_xtts_speed(speed_value):
    """Record the playback speed used by the local XTTS synthesizer.

    Stores the value in the module-level XTTS_SPEED (consumed as a float
    when synthesizing) and logs the change.
    """
    global XTTS_SPEED
    XTTS_SPEED = speed_value
    print("Switched to XTTS speed: " + str(speed_value))


# Initial model and TTS voice setup.
# The chat-model initializer is chosen by MODEL_PROVIDER; only the OpenAI
# path also primes its TTS voice here, since OpenAI serves both roles.
if MODEL_PROVIDER == "openai":
    init_openai_model(OPENAI_MODEL)
    init_openai_tts_voice(OPENAI_TTS_VOICE)
elif MODEL_PROVIDER == "ollama":
    init_ollama_model(OLLAMA_MODEL)

# ElevenLabs is keyed off TTS_PROVIDER, independent of the chat-model
# provider selected above.
if TTS_PROVIDER == "elevenlabs":
    init_elevenlabs_tts_voice(ELEVENLABS_TTS_VOICE)


# Function to open a file and return its contents as a string
def open_file(filepath):
Expand All @@ -100,6 +117,15 @@ def open_file(filepath):

# Function to play audio using PyAudio
def play_audio(file_path):
file_extension = Path(file_path).suffix.lstrip('.').lower()

temp_wav_path = os.path.join(output_dir, 'temp_output.wav')

if file_extension == 'mp3':
audio = AudioSegment.from_mp3(file_path)
audio.export(temp_wav_path, format="wav")
file_path = temp_wav_path

wf = wave.open(file_path, 'rb')
p = pyaudio.PyAudio()
stream = p.open(format=p.get_format_from_width(wf.getsampwidth()),
Expand Down Expand Up @@ -136,6 +162,15 @@ def process_and_play(prompt, audio_file_pth):
play_audio(output_path)
else:
print("Error: Audio file not found.")
elif TTS_PROVIDER == 'elevenlabs':
output_path = os.path.join(output_dir, 'output.mp3')
elevenlabs_text_to_speech(prompt, output_path)
print(f"Generated audio file at: {output_path}")
if os.path.exists(output_path):
print("Playing generated audio...")
play_audio(output_path)
else:
print("Error: Audio file not found.")
else:
tts_model = xtts_model
try:
Expand All @@ -146,7 +181,7 @@ def process_and_play(prompt, audio_file_pth):
gpt_cond_len=24,
temperature=0.2,
language='en',
speed=1.1
speed=float(XTTS_SPEED)
)
synthesized_audio = outputs['wav']
src_path = os.path.join(output_dir, 'output.wav')
Expand Down Expand Up @@ -211,6 +246,36 @@ def openai_text_to_speech(prompt, output_path):
except requests.HTTPError as e:
print(f"Error during OpenAI TTS: {e}")

def elevenlabs_text_to_speech(text, output_path):
    """Synthesize `text` via the ElevenLabs streaming TTS API into an MP3 file.

    Uses the module-level ELEVENLABS_TTS_VOICE voice ID and
    ELEVENLABS_API_KEY. Failures are reported to stdout instead of being
    raised, matching the other TTS helpers in this module.
    """
    CHUNK_SIZE = 1024
    tts_url = f"https://api.elevenlabs.io/v1/text-to-speech/{ELEVENLABS_TTS_VOICE}/stream"

    headers = {
        "Accept": "application/json",
        "xi-api-key": ELEVENLABS_API_KEY
    }

    data = {
        "text": text,
        "model_id": "eleven_multilingual_v2",
        "voice_settings": {
            "stability": 0.5,
            "similarity_boost": 0.8,
            "style": 0.0,
            "use_speaker_boost": True
        }
    }

    try:
        response = requests.post(tts_url, headers=headers, json=data, stream=True, timeout=30)
    except requests.exceptions.RequestException as e:
        # Network-level failure (DNS, connect timeout, reset, ...). Without
        # this guard the exception would propagate and crash the caller,
        # unlike the other TTS providers which just log and continue.
        print("Error generating speech:", e)
        return

    if response.ok:
        with open(output_path, "wb") as f:
            for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
                if chunk:  # skip keep-alive chunks so no empty writes occur
                    f.write(chunk)
        print("Audio stream saved successfully.")
    else:
        print("Error generating speech:", response.text)

def sanitize_response(response):
response = re.sub(r'\*.*?\*', '', response)
response = re.sub(r'[^\w\s,.\'!?]', '', response)
Expand Down Expand Up @@ -324,6 +389,7 @@ def chatgpt_streamed(user_input, system_message, mood_prompt, conversation_histo
response.raise_for_status()

print("Starting OpenAI stream...")
line_buffer = ""
for line in response.iter_lines(decode_unicode=True):
if line.startswith("data:"):
line = line[5:].strip()
Expand All @@ -332,18 +398,25 @@ def chatgpt_streamed(user_input, system_message, mood_prompt, conversation_histo
chunk = json.loads(line)
delta_content = chunk['choices'][0]['delta'].get('content', '')
if delta_content:
print(NEON_GREEN + delta_content + RESET_COLOR, end='', flush=True)
full_response += delta_content
line_buffer += delta_content
if '\n' in line_buffer:
lines = line_buffer.split('\n')
for line in lines[:-1]:
print(NEON_GREEN + line + RESET_COLOR)
full_response += line + '\n'
line_buffer = lines[-1]
except json.JSONDecodeError:
continue
if line_buffer:
print(NEON_GREEN + line_buffer + RESET_COLOR)
full_response += line_buffer
print("\nOpenAI stream complete.")

except requests.exceptions.RequestException as e:
full_response = f"Error connecting to OpenAI model: {e}"

return full_response


def transcribe_with_whisper(audio_file):
segments, info = whisper_model.transcribe(audio_file, beam_size=5)
transcription = ""
Expand Down Expand Up @@ -387,7 +460,12 @@ def record_audio(file_path, silence_threshold=512, silence_duration=4.0, chunk_s

def execute_once(question_prompt):
temp_image_path = os.path.join(output_dir, 'temp_img.jpg')
temp_audio_path = os.path.join(output_dir, 'temp_audio.wav')

# Determine the audio file format based on the TTS provider
if TTS_PROVIDER == 'elevenlabs':
temp_audio_path = os.path.join(output_dir, 'temp_audio.mp3') # Use mp3 for ElevenLabs
else:
temp_audio_path = os.path.join(output_dir, 'temp_audio.wav') # Use wav for others

image_path = take_screenshot(temp_image_path)
response = analyze_image(image_path, question_prompt)
Expand All @@ -400,10 +478,19 @@ def execute_once(question_prompt):
print(text_response)

generate_speech(text_response, temp_audio_path)
play_audio(temp_audio_path)

if TTS_PROVIDER == 'elevenlabs':
# Convert MP3 to WAV if ElevenLabs is used
temp_wav_path = os.path.join(output_dir, 'temp_output.wav')
audio = AudioSegment.from_mp3(temp_audio_path)
audio.export(temp_wav_path, format="wav")
play_audio(temp_wav_path)
else:
play_audio(temp_audio_path)

os.remove(image_path)


def execute_screenshot_and_analyze():
question_prompt = "What do you see in this image? Keep it short but detailed and answer any follow up questions about it"
print("Taking screenshot and analyzing...")
Expand Down Expand Up @@ -472,6 +559,8 @@ def generate_speech(text, temp_audio_path):
audio_file.write(response.content)
else:
print(f"Failed to generate speech: {response.status_code} - {response.text}")
elif TTS_PROVIDER == 'elevenlabs':
elevenlabs_text_to_speech(text, temp_audio_path)
else:
tts_model = xtts_model
try:
Expand Down
9 changes: 6 additions & 3 deletions app/app_logic.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,10 @@
init_ollama_model,
init_openai_model,
init_openai_tts_voice,
init_elevenlabs_tts_voice,
init_xtts_speed,
)


router = APIRouter()

continue_conversation = False
Expand Down Expand Up @@ -111,7 +112,6 @@ async def conversation_loop():
await send_message_to_clients(f"{current_character.capitalize()}: {chatbot_response}")
print(f"{current_character.capitalize()}: {chatbot_response}")


def set_env_variable(key: str, value: str):
os.environ[key] = value
if key == "OLLAMA_MODEL":
Expand All @@ -120,4 +120,7 @@ def set_env_variable(key: str, value: str):
init_openai_model(value) # Reinitialize OpenAI model
if key == "OPENAI_TTS_VOICE":
init_openai_tts_voice(value) # Reinitialize OpenAI TTS voice

if key == "ELEVENLABS_TTS_VOICE":
init_elevenlabs_tts_voice(value) # Reinitialize Elevenlabs TTS voice
if key == "XTTS_SPEED":
init_xtts_speed(value) # Reinitialize XTTS speed
19 changes: 18 additions & 1 deletion app/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ async def get(request: Request):
openai_model = os.getenv("OPENAI_MODEL")
ollama_model = os.getenv("OLLAMA_MODEL")
xtts_speed = os.getenv("XTTS_SPEED")
elevenlabs_voice = os.getenv("ELEVENLABS_TTS_VOICE")

return templates.TemplateResponse("index.html", {
"request": request,
Expand All @@ -49,6 +50,7 @@ async def get(request: Request):
"openai_model": openai_model,
"ollama_model": ollama_model,
"xtts_speed": xtts_speed,
"elevenlabs_voice": elevenlabs_voice,
})

@app.get("/characters")
Expand All @@ -58,6 +60,19 @@ async def get_characters():
characters = [name for name in os.listdir(characters_folder) if os.path.isdir(os.path.join(characters_folder, name))]
return {"characters": characters}

@app.get("/elevenlabs_voices")
async def get_elevenlabs_voices():
    """Serve the voice list from elevenlabs_voices.json at the project root.

    Returns {"voices": []} when the file is absent, and {"error": ...} for
    any other failure, so the frontend always receives a JSON body.
    """
    app_dir = os.path.dirname(os.path.abspath(__file__))
    voices_path = os.path.join(os.path.dirname(app_dir), 'elevenlabs_voices.json')
    try:
        with open(voices_path, 'r', encoding='utf-8') as handle:
            return json.load(handle)
    except FileNotFoundError:
        return {"voices": []}
    except Exception as exc:
        return {"error": str(exc)}

@app.websocket("/ws")
async def websocket_endpoint(websocket: WebSocket):
await websocket.accept()
Expand Down Expand Up @@ -85,6 +100,8 @@ async def websocket_endpoint(websocket: WebSocket):
set_env_variable("OLLAMA_MODEL", message["model"])
elif message["action"] == "set_xtts_speed":
set_env_variable("XTTS_SPEED", message["speed"])
elif message["action"] == "set_elevenlabs_voice":
set_env_variable("ELEVENLABS_TTS_VOICE", message["voice"])
except WebSocketDisconnect:
if websocket in clients:
clients.remove(websocket)
Expand All @@ -103,4 +120,4 @@ def signal_handler(sig, frame):
try:
uvicorn.run(app, host="0.0.0.0", port=8000)
except KeyboardInterrupt:
print("Server stopped by user.")
print("Server stopped by user.")
Loading

0 comments on commit 95e2c24

Please sign in to comment.