diff --git a/.gitignore b/.gitignore index da754a4..1c19331 100644 --- a/.gitignore +++ b/.gitignore @@ -216,4 +216,4 @@ temp_output.wav temp_output.mp3 #elevenlabs_voices.json -elevenlabs_voices.json.bak +#elevenlabs_voices.json.bak diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..39932da --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2024 bigsk1 + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
\ No newline at end of file diff --git a/README.md b/README.md index 98a7027..5bfc2d6 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,12 @@ [![Python application](https://github.com/bigsk1/voice-chat-ai/actions/workflows/python-app.yml/badge.svg)](https://github.com/bigsk1/voice-chat-ai/actions/workflows/python-app.yml) +[![License](https://img.shields.io/github/license/bigsk1/voice-chat-ai)](https://github.com/bigsk1/voice-chat-ai/blob/main/LICENSE) -# Voice Chat AI +# Voice Chat AI 🎙️ -Voice Chat AI is a project that allows you to interact with different AI characters using speech. You can choose between various characters, each with unique personalities and voices. You can run all locally, you can use openai for chat and voice, you can mix between the two. +Voice Chat AI is a project that allows you to interact with different AI characters using speech. You can choose between various characters, each with unique personalities and voices. Have a serious conversation with Albert Einstein or role play with the OS from the movie HER. + +You can run all locally, you can use openai for chat and voice, you can mix between the two. You can use ElevenLabs voices with ollama models all controlled from a Web UI. Ask the AI to look at your screen and it will explain in detail what it's looking at. ![Ai-Speech](https://imagedelivery.net/WfhVb8dSNAAvdXUdMfBuPQ/ed0edfea-265d-4c23-d11d-0b5ba0f02d00/public) @@ -25,7 +28,7 @@ Voice Chat AI is a project that allows you to interact with different AI charact - Python 3.10 - CUDA-enabled GPU - Ollama models or Openai API for chat -- XTTS or Openai API or ElevenLabs API for speech +- Local XTTS or Openai API or ElevenLabs API for speech - Microsoft C++ Build Tools on windows - Microphone - A sense of humor @@ -39,14 +42,14 @@ Voice Chat AI is a project that allows you to interact with different AI charact cd voice-chat-ai ``` -2. Create a virtual environment: +2. 
Create a virtual environment: 🐍 ```bash python -m venv venv source venv/bin/activate # On Windows use `venv\Scripts\Activate` ``` - or use conda just make it python 3.10 + or use `conda`, just make sure it is Python 3.10 ```bash conda create --name voice-chat-ai python=3.10 @@ -121,7 +124,7 @@ unzip XTTS-v2.zip -d . ## Docker - Experimental -This image is huge when built because of all the checkpoints, base image, build tools and audio tools - 40gb - there maybe a way to get it smaller I haven't tried yet +This image is huge when built because of all the checkpoints, base image, build tools and audio tools - 40gb - there may be a way to get it smaller that I haven't tried yet; this was just an experiment to see if I could get it to work! Docker run command allows you to use microphone in docker container @@ -142,7 +145,7 @@ docker run -d --gpus all -e "PULSE_SERVER=/mnt/wslg/PulseServer" -v \\wsl$\Ubunt In the docker folder there is also some scripts to update the model and tts provider into the container, so you can change from openai to ollama and back again if you like, instead of exec into the container and making changes manually. -## Configuration +## Configuration ⚙️ 1. Rename the .env.sample to `.env` in the root directory of the project and configure it with the necessary environment variables: - The app is controlled based on the variables you add. @@ -174,7 +177,7 @@ OPENAI_API_KEY=sk-proj-1111111111 ELEVENLABS_API_KEY=49b111111111 -# Default voice ID +# Default voice ID - add voice IDs and names to elevenlabs_voices.json for the dropdown list in the web UI ELEVENLABS_TTS_VOICE=VgPpppppppp # Models to use - llama3 works well for local usage. 
@@ -195,7 +198,7 @@ XTTS_SPEED=1.2 ## Usage -Run the application: +Run the application: 🏃 Web UI ```bash diff --git a/app/app.py b/app/app.py index 16aa028..93c2ad4 100644 --- a/app/app.py +++ b/app/app.py @@ -20,6 +20,7 @@ import io from pydub import AudioSegment + # Load environment variables load_dotenv() @@ -98,18 +99,36 @@ def init_xtts_speed(speed_value): XTTS_SPEED = speed_value print(f"Switched to XTTS speed: {speed_value}") - # Initial model and TTS voice setup if MODEL_PROVIDER == "openai": init_openai_model(OPENAI_MODEL) - init_openai_tts_voice(OPENAI_TTS_VOICE) + #init_openai_tts_voice(OPENAI_TTS_VOICE) elif MODEL_PROVIDER == "ollama": init_ollama_model(OLLAMA_MODEL) if TTS_PROVIDER == "elevenlabs": - init_elevenlabs_tts_voice(ELEVENLABS_TTS_VOICE) - + init_elevenlabs_tts_voice(ELEVENLABS_TTS_VOICE) + +# Function to display ElevenLabs quota +def display_elevenlabs_quota(): + try: + response = requests.get( + "https://api.elevenlabs.io/v1/user", + headers={"xi-api-key": ELEVENLABS_API_KEY}, + timeout=30 + ) + response.raise_for_status() + user_data = response.json() + character_count = user_data['subscription']['character_count'] + character_limit = user_data['subscription']['character_limit'] + print(f"{NEON_GREEN}ElevenLabs Character Usage: {character_count} / {character_limit}{RESET_COLOR}") + except Exception as e: + print(f"{YELLOW}Could not fetch ElevenLabs quota: {e}{RESET_COLOR}") + +if TTS_PROVIDER == "elevenlabs": + display_elevenlabs_quota() + # Function to open a file and return its contents as a string def open_file(filepath): with open(filepath, 'r', encoding='utf-8') as infile: @@ -464,14 +483,19 @@ def execute_once(question_prompt): # Determine the audio file format based on the TTS provider if TTS_PROVIDER == 'elevenlabs': temp_audio_path = os.path.join(output_dir, 'temp_audio.mp3') # Use mp3 for ElevenLabs + max_char_length = 500 # Set a higher limit for ElevenLabs + elif TTS_PROVIDER == 'openai': + temp_audio_path = 
os.path.join(output_dir, 'temp_audio.wav') # Use wav for OpenAI + max_char_length = 500 # Set a higher limit for OpenAI else: - temp_audio_path = os.path.join(output_dir, 'temp_audio.wav') # Use wav for others + temp_audio_path = os.path.join(output_dir, 'temp_audio.wav') # Use wav for XTTS + max_char_length = 250 # Set a lower limit for XTTS image_path = take_screenshot(temp_image_path) response = analyze_image(image_path, question_prompt) text_response = response.get('choices', [{}])[0].get('message', {}).get('content', 'No response received.') - max_char_length = 350 + # Truncate response based on the TTS provider's limit if len(text_response) > max_char_length: text_response = text_response[:max_char_length] + "..." diff --git a/app/app_logic.py b/app/app_logic.py index a7c87ec..edcc0f4 100644 --- a/app/app_logic.py +++ b/app/app_logic.py @@ -25,12 +25,13 @@ conversation_history = [] clients = [] -current_character = "pirate" # Default character +current_character = "pirate" # Default character, used as a placeholder if nothing is selected def set_character(character): global current_character current_character = character + def record_audio_and_transcribe(): audio_file = "temp_recording.wav" record_audio(audio_file) diff --git a/app/static/css/styles.css b/app/static/css/styles.css index bffd532..0ffd79e 100644 --- a/app/static/css/styles.css +++ b/app/static/css/styles.css @@ -5,26 +5,46 @@ body { display: flex; flex-direction: column; align-items: center; - justify-content: center; - height: 100vh; + justify-content: flex-start; + height: 90vh; margin: 0; + padding: 10px; + text-align: center; +} + +header, footer { + width: 100%; + text-align: center; + padding: 10px 0; + background-color: #1e1e1e; + border-bottom: 1px solid #333; +} + +main { + width: 100%; + max-width: 80%; + text-align: center; } h1 { - margin-bottom: 20px; + margin-bottom: 15px; } #conversation { - width: 80%; - max-width: 800px; - height: 400px; + width: 100%; + max-height: 475px; + min-height: 
475px; border: 1px solid #ffffff; - padding: 10px; - overflow-y: scroll; + padding: 8px; + overflow-y: auto; background-color: #1e1e1e; margin-bottom: 20px; + margin-top: 20px; + box-shadow: 0 0 10px rgba(0, 0, 0, 0.5); + text-align: center; } + #messages p { margin: 10px 0; } @@ -37,12 +57,20 @@ button { color: #ffffff; border: none; border-radius: 5px; + transition: background-color 0.3s ease; } button:hover { background-color: #555555; } +.controls { + display: flex; + justify-content: center; + gap: 30px; + margin-bottom: 20px; +} + .dropdown-container { display: flex; flex-wrap: wrap; @@ -84,3 +112,12 @@ code { color: #07f107; } +.ai-message { + color: #25c480; + margin: 5px 0; +} + +.user-message { + color: #ffffff; + margin: 5px 0; +} diff --git a/app/static/js/scripts.js b/app/static/js/scripts.js index fc4cb1f..ec987ef 100644 --- a/app/static/js/scripts.js +++ b/app/static/js/scripts.js @@ -38,6 +38,10 @@ document.addEventListener("DOMContentLoaded", function() { } else { messages.innerHTML += `

${formattedMessage}

`; } + + // Scroll to the bottom of the messages div + const conversation = document.getElementById('conversation'); + conversation.scrollTop = conversation.scrollHeight; }; document.getElementById('start-conversation-btn').addEventListener('click', function() { diff --git a/app/templates/index.html b/app/templates/index.html index 06a90f7..2bc7897 100644 --- a/app/templates/index.html +++ b/app/templates/index.html @@ -1,101 +1,104 @@ - + - Voice Chat AI - - + + - + Voice Chat AI -

Voice Chat AI

-
-
-
- - - -