Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

🗣️ feat: STT & TTS #1603

Closed
wants to merge 191 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
191 commits
Select commit Hold shift + click to select a range
1af6751
Update TextChat.jsx
bsu3338 Aug 4, 2023
b3636ab
Update SubmitButton.jsx
bsu3338 Aug 4, 2023
4401d0d
Update TextChat.jsx
bsu3338 Aug 4, 2023
07b2af1
Merge branch 'danny-avila:main' into Speech-to-Text
bsu3338 Aug 5, 2023
5a67874
Update SubmitButton.jsx
bsu3338 Aug 5, 2023
14f4d66
Create ListeningIcon.tsx
bsu3338 Aug 5, 2023
65a7b2b
Update index.ts
bsu3338 Aug 5, 2023
31441ed
Update SubmitButton.jsx
bsu3338 Aug 5, 2023
74fa8d1
Update TextChat.jsx
bsu3338 Aug 5, 2023
37c0f5b
Update ListeningIcon.tsx
bsu3338 Aug 5, 2023
46c53d1
Update ListeningIcon.tsx
bsu3338 Aug 5, 2023
2ffb5be
Create SpeechRecognition.tsx
bsu3338 Aug 5, 2023
49a9dae
Update TextChat.jsx
bsu3338 Aug 5, 2023
eb842c6
Update TextChat.jsx
bsu3338 Aug 5, 2023
8982ec1
Update SpeechRecognition.tsx
bsu3338 Aug 5, 2023
ca3f064
Update TextChat.jsx
bsu3338 Aug 5, 2023
2522d76
Update SpeechRecognition.tsx
bsu3338 Aug 5, 2023
d9a4d2f
Update SpeechRecognition.tsx
bsu3338 Aug 5, 2023
42aadd2
Update SpeechRecognition.tsx
bsu3338 Aug 5, 2023
5ad9927
Merge branch 'danny-avila:main' into Speech-to-Text
bsu3338 Aug 5, 2023
5d76082
Update SpeechRecognition.tsx
bsu3338 Aug 5, 2023
93ceae6
Merge branch 'danny-avila:main' into Speech-to-Text
bsu3338 Aug 5, 2023
b49024f
Update SubmitButton.jsx
bsu3338 Aug 6, 2023
28a00a5
Update TextChat.jsx
bsu3338 Aug 6, 2023
69ff48d
Update SpeechRecognition.tsx
bsu3338 Aug 6, 2023
cfe6325
Merge branch 'main' into Speech-to-Text
bsu3338 Aug 6, 2023
fd23679
Merge branch 'main' into Speech-to-Text
bsu3338 Aug 6, 2023
148a71b
Merge branch 'main' into Speech-to-Text
bsu3338 Aug 7, 2023
252325d
Merge branch 'main' into Speech-to-Text
bsu3338 Aug 9, 2023
f9ed2ad
Create SpeechSynthesis.tsx
bsu3338 Aug 9, 2023
09c68d1
Update index.jsx
bsu3338 Aug 9, 2023
e7d7d73
Update SpeechSynthesis.tsx
bsu3338 Aug 9, 2023
e313637
Update SpeechRecognition.tsx
bsu3338 Aug 9, 2023
78278b5
Merge branch 'main' into Speech-to-Text
bsu3338 Aug 9, 2023
776daa1
Update TextChat.jsx
bsu3338 Aug 9, 2023
c02d43b
Update SpeechRecognition.tsx
bsu3338 Aug 9, 2023
c7ffb25
Update SpeechRecognition.tsx
bsu3338 Aug 9, 2023
10e3be5
Update SpeechRecognition.tsx
bsu3338 Aug 9, 2023
78a8106
Update TextChat.jsx
bsu3338 Aug 11, 2023
7e8bae2
Merge branch 'danny-avila:main' into Speech-to-Text
bsu3338 Aug 11, 2023
ed4b25b
Squashed commit of the following:
bsu3338 Sep 3, 2023
ad3c78f
Merge branch 'danny-avila:main' into Speech-September
bsu3338 Sep 3, 2023
863af2c
Create VolumeMuteIcon.tsx
bsu3338 Sep 3, 2023
b03001d
Create VolumeIcon.tsx
bsu3338 Sep 3, 2023
29a5b55
Update index.ts
bsu3338 Sep 3, 2023
8d5114b
Update SubmitButton.jsx
bsu3338 Sep 3, 2023
6583877
Update SubmitButton.jsx
bsu3338 Sep 3, 2023
9a3e67f
Update TextChat.jsx
bsu3338 Sep 3, 2023
6033eb3
Update TextChat.jsx
bsu3338 Sep 4, 2023
d405454
Update SpeechRecognition.tsx
bsu3338 Sep 4, 2023
7f101bd
Update SpeechRecognition.tsx
bsu3338 Sep 4, 2023
9a27e56
Update TextChat.jsx
bsu3338 Sep 4, 2023
5542f8e
Update SpeechRecognition.tsx
bsu3338 Sep 4, 2023
c7eea96
Update TextChat.jsx
bsu3338 Sep 4, 2023
3e36c16
Update HoverButtons.tsx
bsu3338 Sep 4, 2023
c041c32
Update useServerStream.ts
bsu3338 Sep 4, 2023
4b30c13
Update useServerStream.ts
bsu3338 Sep 4, 2023
8ed04e4
Update HoverButtons.tsx
bsu3338 Sep 4, 2023
875ce4b
Update useServerStream.ts
bsu3338 Sep 4, 2023
609d1df
Update useServerStream.ts
bsu3338 Sep 4, 2023
4b4afcd
Update HoverButtons.tsx
bsu3338 Sep 4, 2023
6133531
Update VolumeIcon.tsx
bsu3338 Sep 4, 2023
37c828d
Update VolumeMuteIcon.tsx
bsu3338 Sep 4, 2023
95cf300
Update HoverButtons.tsx
bsu3338 Sep 4, 2023
e9882de
Update SpeechSynthesis.tsx
bsu3338 Sep 4, 2023
7ae0e7e
Update HoverButtons.tsx
bsu3338 Sep 4, 2023
c794f07
Update HoverButtons.tsx
bsu3338 Sep 4, 2023
d95fa19
Update SpeechSynthesis.tsx
bsu3338 Sep 4, 2023
c5ce576
Update SpeechSynthesis.tsx
bsu3338 Sep 4, 2023
4c6d067
Update HoverButtons.tsx
bsu3338 Sep 4, 2023
39e84ef
Update SpeechSynthesis.tsx
bsu3338 Sep 4, 2023
5b80ddf
Update package.json
bsu3338 Sep 4, 2023
6686126
Update SpeechRecognition.tsx
bsu3338 Sep 4, 2023
0b35dbe
Update SpeechRecognition.tsx
bsu3338 Sep 4, 2023
67f111c
Update SpeechRecognition.tsx
bsu3338 Sep 4, 2023
1019529
Update SpeechRecognition.tsx
bsu3338 Sep 4, 2023
c1087ac
Squashed commit of the following:
bsu3338 Sep 4, 2023
f3b7b3e
Merge branch 'Speech-September' into Speech-to-Text
bsu3338 Sep 4, 2023
3217b40
Update package-lock.json
bsu3338 Sep 4, 2023
86bffc8
Merge remote-tracking branch 'upstream/main' into Speech-to-Text
bsu3338 Sep 22, 2023
ae1ba09
Merge branch 'danny-avila:main' into Speech-to-Text
bsu3338 Sep 22, 2023
17bf603
Update SubmitButton.tsx
bsu3338 Sep 23, 2023
04720a0
Update SpeechRecognition.tsx
bsu3338 Sep 23, 2023
c1a38ad
fix: typescript error
berry-13 Nov 3, 2023
4679ba2
Merge branch 'main' into Speech-to-Text
berry-13 Jan 14, 2024
3f0de68
style: moved to new UI
berry-13 Jan 15, 2024
3767123
fix:(SpeechRecognition) lint error
berry-13 Jan 19, 2024
e5bf3af
Merge branch 'main' into Speech-to-Text
berry-13 Jan 19, 2024
7c0af5e
moved everything to hooks
berry-13 Jan 19, 2024
15004ae
feat: support stt external
berry-13 Jan 20, 2024
af5f6a8
Merge branch 'main' into Speech-to-Text
berry-13 Jan 20, 2024
c62053a
fix(useExternalSpeechRecognition): recording the audio
berry-13 Jan 20, 2024
2995686
feat: whisper api support
berry-13 Jan 23, 2024
6e928e2
refactor(SpeechRecognition); fix(HoverButtons): set isSpeaking corr…
berry-13 Jan 23, 2024
a5c3461
fix: spelling errors
berry-13 Jan 25, 2024
c3bec3a
fix: renamed files
berry-13 Jan 25, 2024
4164159
BIG FIX
berry-13 Jan 28, 2024
146b5a8
feat: whisper support
berry-13 Jan 28, 2024
10b0622
fixed some ChatForm bugs and added the tts route
berry-13 Jan 28, 2024
7a4e854
handling more errors
berry-13 Jan 28, 2024
88b7b37
Fix audio stream initialization and cleanup in useSpeechToTextExternal
berry-13 Jan 28, 2024
7b69cf3
feat: Elevenlabs TTS
berry-13 Feb 1, 2024
8931c53
fixed some req issues
berry-13 Feb 4, 2024
3a2fdf8
Merge branch 'main' into Speech-to-Text
berry-13 Feb 7, 2024
3f5bb8c
fix: stt not activating on Mac
berry-13 Feb 7, 2024
3d0d942
Merge branch 'main' into Speech-to-Text
berry-13 Feb 12, 2024
27af0df
Merge branch 'main' into Speech-to-Text
berry-13 Feb 14, 2024
3ee6992
fix: send audio blob to frontend
berry-13 Feb 22, 2024
b8de3cf
fix(ChatForm): startupConfig var
berry-13 Mar 1, 2024
aa29a85
Update text-to-speech and speech-to-text services
berry-13 Mar 1, 2024
853e9ea
handle more errors correctly
berry-13 Mar 2, 2024
be4da8a
Remove console.log statements
berry-13 Mar 2, 2024
0ef4cdf
Merge branch 'main' into Speech-to-Text
berry-13 Mar 7, 2024
d815b69
feat: added manual trigger with button
berry-13 Mar 9, 2024
374cad9
fix: SpeechToText and SpeechToTextExternal + AudioRecorder
berry-13 Mar 9, 2024
881b90d
refactor: TTS component
berry-13 Mar 10, 2024
5651f90
chore: removed unused variable
berry-13 Mar 10, 2024
ac6acce
feat: azure stt
berry-13 Mar 15, 2024
e736e21
feat: dedicated speech panel
berry-13 Mar 16, 2024
dd3a886
feat: STT button switch: fix: TextArea pr value adapted
berry-13 Mar 16, 2024
fec7e1f
Merge branch 'main' into Speech-to-Text
berry-13 Mar 16, 2024
5502b7d
refactor: textToSpeech function and useTextToSpeechMutation
berry-13 Mar 16, 2024
774cfc3
Merge branch 'main' into Speech-to-Text
berry-13 Mar 22, 2024
e95be15
fix: typo data-service
berry-13 Mar 22, 2024
9116fd5
fix: blob backend to frontend
berry-13 Mar 22, 2024
664b7de
feat: TTS button for external
berry-13 Mar 22, 2024
e7e38df
feat: librechat.yaml
berry-13 Mar 23, 2024
1c37ebe
style: spinner when loading TTS
berry-13 Mar 23, 2024
6fca8d4
feat: hold click to download file
berry-13 Mar 24, 2024
c747867
style: disabled when apiKey not provided
berry-13 Mar 24, 2024
eccf7bf
fix: typo startupConfig?.speechToTextExternal
berry-13 Mar 24, 2024
b6c2857
style: update icons
berry-13 Mar 24, 2024
e56d860
fix(useTextToSpeech): set isSpeaking when audio finish
berry-13 Mar 24, 2024
8f01ba4
fix: small issues with local TTS
berry-13 Mar 24, 2024
bb713b2
style: update settings dark theme
berry-13 Mar 24, 2024
1e22721
Merge branch 'main' into Speech-to-Text
berry-13 Mar 25, 2024
6423b38
docs: STT & TTS
berry-13 Mar 25, 2024
85b3168
Merge branch 'main' into Speech-to-Text
berry-13 Mar 26, 2024
d47b7ed
WIP: chat audio automatic; docs(custom_config): update to new .yaml v…
berry-13 Mar 27, 2024
5a58a62
fix: send button disabled
berry-13 Mar 27, 2024
8e98620
fix: interval update
berry-13 Mar 28, 2024
4572ebf
localization
berry-13 Mar 28, 2024
4417864
removed unused test code
berry-13 Mar 28, 2024
b7985f8
revert interval update to 100
berry-13 Mar 28, 2024
c701758
feat: auto-send message
berry-13 Mar 28, 2024
c0c9477
Merge branch 'main' into Speech-to-Text
berry-13 Mar 29, 2024
8a90f93
fix: chat audio automatic, default false
berry-13 Mar 29, 2024
d970158
Merge branch 'Speech-to-Text' of https://github.com/danny-avila/libre…
berry-13 Mar 29, 2024
65249f4
refactor: moved all logic to hooks
berry-13 Apr 1, 2024
18a9cc7
Merge branch 'main' into Speech-to-Text
Apr 4, 2024
4eb6841
Merge branch 'main' into Speech-to-Text
berry-13 Apr 15, 2024
79a6901
Merge branch 'main' into Speech-to-Text
berry-13 Apr 23, 2024
c235b38
chore: renamed ChatAudio to conversationMode
berry-13 Apr 23, 2024
151be34
refactor: organized Speech panel
berry-13 Apr 23, 2024
03db6ef
feat: autoSendText switch
berry-13 Apr 23, 2024
26e0df1
feat: moved chataudio to conversationMode and improved error handling…
berry-13 Apr 23, 2024
bc8121d
refactor: Auto transcribe audio
berry-13 Apr 23, 2024
8807431
test: AutoSendTextSwitch, AutoTranscribeAudioSwitch and ConversationM…
berry-13 Apr 24, 2024
01abb65
fix: various speechTab fixes
berry-13 Apr 24, 2024
63fe703
refactor(useSpeechToTextBrowser): handle more errors
berry-13 Apr 24, 2024
7f69f3f
feat: engine select
berry-13 Apr 25, 2024
78bda40
Merge branch 'main' into Speech-to-Text
berry-13 Apr 25, 2024
2acc9a9
feat: advanced mode
berry-13 Apr 27, 2024
de1dd10
chore: converted hooks to TS
berry-13 Apr 27, 2024
f595225
feat: cache TTS
berry-13 Apr 27, 2024
a38ba05
feat: delete cache; fix: cache issues
berry-13 Apr 27, 2024
619d336
refactor(useTextToSpeechExternal): removed unused import
berry-13 Apr 27, 2024
8d4bea9
feat: cache switch; refactor: moved to dir STT/TTS
berry-13 Apr 27, 2024
59861b9
tests: CacheTTS, TextToSpeech, SpeechToText
berry-13 Apr 27, 2024
3e40ad0
feat: custom elevenlabs compatibility
berry-13 Apr 27, 2024
7f48031
fix(useTextToSpeechExternal): cache switch not working
berry-13 Apr 27, 2024
0875fe5
Merge branch 'main' into Speech-to-Text
berry-13 May 2, 2024
e39d0eb
feat: animation for STT
berry-13 May 3, 2024
db4fc17
Merge branch 'main' into Speech-to-Text
berry-13 May 5, 2024
9f07c80
Merge branch 'main' into Speech-to-Text
berry-13 May 6, 2024
415a869
Merge branch 'main' into Speech-to-Text
berry-13 May 7, 2024
e06a13b
Merge branch 'main' into Speech-to-Text
berry-13 May 10, 2024
ca12731
fix: settings var not working
berry-13 May 8, 2024
f3b78cf
chore: remove unused var
berry-13 May 8, 2024
486740a
feat: voice dropdown; refactor: yaml changes
berry-13 May 11, 2024
d3f5878
fix(textToSpeech): remove undefined properties
berry-13 May 11, 2024
8647cc3
refactor: Remove console logs and unused variable
berry-13 May 11, 2024
cc35f77
Merge branch 'main' into Speech-to-Text
berry-13 May 13, 2024
b619b80
fix: TTS; feat: support coqui and piper
berry-13 May 13, 2024
6c1f7df
fix: some STT issues
berry-13 May 13, 2024
ece8f89
fix: stt test
berry-13 May 14, 2024
24ad1d9
fix: STT backend sending wrong data
berry-13 May 14, 2024
74a8ef5
BREAKING: switch to react-speech-recognition, add regenerator-runtime…
berry-13 May 16, 2024
80b6689
feat: websocket backend
berry-13 May 17, 2024
e27f59e
Merge branch 'main' into Speech-to-Text
berry-13 May 17, 2024
edc5c8e
foundations for websocket
berry-13 May 17, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
8 changes: 8 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,14 @@ MEILI_NO_ANALYTICS=true
MEILI_HOST=http://0.0.0.0:7700
MEILI_MASTER_KEY=DrhYf7zENyR6AlUCKmnz0eYASOQdl6zxH7s7MKFSfFCt


#==================================================#
# Speech to Text & Text to Speech #
#==================================================#

STT_API_KEY=
TTS_API_KEY=

#===================================================#
# User System #
#===================================================#
Expand Down
6 changes: 6 additions & 0 deletions api/app/clients/OpenAIClient.js
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ const {
createContextHandlers,
} = require('./prompts');
const { encodeAndFormat } = require('~/server/services/Files/images/encode');
const { updateTokenWebsocket } = require('~/server/services/Files/Audio');
const { isEnabled, sleep } = require('~/server/utils');
const { handleOpenAIErrors } = require('./tools/util');
const spendTokens = require('~/models/spendTokens');
Expand Down Expand Up @@ -594,6 +595,7 @@ class OpenAIClient extends BaseClient {
payload,
(progressMessage) => {
if (progressMessage === '[DONE]') {
updateTokenWebsocket('[DONE]');
return;
}

Expand Down Expand Up @@ -1211,10 +1213,14 @@ ${convo}
const azureDelay = this.modelOptions.model?.includes('gpt-4') ? 30 : 17;
for await (const chunk of stream) {
const token = chunk.choices[0]?.delta?.content || '';

updateTokenWebsocket(token);

intermediateReply += token;
onProgress(token);
if (abortController.signal.aborted) {
stream.controller.abort();
updateTokenWebsocket('[DONE]');
break;
}

Expand Down
1 change: 1 addition & 0 deletions api/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@
"ua-parser-js": "^1.0.36",
"winston": "^3.11.0",
"winston-daily-rotate-file": "^4.7.1",
"ws": "^8.17.0",
"zod": "^3.22.4"
},
"devDependencies": {
Expand Down
4 changes: 4 additions & 0 deletions api/server/routes/files/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ const { createMulterInstance } = require('./multer');
const files = require('./files');
const images = require('./images');
const avatar = require('./avatar');
const stt = require('./stt');
const tts = require('./tts');

const initialize = async () => {
const router = express.Router();
Expand All @@ -18,6 +20,8 @@ const initialize = async () => {
router.post('/', upload.single('file'));
router.post('/images', upload.single('file'));

router.use('/stt', stt);
router.use('/tts', tts);
router.use('/', files);
router.use('/images', images);
router.use('/images/avatar', avatar);
Expand Down
13 changes: 13 additions & 0 deletions api/server/routes/files/stt.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
const express = require('express');
const router = express.Router();
const multer = require('multer');
const { requireJwtAuth } = require('~/server/middleware/');
const { speechToText } = require('~/server/services/Files/Audio');

// Multer with default (in-memory) storage: the uploaded audio is kept in
// req.file.buffer and never written to disk.
const upload = multer();

// Transcribe an uploaded audio file (mounted at '/stt' by the files router).
// Expects multipart/form-data with a single 'audio' field; JWT auth runs
// before the upload is parsed. speechToText writes the HTTP response itself.
router.post('/', requireJwtAuth, upload.single('audio'), async (req, res) => {
  await speechToText(req, res);
});

module.exports = router;
26 changes: 26 additions & 0 deletions api/server/routes/files/tts.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
const express = require('express');
const router = express.Router();
const multer = require('multer');
const { requireJwtAuth } = require('~/server/middleware/');
const {
  textToSpeech,
  getVoices,
  streamAudioFromWebSocket,
} = require('~/server/services/Files/Audio');

// upload.none() parses multipart/form-data text fields only; no file parts
// are accepted on this route.
const upload = multer();

// Synthesize speech for the submitted text (mounted at '/tts' by the files
// router). A truthy 'websocket' form field selects the streaming path.
router.post('/', requireJwtAuth, upload.none(), async (req, res) => {
  const { websocket } = req.body;
  if (websocket) {
    // NOTE(review): not awaited — a rejection from the stream will not
    // propagate to this handler; confirm this fire-and-forget is intentional.
    streamAudioFromWebSocket(req, res);
  } else {
    await textToSpeech(req, res);
  }
});

// List the voices available for the configured TTS provider.
router.get('/voices', requireJwtAuth, async (req, res) => {
  await getVoices(req, res);
});

module.exports = router;
45 changes: 45 additions & 0 deletions api/server/services/Files/Audio/getVoices.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
const { logger } = require('~/config');
const getCustomConfig = require('~/server/services/Config/getCustomConfig');
const { getProvider } = require('./textToSpeech');

/**
 * Retrieve the list of voices available for the configured TTS provider.
 *
 * Loads the custom configuration, resolves the provider from its `tts`
 * section, and responds with that provider's configured voices as JSON.
 *
 * @param {Object} req - The request object (unused beyond routing)
 * @param {Object} res - The response object; receives the voices JSON or a 500
 * @returns {void}
 * @throws {Error} Internally when config is missing or the provider is not
 *   'openai' or 'elevenlabs'; the error is caught and mapped to a 500 response
 */
async function getVoices(req, res) {
  try {
    const customConfig = await getCustomConfig();

    if (!customConfig?.tts) {
      throw new Error('Configuration or TTS schema is missing');
    }

    const ttsSchema = customConfig.tts;
    const provider = getProvider(ttsSchema);

    let voices;
    if (provider === 'openai') {
      voices = ttsSchema.openai?.voices;
    } else if (provider === 'elevenlabs') {
      voices = ttsSchema.elevenlabs?.voices;
    } else {
      throw new Error('Invalid provider');
    }

    res.json(voices);
  } catch (error) {
    logger.error(`Failed to get voices: ${error.message}`);
    res.status(500).json({ error: 'Failed to get voices' });
  }
}

module.exports = getVoices;
12 changes: 12 additions & 0 deletions api/server/services/Files/Audio/index.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
// Aggregates the Audio file services — speech-to-text, text-to-speech,
// voice listing, and websocket token streaming — into a single module
// so consumers can require('~/server/services/Files/Audio').
const { textToSpeech, streamAudioFromWebSocket } = require('./textToSpeech');
const speechToText = require('./speechToText');
const getVoices = require('./getVoices');
const { updateTokenWebsocket } = require('./webSocket');

module.exports = {
  textToSpeech,
  speechToText,
  getVoices,
  updateTokenWebsocket,
  streamAudioFromWebSocket,
};
211 changes: 211 additions & 0 deletions api/server/services/Files/Audio/speechToText.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,211 @@
const axios = require('axios');
const { Readable } = require('stream');
const { logger } = require('~/config');
const getCustomConfig = require('~/server/services/Config/getCustomConfig');
const { extractEnvVariable } = require('librechat-data-provider');

/**
 * Validate an STT API response and extract the transcribed text.
 *
 * @param {Object} response - Axios response returned by the STT endpoint
 * @returns {Promise<string>} The trimmed transcription text
 * @throws {Error} When the status is not 200 or the payload lacks `text`
 */
async function handleResponse(response) {
  if (response.status !== 200) {
    throw new Error('Invalid response from the STT API');
  }

  const text = response.data?.text;
  if (!text) {
    throw new Error('Missing data in response from the STT API');
  }

  return text.trim();
}

/**
 * Resolve which STT provider is configured.
 *
 * @param {Object} sttSchema - The `stt` section of the custom configuration
 * @returns {string} 'openai' — the only provider recognized here
 * @throws {Error} When no recognized provider key is present
 */
function getProvider(sttSchema) {
  if (!sttSchema.openai) {
    throw new Error('Invalid provider');
  }

  return 'openai';
}

/**
 * Recursively strip `undefined` values from an object, in place.
 * Nested objects that become empty after cleaning are removed as well.
 *
 * @param {Object} obj - The object to clean (mutated)
 */
function removeUndefined(obj) {
  for (const key of Object.keys(obj)) {
    const value = obj[key];

    if (value && typeof value === 'object') {
      removeUndefined(value);
      if (Object.keys(value).length === 0) {
        delete obj[key];
      }
    } else if (value === undefined) {
      delete obj[key];
    }
  }
}

/**
 * Prepare the URL, payload, and headers for an OpenAI transcription request.
 *
 * @param {Object} sttSchema - The speech-to-text schema containing the OpenAI configuration
 * @param {Stream} audioReadStream - The audio data to be transcribed
 *
 * @returns {Array} `[url, data, headers]` for the request; on error, logs it
 *   and returns `[null, null, null]`
 */
function openAIProvider(sttSchema, audioReadStream) {
  try {
    const url = sttSchema.openai?.url || 'https://api.openai.com/v1/audio/transcriptions';
    const apiKey = sttSchema.openai.apiKey ? extractEnvVariable(sttSchema.openai.apiKey) : '';

    const data = {
      file: audioReadStream,
      // May be undefined if not configured; removeUndefined below only
      // cleans headers, matching the original behavior.
      model: sttSchema.openai.model,
    };

    const headers = {
      'Content-Type': 'multipart/form-data',
    };

    removeUndefined(headers);

    if (apiKey) {
      headers.Authorization = 'Bearer ' + apiKey;
    }

    return [url, data, headers];
  } catch (error) {
    logger.error('An error occurred while preparing the OpenAI API STT request: ', error);
    return [null, null, null];
  }
}

/**
 * Prepare the URL, payload, and headers for an Azure OpenAI Whisper
 * transcription request, using the endpoint configuration stored in
 * `req.app.locals`.
 *
 * @param {Object} req - The request object; `req.body.endpoint` selects the
 *   Azure config from `req.app.locals`
 * @param {Stream} audioReadStream - The audio data to be transcribed
 *
 * @returns {Array} `[url, data, headers]` for the request; on error, logs it
 *   and returns `[null, null, null]`
 */
function azureProvider(req, audioReadStream) {
  try {
    const { endpoint } = req.body;
    const azureConfig = req.app.locals[endpoint];

    if (!azureConfig) {
      throw new Error(`No configuration found for endpoint: ${endpoint}`);
    }

    // Scan the group map for the first group that deploys a whisper model;
    // reduce short-circuits once an accumulator has been found.
    const { apiKey, instanceName, whisperModel, apiVersion } = Object.entries(
      azureConfig.groupMap,
    ).reduce((acc, [, value]) => {
      if (acc) {
        return acc;
      }

      const whisperKey = Object.keys(value.models).find((modelKey) =>
        modelKey.startsWith('whisper'),
      );

      if (whisperKey) {
        return {
          apiVersion: value.version,
          apiKey: value.apiKey,
          instanceName: value.instanceName,
          whisperModel: value.models[whisperKey]['deploymentName'],
        };
      }

      return null;
    }, null);

    if (!apiKey || !instanceName || !whisperModel || !apiVersion) {
      throw new Error('Required Azure configuration values are missing');
    }

    const baseURL = `https://${instanceName}.openai.azure.com`;

    const url = `${baseURL}/openai/deployments/${whisperModel}/audio/transcriptions?api-version=${apiVersion}`;

    const data = {
      file: audioReadStream,
      filename: 'audio.wav',
      contentType: 'audio/wav',
      knownLength: audioReadStream.length,
    };

    // Fix: the original spread `...data.getHeaders()` here. `data` is a plain
    // object literal with no getHeaders method (that API belongs to the
    // form-data package), so the call always threw a TypeError and this
    // function always returned [null, null, null], disabling the Azure path.
    const headers = {
      'Content-Type': 'multipart/form-data',
      'api-key': apiKey,
    };

    return [url, data, headers];
  } catch (error) {
    logger.error('An error occurred while preparing the Azure API STT request: ', error);
    return [null, null, null];
  }
}

/**
 * Convert speech to text.
 *
 * Reads the uploaded audio from `req.file.buffer` (multer memory storage),
 * prepares a provider-specific request, forwards the audio to the STT API,
 * and responds with `{ text }` on success.
 *
 * @param {Object} req - The request object; expects `req.file.buffer`
 * @param {Object} res - The response object
 *
 * @returns {Promise<void>} Always resolves; all failures are mapped to HTTP
 *   error responses instead of rejecting the route handler's promise
 */
async function speechToText(req, res) {
  const customConfig = await getCustomConfig();
  if (!customConfig) {
    return res.status(500).send('Custom config not found');
  }

  if (!req.file || !req.file.buffer) {
    return res.status(400).json({ message: 'No audio file provided in the FormData' });
  }

  // Wrap the in-memory buffer in a stream so providers can send it as a file part
  const audioBuffer = req.file.buffer;
  const audioReadStream = Readable.from(audioBuffer);
  audioReadStream.path = 'audio.wav';

  // Fix: the original threw from this async handler on an unknown provider,
  // producing an unhandled promise rejection in the route; respond 500 instead.
  let provider;
  try {
    provider = getProvider(customConfig.stt);
  } catch (error) {
    logger.error('An error occurred while resolving the STT provider:', error);
    return res.status(500).json({ message: 'Invalid provider' });
  }

  let url, data, headers;

  switch (provider) {
    case 'openai':
      [url, data, headers] = openAIProvider(customConfig.stt, audioReadStream);
      break;
    case 'azure':
      // NOTE(review): getProvider currently only ever returns 'openai', so
      // this branch is unreachable until azure is recognized there — confirm.
      [url, data, headers] = azureProvider(req, audioReadStream);
      break;
    default:
      return res.status(500).json({ message: 'Invalid provider' });
  }

  // The provider helpers log their own error and return [null, null, null];
  // the original fell through and called axios.post(null, ...).
  if (!url || !data || !headers) {
    return res.sendStatus(500);
  }

  // Removed dead `if (!Readable.from)` block: Readable.from always exists in
  // supported Node versions, and the block dereferenced `data` and a global
  // `Blob` unsafely.

  try {
    const response = await axios.post(url, data, { headers: headers });
    const text = await handleResponse(response);

    res.json({ text });
  } catch (error) {
    logger.error('An error occurred while processing the audio:', error);
    res.sendStatus(500);
  }
}

module.exports = speechToText;