Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

🗣️ feat: STT & TTS #1603

Closed
wants to merge 191 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
191 commits
Select commit Hold shift + click to select a range
1af6751
Update TextChat.jsx
bsu3338 Aug 4, 2023
b3636ab
Update SubmitButton.jsx
bsu3338 Aug 4, 2023
4401d0d
Update TextChat.jsx
bsu3338 Aug 4, 2023
07b2af1
Merge branch 'danny-avila:main' into Speech-to-Text
bsu3338 Aug 5, 2023
5a67874
Update SubmitButton.jsx
bsu3338 Aug 5, 2023
14f4d66
Create ListeningIcon.tsx
bsu3338 Aug 5, 2023
65a7b2b
Update index.ts
bsu3338 Aug 5, 2023
31441ed
Update SubmitButton.jsx
bsu3338 Aug 5, 2023
74fa8d1
Update TextChat.jsx
bsu3338 Aug 5, 2023
37c0f5b
Update ListeningIcon.tsx
bsu3338 Aug 5, 2023
46c53d1
Update ListeningIcon.tsx
bsu3338 Aug 5, 2023
2ffb5be
Create SpeechRecognition.tsx
bsu3338 Aug 5, 2023
49a9dae
Update TextChat.jsx
bsu3338 Aug 5, 2023
eb842c6
Update TextChat.jsx
bsu3338 Aug 5, 2023
8982ec1
Update SpeechRecognition.tsx
bsu3338 Aug 5, 2023
ca3f064
Update TextChat.jsx
bsu3338 Aug 5, 2023
2522d76
Update SpeechRecognition.tsx
bsu3338 Aug 5, 2023
d9a4d2f
Update SpeechRecognition.tsx
bsu3338 Aug 5, 2023
42aadd2
Update SpeechRecognition.tsx
bsu3338 Aug 5, 2023
5ad9927
Merge branch 'danny-avila:main' into Speech-to-Text
bsu3338 Aug 5, 2023
5d76082
Update SpeechRecognition.tsx
bsu3338 Aug 5, 2023
93ceae6
Merge branch 'danny-avila:main' into Speech-to-Text
bsu3338 Aug 5, 2023
b49024f
Update SubmitButton.jsx
bsu3338 Aug 6, 2023
28a00a5
Update TextChat.jsx
bsu3338 Aug 6, 2023
69ff48d
Update SpeechRecognition.tsx
bsu3338 Aug 6, 2023
cfe6325
Merge branch 'main' into Speech-to-Text
bsu3338 Aug 6, 2023
fd23679
Merge branch 'main' into Speech-to-Text
bsu3338 Aug 6, 2023
148a71b
Merge branch 'main' into Speech-to-Text
bsu3338 Aug 7, 2023
252325d
Merge branch 'main' into Speech-to-Text
bsu3338 Aug 9, 2023
f9ed2ad
Create SpeechSynthesis.tsx
bsu3338 Aug 9, 2023
09c68d1
Update index.jsx
bsu3338 Aug 9, 2023
e7d7d73
Update SpeechSynthesis.tsx
bsu3338 Aug 9, 2023
e313637
Update SpeechRecognition.tsx
bsu3338 Aug 9, 2023
78278b5
Merge branch 'main' into Speech-to-Text
bsu3338 Aug 9, 2023
776daa1
Update TextChat.jsx
bsu3338 Aug 9, 2023
c02d43b
Update SpeechRecognition.tsx
bsu3338 Aug 9, 2023
c7ffb25
Update SpeechRecognition.tsx
bsu3338 Aug 9, 2023
10e3be5
Update SpeechRecognition.tsx
bsu3338 Aug 9, 2023
78a8106
Update TextChat.jsx
bsu3338 Aug 11, 2023
7e8bae2
Merge branch 'danny-avila:main' into Speech-to-Text
bsu3338 Aug 11, 2023
ed4b25b
Squashed commit of the following:
bsu3338 Sep 3, 2023
ad3c78f
Merge branch 'danny-avila:main' into Speech-September
bsu3338 Sep 3, 2023
863af2c
Create VolumeMuteIcon.tsx
bsu3338 Sep 3, 2023
b03001d
Create VolumeIcon.tsx
bsu3338 Sep 3, 2023
29a5b55
Update index.ts
bsu3338 Sep 3, 2023
8d5114b
Update SubmitButton.jsx
bsu3338 Sep 3, 2023
6583877
Update SubmitButton.jsx
bsu3338 Sep 3, 2023
9a3e67f
Update TextChat.jsx
bsu3338 Sep 3, 2023
6033eb3
Update TextChat.jsx
bsu3338 Sep 4, 2023
d405454
Update SpeechRecognition.tsx
bsu3338 Sep 4, 2023
7f101bd
Update SpeechRecognition.tsx
bsu3338 Sep 4, 2023
9a27e56
Update TextChat.jsx
bsu3338 Sep 4, 2023
5542f8e
Update SpeechRecognition.tsx
bsu3338 Sep 4, 2023
c7eea96
Update TextChat.jsx
bsu3338 Sep 4, 2023
3e36c16
Update HoverButtons.tsx
bsu3338 Sep 4, 2023
c041c32
Update useServerStream.ts
bsu3338 Sep 4, 2023
4b30c13
Update useServerStream.ts
bsu3338 Sep 4, 2023
8ed04e4
Update HoverButtons.tsx
bsu3338 Sep 4, 2023
875ce4b
Update useServerStream.ts
bsu3338 Sep 4, 2023
609d1df
Update useServerStream.ts
bsu3338 Sep 4, 2023
4b4afcd
Update HoverButtons.tsx
bsu3338 Sep 4, 2023
6133531
Update VolumeIcon.tsx
bsu3338 Sep 4, 2023
37c828d
Update VolumeMuteIcon.tsx
bsu3338 Sep 4, 2023
95cf300
Update HoverButtons.tsx
bsu3338 Sep 4, 2023
e9882de
Update SpeechSynthesis.tsx
bsu3338 Sep 4, 2023
7ae0e7e
Update HoverButtons.tsx
bsu3338 Sep 4, 2023
c794f07
Update HoverButtons.tsx
bsu3338 Sep 4, 2023
d95fa19
Update SpeechSynthesis.tsx
bsu3338 Sep 4, 2023
c5ce576
Update SpeechSynthesis.tsx
bsu3338 Sep 4, 2023
4c6d067
Update HoverButtons.tsx
bsu3338 Sep 4, 2023
39e84ef
Update SpeechSynthesis.tsx
bsu3338 Sep 4, 2023
5b80ddf
Update package.json
bsu3338 Sep 4, 2023
6686126
Update SpeechRecognition.tsx
bsu3338 Sep 4, 2023
0b35dbe
Update SpeechRecognition.tsx
bsu3338 Sep 4, 2023
67f111c
Update SpeechRecognition.tsx
bsu3338 Sep 4, 2023
1019529
Update SpeechRecognition.tsx
bsu3338 Sep 4, 2023
c1087ac
Squashed commit of the following:
bsu3338 Sep 4, 2023
f3b7b3e
Merge branch 'Speech-September' into Speech-to-Text
bsu3338 Sep 4, 2023
3217b40
Update package-lock.json
bsu3338 Sep 4, 2023
86bffc8
Merge remote-tracking branch 'upstream/main' into Speech-to-Text
bsu3338 Sep 22, 2023
ae1ba09
Merge branch 'danny-avila:main' into Speech-to-Text
bsu3338 Sep 22, 2023
17bf603
Update SubmitButton.tsx
bsu3338 Sep 23, 2023
04720a0
Update SpeechRecognition.tsx
bsu3338 Sep 23, 2023
c1a38ad
fix: typescript error
berry-13 Nov 3, 2023
4679ba2
Merge branch 'main' into Speech-to-Text
berry-13 Jan 14, 2024
3f0de68
style: moved to new UI
berry-13 Jan 15, 2024
3767123
fix:(SpeechRecognition) lint error
berry-13 Jan 19, 2024
e5bf3af
Merge branch 'main' into Speech-to-Text
berry-13 Jan 19, 2024
7c0af5e
moved everything to hooks
berry-13 Jan 19, 2024
15004ae
feat: support stt external
berry-13 Jan 20, 2024
af5f6a8
Merge branch 'main' into Speech-to-Text
berry-13 Jan 20, 2024
c62053a
fix(useExternalSpeechRecognition): recording the audio
berry-13 Jan 20, 2024
2995686
feat: whisper api support
berry-13 Jan 23, 2024
6e928e2
refactor(SpeechRecognition); fix(HoverButtons): set isSpeaking corr…
berry-13 Jan 23, 2024
a5c3461
fix: spelling errors
berry-13 Jan 25, 2024
c3bec3a
fix: renamed files
berry-13 Jan 25, 2024
4164159
BIG FIX
berry-13 Jan 28, 2024
146b5a8
feat: whisper support
berry-13 Jan 28, 2024
10b0622
fixed some ChatForm bugs and added the tts route
berry-13 Jan 28, 2024
7a4e854
handling more errors
berry-13 Jan 28, 2024
88b7b37
Fix audio stream initialization and cleanup in useSpeechToTextExternal
berry-13 Jan 28, 2024
7b69cf3
feat: Elevenlabs TTS
berry-13 Feb 1, 2024
8931c53
fixed some req issues
berry-13 Feb 4, 2024
3a2fdf8
Merge branch 'main' into Speech-to-Text
berry-13 Feb 7, 2024
3f5bb8c
fix: stt not activating on Mac
berry-13 Feb 7, 2024
3d0d942
Merge branch 'main' into Speech-to-Text
berry-13 Feb 12, 2024
27af0df
Merge branch 'main' into Speech-to-Text
berry-13 Feb 14, 2024
3ee6992
fix: send audio blob to frontend
berry-13 Feb 22, 2024
b8de3cf
fix(ChatForm): startupConfig var
berry-13 Mar 1, 2024
aa29a85
Update text-to-speech and speech-to-text services
berry-13 Mar 1, 2024
853e9ea
handle more errors correctly
berry-13 Mar 2, 2024
be4da8a
Remove console.log statements
berry-13 Mar 2, 2024
0ef4cdf
Merge branch 'main' into Speech-to-Text
berry-13 Mar 7, 2024
d815b69
feat: added manual trigger with button
berry-13 Mar 9, 2024
374cad9
fix: SpeechToText and SpeechToTextExternal + AudioRecorder
berry-13 Mar 9, 2024
881b90d
refactor: TTS component
berry-13 Mar 10, 2024
5651f90
chore: removed unused variable
berry-13 Mar 10, 2024
ac6acce
feat: azure stt
berry-13 Mar 15, 2024
e736e21
feat: dedicated speech panel
berry-13 Mar 16, 2024
dd3a886
feat: STT button switch: fix: TextArea pr value adapted
berry-13 Mar 16, 2024
fec7e1f
Merge branch 'main' into Speech-to-Text
berry-13 Mar 16, 2024
5502b7d
refactor: textToSpeech function and useTextToSpeechMutation
berry-13 Mar 16, 2024
774cfc3
Merge branch 'main' into Speech-to-Text
berry-13 Mar 22, 2024
e95be15
fix: typo data-service
berry-13 Mar 22, 2024
9116fd5
fix: blob backend to frontend
berry-13 Mar 22, 2024
664b7de
feat: TTS button for external
berry-13 Mar 22, 2024
e7e38df
feat: librechat.yaml
berry-13 Mar 23, 2024
1c37ebe
style: spinner when loading TTS
berry-13 Mar 23, 2024
6fca8d4
feat: hold click to download file
berry-13 Mar 24, 2024
c747867
style: disabled when apiKey not provided
berry-13 Mar 24, 2024
eccf7bf
fix: typo startupConfig?.speechToTextExternal
berry-13 Mar 24, 2024
b6c2857
style: update icons
berry-13 Mar 24, 2024
e56d860
fix(useTextToSpeech): set isSpeaking when audio finish
berry-13 Mar 24, 2024
8f01ba4
fix: small issues with local TTS
berry-13 Mar 24, 2024
bb713b2
style: update settings dark theme
berry-13 Mar 24, 2024
1e22721
Merge branch 'main' into Speech-to-Text
berry-13 Mar 25, 2024
6423b38
docs: STT & TTS
berry-13 Mar 25, 2024
85b3168
Merge branch 'main' into Speech-to-Text
berry-13 Mar 26, 2024
d47b7ed
WIP: chat audio automatic; docs(custom_config): update to new .yaml v…
berry-13 Mar 27, 2024
5a58a62
fix: send button disabled
berry-13 Mar 27, 2024
8e98620
fix: interval update
berry-13 Mar 28, 2024
4572ebf
localization
berry-13 Mar 28, 2024
4417864
removed unused test code
berry-13 Mar 28, 2024
b7985f8
revert interval update to 100
berry-13 Mar 28, 2024
c701758
feat: auto-send message
berry-13 Mar 28, 2024
c0c9477
Merge branch 'main' into Speech-to-Text
berry-13 Mar 29, 2024
8a90f93
fix: chat audio automatic, default false
berry-13 Mar 29, 2024
d970158
Merge branch 'Speech-to-Text' of https://github.com/danny-avila/libre…
berry-13 Mar 29, 2024
65249f4
refactor: moved all logic to hooks
berry-13 Apr 1, 2024
18a9cc7
Merge branch 'main' into Speech-to-Text
Apr 4, 2024
4eb6841
Merge branch 'main' into Speech-to-Text
berry-13 Apr 15, 2024
79a6901
Merge branch 'main' into Speech-to-Text
berry-13 Apr 23, 2024
c235b38
chore: renamed ChatAudio to conversationMode
berry-13 Apr 23, 2024
151be34
refactor: organized Speech panel
berry-13 Apr 23, 2024
03db6ef
feat: autoSendText switch
berry-13 Apr 23, 2024
26e0df1
feat: moved chataudio to conversationMode and improved error handling…
berry-13 Apr 23, 2024
bc8121d
refactor: Auto transcribe audio
berry-13 Apr 23, 2024
8807431
test: AutoSendTextSwitch, AutoTranscribeAudioSwitch and ConversationM…
berry-13 Apr 24, 2024
01abb65
fix: various speechTab fixes
berry-13 Apr 24, 2024
63fe703
refactor(useSpeechToTextBrowser): handle more errors
berry-13 Apr 24, 2024
7f69f3f
feat: engine select
berry-13 Apr 25, 2024
78bda40
Merge branch 'main' into Speech-to-Text
berry-13 Apr 25, 2024
2acc9a9
feat: advanced mode
berry-13 Apr 27, 2024
de1dd10
chore: converted hooks to TS
berry-13 Apr 27, 2024
f595225
feat: cache TTS
berry-13 Apr 27, 2024
a38ba05
feat: delete cache; fix: cache issues
berry-13 Apr 27, 2024
619d336
refactor(useTextToSpeechExternal): removed unused import
berry-13 Apr 27, 2024
8d4bea9
feat: cache switch; refactor: moved to dir STT/TTS
berry-13 Apr 27, 2024
59861b9
tests: CacheTTS, TextToSpeech, SpeechToText
berry-13 Apr 27, 2024
3e40ad0
feat: custom elevenlabs compatibility
berry-13 Apr 27, 2024
7f48031
fix(useTextToSpeechExternal): cache switch not working
berry-13 Apr 27, 2024
0875fe5
Merge branch 'main' into Speech-to-Text
berry-13 May 2, 2024
e39d0eb
feat: animation for STT
berry-13 May 3, 2024
db4fc17
Merge branch 'main' into Speech-to-Text
berry-13 May 5, 2024
9f07c80
Merge branch 'main' into Speech-to-Text
berry-13 May 6, 2024
415a869
Merge branch 'main' into Speech-to-Text
berry-13 May 7, 2024
e06a13b
Merge branch 'main' into Speech-to-Text
berry-13 May 10, 2024
ca12731
fix: settings var not working
berry-13 May 8, 2024
f3b78cf
chore: remove unused var
berry-13 May 8, 2024
486740a
feat: voice dropdown; refactor: yaml changes
berry-13 May 11, 2024
d3f5878
fix(textToSpeech): remove undefined properties
berry-13 May 11, 2024
8647cc3
refactor: Remove console logs and unused variable
berry-13 May 11, 2024
cc35f77
Merge branch 'main' into Speech-to-Text
berry-13 May 13, 2024
b619b80
fix: TTS; feat: support coqui and piper
berry-13 May 13, 2024
6c1f7df
fix: some STT issues
berry-13 May 13, 2024
ece8f89
fix: stt test
berry-13 May 14, 2024
24ad1d9
fix: STT backend sending wrong data
berry-13 May 14, 2024
74a8ef5
BREAKING: switch to react-speech-recognition, add regenerator-runtime…
berry-13 May 16, 2024
80b6689
feat: websocket backend
berry-13 May 17, 2024
e27f59e
Merge branch 'main' into Speech-to-Text
berry-13 May 17, 2024
edc5c8e
foundations for websocket
berry-13 May 17, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
8 changes: 8 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,14 @@ MEILI_NO_ANALYTICS=true
MEILI_HOST=http://0.0.0.0:7700
MEILI_MASTER_KEY=DrhYf7zENyR6AlUCKmnz0eYASOQdl6zxH7s7MKFSfFCt


#==================================================#
# Speech to Text & Text to Speech #
#==================================================#

STT_API_KEY=
TTS_API_KEY=

#===================================================#
# User System #
#===================================================#
Expand Down
6 changes: 6 additions & 0 deletions api/app/clients/OpenAIClient.js
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ const {
createContextHandlers,
} = require('./prompts');
const { encodeAndFormat } = require('~/server/services/Files/images/encode');
const { updateTokenWebsocket } = require('~/server/services/Files/Audio');
const { isEnabled, sleep } = require('~/server/utils');
const { handleOpenAIErrors } = require('./tools/util');
const spendTokens = require('~/models/spendTokens');
Expand Down Expand Up @@ -594,6 +595,7 @@ class OpenAIClient extends BaseClient {
payload,
(progressMessage) => {
if (progressMessage === '[DONE]') {
updateTokenWebsocket('[DONE]');
return;
}

Expand Down Expand Up @@ -1211,10 +1213,14 @@ ${convo}
const azureDelay = this.modelOptions.model?.includes('gpt-4') ? 30 : 17;
for await (const chunk of stream) {
const token = chunk.choices[0]?.delta?.content || '';

updateTokenWebsocket(token);

intermediateReply += token;
onProgress(token);
if (abortController.signal.aborted) {
stream.controller.abort();
updateTokenWebsocket('[DONE]');
break;
}

Expand Down
1 change: 1 addition & 0 deletions api/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@
"ua-parser-js": "^1.0.36",
"winston": "^3.11.0",
"winston-daily-rotate-file": "^4.7.1",
"ws": "^8.17.0",
"zod": "^3.22.4"
},
"devDependencies": {
Expand Down
4 changes: 4 additions & 0 deletions api/server/routes/files/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ const { createMulterInstance } = require('./multer');
const files = require('./files');
const images = require('./images');
const avatar = require('./avatar');
const stt = require('./stt');
const tts = require('./tts');

const initialize = async () => {
const router = express.Router();
Expand All @@ -18,6 +20,8 @@ const initialize = async () => {
router.post('/', upload.single('file'));
router.post('/images', upload.single('file'));

router.use('/stt', stt);
router.use('/tts', tts);
router.use('/', files);
router.use('/images', images);
router.use('/images/avatar', avatar);
Expand Down
13 changes: 13 additions & 0 deletions api/server/routes/files/stt.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
const express = require('express');
const router = express.Router();
const multer = require('multer');
const { requireJwtAuth } = require('~/server/middleware/');
const { speechToText } = require('~/server/services/Files/Audio');

// Multer with default (in-memory) storage: the uploaded audio is kept in
// req.file.buffer and never written to disk.
const upload = multer();

// Transcribe an uploaded audio file (mounted at '/stt' by the files router).
// Expects multipart/form-data with a single 'audio' field; JWT auth runs
// before the upload is parsed. speechToText writes the HTTP response itself.
router.post('/', requireJwtAuth, upload.single('audio'), async (req, res) => {
  await speechToText(req, res);
});

module.exports = router;
26 changes: 26 additions & 0 deletions api/server/routes/files/tts.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
const express = require('express');
const router = express.Router();
const multer = require('multer');
const { requireJwtAuth } = require('~/server/middleware/');
const {
  textToSpeech,
  getVoices,
  streamAudioFromWebSocket,
} = require('~/server/services/Files/Audio');

// upload.none() parses multipart/form-data text fields only; no file parts
// are accepted on this route.
const upload = multer();

// Synthesize speech for the submitted text (mounted at '/tts' by the files
// router). A truthy 'websocket' form field selects the streaming path.
router.post('/', requireJwtAuth, upload.none(), async (req, res) => {
  const { websocket } = req.body;
  if (websocket) {
    // NOTE(review): not awaited — a rejection from the stream will not
    // propagate to this handler; confirm this fire-and-forget is intentional.
    streamAudioFromWebSocket(req, res);
  } else {
    await textToSpeech(req, res);
  }
});

// List the voices available for the configured TTS provider.
router.get('/voices', requireJwtAuth, async (req, res) => {
  await getVoices(req, res);
});

module.exports = router;
45 changes: 45 additions & 0 deletions api/server/services/Files/Audio/getVoices.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
const { logger } = require('~/config');
const getCustomConfig = require('~/server/services/Config/getCustomConfig');
const { getProvider } = require('./textToSpeech');

/**
 * Retrieve the list of voices available for the configured TTS provider.
 *
 * Loads the custom configuration, resolves the provider from its `tts`
 * section, and responds with that provider's configured voices as JSON.
 *
 * @param {Object} req - The request object (unused beyond routing)
 * @param {Object} res - The response object; receives the voices JSON or a 500
 * @returns {void}
 * @throws {Error} Internally when config is missing or the provider is not
 *   'openai' or 'elevenlabs'; the error is caught and mapped to a 500 response
 */
async function getVoices(req, res) {
  try {
    const customConfig = await getCustomConfig();

    if (!customConfig?.tts) {
      throw new Error('Configuration or TTS schema is missing');
    }

    const ttsSchema = customConfig.tts;
    const provider = getProvider(ttsSchema);

    let voices;
    if (provider === 'openai') {
      voices = ttsSchema.openai?.voices;
    } else if (provider === 'elevenlabs') {
      voices = ttsSchema.elevenlabs?.voices;
    } else {
      throw new Error('Invalid provider');
    }

    res.json(voices);
  } catch (error) {
    logger.error(`Failed to get voices: ${error.message}`);
    res.status(500).json({ error: 'Failed to get voices' });
  }
}

module.exports = getVoices;
12 changes: 12 additions & 0 deletions api/server/services/Files/Audio/index.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
// Aggregates the Audio file services — speech-to-text, text-to-speech,
// voice listing, and websocket token streaming — into a single module
// so consumers can require('~/server/services/Files/Audio').
const { textToSpeech, streamAudioFromWebSocket } = require('./textToSpeech');
const speechToText = require('./speechToText');
const getVoices = require('./getVoices');
const { updateTokenWebsocket } = require('./webSocket');

module.exports = {
  textToSpeech,
  speechToText,
  getVoices,
  updateTokenWebsocket,
  streamAudioFromWebSocket,
};
211 changes: 211 additions & 0 deletions api/server/services/Files/Audio/speechToText.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,211 @@
const axios = require('axios');
const { Readable } = require('stream');
const { logger } = require('~/config');
const getCustomConfig = require('~/server/services/Config/getCustomConfig');
const { extractEnvVariable } = require('librechat-data-provider');

/**
 * Validate an STT API response and extract the transcribed text.
 *
 * @param {Object} response - Axios response returned by the STT endpoint
 * @returns {Promise<string>} The trimmed transcription text
 * @throws {Error} When the status is not 200 or the payload lacks `text`
 */
async function handleResponse(response) {
  if (response.status !== 200) {
    throw new Error('Invalid response from the STT API');
  }

  const text = response.data?.text;
  if (!text) {
    throw new Error('Missing data in response from the STT API');
  }

  return text.trim();
}

/**
 * Resolve which STT provider is configured.
 *
 * @param {Object} sttSchema - The `stt` section of the custom configuration
 * @returns {string} 'openai' — the only provider recognized here
 * @throws {Error} When no recognized provider key is present
 */
function getProvider(sttSchema) {
  if (!sttSchema.openai) {
    throw new Error('Invalid provider');
  }

  return 'openai';
}

/**
 * Recursively strip `undefined` values from an object, in place.
 * Nested objects that become empty after cleaning are removed as well.
 *
 * @param {Object} obj - The object to clean (mutated)
 */
function removeUndefined(obj) {
  for (const key of Object.keys(obj)) {
    const value = obj[key];

    if (value && typeof value === 'object') {
      removeUndefined(value);
      if (Object.keys(value).length === 0) {
        delete obj[key];
      }
    } else if (value === undefined) {
      delete obj[key];
    }
  }
}

/**
 * Prepare the URL, payload, and headers for an OpenAI transcription request.
 *
 * @param {Object} sttSchema - The speech-to-text schema containing the OpenAI configuration
 * @param {Stream} audioReadStream - The audio data to be transcribed
 *
 * @returns {Array} `[url, data, headers]` for the request; on error, logs it
 *   and returns `[null, null, null]`
 */
function openAIProvider(sttSchema, audioReadStream) {
  try {
    const url = sttSchema.openai?.url || 'https://api.openai.com/v1/audio/transcriptions';
    const apiKey = sttSchema.openai.apiKey ? extractEnvVariable(sttSchema.openai.apiKey) : '';

    const data = {
      file: audioReadStream,
      // May be undefined if not configured; removeUndefined below only
      // cleans headers, matching the original behavior.
      model: sttSchema.openai.model,
    };

    const headers = {
      'Content-Type': 'multipart/form-data',
    };

    removeUndefined(headers);

    if (apiKey) {
      headers.Authorization = 'Bearer ' + apiKey;
    }

    return [url, data, headers];
  } catch (error) {
    logger.error('An error occurred while preparing the OpenAI API STT request: ', error);
    return [null, null, null];
  }
}

/**
 * Prepare the URL, payload, and headers for an Azure OpenAI Whisper
 * transcription request, using the endpoint configuration stored in
 * `req.app.locals`.
 *
 * @param {Object} req - The request object; `req.body.endpoint` selects the
 *   Azure config from `req.app.locals`
 * @param {Stream} audioReadStream - The audio data to be transcribed
 *
 * @returns {Array} `[url, data, headers]` for the request; on error, logs it
 *   and returns `[null, null, null]`
 */
function azureProvider(req, audioReadStream) {
  try {
    const { endpoint } = req.body;
    const azureConfig = req.app.locals[endpoint];

    if (!azureConfig) {
      throw new Error(`No configuration found for endpoint: ${endpoint}`);
    }

    // Scan the group map for the first group that deploys a whisper model;
    // reduce short-circuits once an accumulator has been found.
    const { apiKey, instanceName, whisperModel, apiVersion } = Object.entries(
      azureConfig.groupMap,
    ).reduce((acc, [, value]) => {
      if (acc) {
        return acc;
      }

      const whisperKey = Object.keys(value.models).find((modelKey) =>
        modelKey.startsWith('whisper'),
      );

      if (whisperKey) {
        return {
          apiVersion: value.version,
          apiKey: value.apiKey,
          instanceName: value.instanceName,
          whisperModel: value.models[whisperKey]['deploymentName'],
        };
      }

      return null;
    }, null);

    if (!apiKey || !instanceName || !whisperModel || !apiVersion) {
      throw new Error('Required Azure configuration values are missing');
    }

    const baseURL = `https://${instanceName}.openai.azure.com`;

    const url = `${baseURL}/openai/deployments/${whisperModel}/audio/transcriptions?api-version=${apiVersion}`;

    const data = {
      file: audioReadStream,
      filename: 'audio.wav',
      contentType: 'audio/wav',
      knownLength: audioReadStream.length,
    };

    // Fix: the original spread `...data.getHeaders()` here. `data` is a plain
    // object literal with no getHeaders method (that API belongs to the
    // form-data package), so the call always threw a TypeError and this
    // function always returned [null, null, null], disabling the Azure path.
    const headers = {
      'Content-Type': 'multipart/form-data',
      'api-key': apiKey,
    };

    return [url, data, headers];
  } catch (error) {
    logger.error('An error occurred while preparing the Azure API STT request: ', error);
    return [null, null, null];
  }
}

/**
 * Convert speech to text.
 *
 * Reads the uploaded audio from `req.file.buffer` (multer memory storage),
 * prepares a provider-specific request, forwards the audio to the STT API,
 * and responds with `{ text }` on success.
 *
 * @param {Object} req - The request object; expects `req.file.buffer`
 * @param {Object} res - The response object
 *
 * @returns {Promise<void>} Always resolves; all failures are mapped to HTTP
 *   error responses instead of rejecting the route handler's promise
 */
async function speechToText(req, res) {
  const customConfig = await getCustomConfig();
  if (!customConfig) {
    return res.status(500).send('Custom config not found');
  }

  if (!req.file || !req.file.buffer) {
    return res.status(400).json({ message: 'No audio file provided in the FormData' });
  }

  // Wrap the in-memory buffer in a stream so providers can send it as a file part
  const audioBuffer = req.file.buffer;
  const audioReadStream = Readable.from(audioBuffer);
  audioReadStream.path = 'audio.wav';

  // Fix: the original threw from this async handler on an unknown provider,
  // producing an unhandled promise rejection in the route; respond 500 instead.
  let provider;
  try {
    provider = getProvider(customConfig.stt);
  } catch (error) {
    logger.error('An error occurred while resolving the STT provider:', error);
    return res.status(500).json({ message: 'Invalid provider' });
  }

  let url, data, headers;

  switch (provider) {
    case 'openai':
      [url, data, headers] = openAIProvider(customConfig.stt, audioReadStream);
      break;
    case 'azure':
      // NOTE(review): getProvider currently only ever returns 'openai', so
      // this branch is unreachable until azure is recognized there — confirm.
      [url, data, headers] = azureProvider(req, audioReadStream);
      break;
    default:
      return res.status(500).json({ message: 'Invalid provider' });
  }

  // The provider helpers log their own error and return [null, null, null];
  // the original fell through and called axios.post(null, ...).
  if (!url || !data || !headers) {
    return res.sendStatus(500);
  }

  // Removed dead `if (!Readable.from)` block: Readable.from always exists in
  // supported Node versions, and the block dereferenced `data` and a global
  // `Blob` unsafely.

  try {
    const response = await axios.post(url, data, { headers: headers });
    const text = await handleResponse(response);

    res.json({ text });
  } catch (error) {
    logger.error('An error occurred while processing the audio:', error);
    res.sendStatus(500);
  }
}

module.exports = speechToText;