Skip to content

Commit

Permalink
[FEATURE] Added support for LocalAI Speech To Text configuration (#2376)
Browse files Browse the repository at this point in the history
* Added support for LocalAI to the Speech To Text configuration. Added a few debug statements around speech to text conversion. Finally, refactored the speechToTextProviders a bit to try to remove some magic strings that have undocumented rules around naming.

* LocalAI STT - PR Feedback - Updated LocalAI Image, changed casing, and updated the default model to whisper-1.
  • Loading branch information
clates committed May 13, 2024
1 parent 823cefb commit d3f03e3
Show file tree
Hide file tree
Showing 4 changed files with 129 additions and 32 deletions.
79 changes: 54 additions & 25 deletions packages/components/src/speechToText.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,40 +4,69 @@ import { type ClientOptions, OpenAIClient } from '@langchain/openai'
import { AssemblyAI } from 'assemblyai'
import { getFileFromStorage } from './storageUtils'

// Canonical identifiers for the supported speech-to-text providers.
// `as const` narrows each value to its string-literal type (instead of the
// widened `string`), so `switch (speechToTextConfig.name)` statements over
// these values can be checked for exhaustiveness by the compiler.
// NOTE: these values must stay in sync with the `name` fields declared in
// packages/ui/src/ui-component/extended/SpeechToText.jsx.
const SpeechToTextType = {
    OPENAI_WHISPER: 'openAIWhisper',
    ASSEMBLYAI_TRANSCRIBE: 'assemblyAiTranscribe',
    LOCALAI_STT: 'localAISTT'
} as const

export const convertSpeechToText = async (upload: IFileUpload, speechToTextConfig: ICommonObject, options: ICommonObject) => {
if (speechToTextConfig) {
const credentialId = speechToTextConfig.credentialId as string
const credentialData = await getCredentialData(credentialId ?? '', options)
const audio_file = await getFileFromStorage(upload.name, options.chatflowid, options.chatId)

if (speechToTextConfig.name === 'openAIWhisper') {
const openAIClientOptions: ClientOptions = {
apiKey: credentialData.openAIApiKey
}
const openAIClient = new OpenAIClient(openAIClientOptions)
const transcription = await openAIClient.audio.transcriptions.create({
file: new File([new Blob([audio_file])], upload.name),
model: 'whisper-1',
language: speechToTextConfig?.language,
temperature: speechToTextConfig?.temperature ? parseFloat(speechToTextConfig.temperature) : undefined,
prompt: speechToTextConfig?.prompt
})
if (transcription?.text) {
return transcription.text
switch (speechToTextConfig.name) {
case SpeechToTextType.OPENAI_WHISPER: {
const openAIClientOptions: ClientOptions = {
apiKey: credentialData.openAIApiKey
}
const openAIClient = new OpenAIClient(openAIClientOptions)
const openAITranscription = await openAIClient.audio.transcriptions.create({
file: new File([new Blob([audio_file])], upload.name),
model: 'whisper-1',
language: speechToTextConfig?.language,
temperature: speechToTextConfig?.temperature ? parseFloat(speechToTextConfig.temperature) : undefined,
prompt: speechToTextConfig?.prompt
})
if (openAITranscription?.text) {
return openAITranscription.text
}
break
}
} else if (speechToTextConfig.name === 'assemblyAiTranscribe') {
const client = new AssemblyAI({
apiKey: credentialData.assemblyAIApiKey
})
case SpeechToTextType.ASSEMBLYAI_TRANSCRIBE: {
const assemblyAIClient = new AssemblyAI({
apiKey: credentialData.assemblyAIApiKey
})

const params = {
audio: audio_file,
speaker_labels: false
}
const params = {
audio: audio_file,
speaker_labels: false
}

const transcription = await client.transcripts.transcribe(params)
if (transcription?.text) {
return transcription.text
const assemblyAITranscription = await assemblyAIClient.transcripts.transcribe(params)
if (assemblyAITranscription?.text) {
return assemblyAITranscription.text
}
break
}
case SpeechToTextType.LOCALAI_STT: {
const LocalAIClientOptions: ClientOptions = {
apiKey: credentialData.localAIApiKey,
baseURL: speechToTextConfig?.baseUrl
}
const localAIClient = new OpenAIClient(LocalAIClientOptions)
const localAITranscription = await localAIClient.audio.transcriptions.create({
file: new File([new Blob([audio_file])], upload.name),
model: speechToTextConfig?.model || 'whisper-1',
language: speechToTextConfig?.language,
temperature: speechToTextConfig?.temperature ? parseFloat(speechToTextConfig.temperature) : undefined,
prompt: speechToTextConfig?.prompt
})
if (localAITranscription?.text) {
return localAITranscription.text
}
break
}
}
} else {
Expand Down
4 changes: 3 additions & 1 deletion packages/server/src/utils/buildChatflow.ts
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,8 @@ export const utilBuildChatflow = async (req: Request, socketIO?: Server, isInter
}

// Run Speech to Text conversion
if (upload.mime === 'audio/webm' || upload.mime === 'audio/mp4') {
if (upload.mime === 'audio/webm' || upload.mime === 'audio/mp4' || upload.mime === 'audio/ogg') {
logger.debug(`Attempting a speech to text conversion...`)
let speechToTextConfig: ICommonObject = {}
if (chatflow.speechToText) {
const speechToTextProviders = JSON.parse(chatflow.speechToText)
Expand All @@ -99,6 +100,7 @@ export const utilBuildChatflow = async (req: Request, socketIO?: Server, isInter
databaseEntities: databaseEntities
}
const speechToTextResult = await convertSpeechToText(upload, speechToTextConfig, options)
logger.debug(`Speech to text result: ${speechToTextResult}`)
if (speechToTextResult) {
incomingInput.question = speechToTextResult
}
Expand Down
Binary file added packages/ui/src/assets/images/localai.png
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
78 changes: 72 additions & 6 deletions packages/ui/src/ui-component/extended/SpeechToText.jsx
Original file line number Diff line number Diff line change
Expand Up @@ -16,17 +16,27 @@ import { StyledButton } from '@/ui-component/button/StyledButton'
import { Dropdown } from '@/ui-component/dropdown/Dropdown'
import openAISVG from '@/assets/images/openai.svg'
import assemblyAIPng from '@/assets/images/assemblyai.png'
import localAiPng from '@/assets/images/localai.png'

// store
import useNotifier from '@/utils/useNotifier'

// API
import chatflowsApi from '@/api/chatflows'

// Canonical identifiers for the supported speech-to-text providers.
// If implementing a new provider, this must be updated in
// components/src/speechToText.ts as well.
// Frozen so the provider identifiers cannot be accidentally mutated at
// runtime (reads are unaffected; writes throw in strict mode).
const SpeechToTextType = Object.freeze({
    OPENAI_WHISPER: 'openAIWhisper',
    ASSEMBLYAI_TRANSCRIBE: 'assemblyAiTranscribe',
    LOCALAI_STT: 'localAISTT'
})

// Weird quirk - the key must match the name property value.
const speechToTextProviders = {
openAIWhisper: {
[SpeechToTextType.OPENAI_WHISPER]: {
label: 'OpenAI Whisper',
name: 'openAIWhisper',
name: SpeechToTextType.OPENAI_WHISPER,
icon: openAISVG,
url: 'https://platform.openai.com/docs/guides/speech-to-text',
inputs: [
Expand Down Expand Up @@ -63,9 +73,9 @@ const speechToTextProviders = {
}
]
},
assemblyAiTranscribe: {
[SpeechToTextType.ASSEMBLYAI_TRANSCRIBE]: {
label: 'Assembly AI',
name: 'assemblyAiTranscribe',
name: SpeechToTextType.ASSEMBLYAI_TRANSCRIBE,
icon: assemblyAIPng,
url: 'https://www.assemblyai.com/',
inputs: [
Expand All @@ -76,6 +86,59 @@ const speechToTextProviders = {
credentialNames: ['assemblyAIApi']
}
]
},
[SpeechToTextType.LOCALAI_STT]: {
label: 'LocalAi STT',
name: SpeechToTextType.LOCALAI_STT,
icon: localAiPng,
url: 'https://localai.io/features/audio-to-text/',
inputs: [
{
label: 'Connect Credential',
name: 'credential',
type: 'credential',
credentialNames: ['localAIApi']
},
{
label: 'Base URL',
name: 'baseUrl',
type: 'string',
description: 'The base URL of the local AI server'
},
{
label: 'Language',
name: 'language',
type: 'string',
description:
'The language of the input audio. Supplying the input language in ISO-639-1 format will improve accuracy and latency.',
placeholder: 'en',
optional: true
},
{
label: 'Model',
name: 'model',
type: 'string',
description: `The STT model to load. Defaults to whisper-1 if left blank.`,
placeholder: 'whisper-1',
optional: true
},
{
label: 'Prompt',
name: 'prompt',
type: 'string',
rows: 4,
description: `An optional text to guide the model's style or continue a previous audio segment. The prompt should match the audio language.`,
optional: true
},
{
label: 'Temperature',
name: 'temperature',
type: 'number',
step: 0.1,
description: `The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic.`,
optional: true
}
]
}
}

Expand Down Expand Up @@ -191,8 +254,11 @@ const SpeechToText = ({ dialogProps }) => {
<FormControl fullWidth>
<Select size='small' value={selectedProvider} onChange={handleProviderChange}>
<MenuItem value='none'>None</MenuItem>
<MenuItem value='openAIWhisper'>OpenAI Whisper</MenuItem>
<MenuItem value='assemblyAiTranscribe'>Assembly AI</MenuItem>
{Object.values(speechToTextProviders).map((provider) => (
<MenuItem key={provider.name} value={provider.name}>
{provider.label}
</MenuItem>
))}
</Select>
</FormControl>
</Box>
Expand Down

0 comments on commit d3f03e3

Please sign in to comment.