Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Voice Activation #25

Open
RicardoEPRodrigues opened this issue Apr 8, 2024 · 1 comment
Open

Voice Activation #25

RicardoEPRodrigues opened this issue Apr 8, 2024 · 1 comment
Labels
enhancement New feature or request

Comments

@RicardoEPRodrigues
Copy link

Hey,

I saw you had voice activation planned for this tool. I ended up implementing a basic solution on my end and wanted to share it with you.

In my use case, I really needed an open mic where the player could speak whenever they wanted. So I implemented voice detection based on changes in the intensity of the audio (not perfect, but it kind of works). I've left the code below if you want to try it out.

In essence, I take the data points from the CapturableSoundWave and maintain a running average over time. If, for a given time frame, the values deviate enough from the average, I activate the voice recording. The recording stops after a second if the data points return to the average (i.e., the player has stopped speaking).

Header

#pragma once

#include "CoreMinimal.h"
#include "SpeechRecognizer.h"
#include "MyRecorderSpeechRecognition.generated.h"

/**
 * Blueprintable object that captures microphone audio through a UCapturableSoundWave,
 * buffers it, and hands the buffered samples to a USpeechRecognizer once a simple
 * volume-based voice activation has triggered and then gone quiet again.
 *
 * Typical usage: call Init() once, then StartAudioSession() / StopAudioSession().
 */
UCLASS(Blueprintable)
class MY_API UMyRecorderSpeechRecognition : public UObject
{
	GENERATED_BODY()

public:
	/** Stops recognition and capture, unbinds this object's delegates and clears the voice-activation timer. */
	virtual void BeginDestroy() override;

	/** Creates and configures the speech recognizer and the capturable sound wave, and binds their callbacks. */
	virtual void Init();
	/** Starts speech recognition and records the requested mute state; capture itself starts in OnStartSpeechRecognition. */
	virtual void StartAudioSession(bool bIsMuted);
	/** Stops audio capture and speech recognition, then resets the voice-activation state. */
	virtual void StopAudioSession();
	/** Stores the mute state and, if it changed and a sound wave exists, forwards it to the sound wave. */
	virtual void SetAudioSessionMute(const bool bIsMuted);
	/** Returns the stored mute state. */
	virtual bool GetAudioSessionMute();

	/**
	 * Audio Input Device ID (aka which microphone to use).
	 */
	UPROPERTY(BlueprintReadWrite, EditAnywhere, Category="STT")
	int DeviceId = 0;

	/**
	 * Window of captured audio data to be stored prior to voice activation.
	 * Expressed in capture buffers: while voice is not activated, the rolling buffer is trimmed
	 * to (VoiceActivationWindow - 1) buffers before the newest buffer is appended.
	 */
	UPROPERTY(BlueprintReadWrite, EditAnywhere, Category="STT|Voice Activation")
	int VoiceActivationWindow = 5;

	/**
	 * Time to wait after voice activation and before sending the audio data to be recognized.
	 * Used as the rate of the voice-activation timer, which is restarted on every loud buffer.
	 */
	UPROPERTY(BlueprintReadWrite, EditAnywhere, Category="STT|Voice Activation")
	float VoiceActivationDelay = 1.0f;

	/**
	 * Multiplier of the deviation to detect a significant shift in volume to trigger voice activation.
	 */
	UPROPERTY(BlueprintReadWrite, EditAnywhere, Category="STT|Voice Activation")
	float VoiceActivationDeviationMultiplier = 2.0f;

	/**
	 * Minimum value the deviation can have to offer better results. (Most relevant in quiet environments.)
	 */
	UPROPERTY(BlueprintReadWrite, EditAnywhere, Category="STT|Voice Activation")
	float VoiceActivationMinDeviation = 0.5f;

protected:

	/** Captured samples buffered until they are sent to the speech recognizer. */
	UPROPERTY()
	TArray<float> AudioData;

	/** Running average of the per-buffer sum of absolute sample values; a value <= 0 means "not initialised yet". */
	float ActiveAverage = -1;
	/** Running deviation of the per-buffer sum from ActiveAverage, never allowed below VoiceActivationMinDeviation once updated. */
	float ActiveDeviation = -1;
	
	// STT Section
	/** Mute state requested for the audio session. */
	UPROPERTY()
	bool IsAudioSessionMuted = false;

	/** Recognizer that receives the buffered audio; created in Init(). */
	UPROPERTY(BlueprintReadWrite)
	class USpeechRecognizer* SpeechRecognizer;

	/** Sound wave used to capture microphone input; created in Init(). */
	UPROPERTY(BlueprintReadWrite)
	class UCapturableSoundWave* CapturableSoundWave;

	/** Delegate passed to StartSpeechRecognition; bound to OnStartSpeechRecognition in Init(). */
	UPROPERTY()
	FOnSpeechRecognitionStartedDynamic OnStartSpeechRecognitionEvent;

	/** Recognizer callback: recognition finished. */
	UFUNCTION()
	void OnRecognitionFinished();
	/** Recognizer callback: logs the reported error messages. */
	UFUNCTION()
	void OnRecognitionError(const FString& ShortErrorMessage, const FString& LongErrorMessage);
	/** Recognizer callback: receives a recognized text segment. */
	UFUNCTION()
	void OnRecognizedTextSegment(const FString& RecognizedWords);

	/** Callback for OnStartSpeechRecognitionEvent: resets state and starts microphone capture. */
	UFUNCTION()
	void OnStartSpeechRecognition(bool bSucceeded);

	/** Sound-wave callback: updates the volume statistics, handles voice activation and buffers the samples. */
	UFUNCTION()
	void OnPopulateAudioData(const TArray<float>& PopulatedAudioData);

	/** Clears the buffered audio and returns the voice-activation statistics to their initial state. */
	UFUNCTION()
	void Reset();

	/** True from the moment a loud buffer is detected until OnVoiceActivation has flushed the buffer. */
	bool IsVoiceActivated = false;
	
	/** Timer delegate/handle pair used to call OnVoiceActivation after VoiceActivationDelay. */
	FTimerDelegate VoiceActivationDelegate{};
	FTimerHandle VoiceActivationTimerHandle{};

	/** Timer callback: sends the buffered audio to the recognizer and leaves the voice-activated state. */
	UFUNCTION()
	void OnVoiceActivation();
};

Code

// Fill out your copyright notice in the Description page of Project Settings.


#include "MyRecorderSpeechRecognition.h"

#include "Sound/CapturableSoundWave.h"

void UMyRecorderSpeechRecognition::BeginDestroy()
{
	if (SpeechRecognizer)
	{
		SpeechRecognizer->StopSpeechRecognition();
		SpeechRecognizer->OnRecognitionFinished.RemoveDynamic(
			this, &UMyRecorderSpeechRecognition::OnRecognitionFinished);
		SpeechRecognizer->OnRecognitionError.
		                  RemoveDynamic(this, &UMyRecorderSpeechRecognition::OnRecognitionError);
		SpeechRecognizer->OnRecognizedTextSegment.RemoveDynamic(
			this, &UMyRecorderSpeechRecognition::OnRecognizedTextSegment);
	}

	OnStartSpeechRecognitionEvent.Clear();

	if (CapturableSoundWave)
	{
		CapturableSoundWave->StopCapture();
		CapturableSoundWave->OnPopulateAudioData.RemoveDynamic(
			this, &UMyRecorderSpeechRecognition::OnPopulateAudioData);
	}

	if (GetWorld() && GetWorld()->GetTimerManager().IsTimerActive(VoiceActivationTimerHandle))
	{
		GetWorld()->GetTimerManager().ClearTimer(VoiceActivationTimerHandle);
	}

	Super::BeginDestroy();
}

void UMyRecorderSpeechRecognition::Init()
{
	SpeechRecognizer = USpeechRecognizer::CreateSpeechRecognizer();
	SpeechRecognizer->SetLanguage(ESpeechRecognizerLanguage::En);
	SpeechRecognizer->OnRecognitionFinished.AddUniqueDynamic(
		this, &UMyRecorderSpeechRecognition::OnRecognitionFinished);
	SpeechRecognizer->OnRecognitionError.AddUniqueDynamic(this, &UMyRecorderSpeechRecognition::OnRecognitionError);
	SpeechRecognizer->OnRecognizedTextSegment.AddUniqueDynamic(
		this, &UMyRecorderSpeechRecognition::OnRecognizedTextSegment);
	SpeechRecognizer->SetStreamingDefaults();
	SpeechRecognizer->SetSuppressBlank(true);
	SpeechRecognizer->SetSuppressNonSpeechTokens(true);
	SpeechRecognizer->SetNumOfThreads(0);
	SpeechRecognizer->SetStepSize(0);

	OnStartSpeechRecognitionEvent.BindDynamic(this, &UMyRecorderSpeechRecognition::OnStartSpeechRecognition);

	CapturableSoundWave = UCapturableSoundWave::CreateCapturableSoundWave();
	CapturableSoundWave->OnPopulateAudioData.AddUniqueDynamic(
		this, &UMyRecorderSpeechRecognition::OnPopulateAudioData);
}

/**
 * Starts a speech recognition session.
 *
 * @param bIsMuted Mute state to record for the session. It is stored in
 *                 IsAudioSessionMuted, which OnStartSpeechRecognition reads
 *                 when the recognizer reports that it has started.
 *
 * Does nothing except log an error when no speech recognizer exists (Init() not called).
 */
void UMyRecorderSpeechRecognition::StartAudioSession(bool bIsMuted)
{
	if (!SpeechRecognizer)
	{
		UE_LOG(LogTemp, Error, TEXT("Unable to start audio session. Speech Recognizer is not defined."));
		return;
	}

	// Record the requested mute state BEFORE starting recognition: the start
	// callback reads IsAudioSessionMuted, and must see the requested value even
	// if the delegate is executed before StartSpeechRecognition() returns.
	IsAudioSessionMuted = bIsMuted;
	SpeechRecognizer->StartSpeechRecognition(OnStartSpeechRecognitionEvent);
}

void UMyRecorderSpeechRecognition::StopAudioSession()
{
	if (CapturableSoundWave)
	{
		CapturableSoundWave->StopCapture();
	}
	if (SpeechRecognizer)
	{
		SpeechRecognizer->StopSpeechRecognition();
	}
	Reset();
}

/**
 * Updates the session mute state.
 *
 * @param bIsMuted New mute state. When it equals the stored state nothing happens;
 *                 otherwise the state is stored and, if a capturable sound wave
 *                 exists, forwarded to it via ToggleMute().
 */
void UMyRecorderSpeechRecognition::SetAudioSessionMute(const bool bIsMuted)
{
	const bool bStateChanged = (IsAudioSessionMuted != bIsMuted);
	if (bStateChanged)
	{
		IsAudioSessionMuted = bIsMuted;

		// The state is remembered even when there is no sound wave to apply it to.
		if (CapturableSoundWave)
		{
			CapturableSoundWave->ToggleMute(IsAudioSessionMuted);
		}
	}
}

/** Reports the mute state currently stored for the audio session. */
bool UMyRecorderSpeechRecognition::GetAudioSessionMute()
{
	const bool bCurrentlyMuted = IsAudioSessionMuted;
	return bCurrentlyMuted;
}

// Bound to SpeechRecognizer->OnRecognitionFinished in Init(). Intentionally empty:
// nothing is done here when recognition finishes.
void UMyRecorderSpeechRecognition::OnRecognitionFinished()
{
}

void UMyRecorderSpeechRecognition::OnRecognitionError(const FString& ShortErrorMessage,
                                                           const FString& LongErrorMessage)
{
	UE_LOG(LogTemp, Error, TEXT("Speech Recognition Error. %s. %s"), *ShortErrorMessage, *LongErrorMessage);
}

// Bound to SpeechRecognizer->OnRecognizedTextSegment in Init().
// RecognizedWords is the text segment reported by the recognizer.
// Placeholder: the recognized text is currently not used.
void UMyRecorderSpeechRecognition::OnRecognizedTextSegment(const FString& RecognizedWords)
{
	// Send text to game.
}

void UMyRecorderSpeechRecognition::OnStartSpeechRecognition(bool bSucceeded)
{
	if (!CapturableSoundWave)
	{
		UE_LOG(LogTemp, Error, TEXT("Unable to start speech recognition. CapturableSoundWave is not defined."));
		return;
	}

	Reset();

	CapturableSoundWave->StartCapture(DeviceId);
	if (!IsAudioSessionMuted) SetAudioSessionMute(IsAudioSessionMuted);
}

float MathSumAbs(const TArray<float>& Population)
{
	float Std = 0;

	for (const auto Data : Population)
	{
		Std += FMath::Abs(Data);
	}
	return Std;
}

void UMyRecorderSpeechRecognition::OnPopulateAudioData(const TArray<float>& PopulatedAudioData)
{
	if (!SpeechRecognizer)
	{
		UE_LOG(LogTemp, Error, TEXT("Unable to process audio data. SpeechRecognizer is not defined."));
		return;
	}
	if (!CapturableSoundWave)
	{
		UE_LOG(LogTemp, Error, TEXT("Unable to process audio data. CapturableSoundWave is not defined."));
		return;
	}

	const float Sum = MathSumAbs(PopulatedAudioData);
	const float Deviation = FMath::Abs(ActiveAverage - Sum);

	if (ActiveAverage <= 0)
	{
		ActiveAverage = Sum;
		ActiveDeviation = FMath::Max(Deviation, VoiceActivationMinDeviation);
		return;
	}

	if (Sum > ActiveAverage + (ActiveDeviation * VoiceActivationDeviationMultiplier))
	{
		FTimerManager& TimerManager = GetWorld()->GetTimerManager();
		if (TimerManager.IsTimerActive(VoiceActivationTimerHandle))
		{
			TimerManager.ClearTimer(VoiceActivationTimerHandle);
		}
		else
		{
			UE_LOG(LogTemp, Log, TEXT("Voice Activation: ON"));
		}

		VoiceActivationDelegate.BindUObject(this, &UMyRecorderSpeechRecognition::OnVoiceActivation);
		TimerManager.SetTimer(VoiceActivationTimerHandle, VoiceActivationDelegate,
		                      VoiceActivationDelay, false);
		IsVoiceActivated = true;
	}

	ActiveAverage = (ActiveAverage * 0.3f) + (Sum * 0.7f);
	ActiveDeviation = FMath::Max((ActiveDeviation + Deviation) * 0.5f, VoiceActivationMinDeviation);

	if (!IsVoiceActivated)
	{
		const int WindowNum = PopulatedAudioData.Num() * (VoiceActivationWindow - 1);
		if (AudioData.Num() > WindowNum)
		{
			const int NumDataToRemove = AudioData.Num() - WindowNum;
			AudioData.RemoveAt(0, NumDataToRemove);
		}
	}
	AudioData.Append(PopulatedAudioData);

	// SpeechRecognizer->ProcessAudioData(PopulatedAudioData, CapturableSoundWave->GetSampleRate(),
	//                                    CapturableSoundWave->GetNumOfChannels(), false);
}

/**
 * Returns the voice-activation machinery to its starting state: drops all
 * buffered audio, leaves the voice-activated state, marks the running average
 * as uninitialised (-1) and sets the deviation to its configured minimum.
 */
void UMyRecorderSpeechRecognition::Reset()
{
	// Buffer and activation flag.
	AudioData.Empty();
	IsVoiceActivated = false;

	// Loudness statistics.
	ActiveDeviation = VoiceActivationMinDeviation;
	ActiveAverage = -1;
}

void UMyRecorderSpeechRecognition::OnVoiceActivation()
{
	UE_LOG(LogTemp, Log, TEXT("Voice Activation: OFF"));

	SpeechRecognizer->ProcessAudioData(AudioData, CapturableSoundWave->GetSampleRate(),
	                                   CapturableSoundWave->GetNumOfChannels(), true);

	AudioData.Empty();
	IsVoiceActivated = false;
}

@gtreshchev
Copy link
Owner

Hi, thank you for sharing! :) I'll see if it can be integrated into the plugin, maybe as a separate function 👍

@gtreshchev gtreshchev added bug Something isn't working enhancement New feature or request and removed bug Something isn't working labels Apr 25, 2024
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
enhancement New feature or request
Projects
None yet
Development

No branches or pull requests

2 participants