New stream events and more documentation (#53)
* Comments for whisper wrapper

* Stream segment event

* More events

* More comments

* Stream comments

* More doc

* More docs

* Some cleanup

* Comment here
Macoron committed Aug 25, 2023
1 parent ea7ed46 commit 8725359
Showing 5 changed files with 232 additions and 55 deletions.
16 changes: 14 additions & 2 deletions Assets/Samples/5 - Streaming/StreamingSampleMic.cs
@@ -5,7 +5,7 @@
namespace Whisper.Samples
{
/// <summary>
///
/// Stream transcription from microphone input.
/// </summary>
public class StreamingSampleMic : MonoBehaviour
{
@@ -23,8 +23,10 @@ private async void Start()
{
_stream = await whisper.CreateStream(microphoneRecord);
_stream.OnResultUpdated += OnResult;
_stream.OnSegmentUpdated += OnSegmentUpdated;
_stream.OnSegmentFinished += OnSegmentFinished;
_stream.OnStreamFinished += OnFinished;

microphoneRecord.OnRecordStop += OnRecordStop;
button.onClick.AddListener(OnButtonPressed);
}
@@ -53,6 +55,16 @@ private void OnResult(string result)
UiUtils.ScrollDown(scroll);
}

private void OnSegmentUpdated(WhisperResult segment)
{
print($"Segment updated: {segment.Result}");
}

private void OnSegmentFinished(WhisperResult segment)
{
print($"Segment finished: {segment.Result}");
}

private void OnFinished(string finalResult)
{
print("Stream finished!");
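For reference, a minimal sketch (not part of this commit) of wiring the new stream events outside the bundled sample. It only reuses members visible in the diff above (CreateStream, OnResultUpdated, OnSegmentUpdated, OnSegmentFinished, OnStreamFinished and WhisperResult.Result); StartStream and StartRecord are assumptions about how the stream and microphone are actually started.

using UnityEngine;
using Whisper;
using Whisper.Utils;

public class StreamEventsSketch : MonoBehaviour
{
    public WhisperManager whisper;
    public MicrophoneRecord microphoneRecord;

    private WhisperStream _stream;

    private async void Start()
    {
        _stream = await whisper.CreateStream(microphoneRecord);

        // Whole transcript so far, updated as new audio arrives.
        _stream.OnResultUpdated += text => Debug.Log($"Result: {text}");
        // Current segment, still being refined while more audio comes in.
        _stream.OnSegmentUpdated += segment => Debug.Log($"Segment updated: {segment.Result}");
        // Segment that whisper considers final.
        _stream.OnSegmentFinished += segment => Debug.Log($"Segment finished: {segment.Result}");
        // Complete transcript once the stream ends.
        _stream.OnStreamFinished += finalResult => Debug.Log($"Stream finished: {finalResult}");

        _stream.StartStream();          // assumed entry point
        microphoneRecord.StartRecord(); // assumed entry point
    }
}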
34 changes: 33 additions & 1 deletion Packages/com.whisper.unity/Runtime/Utils/MicrophoneRecord.cs
@@ -4,9 +4,13 @@
using JetBrains.Annotations;
using UnityEngine;
using UnityEngine.UI;
// ReSharper disable RedundantCast

namespace Whisper.Utils
{
/// <summary>
/// Small portion of recorded audio.
/// </summary>
public struct AudioChunk
{
public float[] Data;
@@ -20,33 +24,61 @@ public struct AudioChunk
public delegate void OnChunkReadyDelegate(AudioChunk chunk);
public delegate void OnRecordStopDelegate(float[] data, int frequency, int channels, float length);

/// <summary>
/// Controls microphone input settings and recording.
/// </summary>
public class MicrophoneRecord : MonoBehaviour
{
[Tooltip("Max length of recorded audio from microphone in seconds")]
public int maxLengthSec = 30;
[Tooltip("Microphone sample rate")]
public int frequency = 16000;
[Tooltip("Length of audio chunks in seconds, useful for streaming")]
public float chunksLengthSec = 0.5f;
[Tooltip("Should microphone play echo when recording is complete?")]
public bool echo = true;

[Header("Voice Activity Detection (VAD)")]
[Tooltip("Should microphone check if audio input has speech?")]
public bool useVad = true;
[Tooltip("How often VAD checks if current audio chunk has speech")]
public float vadUpdateRateSec = 0.1f;
[Tooltip("Seconds of audio record that VAD uses to check if chunk has speech")]
public float vadContextSec = 30f;
[Tooltip("Window size where VAD tries to detect speech")]
public float vadLastSec = 1.25f;
[Tooltip("Threshold of VAD energy activation")]
public float vadThd = 0.6f;
[Tooltip("Threshold of VAD filter frequency")]
public float vadFreqThd = 100.0f;
[Tooltip("Optional indicator that changes color when speech detected")]
[CanBeNull] public Image vadIndicatorImage;

[Header("VAD Stop")]
[Header("VAD Stop")]
[Tooltip("If true microphone will stop record when no speech detected")]
public bool vadStop;
[Tooltip("If true whisper transcription will drop last audio where silence was detected")]
public bool dropVadPart = true;
[Tooltip("After how many seconds of silence microphone will stop record")]
public float vadStopTime = 3f;

[Header("Microphone selection (optional)")]
[Tooltip("Optional UI dropdown with all available microphone inputs")]
[CanBeNull] public Dropdown microphoneDropdown;
[Tooltip("The label of default microphone input in dropdown")]
public string microphoneDefaultLabel = "Default microphone";

/// <summary>
/// Raised when VAD status changed.
/// </summary>
public event OnVadChangedDelegate OnVadChanged;
/// <summary>
/// Raised when new audio chunk from microphone is ready.
/// </summary>
public event OnChunkReadyDelegate OnChunkReady;
/// <summary>
/// Raised when microphone record stopped.
/// </summary>
public event OnRecordStopDelegate OnRecordStop;

private float _recordStart;
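For reference, a configuration sketch (not part of this commit) that sets the VAD fields documented above and subscribes to the chunk and stop events. The delegate signatures come from this file; StartRecord is an assumption about how recording is started.

using UnityEngine;
using Whisper.Utils;

public class MicrophoneSetupSketch : MonoBehaviour
{
    public MicrophoneRecord microphoneRecord;

    private void Start()
    {
        // Streaming-friendly values based on the tooltips above.
        microphoneRecord.frequency = 16000;      // default sample rate shown above
        microphoneRecord.chunksLengthSec = 0.5f; // emit an AudioChunk every half second
        microphoneRecord.useVad = true;          // check chunks for speech
        microphoneRecord.vadStop = true;         // stop recording on silence...
        microphoneRecord.vadStopTime = 3f;       // ...after 3 seconds without speech

        microphoneRecord.OnChunkReady += chunk =>
            Debug.Log($"Chunk ready: {chunk.Data.Length} samples");

        microphoneRecord.OnRecordStop += (data, frequency, channels, length) =>
            Debug.Log($"Record stopped: {length:F1}s, {frequency} Hz, {channels} channel(s)");

        microphoneRecord.StartRecord(); // assumed entry point
    }
}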
103 changes: 67 additions & 36 deletions Packages/com.whisper.unity/Runtime/WhisperManager.cs
@@ -7,6 +7,9 @@

namespace Whisper
{
/// <summary>
/// Manages Whisper model lifecycle in Unity scene.
/// </summary>
public class WhisperManager : MonoBehaviour
{
[Tooltip("Log level for whisper loading and inference")]
@@ -16,49 +19,24 @@ public class WhisperManager : MonoBehaviour
[SerializeField]
[Tooltip("Path to model weights file")]
private string modelPath = "Whisper/ggml-tiny.bin";

public string ModelPath
{
get => modelPath;
set
{
if (IsLoaded || IsLoading)
{
throw new InvalidOperationException("Cannot change model path after loading the model");
}

modelPath = value;
}
}


[SerializeField]
[Tooltip("Determines whether the StreamingAssets folder should be prepended to the model path")]
private bool isModelPathInStreamingAssets = true;

public bool IsModelPathInStreamingAssets
{
get => isModelPathInStreamingAssets;
set
{
if (IsLoaded || IsLoading)
{
throw new InvalidOperationException("Cannot change model path after loading the model");
}

isModelPathInStreamingAssets = value;
}
}

[SerializeField] [Tooltip("Should model weights be loaded on awake?")]

[SerializeField]
[Tooltip("Should model weights be loaded on awake?")]
private bool initOnAwake = true;

[Header("Language")] [Tooltip("Output text language. Use empty or \"auto\" for auto-detection.")]
[Header("Language")]
[Tooltip("Output text language. Use empty or \"auto\" for auto-detection.")]
public string language = "en";

[Tooltip("Force output text to English translation. Improves translation quality.")]
public bool translateToEnglish;

[Header("Advanced settings")] [SerializeField]
[Header("Advanced settings")]
[SerializeField]
private WhisperSamplingStrategy strategy = WhisperSamplingStrategy.WHISPER_SAMPLING_GREEDY;

[Tooltip("Do not use past transcription (if any) as initial prompt for the decoder.")]
@@ -106,14 +84,49 @@ public bool IsModelPathInStreamingAssets
"These can significantly reduce the quality of the output.")]
public int audioCtx;

/// <summary>
/// Raised when whisper transcribed a new text segment from audio.
/// </summary>
public event OnNewSegmentDelegate OnNewSegment;

/// <summary>
/// Raised when whisper made some progress in transcribing audio.
/// Progress changes from 0 to 100 included.
/// </summary>
public event OnProgressDelegate OnProgress;

private WhisperWrapper _whisper;
private WhisperParams _params;
private readonly MainThreadDispatcher _dispatcher = new MainThreadDispatcher();

public string ModelPath
{
get => modelPath;
set
{
if (IsLoaded || IsLoading)
{
throw new InvalidOperationException("Cannot change model path after loading the model");
}

modelPath = value;
}
}

public bool IsModelPathInStreamingAssets
{
get => isModelPathInStreamingAssets;
set
{
if (IsLoaded || IsLoading)
{
throw new InvalidOperationException("Cannot change model path after loading the model");
}

isModelPathInStreamingAssets = value;
}
}

/// <summary>
/// Checks if whisper weights are loaded and ready to be used.
/// </summary>
@@ -165,7 +178,7 @@ public async Task InitModel()
IsLoading = true;
try
{
var path = IsModelPathInStreamingAssets
var path = isModelPathInStreamingAssets
? Path.Combine(Application.streamingAssetsPath, modelPath)
: modelPath;
_whisper = await WhisperWrapper.InitFromFileAsync(path);
@@ -183,6 +196,9 @@ public async Task InitModel()
IsLoading = false;
}

/// <summary>
/// Checks if currently loaded whisper model supports multilingual transcription.
/// </summary>
public bool IsMultilingual()
{
if (!IsLoaded)
@@ -195,8 +211,9 @@ public bool IsMultilingual()
}

/// <summary>
/// Get transcription from audio clip.
/// Start async transcription of audio clip.
/// </summary>
/// <returns>Full audio transcript. Null if transcription failed.</returns>
public async Task<WhisperResult> GetTextAsync(AudioClip clip)
{
var isLoaded = await CheckIfLoaded();
@@ -209,8 +226,12 @@ public async Task<WhisperResult> GetTextAsync(AudioClip clip)
}

/// <summary>
/// Get transcription from audio buffer.
/// Start async transcription of audio buffer.
/// </summary>
/// <param name="samples">Raw audio buffer.</param>
/// <param name="frequency">Audio sample rate.</param>
/// <param name="channels">Audio channels count.</param>
/// <returns>Full audio transcript. Null if transcription failed.</returns>
public async Task<WhisperResult> GetTextAsync(float[] samples, int frequency, int channels)
{
var isLoaded = await CheckIfLoaded();
@@ -222,6 +243,12 @@ public async Task<WhisperResult> GetTextAsync(float[] samples, int frequency, in
return res;
}

/// <summary>
/// Create a new instance of Whisper streaming transcription.
/// </summary>
/// <param name="frequency">Audio sample rate.</param>
/// <param name="channels">Audio channels count.</param>
/// <returns>New streaming transcription. Null if failed.</returns>
public async Task<WhisperStream> CreateStream(int frequency, int channels)
{
var isLoaded = await CheckIfLoaded();
@@ -238,6 +265,10 @@ public async Task<WhisperStream> CreateStream(int frequency, int channels)
return stream;
}

/// <summary>
/// Create a new instance of Whisper streaming transcription from microphone input.
/// </summary>
/// <returns>New streaming transcription. Null if failed.</returns>
public async Task<WhisperStream> CreateStream(MicrophoneRecord microphone)
{
var isLoaded = await CheckIfLoaded();
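Finally, a usage sketch (not part of this commit) exercising the documented WhisperManager members. It assumes initOnAwake is disabled so the model is loaded explicitly, and that an AudioClip is assigned in the inspector; everything else mirrors the API shown in the diff.

using UnityEngine;
using Whisper;

public class TranscribeClipSketch : MonoBehaviour
{
    public WhisperManager whisper;
    public AudioClip clip; // assumed to be assigned in the inspector

    private async void Start()
    {
        // ModelPath and IsModelPathInStreamingAssets throw once loading has started,
        // so configure them before calling InitModel.
        whisper.ModelPath = "Whisper/ggml-tiny.bin";
        whisper.language = "auto"; // empty or "auto" enables language auto-detection

        await whisper.InitModel();
        Debug.Log($"Multilingual model: {whisper.IsMultilingual()}");

        // Returns the full transcript, or null if transcription failed.
        var result = await whisper.GetTextAsync(clip);
        if (result != null)
            Debug.Log(result.Result);
    }
}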