New stream events and more documentation (#53)
* Comments for whisper wrapper

* Stream segment event

* More events

* More comments

* Stream comments

* More doc

* More docs

* Some cleanup

* Comment here
Macoron committed Aug 25, 2023
1 parent ea7ed46 commit 8725359
Showing 5 changed files with 232 additions and 55 deletions.
16 changes: 14 additions & 2 deletions Assets/Samples/5 - Streaming/StreamingSampleMic.cs
@@ -5,7 +5,7 @@
namespace Whisper.Samples
{
/// <summary>
///
/// Stream transcription from microphone input.
/// </summary>
public class StreamingSampleMic : MonoBehaviour
{
@@ -23,8 +23,10 @@ private async void Start()
{
_stream = await whisper.CreateStream(microphoneRecord);
_stream.OnResultUpdated += OnResult;
_stream.OnSegmentUpdated += OnSegmentUpdated;
_stream.OnSegmentFinished += OnSegmentFinished;
_stream.OnStreamFinished += OnFinished;

microphoneRecord.OnRecordStop += OnRecordStop;
button.onClick.AddListener(OnButtonPressed);
}
@@ -53,6 +55,16 @@ private void OnResult(string result)
UiUtils.ScrollDown(scroll);
}

private void OnSegmentUpdated(WhisperResult segment)
{
print($"Segment updated: {segment.Result}");
}

private void OnSegmentFinished(WhisperResult segment)
{
print($"Segment finished: {segment.Result}");
}

private void OnFinished(string finalResult)
{
print("Stream finished!");
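For reference, a minimal sketch (not part of this commit) of wiring the new stream events outside the bundled sample. It only reuses members visible in the diff above (CreateStream, OnResultUpdated, OnSegmentUpdated, OnSegmentFinished, OnStreamFinished and WhisperResult.Result); StartStream and StartRecord are assumptions about how the stream and microphone are actually started.

using UnityEngine;
using Whisper;
using Whisper.Utils;

public class StreamEventsSketch : MonoBehaviour
{
    public WhisperManager whisper;
    public MicrophoneRecord microphoneRecord;

    private WhisperStream _stream;

    private async void Start()
    {
        _stream = await whisper.CreateStream(microphoneRecord);

        // Whole transcript so far, updated as new audio arrives.
        _stream.OnResultUpdated += text => Debug.Log($"Result: {text}");
        // Current segment, still being refined while more audio comes in.
        _stream.OnSegmentUpdated += segment => Debug.Log($"Segment updated: {segment.Result}");
        // Segment that whisper considers final.
        _stream.OnSegmentFinished += segment => Debug.Log($"Segment finished: {segment.Result}");
        // Complete transcript once the stream ends.
        _stream.OnStreamFinished += finalResult => Debug.Log($"Stream finished: {finalResult}");

        _stream.StartStream();          // assumed entry point
        microphoneRecord.StartRecord(); // assumed entry point
    }
}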
34 changes: 33 additions & 1 deletion Packages/com.whisper.unity/Runtime/Utils/MicrophoneRecord.cs
@@ -4,9 +4,13 @@
using JetBrains.Annotations;
using UnityEngine;
using UnityEngine.UI;
// ReSharper disable RedundantCast

namespace Whisper.Utils
{
/// <summary>
/// Small portion of recorded audio.
/// </summary>
public struct AudioChunk
{
public float[] Data;
@@ -20,33 +24,61 @@ public struct AudioChunk
public delegate void OnChunkReadyDelegate(AudioChunk chunk);
public delegate void OnRecordStopDelegate(float[] data, int frequency, int channels, float length);

/// <summary>
/// Controls microphone input settings and recording.
/// </summary>
public class MicrophoneRecord : MonoBehaviour
{
[Tooltip("Max length of recorded audio from microphone in seconds")]
public int maxLengthSec = 30;
[Tooltip("Microphone sample rate")]
public int frequency = 16000;
[Tooltip("Length of audio chunks in seconds, useful for streaming")]
public float chunksLengthSec = 0.5f;
[Tooltip("Should microphone play echo when recording is complete?")]
public bool echo = true;

[Header("Voice Activity Detection (VAD)")]
[Tooltip("Should microphone check if audio input has speech?")]
public bool useVad = true;
[Tooltip("How often VAD checks if current audio chunk has speech")]
public float vadUpdateRateSec = 0.1f;
[Tooltip("Seconds of audio record that VAD uses to check if chunk has speech")]
public float vadContextSec = 30f;
[Tooltip("Window size where VAD tries to detect speech")]
public float vadLastSec = 1.25f;
[Tooltip("Threshold of VAD energy activation")]
public float vadThd = 0.6f;
[Tooltip("Threshold of VAD filter frequency")]
public float vadFreqThd = 100.0f;
[Tooltip("Optional indicator that changes color when speech detected")]
[CanBeNull] public Image vadIndicatorImage;

[Header("VAD Stop")]
[Header("VAD Stop")]
[Tooltip("If true microphone will stop record when no speech detected")]
public bool vadStop;
[Tooltip("If true whisper transcription will drop last audio where silence was detected")]
public bool dropVadPart = true;
[Tooltip("After how many seconds of silence microphone will stop record")]
public float vadStopTime = 3f;

[Header("Microphone selection (optional)")]
[Tooltip("Optional UI dropdown with all available microphone inputs")]
[CanBeNull] public Dropdown microphoneDropdown;
[Tooltip("The label of default microphone input in dropdown")]
public string microphoneDefaultLabel = "Default microphone";

/// <summary>
/// Raised when VAD status changed.
/// </summary>
public event OnVadChangedDelegate OnVadChanged;
/// <summary>
/// Raised when new audio chunk from microphone is ready.
/// </summary>
public event OnChunkReadyDelegate OnChunkReady;
/// <summary>
/// Raised when microphone record stopped.
/// </summary>
public event OnRecordStopDelegate OnRecordStop;

private float _recordStart;
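For reference, a configuration sketch (not part of this commit) that sets the VAD fields documented above and subscribes to the chunk and stop events. The delegate signatures come from this file; StartRecord is an assumption about how recording is started.

using UnityEngine;
using Whisper.Utils;

public class MicrophoneSetupSketch : MonoBehaviour
{
    public MicrophoneRecord microphoneRecord;

    private void Start()
    {
        // Streaming-friendly values based on the tooltips above.
        microphoneRecord.frequency = 16000;      // default sample rate shown above
        microphoneRecord.chunksLengthSec = 0.5f; // emit an AudioChunk every half second
        microphoneRecord.useVad = true;          // check chunks for speech
        microphoneRecord.vadStop = true;         // stop recording on silence...
        microphoneRecord.vadStopTime = 3f;       // ...after 3 seconds without speech

        microphoneRecord.OnChunkReady += chunk =>
            Debug.Log($"Chunk ready: {chunk.Data.Length} samples");

        microphoneRecord.OnRecordStop += (data, frequency, channels, length) =>
            Debug.Log($"Record stopped: {length:F1}s, {frequency} Hz, {channels} channel(s)");

        microphoneRecord.StartRecord(); // assumed entry point
    }
}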
103 changes: 67 additions & 36 deletions Packages/com.whisper.unity/Runtime/WhisperManager.cs
@@ -7,6 +7,9 @@

namespace Whisper
{
/// <summary>
/// Manages Whisper model lifecycle in Unity scene.
/// </summary>
public class WhisperManager : MonoBehaviour
{
[Tooltip("Log level for whisper loading and inference")]
@@ -16,49 +19,24 @@ public class WhisperManager : MonoBehaviour
[SerializeField]
[Tooltip("Path to model weights file")]
private string modelPath = "Whisper/ggml-tiny.bin";

public string ModelPath
{
get => modelPath;
set
{
if (IsLoaded || IsLoading)
{
throw new InvalidOperationException("Cannot change model path after loading the model");
}

modelPath = value;
}
}


[SerializeField]
[Tooltip("Determines whether the StreamingAssets folder should be prepended to the model path")]
private bool isModelPathInStreamingAssets = true;

public bool IsModelPathInStreamingAssets
{
get => isModelPathInStreamingAssets;
set
{
if (IsLoaded || IsLoading)
{
throw new InvalidOperationException("Cannot change model path after loading the model");
}

isModelPathInStreamingAssets = value;
}
}

[SerializeField] [Tooltip("Should model weights be loaded on awake?")]

[SerializeField]
[Tooltip("Should model weights be loaded on awake?")]
private bool initOnAwake = true;

[Header("Language")] [Tooltip("Output text language. Use empty or \"auto\" for auto-detection.")]
[Header("Language")]
[Tooltip("Output text language. Use empty or \"auto\" for auto-detection.")]
public string language = "en";

[Tooltip("Force output text to English translation. Improves translation quality.")]
public bool translateToEnglish;

[Header("Advanced settings")] [SerializeField]
[Header("Advanced settings")]
[SerializeField]
private WhisperSamplingStrategy strategy = WhisperSamplingStrategy.WHISPER_SAMPLING_GREEDY;

[Tooltip("Do not use past transcription (if any) as initial prompt for the decoder.")]
@@ -106,14 +84,49 @@ public bool IsModelPathInStreamingAssets
"These can significantly reduce the quality of the output.")]
public int audioCtx;

/// <summary>
/// Raised when whisper transcribed a new text segment from audio.
/// </summary>
public event OnNewSegmentDelegate OnNewSegment;

/// <summary>
/// Raised when whisper made some progress in transcribing audio.
/// Progress changes from 0 to 100 included.
/// </summary>
public event OnProgressDelegate OnProgress;

private WhisperWrapper _whisper;
private WhisperParams _params;
private readonly MainThreadDispatcher _dispatcher = new MainThreadDispatcher();

public string ModelPath
{
get => modelPath;
set
{
if (IsLoaded || IsLoading)
{
throw new InvalidOperationException("Cannot change model path after loading the model");
}

modelPath = value;
}
}

public bool IsModelPathInStreamingAssets
{
get => isModelPathInStreamingAssets;
set
{
if (IsLoaded || IsLoading)
{
throw new InvalidOperationException("Cannot change model path after loading the model");
}

isModelPathInStreamingAssets = value;
}
}

/// <summary>
/// Checks if whisper weights are loaded and ready to be used.
/// </summary>
@@ -165,7 +178,7 @@ public async Task InitModel()
IsLoading = true;
try
{
var path = IsModelPathInStreamingAssets
var path = isModelPathInStreamingAssets
? Path.Combine(Application.streamingAssetsPath, modelPath)
: modelPath;
_whisper = await WhisperWrapper.InitFromFileAsync(path);
@@ -183,6 +196,9 @@ public async Task InitModel()
IsLoading = false;
}

/// <summary>
/// Checks if currently loaded whisper model supports multilingual transcription.
/// </summary>
public bool IsMultilingual()
{
if (!IsLoaded)
@@ -195,8 +211,9 @@ public bool IsMultilingual()
}

/// <summary>
/// Get transcription from audio clip.
/// Start async transcription of audio clip.
/// </summary>
/// <returns>Full audio transcript. Null if transcription failed.</returns>
public async Task<WhisperResult> GetTextAsync(AudioClip clip)
{
var isLoaded = await CheckIfLoaded();
@@ -209,8 +226,12 @@ public async Task<WhisperResult> GetTextAsync(AudioClip clip)
}

/// <summary>
/// Get transcription from audio buffer.
/// Start async transcription of audio buffer.
/// </summary>
/// <param name="samples">Raw audio buffer.</param>
/// <param name="frequency">Audio sample rate.</param>
/// <param name="channels">Audio channels count.</param>
/// <returns>Full audio transcript. Null if transcription failed.</returns>
public async Task<WhisperResult> GetTextAsync(float[] samples, int frequency, int channels)
{
var isLoaded = await CheckIfLoaded();
@@ -222,6 +243,12 @@ public async Task<WhisperResult> GetTextAsync(float[] samples, int frequency, in
return res;
}

/// <summary>
/// Create a new instance of Whisper streaming transcription.
/// </summary>
/// <param name="frequency">Audio sample rate.</param>
/// <param name="channels">Audio channels count.</param>
/// <returns>New streaming transcription. Null if failed.</returns>
public async Task<WhisperStream> CreateStream(int frequency, int channels)
{
var isLoaded = await CheckIfLoaded();
@@ -238,6 +265,10 @@ public async Task<WhisperStream> CreateStream(int frequency, int channels)
return stream;
}

/// <summary>
/// Create a new instance of Whisper streaming transcription from microphone input.
/// </summary>
/// <returns>New streaming transcription. Null if failed.</returns>
public async Task<WhisperStream> CreateStream(MicrophoneRecord microphone)
{
var isLoaded = await CheckIfLoaded();
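Finally, a usage sketch (not part of this commit) exercising the documented WhisperManager members. It assumes initOnAwake is disabled so the model is loaded explicitly, and that an AudioClip is assigned in the inspector; everything else mirrors the API shown in the diff.

using UnityEngine;
using Whisper;

public class TranscribeClipSketch : MonoBehaviour
{
    public WhisperManager whisper;
    public AudioClip clip; // assumed to be assigned in the inspector

    private async void Start()
    {
        // ModelPath and IsModelPathInStreamingAssets throw once loading has started,
        // so configure them before calling InitModel.
        whisper.ModelPath = "Whisper/ggml-tiny.bin";
        whisper.language = "auto"; // empty or "auto" enables language auto-detection

        await whisper.InitModel();
        Debug.Log($"Multilingual model: {whisper.IsMultilingual()}");

        // Returns the full transcript, or null if transcription failed.
        var result = await whisper.GetTextAsync(clip);
        if (result != null)
            Debug.Log(result.Result);
    }
}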