TTS Eval: Add TTS evaluation (MOS estimation) #2392

Open · wants to merge 38 commits into base: develop

Commits (38):
4e1ffd7
TTS Eval: Add TTS evaluation (MOS estimation)
flexthink Feb 6, 2024
83b7d4b
TTS Eval: Add missing docstrings to pass consistency tests
flexthink Feb 6, 2024
4e6693b
TTS Eval: Add a unit test
flexthink Feb 6, 2024
4a53ae0
TTS Eval: Add a WavLM model
flexthink Feb 13, 2024
ed3ff41
TTS Eval: Rename
flexthink Feb 13, 2024
4ee1569
TTS Eval: Fix WavLM, improve statistics
flexthink Feb 13, 2024
6d9cd09
TTS Eval: Fix WavLM
flexthink Feb 14, 2024
9fad173
TTS Eval: Fix statistics
flexthink Feb 14, 2024
565d41e
TTS Eval: Add classification pretraining with WavLM
flexthink Feb 20, 2024
e7e874f
TTS Eval: Fix contrastive sampling, add reproducibility
flexthink Feb 20, 2024
d8f779c
TTS Eval: Remove duplicated somos_prepare.py
flexthink Feb 20, 2024
a2bba8c
TTS Eval: Fixes, clean-up
flexthink Feb 20, 2024
0b1694e
TTS Evaluation: Add README
flexthink Feb 21, 2024
5e0c507
TTS Eval: Update to keep only the best model
flexthink Feb 25, 2024
ddc7886
TTS Eval: Add recipe tests
flexthink Feb 26, 2024
98563a3
TTS Eval: Miscellaneous fixes
flexthink Feb 26, 2024
e8e30f6
TTS Eval: SOMOS preparation speech
flexthink Feb 26, 2024
57d9df1
TTS Eval: Clean-up
flexthink Feb 26, 2024
184235a
TTS Eval: Update to pass consistency tests (TBD dropbox link)
flexthink Feb 26, 2024
aabe1ea
TTS Eval: Cosmetic changes (from hooks)
flexthink Feb 26, 2024
995152b
TTS Eval: Add inference
flexthink Feb 28, 2024
363f5b7
TTS Eval: Functionality improvements, add a recipe to evaluate a pret…
flexthink Mar 14, 2024
267620c
TTS Eval: Add support for frozen splits and skipping folder differences
flexthink Mar 14, 2024
9026f6f
TTS Eval: Add support for frozen splits and ignoring folders while sk…
flexthink Mar 14, 2024
12b1f60
TTS Eval: Add extra requirements
flexthink Mar 14, 2024
d486fd5
Merge branch 'develop' into ttseval
flexthink Mar 14, 2024
05adb13
TTS Eval: Add support for FastSpeech2
flexthink Mar 14, 2024
9f433bb
TTS Eval: Device fixes
flexthink Mar 15, 2024
0af174d
TTS Eval: Fixes
flexthink Mar 15, 2024
964c20a
TTS Eval: Fixes
flexthink Mar 15, 2024
12551f1
TTS Eval: Fixes
flexthink Mar 15, 2024
bf5cfcb
TTS Eval: Fixes
flexthink Mar 17, 2024
d73f2ca
TTS Eval: Cosmetic changes
flexthink Mar 18, 2024
26d4dcb
TTS Eval: Disable LM during evaluation
flexthink Mar 18, 2024
8171684
TTS Eval: Fixes for model paths
flexthink Mar 18, 2024
aadd8bb
TTS Eval: Cosmetic changes
flexthink Mar 25, 2024
c9cd33f
Merge branch 'develop' into ttseval
flexthink Mar 25, 2024
c78dd96
TTS Eval: Fix typos
flexthink Mar 25, 2024
46 changes: 46 additions & 0 deletions recipes/LJSpeech/evaluation/README.md
@@ -0,0 +1,46 @@
# Text-to-Speech (with LJSpeech)
This folder contains the recipes for evaluation of existing pretrained text-to-speech systems using ASR-based evaluators and MOS estimation.

By default, MOS evaluation is performed using a pretrained Transformer model, as defined in `recipes/SOMOS/ttseval/hparams/train.yaml` and available pretrained on HuggingFace at
https://huggingface.co/flexthink/ttseval-wavlm-transformer

ASR evaluation is performed using the bundled Transformer ASR: https://huggingface.co/speechbrain/asr-transformer-transformerlm-librispeech

# Tacotron 2
The recipe contains hyperparameters for the evaluation of Tacotron2 in `hparams/tacotron2.yaml`.

To perform evaluation, run the following script:
```
python evaluate.py --data_folder=/your_folder/LJSpeech-1.1 hparams/tacotron2.yaml
```


# FastSpeech2
The recipe contains hyperparameters for the evaluation of FastSpeech2 in `hparams/fastspeech2.yaml`.

To perform evaluation, run the following script:
```
python evaluate.py --data_folder=/your_folder/LJSpeech-1.1 hparams/fastspeech2.yaml
```


# **About SpeechBrain**
- Website: https://speechbrain.github.io/
- Code: https://github.com/speechbrain/speechbrain/
- HuggingFace: https://huggingface.co/speechbrain/


# **Citing SpeechBrain**
Please cite SpeechBrain if you use it for your research or business.

```bibtex
@misc{speechbrain,
title={{SpeechBrain}: A General-Purpose Speech Toolkit},
author={Mirco Ravanelli and Titouan Parcollet and Peter Plantinga and Aku Rouhe and Samuele Cornell and Loren Lugosch and Cem Subakan and Nauman Dawalatabad and Abdelwahab Heba and Jianyuan Zhong and Ju-Chieh Chou and Sung-Lin Yeh and Szu-Wei Fu and Chien-Feng Liao and Elena Rastorgueva and François Grondin and William Aris and Hwidong Na and Yan Gao and Renato De Mori and Yoshua Bengio},
year={2021},
eprint={2106.04624},
archivePrefix={arXiv},
primaryClass={eess.AS},
note={arXiv:2106.04624}
}
```

81 changes: 81 additions & 0 deletions recipes/LJSpeech/evaluation/adapters.py
@@ -0,0 +1,81 @@
"""Adapters for specific TTS system

Authors
* Artem Ploujnikov, 2024
"""

from torch import nn


class MelAdapter(nn.Module):
"""An adapter for TTSes that output a MEL spectrogram
and require a vocoder to synthesize an
audio wave

Arguments
---------
vocoder : torch.nn.Module | speechbrain.inference.Pretrained
the vocoder to be used
vocoder_run_opts : dict
run options for the vocoder
"""

def __init__(self, vocoder, vocoder_run_opts=None):
super().__init__()
self.vocoder_fn = vocoder
self.vocoder_run_opts = vocoder_run_opts or {}
self.vocoder = None
self.device = None

def _get_vocoder(self):
"""Instantiates the vocoder, if not already instantiated"""
if self.vocoder is None:
run_opts = dict(self.vocoder_run_opts)
if self.device is not None:
run_opts["device"] = self.device
self.vocoder = self.vocoder_fn(run_opts=run_opts)
return self.vocoder

def forward(self, tts_out):
"""Applies a vocoder to the waveform

Arguments
---------
tts_out : tuple
a (tensor, tensor) tuple with a MEL spectrogram
of shape (batch x mel x length)
and absolute lengths (as in the output of Tacotron2
or similar models)

Returns
-------
wav : torch.Tensor
The waveform
lengths : torch.Tensor
The lengths
"""
mel_outputs, mel_lengths = tts_out[:2]
vocoder = self._get_vocoder()
max_len = mel_lengths.max()
mel_outputs = mel_outputs[:, :, :max_len]
wav = vocoder(mel_outputs)
lengths = mel_lengths / max_len
return wav, lengths

def to(self, device):
"""Transfers the adapter (and the underlying model) to the
specified device

Arguments
---------
device : str | torch.Device
The device


Returns
-------
result : MelAdapter
the adapter (i.e. returns itself)
"""
self.device = device
return super().to(device)