From 1f71ad5d34aaaafc276c77565945ce816d0e204d Mon Sep 17 00:00:00 2001 From: Jourdelune Date: Sun, 9 Jun 2024 12:00:36 +0200 Subject: [PATCH] [update] add common voice fr --- common_voice.py | 57 +++++++++++++++++++++++++++++++++++++++++++++++ train2.py | 14 +++++++++--- training/train.py | 26 ++++++++++----------- 3 files changed, 81 insertions(+), 16 deletions(-) create mode 100644 common_voice.py diff --git a/common_voice.py b/common_voice.py new file mode 100644 index 0000000..fb1578f --- /dev/null +++ b/common_voice.py @@ -0,0 +1,57 @@ +from datasets import Audio, DatasetDict, load_dataset + +from training.train import Trainer + +common_voice = DatasetDict() + +common_voice["train"] = load_dataset( + "mozilla-foundation/common_voice_11_0", + "fr", + split="validation[0:5000]", + use_auth_token=True, +) +common_voice["test"] = load_dataset( + "mozilla-foundation/common_voice_11_0", + "fr", + split="test[0:100]", + use_auth_token=True, +) + +common_voice = common_voice.remove_columns( + [ + "accent", + "age", + "client_id", + "down_votes", + "gender", + "locale", + "path", + "segment", + "up_votes", + ] +) + +common_voice = common_voice.cast_column("audio", Audio(sampling_rate=16000)) + +trainer = Trainer() + + +def prepare_dataset(batch): + # load and resample audio data from 48 to 16kHz + audio = batch["audio"] + + # compute log-Mel input features from input audio array + batch["input_features"] = trainer.feature_extractor( + audio["array"], sampling_rate=audio["sampling_rate"] + ).input_features[0] + + # encode target text to label ids + batch["labels"] = trainer.tokenizer(batch["sentence"]).input_ids + return batch + + +common_voice = common_voice.map( + prepare_dataset, remove_columns=common_voice.column_names["train"], num_proc=1 +) + +trainer.train(common_voice) diff --git a/train2.py b/train2.py index b46b125..8a20ff3 100644 --- a/train2.py +++ b/train2.py @@ -1,6 +1,6 @@ import librosa import numpy as np -from datasets import Audio, DatasetDict, load_dataset +from datasets import Audio, DatasetDict, load_from_disk from training import utils from training.train import Trainer @@ -12,6 +12,7 @@ is_prepared = False + if not is_prepared: target_sr = trainer.processor.feature_extractor.sampling_rate @@ -28,14 +29,21 @@ def prepare_dataset(batch): batch["labels"] = trainer.tokenizer(batch["lyrics"]).input_ids return batch - dataset = dataset.map(prepare_dataset, num_proc=1) + dataset = dataset.map( + prepare_dataset, remove_columns=dataset.column_names, num_proc=1 + ) + + # filter out samples with empty labels + dataset = dataset.filter(lambda x: len(x["labels"]) > 5) # save the processed dataset dataset.save_to_disk("dataset/test/") else: # load the processed dataset - dataset = load_dataset("dataset/test/") + dataset = load_from_disk("dataset/test/") + +print(dataset) dataset = dataset.train_test_split(test_size=0.05) trainer.train(dataset) diff --git a/training/train.py b/training/train.py index 224cc0c..812ede5 100644 --- a/training/train.py +++ b/training/train.py @@ -10,10 +10,13 @@ WhisperForConditionalGeneration, WhisperProcessor, WhisperTokenizer, + logging, ) from training.collator import DataCollatorSpeechSeq2SeqWithPadding +logging.set_verbosity_warning() + class Trainer: """ @@ -23,7 +26,6 @@ class Trainer: def __init__( self, model_name="openai/whisper-tiny", - language="hindi", task="transcribe", output_dir="./whisper-finetuned", ): @@ -31,22 +33,16 @@ def __init__( Args: model_name (str, optional): _description_. Defaults to "openai/whisper-tiny". - language (str, optional): _description_. Defaults to "hindi". task (str, optional): _description_. Defaults to "transcribe". output_dir (str, optional): _description_. Defaults to "./whisper-finetuned". """ self.feature_extractor = WhisperFeatureExtractor.from_pretrained(model_name) - self.tokenizer = WhisperTokenizer.from_pretrained( - model_name, language=language, task=task - ) + self.tokenizer = WhisperTokenizer.from_pretrained(model_name, task=task) - self.processor = WhisperProcessor.from_pretrained( - model_name, language=language, task=task - ) + self.processor = WhisperProcessor.from_pretrained(model_name, task=task) self.model = WhisperForConditionalGeneration.from_pretrained(model_name) - self.model.generation_config.language = language self.model.generation_config.task = task self.model.generation_config.forced_decoder_ids = None @@ -70,8 +66,11 @@ def _compute_metrics(self, pred): pred_str = self.tokenizer.batch_decode(pred_ids, skip_special_tokens=True) label_str = self.tokenizer.batch_decode(label_ids, skip_special_tokens=True) + print(pred_str[0]) + print(label_str[0]) + wer = 100 * self.metric.compute(predictions=pred_str, references=label_str) - print(f"WER: {wer}") + return {"wer": wer} def train(self, dataset): @@ -89,18 +88,19 @@ def train(self, dataset): max_steps=4000, gradient_checkpointing=True, fp16=True, - evaluation_strategy="steps", + eval_strategy="steps", per_device_eval_batch_size=8, predict_with_generate=True, generation_max_length=225, save_steps=80, - eval_steps=40, + eval_steps=80, logging_steps=25, report_to=["tensorboard"], load_best_model_at_end=True, metric_for_best_model="wer", greater_is_better=False, - push_to_hub=True, + push_to_hub=False, + gradient_checkpointing_kwargs={"use_reentrant": False}, ) trainer = Seq2SeqTrainer(