From 3045d3244d3207e29ed43d72a6445bf5207a6291 Mon Sep 17 00:00:00 2001 From: Jourdelune Date: Sat, 1 Jun 2024 17:13:31 +0200 Subject: [PATCH 01/13] [update] calculate alignement between text and audio --- dataset/aeneas_wrapper.py | 41 +++++++++++++++++++++++++++ dataset/exceptions.py | 2 ++ dataset/process.py | 58 +++++++++++++++++++++++++++++++++++++++ requirements.txt | 2 ++ 4 files changed, 103 insertions(+) create mode 100644 dataset/aeneas_wrapper.py create mode 100644 dataset/exceptions.py create mode 100644 dataset/process.py diff --git a/dataset/aeneas_wrapper.py b/dataset/aeneas_wrapper.py new file mode 100644 index 0000000..9b319c4 --- /dev/null +++ b/dataset/aeneas_wrapper.py @@ -0,0 +1,41 @@ +import json +import tempfile + +from aeneas.tools.execute_task import ExecuteTaskCLI + +from dataset.exceptions import AeneasAlignError + + +def aeneas_cli_exec(audio_path: str, lyric_path: str) -> dict: + """Align lyrics with audio + + Args: + audio_path (str): the path to the audio file + lyric_path (str): the path to the lyric file + + Raises: + AeneasAlignError: if Aeneas fails to align lyrics + + Returns: + dict: a dictionary containing the alignment data + """ + + tmp_dir = tempfile.mkdtemp() + + args = [ + "dummy", + audio_path, + lyric_path, + "task_language=en|is_text_type=plain|os_task_file_format=json", + f"{tmp_dir}/lyric.json", + ] + + exit_code = ExecuteTaskCLI(use_sys=False).run(arguments=args) + + if exit_code != 0: + raise AeneasAlignError("Aeneas failed to align lyrics") + + with open(f"{tmp_dir}/lyric.json", "r", encoding="utf-8") as f: + data = json.load(f) + + return data diff --git a/dataset/exceptions.py b/dataset/exceptions.py new file mode 100644 index 0000000..374a290 --- /dev/null +++ b/dataset/exceptions.py @@ -0,0 +1,2 @@ +class AeneasAlignError(Exception): + """Raised when Aeneas fails to align lyrics""" diff --git a/dataset/process.py b/dataset/process.py new file mode 100644 index 0000000..24dd768 --- /dev/null +++ b/dataset/process.py @@ -0,0 +1,58 @@ +import os + +import dataset.exceptions +from dataset.aeneas_wrapper import aeneas_cli_exec + + +class Process: + """Class to process the dataset""" + + def __init__(self, lyric_path: str, audio_path: str): + self.lyric_path = lyric_path + self.audio_path = audio_path + + def _aenas_align(self, audio_path: str, lyric_path: str) -> dict: + """Method to align lyrics with audio + + Args: + audio_path (str): the path to the audio file + lyric_path (str): the path to the lyric file + + Raises: + AeneasAlignError: if Aeneas fails to align lyrics + + Returns: + dict: a dictionary containing the alignment data + """ + + return aeneas_cli_exec(audio_path, lyric_path) + + def _split_audio(self, lyric_path: str, alignement: dict) -> list: + """Method to split audio into 32 seconds segments with the corresponding lyrics + + Args: + lyric_path (str): the path to the lyric file + alignement (dict): the alignment data + + Returns: + list: a list that contain lyrics split into 32 seconds segments + """ + + raise NotImplementedError + + def process(self) -> None: + """Method to process the dataset : + 1. Align lyrics with audio + 2. Split audio into 32 seconds segments + 3. 
Save the segments to the dataset/audio/processed folder in .wav format + """ + + for audio_f in os.listdir(self.audio_path): + audio_path = os.path.join(self.audio_path, audio_f) + lyric_path = os.path.join(self.lyric_path, audio_f.split(".")[0] + ".txt") + + try: + alignement = self._aenas_align(audio_path, lyric_path) + except dataset.exceptions.AeneasAlignError as e: + print(f"Failed to align {audio_f}: {e}") + continue diff --git a/requirements.txt b/requirements.txt index 72a6856..72fefbf 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,3 +9,5 @@ accelerate bitsandbytes evaluate librosa +aeneas +numpy From 02bf9529a18afadca12f7324a21b3ad71a3469cb Mon Sep 17 00:00:00 2001 From: Jourdelune Date: Sun, 2 Jun 2024 11:52:48 +0200 Subject: [PATCH 02/13] [update] align audio by worlds level --- dataset/aeneas_wrapper.py | 27 ++++++++++++++++++++++++--- dataset/process.py | 32 +++++++++++++++++++++++++++++--- 2 files changed, 53 insertions(+), 6 deletions(-) diff --git a/dataset/aeneas_wrapper.py b/dataset/aeneas_wrapper.py index 9b319c4..5afbf8c 100644 --- a/dataset/aeneas_wrapper.py +++ b/dataset/aeneas_wrapper.py @@ -1,10 +1,17 @@ import json +import re import tempfile -from aeneas.tools.execute_task import ExecuteTaskCLI +from aeneas.tools.execute_task import ExecuteTaskCLI, RuntimeConfiguration from dataset.exceptions import AeneasAlignError +rconf = RuntimeConfiguration() +rconf[RuntimeConfiguration.MFCC_MASK_NONSPEECH] = True +rconf[RuntimeConfiguration.MFCC_MASK_NONSPEECH_L3] = True +rconf[RuntimeConfiguration.TTS_CACHE] = True +rconf.set_granularity(3) + def aeneas_cli_exec(audio_path: str, lyric_path: str) -> dict: """Align lyrics with audio @@ -22,15 +29,29 @@ def aeneas_cli_exec(audio_path: str, lyric_path: str) -> dict: tmp_dir = tempfile.mkdtemp() + with open(lyric_path, "r", encoding="utf-8") as f: + lyric = f.read() + + # remove all text between [] + lyric = re.sub(r"\[.*?\]", "\n", lyric) + + # remove when more than 2 new lines + lyric = re.sub(r"\n{1,}", "\n", lyric).strip() + + lyric = lyric.replace(" ", "\n") + + with open(f"{tmp_dir}/lyric.txt", "w", encoding="utf-8") as f: + f.write(lyric) + args = [ "dummy", audio_path, - lyric_path, + f"{tmp_dir}/lyric.txt", "task_language=en|is_text_type=plain|os_task_file_format=json", f"{tmp_dir}/lyric.json", ] - exit_code = ExecuteTaskCLI(use_sys=False).run(arguments=args) + exit_code = ExecuteTaskCLI(use_sys=False, rconf=rconf).run(arguments=args) if exit_code != 0: raise AeneasAlignError("Aeneas failed to align lyrics") diff --git a/dataset/process.py b/dataset/process.py index 24dd768..219c4f8 100644 --- a/dataset/process.py +++ b/dataset/process.py @@ -1,5 +1,7 @@ import os +from pydub import AudioSegment + import dataset.exceptions from dataset.aeneas_wrapper import aeneas_cli_exec @@ -27,18 +29,38 @@ def _aenas_align(self, audio_path: str, lyric_path: str) -> dict: return aeneas_cli_exec(audio_path, lyric_path) - def _split_audio(self, lyric_path: str, alignement: dict) -> list: + def _split_audio( + self, lyric_path: str, alignement: dict, split_windows: int = 32 + ) -> list: """Method to split audio into 32 seconds segments with the corresponding lyrics Args: lyric_path (str): the path to the lyric file alignement (dict): the alignment data + split_windows (int, optional): the size of the split window in seconds. Defaults to 32. 
Returns: - list: a list that contain lyrics split into 32 seconds segments + list: a list of list that contain lyrics split into 32 seconds segments """ - raise NotImplementedError + lyric = open(lyric_path, "r", encoding="utf-8").read() + + segments = [] + start_idx = 0 + end_idx = 0 + + for fragment in alignement["fragments"]: + print(fragment) + end_idx = lyric.find(fragment["lines"][0], start_idx) + windows = (len(segments) + 1) * split_windows + + if float(fragment["begin"]) > windows: + segments.append(lyric[start_idx:end_idx]) + start_idx = end_idx + + segments.append(lyric[start_idx:]) + + print(segments, len(segments)) def process(self) -> None: """Method to process the dataset : @@ -56,3 +78,7 @@ def process(self) -> None: except dataset.exceptions.AeneasAlignError as e: print(f"Failed to align {audio_f}: {e}") continue + + self._split_audio(lyric_path, alignement) + + break From d0ca9ed03d4b0fc09564b38fdf930565fea91ff0 Mon Sep 17 00:00:00 2001 From: Jourdelune Date: Sun, 2 Jun 2024 12:49:31 +0200 Subject: [PATCH 03/13] [update] add ds processing --- .gitignore | 7 +- dataset/aeneas_wrapper.py | 80 ++++++++++---------- dataset/process.py | 150 +++++++++++++++++++++++++++++++------- download_dataset.py | 2 +- process.py | 13 ++++ requirements.txt | 2 +- 6 files changed, 184 insertions(+), 70 deletions(-) create mode 100644 process.py diff --git a/.gitignore b/.gitignore index c368e9b..8f5a25c 100644 --- a/.gitignore +++ b/.gitignore @@ -159,10 +159,9 @@ cython_debug/ # option (not recommended) you can uncomment the following to ignore the entire idea folder. .idea/ -dataset/audio/ -dataset/lyrics/ -dataset/data.json train/ formated_dataset/ -test.py \ No newline at end of file +test.py + +dataset/ \ No newline at end of file diff --git a/dataset/aeneas_wrapper.py b/dataset/aeneas_wrapper.py index 5afbf8c..998a193 100644 --- a/dataset/aeneas_wrapper.py +++ b/dataset/aeneas_wrapper.py @@ -6,57 +6,61 @@ from dataset.exceptions import AeneasAlignError -rconf = RuntimeConfiguration() -rconf[RuntimeConfiguration.MFCC_MASK_NONSPEECH] = True -rconf[RuntimeConfiguration.MFCC_MASK_NONSPEECH_L3] = True -rconf[RuntimeConfiguration.TTS_CACHE] = True -rconf.set_granularity(3) +class AeneasWrapper: + """Wrapper class for Aeneas CLI""" -def aeneas_cli_exec(audio_path: str, lyric_path: str) -> dict: - """Align lyrics with audio + def __init__(self) -> None: + self._rconf = RuntimeConfiguration() + self._rconf[RuntimeConfiguration.MFCC_MASK_NONSPEECH] = True + self._rconf[RuntimeConfiguration.MFCC_MASK_NONSPEECH_L3] = True + self._rconf[RuntimeConfiguration.TTS_CACHE] = True + self._rconf.set_granularity(3) - Args: - audio_path (str): the path to the audio file - lyric_path (str): the path to the lyric file + def aeneas_cli_exec(self, audio_path: str, lyric_path: str) -> dict: + """Align lyrics with audio - Raises: - AeneasAlignError: if Aeneas fails to align lyrics + Args: + audio_path (str): the path to the audio file + lyric_path (str): the path to the lyric file - Returns: - dict: a dictionary containing the alignment data - """ + Raises: + AeneasAlignError: if Aeneas fails to align lyrics - tmp_dir = tempfile.mkdtemp() + Returns: + dict: a dictionary containing the alignment data + """ - with open(lyric_path, "r", encoding="utf-8") as f: - lyric = f.read() + tmp_dir = tempfile.mkdtemp() - # remove all text between [] - lyric = re.sub(r"\[.*?\]", "\n", lyric) + with open(lyric_path, "r", encoding="utf-8") as f: + lyric = f.read() - # remove when more than 2 new lines - lyric = re.sub(r"\n{1,}", 
"\n", lyric).strip() + # remove all text between [] + lyric = re.sub(r"\[.*?\]", "\n", lyric) - lyric = lyric.replace(" ", "\n") + # remove when more than 2 new lines + lyric = re.sub(r"\n{1,}", "\n", lyric).strip() - with open(f"{tmp_dir}/lyric.txt", "w", encoding="utf-8") as f: - f.write(lyric) + lyric = lyric.replace(" ", "\n") - args = [ - "dummy", - audio_path, - f"{tmp_dir}/lyric.txt", - "task_language=en|is_text_type=plain|os_task_file_format=json", - f"{tmp_dir}/lyric.json", - ] + with open(f"{tmp_dir}/lyric.txt", "w", encoding="utf-8") as f: + f.write(lyric) - exit_code = ExecuteTaskCLI(use_sys=False, rconf=rconf).run(arguments=args) + args = [ + "dummy", + audio_path, + f"{tmp_dir}/lyric.txt", + "task_language=en|is_text_type=plain|os_task_file_format=json", + f"{tmp_dir}/lyric.json", + ] - if exit_code != 0: - raise AeneasAlignError("Aeneas failed to align lyrics") + exit_code = ExecuteTaskCLI(use_sys=False, rconf=self._rconf).run(arguments=args) - with open(f"{tmp_dir}/lyric.json", "r", encoding="utf-8") as f: - data = json.load(f) + if exit_code != 0: + raise AeneasAlignError("Aeneas failed to align lyrics") - return data + with open(f"{tmp_dir}/lyric.json", "r", encoding="utf-8") as f: + data = json.load(f) + + return data diff --git a/dataset/process.py b/dataset/process.py index 219c4f8..4eb4d33 100644 --- a/dataset/process.py +++ b/dataset/process.py @@ -1,37 +1,84 @@ import os +from typing import List from pydub import AudioSegment import dataset.exceptions -from dataset.aeneas_wrapper import aeneas_cli_exec +from dataset.aeneas_wrapper import AeneasWrapper -class Process: +class DatasetProcess: """Class to process the dataset""" - def __init__(self, lyric_path: str, audio_path: str): + def __init__( + self, + lyric_path: str, + audio_path: str, + export_path: str = None, + clean: bool = False, + ): + """Constructor to initialize the DatasetProcess class + + Args: + lyric_path (str): the path to the lyrics folder + audio_path (str): the path to the audio folder + export_path (str, optional): the path to export data. Defaults to None. + clean (bool, optional): remove all data in the export path. Defaults to False. + """ + self.lyric_path = lyric_path self.audio_path = audio_path + self.export_path = export_path + + if clean: + self.remove_export_folder() + + self.create_export_folder() + + self.aeneas = AeneasWrapper() + + def create_export_folder(self) -> None: + """Method to create the export folder""" + + if not os.path.exists(self.export_path): + os.makedirs(self.export_path) + + if not os.path.exists(f"{self.export_path}/audio"): + os.makedirs(f"{self.export_path}/audio") + + if not os.path.exists(f"{self.export_path}/lyrics"): + os.makedirs(f"{self.export_path}/lyrics") - def _aenas_align(self, audio_path: str, lyric_path: str) -> dict: - """Method to align lyrics with audio + def remove_export_folder(self) -> None: + """Method to remove the export folder""" + + if os.path.exists(self.export_path): + os.rmdir(self.export_path) + + def _split_audio( + self, audio_path: str, split_windows: int = 32 + ) -> List[AudioSegment]: + """Method to split audio into 32 seconds segments Args: audio_path (str): the path to the audio file - lyric_path (str): the path to the lyric file - - Raises: - AeneasAlignError: if Aeneas fails to align lyrics + split_windows (int, optional): the size of the split window in seconds. Defaults to 32. 
Returns: - dict: a dictionary containing the alignment data + list: a list of AudioSegment that contain audio split into 32 seconds segments """ - return aeneas_cli_exec(audio_path, lyric_path) + audio = AudioSegment.from_file(audio_path) + segments = [] - def _split_audio( + for i in range(0, len(audio), split_windows * 1000): + segments.append(audio[i : i + split_windows * 1000]) + + return segments + + def _split_lyric( self, lyric_path: str, alignement: dict, split_windows: int = 32 - ) -> list: + ) -> List[str]: """Method to split audio into 32 seconds segments with the corresponding lyrics Args: @@ -43,15 +90,15 @@ def _split_audio( list: a list of list that contain lyrics split into 32 seconds segments """ - lyric = open(lyric_path, "r", encoding="utf-8").read() + with open(lyric_path, "r", encoding="utf-8") as f: + lyric = f.read() segments = [] start_idx = 0 end_idx = 0 for fragment in alignement["fragments"]: - print(fragment) - end_idx = lyric.find(fragment["lines"][0], start_idx) + end_idx = lyric.find(fragment["lines"][0], end_idx) windows = (len(segments) + 1) * split_windows if float(fragment["begin"]) > windows: @@ -60,25 +107,76 @@ def _split_audio( segments.append(lyric[start_idx:]) - print(segments, len(segments)) + return segments - def process(self) -> None: - """Method to process the dataset : - 1. Align lyrics with audio - 2. Split audio into 32 seconds segments - 3. Save the segments to the dataset/audio/processed folder in .wav format + def _export_audio(self, audios: List[AudioSegment], file_name: str) -> None: + """Method to export audio segments to .wav format + + Args: + audios (List[AudioSegment]): a list of AudioSegment + file_name (str): the name of the file + """ + + for i, audio in enumerate(audios): + path = f"{self.audio_path}/{file_name}_{i}.wav" + + if self.export_path: + path = f"{self.export_path}/audio/{file_name}_{i}.wav" + + audio.export(path, format="wav") + + def _export_lyric(self, lyrics: List[str], file_name: str) -> None: + """Method to export lyrics segments to .txt format + + Args: + lyrics (List[str]): a list of lyrics + file_name (str): the name of the file + """ + + for i, lyric in enumerate(lyrics): + path = f"{self.lyric_path}/{file_name}_{i}.txt" + + if self.export_path: + path = f"{self.export_path}/lyrics/{file_name}_{i}.txt" + + with open(path, "w", encoding="utf-8") as f: + f.write(lyric) + + def process(self, remove: bool = False) -> None: + """Method to process the dataset + 1. Align lyrics with audio + 2. Split audio into 32 seconds segments + 3. Save the segments to the dataset/audio/processed folder in .wav format + + Args: + remove (bool, optional): remove the processed file. Defaults to False. 
""" - for audio_f in os.listdir(self.audio_path): + nbm_files = len(os.listdir(self.audio_path)) + for i, audio_f in enumerate(os.listdir(self.audio_path)): + if not audio_f.endswith(".ogg") and not audio_f.endswith(".mp4"): + continue + audio_path = os.path.join(self.audio_path, audio_f) lyric_path = os.path.join(self.lyric_path, audio_f.split(".")[0] + ".txt") try: - alignement = self._aenas_align(audio_path, lyric_path) + alignement = self.aeneas.aeneas_cli_exec(audio_path, lyric_path) except dataset.exceptions.AeneasAlignError as e: print(f"Failed to align {audio_f}: {e}") continue - self._split_audio(lyric_path, alignement) + lyric_segments = self._split_lyric(lyric_path, alignement) + audio_segments = self._split_audio(audio_path) + + # save the audio segments and the lyrics + self._export_audio(audio_segments, audio_f.split(".")[0]) + self._export_lyric(lyric_segments, audio_f.split(".")[0]) + + print( + f"Processed {i}/ {nbm_files} - {round(i/nbm_files*100, 2)}%", end="\r" + ) - break + if remove: + os.remove(lyric_path) + os.remove(audio_path) diff --git a/download_dataset.py b/download_dataset.py index bbef722..085cc1f 100644 --- a/download_dataset.py +++ b/download_dataset.py @@ -4,7 +4,7 @@ parser = argparse.ArgumentParser( - description="Download images from Sonauto dataset", + description="Download music from Sonauto API", ) parser.add_argument("--num_images", type=int, default=10000) parser.add_argument("--clean", type=bool, default=True) diff --git a/process.py b/process.py new file mode 100644 index 0000000..c79b687 --- /dev/null +++ b/process.py @@ -0,0 +1,13 @@ +import argparse + +from dataset.process import DatasetProcess + + +parser = argparse.ArgumentParser( + description="Process the dataset", +) +parser.add_argument("--num_images", type=int, default=10000) +parser.add_argument("--clean", type=bool, default=True) + +args = parser.parse_args() +api = diff --git a/requirements.txt b/requirements.txt index 72fefbf..7df2311 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,5 +9,5 @@ accelerate bitsandbytes evaluate librosa -aeneas numpy +aeneas From bac2ec88cd0da709e8929d8337646cf33865f338 Mon Sep 17 00:00:00 2001 From: Jourdelune Date: Sun, 2 Jun 2024 12:52:20 +0200 Subject: [PATCH 04/13] [update] add cli cmd --- README.md | 10 ++++++++++ process.py | 15 ++++++++++++--- 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index f35d78f..4b261ee 100644 --- a/README.md +++ b/README.md @@ -35,6 +35,16 @@ dataset where `0.wav` corresponds to the audio file and `0.txt` corresponds to the lyrics transcription of the audio file. +## Process the dataset + +To process the dataset, run the following command: + +```bash +python process_dataset.py --clean +``` + +The process will split the audio in chunks of 32 seconds and split the lyrics. + ## License This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details. 
\ No newline at end of file diff --git a/process.py b/process.py index c79b687..32d4a2b 100644 --- a/process.py +++ b/process.py @@ -6,8 +6,17 @@ parser = argparse.ArgumentParser( description="Process the dataset", ) -parser.add_argument("--num_images", type=int, default=10000) -parser.add_argument("--clean", type=bool, default=True) +parser.add_argument("--audio_path", type=str, default="dataset/audio") +parser.add_argument("--lyric_path", type=str, default="dataset/lyrics") +parser.add_argument("--export_path", type=str, default="dataset/export") +parser.add_argument("--clean", type=bool, default=False) args = parser.parse_args() -api = +process = DatasetProcess( + lyric_path=args.lyric_path, + audio_path=args.audio_path, + export_path=args.export_path, + clean=args.clean, +) + +process.process() From 82020d4f329994acf5866cd29d11ca82363cfe31 Mon Sep 17 00:00:00 2001 From: Jourdelune Date: Sun, 2 Jun 2024 12:55:23 +0200 Subject: [PATCH 05/13] [update] add sampling rate --- dataset/process.py | 6 ++++++ process.py => process_dataset.py | 2 ++ 2 files changed, 8 insertions(+) rename process.py => process_dataset.py (86%) diff --git a/dataset/process.py b/dataset/process.py index 4eb4d33..01ec2b3 100644 --- a/dataset/process.py +++ b/dataset/process.py @@ -14,6 +14,7 @@ def __init__( self, lyric_path: str, audio_path: str, + sample_rate: int = None, export_path: str = None, clean: bool = False, ): @@ -22,6 +23,7 @@ def __init__( Args: lyric_path (str): the path to the lyrics folder audio_path (str): the path to the audio folder + sample_rate (int, optional): the sample rate of the audio. Defaults to None. export_path (str, optional): the path to export data. Defaults to None. clean (bool, optional): remove all data in the export path. Defaults to False. 
""" @@ -29,6 +31,7 @@ def __init__( self.lyric_path = lyric_path self.audio_path = audio_path self.export_path = export_path + self.sample_rate = sample_rate if clean: self.remove_export_folder() @@ -123,6 +126,9 @@ def _export_audio(self, audios: List[AudioSegment], file_name: str) -> None: if self.export_path: path = f"{self.export_path}/audio/{file_name}_{i}.wav" + if self.sample_rate: + audio = audio.set_frame_rate(self.sample_rate) + audio.export(path, format="wav") def _export_lyric(self, lyrics: List[str], file_name: str) -> None: diff --git a/process.py b/process_dataset.py similarity index 86% rename from process.py rename to process_dataset.py index 32d4a2b..16fb937 100644 --- a/process.py +++ b/process_dataset.py @@ -9,12 +9,14 @@ parser.add_argument("--audio_path", type=str, default="dataset/audio") parser.add_argument("--lyric_path", type=str, default="dataset/lyrics") parser.add_argument("--export_path", type=str, default="dataset/export") +parser.add_argument("--sample_rate", type=int, default=None) parser.add_argument("--clean", type=bool, default=False) args = parser.parse_args() process = DatasetProcess( lyric_path=args.lyric_path, audio_path=args.audio_path, + sample_rate=args.sample_rate, export_path=args.export_path, clean=args.clean, ) From 02bd279d70459f13dd12fc0c5d8cb37db8d7b455 Mon Sep 17 00:00:00 2001 From: Jourdelune Date: Sun, 2 Jun 2024 12:58:18 +0200 Subject: [PATCH 06/13] [fix] trying to update pylint to dl aeneas --- .github/workflows/pylint.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml index f0dca7c..ad78a74 100644 --- a/.github/workflows/pylint.yml +++ b/.github/workflows/pylint.yml @@ -18,6 +18,7 @@ jobs: run: | python -m pip install --upgrade pip pip install pylint + pip install numpy && pip install aeneas pip install -r requirements.txt - name: Analysing the code with pylint run: | From f99247b51a7a1ca00a069be91cfc56a58538427c Mon Sep 17 00:00:00 2001 From: Jourdelune Date: Sun, 2 Jun 2024 13:08:32 +0200 Subject: [PATCH 07/13] [fix] trying to fix pylint aeneas install --- .github/workflows/pylint.yml | 2 +- .pylint_requirements.txt | 13 +++++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) create mode 100644 .pylint_requirements.txt diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml index ad78a74..b3e8f95 100644 --- a/.github/workflows/pylint.yml +++ b/.github/workflows/pylint.yml @@ -19,7 +19,7 @@ jobs: python -m pip install --upgrade pip pip install pylint pip install numpy && pip install aeneas - pip install -r requirements.txt + pip install -r .pylint_requirements.txt - name: Analysing the code with pylint run: | pylint $(git ls-files '*.py') --rcfile=.pylintc diff --git a/.pylint_requirements.txt b/.pylint_requirements.txt new file mode 100644 index 0000000..4c94946 --- /dev/null +++ b/.pylint_requirements.txt @@ -0,0 +1,13 @@ +requests +orjson +jiwer +transformers +torch +torchaudio +datasets +accelerate +bitsandbytes +evaluate +librosa +numpy + From 989dfec06cfd1d0954a911a031414dbdecfc9c50 Mon Sep 17 00:00:00 2001 From: Jourdelune Date: Sun, 2 Jun 2024 13:09:47 +0200 Subject: [PATCH 08/13] [fix] force dl numpy --- .github/workflows/pylint.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml index b3e8f95..d6a9dec 100644 --- a/.github/workflows/pylint.yml +++ b/.github/workflows/pylint.yml @@ -18,6 +18,7 @@ jobs: run: | python -m pip install --upgrade pip pip install pylint + sudo pip 
install numpy pip install numpy && pip install aeneas pip install -r .pylint_requirements.txt - name: Analysing the code with pylint From 8ecce29dcc1599c45b1c4619ae7686c2b8b4eb3f Mon Sep 17 00:00:00 2001 From: Jourdelune Date: Sun, 2 Jun 2024 13:12:32 +0200 Subject: [PATCH 09/13] [fix] ignore aeneas package --- .github/workflows/pylint.yml | 2 -- .pylintc | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml index d6a9dec..0df1f5d 100644 --- a/.github/workflows/pylint.yml +++ b/.github/workflows/pylint.yml @@ -18,8 +18,6 @@ jobs: run: | python -m pip install --upgrade pip pip install pylint - sudo pip install numpy - pip install numpy && pip install aeneas pip install -r .pylint_requirements.txt - name: Analysing the code with pylint run: | diff --git a/.pylintc b/.pylintc index 48f344d..d395a63 100644 --- a/.pylintc +++ b/.pylintc @@ -63,7 +63,7 @@ ignore-patterns=^\.# # (useful for modules/projects where namespaces are manipulated during runtime # and thus existing member attributes cannot be deduced by static analysis). It # supports qualified module names, as well as Unix pattern matching. -ignored-modules= +ignored-modules=aeneas # Python code to execute, usually for sys.path manipulation such as # pygtk.require(). From 234e0c411ca688b45bd844db1b413a3065d54ec3 Mon Sep 17 00:00:00 2001 From: Jourdelune Date: Sun, 2 Jun 2024 13:56:50 +0200 Subject: [PATCH 10/13] [fix] update requirements --- dataset/process.py | 5 +++-- requirements.txt | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/dataset/process.py b/dataset/process.py index 01ec2b3..0b4b070 100644 --- a/dataset/process.py +++ b/dataset/process.py @@ -1,4 +1,5 @@ import os +import shutil from typing import List from pydub import AudioSegment @@ -33,7 +34,7 @@ def __init__( self.export_path = export_path self.sample_rate = sample_rate - if clean: + if clean and self.export_path and os.path.exists(self.export_path): self.remove_export_folder() self.create_export_folder() @@ -56,7 +57,7 @@ def remove_export_folder(self) -> None: """Method to remove the export folder""" if os.path.exists(self.export_path): - os.rmdir(self.export_path) + shutil.rmtree(self.export_path) def _split_audio( self, audio_path: str, split_windows: int = 32 diff --git a/requirements.txt b/requirements.txt index 7df2311..ae1418e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,3 +11,4 @@ evaluate librosa numpy aeneas +pydub From b8288f94f95352ef1a854682ba18d8cd0b0a296d Mon Sep 17 00:00:00 2001 From: Jourdelune Date: Sun, 2 Jun 2024 15:06:45 +0200 Subject: [PATCH 11/13] [update] change training script & opti --- dataset/process.py | 2 +- train.py | 17 ++++------------- training/train.py | 29 +++++++++++++++++++++-------- training/utils.py | 1 - 4 files changed, 26 insertions(+), 23 deletions(-) diff --git a/dataset/process.py b/dataset/process.py index 0b4b070..7f017ad 100644 --- a/dataset/process.py +++ b/dataset/process.py @@ -34,7 +34,7 @@ def __init__( self.export_path = export_path self.sample_rate = sample_rate - if clean and self.export_path and os.path.exists(self.export_path): + if clean and self.export_path: self.remove_export_folder() self.create_export_folder() diff --git a/train.py b/train.py index 7ed6691..4c4b93f 100644 --- a/train.py +++ b/train.py @@ -1,19 +1,10 @@ -from datasets import DatasetDict - -from training.train import Trainer - from training import utils +from training.train import Trainer -LOAD_DATASET = True +dataset = 
utils.gather_dataset("./dataset/export") +dataset = dataset.train_test_split(test_size=0.1) -if LOAD_DATASET: - dataset = utils.gather_dataset("./dataset") - dataset = dataset.train_test_split(test_size=0.1) -else: - dataset = DatasetDict.load_from_disk("./formated_dataset") trainer = Trainer(dataset) -if LOAD_DATASET: - dataset = trainer.process_dataset(dataset) - dataset.save_to_disk("./formated_dataset") +dataset = trainer.process_dataset(dataset) trainer.train() diff --git a/training/train.py b/training/train.py index 2e5f6b9..fd6cc40 100644 --- a/training/train.py +++ b/training/train.py @@ -1,6 +1,7 @@ """ This module contains the Trainer class which is responsible for training whisper on predicting lyrics. """ + import warnings import evaluate @@ -8,7 +9,12 @@ import numpy as np import torch from datasets import Dataset -from transformers import WhisperProcessor, WhisperForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer +from transformers import ( + WhisperProcessor, + WhisperForConditionalGeneration, + Seq2SeqTrainingArguments, + Seq2SeqTrainer, +) from transformers.models.whisper.english_normalizer import BasicTextNormalizer from training.collator import DataCollatorSpeechSeq2SeqWithPadding @@ -22,17 +28,19 @@ class Trainer: """ A class that represents the trainer for the whisper model. """ - def __init__(self, dataset=None, model_name="openai/whisper-small", ): + + def __init__( + self, + dataset=None, + model_name="openai/whisper-small", + ): """ The constructor for the Trainer class. The dataset is optional and can be added later with the method process_dataset. The dataset should be formated and already mapped to the columns "audio" and "lyrics" and ready for training. :param dataset: The dataset to train the model on. """ - self.processor = WhisperProcessor.from_pretrained( - model_name, - task="transcribe" - ) + self.processor = WhisperProcessor.from_pretrained(model_name, task="transcribe") self.model = WhisperForConditionalGeneration.from_pretrained(model_name) self.dataset = dataset self.data_collator = DataCollatorSpeechSeq2SeqWithPadding(self.processor) @@ -48,7 +56,9 @@ def prepare_tokenizer(self) -> None: special_tokens_to_add.append(f"[VERSE {i}]") special_tokens_to_add.append("[CHORUS]") special_tokens_to_add.append("[BRIDGE]") - self.processor.tokenizer.add_special_tokens({"additional_special_tokens": special_tokens_to_add}) + self.processor.tokenizer.add_special_tokens( + {"additional_special_tokens": special_tokens_to_add} + ) self.model.resize_token_embeddings(len(self.processor.tokenizer)) def process_dataset(self, dataset) -> Dataset: @@ -56,6 +66,7 @@ def process_dataset(self, dataset) -> Dataset: A method that processes the dataset. 
:return: None """ + def prepare_dataset(example): target_sr = self.processor.feature_extractor.sampling_rate with warnings.catch_warnings(): @@ -110,7 +121,9 @@ def compute_metrics(self, pred): label_str_norm = [NORMALIZER(label) for label in label_str] # filtering step to only evaluate the samples that correspond to non-zero references: pred_str_norm = [ - pred_str_norm[i] for i in range(len(pred_str_norm)) if len(label_str_norm[i]) > 0 + pred_str_norm[i] + for i in range(len(pred_str_norm)) + if len(label_str_norm[i]) > 0 ] label_str_norm = [ label_str_norm[i] diff --git a/training/utils.py b/training/utils.py index 10d96fa..00392fa 100644 --- a/training/utils.py +++ b/training/utils.py @@ -18,7 +18,6 @@ def gather_dataset(path: str) -> Dataset: """ def gen(): - i = 0 # use to regenerate the dataset audios = glob.glob(path + "/audio/*") lyrics = glob.glob(path + "/lyrics/*.txt") for audio, lyric in zip(audios, lyrics): From e7d646ad1fa3ce766bf9e6c0e04c4dc38c0ba7c6 Mon Sep 17 00:00:00 2001 From: Jourdelune Date: Sun, 2 Jun 2024 15:48:00 +0200 Subject: [PATCH 12/13] [update] save model method --- .gitignore | 3 ++- train.py | 3 ++- training/train.py | 9 +++++++++ 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 8f5a25c..26adb6f 100644 --- a/.gitignore +++ b/.gitignore @@ -164,4 +164,5 @@ formated_dataset/ test.py -dataset/ \ No newline at end of file +dataset/ +save.py \ No newline at end of file diff --git a/train.py b/train.py index 4c4b93f..37557c9 100644 --- a/train.py +++ b/train.py @@ -6,5 +6,6 @@ trainer = Trainer(dataset) dataset = trainer.process_dataset(dataset) - trainer.train() + +trainer.save_model("./train") diff --git a/training/train.py b/training/train.py index fd6cc40..867ae67 100644 --- a/training/train.py +++ b/training/train.py @@ -172,3 +172,12 @@ def train(self): tokenizer=self.processor, ) return trainer.train() + + def save_model(self, path: str) -> None: + """ + A method that saves the model. + :param path: The path to save the model. + :return: None + """ + + self.model.save_pretrained(path) From 8d55cbcbc95111048e10dcc7942b859323ab56ce Mon Sep 17 00:00:00 2001 From: Jourdelune Date: Sun, 2 Jun 2024 21:11:27 +0200 Subject: [PATCH 13/13] [update] add instruction to test the model --- README.md | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/README.md b/README.md index 4b261ee..3922fad 100644 --- a/README.md +++ b/README.md @@ -45,6 +45,39 @@ python process_dataset.py --clean The process will split the audio in chunks of 32 seconds and split the lyrics. 
+## Test the model + +Here is an example of how to test the model: + +```py +import librosa +import torch +from transformers import WhisperForConditionalGeneration, WhisperProcessor, pipeline + + +model_name = "Jour/whisper-small-lyric-finetuned" +audio_file = "PATH_TO_AUDIO_FILE" + +device = "cuda:0" if torch.cuda.is_available() else "cpu" +processor = WhisperProcessor.from_pretrained("openai/whisper-small") +model = WhisperForConditionalGeneration.from_pretrained(model_name) + +pipe = pipeline( + "automatic-speech-recognition", + model=model, + tokenizer=processor.tokenizer, + feature_extractor=processor.feature_extractor, + max_new_tokens=128, + chunk_length_s=30, + device=device, +) + +sample, _ = librosa.load(audio_file, sr=processor.feature_extractor.sampling_rate) + +prediction = pipe(sample.copy(), batch_size=8)["text"] +print(prediction) +``` + ## License This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details. \ No newline at end of file
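
Taken together, the thirteen patches in this series add a download script, an aeneas-based alignment and segmentation pipeline (`dataset/process.py`, driven by `process_dataset.py`), and a Whisper fine-tuning setup (`train.py` plus the `training/` package). The sketch below is a minimal end-to-end walkthrough of how those pieces fit after the final patch; it assumes audio and lyrics have already been fetched with `download_dataset.py`, and the 16 kHz sample rate is an illustrative choice (Whisper's feature extractor operates at 16 kHz), not a value fixed by the patches, whose default is `None`.

```python
# Minimal end-to-end sketch of the pipeline added in this patch series.
# Assumes dataset/audio/*.ogg|*.mp4 and dataset/lyrics/*.txt already exist
# (e.g. fetched with download_dataset.py); sample_rate=16000 is illustrative.
from dataset.process import DatasetProcess
from training import utils
from training.train import Trainer

# 1. Align each lyric file with its audio via aeneas, cut 32-second windows,
#    and export matching <name>_<i>.wav / <name>_<i>.txt pairs.
process = DatasetProcess(
    lyric_path="dataset/lyrics",
    audio_path="dataset/audio",
    export_path="dataset/export",
    sample_rate=16000,
    clean=True,
)
process.process()

# 2. Fine-tune openai/whisper-small on the exported segments (mirrors train.py).
dataset = utils.gather_dataset("./dataset/export")
dataset = dataset.train_test_split(test_size=0.1)

trainer = Trainer(dataset)
dataset = trainer.process_dataset(dataset)
trainer.train()
trainer.save_model("./train")
```

Step 1 reproduces what `process_dataset.py` does with its default arguments, and step 2 mirrors the final state of `train.py`; the two are shown together only to make the flow from raw downloads to a saved checkpoint explicit.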