From 3045d3244d3207e29ed43d72a6445bf5207a6291 Mon Sep 17 00:00:00 2001 From: Jourdelune Date: Sat, 1 Jun 2024 17:13:31 +0200 Subject: [PATCH 01/13] [update] calculate alignement between text and audio --- dataset/aeneas_wrapper.py | 41 +++++++++++++++++++++++++++ dataset/exceptions.py | 2 ++ dataset/process.py | 58 +++++++++++++++++++++++++++++++++++++++ requirements.txt | 2 ++ 4 files changed, 103 insertions(+) create mode 100644 dataset/aeneas_wrapper.py create mode 100644 dataset/exceptions.py create mode 100644 dataset/process.py diff --git a/dataset/aeneas_wrapper.py b/dataset/aeneas_wrapper.py new file mode 100644 index 0000000..9b319c4 --- /dev/null +++ b/dataset/aeneas_wrapper.py @@ -0,0 +1,41 @@ +import json +import tempfile + +from aeneas.tools.execute_task import ExecuteTaskCLI + +from dataset.exceptions import AeneasAlignError + + +def aeneas_cli_exec(audio_path: str, lyric_path: str) -> dict: + """Align lyrics with audio + + Args: + audio_path (str): the path to the audio file + lyric_path (str): the path to the lyric file + + Raises: + AeneasAlignError: if Aeneas fails to align lyrics + + Returns: + dict: a dictionary containing the alignment data + """ + + tmp_dir = tempfile.mkdtemp() + + args = [ + "dummy", + audio_path, + lyric_path, + "task_language=en|is_text_type=plain|os_task_file_format=json", + f"{tmp_dir}/lyric.json", + ] + + exit_code = ExecuteTaskCLI(use_sys=False).run(arguments=args) + + if exit_code != 0: + raise AeneasAlignError("Aeneas failed to align lyrics") + + with open(f"{tmp_dir}/lyric.json", "r", encoding="utf-8") as f: + data = json.load(f) + + return data diff --git a/dataset/exceptions.py b/dataset/exceptions.py new file mode 100644 index 0000000..374a290 --- /dev/null +++ b/dataset/exceptions.py @@ -0,0 +1,2 @@ +class AeneasAlignError(Exception): + """Raised when Aeneas fails to align lyrics""" diff --git a/dataset/process.py b/dataset/process.py new file mode 100644 index 0000000..24dd768 --- /dev/null +++ b/dataset/process.py @@ -0,0 +1,58 @@ +import os + +import dataset.exceptions +from dataset.aeneas_wrapper import aeneas_cli_exec + + +class Process: + """Class to process the dataset""" + + def __init__(self, lyric_path: str, audio_path: str): + self.lyric_path = lyric_path + self.audio_path = audio_path + + def _aenas_align(self, audio_path: str, lyric_path: str) -> dict: + """Method to align lyrics with audio + + Args: + audio_path (str): the path to the audio file + lyric_path (str): the path to the lyric file + + Raises: + AeneasAlignError: if Aeneas fails to align lyrics + + Returns: + dict: a dictionary containing the alignment data + """ + + return aeneas_cli_exec(audio_path, lyric_path) + + def _split_audio(self, lyric_path: str, alignement: dict) -> list: + """Method to split audio into 32 seconds segments with the corresponding lyrics + + Args: + lyric_path (str): the path to the lyric file + alignement (dict): the alignment data + + Returns: + list: a list that contain lyrics split into 32 seconds segments + """ + + raise NotImplementedError + + def process(self) -> None: + """Method to process the dataset : + 1. Align lyrics with audio + 2. Split audio into 32 seconds segments + 3. 
Save the segments to the dataset/audio/processed folder in .wav format + """ + + for audio_f in os.listdir(self.audio_path): + audio_path = os.path.join(self.audio_path, audio_f) + lyric_path = os.path.join(self.lyric_path, audio_f.split(".")[0] + ".txt") + + try: + alignement = self._aenas_align(audio_path, lyric_path) + except dataset.exceptions.AeneasAlignError as e: + print(f"Failed to align {audio_f}: {e}") + continue diff --git a/requirements.txt b/requirements.txt index 72a6856..72fefbf 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,3 +9,5 @@ accelerate bitsandbytes evaluate librosa +aeneas +numpy From 02bf9529a18afadca12f7324a21b3ad71a3469cb Mon Sep 17 00:00:00 2001 From: Jourdelune Date: Sun, 2 Jun 2024 11:52:48 +0200 Subject: [PATCH 02/13] [update] align audio by worlds level --- dataset/aeneas_wrapper.py | 27 ++++++++++++++++++++++++--- dataset/process.py | 32 +++++++++++++++++++++++++++++--- 2 files changed, 53 insertions(+), 6 deletions(-) diff --git a/dataset/aeneas_wrapper.py b/dataset/aeneas_wrapper.py index 9b319c4..5afbf8c 100644 --- a/dataset/aeneas_wrapper.py +++ b/dataset/aeneas_wrapper.py @@ -1,10 +1,17 @@ import json +import re import tempfile -from aeneas.tools.execute_task import ExecuteTaskCLI +from aeneas.tools.execute_task import ExecuteTaskCLI, RuntimeConfiguration from dataset.exceptions import AeneasAlignError +rconf = RuntimeConfiguration() +rconf[RuntimeConfiguration.MFCC_MASK_NONSPEECH] = True +rconf[RuntimeConfiguration.MFCC_MASK_NONSPEECH_L3] = True +rconf[RuntimeConfiguration.TTS_CACHE] = True +rconf.set_granularity(3) + def aeneas_cli_exec(audio_path: str, lyric_path: str) -> dict: """Align lyrics with audio @@ -22,15 +29,29 @@ def aeneas_cli_exec(audio_path: str, lyric_path: str) -> dict: tmp_dir = tempfile.mkdtemp() + with open(lyric_path, "r", encoding="utf-8") as f: + lyric = f.read() + + # remove all text between [] + lyric = re.sub(r"\[.*?\]", "\n", lyric) + + # remove when more than 2 new lines + lyric = re.sub(r"\n{1,}", "\n", lyric).strip() + + lyric = lyric.replace(" ", "\n") + + with open(f"{tmp_dir}/lyric.txt", "w", encoding="utf-8") as f: + f.write(lyric) + args = [ "dummy", audio_path, - lyric_path, + f"{tmp_dir}/lyric.txt", "task_language=en|is_text_type=plain|os_task_file_format=json", f"{tmp_dir}/lyric.json", ] - exit_code = ExecuteTaskCLI(use_sys=False).run(arguments=args) + exit_code = ExecuteTaskCLI(use_sys=False, rconf=rconf).run(arguments=args) if exit_code != 0: raise AeneasAlignError("Aeneas failed to align lyrics") diff --git a/dataset/process.py b/dataset/process.py index 24dd768..219c4f8 100644 --- a/dataset/process.py +++ b/dataset/process.py @@ -1,5 +1,7 @@ import os +from pydub import AudioSegment + import dataset.exceptions from dataset.aeneas_wrapper import aeneas_cli_exec @@ -27,18 +29,38 @@ def _aenas_align(self, audio_path: str, lyric_path: str) -> dict: return aeneas_cli_exec(audio_path, lyric_path) - def _split_audio(self, lyric_path: str, alignement: dict) -> list: + def _split_audio( + self, lyric_path: str, alignement: dict, split_windows: int = 32 + ) -> list: """Method to split audio into 32 seconds segments with the corresponding lyrics Args: lyric_path (str): the path to the lyric file alignement (dict): the alignment data + split_windows (int, optional): the size of the split window in seconds. Defaults to 32. 
Returns: - list: a list that contain lyrics split into 32 seconds segments + list: a list of list that contain lyrics split into 32 seconds segments """ - raise NotImplementedError + lyric = open(lyric_path, "r", encoding="utf-8").read() + + segments = [] + start_idx = 0 + end_idx = 0 + + for fragment in alignement["fragments"]: + print(fragment) + end_idx = lyric.find(fragment["lines"][0], start_idx) + windows = (len(segments) + 1) * split_windows + + if float(fragment["begin"]) > windows: + segments.append(lyric[start_idx:end_idx]) + start_idx = end_idx + + segments.append(lyric[start_idx:]) + + print(segments, len(segments)) def process(self) -> None: """Method to process the dataset : @@ -56,3 +78,7 @@ def process(self) -> None: except dataset.exceptions.AeneasAlignError as e: print(f"Failed to align {audio_f}: {e}") continue + + self._split_audio(lyric_path, alignement) + + break From d0ca9ed03d4b0fc09564b38fdf930565fea91ff0 Mon Sep 17 00:00:00 2001 From: Jourdelune Date: Sun, 2 Jun 2024 12:49:31 +0200 Subject: [PATCH 03/13] [update] add ds processing --- .gitignore | 7 +- dataset/aeneas_wrapper.py | 80 ++++++++++---------- dataset/process.py | 150 +++++++++++++++++++++++++++++++------- download_dataset.py | 2 +- process.py | 13 ++++ requirements.txt | 2 +- 6 files changed, 184 insertions(+), 70 deletions(-) create mode 100644 process.py diff --git a/.gitignore b/.gitignore index c368e9b..8f5a25c 100644 --- a/.gitignore +++ b/.gitignore @@ -159,10 +159,9 @@ cython_debug/ # option (not recommended) you can uncomment the following to ignore the entire idea folder. .idea/ -dataset/audio/ -dataset/lyrics/ -dataset/data.json train/ formated_dataset/ -test.py \ No newline at end of file +test.py + +dataset/ \ No newline at end of file diff --git a/dataset/aeneas_wrapper.py b/dataset/aeneas_wrapper.py index 5afbf8c..998a193 100644 --- a/dataset/aeneas_wrapper.py +++ b/dataset/aeneas_wrapper.py @@ -6,57 +6,61 @@ from dataset.exceptions import AeneasAlignError -rconf = RuntimeConfiguration() -rconf[RuntimeConfiguration.MFCC_MASK_NONSPEECH] = True -rconf[RuntimeConfiguration.MFCC_MASK_NONSPEECH_L3] = True -rconf[RuntimeConfiguration.TTS_CACHE] = True -rconf.set_granularity(3) +class AeneasWrapper: + """Wrapper class for Aeneas CLI""" -def aeneas_cli_exec(audio_path: str, lyric_path: str) -> dict: - """Align lyrics with audio + def __init__(self) -> None: + self._rconf = RuntimeConfiguration() + self._rconf[RuntimeConfiguration.MFCC_MASK_NONSPEECH] = True + self._rconf[RuntimeConfiguration.MFCC_MASK_NONSPEECH_L3] = True + self._rconf[RuntimeConfiguration.TTS_CACHE] = True + self._rconf.set_granularity(3) - Args: - audio_path (str): the path to the audio file - lyric_path (str): the path to the lyric file + def aeneas_cli_exec(self, audio_path: str, lyric_path: str) -> dict: + """Align lyrics with audio - Raises: - AeneasAlignError: if Aeneas fails to align lyrics + Args: + audio_path (str): the path to the audio file + lyric_path (str): the path to the lyric file - Returns: - dict: a dictionary containing the alignment data - """ + Raises: + AeneasAlignError: if Aeneas fails to align lyrics - tmp_dir = tempfile.mkdtemp() + Returns: + dict: a dictionary containing the alignment data + """ - with open(lyric_path, "r", encoding="utf-8") as f: - lyric = f.read() + tmp_dir = tempfile.mkdtemp() - # remove all text between [] - lyric = re.sub(r"\[.*?\]", "\n", lyric) + with open(lyric_path, "r", encoding="utf-8") as f: + lyric = f.read() - # remove when more than 2 new lines - lyric = re.sub(r"\n{1,}", 
"\n", lyric).strip() + # remove all text between [] + lyric = re.sub(r"\[.*?\]", "\n", lyric) - lyric = lyric.replace(" ", "\n") + # remove when more than 2 new lines + lyric = re.sub(r"\n{1,}", "\n", lyric).strip() - with open(f"{tmp_dir}/lyric.txt", "w", encoding="utf-8") as f: - f.write(lyric) + lyric = lyric.replace(" ", "\n") - args = [ - "dummy", - audio_path, - f"{tmp_dir}/lyric.txt", - "task_language=en|is_text_type=plain|os_task_file_format=json", - f"{tmp_dir}/lyric.json", - ] + with open(f"{tmp_dir}/lyric.txt", "w", encoding="utf-8") as f: + f.write(lyric) - exit_code = ExecuteTaskCLI(use_sys=False, rconf=rconf).run(arguments=args) + args = [ + "dummy", + audio_path, + f"{tmp_dir}/lyric.txt", + "task_language=en|is_text_type=plain|os_task_file_format=json", + f"{tmp_dir}/lyric.json", + ] - if exit_code != 0: - raise AeneasAlignError("Aeneas failed to align lyrics") + exit_code = ExecuteTaskCLI(use_sys=False, rconf=self._rconf).run(arguments=args) - with open(f"{tmp_dir}/lyric.json", "r", encoding="utf-8") as f: - data = json.load(f) + if exit_code != 0: + raise AeneasAlignError("Aeneas failed to align lyrics") - return data + with open(f"{tmp_dir}/lyric.json", "r", encoding="utf-8") as f: + data = json.load(f) + + return data diff --git a/dataset/process.py b/dataset/process.py index 219c4f8..4eb4d33 100644 --- a/dataset/process.py +++ b/dataset/process.py @@ -1,37 +1,84 @@ import os +from typing import List from pydub import AudioSegment import dataset.exceptions -from dataset.aeneas_wrapper import aeneas_cli_exec +from dataset.aeneas_wrapper import AeneasWrapper -class Process: +class DatasetProcess: """Class to process the dataset""" - def __init__(self, lyric_path: str, audio_path: str): + def __init__( + self, + lyric_path: str, + audio_path: str, + export_path: str = None, + clean: bool = False, + ): + """Constructor to initialize the DatasetProcess class + + Args: + lyric_path (str): the path to the lyrics folder + audio_path (str): the path to the audio folder + export_path (str, optional): the path to export data. Defaults to None. + clean (bool, optional): remove all data in the export path. Defaults to False. + """ + self.lyric_path = lyric_path self.audio_path = audio_path + self.export_path = export_path + + if clean: + self.remove_export_folder() + + self.create_export_folder() + + self.aeneas = AeneasWrapper() + + def create_export_folder(self) -> None: + """Method to create the export folder""" + + if not os.path.exists(self.export_path): + os.makedirs(self.export_path) + + if not os.path.exists(f"{self.export_path}/audio"): + os.makedirs(f"{self.export_path}/audio") + + if not os.path.exists(f"{self.export_path}/lyrics"): + os.makedirs(f"{self.export_path}/lyrics") - def _aenas_align(self, audio_path: str, lyric_path: str) -> dict: - """Method to align lyrics with audio + def remove_export_folder(self) -> None: + """Method to remove the export folder""" + + if os.path.exists(self.export_path): + os.rmdir(self.export_path) + + def _split_audio( + self, audio_path: str, split_windows: int = 32 + ) -> List[AudioSegment]: + """Method to split audio into 32 seconds segments Args: audio_path (str): the path to the audio file - lyric_path (str): the path to the lyric file - - Raises: - AeneasAlignError: if Aeneas fails to align lyrics + split_windows (int, optional): the size of the split window in seconds. Defaults to 32. 
Returns: - dict: a dictionary containing the alignment data + list: a list of AudioSegment that contain audio split into 32 seconds segments """ - return aeneas_cli_exec(audio_path, lyric_path) + audio = AudioSegment.from_file(audio_path) + segments = [] - def _split_audio( + for i in range(0, len(audio), split_windows * 1000): + segments.append(audio[i : i + split_windows * 1000]) + + return segments + + def _split_lyric( self, lyric_path: str, alignement: dict, split_windows: int = 32 - ) -> list: + ) -> List[str]: """Method to split audio into 32 seconds segments with the corresponding lyrics Args: @@ -43,15 +90,15 @@ def _split_audio( list: a list of list that contain lyrics split into 32 seconds segments """ - lyric = open(lyric_path, "r", encoding="utf-8").read() + with open(lyric_path, "r", encoding="utf-8") as f: + lyric = f.read() segments = [] start_idx = 0 end_idx = 0 for fragment in alignement["fragments"]: - print(fragment) - end_idx = lyric.find(fragment["lines"][0], start_idx) + end_idx = lyric.find(fragment["lines"][0], end_idx) windows = (len(segments) + 1) * split_windows if float(fragment["begin"]) > windows: @@ -60,25 +107,76 @@ def _split_audio( segments.append(lyric[start_idx:]) - print(segments, len(segments)) + return segments - def process(self) -> None: - """Method to process the dataset : - 1. Align lyrics with audio - 2. Split audio into 32 seconds segments - 3. Save the segments to the dataset/audio/processed folder in .wav format + def _export_audio(self, audios: List[AudioSegment], file_name: str) -> None: + """Method to export audio segments to .wav format + + Args: + audios (List[AudioSegment]): a list of AudioSegment + file_name (str): the name of the file + """ + + for i, audio in enumerate(audios): + path = f"{self.audio_path}/{file_name}_{i}.wav" + + if self.export_path: + path = f"{self.export_path}/audio/{file_name}_{i}.wav" + + audio.export(path, format="wav") + + def _export_lyric(self, lyrics: List[str], file_name: str) -> None: + """Method to export lyrics segments to .txt format + + Args: + lyrics (List[str]): a list of lyrics + file_name (str): the name of the file + """ + + for i, lyric in enumerate(lyrics): + path = f"{self.lyric_path}/{file_name}_{i}.txt" + + if self.export_path: + path = f"{self.export_path}/lyrics/{file_name}_{i}.txt" + + with open(path, "w", encoding="utf-8") as f: + f.write(lyric) + + def process(self, remove: bool = False) -> None: + """Method to process the dataset + 1. Align lyrics with audio + 2. Split audio into 32 seconds segments + 3. Save the segments to the dataset/audio/processed folder in .wav format + + Args: + remove (bool, optional): remove the processed file. Defaults to False. 
""" - for audio_f in os.listdir(self.audio_path): + nbm_files = len(os.listdir(self.audio_path)) + for i, audio_f in enumerate(os.listdir(self.audio_path)): + if not audio_f.endswith(".ogg") and not audio_f.endswith(".mp4"): + continue + audio_path = os.path.join(self.audio_path, audio_f) lyric_path = os.path.join(self.lyric_path, audio_f.split(".")[0] + ".txt") try: - alignement = self._aenas_align(audio_path, lyric_path) + alignement = self.aeneas.aeneas_cli_exec(audio_path, lyric_path) except dataset.exceptions.AeneasAlignError as e: print(f"Failed to align {audio_f}: {e}") continue - self._split_audio(lyric_path, alignement) + lyric_segments = self._split_lyric(lyric_path, alignement) + audio_segments = self._split_audio(audio_path) + + # save the audio segments and the lyrics + self._export_audio(audio_segments, audio_f.split(".")[0]) + self._export_lyric(lyric_segments, audio_f.split(".")[0]) + + print( + f"Processed {i}/ {nbm_files} - {round(i/nbm_files*100, 2)}%", end="\r" + ) - break + if remove: + os.remove(lyric_path) + os.remove(audio_path) diff --git a/download_dataset.py b/download_dataset.py index bbef722..085cc1f 100644 --- a/download_dataset.py +++ b/download_dataset.py @@ -4,7 +4,7 @@ parser = argparse.ArgumentParser( - description="Download images from Sonauto dataset", + description="Download music from Sonauto API", ) parser.add_argument("--num_images", type=int, default=10000) parser.add_argument("--clean", type=bool, default=True) diff --git a/process.py b/process.py new file mode 100644 index 0000000..c79b687 --- /dev/null +++ b/process.py @@ -0,0 +1,13 @@ +import argparse + +from dataset.process import DatasetProcess + + +parser = argparse.ArgumentParser( + description="Process the dataset", +) +parser.add_argument("--num_images", type=int, default=10000) +parser.add_argument("--clean", type=bool, default=True) + +args = parser.parse_args() +api = diff --git a/requirements.txt b/requirements.txt index 72fefbf..7df2311 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,5 +9,5 @@ accelerate bitsandbytes evaluate librosa -aeneas numpy +aeneas From bac2ec88cd0da709e8929d8337646cf33865f338 Mon Sep 17 00:00:00 2001 From: Jourdelune Date: Sun, 2 Jun 2024 12:52:20 +0200 Subject: [PATCH 04/13] [update] add cli cmd --- README.md | 10 ++++++++++ process.py | 15 ++++++++++++--- 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index f35d78f..4b261ee 100644 --- a/README.md +++ b/README.md @@ -35,6 +35,16 @@ dataset where `0.wav` corresponds to the audio file and `0.txt` corresponds to the lyrics transcription of the audio file. +## Process the dataset + +To process the dataset, run the following command: + +```bash +python process_dataset.py --clean +``` + +The process will split the audio in chunks of 32 seconds and split the lyrics. + ## License This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details. 
\ No newline at end of file diff --git a/process.py b/process.py index c79b687..32d4a2b 100644 --- a/process.py +++ b/process.py @@ -6,8 +6,17 @@ parser = argparse.ArgumentParser( description="Process the dataset", ) -parser.add_argument("--num_images", type=int, default=10000) -parser.add_argument("--clean", type=bool, default=True) +parser.add_argument("--audio_path", type=str, default="dataset/audio") +parser.add_argument("--lyric_path", type=str, default="dataset/lyrics") +parser.add_argument("--export_path", type=str, default="dataset/export") +parser.add_argument("--clean", type=bool, default=False) args = parser.parse_args() -api = +process = DatasetProcess( + lyric_path=args.lyric_path, + audio_path=args.audio_path, + export_path=args.export_path, + clean=args.clean, +) + +process.process() From 82020d4f329994acf5866cd29d11ca82363cfe31 Mon Sep 17 00:00:00 2001 From: Jourdelune Date: Sun, 2 Jun 2024 12:55:23 +0200 Subject: [PATCH 05/13] [update] add sampling rate --- dataset/process.py | 6 ++++++ process.py => process_dataset.py | 2 ++ 2 files changed, 8 insertions(+) rename process.py => process_dataset.py (86%) diff --git a/dataset/process.py b/dataset/process.py index 4eb4d33..01ec2b3 100644 --- a/dataset/process.py +++ b/dataset/process.py @@ -14,6 +14,7 @@ def __init__( self, lyric_path: str, audio_path: str, + sample_rate: int = None, export_path: str = None, clean: bool = False, ): @@ -22,6 +23,7 @@ def __init__( Args: lyric_path (str): the path to the lyrics folder audio_path (str): the path to the audio folder + sample_rate (int, optional): the sample rate of the audio. Defaults to None. export_path (str, optional): the path to export data. Defaults to None. clean (bool, optional): remove all data in the export path. Defaults to False. 
""" @@ -29,6 +31,7 @@ def __init__( self.lyric_path = lyric_path self.audio_path = audio_path self.export_path = export_path + self.sample_rate = sample_rate if clean: self.remove_export_folder() @@ -123,6 +126,9 @@ def _export_audio(self, audios: List[AudioSegment], file_name: str) -> None: if self.export_path: path = f"{self.export_path}/audio/{file_name}_{i}.wav" + if self.sample_rate: + audio = audio.set_frame_rate(self.sample_rate) + audio.export(path, format="wav") def _export_lyric(self, lyrics: List[str], file_name: str) -> None: diff --git a/process.py b/process_dataset.py similarity index 86% rename from process.py rename to process_dataset.py index 32d4a2b..16fb937 100644 --- a/process.py +++ b/process_dataset.py @@ -9,12 +9,14 @@ parser.add_argument("--audio_path", type=str, default="dataset/audio") parser.add_argument("--lyric_path", type=str, default="dataset/lyrics") parser.add_argument("--export_path", type=str, default="dataset/export") +parser.add_argument("--sample_rate", type=int, default=None) parser.add_argument("--clean", type=bool, default=False) args = parser.parse_args() process = DatasetProcess( lyric_path=args.lyric_path, audio_path=args.audio_path, + sample_rate=args.sample_rate, export_path=args.export_path, clean=args.clean, ) From 02bd279d70459f13dd12fc0c5d8cb37db8d7b455 Mon Sep 17 00:00:00 2001 From: Jourdelune Date: Sun, 2 Jun 2024 12:58:18 +0200 Subject: [PATCH 06/13] [fix] trying to update pylint to dl aeneas --- .github/workflows/pylint.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml index f0dca7c..ad78a74 100644 --- a/.github/workflows/pylint.yml +++ b/.github/workflows/pylint.yml @@ -18,6 +18,7 @@ jobs: run: | python -m pip install --upgrade pip pip install pylint + pip install numpy && pip install aeneas pip install -r requirements.txt - name: Analysing the code with pylint run: | From f99247b51a7a1ca00a069be91cfc56a58538427c Mon Sep 17 00:00:00 2001 From: Jourdelune Date: Sun, 2 Jun 2024 13:08:32 +0200 Subject: [PATCH 07/13] [fix] trying to fix pylint aeneas install --- .github/workflows/pylint.yml | 2 +- .pylint_requirements.txt | 13 +++++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) create mode 100644 .pylint_requirements.txt diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml index ad78a74..b3e8f95 100644 --- a/.github/workflows/pylint.yml +++ b/.github/workflows/pylint.yml @@ -19,7 +19,7 @@ jobs: python -m pip install --upgrade pip pip install pylint pip install numpy && pip install aeneas - pip install -r requirements.txt + pip install -r .pylint_requirements.txt - name: Analysing the code with pylint run: | pylint $(git ls-files '*.py') --rcfile=.pylintc diff --git a/.pylint_requirements.txt b/.pylint_requirements.txt new file mode 100644 index 0000000..4c94946 --- /dev/null +++ b/.pylint_requirements.txt @@ -0,0 +1,13 @@ +requests +orjson +jiwer +transformers +torch +torchaudio +datasets +accelerate +bitsandbytes +evaluate +librosa +numpy + From 989dfec06cfd1d0954a911a031414dbdecfc9c50 Mon Sep 17 00:00:00 2001 From: Jourdelune Date: Sun, 2 Jun 2024 13:09:47 +0200 Subject: [PATCH 08/13] [fix] force dl numpy --- .github/workflows/pylint.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml index b3e8f95..d6a9dec 100644 --- a/.github/workflows/pylint.yml +++ b/.github/workflows/pylint.yml @@ -18,6 +18,7 @@ jobs: run: | python -m pip install --upgrade pip pip install pylint + sudo pip 
install numpy pip install numpy && pip install aeneas pip install -r .pylint_requirements.txt - name: Analysing the code with pylint From 8ecce29dcc1599c45b1c4619ae7686c2b8b4eb3f Mon Sep 17 00:00:00 2001 From: Jourdelune Date: Sun, 2 Jun 2024 13:12:32 +0200 Subject: [PATCH 09/13] [fix] ignore aeneas package --- .github/workflows/pylint.yml | 2 -- .pylintc | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml index d6a9dec..0df1f5d 100644 --- a/.github/workflows/pylint.yml +++ b/.github/workflows/pylint.yml @@ -18,8 +18,6 @@ jobs: run: | python -m pip install --upgrade pip pip install pylint - sudo pip install numpy - pip install numpy && pip install aeneas pip install -r .pylint_requirements.txt - name: Analysing the code with pylint run: | diff --git a/.pylintc b/.pylintc index 48f344d..d395a63 100644 --- a/.pylintc +++ b/.pylintc @@ -63,7 +63,7 @@ ignore-patterns=^\.# # (useful for modules/projects where namespaces are manipulated during runtime # and thus existing member attributes cannot be deduced by static analysis). It # supports qualified module names, as well as Unix pattern matching. -ignored-modules= +ignored-modules=aeneas # Python code to execute, usually for sys.path manipulation such as # pygtk.require(). From 234e0c411ca688b45bd844db1b413a3065d54ec3 Mon Sep 17 00:00:00 2001 From: Jourdelune Date: Sun, 2 Jun 2024 13:56:50 +0200 Subject: [PATCH 10/13] [fix] update requirements --- dataset/process.py | 5 +++-- requirements.txt | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/dataset/process.py b/dataset/process.py index 01ec2b3..0b4b070 100644 --- a/dataset/process.py +++ b/dataset/process.py @@ -1,4 +1,5 @@ import os +import shutil from typing import List from pydub import AudioSegment @@ -33,7 +34,7 @@ def __init__( self.export_path = export_path self.sample_rate = sample_rate - if clean: + if clean and self.export_path and os.path.exists(self.export_path): self.remove_export_folder() self.create_export_folder() @@ -56,7 +57,7 @@ def remove_export_folder(self) -> None: """Method to remove the export folder""" if os.path.exists(self.export_path): - os.rmdir(self.export_path) + shutil.rmtree(self.export_path) def _split_audio( self, audio_path: str, split_windows: int = 32 diff --git a/requirements.txt b/requirements.txt index 7df2311..ae1418e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,3 +11,4 @@ evaluate librosa numpy aeneas +pydub From b8288f94f95352ef1a854682ba18d8cd0b0a296d Mon Sep 17 00:00:00 2001 From: Jourdelune Date: Sun, 2 Jun 2024 15:06:45 +0200 Subject: [PATCH 11/13] [update] change training script & opti --- dataset/process.py | 2 +- train.py | 17 ++++------------- training/train.py | 29 +++++++++++++++++++++-------- training/utils.py | 1 - 4 files changed, 26 insertions(+), 23 deletions(-) diff --git a/dataset/process.py b/dataset/process.py index 0b4b070..7f017ad 100644 --- a/dataset/process.py +++ b/dataset/process.py @@ -34,7 +34,7 @@ def __init__( self.export_path = export_path self.sample_rate = sample_rate - if clean and self.export_path and os.path.exists(self.export_path): + if clean and self.export_path: self.remove_export_folder() self.create_export_folder() diff --git a/train.py b/train.py index 7ed6691..4c4b93f 100644 --- a/train.py +++ b/train.py @@ -1,19 +1,10 @@ -from datasets import DatasetDict - -from training.train import Trainer - from training import utils +from training.train import Trainer -LOAD_DATASET = True +dataset = 
utils.gather_dataset("./dataset/export") +dataset = dataset.train_test_split(test_size=0.1) -if LOAD_DATASET: - dataset = utils.gather_dataset("./dataset") - dataset = dataset.train_test_split(test_size=0.1) -else: - dataset = DatasetDict.load_from_disk("./formated_dataset") trainer = Trainer(dataset) -if LOAD_DATASET: - dataset = trainer.process_dataset(dataset) - dataset.save_to_disk("./formated_dataset") +dataset = trainer.process_dataset(dataset) trainer.train() diff --git a/training/train.py b/training/train.py index 2e5f6b9..fd6cc40 100644 --- a/training/train.py +++ b/training/train.py @@ -1,6 +1,7 @@ """ This module contains the Trainer class which is responsible for training whisper on predicting lyrics. """ + import warnings import evaluate @@ -8,7 +9,12 @@ import numpy as np import torch from datasets import Dataset -from transformers import WhisperProcessor, WhisperForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer +from transformers import ( + WhisperProcessor, + WhisperForConditionalGeneration, + Seq2SeqTrainingArguments, + Seq2SeqTrainer, +) from transformers.models.whisper.english_normalizer import BasicTextNormalizer from training.collator import DataCollatorSpeechSeq2SeqWithPadding @@ -22,17 +28,19 @@ class Trainer: """ A class that represents the trainer for the whisper model. """ - def __init__(self, dataset=None, model_name="openai/whisper-small", ): + + def __init__( + self, + dataset=None, + model_name="openai/whisper-small", + ): """ The constructor for the Trainer class. The dataset is optional and can be added later with the method process_dataset. The dataset should be formated and already mapped to the columns "audio" and "lyrics" and ready for training. :param dataset: The dataset to train the model on. """ - self.processor = WhisperProcessor.from_pretrained( - model_name, - task="transcribe" - ) + self.processor = WhisperProcessor.from_pretrained(model_name, task="transcribe") self.model = WhisperForConditionalGeneration.from_pretrained(model_name) self.dataset = dataset self.data_collator = DataCollatorSpeechSeq2SeqWithPadding(self.processor) @@ -48,7 +56,9 @@ def prepare_tokenizer(self) -> None: special_tokens_to_add.append(f"[VERSE {i}]") special_tokens_to_add.append("[CHORUS]") special_tokens_to_add.append("[BRIDGE]") - self.processor.tokenizer.add_special_tokens({"additional_special_tokens": special_tokens_to_add}) + self.processor.tokenizer.add_special_tokens( + {"additional_special_tokens": special_tokens_to_add} + ) self.model.resize_token_embeddings(len(self.processor.tokenizer)) def process_dataset(self, dataset) -> Dataset: @@ -56,6 +66,7 @@ def process_dataset(self, dataset) -> Dataset: A method that processes the dataset. 
:return: None """ + def prepare_dataset(example): target_sr = self.processor.feature_extractor.sampling_rate with warnings.catch_warnings(): @@ -110,7 +121,9 @@ def compute_metrics(self, pred): label_str_norm = [NORMALIZER(label) for label in label_str] # filtering step to only evaluate the samples that correspond to non-zero references: pred_str_norm = [ - pred_str_norm[i] for i in range(len(pred_str_norm)) if len(label_str_norm[i]) > 0 + pred_str_norm[i] + for i in range(len(pred_str_norm)) + if len(label_str_norm[i]) > 0 ] label_str_norm = [ label_str_norm[i] diff --git a/training/utils.py b/training/utils.py index 10d96fa..00392fa 100644 --- a/training/utils.py +++ b/training/utils.py @@ -18,7 +18,6 @@ def gather_dataset(path: str) -> Dataset: """ def gen(): - i = 0 # use to regenerate the dataset audios = glob.glob(path + "/audio/*") lyrics = glob.glob(path + "/lyrics/*.txt") for audio, lyric in zip(audios, lyrics): From e7d646ad1fa3ce766bf9e6c0e04c4dc38c0ba7c6 Mon Sep 17 00:00:00 2001 From: Jourdelune Date: Sun, 2 Jun 2024 15:48:00 +0200 Subject: [PATCH 12/13] [update] save model method --- .gitignore | 3 ++- train.py | 3 ++- training/train.py | 9 +++++++++ 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 8f5a25c..26adb6f 100644 --- a/.gitignore +++ b/.gitignore @@ -164,4 +164,5 @@ formated_dataset/ test.py -dataset/ \ No newline at end of file +dataset/ +save.py \ No newline at end of file diff --git a/train.py b/train.py index 4c4b93f..37557c9 100644 --- a/train.py +++ b/train.py @@ -6,5 +6,6 @@ trainer = Trainer(dataset) dataset = trainer.process_dataset(dataset) - trainer.train() + +trainer.save_model("./train") diff --git a/training/train.py b/training/train.py index fd6cc40..867ae67 100644 --- a/training/train.py +++ b/training/train.py @@ -172,3 +172,12 @@ def train(self): tokenizer=self.processor, ) return trainer.train() + + def save_model(self, path: str) -> None: + """ + A method that saves the model. + :param path: The path to save the model. + :return: None + """ + + self.model.save_pretrained(path) From 8d55cbcbc95111048e10dcc7942b859323ab56ce Mon Sep 17 00:00:00 2001 From: Jourdelune Date: Sun, 2 Jun 2024 21:11:27 +0200 Subject: [PATCH 13/13] [update] add instruction to test the model --- README.md | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/README.md b/README.md index 4b261ee..3922fad 100644 --- a/README.md +++ b/README.md @@ -45,6 +45,39 @@ python process_dataset.py --clean The process will split the audio in chunks of 32 seconds and split the lyrics. 
+## Test the model + +Here is an example of how to test the model: + +```py +import librosa +import torch +from transformers import WhisperForConditionalGeneration, WhisperProcessor, pipeline + + +model_name = "Jour/whisper-small-lyric-finetuned" +audio_file = "PATH_TO_AUDIO_FILE" + +device = "cuda:0" if torch.cuda.is_available() else "cpu" +processor = WhisperProcessor.from_pretrained("openai/whisper-small") +model = WhisperForConditionalGeneration.from_pretrained(model_name) + +pipe = pipeline( + "automatic-speech-recognition", + model=model, + tokenizer=processor.tokenizer, + feature_extractor=processor.feature_extractor, + max_new_tokens=128, + chunk_length_s=30, + device=device, +) + +sample, _ = librosa.load(audio_file, sr=processor.feature_extractor.sampling_rate) + +prediction = pipe(sample.copy(), batch_size=8)["text"] +print(prediction) +``` + ## License This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details. \ No newline at end of file
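
Taken together, the thirteen patches in this series add a download script, an aeneas-based alignment and segmentation pipeline (`dataset/process.py`, driven by `process_dataset.py`), and a Whisper fine-tuning setup (`train.py` plus the `training/` package). The sketch below is a minimal end-to-end walkthrough of how those pieces fit after the final patch; it assumes audio and lyrics have already been fetched with `download_dataset.py`, and the 16 kHz sample rate is an illustrative choice (Whisper's feature extractor operates at 16 kHz), not a value fixed by the patches, whose default is `None`.

```python
# Minimal end-to-end sketch of the pipeline added in this patch series.
# Assumes dataset/audio/*.ogg|*.mp4 and dataset/lyrics/*.txt already exist
# (e.g. fetched with download_dataset.py); sample_rate=16000 is illustrative.
from dataset.process import DatasetProcess
from training import utils
from training.train import Trainer

# 1. Align each lyric file with its audio via aeneas, cut 32-second windows,
#    and export matching <name>_<i>.wav / <name>_<i>.txt pairs.
process = DatasetProcess(
    lyric_path="dataset/lyrics",
    audio_path="dataset/audio",
    export_path="dataset/export",
    sample_rate=16000,
    clean=True,
)
process.process()

# 2. Fine-tune openai/whisper-small on the exported segments (mirrors train.py).
dataset = utils.gather_dataset("./dataset/export")
dataset = dataset.train_test_split(test_size=0.1)

trainer = Trainer(dataset)
dataset = trainer.process_dataset(dataset)
trainer.train()
trainer.save_model("./train")
```

Step 1 reproduces what `process_dataset.py` does with its default arguments, and step 2 mirrors the final state of `train.py`; the two are shown together only to make the flow from raw downloads to a saved checkpoint explicit.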