Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/main'
Browse files Browse the repository at this point in the history
# Conflicts:
#	train.py
  • Loading branch information
ostix360 committed Jun 3, 2024
2 parents e183eac + 8d55cbc commit fa092df
Show file tree
Hide file tree
Showing 14 changed files with 381 additions and 30 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/pylint.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ jobs:
run: |
python -m pip install --upgrade pip
pip install pylint
pip install -r requirements.txt
pip install -r .pylint_requirements.txt
- name: Analysing the code with pylint
run: |
pylint $(git ls-files '*.py') --rcfile=.pylintc
8 changes: 4 additions & 4 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -159,11 +159,11 @@ cython_debug/
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/

dataset/audio/
dataset/lyrics/
dataset/data.json
train/
model/
formated_dataset/

test.py
test.py

dataset/
save.py
13 changes: 13 additions & 0 deletions .pylint_requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
requests
orjson
jiwer
transformers
torch
torchaudio
datasets
accelerate
bitsandbytes
evaluate
librosa
numpy

2 changes: 1 addition & 1 deletion .pylintc
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ ignore-patterns=^\.#
# (useful for modules/projects where namespaces are manipulated during runtime
# and thus existing member attributes cannot be deduced by static analysis). It
# supports qualified module names, as well as Unix pattern matching.
ignored-modules=
ignored-modules=aeneas

# Python code to execute, usually for sys.path manipulation such as
# pygtk.require().
Expand Down
43 changes: 43 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,49 @@ dataset

where `0.wav` corresponds to the audio file and `0.txt` corresponds to the lyrics transcription of the audio file.

## Process the dataset

To process the dataset, run the following command:

```bash
python process_dataset.py --clean
```

The process will split the audio in chunks of 32 seconds and split the lyrics.

## Test the model

Here is an example of how to test the model:

```py
import librosa
import torch
from transformers import WhisperForConditionalGeneration, WhisperProcessor, pipeline


model_name = "Jour/whisper-small-lyric-finetuned"
audio_file = "PATH_TO_AUDIO_FILE"

device = "cuda:0" if torch.cuda.is_available() else "cpu"
processor = WhisperProcessor.from_pretrained("openai/whisper-small")
model = WhisperForConditionalGeneration.from_pretrained(model_name)

pipe = pipeline(
"automatic-speech-recognition",
model=model,
tokenizer=processor.tokenizer,
feature_extractor=processor.feature_extractor,
max_new_tokens=128,
chunk_length_s=30,
device=device,
)

sample, _ = librosa.load(audio_file, sr=processor.feature_extractor.sampling_rate)

prediction = pipe(sample.copy(), batch_size=8)["text"]
print(prediction)
```

## License

This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
66 changes: 66 additions & 0 deletions dataset/aeneas_wrapper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
import json
import re
import tempfile

from aeneas.tools.execute_task import ExecuteTaskCLI, RuntimeConfiguration

from dataset.exceptions import AeneasAlignError


class AeneasWrapper:
"""Wrapper class for Aeneas CLI"""

def __init__(self) -> None:
self._rconf = RuntimeConfiguration()
self._rconf[RuntimeConfiguration.MFCC_MASK_NONSPEECH] = True
self._rconf[RuntimeConfiguration.MFCC_MASK_NONSPEECH_L3] = True
self._rconf[RuntimeConfiguration.TTS_CACHE] = True
self._rconf.set_granularity(3)

def aeneas_cli_exec(self, audio_path: str, lyric_path: str) -> dict:
"""Align lyrics with audio
Args:
audio_path (str): the path to the audio file
lyric_path (str): the path to the lyric file
Raises:
AeneasAlignError: if Aeneas fails to align lyrics
Returns:
dict: a dictionary containing the alignment data
"""

tmp_dir = tempfile.mkdtemp()

with open(lyric_path, "r", encoding="utf-8") as f:
lyric = f.read()

# remove all text between []
lyric = re.sub(r"\[.*?\]", "\n", lyric)

# remove when more than 2 new lines
lyric = re.sub(r"\n{1,}", "\n", lyric).strip()

lyric = lyric.replace(" ", "\n")

with open(f"{tmp_dir}/lyric.txt", "w", encoding="utf-8") as f:
f.write(lyric)

args = [
"dummy",
audio_path,
f"{tmp_dir}/lyric.txt",
"task_language=en|is_text_type=plain|os_task_file_format=json",
f"{tmp_dir}/lyric.json",
]

exit_code = ExecuteTaskCLI(use_sys=False, rconf=self._rconf).run(arguments=args)

if exit_code != 0:
raise AeneasAlignError("Aeneas failed to align lyrics")

with open(f"{tmp_dir}/lyric.json", "r", encoding="utf-8") as f:
data = json.load(f)

return data
2 changes: 2 additions & 0 deletions dataset/exceptions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
class AeneasAlignError(Exception):
"""Raised when Aeneas fails to align lyrics"""
189 changes: 189 additions & 0 deletions dataset/process.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
import os
import shutil
from typing import List

from pydub import AudioSegment

import dataset.exceptions
from dataset.aeneas_wrapper import AeneasWrapper


class DatasetProcess:
"""Class to process the dataset"""

def __init__(
self,
lyric_path: str,
audio_path: str,
sample_rate: int = None,
export_path: str = None,
clean: bool = False,
):
"""Constructor to initialize the DatasetProcess class
Args:
lyric_path (str): the path to the lyrics folder
audio_path (str): the path to the audio folder
sample_rate (int, optional): the sample rate of the audio. Defaults to None.
export_path (str, optional): the path to export data. Defaults to None.
clean (bool, optional): remove all data in the export path. Defaults to False.
"""

self.lyric_path = lyric_path
self.audio_path = audio_path
self.export_path = export_path
self.sample_rate = sample_rate

if clean and self.export_path:
self.remove_export_folder()

self.create_export_folder()

self.aeneas = AeneasWrapper()

def create_export_folder(self) -> None:
"""Method to create the export folder"""

if not os.path.exists(self.export_path):
os.makedirs(self.export_path)

if not os.path.exists(f"{self.export_path}/audio"):
os.makedirs(f"{self.export_path}/audio")

if not os.path.exists(f"{self.export_path}/lyrics"):
os.makedirs(f"{self.export_path}/lyrics")

def remove_export_folder(self) -> None:
"""Method to remove the export folder"""

if os.path.exists(self.export_path):
shutil.rmtree(self.export_path)

def _split_audio(
self, audio_path: str, split_windows: int = 32
) -> List[AudioSegment]:
"""Method to split audio into 32 seconds segments
Args:
audio_path (str): the path to the audio file
split_windows (int, optional): the size of the split window in seconds. Defaults to 32.
Returns:
list: a list of AudioSegment that contain audio split into 32 seconds segments
"""

audio = AudioSegment.from_file(audio_path)
segments = []

for i in range(0, len(audio), split_windows * 1000):
segments.append(audio[i : i + split_windows * 1000])

return segments

def _split_lyric(
self, lyric_path: str, alignement: dict, split_windows: int = 32
) -> List[str]:
"""Method to split audio into 32 seconds segments with the corresponding lyrics
Args:
lyric_path (str): the path to the lyric file
alignement (dict): the alignment data
split_windows (int, optional): the size of the split window in seconds. Defaults to 32.
Returns:
list: a list of list that contain lyrics split into 32 seconds segments
"""

with open(lyric_path, "r", encoding="utf-8") as f:
lyric = f.read()

segments = []
start_idx = 0
end_idx = 0

for fragment in alignement["fragments"]:
end_idx = lyric.find(fragment["lines"][0], end_idx)
windows = (len(segments) + 1) * split_windows

if float(fragment["begin"]) > windows:
segments.append(lyric[start_idx:end_idx])
start_idx = end_idx

segments.append(lyric[start_idx:])

return segments

def _export_audio(self, audios: List[AudioSegment], file_name: str) -> None:
"""Method to export audio segments to .wav format
Args:
audios (List[AudioSegment]): a list of AudioSegment
file_name (str): the name of the file
"""

for i, audio in enumerate(audios):
path = f"{self.audio_path}/{file_name}_{i}.wav"

if self.export_path:
path = f"{self.export_path}/audio/{file_name}_{i}.wav"

if self.sample_rate:
audio = audio.set_frame_rate(self.sample_rate)

audio.export(path, format="wav")

def _export_lyric(self, lyrics: List[str], file_name: str) -> None:
"""Method to export lyrics segments to .txt format
Args:
lyrics (List[str]): a list of lyrics
file_name (str): the name of the file
"""

for i, lyric in enumerate(lyrics):
path = f"{self.lyric_path}/{file_name}_{i}.txt"

if self.export_path:
path = f"{self.export_path}/lyrics/{file_name}_{i}.txt"

with open(path, "w", encoding="utf-8") as f:
f.write(lyric)

def process(self, remove: bool = False) -> None:
"""Method to process the dataset
1. Align lyrics with audio
2. Split audio into 32 seconds segments
3. Save the segments to the dataset/audio/processed folder in .wav format
Args:
remove (bool, optional): remove the processed file. Defaults to False.
"""

nbm_files = len(os.listdir(self.audio_path))
for i, audio_f in enumerate(os.listdir(self.audio_path)):
if not audio_f.endswith(".ogg") and not audio_f.endswith(".mp4"):
continue

audio_path = os.path.join(self.audio_path, audio_f)
lyric_path = os.path.join(self.lyric_path, audio_f.split(".")[0] + ".txt")

try:
alignement = self.aeneas.aeneas_cli_exec(audio_path, lyric_path)
except dataset.exceptions.AeneasAlignError as e:
print(f"Failed to align {audio_f}: {e}")
continue

lyric_segments = self._split_lyric(lyric_path, alignement)
audio_segments = self._split_audio(audio_path)

# save the audio segments and the lyrics
self._export_audio(audio_segments, audio_f.split(".")[0])
self._export_lyric(lyric_segments, audio_f.split(".")[0])

print(
f"Processed {i}/ {nbm_files} - {round(i/nbm_files*100, 2)}%", end="\r"
)

if remove:
os.remove(lyric_path)
os.remove(audio_path)
2 changes: 1 addition & 1 deletion download_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@


parser = argparse.ArgumentParser(
description="Download images from Sonauto dataset",
description="Download music from Sonauto API",
)
parser.add_argument("--num_images", type=int, default=10000)
parser.add_argument("--clean", type=bool, default=True)
Expand Down
24 changes: 24 additions & 0 deletions process_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import argparse

from dataset.process import DatasetProcess


parser = argparse.ArgumentParser(
description="Process the dataset",
)
parser.add_argument("--audio_path", type=str, default="dataset/audio")
parser.add_argument("--lyric_path", type=str, default="dataset/lyrics")
parser.add_argument("--export_path", type=str, default="dataset/export")
parser.add_argument("--sample_rate", type=int, default=None)
parser.add_argument("--clean", type=bool, default=False)

args = parser.parse_args()
process = DatasetProcess(
lyric_path=args.lyric_path,
audio_path=args.audio_path,
sample_rate=args.sample_rate,
export_path=args.export_path,
clean=args.clean,
)

process.process()
Loading

0 comments on commit fa092df

Please sign in to comment.