Skip to content

Commit

Permalink
update readme
Browse files Browse the repository at this point in the history
  • Loading branch information
meo committed Jun 7, 2022
1 parent 9b1ac70 commit 64f15c4
Show file tree
Hide file tree
Showing 17 changed files with 2,900 additions and 1,173 deletions.
7 changes: 6 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ python preprocess.py config/finetune/preprocess.yaml
Finetune the speaker's voice with

```
python train.py [-h] [--pretrain_dir BASE_LINE_MODEL_PATH] [-p PREPROCESS_CONFIG_PATH] [-m MODEL_CONFIG_PATH] [-t TRAIN_CONFIG_PATH] [--vocoder_checkpoint VOCODER_CHECKPOINT_PATH] [--vocoder_config VOCODER_CONFIG_PATH]
python finetune.py [-h] [--pretrain_dir BASE_LINE_MODEL_PATH] [-p PREPROCESS_CONFIG_PATH] [-m MODEL_CONFIG_PATH] [-t TRAIN_CONFIG_PATH] [--vocoder_checkpoint VOCODER_CHECKPOINT_PATH] [--vocoder_config VOCODER_CONFIG_PATH]
```

# TensorBoard
Expand All @@ -72,6 +72,11 @@ Use
```
tensorboard [--logdir LOG_PATH]
```
* TensorBoard for the pretrained model
![](./assets/pretrain_tensorboard.png)

* TensorBoard for finetuning with only 5 sentences
![](./assets/finetune_tensorboard.png)

# References
- [**AdaSpeech: Adaptive text to speech for custom voice**](https://arxiv.org/pdf/2103.00993.pdf).
Expand Down
Binary file added assets/finetune_tensorboard.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added assets/pretrain_tensorboard.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
6 changes: 3 additions & 3 deletions config/pretrain/train.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
path:
ckpt_path: "./output/ckpt/Audiobook"
log_path: "./output/log/Audiobook"
result_path: "./output/result/Audiobook"
ckpt_path: "./output/ckpt/multi_language"
log_path: "./output/log/multi_language"
result_path: "./output/result/multi_language"
optimizer:
batch_size: 16
betas: [0.9, 0.98]
Expand Down
28 changes: 19 additions & 9 deletions inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from dataset import Dataset
from text import text_to_sequence
from datetime import datetime
from g2p_en import G2p

import audio as Audio

Expand All @@ -35,7 +36,7 @@ def get_vocoder(config, checkpoint_path):

def synthesize(model, step, configs, vocoder, loader, control_values, output_dir):
preprocess_config, model_config, train_config = configs
pitch_control, energy_control, duration_control = control_values
pitch_control, energy_control, duration_control, eng_pos = control_values
for batch in batchs:
batch = to_device(batch, device)
with torch.no_grad():
Expand All @@ -44,7 +45,7 @@ def synthesize(model, step, configs, vocoder, loader, control_values, output_dir
*(batch[2:]),
p_control=pitch_control,
e_control=energy_control,
d_control=duration_control
d_control=duration_control,
)
synth_samples(
batch,
Expand Down Expand Up @@ -72,6 +73,13 @@ def get_reference_mel(reference_audio_dir, STFT):
help="speaker ID for multi-speaker synthesis, for single-sentence mode only",
)

parser.add_argument(
"--language_id",
type=int,
default=0,
help="language ID for multi-language synthesis"
)

parser.add_argument(
"--output_dir",
type=str
Expand Down Expand Up @@ -106,10 +114,10 @@ def get_reference_mel(reference_audio_dir, STFT):

# Read Config
preprocess_config = yaml.load(
open("config/1_CTV/preprocess.yaml", "r"), Loader=yaml.FullLoader
open("config/pretrain/preprocess.yaml", "r"), Loader=yaml.FullLoader
)
model_config = yaml.load(open("config/1_CTV/model.yaml", "r"), Loader=yaml.FullLoader)
train_config = yaml.load(open("config/1_CTV/train.yaml", "r"), Loader=yaml.FullLoader)
model_config = yaml.load(open("config/pretrain/model.yaml", "r"), Loader=yaml.FullLoader)
train_config = yaml.load(open("config/pretrain/train.yaml", "r"), Loader=yaml.FullLoader)
configs = (preprocess_config, model_config, train_config)

output_dir = args.output_dir
Expand All @@ -131,10 +139,12 @@ def get_reference_mel(reference_audio_dir, STFT):

# Preprocess texts
ids = [datetime.now().strftime("%Y_%m_%d-%I_%M_%S_%p")]
raw_texts = "Một người dân đang chạy thể dục dọc bờ sông Hương thì bất ngờ phát hiện quả đạn thời chiến tranh nằm bên bãi cỏ"
raw_texts = "thông tấn xã thailand cho rằng china đã quá tự cao tự đại trong mối quan hệ với russia"
speakers = np.array([args.speaker_id])
text = convert_text_to_ipa(raw_texts.lower())
languages = np.array([args.language_id])
text, eng_pos = convert_text_to_ipa(raw_texts.lower())
text = text.replace(",", "sp")
print(text)
text = np.array(
text_to_sequence(
text, preprocess_config["preprocessing"]["text"]["text_cleaners"]
Expand All @@ -144,8 +154,8 @@ def get_reference_mel(reference_audio_dir, STFT):
text_lens = np.array([len(text[0])])
mel_spectrogram = get_reference_mel(wav_path, STFT)
mel_spectrogram = np.array([mel_spectrogram])
batchs = [(ids, raw_texts, speakers, text, text_lens, max(text_lens), mel_spectrogram)]
batchs = [(ids, raw_texts, speakers, text, text_lens, max(text_lens), mel_spectrogram, languages)]

control_values = args.pitch_control, args.energy_control, args.duration_control
control_values = args.pitch_control, args.energy_control, args.duration_control, eng_pos

synthesize(model, args.restore_step, configs, vocoder, batchs, control_values, output_dir)
1 change: 0 additions & 1 deletion preprocessed_data/speakers.json

This file was deleted.

1 change: 0 additions & 1 deletion preprocessed_data/stats.json

This file was deleted.

Loading

0 comments on commit 64f15c4

Please sign in to comment.