Mirror of https://github.com/svc-develop-team/so-vits-svc.git, synced 2025-01-08 11:57:43 +08:00

Commit 317cde248d: Merge branch '4.1-Stable' into 4.1-Latest
@@ -1,4 +1,5 @@
 <div align="center">
+<img alt="LOGO" src="https://avatars.githubusercontent.com/u/127122328?s=400&u=5395a98a4f945a3a50cb0cc96c2747505d190dbc&v=4" width="300" height="300" />
 
 # SoftVC VITS Singing Voice Conversion
 
@@ -265,15 +266,6 @@ Add `--vol_aug` if you want to enable loudness embedding:
 python preprocess_flist_config.py --speech_encoder vec768l12 --vol_aug
 ```
 
-**Speed Up preprocess**
-
-If your dataset is pretty large, you can increase the param `--num_processes` like this:
-
-```shell
-python preprocess_flist_config.py --speech_encoder vec768l12 --vol_aug --num_processes 8
-```
-All the workers will be assigned to different GPUs if you have more than one GPU.
-
 After enabling loudness embedding, the trained model will match the loudness of the input source; otherwise, it will match the loudness of the training set.
 
 #### You can modify some parameters in the generated config.json and diffusion.yaml
@@ -332,6 +324,15 @@ If you want shallow diffusion (optional), you need to add the `--use_diff` param
 python preprocess_hubert_f0.py --f0_predictor dio --use_diff
 ```
 
+**Speed Up preprocess**
+
+If your dataset is pretty large, you can increase the param `--num_processes` like this:
+
+```shell
+python preprocess_hubert_f0.py --f0_predictor dio --use_diff --num_processes 8
+```
+All the workers will be assigned to different GPUs if you have more than one GPU.
+
 After completing the above steps, the dataset directory will contain the preprocessed data, and the dataset_raw folder can be deleted.
 
 ## 🏋️ Training
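For context on what `--num_processes` does here, the sketch below illustrates the worker pattern this commit uses elsewhere (see the `process_batch` hunk further down): a process pool whose workers derive a GPU id from their process rank. The `preprocess_file` helper, the chunking scheme, and the file paths are illustrative placeholders, not the repository's exact API.

```python
# Hedged sketch of rank-based GPU assignment for preprocessing workers.
from concurrent.futures import ProcessPoolExecutor

import torch
import torch.multiprocessing as mp


def preprocess_file(path, device):
    # Placeholder for the real per-file feature extraction.
    print(f"processing {path} on {device}")


def process_batch(file_chunk):
    # Worker processes are numbered 1..N; map each onto an available GPU.
    rank = mp.current_process()._identity
    rank = rank[0] if len(rank) > 0 else 0
    if torch.cuda.is_available():
        device = torch.device(f"cuda:{rank % torch.cuda.device_count()}")
    else:
        device = torch.device("cpu")
    for path in file_chunk:
        preprocess_file(path, device)


if __name__ == "__main__":
    filenames = [f"dataset/{i}.wav" for i in range(32)]  # illustrative paths
    num_processes = 8
    # One chunk per worker, striped so chunks stay roughly the same size.
    chunks = [filenames[i::num_processes] for i in range(num_processes)]
    with ProcessPoolExecutor(max_workers=num_processes) as executor:
        list(executor.map(process_batch, chunks))
```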
@@ -1,5 +1,6 @@
 <div align="center">
+<img alt="LOGO" src="https://avatars.githubusercontent.com/u/127122328?s=400&u=5395a98a4f945a3a50cb0cc96c2747505d190dbc&v=4" width="300" height="300" />
 
 # SoftVC VITS Singing Voice Conversion
 
 [**English**](./README.md) | [**中文简体**](./README_zh_CN.md)
@@ -268,13 +269,6 @@ wavlmbase+
 ```shell
 python preprocess_flist_config.py --speech_encoder vec768l12 --vol_aug
 ```
-
-**Speed up preprocessing**
-If your dataset is fairly large, you can try adding the `--num_processes` parameter:
-```shell
-python preprocess_flist_config.py --speech_encoder vec768l12 --vol_aug --num_processes 8
-```
-All workers will be automatically assigned across multiple GPUs (if you have more than one GPU).
 After enabling this, the trained model will match the loudness of the input source; otherwise, it will match the loudness of the training set.
 
 #### At this point you can modify some parameters in the generated config.json and diffusion.yaml
@@ -335,6 +329,13 @@ fcpe
 python preprocess_hubert_f0.py --f0_predictor dio --use_diff
 ```
 
+**Speed up preprocessing**
+If your dataset is fairly large, you can try adding the `--num_processes` parameter:
+```shell
+python preprocess_hubert_f0.py --f0_predictor dio --use_diff --num_processes 8
+```
+All workers will be automatically assigned across multiple threads.
+
 After completing the above steps, the dataset directory contains the preprocessed data; the dataset_raw folder can now be deleted.
 
 ## 🏋️ Training
@@ -28,7 +28,7 @@ class RMVPE:
     def mel2hidden(self, mel):
         with torch.no_grad():
             n_frames = mel.shape[-1]
-            mel = F.pad(mel, (0, 32 * ((n_frames - 1) // 32 + 1) - n_frames), mode='reflect')
+            mel = F.pad(mel, (0, 32 * ((n_frames - 1) // 32 + 1) - n_frames), mode='constant')
             hidden = self.model(mel)
             return hidden[:, :n_frames]
 
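The one-line change above matters for short inputs: PyTorch's `reflect` padding requires the pad amount to be strictly smaller than the padded dimension, so a clip with fewer frames than the pad-to-multiple-of-32 target would raise an error, while `constant` (zero) padding has no such restriction. A minimal repro of the boundary case, assuming standard `torch.nn.functional.pad` semantics:

```python
import torch
import torch.nn.functional as F

mel = torch.randn(1, 128, 5)  # a very short clip: only 5 mel frames
n_frames = mel.shape[-1]
pad = 32 * ((n_frames - 1) // 32 + 1) - n_frames  # pad up to a multiple of 32 -> 27

# mode='reflect' would raise here, because the pad (27) is >= the dim size (5):
#   F.pad(mel, (0, pad), mode='reflect')
padded = F.pad(mel, (0, pad), mode='constant')  # zero-padding always works
print(padded.shape)  # torch.Size([1, 128, 32])
```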
@@ -5,6 +5,7 @@ import re
 import wave
 from random import shuffle
 
+from loguru import logger
 from tqdm import tqdm
 
 import diffusion.logger.utils as du
@@ -47,9 +48,9 @@ if __name__ == "__main__":
             if not file.endswith("wav"):
                 continue
             if not pattern.match(file):
-                print(f"warning: filename {file} contains characters other than letters, digits, and underscores, which may cause errors (or may not).")
+                logger.warning(f"filename {file} contains characters other than letters, digits, and underscores, which may cause errors (or may not).")
             if get_wav_duration(file) < 0.3:
-                print("skip too short audio:", file)
+                logger.info("Skip too short audio:" + file)
                 continue
             new_wavs.append(file)
         wavs = new_wavs
@@ -60,13 +61,13 @@ if __name__ == "__main__":
     shuffle(train)
     shuffle(val)
 
-    print("Writing", args.train_list)
+    logger.info("Writing" + args.train_list)
     with open(args.train_list, "w") as f:
         for fname in tqdm(train):
             wavpath = fname
             f.write(wavpath + "\n")
 
-    print("Writing", args.val_list)
+    logger.info("Writing" + args.val_list)
     with open(args.val_list, "w") as f:
         for fname in tqdm(val):
             wavpath = fname
@@ -101,8 +102,8 @@ if __name__ == "__main__":
     if args.tiny:
         config_template["model"]["filter_channels"] = 512
 
-    print("Writing configs/config.json")
+    logger.info("Writing to configs/config.json")
     with open("configs/config.json", "w") as f:
         json.dump(config_template, f, indent=2)
-    print("Writing configs/diffusion.yaml")
+    logger.info("Writing to configs/diffusion.yaml")
     du.save_config("configs/diffusion.yaml",d_config_template)
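The `print` to loguru migration in the hunks above needs no handler setup; `logger` emits leveled, timestamped records out of the box. A minimal sketch of the pattern (the file sink at the end is an optional extra, not part of this commit):

```python
from loguru import logger

logger.info("Writing configs/config.json")  # leveled + timestamped automatically
logger.warning("filename demo 01.wav contains unexpected characters")

# Optional extra: mirror everything to a rotating log file as well.
logger.add("preprocess.log", rotation="10 MB")
logger.info("this record goes to stderr and preprocess.log")
```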
@@ -10,6 +10,7 @@ import librosa
 import numpy as np
 import torch
 import torch.multiprocessing as mp
+from loguru import logger
 from tqdm import tqdm
 
 import diffusion.logger.utils as du
@@ -27,13 +28,11 @@ hop_length = hps.data.hop_length
 speech_encoder = hps["model"]["speech_encoder"]
 
-
-def process_one(filename, hmodel,f0p,diff=False,mel_extractor=None):
-    # print(filename)
+def process_one(filename, hmodel,f0p,rank,diff=False,mel_extractor=None):
     wav, sr = librosa.load(filename, sr=sampling_rate)
     audio_norm = torch.FloatTensor(wav)
     audio_norm = audio_norm.unsqueeze(0)
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    device = torch.device(f"cuda:{rank}")
 
     soft_path = filename + ".soft.pt"
     if not os.path.exists(soft_path):
         wav16k = librosa.resample(wav, orig_sr=sampling_rate, target_sr=16000)
@@ -106,17 +105,17 @@ def process_one(filename, hmodel,f0p,diff=False,mel_extractor=None):
 
 
 def process_batch(file_chunk, f0p, diff=False, mel_extractor=None, device="cpu"):
-    print("Loading speech encoder for content...")
+    logger.info("Loading speech encoder for content...")
     rank = mp.current_process()._identity
     rank = rank[0] if len(rank) > 0 else 0
     if torch.cuda.is_available():
         gpu_id = rank % torch.cuda.device_count()
         device = torch.device(f"cuda:{gpu_id}")
-    print("Rank {rank} uses device {device}")
+    logger.info(f"Rank {rank} uses device {device}")
     hmodel = utils.get_speech_encoder(speech_encoder, device=device)
-    print("Loaded speech encoder.")
+    logger.info(f"Loaded speech encoder for rank {rank}")
     for filename in tqdm(file_chunk):
-        process_one(filename, hmodel, f0p, diff, mel_extractor)
+        process_one(filename, hmodel, f0p, gpu_id, diff, mel_extractor)
 
 def parallel_process(filenames, num_processes, f0p, diff, mel_extractor, device):
     with ProcessPoolExecutor(max_workers=num_processes) as executor:
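One fix worth noting in the hunk above: the old `print("Rank {rank} uses device {device}")` was missing the `f` prefix, so it printed the braces literally; the replacement f-string interpolates the values. For example:

```python
rank, device = 1, "cuda:1"
print("Rank {rank} uses device {device}")   # -> Rank {rank} uses device {device}
print(f"Rank {rank} uses device {device}")  # -> Rank 1 uses device cuda:1
```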
@@ -151,9 +150,11 @@ if __name__ == "__main__":
     device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+
     print(speech_encoder)
-    print(f0p)
-    print("use_diff: ", args.use_diff)
-    print("device: ", device)
+    logger.info("Using device: ", device)
+    logger.info("Using SpeechEncoder: " + speech_encoder)
+    logger.info("Using extractor: " + f0p)
+    logger.info("Using diff Mode: " + str( args.use_diff))
 
     if args.use_diff:
         print("use_diff")
         print("Loading Mel Extractor...")
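A caveat on `logger.info("Using device: ", device)` in the hunk above: unlike `print`, loguru treats extra positional arguments as `str.format` parameters, and since the message has no `{}` placeholder the device value is silently dropped. A placeholder or f-string carries it through:

```python
from loguru import logger

device = "cuda:0"
logger.info("Using device: ", device)    # logs "Using device: " (value dropped)
logger.info("Using device: {}", device)  # logs "Using device: cuda:0"
logger.info(f"Using device: {device}")   # same result via an f-string
```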
@@ -10,6 +10,8 @@ torch
 torchaudio
 torchcrepe
 tqdm
+rich
+loguru
 scikit-maad
 praat-parselmouth
 onnx
@@ -9,6 +9,8 @@ torch==1.13.1
 torchaudio==0.13.1
 torchcrepe
 tqdm
+rich
+loguru
 scikit-maad
 praat-parselmouth
 onnx
@@ -15,6 +15,8 @@ sounddevice==0.4.5
 SoundFile==0.10.3.post1
 starlette==0.19.1
 tqdm==4.63.0
+rich
+loguru
 torchcrepe
 scikit-maad
 praat-parselmouth
@@ -6,8 +6,8 @@ from multiprocessing import cpu_count
 
 import librosa
 import numpy as np
+from rich.progress import track
 from scipy.io import wavfile
-from tqdm import tqdm
 
 
 def load_wav(wav_path):
@@ -81,7 +81,7 @@ def process_all_speakers():
         if os.path.isdir(spk_dir):
             print(spk_dir)
             futures = [executor.submit(process, (spk_dir, i, args)) for i in os.listdir(spk_dir) if i.endswith("wav")]
-            for _ in tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
+            for _ in track(concurrent.futures.as_completed(futures), total=len(futures), description="resampling:"):
                 pass
 
 
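`rich.progress.track` is used above as a near drop-in replacement for `tqdm`, adding a progress description. A self-contained sketch of the same pattern (the `work` function is illustrative):

```python
import concurrent.futures
import time

from rich.progress import track


def work(i):
    time.sleep(0.01)  # stand-in for resampling one file
    return i


with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = [executor.submit(work, i) for i in range(100)]
    # track needs an explicit total here, since as_completed has no len().
    for _ in track(concurrent.futures.as_completed(futures),
                   total=len(futures), description="resampling:"):
        pass
```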
@@ -1,6 +1,7 @@
 import argparse
 
 import torch
+from loguru import logger
 from torch.optim import lr_scheduler
 
 from diffusion.data_loaders import get_data_loaders
@@ -28,8 +29,8 @@ if __name__ == '__main__':
 
     # load config
     args = utils.load_config(cmd.config)
-    print(' > config:', cmd.config)
-    print(' > exp:', args.env.expdir)
+    logger.info(' > config:'+ cmd.config)
+    logger.info(' > exp:'+ args.env.expdir)
 
     # load vocoder
     vocoder = Vocoder(args.vocoder.type, args.vocoder.ckpt, device=args.device)
@@ -47,7 +48,7 @@ if __name__ == '__main__':
         args.model.k_step_max
     )
 
-    print(f' > INFO: now model timesteps is {model.timesteps}, and k_step_max is {model.k_step_max}')
+    logger.info(f' > Now model timesteps is {model.timesteps}, and k_step_max is {model.k_step_max}')
 
     # load parameters
     optimizer = torch.optim.AdamW(model.parameters())