Mirror of https://github.com/svc-develop-team/so-vits-svc.git, synced 2025-01-08 11:57:43 +08:00

Commit 317cde248d: Merge branch '4.1-Stable' into 4.1-Latest
@@ -1,4 +1,5 @@
 <div align="center">
+<img alt="LOGO" src="https://avatars.githubusercontent.com/u/127122328?s=400&u=5395a98a4f945a3a50cb0cc96c2747505d190dbc&v=4" width="300" height="300" />
 
 # SoftVC VITS Singing Voice Conversion
 
@@ -265,15 +266,6 @@ Add `--vol_aug` if you want to enable loudness embedding:
 python preprocess_flist_config.py --speech_encoder vec768l12 --vol_aug
 ```
 
-**Speed Up preprocess**
-
-If your dataset is pretty large, you can increase the param `--num_processes` like this:
-
-```shell
-python preprocess_flist_config.py --speech_encoder vec768l12 --vol_aug --num_processes 8
-```
-All the workers will be assigned to different GPUs if you have more than one GPU.
-
 After enabling loudness embedding, the trained model will match the loudness of the input source; otherwise, it will match the loudness of the training set.
 
 #### You can modify some parameters in the generated config.json and diffusion.yaml
@@ -332,6 +324,15 @@ If you want shallow diffusion (optional), you need to add the `--use_diff` param
 python preprocess_hubert_f0.py --f0_predictor dio --use_diff
 ```
 
+**Speed Up preprocess**
+
+If your dataset is pretty large, you can increase the param `--num_processes` like this:
+
+```shell
+python preprocess_hubert_f0.py --f0_predictor dio --use_diff --num_processes 8
+```
+All the workers will be assigned to different GPUs if you have more than one GPU.
+
 After completing the above steps, the dataset directory will contain the preprocessed data, and the dataset_raw folder can be deleted.
 
 ## 🏋️ Training
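For context on what `--num_processes` does here, the sketch below illustrates the worker pattern this commit uses elsewhere (see the `process_batch` hunk further down): a process pool whose workers derive a GPU id from their process rank. The `preprocess_file` helper, the chunking scheme, and the file paths are illustrative placeholders, not the repository's exact API.

```python
# Hedged sketch of rank-based GPU assignment for preprocessing workers.
from concurrent.futures import ProcessPoolExecutor

import torch
import torch.multiprocessing as mp


def preprocess_file(path, device):
    # Placeholder for the real per-file feature extraction.
    print(f"processing {path} on {device}")


def process_batch(file_chunk):
    # Worker processes are numbered 1..N; map each onto an available GPU.
    rank = mp.current_process()._identity
    rank = rank[0] if len(rank) > 0 else 0
    if torch.cuda.is_available():
        device = torch.device(f"cuda:{rank % torch.cuda.device_count()}")
    else:
        device = torch.device("cpu")
    for path in file_chunk:
        preprocess_file(path, device)


if __name__ == "__main__":
    filenames = [f"dataset/{i}.wav" for i in range(32)]  # illustrative paths
    num_processes = 8
    # One chunk per worker, striped so chunks stay roughly the same size.
    chunks = [filenames[i::num_processes] for i in range(num_processes)]
    with ProcessPoolExecutor(max_workers=num_processes) as executor:
        list(executor.map(process_batch, chunks))
```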
@@ -1,5 +1,6 @@
 <div align="center">
+<img alt="LOGO" src="https://avatars.githubusercontent.com/u/127122328?s=400&u=5395a98a4f945a3a50cb0cc96c2747505d190dbc&v=4" width="300" height="300" />
 
 # SoftVC VITS Singing Voice Conversion
 
 [**English**](./README.md) | [**中文简体**](./README_zh_CN.md)
@@ -268,13 +269,6 @@ wavlmbase+
 ```shell
 python preprocess_flist_config.py --speech_encoder vec768l12 --vol_aug
 ```
-
-**Speed up preprocessing**
-If your dataset is fairly large, you can try adding the `--num_processes` parameter:
-```shell
-python preprocess_flist_config.py --speech_encoder vec768l12 --vol_aug --num_processes 8
-```
-All workers will be automatically assigned across multiple GPUs (if you have more than one GPU).
 After enabling this, the trained model will match the loudness of the input source; otherwise, it will match the loudness of the training set.
 
 #### At this point you can modify some parameters in the generated config.json and diffusion.yaml
@@ -335,6 +329,13 @@ fcpe
 python preprocess_hubert_f0.py --f0_predictor dio --use_diff
 ```
 
+**Speed up preprocessing**
+If your dataset is fairly large, you can try adding the `--num_processes` parameter:
+```shell
+python preprocess_hubert_f0.py --f0_predictor dio --use_diff --num_processes 8
+```
+All workers will be automatically assigned across multiple threads.
+
 After completing the above steps, the dataset directory contains the preprocessed data; the dataset_raw folder can now be deleted.
 
 ## 🏋️ Training
@@ -28,7 +28,7 @@ class RMVPE:
     def mel2hidden(self, mel):
         with torch.no_grad():
             n_frames = mel.shape[-1]
-            mel = F.pad(mel, (0, 32 * ((n_frames - 1) // 32 + 1) - n_frames), mode='reflect')
+            mel = F.pad(mel, (0, 32 * ((n_frames - 1) // 32 + 1) - n_frames), mode='constant')
             hidden = self.model(mel)
             return hidden[:, :n_frames]
 
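The one-line change above matters for short inputs: PyTorch's `reflect` padding requires the pad amount to be strictly smaller than the padded dimension, so a clip with fewer frames than the pad-to-multiple-of-32 target would raise an error, while `constant` (zero) padding has no such restriction. A minimal repro of the boundary case, assuming standard `torch.nn.functional.pad` semantics:

```python
import torch
import torch.nn.functional as F

mel = torch.randn(1, 128, 5)  # a very short clip: only 5 mel frames
n_frames = mel.shape[-1]
pad = 32 * ((n_frames - 1) // 32 + 1) - n_frames  # pad up to a multiple of 32 -> 27

# mode='reflect' would raise here, because the pad (27) is >= the dim size (5):
#   F.pad(mel, (0, pad), mode='reflect')
padded = F.pad(mel, (0, pad), mode='constant')  # zero-padding always works
print(padded.shape)  # torch.Size([1, 128, 32])
```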
@@ -5,6 +5,7 @@ import re
 import wave
 from random import shuffle
 
+from loguru import logger
 from tqdm import tqdm
 
 import diffusion.logger.utils as du
@@ -47,9 +48,9 @@ if __name__ == "__main__":
             if not file.endswith("wav"):
                 continue
             if not pattern.match(file):
-                print(f"warning: filename {file} contains characters other than letters, digits, and underscores, which may cause errors (or may not).")
+                logger.warning(f"filename {file} contains characters other than letters, digits, and underscores, which may cause errors (or may not).")
             if get_wav_duration(file) < 0.3:
-                print("skip too short audio:", file)
+                logger.info("Skip too short audio:" + file)
                 continue
             new_wavs.append(file)
         wavs = new_wavs
@@ -60,13 +61,13 @@ if __name__ == "__main__":
     shuffle(train)
     shuffle(val)
 
-    print("Writing", args.train_list)
+    logger.info("Writing" + args.train_list)
     with open(args.train_list, "w") as f:
         for fname in tqdm(train):
             wavpath = fname
             f.write(wavpath + "\n")
 
-    print("Writing", args.val_list)
+    logger.info("Writing" + args.val_list)
     with open(args.val_list, "w") as f:
         for fname in tqdm(val):
             wavpath = fname
@@ -101,8 +102,8 @@ if __name__ == "__main__":
     if args.tiny:
         config_template["model"]["filter_channels"] = 512
 
-    print("Writing configs/config.json")
+    logger.info("Writing to configs/config.json")
     with open("configs/config.json", "w") as f:
         json.dump(config_template, f, indent=2)
-    print("Writing configs/diffusion.yaml")
+    logger.info("Writing to configs/diffusion.yaml")
     du.save_config("configs/diffusion.yaml",d_config_template)
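The `print` to loguru migration in the hunks above needs no handler setup; `logger` emits leveled, timestamped records out of the box. A minimal sketch of the pattern (the file sink at the end is an optional extra, not part of this commit):

```python
from loguru import logger

logger.info("Writing configs/config.json")  # leveled + timestamped automatically
logger.warning("filename demo 01.wav contains unexpected characters")

# Optional extra: mirror everything to a rotating log file as well.
logger.add("preprocess.log", rotation="10 MB")
logger.info("this record goes to stderr and preprocess.log")
```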
@@ -10,6 +10,7 @@ import librosa
 import numpy as np
 import torch
 import torch.multiprocessing as mp
+from loguru import logger
 from tqdm import tqdm
 
 import diffusion.logger.utils as du
@@ -27,13 +28,11 @@ hop_length = hps.data.hop_length
 speech_encoder = hps["model"]["speech_encoder"]
 
-
-def process_one(filename, hmodel,f0p,diff=False,mel_extractor=None):
-    # print(filename)
+def process_one(filename, hmodel,f0p,rank,diff=False,mel_extractor=None):
     wav, sr = librosa.load(filename, sr=sampling_rate)
     audio_norm = torch.FloatTensor(wav)
     audio_norm = audio_norm.unsqueeze(0)
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    device = torch.device(f"cuda:{rank}")
 
     soft_path = filename + ".soft.pt"
     if not os.path.exists(soft_path):
         wav16k = librosa.resample(wav, orig_sr=sampling_rate, target_sr=16000)
@@ -106,17 +105,17 @@ def process_one(filename, hmodel,f0p,diff=False,mel_extractor=None):
 
 
 def process_batch(file_chunk, f0p, diff=False, mel_extractor=None, device="cpu"):
-    print("Loading speech encoder for content...")
+    logger.info("Loading speech encoder for content...")
     rank = mp.current_process()._identity
     rank = rank[0] if len(rank) > 0 else 0
     if torch.cuda.is_available():
         gpu_id = rank % torch.cuda.device_count()
         device = torch.device(f"cuda:{gpu_id}")
-    print("Rank {rank} uses device {device}")
+    logger.info(f"Rank {rank} uses device {device}")
     hmodel = utils.get_speech_encoder(speech_encoder, device=device)
-    print("Loaded speech encoder.")
+    logger.info(f"Loaded speech encoder for rank {rank}")
     for filename in tqdm(file_chunk):
-        process_one(filename, hmodel, f0p, diff, mel_extractor)
+        process_one(filename, hmodel, f0p, gpu_id, diff, mel_extractor)
 
 def parallel_process(filenames, num_processes, f0p, diff, mel_extractor, device):
     with ProcessPoolExecutor(max_workers=num_processes) as executor:
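One fix worth noting in the hunk above: the old `print("Rank {rank} uses device {device}")` was missing the `f` prefix, so it printed the braces literally; the replacement f-string interpolates the values. For example:

```python
rank, device = 1, "cuda:1"
print("Rank {rank} uses device {device}")   # -> Rank {rank} uses device {device}
print(f"Rank {rank} uses device {device}")  # -> Rank 1 uses device cuda:1
```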
@@ -151,9 +150,11 @@ if __name__ == "__main__":
     device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+
     print(speech_encoder)
-    print(f0p)
-    print("use_diff: ", args.use_diff)
-    print("device: ", device)
+    logger.info("Using device: ", device)
+    logger.info("Using SpeechEncoder: " + speech_encoder)
+    logger.info("Using extractor: " + f0p)
+    logger.info("Using diff Mode: " + str( args.use_diff))
 
     if args.use_diff:
         print("use_diff")
         print("Loading Mel Extractor...")
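A caveat on `logger.info("Using device: ", device)` in the hunk above: unlike `print`, loguru treats extra positional arguments as `str.format` parameters, and since the message has no `{}` placeholder the device value is silently dropped. A placeholder or f-string carries it through:

```python
from loguru import logger

device = "cuda:0"
logger.info("Using device: ", device)    # logs "Using device: " (value dropped)
logger.info("Using device: {}", device)  # logs "Using device: cuda:0"
logger.info(f"Using device: {device}")   # same result via an f-string
```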
@@ -10,6 +10,8 @@ torch
 torchaudio
 torchcrepe
 tqdm
+rich
+loguru
 scikit-maad
 praat-parselmouth
 onnx
@@ -9,6 +9,8 @@ torch==1.13.1
 torchaudio==0.13.1
 torchcrepe
 tqdm
+rich
+loguru
 scikit-maad
 praat-parselmouth
 onnx
@@ -15,6 +15,8 @@ sounddevice==0.4.5
 SoundFile==0.10.3.post1
 starlette==0.19.1
 tqdm==4.63.0
+rich
+loguru
 torchcrepe
 scikit-maad
 praat-parselmouth
@@ -6,8 +6,8 @@ from multiprocessing import cpu_count
 
 import librosa
 import numpy as np
+from rich.progress import track
 from scipy.io import wavfile
-from tqdm import tqdm
 
 
 def load_wav(wav_path):
@@ -81,7 +81,7 @@ def process_all_speakers():
         if os.path.isdir(spk_dir):
             print(spk_dir)
             futures = [executor.submit(process, (spk_dir, i, args)) for i in os.listdir(spk_dir) if i.endswith("wav")]
-            for _ in tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
+            for _ in track(concurrent.futures.as_completed(futures), total=len(futures), description="resampling:"):
                 pass
 
 
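`rich.progress.track` is used above as a near drop-in replacement for `tqdm`, adding a progress description. A self-contained sketch of the same pattern (the `work` function is illustrative):

```python
import concurrent.futures
import time

from rich.progress import track


def work(i):
    time.sleep(0.01)  # stand-in for resampling one file
    return i


with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = [executor.submit(work, i) for i in range(100)]
    # track needs an explicit total here, since as_completed has no len().
    for _ in track(concurrent.futures.as_completed(futures),
                   total=len(futures), description="resampling:"):
        pass
```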
@@ -1,6 +1,7 @@
 import argparse
 
 import torch
+from loguru import logger
 from torch.optim import lr_scheduler
 
 from diffusion.data_loaders import get_data_loaders
@@ -28,8 +29,8 @@ if __name__ == '__main__':
 
     # load config
     args = utils.load_config(cmd.config)
-    print(' > config:', cmd.config)
-    print(' > exp:', args.env.expdir)
+    logger.info(' > config:'+ cmd.config)
+    logger.info(' > exp:'+ args.env.expdir)
 
     # load vocoder
     vocoder = Vocoder(args.vocoder.type, args.vocoder.ckpt, device=args.device)
@@ -47,7 +48,7 @@ if __name__ == '__main__':
         args.model.k_step_max
     )
 
-    print(f' > INFO: now model timesteps is {model.timesteps}, and k_step_max is {model.k_step_max}')
+    logger.info(f' > Now model timesteps is {model.timesteps}, and k_step_max is {model.k_step_max}')
 
     # load parameters
     optimizer = torch.optim.AdamW(model.parameters())