Mirror of https://github.com/svc-develop-team/so-vits-svc.git (synced 2025-01-08 11:57:43 +08:00)

Commit 8cc7645379 (parent: 7fbca2ee1f)
@@ -7,7 +7,6 @@ data:
   encoder_sample_rate: 16000
   encoder_hop_size: 320
   encoder_out_channels: 768 # 256 if using 'hubertsoft'
-  train_path: dataset/44k # Create a folder named "audio" under this path and put the audio clips in it
   training_files: "filelists/train.txt"
   validation_files: "filelists/val.txt"
   extensions: # List of extensions included in the data collection
@@ -27,7 +26,7 @@ infer:
   speedup: 10
   method: 'dpm-solver' # 'pndm' or 'dpm-solver'
 env:
-  expdir: exp/diffusion-test
+  expdir: logs/44k/diffusion
   gpu_id: 0
 train:
   num_workers: 2 # If both your CPU and GPU are strong, setting this to 0 may be faster!
diffusion/__init__.py (new empty file, 0 changes)
@@ -5,6 +5,7 @@ import numpy as np
 import librosa
 import torch
 import random
+from utils import repeat_expand_2d
 from tqdm import tqdm
 from torch.utils.data import Dataset
 
@@ -51,7 +52,7 @@ def traverse_dir(
 
 def get_data_loaders(args, whole_audio=False):
     data_train = AudioDataset(
-        filelists_path = args.training_files,
+        filelists = args.data.training_files,
         waveform_sec=args.data.duration,
         hop_size=args.data.block_size,
         sample_rate=args.data.sampling_rate,
@@ -72,7 +73,7 @@ def get_data_loaders(args, whole_audio=False):
         pin_memory=True if args.train.cache_device=='cpu' else False
     )
     data_valid = AudioDataset(
-        filelists_path = args.validation_files,
+        filelists = args.data.validation_files,
        waveform_sec=args.data.duration,
         hop_size=args.data.block_size,
         sample_rate=args.data.sampling_rate,
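Context for the `args.data.training_files` fix: the diffusion config is loaded into nested attribute-style objects, so the filelists sit under the data section rather than at the top level of args. A minimal sketch of that pattern (SimpleNamespace and the config path are stand-ins for illustration, not the repo's actual loader):

import yaml
from types import SimpleNamespace

def to_namespace(d):
    # recursively wrap dicts so cfg["data"]["training_files"] becomes cfg.data.training_files
    return SimpleNamespace(**{k: to_namespace(v) if isinstance(v, dict) else v
                              for k, v in d.items()})

with open("configs/diffusion.yaml") as f:   # illustrative path
    args = to_namespace(yaml.safe_load(f))
print(args.data.training_files)             # "filelists/train.txt"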
@@ -123,15 +124,15 @@ class AudioDataset(Dataset):
         else:
             print('Load the f0, volume data filelists:', filelists)
         with open(filelists,"r") as f:
-            self.paths = f.readlines()
+            self.paths = f.read().splitlines()
         for name_ext in tqdm(self.paths, total=len(self.paths)):
             name = os.path.splitext(name_ext)[0]
             path_audio = name_ext
             duration = librosa.get_duration(filename = path_audio, sr = self.sample_rate)
 
             path_f0 = name_ext + ".f0.npy"
-            f0 = np.load(path_f0)
-            f0 = torch.from_numpy(f0).float().unsqueeze(-1).to(device)
+            f0,_ = np.load(path_f0,allow_pickle=True)
+            f0 = torch.from_numpy(np.array(f0,dtype=float)).float().unsqueeze(-1).to(device)
 
             path_volume = name_ext + ".vol.npy"
             volume = np.load(path_volume)
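Two easy-to-miss fixes in this hunk. First, f.readlines() keeps the trailing newline on every path, so name_ext + ".f0.npy" would name a non-existent file; read().splitlines() strips it. Second, the .f0.npy files now hold an (f0, uv) pair saved as a pickled object array, hence allow_pickle=True plus tuple unpacking and the dtype conversion. A synthetic sketch (data and paths are illustrative):

import numpy as np

# readlines() keeps the newline; splitlines() strips it
line = "dataset/44k/speaker0/a.wav\n"
assert line.rstrip("\n") + ".f0.npy" == "dataset/44k/speaker0/a.wav.f0.npy"

# an (f0, uv) pair stored as an object array, e.g. via
# np.save(path, np.asanyarray((f0, uv), dtype=object)) on the preprocessing side
f0, uv = np.random.rand(250), np.ones(250, dtype=np.float32)
pair = np.asanyarray((f0, uv), dtype=object)
f0_back, _ = pair                           # what np.load(..., allow_pickle=True) yields
f0_arr = np.array(f0_back, dtype=float)     # object dtype -> plain float array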
@@ -169,8 +170,9 @@ class AudioDataset(Dataset):
 
             path_units = name_ext + ".soft.pt"
             units = torch.load(path_units).to(device)
+            units = units[0]
+            units = repeat_expand_2d(units,f0.size(0)).transpose(0,1)
-
 
             if fp16:
                 mel = mel.half()
                 aug_mel = aug_mel.half()
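Context for the new repeat_expand_2d lines (imported earlier in this file): the content units are stretched along time so each f0 frame has a matching feature vector. A behavioral sketch of that alignment, assuming nearest-neighbor repetition (the repo ships its own helper in utils.py; shapes here are illustrative):

import torch
import torch.nn.functional as F

def repeat_expand_2d_sketch(content: torch.Tensor, target_len: int) -> torch.Tensor:
    # content: [channels, src_frames] -> [channels, target_len], repeating frames
    return F.interpolate(content[None], size=target_len, mode="nearest")[0]

units = torch.randn(768, 100)                                   # e.g. content-encoder features
aligned = repeat_expand_2d_sketch(units, 250).transpose(0, 1)   # [250, 768], one row per f0 frame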
@@ -3,8 +3,8 @@ import time
 import numpy as np
 import torch
 import librosa
-from logger.saver import Saver
-from logger import utils
+from diffusion.logger.saver import Saver
+from diffusion.logger import utils
 from torch import autocast
 from torch.cuda.amp import GradScaler
 
@@ -34,10 +34,10 @@ def process_one(filename, hmodel,f0p,diff=False,mel_extractor=None):
     wav, sr = librosa.load(filename, sr=sampling_rate)
     audio_norm = torch.FloatTensor(wav)
     audio_norm = audio_norm.unsqueeze(0)
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
     soft_path = filename + ".soft.pt"
     if not os.path.exists(soft_path):
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         wav16k = librosa.resample(wav, orig_sr=sampling_rate, target_sr=16000)
         wav16k = torch.from_numpy(wav16k).to(device)
         c = hmodel.encoder(wav16k)
utils.py (10 changed lines)
@@ -19,7 +19,7 @@ from modules.commons import sequence_mask
 
 MATPLOTLIB_FLAG = False
 
-logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
+logging.basicConfig(stream=sys.stdout, level=logging.WARN)
 logger = logging
 
 f0_bin = 256
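Effect of the logging change: the root logger drops from DEBUG to WARN, so per-call debug chatter from any library that logs to the root logger is suppressed while warnings and errors still print. A quick check:

import logging
import sys

logging.basicConfig(stream=sys.stdout, level=logging.WARN)
logging.debug("per-step details")        # suppressed at WARN
logging.warning("something to act on")   # still printed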
@@ -415,10 +415,12 @@ class Volume_Extractor:
     def __init__(self, hop_size = 512):
         self.hop_size = hop_size
 
-    def extract(self, audio): # audio: 1d numpy array
-        n_frames = int(len(audio) // self.hop_size) + 1
+    def extract(self, audio): # audio: 2d tensor array
+        if not isinstance(audio, torch.Tensor):
+            audio = torch.Tensor(audio)
+        n_frames = int(audio.size(-1) // self.hop_size)
         audio2 = audio ** 2
         audio2 = torch.nn.functional.pad(audio2, (int(self.hop_size // 2), int((self.hop_size + 1) // 2)), mode = 'reflect')
-        volume = torch.FloatTensor([torch.mean(audio2[int(n * self.hop_size) : int((n + 1) * self.hop_size)]) for n in range(n_frames)])
+        volume = torch.FloatTensor([torch.mean(audio2[:,int(n * self.hop_size) : int((n + 1) * self.hop_size)]) for n in range(n_frames)])
         volume = torch.sqrt(volume)
         return volume
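Usage sketch for the rewritten extract(): it now expects a batched [1, T] tensor (anything else is converted), and the frame count is exactly T // hop_size with no trailing +1 frame. Synthetic input, the shapes are the point:

import torch

hop = 512
audio = torch.randn(1, 44100)              # [batch, samples], one second at 44.1 kHz
n_frames = audio.size(-1) // hop           # 86 frames under the new formula
audio2 = torch.nn.functional.pad(audio ** 2, (hop // 2, (hop + 1) // 2), mode='reflect')
volume = torch.sqrt(torch.FloatTensor(
    [torch.mean(audio2[:, n * hop:(n + 1) * hop]) for n in range(n_frames)]))
print(volume.shape)                        # torch.Size([86])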