ylzz1997 2023-05-17 01:10:43 +08:00
parent 7fbca2ee1f
commit 8cc7645379
6 changed files with 18 additions and 15 deletions

View File

@@ -7,7 +7,6 @@ data:
   encoder_sample_rate: 16000
   encoder_hop_size: 320
   encoder_out_channels: 768 # 256 if using 'hubertsoft'
-  train_path: dataset/44k # Create a folder named "audio" under this path and put the audio clip in it
   training_files: "filelists/train.txt"
   validation_files: "filelists/val.txt"
   extensions: # List of extension included in the data collection
@@ -27,7 +26,7 @@ infer:
   speedup: 10
   method: 'dpm-solver' # 'pndm' or 'dpm-solver'
 env:
-  expdir: exp/diffusion-test
+  expdir: logs/44k/diffusion
   gpu_id: 0
 train:
   num_workers: 2 # If your cpu and gpu are both very strong, set to 0 may be faster!
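Both hunks above touch the diffusion config: `train_path` goes away in favor of the explicit `training_files`/`validation_files` filelists, and `expdir` moves under `logs/44k/` so diffusion checkpoints sit next to the main model's outputs. A minimal sketch of reading these keys with plain `yaml.safe_load` (the config path is an assumption; judging by the `args.data.training_files` access in the loader below, the repo's own `diffusion.logger.utils` presumably wraps the dict for attribute access):

```python
import yaml

# Minimal sketch, assuming a plain-YAML read of the config shown above.
with open("configs/diffusion.yaml") as f:   # path is an assumption
    args = yaml.safe_load(f)

print(args["env"]["expdir"])             # logs/44k/diffusion
print(args["data"]["training_files"])    # filelists/train.txt
print(args["data"]["validation_files"])  # filelists/val.txt
```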

diffusion/__init__.py Normal file (empty)
View File

View File

@@ -5,6 +5,7 @@ import numpy as np
 import librosa
 import torch
 import random
+from utils import repeat_expand_2d
 from tqdm import tqdm
 from torch.utils.data import Dataset
@@ -51,7 +52,7 @@ def traverse_dir(
 def get_data_loaders(args, whole_audio=False):
     data_train = AudioDataset(
-        filelists_path = args.training_files,
+        filelists = args.data.training_files,
         waveform_sec=args.data.duration,
         hop_size=args.data.block_size,
         sample_rate=args.data.sampling_rate,
@@ -72,7 +73,7 @@ def get_data_loaders(args, whole_audio=False):
         pin_memory=True if args.train.cache_device=='cpu' else False
     )
     data_valid = AudioDataset(
-        filelists_path = args.validation_files,
+        filelists = args.data.validation_files,
         waveform_sec=args.data.duration,
         hop_size=args.data.block_size,
         sample_rate=args.data.sampling_rate,
@@ -123,15 +124,15 @@ class AudioDataset(Dataset):
         else:
             print('Load the f0, volume data filelists:', filelists)
             with open(filelists,"r") as f:
-                self.paths = f.readlines()
+                self.paths = f.read().splitlines()
         for name_ext in tqdm(self.paths, total=len(self.paths)):
             name = os.path.splitext(name_ext)[0]
             path_audio = name_ext
             duration = librosa.get_duration(filename = path_audio, sr = self.sample_rate)
             path_f0 = name_ext + ".f0.npy"
-            f0 = np.load(path_f0)
-            f0 = torch.from_numpy(f0).float().unsqueeze(-1).to(device)
+            f0,_ = np.load(path_f0,allow_pickle=True)
+            f0 = torch.from_numpy(np.array(f0,dtype=float)).float().unsqueeze(-1).to(device)
             path_volume = name_ext + ".vol.npy"
             volume = np.load(path_volume)
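The `.f0.npy` files are now expected to hold a pickled pair rather than a bare array, with the second element discarded here (a voiced/unvoiced mask would be the usual candidate, but that is an inference, not something this diff shows). A sketch of a writer that produces files this reader accepts:

```python
import numpy as np

f0 = np.random.rand(812).astype(np.float32)  # stand-in per-frame pitch values
uv = (f0 > 0.5).astype(np.float32)           # hypothetical second element (e.g. a voiced mask)

# A tuple of arrays becomes an object-dtype array, which is why the loader
# above needs allow_pickle=True and the np.array(..., dtype=float) cast.
np.save("clip.wav.f0.npy", np.asanyarray((f0, uv), dtype=object))

f0_loaded, _ = np.load("clip.wav.f0.npy", allow_pickle=True)
assert np.allclose(np.array(f0_loaded, dtype=float), f0)
```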
@@ -169,8 +170,9 @@ class AudioDataset(Dataset):
             path_units = name_ext + ".soft.pt"
             units = torch.load(path_units).to(device)
             units = units[0]
+            units = repeat_expand_2d(units,f0.size(0)).transpose(0,1)
             if fp16:
                 mel = mel.half()
                 aug_mel = aug_mel.half()
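`units` comes from the HuBERT-style encoder at a coarser frame rate than f0, so `repeat_expand_2d` stretches the [channels, frames] feature matrix to `f0.size(0)` frames before the transpose. The helper itself is not shown in this diff; a nearest-neighbor sketch of the idea (not necessarily the exact interpolation `utils.py` uses):

```python
import torch

def repeat_expand_2d_sketch(content: torch.Tensor, target_len: int) -> torch.Tensor:
    # Stretch [channels, src_len] -> [channels, target_len] by repeating
    # the nearest source frame for each target frame.
    src_len = content.shape[-1]
    idx = (torch.arange(target_len) * src_len) // target_len
    return content[:, idx.clamp(max=src_len - 1)]

units = torch.randn(768, 406)                  # stand-in encoder features
aligned = repeat_expand_2d_sketch(units, 812)  # match 812 f0 frames
print(aligned.transpose(0, 1).shape)           # torch.Size([812, 768])
```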

View File

@@ -3,8 +3,8 @@ import time
 import numpy as np
 import torch
 import librosa
-from logger.saver import Saver
-from logger import utils
+from diffusion.logger.saver import Saver
+from diffusion.logger import utils
 from torch import autocast
 from torch.cuda.amp import GradScaler
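Qualifying the `logger` imports with the `diffusion` package (together with the new empty `diffusion/__init__.py` above) lets this trainer be imported from the repository root rather than only with `diffusion/` on `sys.path`. A quick check of that assumption, run from the repo root:

```python
import importlib

# Module paths follow the layout implied by the new imports;
# they are an inference from this diff, not independently verified.
for mod in ("diffusion", "diffusion.logger", "diffusion.logger.saver"):
    importlib.import_module(mod)  # raises ModuleNotFoundError if the layout differs
print("diffusion.* importable from the repository root")
```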

View File

@@ -34,10 +34,10 @@ def process_one(filename, hmodel,f0p,diff=False,mel_extractor=None):
     wav, sr = librosa.load(filename, sr=sampling_rate)
     audio_norm = torch.FloatTensor(wav)
     audio_norm = audio_norm.unsqueeze(0)
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     soft_path = filename + ".soft.pt"
     if not os.path.exists(soft_path):
-        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         wav16k = librosa.resample(wav, orig_sr=sampling_rate, target_sr=16000)
         wav16k = torch.from_numpy(wav16k).to(device)
         c = hmodel.encoder(wav16k)
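Hoisting `device` out of the cache check fixes a latent NameError: when `.soft.pt` already exists, the old code skipped the assignment, yet later steps in `process_one` still move tensors to `device`. A sketch of the repaired control flow, with stand-ins for the parts outside this hunk:

```python
import os
import torch

def process_one_sketch(filename: str) -> None:
    # Bound unconditionally (as in this commit), so every branch below may use it.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    soft_path = filename + ".soft.pt"
    if not os.path.exists(soft_path):
        units = torch.randn(1, 768, device=device)  # stand-in for hmodel.encoder(wav16k)
        torch.save(units, soft_path)

    # Previously this line could hit an unbound `device` on a cache hit:
    f0 = torch.zeros(100, device=device)            # stand-in for f0 extraction
```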

View File

@@ -19,7 +19,7 @@ from modules.commons import sequence_mask
 MATPLOTLIB_FLAG = False
-logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
+logging.basicConfig(stream=sys.stdout, level=logging.WARN)
 logger = logging
 f0_bin = 256
@@ -415,10 +415,12 @@ class Volume_Extractor:
     def __init__(self, hop_size = 512):
         self.hop_size = hop_size
-    def extract(self, audio): # audio: 1d numpy array
-        n_frames = int(len(audio) // self.hop_size) + 1
+    def extract(self, audio): # audio: 2d tensor array
+        if not isinstance(audio,torch.Tensor):
+            audio = torch.Tensor(audio)
+        n_frames = int(audio.size(-1) // self.hop_size)
         audio2 = audio ** 2
         audio2 = torch.nn.functional.pad(audio2, (int(self.hop_size // 2), int((self.hop_size + 1) // 2)), mode = 'reflect')
-        volume = torch.FloatTensor([torch.mean(audio2[int(n * self.hop_size) : int((n + 1) * self.hop_size)]) for n in range(n_frames)])
+        volume = torch.FloatTensor([torch.mean(audio2[:,int(n * self.hop_size) : int((n + 1) * self.hop_size)]) for n in range(n_frames)])
         volume = torch.sqrt(volume)
         return volume
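`Volume_Extractor.extract` now takes a 2-D `[batch, samples]` tensor (non-tensor input is converted by the `isinstance` guard): frames are counted along the last axis as `T // hop_size`, dropping the old `+ 1`, the reflect padding spreads `hop_size` samples across both ends, and each frame's volume is the RMS over the batch slice. A quick usage sketch:

```python
import torch
from utils import Volume_Extractor  # the class patched above

extractor = Volume_Extractor(hop_size=512)
audio = torch.randn(1, 44100)      # [batch, samples], ~1 s at 44.1 kHz
volume = extractor.extract(audio)  # per-frame RMS, shape [44100 // 512] == [86]
assert volume.shape == (audio.size(-1) // 512,)
```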