Encoder OOP

This commit is contained in:
ylzz1997 2023-05-13 23:45:56 +08:00
parent 9fa9490e53
commit c8245f3f68
15 changed files with 106 additions and 9 deletions

View File

@ -54,7 +54,9 @@
"use_spectral_norm": false, "use_spectral_norm": false,
"gin_channels": 768, "gin_channels": 768,
"ssl_dim": 768, "ssl_dim": 768,
"n_speakers": 200 "n_speakers": 200,
"speech_encoder":"vec768l12",
"speaker_embedding":false
}, },
"spk": { "spk": {
"nyaru": 0, "nyaru": 0,

View File

@ -65,8 +65,8 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
spk = filename.split("/")[-2] spk = filename.split("/")[-2]
spk = torch.LongTensor([self.spk_map[spk]]) spk = torch.LongTensor([self.spk_map[spk]])
f0 = np.load(filename + ".f0.npy") f0, uv = np.load(filename + ".f0.npy",allow_pickle=True)
f0, uv = utils.interpolate_f0(f0)
f0 = torch.FloatTensor(f0) f0 = torch.FloatTensor(f0)
uv = torch.FloatTensor(uv) uv = torch.FloatTensor(uv)

View File

@ -11,13 +11,11 @@ import gc
import librosa import librosa
import numpy as np import numpy as np
# import onnxruntime # import onnxruntime
import parselmouth
import soundfile import soundfile
import torch import torch
import torchaudio import torchaudio
import cluster import cluster
from hubert import hubert_model
import utils import utils
from models import SynthesizerTrn from models import SynthesizerTrn

View File

@ -44,7 +44,7 @@ class HarvestF0Predictor(F0Predictor):
def compute_f0_uv(self,wav,p_len=None): def compute_f0_uv(self,wav,p_len=None):
if p_len is None: if p_len is None:
p_len = wav.shape[0]//self.hop_length p_len = wav.shape[0]//self.hop_length
f0, t = pyworld.dio( f0, t = pyworld.harvest(
wav.astype(np.double), wav.astype(np.double),
fs=self.sampling_rate, fs=self.sampling_rate,
f0_floor=self.f0_min, f0_floor=self.f0_min,

View File

@ -36,10 +36,11 @@ def process_one(filename, hmodel):
if not os.path.exists(f0_path): if not os.path.exists(f0_path):
from modules.F0Predictor.DioF0Predictor import DioF0Predictor from modules.F0Predictor.DioF0Predictor import DioF0Predictor
f0_predictor = DioF0Predictor(sampling_rate=sampling_rate, hop_length=hop_length) f0_predictor = DioF0Predictor(sampling_rate=sampling_rate, hop_length=hop_length)
f0 = f0_predictor.compute_f0( f0,uv = f0_predictor.compute_f0_uv(
wav wav
) )
np.save(f0_path, f0) np.save(f0_path, np.asanyarray((f0,uv),dtype=object))
spec_path = filename.replace(".wav", ".spec.pt") spec_path = filename.replace(".wav", ".spec.pt")
if not os.path.exists(spec_path): if not os.path.exists(spec_path):

View File

@ -16,7 +16,6 @@ from scipy.io.wavfile import read
import torch import torch
from torch.nn import functional as F from torch.nn import functional as F
from modules.commons import sequence_mask from modules.commons import sequence_mask
from hubert import hubert_model
MATPLOTLIB_FLAG = False MATPLOTLIB_FLAG = False

View File

@ -0,0 +1,31 @@
from vencoder.encoder import SpeechEncoder
import torch
from fairseq import checkpoint_utils
class ContentVec256L9(SpeechEncoder):
    """Speech encoder that takes layer-9 features from a fairseq checkpoint
    and passes them through the model's ``final_proj``.

    ``hidden_dim`` is set to 256.
    """

    def __init__(self, vec_path="pretrain/checkpoint_best_legacy_500.pt"):
        """Load the checkpoint at ``vec_path`` and put the model in eval mode.

        Only the first model of the loaded ensemble is kept.
        """
        print("load model(s) from {}".format(vec_path))
        ensemble, _cfg, _task = checkpoint_utils.load_model_ensemble_and_task(
            [vec_path],
            suffix="",
        )
        self.model = ensemble[0]
        self.hidden_dim = 256
        self.model.eval()

    def encoder(self, wav):
        """Extract features from ``wav`` without tracking gradients.

        ``wav`` must be a 1-D tensor, or a 2-D tensor that is averaged over
        its last dimension first. The projected features are returned with
        dimensions 1 and 2 swapped.
        """
        signal = wav
        if signal.dim() == 2:
            # Two-channel input: collapse the last axis to a single signal.
            signal = signal.mean(-1)
        assert signal.dim() == 1, signal.dim()

        # Add a leading batch axis of size 1.
        signal = signal.view(1, -1)
        # Nothing is padded, so the mask is all False.
        mask = torch.zeros(signal.shape, dtype=torch.bool)

        with torch.no_grad():
            extracted = self.model.extract_features(
                source=signal.to(wav.device),
                padding_mask=mask.to(wav.device),
                output_layer=9,  # layer 9
            )
            projected = self.model.final_proj(extracted[0])
        return projected.transpose(1, 2)

View File

@ -0,0 +1,30 @@
from vencoder.encoder import SpeechEncoder
import torch
class ContentVec768L12(SpeechEncoder):
    """Speech encoder that returns layer-12 features from a fairseq checkpoint.

    ``hidden_dim`` is set to 768; unlike the layer-9 variant, no final
    projection is applied.
    """

    def __init__(self, vec_path="pretrain/checkpoint_best_legacy_500.pt"):
        """Load the checkpoint at ``vec_path`` and put the model in eval mode.

        Only the first model of the loaded ensemble is kept.
        """
        print("load model(s) from {}".format(vec_path))
        # Imported here, as in the original, so fairseq is only needed when
        # this encoder is actually constructed.
        from fairseq import checkpoint_utils

        ensemble, _cfg, _task = checkpoint_utils.load_model_ensemble_and_task(
            [vec_path],
            suffix="",
        )
        self.model = ensemble[0]
        self.hidden_dim = 768
        self.model.eval()

    def encoder(self, wav):
        """Extract features from ``wav`` without tracking gradients.

        ``wav`` must be a 1-D tensor, or a 2-D tensor that is averaged over
        its last dimension first. The first element of the model's
        ``extract_features`` result is returned with dimensions 1 and 2
        swapped.
        """
        signal = wav
        if signal.dim() == 2:
            # Two-channel input: collapse the last axis to a single signal.
            signal = signal.mean(-1)
        assert signal.dim() == 1, signal.dim()

        # Add a leading batch axis of size 1.
        signal = signal.view(1, -1)
        # Nothing is padded, so the mask is all False.
        mask = torch.zeros(signal.shape, dtype=torch.bool)

        with torch.no_grad():
            extracted = self.model.extract_features(
                source=signal.to(wav.device),
                padding_mask=mask.to(wav.device),
                output_layer=12,  # layer 12
            )
        return extracted[0].transpose(1, 2)

24
vencoder/HuberSoft.py Normal file
View File

@ -0,0 +1,24 @@
from vencoder.encoder import SpeechEncoder
import torch
from vencoder.hubert import hubert_model
class Hubersoft(SpeechEncoder):
    """Speech encoder backed by the ``hubert_soft`` model.

    ``hidden_dim`` is set to 256 and the model is moved to ``self.dev``.
    """

    def __init__(self, vec_path="pretrain/hubert-soft-0d54a1f4.pt", device=None):
        """Load the soft-HuBERT checkpoint and place it on a device.

        Args:
            vec_path: Path of the checkpoint to load.
            device: Anything accepted by ``torch.device``. When ``None``,
                CUDA is used if available, otherwise the CPU.
        """
        print("load model(s) from {}".format(vec_path))
        # Load from the path the caller passed (and that the line above
        # reports) rather than a hard-coded location that ignored `vec_path`.
        hubert_soft = hubert_model.hubert_soft(vec_path)
        if device is None:
            self.dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        else:
            self.dev = torch.device(device)
        self.hidden_dim = 256
        self.model = hubert_soft.to(self.dev)
        # No return value: `__init__` must return None, and returning the
        # model made every instantiation raise TypeError.

    def encoder(self, wav):
        """Compute soft units for ``wav`` under ``torch.inference_mode``.

        ``wav`` must be a 1-D tensor, or a 2-D tensor that is averaged over
        its last dimension first. The units are returned with dimensions 1
        and 2 swapped.
        """
        feats = wav
        if feats.dim() == 2:  # double channels
            feats = feats.mean(-1)
        assert feats.dim() == 1, feats.dim()
        # Add a leading batch axis of size 1.
        feats = feats.view(1, -1)
        # NOTE(review): `feats` is not moved to `self.dev`, so `wav` presumably
        # has to be on the model's device already -- confirm with callers.
        # NOTE(review): confirm that `units` accepts a (1, T) input.
        with torch.inference_mode():
            units = self.model.units(feats)
        return units.transpose(1, 2)

12
vencoder/encoder.py Normal file
View File

@ -0,0 +1,12 @@
class SpeechEncoder(object):
    """Base class for speech encoders.

    Holds a ``model`` attribute and the feature width ``hidden_dim``;
    ``encoder`` is a placeholder for subclasses to override.
    """

    def __init__(self, vec_path="pretrain/checkpoint_best_legacy_500.pt"):
        """Initialise placeholder attributes.

        ``vec_path`` is accepted for signature parity with subclasses but is
        not used here.
        """
        self.hidden_dim = 768
        # The underlying model; the base class has none.
        self.model = None

    def encoder(self, wav):
        """Encode a waveform; the base implementation returns ``None``.

        Intended contract, as originally documented:
            input:  wav       [batchsize, signal_length]
            output: embedding [batchsize, wav_frame, hidden_dim]

        NOTE(review): the subclasses added alongside this class finish with
        ``transpose(1, 2)`` -- confirm the output axis order stated above.
        """
        return None

View File