mirror of
https://github.com/svc-develop-team/so-vits-svc.git
synced 2025-01-08 11:57:43 +08:00
Encoder OOP
This commit is contained in:
parent
9fa9490e53
commit
c8245f3f68
@ -54,7 +54,9 @@
|
||||
"use_spectral_norm": false,
|
||||
"gin_channels": 768,
|
||||
"ssl_dim": 768,
|
||||
"n_speakers": 200
|
||||
"n_speakers": 200,
|
||||
"speech_encoder":"vec768l12",
|
||||
"speaker_embedding":false
|
||||
},
|
||||
"spk": {
|
||||
"nyaru": 0,
|
||||
|
@ -65,8 +65,8 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
|
||||
spk = filename.split("/")[-2]
|
||||
spk = torch.LongTensor([self.spk_map[spk]])
|
||||
|
||||
f0 = np.load(filename + ".f0.npy")
|
||||
f0, uv = utils.interpolate_f0(f0)
|
||||
f0, uv = np.load(filename + ".f0.npy",allow_pickle=True)
|
||||
|
||||
f0 = torch.FloatTensor(f0)
|
||||
uv = torch.FloatTensor(uv)
|
||||
|
||||
|
@ -11,13 +11,11 @@ import gc
|
||||
import librosa
|
||||
import numpy as np
|
||||
# import onnxruntime
|
||||
import parselmouth
|
||||
import soundfile
|
||||
import torch
|
||||
import torchaudio
|
||||
|
||||
import cluster
|
||||
from hubert import hubert_model
|
||||
import utils
|
||||
from models import SynthesizerTrn
|
||||
|
||||
|
@ -44,7 +44,7 @@ class HarvestF0Predictor(F0Predictor):
|
||||
def compute_f0_uv(self,wav,p_len=None):
|
||||
if p_len is None:
|
||||
p_len = wav.shape[0]//self.hop_length
|
||||
f0, t = pyworld.dio(
|
||||
f0, t = pyworld.harvest(
|
||||
wav.astype(np.double),
|
||||
fs=self.sampling_rate,
|
||||
f0_floor=self.f0_min,
|
||||
|
@ -36,10 +36,11 @@ def process_one(filename, hmodel):
|
||||
if not os.path.exists(f0_path):
|
||||
from modules.F0Predictor.DioF0Predictor import DioF0Predictor
|
||||
f0_predictor = DioF0Predictor(sampling_rate=sampling_rate, hop_length=hop_length)
|
||||
f0 = f0_predictor.compute_f0(
|
||||
f0,uv = f0_predictor.compute_f0_uv(
|
||||
wav
|
||||
)
|
||||
np.save(f0_path, f0)
|
||||
np.save(f0_path, np.asanyarray((f0,uv),dtype=object))
|
||||
|
||||
|
||||
spec_path = filename.replace(".wav", ".spec.pt")
|
||||
if not os.path.exists(spec_path):
|
||||
|
1
utils.py
1
utils.py
@ -16,7 +16,6 @@ from scipy.io.wavfile import read
|
||||
import torch
|
||||
from torch.nn import functional as F
|
||||
from modules.commons import sequence_mask
|
||||
from hubert import hubert_model
|
||||
|
||||
MATPLOTLIB_FLAG = False
|
||||
|
||||
|
31
vencoder/ContentVec256L9.py
Normal file
31
vencoder/ContentVec256L9.py
Normal file
@ -0,0 +1,31 @@
|
||||
from vencoder.encoder import SpeechEncoder
|
||||
import torch
|
||||
from fairseq import checkpoint_utils
|
||||
|
||||
class ContentVec256L9(SpeechEncoder):
    """Speech encoder backed by a fairseq checkpoint, read at output layer 9.

    After construction ``self.model`` holds the first model of the loaded
    ensemble (switched to eval mode) and ``self.hidden_dim`` is 256.
    """

    def __init__(self, vec_path="pretrain/checkpoint_best_legacy_500.pt"):
        """Load the checkpoint at ``vec_path`` through fairseq.

        Args:
            vec_path: Path of the checkpoint file handed to
                ``checkpoint_utils.load_model_ensemble_and_task``.
        """
        print("load model(s) from {}".format(vec_path))
        # Only the first model of the ensemble is used; the saved config and
        # task objects returned alongside it are not needed here.
        ensemble, _saved_cfg, _task = checkpoint_utils.load_model_ensemble_and_task(
            [vec_path],
            suffix="",
        )
        self.hidden_dim = 256
        self.model = ensemble[0]
        self.model.eval()

    def encoder(self, wav):
        """Encode a waveform tensor into projected layer-9 features.

        Args:
            wav: 1-D tensor, or 2-D tensor that is averaged over its last
                axis first (the original code labels this case
                "double channels").

        Returns:
            The ``final_proj`` output with dimensions 1 and 2 swapped.
            NOTE(review): presumably (1, hidden_dim, frames) - confirm
            against the model's actual output layout.
        """
        mono = wav.mean(-1) if wav.dim() == 2 else wav
        assert mono.dim() == 1, mono.dim()

        # Add a leading batch axis of size one.
        batch = mono.view(1, -1)
        # All-False mask: no position of the single sequence is padding.
        no_padding = torch.BoolTensor(batch.shape).fill_(False)

        with torch.no_grad():
            extracted = self.model.extract_features(
                source=batch.to(wav.device),
                padding_mask=no_padding.to(wav.device),
                output_layer=9,  # layer 9
            )
            projected = self.model.final_proj(extracted[0])
        return projected.transpose(1, 2)
|
30
vencoder/ContentVec768L12.py
Normal file
30
vencoder/ContentVec768L12.py
Normal file
@ -0,0 +1,30 @@
|
||||
from vencoder.encoder import SpeechEncoder
|
||||
import torch
|
||||
|
||||
class ContentVec768L12(SpeechEncoder):
    """Speech encoder backed by a fairseq checkpoint, read at output layer 12.

    After construction ``self.model`` holds the first model of the loaded
    ensemble (switched to eval mode) and ``self.hidden_dim`` is 768.
    """

    def __init__(self, vec_path="pretrain/checkpoint_best_legacy_500.pt"):
        """Load the checkpoint at ``vec_path`` through fairseq.

        Args:
            vec_path: Path of the checkpoint file handed to
                ``checkpoint_utils.load_model_ensemble_and_task``.
        """
        print("load model(s) from {}".format(vec_path))
        # fairseq is imported lazily, only when this encoder is instantiated.
        from fairseq import checkpoint_utils

        # Only the first model of the ensemble is used; the saved config and
        # task objects returned alongside it are not needed here.
        ensemble, _saved_cfg, _task = checkpoint_utils.load_model_ensemble_and_task(
            [vec_path],
            suffix="",
        )
        self.hidden_dim = 768
        self.model = ensemble[0]
        self.model.eval()

    def encoder(self, wav):
        """Encode a waveform tensor into layer-12 features.

        Unlike the 256-dimensional variant, no ``final_proj`` is applied;
        the first element of ``extract_features``' result is returned
        directly (after the transpose).

        Args:
            wav: 1-D tensor, or 2-D tensor that is averaged over its last
                axis first (the original code labels this case
                "double channels").

        Returns:
            The extracted features with dimensions 1 and 2 swapped.
            NOTE(review): presumably (1, hidden_dim, frames) - confirm
            against the model's actual output layout.
        """
        mono = wav.mean(-1) if wav.dim() == 2 else wav
        assert mono.dim() == 1, mono.dim()

        # Add a leading batch axis of size one.
        batch = mono.view(1, -1)
        # All-False mask: no position of the single sequence is padding.
        no_padding = torch.BoolTensor(batch.shape).fill_(False)

        with torch.no_grad():
            extracted = self.model.extract_features(
                source=batch.to(wav.device),
                padding_mask=no_padding.to(wav.device),
                output_layer=12,  # layer 12
            )
        return extracted[0].transpose(1, 2)
|
24
vencoder/HuberSoft.py
Normal file
24
vencoder/HuberSoft.py
Normal file
@ -0,0 +1,24 @@
|
||||
from vencoder.encoder import SpeechEncoder
|
||||
import torch
|
||||
from vencoder.hubert import hubert_model
|
||||
class Hubersoft(SpeechEncoder):
    """Speech encoder that wraps the ``hubert_soft`` model.

    After construction ``self.dev`` is the torch device in use,
    ``self.model`` is the loaded network moved to that device, and
    ``self.hidden_dim`` is 256.
    """

    def __init__(self, vec_path="pretrain/hubert-soft-0d54a1f4.pt", device=None):
        """Load the checkpoint and place the model on a device.

        Args:
            vec_path: Path of the checkpoint to load.
            device: Anything accepted by ``torch.device``. When ``None``,
                CUDA is used if available, otherwise the CPU.
        """
        print("load model(s) from {}".format(vec_path))
        # Load the checkpoint the caller asked for (the same path the message
        # above reports) instead of a separate hard-coded location.
        hubert_soft = hubert_model.hubert_soft(vec_path)
        if device is None:
            self.dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        else:
            self.dev = torch.device(device)
        self.hidden_dim = 256
        self.model = hubert_soft.to(self.dev)
        # Deliberately no return value: __init__ must return None, otherwise
        # every instantiation raises TypeError.

    def encoder(self, wav):
        """Encode a waveform tensor into soft units.

        Args:
            wav: 1-D tensor, or 2-D tensor that is averaged over its last
                axis first (the original code labels this case
                "double channels").

        Returns:
            The result of ``self.model.units`` with dimensions 1 and 2
            swapped.
        """
        feats = wav
        if feats.dim() == 2:  # double channels
            feats = feats.mean(-1)
        assert feats.dim() == 1, feats.dim()
        # NOTE(review): this passes a (1, samples) tensor; confirm whether
        # model.units expects an explicit channel axis, i.e. (1, 1, samples).
        feats = feats.view(1, -1)
        with torch.inference_mode():
            units = self.model.units(feats)
        return units.transpose(1, 2)
|
12
vencoder/encoder.py
Normal file
12
vencoder/encoder.py
Normal file
@ -0,0 +1,12 @@
|
||||
class SpeechEncoder(object):
    """Base class for the speech encoders in the ``vencoder`` package.

    Subclasses are expected to replace ``model`` with a loaded network, set
    ``hidden_dim`` to its feature width, and override ``encoder``.
    """

    def __init__(self, vec_path="pretrain/checkpoint_best_legacy_500.pt"):
        """Set placeholder attributes; ``vec_path`` is not used here.

        Args:
            vec_path: Checkpoint path, accepted so that subclasses share the
                same constructor signature.
        """
        self.hidden_dim = 768
        self.model = None  # the wrapped model; populated by subclasses

    def encoder(self, wav):
        """Turn a waveform into an embedding; the base version does nothing.

        Documented contract:
            input:  wav [batchsize, signal_length]
            output: embedding [batchsize, wav_frame, hidden_dim]

        NOTE(review): the concrete encoders added alongside this class
        average a 2-D input over its last axis and return their features
        after ``transpose(1, 2)`` - confirm which layout is intended.

        Returns:
            None in this base implementation.
        """
        return None
|
0
vencoder/hubert/__init__.py
Normal file
0
vencoder/hubert/__init__.py
Normal file
Loading…
Reference in New Issue
Block a user