Mirror of https://github.com/svc-develop-team/so-vits-svc.git
Synced 2025-01-09 04:27:31 +08:00

Encoder OOP

This commit is contained in:
parent 9fa9490e53
commit c8245f3f68
@@ -54,7 +54,9 @@
     "use_spectral_norm": false,
     "gin_channels": 768,
     "ssl_dim": 768,
-    "n_speakers": 200
+    "n_speakers": 200,
+    "speech_encoder":"vec768l12",
+    "speaker_embedding":false
   },
   "spk": {
     "nyaru": 0,
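The commit itself only adds these two keys. A minimal sketch of how a consumer might dispatch on them, assuming the keys sit in the config's "model" block (as the surrounding context suggests) and that the config file lives at the usual configs/ path; the "vec256l9" value and the dispatch itself are hypothetical, mirroring the vencoder/ classes added below:

import json

# Hypothetical consumer of the two new config keys; a sketch, not the repo's loader.
with open("configs/config.json") as f:
    hps = json.load(f)

name = hps["model"].get("speech_encoder", "vec768l12")
use_spk_emb = hps["model"].get("speaker_embedding", False)

if name == "vec768l12":
    from vencoder.ContentVec768L12 import ContentVec768L12 as Encoder
elif name == "vec256l9":  # assumed name for the 256-dim variant
    from vencoder.ContentVec256L9 import ContentVec256L9 as Encoder
else:
    raise ValueError(f"unknown speech_encoder: {name}")

speech_encoder = Encoder()  # its hidden_dim should then agree with ssl_dim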
@@ -65,8 +65,8 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
         spk = filename.split("/")[-2]
         spk = torch.LongTensor([self.spk_map[spk]])

-        f0 = np.load(filename + ".f0.npy")
-        f0, uv = utils.interpolate_f0(f0)
+        f0, uv = np.load(filename + ".f0.npy", allow_pickle=True)
+
         f0 = torch.FloatTensor(f0)
         uv = torch.FloatTensor(uv)

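NumPy refuses to load the new .f0.npy files without allow_pickle=True: packing the (f0, uv) pair into a single array forces dtype=object, and object arrays round-trip through pickle. A minimal sketch of the new on-disk format (shapes are illustrative; real files are written by preprocessing, see below):

import numpy as np

f0 = np.zeros(100, dtype=np.float32)   # per-frame pitch
uv = np.ones(100, dtype=np.float32)    # per-frame voiced/unvoiced flag

np.save("demo.f0.npy", np.asanyarray((f0, uv), dtype=object))

# allow_pickle=True is required because the saved array holds Python objects.
f0_loaded, uv_loaded = np.load("demo.f0.npy", allow_pickle=True)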
@@ -11,13 +11,11 @@ import gc
 import librosa
 import numpy as np
 # import onnxruntime
-import parselmouth
 import soundfile
 import torch
 import torchaudio

 import cluster
-from hubert import hubert_model
 import utils
 from models import SynthesizerTrn

@@ -44,7 +44,7 @@ class HarvestF0Predictor(F0Predictor):
     def compute_f0_uv(self, wav, p_len=None):
         if p_len is None:
             p_len = wav.shape[0] // self.hop_length
-        f0, t = pyworld.dio(
+        f0, t = pyworld.harvest(
             wav.astype(np.double),
             fs=self.sampling_rate,
             f0_floor=self.f0_min,
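This fixes HarvestF0Predictor, which was mistakenly calling WORLD's DIO algorithm. pyworld.dio and pyworld.harvest share the same keyword interface and both return (f0, t), so the correction is a drop-in one-word change. A standalone sketch of the corrected call (the sample values and the frame_period arithmetic are illustrative assumptions, not taken from this file):

import numpy as np
import pyworld

sampling_rate, hop_length = 44100, 512
wav = np.random.randn(sampling_rate).astype(np.double)  # 1 s stand-in signal

f0, t = pyworld.harvest(
    wav,
    fs=sampling_rate,
    f0_floor=50.0,                                   # assumed self.f0_min
    f0_ceil=1100.0,                                  # assumed self.f0_max
    frame_period=1000 * hop_length / sampling_rate,  # hop size in ms (assumed)
)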
@@ -36,10 +36,11 @@ def process_one(filename, hmodel):
     if not os.path.exists(f0_path):
         from modules.F0Predictor.DioF0Predictor import DioF0Predictor
         f0_predictor = DioF0Predictor(sampling_rate=sampling_rate, hop_length=hop_length)
-        f0 = f0_predictor.compute_f0(
+        f0, uv = f0_predictor.compute_f0_uv(
             wav
         )
-        np.save(f0_path, f0)
+        np.save(f0_path, np.asanyarray((f0, uv), dtype=object))
+

     spec_path = filename.replace(".wav", ".spec.pt")
     if not os.path.exists(spec_path):
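Preprocessing now asks the predictor for pitch and voicing in one pass and caches both, which is what lets the data loader above drop utils.interpolate_f0. A hedged sketch of the new contract (values illustrative; the frame-alignment assert is an assumed invariant, not something the commit checks):

import numpy as np
from modules.F0Predictor.DioF0Predictor import DioF0Predictor

wav = np.random.randn(44100).astype(np.float32)  # stand-in waveform
predictor = DioF0Predictor(sampling_rate=44100, hop_length=512)

f0, uv = predictor.compute_f0_uv(wav)            # pitch + voicing together
assert len(f0) == len(uv)                        # assumed: one uv flag per f0 frame
np.save("demo.wav.f0.npy", np.asanyarray((f0, uv), dtype=object))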
utils.py
@@ -16,7 +16,6 @@ from scipy.io.wavfile import read
 import torch
 from torch.nn import functional as F
 from modules.commons import sequence_mask
-from hubert import hubert_model

 MATPLOTLIB_FLAG = False

vencoder/ContentVec256L9.py (new file)
@@ -0,0 +1,31 @@
+from vencoder.encoder import SpeechEncoder
+import torch
+from fairseq import checkpoint_utils
+
+class ContentVec256L9(SpeechEncoder):
+    def __init__(self, vec_path="pretrain/checkpoint_best_legacy_500.pt"):
+        print("load model(s) from {}".format(vec_path))
+        models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
+            [vec_path],
+            suffix="",
+        )
+        self.hidden_dim = 256
+        self.model = models[0]
+        self.model.eval()
+
+    def encoder(self, wav):
+        feats = wav
+        if feats.dim() == 2:  # double channels
+            feats = feats.mean(-1)
+        assert feats.dim() == 1, feats.dim()
+        feats = feats.view(1, -1)
+        padding_mask = torch.BoolTensor(feats.shape).fill_(False)
+        inputs = {
+            "source": feats.to(wav.device),
+            "padding_mask": padding_mask.to(wav.device),
+            "output_layer": 9,  # layer 9
+        }
+        with torch.no_grad():
+            logits = self.model.extract_features(**inputs)
+            feats = self.model.final_proj(logits[0])
+        return feats.transpose(1, 2)
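A minimal usage sketch, assuming the default ContentVec checkpoint exists at its pretrain/ path and the input is a mono waveform tensor (the 16 kHz rate is an assumption about what the checkpoint expects, not something this file enforces):

import torch
from vencoder.ContentVec256L9 import ContentVec256L9

enc = ContentVec256L9()    # loads pretrain/checkpoint_best_legacy_500.pt
wav = torch.randn(16000)   # 1 s of noise standing in for 16 kHz mono audio
units = enc.encoder(wav)   # final_proj output, transposed to [1, 256, n_frames]
print(units.shape)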
vencoder/ContentVec768L12.py (new file)
@@ -0,0 +1,30 @@
+from vencoder.encoder import SpeechEncoder
+import torch
+
+class ContentVec768L12(SpeechEncoder):
+    def __init__(self, vec_path="pretrain/checkpoint_best_legacy_500.pt"):
+        print("load model(s) from {}".format(vec_path))
+        from fairseq import checkpoint_utils
+        models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
+            [vec_path],
+            suffix="",
+        )
+        self.hidden_dim = 768
+        self.model = models[0]
+        self.model.eval()
+
+    def encoder(self, wav):
+        feats = wav
+        if feats.dim() == 2:  # double channels
+            feats = feats.mean(-1)
+        assert feats.dim() == 1, feats.dim()
+        feats = feats.view(1, -1)
+        padding_mask = torch.BoolTensor(feats.shape).fill_(False)
+        inputs = {
+            "source": feats.to(wav.device),
+            "padding_mask": padding_mask.to(wav.device),
+            "output_layer": 12,  # layer 12
+        }
+        with torch.no_grad():
+            logits = self.model.extract_features(**inputs)
+        return logits[0].transpose(1, 2)
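The two ContentVec wrappers differ only in where they tap the model: ContentVec768L12 returns the raw 768-dimensional layer-12 features, while ContentVec256L9 additionally runs layer-9 features through the model's final_proj to get 256 dimensions. Presumably the config's ssl_dim must agree with whichever hidden_dim is chosen (768 in the sample config above).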
vencoder/HuberSoft.py (new file)
@@ -0,0 +1,23 @@
+from vencoder.encoder import SpeechEncoder
+import torch
+from vencoder.hubert import hubert_model
+
+class Hubersoft(SpeechEncoder):
+    def __init__(self, vec_path="pretrain/hubert-soft-0d54a1f4.pt", device=None):
+        print("load model(s) from {}".format(vec_path))
+        hubert_soft = hubert_model.hubert_soft(vec_path)  # load from vec_path, not a hard-coded path
+        if device is None:
+            self.dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        else:
+            self.dev = torch.device(device)
+        self.hidden_dim = 256
+        self.model = hubert_soft.to(self.dev)  # __init__ must not return a value
+
+    def encoder(self, wav):
+        feats = wav
+        if feats.dim() == 2:  # double channels
+            feats = feats.mean(-1)
+        assert feats.dim() == 1, feats.dim()
+        feats = feats.view(1, -1)
+        with torch.inference_mode():
+            units = self.model.units(feats)
+        return units.transpose(1, 2)
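Unlike the ContentVec wrappers, Hubersoft resolves its own device (CUDA when available, unless device is given) and moves the model there. hubert-soft emits 256-dimensional units, hence hidden_dim = 256 here, matching ContentVec256L9.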
vencoder/encoder.py (new file)
@@ -0,0 +1,12 @@
+class SpeechEncoder(object):
+    def __init__(self, vec_path="pretrain/checkpoint_best_legacy_500.pt"):
+        self.model = None  # the wrapped model instance, set by subclasses
+        self.hidden_dim = 768
+        pass
+
+    def encoder(self, wav):
+        '''
+        input: wav: [batchsize, signal_length]
+        output: embedding: [batchsize, hidden_dim, wav_frame]
+        '''
+        pass
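SpeechEncoder pins down the contract the three wrappers above implement: __init__ loads a checkpoint and sets hidden_dim; encoder maps a waveform to [batchsize, hidden_dim, wav_frame]. A minimal sketch of adding another encoder under this contract (the class, checkpoint path, and TorchScript loading below are all hypothetical, not part of the commit):

import torch
from vencoder.encoder import SpeechEncoder

class MyEncoder(SpeechEncoder):
    # Hypothetical example subclass for illustration only.
    def __init__(self, vec_path="pretrain/my_model.pt"):  # path is illustrative
        self.hidden_dim = 768
        self.model = torch.jit.load(vec_path)             # any wav -> features callable
        self.model.eval()

    def encoder(self, wav):
        feats = wav.view(1, -1)                # [1, signal_length]
        with torch.no_grad():
            out = self.model(feats)            # assumed shape: [1, n_frames, hidden_dim]
        return out.transpose(1, 2)             # [1, hidden_dim, n_frames], per the contract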
vencoder/hubert/__init__.py (new file, empty)