mirror of https://github.com/svc-develop-team/so-vits-svc.git
synced 2025-01-09 12:37:30 +08:00

Add files via upload

This commit is contained in:
parent 120444b293
commit 5b249743cd
@@ -1,103 +1,62 @@
 {
   "train": {
-    "log_interval": 50,
-    "eval_interval": 1000,
+    "log_interval": 200,
+    "eval_interval": 800,
     "seed": 1234,
-    "port": 8001,
     "epochs": 10000,
-    "learning_rate": 0.0002,
+    "learning_rate": 0.0001,
     "betas": [
       0.8,
       0.99
     ],
     "eps": 1e-09,
     "batch_size": 6,
-    "accumulation_steps": 1,
     "fp16_run": false,
-    "lr_decay": 0.998,
+    "lr_decay": 0.999875,
     "segment_size": 10240,
     "init_lr_ratio": 1,
     "warmup_epochs": 0,
     "c_mel": 45,
-    "keep_ckpts":4
+    "c_kl": 1.0,
+    "use_sr": true,
+    "max_speclen": 512,
+    "port": "8001",
+    "keep_ckpts": 3
   },
   "data": {
-    "data_dir": "dataset",
-    "dataset_type": "SingDataset",
-    "collate_type": "SingCollate",
-    "training_filelist": "filelists/train.txt",
-    "validation_filelist": "filelists/val.txt",
+    "training_files": "filelists/train.txt",
+    "validation_files": "filelists/val.txt",
     "max_wav_value": 32768.0,
     "sampling_rate": 44100,
-    "n_fft": 2048,
-    "fmin": 0,
-    "fmax": 22050,
+    "filter_length": 2048,
     "hop_length": 512,
-    "win_size": 2048,
-    "acoustic_dim": 80,
-    "c_dim": 256,
-    "min_level_db": -115,
-    "ref_level_db": 20,
-    "min_db": -115,
-    "max_abs_value": 4.0,
-    "n_speakers": 200
+    "win_length": 2048,
+    "n_mel_channels": 80,
+    "mel_fmin": 0.0,
+    "mel_fmax": 22050
   },
   "model": {
+    "inter_channels": 192,
     "hidden_channels": 192,
-    "spk_channels": 192,
     "filter_channels": 768,
     "n_heads": 2,
-    "n_layers": 4,
+    "n_layers": 6,
     "kernel_size": 3,
     "p_dropout": 0.1,
-    "prior_hidden_channels": 192,
-    "prior_filter_channels": 768,
-    "prior_n_heads": 2,
-    "prior_n_layers": 4,
-    "prior_kernel_size": 3,
-    "prior_p_dropout": 0.1,
     "resblock": "1",
+    "resblock_kernel_sizes": [3,7,11],
+    "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
+    "upsample_rates": [ 8, 8, 2, 2, 2],
+    "upsample_initial_channel": 512,
+    "upsample_kernel_sizes": [16,16, 4, 4, 4],
+    "n_layers_q": 3,
     "use_spectral_norm": false,
-    "resblock_kernel_sizes": [
-      3,
-      7,
-      11
-    ],
-    "resblock_dilation_sizes": [
-      [
-        1,
-        3,
-        5
-      ],
-      [
-        1,
-        3,
-        5
-      ],
-      [
-        1,
-        3,
-        5
-      ]
-    ],
-    "upsample_rates": [
-      8,
-      8,
-      4,
-      2
-    ],
-    "upsample_initial_channel": 256,
-    "upsample_kernel_sizes": [
-      16,
-      16,
-      8,
-      4
-    ],
-    "n_harmonic": 64,
-    "n_bands": 65
+    "gin_channels": 256,
+    "ssl_dim": 256,
+    "n_speakers": 200
   },
   "spk": {
-    "jishuang": 0,
+    "nyaru": 0,
     "huiyu": 1,
     "nen": 2,
     "paimon": 3,
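For orientation, here is a minimal sketch (not part of the commit) of how the renamed "data"/"train" keys are typically consumed: the spectrogram bin count and the segment length in frames, which the load_model hunk below passes to SynthesizerTrn, fall straight out of filter_length, segment_size and hop_length. The file name and variable names are illustrative assumptions only.

    # Sketch: derive the two generator arguments from a config shaped like the hunk above.
    import json

    with open("config.json") as f:          # assumed to be the project's config file
        hps = json.load(f)

    spec_channels = hps["data"]["filter_length"] // 2 + 1                        # 2048 // 2 + 1 = 1025
    segment_frames = hps["train"]["segment_size"] // hps["data"]["hop_length"]   # 10240 // 512 = 20

    print(spec_channels, segment_frames)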
@@ -127,8 +127,9 @@ class Svc(object):
     def load_model(self):
         # get the model configuration
         self.net_g_ms = SynthesizerTrn(
-            self.hps_ms
-        )
+            self.hps_ms.data.filter_length // 2 + 1,
+            self.hps_ms.train.segment_size // self.hps_ms.data.hop_length,
+            **self.hps_ms.model)
         _ = utils.load_checkpoint(self.net_g_path, self.net_g_ms, None)
         if "half" in self.net_g_path and torch.cuda.is_available():
             _ = self.net_g_ms.half().eval().to(self.dev)
@@ -173,7 +174,7 @@ class Svc(object):
             c = c.half()
         with torch.no_grad():
             start = time.time()
-            audio = self.net_g_ms.infer(c, f0=f0, g=sid, uv=uv, predict_f0=auto_predict_f0, noice_scale=noice_scale)[0][0,0].data.float()
+            audio = self.net_g_ms.infer(c, f0=f0, g=sid, uv=uv, predict_f0=auto_predict_f0, noice_scale=noice_scale)[0,0].data.float()
             use_time = time.time() - start
             print("vits use time:{}".format(use_time))
         return audio, audio.shape[-1]
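The only change in this hunk is the indexing of the infer() output, from [0][0,0] to [0,0], which suggests the generator now returns the audio tensor directly rather than a tuple whose first element is the audio. A hedged, stand-alone illustration (placeholder names, not the project's API):

    # Sketch: handle both return shapes; names here are hypothetical.
    import torch

    def first_audio(out):
        if isinstance(out, tuple):   # old style: (audio, ...) tuple of [batch, 1, T]
            return out[0][0, 0]
        return out[0, 0]             # new style: the [batch, 1, T] audio tensor itself

    audio_tuple = (torch.randn(1, 1, 16000),)
    audio_tensor = torch.randn(1, 1, 16000)
    assert first_audio(audio_tuple).shape == first_audio(audio_tensor).shape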
@@ -5,187 +5,182 @@ import scipy
 import torch
 from torch import nn
 from torch.nn import functional as F
-from torch.autograd import Function
-from typing import Any, Optional, Tuple
 
 from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
 from torch.nn.utils import weight_norm, remove_weight_norm
 
 import modules.commons as commons
 from modules.commons import init_weights, get_padding
-from modules.transforms import piecewise_rational_quadratic_transform
 
 LRELU_SLOPE = 0.1
 
 
 class LayerNorm(nn.Module):
   def __init__(self, channels, eps=1e-5):
     super().__init__()
     self.channels = channels
     self.eps = eps
 
     self.gamma = nn.Parameter(torch.ones(channels))
     self.beta = nn.Parameter(torch.zeros(channels))
 
-  def forward(self, x):
-    x = x.transpose(1, -1)
-    x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
-    return x.transpose(1, -1)
 
+  def forward(self, x):
+    x = x.transpose(1, -1)
+    x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
+    return x.transpose(1, -1)
 
 
 class ConvReluNorm(nn.Module):
   def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, p_dropout):
     super().__init__()
     self.in_channels = in_channels
     self.hidden_channels = hidden_channels
     self.out_channels = out_channels
     self.kernel_size = kernel_size
     self.n_layers = n_layers
     self.p_dropout = p_dropout
     assert n_layers > 1, "Number of layers should be larger than 0."
 
     self.conv_layers = nn.ModuleList()
     self.norm_layers = nn.ModuleList()
-    self.conv_layers.append(nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size // 2))
+    self.conv_layers.append(nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size//2))
     self.norm_layers.append(LayerNorm(hidden_channels))
     self.relu_drop = nn.Sequential(
         nn.ReLU(),
         nn.Dropout(p_dropout))
-    for _ in range(n_layers - 1):
-      self.conv_layers.append(nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size // 2))
+    for _ in range(n_layers-1):
+      self.conv_layers.append(nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size//2))
       self.norm_layers.append(LayerNorm(hidden_channels))
     self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
     self.proj.weight.data.zero_()
     self.proj.bias.data.zero_()
 
   def forward(self, x, x_mask):
     x_org = x
     for i in range(self.n_layers):
       x = self.conv_layers[i](x * x_mask)
       x = self.norm_layers[i](x)
       x = self.relu_drop(x)
     x = x_org + self.proj(x)
     return x * x_mask
 
 
 class DDSConv(nn.Module):
   """
   Dialted and Depth-Separable Convolution
   """
+  def __init__(self, channels, kernel_size, n_layers, p_dropout=0.):
+    super().__init__()
+    self.channels = channels
+    self.kernel_size = kernel_size
+    self.n_layers = n_layers
+    self.p_dropout = p_dropout
 
-  def __init__(self, channels, kernel_size, n_layers, p_dropout=0.):
-    super().__init__()
-    self.channels = channels
-    self.kernel_size = kernel_size
-    self.n_layers = n_layers
-    self.p_dropout = p_dropout
+    self.drop = nn.Dropout(p_dropout)
+    self.convs_sep = nn.ModuleList()
+    self.convs_1x1 = nn.ModuleList()
+    self.norms_1 = nn.ModuleList()
+    self.norms_2 = nn.ModuleList()
+    for i in range(n_layers):
+      dilation = kernel_size ** i
+      padding = (kernel_size * dilation - dilation) // 2
+      self.convs_sep.append(nn.Conv1d(channels, channels, kernel_size,
+          groups=channels, dilation=dilation, padding=padding
+      ))
+      self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
+      self.norms_1.append(LayerNorm(channels))
+      self.norms_2.append(LayerNorm(channels))
 
-    self.drop = nn.Dropout(p_dropout)
-    self.convs_sep = nn.ModuleList()
-    self.convs_1x1 = nn.ModuleList()
-    self.norms_1 = nn.ModuleList()
-    self.norms_2 = nn.ModuleList()
-    for i in range(n_layers):
-      dilation = kernel_size ** i
-      padding = (kernel_size * dilation - dilation) // 2
-      self.convs_sep.append(nn.Conv1d(channels, channels, kernel_size,
-          groups=channels, dilation=dilation, padding=padding
-      ))
-      self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
-      self.norms_1.append(LayerNorm(channels))
-      self.norms_2.append(LayerNorm(channels))
+  def forward(self, x, x_mask, g=None):
+    if g is not None:
+      x = x + g
+    for i in range(self.n_layers):
+      y = self.convs_sep[i](x * x_mask)
+      y = self.norms_1[i](y)
+      y = F.gelu(y)
+      y = self.convs_1x1[i](y)
+      y = self.norms_2[i](y)
+      y = F.gelu(y)
+      y = self.drop(y)
+      x = x + y
+    return x * x_mask
 
-  def forward(self, x, x_mask, g=None):
-    if g is not None:
-      x = x + g
-    for i in range(self.n_layers):
-      y = self.convs_sep[i](x * x_mask)
-      y = self.norms_1[i](y)
-      y = F.gelu(y)
-      y = self.convs_1x1[i](y)
-      y = self.norms_2[i](y)
-      y = F.gelu(y)
-      y = self.drop(y)
-      x = x + y
-    return x * x_mask
 
 
 class WN(torch.nn.Module):
-  def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, n_speakers=0, spk_channels=0,
-               p_dropout=0):
-    super(WN, self).__init__()
-    assert (kernel_size % 2 == 1)
-    self.hidden_channels = hidden_channels
-    self.kernel_size = kernel_size,
-    self.dilation_rate = dilation_rate
-    self.n_layers = n_layers
-    self.n_speakers = n_speakers
-    self.spk_channels = spk_channels
-    self.p_dropout = p_dropout
+  def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0):
+    super(WN, self).__init__()
+    assert(kernel_size % 2 == 1)
+    self.hidden_channels =hidden_channels
+    self.kernel_size = kernel_size,
+    self.dilation_rate = dilation_rate
+    self.n_layers = n_layers
+    self.gin_channels = gin_channels
+    self.p_dropout = p_dropout
 
     self.in_layers = torch.nn.ModuleList()
     self.res_skip_layers = torch.nn.ModuleList()
     self.drop = nn.Dropout(p_dropout)
 
-    if n_speakers > 0:
-      cond_layer = torch.nn.Conv1d(spk_channels, 2 * hidden_channels * n_layers, 1)
+    if gin_channels != 0:
+      cond_layer = torch.nn.Conv1d(gin_channels, 2*hidden_channels*n_layers, 1)
       self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight')
 
     for i in range(n_layers):
       dilation = dilation_rate ** i
       padding = int((kernel_size * dilation - dilation) / 2)
-      in_layer = torch.nn.Conv1d(hidden_channels, 2 * hidden_channels, kernel_size,
+      in_layer = torch.nn.Conv1d(hidden_channels, 2*hidden_channels, kernel_size,
                                  dilation=dilation, padding=padding)
       in_layer = torch.nn.utils.weight_norm(in_layer, name='weight')
       self.in_layers.append(in_layer)
 
       # last one is not necessary
       if i < n_layers - 1:
         res_skip_channels = 2 * hidden_channels
       else:
         res_skip_channels = hidden_channels
 
       res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
       res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight')
       self.res_skip_layers.append(res_skip_layer)
 
   def forward(self, x, x_mask, g=None, **kwargs):
     output = torch.zeros_like(x)
     n_channels_tensor = torch.IntTensor([self.hidden_channels])
 
     if g is not None:
       g = self.cond_layer(g)
 
     for i in range(self.n_layers):
       x_in = self.in_layers[i](x)
       if g is not None:
         cond_offset = i * 2 * self.hidden_channels
-        g_l = g[:, cond_offset:cond_offset + 2 * self.hidden_channels, :]
+        g_l = g[:,cond_offset:cond_offset+2*self.hidden_channels,:]
       else:
         g_l = torch.zeros_like(x_in)
 
       acts = commons.fused_add_tanh_sigmoid_multiply(
           x_in,
           g_l,
           n_channels_tensor)
       acts = self.drop(acts)
 
       res_skip_acts = self.res_skip_layers[i](acts)
       if i < self.n_layers - 1:
-        res_acts = res_skip_acts[:, :self.hidden_channels, :]
+        res_acts = res_skip_acts[:,:self.hidden_channels,:]
         x = (x + res_acts) * x_mask
-        output = output + res_skip_acts[:, self.hidden_channels:, :]
+        output = output + res_skip_acts[:,self.hidden_channels:,:]
       else:
         output = output + res_skip_acts
     return output * x_mask
 
   def remove_weight_norm(self):
-    if self.n_speakers > 0:
+    if self.gin_channels != 0:
       torch.nn.utils.remove_weight_norm(self.cond_layer)
     for l in self.in_layers:
       torch.nn.utils.remove_weight_norm(l)
     for l in self.res_skip_layers:
       torch.nn.utils.remove_weight_norm(l)
 
 
 class ResBlock1(torch.nn.Module):
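The WN block keeps its structure in this hunk and only switches its conditioning arguments from n_speakers/spk_channels to gin_channels. Its core is the gated activation it delegates to commons.fused_add_tanh_sigmoid_multiply; the stand-alone sketch below shows the usual WaveNet-style gate that helper computes in VITS-derived code bases (an assumption about the helper, not a copy of this repository's version).

    # Sketch of the WaveNet gate: add conditioning, split channels, tanh * sigmoid.
    import torch

    def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
        n = n_channels[0]
        in_act = input_a + input_b              # conv output plus per-layer conditioning slice
        t_act = torch.tanh(in_act[:, :n, :])    # first half of channels: "filter"
        s_act = torch.sigmoid(in_act[:, n:, :]) # second half of channels: "gate"
        return t_act * s_act

    x_in = torch.randn(2, 2 * 192, 50)          # 2 * hidden_channels, as produced by in_layers
    g_l = torch.zeros_like(x_in)                # zero conditioning when g is None
    acts = fused_add_tanh_sigmoid_multiply(x_in, g_l, torch.IntTensor([192]))
    print(acts.shape)                           # torch.Size([2, 192, 50])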
@@ -261,193 +256,87 @@ class ResBlock2(torch.nn.Module):
 
 
 class Log(nn.Module):
   def forward(self, x, x_mask, reverse=False, **kwargs):
     if not reverse:
       y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask
       logdet = torch.sum(-y, [1, 2])
       return y, logdet
     else:
       x = torch.exp(x) * x_mask
       return x
 
 
 class Flip(nn.Module):
   def forward(self, x, *args, reverse=False, **kwargs):
     x = torch.flip(x, [1])
     if not reverse:
       logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device)
       return x, logdet
     else:
       return x
 
 
 class ElementwiseAffine(nn.Module):
   def __init__(self, channels):
     super().__init__()
     self.channels = channels
-    self.m = nn.Parameter(torch.zeros(channels, 1))
-    self.logs = nn.Parameter(torch.zeros(channels, 1))
+    self.m = nn.Parameter(torch.zeros(channels,1))
+    self.logs = nn.Parameter(torch.zeros(channels,1))
 
   def forward(self, x, x_mask, reverse=False, **kwargs):
     if not reverse:
       y = self.m + torch.exp(self.logs) * x
       y = y * x_mask
-      logdet = torch.sum(self.logs * x_mask, [1, 2])
+      logdet = torch.sum(self.logs * x_mask, [1,2])
       return y, logdet
     else:
       x = (x - self.m) * torch.exp(-self.logs) * x_mask
       return x
 
 
 class ResidualCouplingLayer(nn.Module):
   def __init__(self,
       channels,
       hidden_channels,
       kernel_size,
       dilation_rate,
       n_layers,
       p_dropout=0,
-      n_speakers=0,
-      spk_channels=0,
-      mean_only=False):
-    assert channels % 2 == 0, "channels should be divisible by 2"
-    super().__init__()
-    self.channels = channels
-    self.hidden_channels = hidden_channels
-    self.kernel_size = kernel_size
-    self.dilation_rate = dilation_rate
-    self.n_layers = n_layers
-    self.half_channels = channels // 2
-    self.mean_only = mean_only
+      gin_channels=0,
+      mean_only=False):
+    assert channels % 2 == 0, "channels should be divisible by 2"
+    super().__init__()
+    self.channels = channels
+    self.hidden_channels = hidden_channels
+    self.kernel_size = kernel_size
+    self.dilation_rate = dilation_rate
+    self.n_layers = n_layers
+    self.half_channels = channels // 2
+    self.mean_only = mean_only
 
     self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
-    self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout, n_speakers=n_speakers,
-                  spk_channels=spk_channels)
-    self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
-    self.post.weight.data.zero_()
-    self.post.bias.data.zero_()
+    self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout, gin_channels=gin_channels)
+    self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
+    self.post.weight.data.zero_()
+    self.post.bias.data.zero_()
 
   def forward(self, x, x_mask, g=None, reverse=False):
-    x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
+    x0, x1 = torch.split(x, [self.half_channels]*2, 1)
     h = self.pre(x0) * x_mask
     h = self.enc(h, x_mask, g=g)
     stats = self.post(h) * x_mask
     if not self.mean_only:
-      m, logs = torch.split(stats, [self.half_channels] * 2, 1)
+      m, logs = torch.split(stats, [self.half_channels]*2, 1)
     else:
       m = stats
       logs = torch.zeros_like(m)
 
     if not reverse:
       x1 = m + x1 * torch.exp(logs) * x_mask
       x = torch.cat([x0, x1], 1)
-      logdet = torch.sum(logs, [1, 2])
+      logdet = torch.sum(logs, [1,2])
       return x, logdet
     else:
       x1 = (x1 - m) * torch.exp(-logs) * x_mask
       x = torch.cat([x0, x1], 1)
       return x
-
-
-class ResidualCouplingBlock(nn.Module):
-  def __init__(self,
-      channels,
-      hidden_channels,
-      kernel_size,
-      dilation_rate,
-      n_layers,
-      n_flows=4,
-      n_speakers=0,
-      gin_channels=0):
-    super().__init__()
-    self.channels = channels
-    self.hidden_channels = hidden_channels
-    self.kernel_size = kernel_size
-    self.dilation_rate = dilation_rate
-    self.n_layers = n_layers
-    self.n_flows = n_flows
-    self.gin_channels = gin_channels
-
-    self.flows = nn.ModuleList()
-    for i in range(n_flows):
-      self.flows.append(ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers,
-          n_speakers=n_speakers, spk_channels=gin_channels, mean_only=True))
-      self.flows.append(Flip())
-
-  def forward(self, x, x_mask, g=None, reverse=False):
-    if not reverse:
-      for flow in self.flows:
-        x, _ = flow(x, x_mask, g=g, reverse=reverse)
-    else:
-      for flow in reversed(self.flows):
-        x = flow(x, x_mask, g=g, reverse=reverse)
-    return x
-
-
-class ConvFlow(nn.Module):
-  def __init__(self, in_channels, filter_channels, kernel_size, n_layers, num_bins=10, tail_bound=5.0):
-    super().__init__()
-    self.in_channels = in_channels
-    self.filter_channels = filter_channels
-    self.kernel_size = kernel_size
-    self.n_layers = n_layers
-    self.num_bins = num_bins
-    self.tail_bound = tail_bound
-    self.half_channels = in_channels // 2
-
-    self.pre = nn.Conv1d(self.half_channels, filter_channels, 1)
-    self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.)
-    self.proj = nn.Conv1d(filter_channels, self.half_channels * (num_bins * 3 - 1), 1)
-    self.proj.weight.data.zero_()
-    self.proj.bias.data.zero_()
-
-  def forward(self, x, x_mask, g=None, reverse=False):
-    x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
-    h = self.pre(x0)
-    h = self.convs(h, x_mask, g=g)
-    h = self.proj(h) * x_mask
-
-    b, c, t = x0.shape
-    h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, cx?, t] -> [b, c, t, ?]
-
-    unnormalized_widths = h[..., :self.num_bins] / math.sqrt(self.filter_channels)
-    unnormalized_heights = h[..., self.num_bins:2 * self.num_bins] / math.sqrt(self.filter_channels)
-    unnormalized_derivatives = h[..., 2 * self.num_bins:]
-
-    x1, logabsdet = piecewise_rational_quadratic_transform(x1,
-        unnormalized_widths,
-        unnormalized_heights,
-        unnormalized_derivatives,
-        inverse=reverse,
-        tails='linear',
-        tail_bound=self.tail_bound
-    )
-
-    x = torch.cat([x0, x1], 1) * x_mask
-    logdet = torch.sum(logabsdet * x_mask, [1, 2])
-    if not reverse:
-      return x, logdet
-    else:
-      return x
-
-
-class ResStack(nn.Module):
-  def __init__(self, channel, kernel_size=3, base=3, nums=4):
-    super(ResStack, self).__init__()
-
-    self.layers = nn.ModuleList([
-      nn.Sequential(
-        nn.LeakyReLU(),
-        nn.utils.weight_norm(nn.Conv1d(channel, channel,
-            kernel_size=kernel_size, dilation=base ** i, padding=base ** i)),
-        nn.LeakyReLU(),
-        nn.utils.weight_norm(nn.Conv1d(channel, channel,
-            kernel_size=kernel_size, dilation=1, padding=1)),
-      )
-      for i in range(nums)
-    ])
-
-  def forward(self, x):
-    for layer in self.layers:
-      x = x + layer(x)
-    return x
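ResidualCouplingLayer, which is kept and now conditioned through gin_channels, implements an affine coupling: the second half of the channels is mapped to m + x1 * exp(logs) with log-determinant sum(logs), and the reverse pass applies the exact inverse. A small numeric sketch with random stand-ins for the WN-predicted statistics:

    # Sketch of the coupling transform and its inverse (illustrative values only).
    import torch

    x1 = torch.randn(1, 96, 40)         # half of a 192-channel latent
    m = torch.randn_like(x1)            # stand-in for the WN/post-conv mean
    logs = torch.zeros_like(x1)         # mean_only=True corresponds to logs = 0
    x_mask = torch.ones(1, 1, 40)

    y = m + x1 * torch.exp(logs) * x_mask            # forward direction
    logdet = torch.sum(logs * x_mask, [1, 2])        # volume change of the transform
    x1_rec = (y - m) * torch.exp(-logs) * x_mask     # reverse direction

    print(torch.allclose(x1, x1_rec), logdet)        # True, tensor([0.])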
File diff suppressed because it is too large.