Add CNHubertLarge speech encoder

This commit is contained in:
ylzz1997 2023-06-02 02:15:42 +08:00
parent a67cef6539
commit 9abe2e7b33
6 changed files with 58 additions and 10 deletions

View File

@ -90,7 +90,11 @@ wget -P pretrain/ http://obs.cstcloud.cn/share/obs/sankagenkeshi/checkpoint_best
- download model at [medium.pt](https://openaipublic.azureedge.net/main/whisper/models/345ae4da62f9b3d59415adc60127b97c714f32e89e936602e85993674d08dcb1/medium.pt)
- Place it under the `pretrain` directory
##### **4. If using OnnxHubert/ContentVec as the encoder**
##### **4. If using cnhubertlarge as the encoder**
- download model at [chinese-hubert-large-fairseq-ckpt.pt](https://huggingface.co/TencentGameMate/chinese-hubert-large/resolve/main/chinese-hubert-large-fairseq-ckpt.pt)
- Place it under the `pretrain` directory
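
To confirm the checkpoint is usable before preprocessing, it can be loaded with the same fairseq call the new `vencoder/CNHubertLarge.py` makes. This is a minimal sketch, assuming fairseq is installed and the file was saved under `pretrain/`:

```python
# Sanity check: load the cnhubertlarge checkpoint with fairseq
# (assumes pretrain/chinese-hubert-large-fairseq-ckpt.pt exists).
from fairseq import checkpoint_utils

models, cfg, task = checkpoint_utils.load_model_ensemble_and_task(
    ["pretrain/chinese-hubert-large-fairseq-ckpt.pt"], suffix=""
)
print(type(models[0]).__name__)  # the loaded HuBERT model class
```
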
##### **5. If using OnnxHubert/ContentVec as the encoder**
- download model at [MoeSS-SUBModel](https://huggingface.co/NaruseMioShirakana/MoeSS-SUBModel/tree/main)
- Place it under the `pretrain` directory
@ -104,6 +108,7 @@ wget -P pretrain/ http://obs.cstcloud.cn/share/obs/sankagenkeshi/checkpoint_best
- "hubertsoft-onnx"
- "hubertsoft"
- "whisper-ppg"
- "cnhubertlarge"
#### **Optional (strongly recommended)**
@ -223,6 +228,7 @@ vec768l12
vec256l9
hubertsoft
whisper-ppg
cnhubertlarge
```
If the `speech_encoder` argument is omitted, it defaults to `vec768l12`.
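
For reference, the width of the SSL embedding written into the generated config follows the encoder choice. A sketch of the mapping (the dictionary name is illustrative; the 256/1024 branches mirror `preprocess_flist_config.py` further down in this diff, and 768 is the template default used for `vec768l12`):

```python
# Illustrative mapping: speech encoder name -> SSL embedding width in the config.
ENCODER_SSL_DIM = {
    "vec768l12": 768,       # template default
    "vec256l9": 256,
    "hubertsoft": 256,
    "whisper-ppg": 1024,
    "cnhubertlarge": 1024,  # added by this commit
}
```
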

View File

@ -89,12 +89,16 @@ wget -P pretrain/ http://obs.cstcloud.cn/share/obs/sankagenkeshi/checkpoint_best
+ Place it under the `pretrain` directory
##### **3. If using Whisper-ppg as the speech encoder**
- download model at [medium.pt](https://openaipublic.azureedge.net/main/whisper/models/345ae4da62f9b3d59415adc60127b97c714f32e89e936602e85993674d08dcb1/medium.pt)
- Place it under the `pretrain` directory
+ Download the model [medium.pt](https://openaipublic.azureedge.net/main/whisper/models/345ae4da62f9b3d59415adc60127b97c714f32e89e936602e85993674d08dcb1/medium.pt)
+ Place it under the `pretrain` directory
##### **4. If using OnnxHubert/ContentVec as the speech encoder**
- download model at [MoeSS-SUBModel](https://huggingface.co/NaruseMioShirakana/MoeSS-SUBModel/tree/main)
- Place it under the `pretrain` directory
##### **4. If using cnhubertlarge as the speech encoder**
+ Download the model [chinese-hubert-large-fairseq-ckpt.pt](https://huggingface.co/TencentGameMate/chinese-hubert-large/resolve/main/chinese-hubert-large-fairseq-ckpt.pt)
+ Place it under the `pretrain` directory
##### **5. If using OnnxHubert/ContentVec as the speech encoder**
+ Download the model [MoeSS-SUBModel](https://huggingface.co/NaruseMioShirakana/MoeSS-SUBModel/tree/main)
+ Place it under the `pretrain` directory
#### **List of encoders**
- "vec768l12"
@ -106,7 +110,8 @@ wget -P pretrain/ http://obs.cstcloud.cn/share/obs/sankagenkeshi/checkpoint_best
- "hubertsoft-onnx"
- "hubertsoft"
- "whisper-ppg"
- "cnhubertlarge"
#### **Optional (strongly recommended)**
+ Pre-trained base model files: `G_0.pth` `D_0.pth`
@ -225,6 +230,7 @@ vec768l12
vec256l9
hubertsoft
whisper-ppg
cnhubertlarge
```
If the `speech_encoder` argument is omitted, it defaults to `vec768l12`.

View File

@ -28,7 +28,7 @@ if __name__ == "__main__":
parser.add_argument("--train_list", type=str, default="./filelists/train.txt", help="path to train list")
parser.add_argument("--val_list", type=str, default="./filelists/val.txt", help="path to val list")
parser.add_argument("--source_dir", type=str, default="./dataset/44k", help="path to source dir")
parser.add_argument("--speech_encoder", type=str, default="vec768l12", help="choice a speech encoder|'vec768l12','vec256l9','hubertsoft','whisper-ppg'")
parser.add_argument("--speech_encoder", type=str, default="vec768l12", help="choice a speech encoder|'vec768l12','vec256l9','hubertsoft','whisper-ppg','cnhubertlarge'")
parser.add_argument("--vol_aug", action="store_true", help="Whether to use volume embedding and volume augmentation")
args = parser.parse_args()
@ -87,7 +87,7 @@ if __name__ == "__main__":
elif args.speech_encoder == "vec256l9" or args.speech_encoder == 'hubertsoft':
config_template["model"]["ssl_dim"] = config_template["model"]["filter_channels"] = config_template["model"]["gin_channels"] = 256
d_config_template["data"]["encoder_out_channels"] = 256
elif args.speech_encoder == "whisper-ppg" :
elif args.speech_encoder == "whisper-ppg" or args.speech_encoder == 'cnhubertlarge':
config_template["model"]["ssl_dim"] = config_template["model"]["filter_channels"] = config_template["model"]["gin_channels"] = 1024
d_config_template["data"]["encoder_out_channels"] = 1024

View File

@ -130,6 +130,9 @@ def get_speech_encoder(speech_encoder,device=None,**kargs):
elif speech_encoder == "whisper-ppg":
from vencoder.WhisperPPG import WhisperPPG
speech_encoder_object = WhisperPPG(device = device)
elif speech_encoder == "cnhubertlarge":
from vencoder.CNHubertLarge import CNHubertLarge
speech_encoder_object = CNHubertLarge(device = device)
else:
raise Exception("Unknown speech encoder")
return speech_encoder_object

vencoder/CNHubertLarge.py (new file, 33 lines)
View File

@ -0,0 +1,33 @@
from vencoder.encoder import SpeechEncoder
import torch
from fairseq import checkpoint_utils


class CNHubertLarge(SpeechEncoder):
    def __init__(self, vec_path="pretrain/chinese-hubert-large-fairseq-ckpt.pt", device=None):
        print("load model(s) from {}".format(vec_path))
        self.hidden_dim = 1024
        # Load the fairseq checkpoint placed under pretrain/
        models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
            [vec_path],
            suffix="",
        )
        if device is None:
            self.dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        else:
            self.dev = torch.device(device)
        self.model = models[0].to(self.dev)
        self.model.eval()

    def encoder(self, wav):
        feats = wav
        if feats.dim() == 2:  # double channels
            feats = feats.mean(-1)
        assert feats.dim() == 1, feats.dim()
        feats = feats.view(1, -1)
        padding_mask = torch.BoolTensor(feats.shape).fill_(False)
        inputs = {
            "source": feats.to(wav.device),
            "padding_mask": padding_mask.to(wav.device)
        }
        with torch.no_grad():
            # extract_features returns (features, padding_mask)
            logits = self.model.extract_features(**inputs)
        # [batch, frames, hidden_dim] -> [batch, hidden_dim, frames]
        return logits[0].transpose(1, 2)
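
A minimal usage sketch of the new encoder, assuming the checkpoint sits in `pretrain/`, fairseq is installed, and `get_speech_encoder` is importable from this repo's utils module (import path assumed):

```python
# Encode one second of 16 kHz audio with the new cnhubertlarge encoder.
import torch

from utils import get_speech_encoder  # dispatcher shown earlier in this diff

encoder = get_speech_encoder("cnhubertlarge", device="cpu")
wav = torch.zeros(16000)                      # [samples], mono, 16 kHz
features = encoder.encoder(wav.to(encoder.dev))
print(features.shape)                         # expected: [1, 1024, frames]
```

The frame count scales with the input length, since HuBERT's convolutional front end downsamples the waveform before the transformer layers.
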

View File

@ -7,6 +7,6 @@ class SpeechEncoder(object):
def encoder(self,wav):
'''
input: wav:[batchsize,signal_length]
output: embedding:[batchsize,wav_frame,hidden_dim]
output: embedding:[batchsize,hidden_dim,wav_frame]
'''
pass
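
For comparison with the contract above, a hedged skeleton of a `SpeechEncoder` subclass (the class name and hop size are illustrative, not part of the repo):

```python
# Illustrative subclass: maps [batchsize, signal_length] audio
# to [batchsize, hidden_dim, frames], as documented above.
import torch

from vencoder.encoder import SpeechEncoder


class DummyEncoder(SpeechEncoder):
    def __init__(self, device=None):
        self.hidden_dim = 1024
        self.dev = torch.device(device or "cpu")

    def encoder(self, wav):
        batch = wav.shape[0] if wav.dim() == 2 else 1
        frames = wav.shape[-1] // 320  # illustrative hop size
        return torch.zeros(batch, self.hidden_dim, frames, device=self.dev)
```
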