2023-06-04 13:11:18 +08:00
|
|
|
import torch
|
|
|
|
|
2023-06-26 14:57:53 +08:00
|
|
|
from vencoder.encoder import SpeechEncoder
|
|
|
|
from vencoder.whisper.audio import log_mel_spectrogram, pad_or_trim
|
|
|
|
from vencoder.whisper.model import ModelDimensions, Whisper
|
2023-06-04 13:11:18 +08:00
|
|
|
|
|
|
|
|
2023-06-04 13:11:52 +08:00
|
|
|
class WhisperPPGLarge(SpeechEncoder):
    """Speech encoder that returns Whisper encoder activations as features.

    A Whisper checkpoint is loaded from disk, the model is rebuilt from the
    dimensions stored in that checkpoint, and only its ``encoder`` sub-module
    is used at inference time.
    """

    def __init__(self, vec_path="pretrain/large-v2.pt", device=None):
        """Load the Whisper checkpoint and place the model on the target device.

        Args:
            vec_path: Path to a checkpoint containing the keys ``"dims"``
                (keyword arguments for ``ModelDimensions``) and
                ``"model_state_dict"``.
            device: Anything accepted by ``torch.device``. When ``None``,
                CUDA is used if available, otherwise the CPU.
        """
        super().__init__()
        if device is None:
            self.dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        else:
            self.dev = torch.device(device)

        # Always deserialize onto the CPU. Passing the raw ``device`` argument
        # (possibly None) makes torch.load restore tensors to the device they
        # were saved from, which fails for a CUDA-saved checkpoint on a
        # CPU-only host. The model is moved to ``self.dev`` once, below.
        # NOTE(review): torch.load unpickles the file; only load trusted
        # checkpoints.
        checkpoint = torch.load(vec_path, map_location="cpu")
        dims = ModelDimensions(**checkpoint["dims"])
        model = Whisper(dims)
        model.load_state_dict(checkpoint["model_state_dict"])

        # NOTE(review): this stores the whole ModelDimensions object, not an
        # integer width. Kept as-is because consumers of ``hidden_dim`` are not
        # visible here -- confirm whether a single field was intended.
        self.hidden_dim = dims
        self.model = model.to(self.dev)

    def encoder(self, wav):
        """Encode a waveform into Whisper encoder features.

        Args:
            wav: Audio samples whose first axis is time. Presumably a 1-D
                waveform at the sample rate the Whisper frontend expects --
                TODO confirm against callers.

        Returns:
            A float32 tensor on ``self.dev`` with a leading batch axis of 1
            and the last two axes of the encoder output swapped, truncated to
            at most ``len(wav) // 320`` frames.
        """
        # One output frame is kept per 320 input samples; computed before
        # padding so the padded tail can be discarded afterwards.
        frame_count = wav.shape[0] // 320

        audio = pad_or_trim(wav)
        mel = log_mel_spectrogram(audio).to(self.dev)

        with torch.no_grad():
            # squeeze(0) removes only the batch axis added by unsqueeze(0);
            # a bare squeeze() would also collapse any other size-1 axis.
            ppg = self.model.encoder(mel.unsqueeze(0)).squeeze(0).float()
            # Slice on the device instead of round-tripping through NumPy.
            ppg = ppg[:frame_count]
            return ppg[None, :, :].transpose(1, 2)
|