Add chihaya_jinja_sample

Web Edition improvement (16k test)

bugfix:
- merge slot
- server mode append error
w-okada 2023-11-29 00:30:52 +09:00
parent b24c781a72
commit 17597fdaab
10 changed files with 81 additions and 81 deletions

.gitignore (vendored, 3 lines changed)
View File

@ -58,6 +58,9 @@ server/samples_0003_o.json
server/samples_0003_t2.json
server/samples_0003_o2.json
server/samples_0003_d2.json
server/samples_0004_t.json
server/samples_0004_o.json
server/samples_0004_d.json
server/test_official_v1_v2.json
server/test_ddpn_v1_v2.json

View File

@ -45,6 +45,7 @@ export type WebInfoStateAndMethod = WebInfoState & {
const ModelSampleRateStr = {
"40k": "40k",
"32k": "32k",
"16k": "16k",
} as const;
type ModelSampleRateStr = (typeof ModelSampleRateStr)[keyof typeof ModelSampleRateStr];
@ -71,18 +72,22 @@ const noF0ModelUrl: { [modelType in VoiceChangerType]: { [inputLength in InputLe
"24000": {
"40k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_40k_nof0_24000.bin",
"32k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_32k_nof0_24000.bin",
"16k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_16k_nof0_24000.bin",
},
"16000": {
"40k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_40k_nof0_16000.bin",
"32k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_32k_nof0_16000.bin",
"16k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_16k_nof0_16000.bin",
},
"12000": {
"40k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_40k_nof0_12000.bin",
"32k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_32k_nof0_12000.bin",
"16k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_16k_nof0_12000.bin",
},
"8000": {
"40k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_40k_nof0_8000.bin",
"32k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_32k_nof0_8000.bin",
"16k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_16k_nof0_8000.bin",
},
},
};
@ -109,18 +114,22 @@ const f0ModelUrl: { [modelType in VoiceChangerType]: { [inputLength in InputLeng
"24000": {
"40k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_40k_f0_24000.bin",
"32k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_32k_f0_24000.bin",
"16k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_16k_f0_24000.bin",
},
"16000": {
"40k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_40k_f0_16000.bin",
"32k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_32k_f0_16000.bin",
"16k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_16k_f0_16000.bin",
},
"12000": {
"40k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_40k_f0_12000.bin",
"32k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_32k_f0_12000.bin",
"16k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_16k_f0_12000.bin",
},
"8000": {
"40k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_40k_f0_8000.bin",
"32k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_32k_f0_8000.bin",
"16k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_16k_f0_8000.bin",
},
},
};

View File

@ -50,7 +50,7 @@ export const Portrait = (_props: PortraitProps) => {
}
vol.innerText = volume.toFixed(4);
if (webEdition) {
buf.innerText = webInfoState.responseTimeInfo.realDuration.toString() ?? "0";
buf.innerText = bufferingTime.toString();
res.innerText = webInfoState.responseTimeInfo.responseTime.toString() ?? "0";
rtf.innerText = webInfoState.responseTimeInfo.rtf.toString() ?? "0";
} else {

View File

@ -55,6 +55,7 @@ export const WebEditionSettingArea = (_props: WebEditionSettingAreaProps) => {
</div>
);
const sr16KClassName = "character-area-control-button" + (webInfoState.voiceChangerConfig.sampleRate == "16k" ? " character-area-control-button-active" : " character-area-control-button-stanby");
const sr32KClassName = "character-area-control-button" + (webInfoState.voiceChangerConfig.sampleRate == "32k" ? " character-area-control-button-active" : " character-area-control-button-stanby");
const sr40KClassName = "character-area-control-button" + (webInfoState.voiceChangerConfig.sampleRate == "40k" ? " character-area-control-button-active" : " character-area-control-button-stanby");
const sampleRate = (
@ -64,6 +65,15 @@ export const WebEditionSettingArea = (_props: WebEditionSettingAreaProps) => {
<div className="character-area-slider-control">
<span className="character-area-slider-control-kind"></span>
<span className="character-area-control-buttons">
<span
className={!readyForConfig ? "character-area-control-button-disable" : sr16KClassName}
onClick={() => {
if (webInfoState.voiceChangerConfig.sampleRate == "16k" || !readyForConfig) return;
webInfoState.setVoiceChangerConfig("rvcv2", "16k", webInfoState.voiceChangerConfig.useF0, webInfoState.voiceChangerConfig.inputLength);
}}
>
16k
</span>
<span
className={!readyForConfig ? "character-area-control-button-disable" : sr32KClassName}
onClick={() => {

View File

@ -98,11 +98,9 @@ RVCSampleMode: TypeAlias = Literal[
def getSampleJsonAndModelIds(mode: RVCSampleMode):
if mode == "production":
return [
# "https://huggingface.co/wok000/vcclient_model/raw/main/samples_0001.json",
# "https://huggingface.co/wok000/vcclient_model/raw/main/samples_0002.json",
"https://huggingface.co/wok000/vcclient_model/raw/main/samples_0003_t2.json",
"https://huggingface.co/wok000/vcclient_model/raw/main/samples_0003_o2.json",
"https://huggingface.co/wok000/vcclient_model/raw/main/samples_0003_d2.json",
"https://huggingface.co/wok000/vcclient_model/raw/main/samples_0004_t.json",
"https://huggingface.co/wok000/vcclient_model/raw/main/samples_0004_o.json",
"https://huggingface.co/wok000/vcclient_model/raw/main/samples_0004_d.json",
], [
("Tsukuyomi-chan_o", {"useIndex": False}),
("Amitaro_o", {"useIndex": False}),

View File

@ -7,7 +7,7 @@ from voice_changer.DiffusionSVC.inferencer.diffusion_svc_model.diffusion.vocoder
from voice_changer.DiffusionSVC.inferencer.onnx.VocoderOnnx import VocoderOnnx
from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager
from voice_changer.utils.Timer import Timer
from voice_changer.utils.Timer import Timer2
class DiffusionSVCInferencer(Inferencer):
@ -49,18 +49,14 @@ class DiffusionSVCInferencer(Inferencer):
return model_block_size, model_sampling_rate
@torch.no_grad()  # most basic inference entry point: inputs are normalized to tensors; works only with mel spectrograms
def __call__(self, units, f0, volume, spk_id=1, spk_mix_dict=None, aug_shift=0,
gt_spec=None, infer_speedup=10, method='dpm-solver', k_step=None, use_tqdm=True,
spk_emb=None):
def __call__(self, units, f0, volume, spk_id=1, spk_mix_dict=None, aug_shift=0, gt_spec=None, infer_speedup=10, method="dpm-solver", k_step=None, use_tqdm=True, spk_emb=None):
if self.diff_args.model.k_step_max is not None:
if k_step is None:
raise ValueError("k_step must not None when Shallow Diffusion Model inferring")
if k_step > int(self.diff_args.model.k_step_max):
raise ValueError("k_step must <= k_step_max of Shallow Diffusion Model")
if gt_spec is None:
raise ValueError("gt_spec must not None when Shallow Diffusion Model inferring, gt_spec can from "
"input mel or output of naive model")
raise ValueError("gt_spec must not None when Shallow Diffusion Model inferring, gt_spec can from " "input mel or output of naive model")
aug_shift = torch.from_numpy(np.array([[float(aug_shift)]])).float().to(self.dev)
@ -75,8 +71,7 @@ class DiffusionSVCInferencer(Inferencer):
return self.diff_model(units, f0, volume, spk_id=spk_id, spk_mix_dict=spk_mix_dict, aug_shift=aug_shift, gt_spec=gt_spec, infer=True, infer_speedup=infer_speedup, method=method, k_step=k_step, use_tqdm=use_tqdm, spk_emb=spk_emb, spk_emb_dict=spk_emb_dict)
@torch.no_grad()
def naive_model_call(self, units, f0, volume, spk_id=1, spk_mix_dict=None,
aug_shift=0, spk_emb=None):
def naive_model_call(self, units, f0, volume, spk_id=1, spk_mix_dict=None, aug_shift=0, spk_emb=None):
# spk_id
spk_emb_dict = None
if self.diff_args.model.use_speaker_encoder: # with speaker encoder
@ -85,9 +80,7 @@ class DiffusionSVCInferencer(Inferencer):
else:
spk_id = torch.LongTensor(np.array([[int(spk_id)]])).to(self.dev)
aug_shift = torch.from_numpy(np.array([[float(aug_shift)]])).float().to(self.dev)
out_spec = self.naive_model(units, f0, volume, spk_id=spk_id, spk_mix_dict=spk_mix_dict,
aug_shift=aug_shift, infer=True,
spk_emb=spk_emb, spk_emb_dict=spk_emb_dict)
out_spec = self.naive_model(units, f0, volume, spk_id=spk_id, spk_mix_dict=spk_mix_dict, aug_shift=aug_shift, infer=True, spk_emb=spk_emb, spk_emb_dict=spk_emb_dict)
return out_spec
@torch.no_grad()
@ -114,19 +107,18 @@ class DiffusionSVCInferencer(Inferencer):
silence_front: float,
skip_diffusion: bool = True,
) -> torch.Tensor:
with Timer("pre-process", False) as t:
with Timer2("pre-process", False) as t:
gt_spec = self.naive_model_call(feats, pitch, volume, spk_id=sid, spk_mix_dict=None, aug_shift=0, spk_emb=None)
# print("[ ----Timer::1: ]", t.secs)
with Timer("pre-process", False) as t:
with Timer2("pre-process", False) as t:
if skip_diffusion == 0:
out_mel = self.__call__(feats, pitch, volume, spk_id=sid, spk_mix_dict=None, aug_shift=0, gt_spec=gt_spec, infer_speedup=infer_speedup, method='dpm-solver', k_step=k_step, use_tqdm=False, spk_emb=None)
out_mel = self.__call__(feats, pitch, volume, spk_id=sid, spk_mix_dict=None, aug_shift=0, gt_spec=gt_spec, infer_speedup=infer_speedup, method="dpm-solver", k_step=k_step, use_tqdm=False, spk_emb=None)
gt_spec = out_mel
# print("[ ----Timer::2: ]", t.secs)
with Timer("pre-process", False) as t: # NOQA
with Timer2("pre-process", False) as t: # NOQA
if self.vocoder_onnx is None:
start_frame = int(silence_front * self.vocoder.vocoder_sample_rate / self.vocoder.vocoder_hop_size)
out_wav = self.mel2wav(gt_spec, pitch, start_frame=start_frame)

View File

@ -17,7 +17,7 @@ from voice_changer.RVC.embedder.Embedder import Embedder
from voice_changer.common.VolumeExtractor import VolumeExtractor
from torchaudio.transforms import Resample
from voice_changer.utils.Timer import Timer
from voice_changer.utils.Timer import Timer2
logger = VoiceChangaerLogger.get_instance().getLogger()
@ -45,7 +45,7 @@ class Pipeline(object):
device,
isHalf,
resamplerIn: Resample,
resamplerOut: Resample
resamplerOut: Resample,
):
self.inferencer = inferencer
inferencer_block_size, inferencer_sampling_rate = inferencer.getConfig()
@ -64,7 +64,7 @@ class Pipeline(object):
logger.info("GENERATE INFERENCER" + str(self.inferencer))
logger.info("GENERATE EMBEDDER" + str(self.embedder))
logger.info("GENERATE PITCH EXTRACTOR" + str(self.pitchExtractor))
self.targetSR = targetSR
self.device = device
self.isHalf = False
@ -103,7 +103,7 @@ class Pipeline(object):
skip_diffusion=True,
):
# print("---------- pipe line --------------------")
with Timer("pre-process", False) as t:
with Timer2("pre-process", False) as t:
audio_t = torch.from_numpy(audio).float().unsqueeze(0).to(self.device)
audio16k = self.resamplerIn(audio_t)
volume, mask = self.extract_volume_and_mask(audio16k, threshold=-60.0)
@ -111,7 +111,7 @@ class Pipeline(object):
n_frames = int(audio16k.size(-1) // self.hop_size + 1)
# print("[Timer::1: ]", t.secs)
with Timer("pre-process", False) as t:
with Timer2("pre-process", False) as t:
# pitch detection
try:
# pitch = self.pitchExtractor.extract(
@ -141,8 +141,7 @@ class Pipeline(object):
feats = feats.view(1, -1)
# print("[Timer::2: ]", t.secs)
with Timer("pre-process", False) as t:
with Timer2("pre-process", False) as t:
# embedding
with autocast(enabled=self.isHalf):
try:
@ -156,28 +155,17 @@ class Pipeline(object):
raise DeviceChangingException()
else:
raise e
feats = F.interpolate(feats.permute(0, 2, 1), size=int(n_frames), mode='nearest').permute(0, 2, 1)
feats = F.interpolate(feats.permute(0, 2, 1), size=int(n_frames), mode="nearest").permute(0, 2, 1)
# print("[Timer::3: ]", t.secs)
with Timer("pre-process", False) as t:
with Timer2("pre-process", False) as t:
# run inference
try:
with torch.no_grad():
with autocast(enabled=self.isHalf):
audio1 = (
torch.clip(
self.inferencer.infer(
audio16k,
feats,
pitch.unsqueeze(-1),
volume,
mask,
sid,
k_step,
infer_speedup,
silence_front=silence_front,
skip_diffusion=skip_diffusion
).to(dtype=torch.float32),
self.inferencer.infer(audio16k, feats, pitch.unsqueeze(-1), volume, mask, sid, k_step, infer_speedup, silence_front=silence_front, skip_diffusion=skip_diffusion).to(dtype=torch.float32),
-1.0,
1.0,
)
@ -191,7 +179,7 @@ class Pipeline(object):
raise e
# print("[Timer::4: ]", t.secs)
with Timer("pre-process", False) as t: # NOQA
with Timer2("pre-process", False) as t: # NOQA
feats_buffer = feats.squeeze(0).detach().cpu()
if pitch is not None:
pitch_buffer = pitch.squeeze(0).detach().cpu()

View File

@ -9,7 +9,7 @@ from mods.log_control import VoiceChangaerLogger
from voice_changer.Local.AudioDeviceList import checkSamplingRate, list_audio_device
import time
import sounddevice as sd
from voice_changer.utils.Timer import Timer
from voice_changer.utils.Timer import Timer2
import librosa
from voice_changer.utils.VoiceChangerModel import AudioInOut
@ -139,7 +139,7 @@ class ServerDevice:
return out_wav, times
def _processDataWithTime(self, indata: np.ndarray):
with Timer("all_inference_time") as t:
with Timer2("all_inference_time", False) as t:
out_wav, times = self._processData(indata)
all_inference_time = t.secs
self.performance = [all_inference_time] + times

View File

@ -364,7 +364,7 @@ class VoiceChangerManager(ServerDeviceCallbacks):
req = json.loads(request)
req = ModelMergerRequest(**req)
req.files = [MergeElement(**f) for f in req.files]
slot = len(self.modelSlotManager.getAllSlotInfo()) - 1
slot = len(self.modelSlotManager.getAllSlotInfo()) - 2  # Beatrice-JVS was added, so -1 -> -2
if req.voiceChangerType == "RVC":
merged = RVCModelMerger.merge_models(self.params, req, slot)
loadParam = LoadModelParams(voiceChangerType="RVC", slot=slot, isSampleMode=False, sampleId="", files=[LoadModelParamFile(name=os.path.basename(merged), kind="rvcModel", dir="")], params={})
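The slot index change above frees the last slot for the bundled Beatrice-JVS model, so merged models now land in the slot just before it. A small sketch of that indexing; the slot count in the example is a hypothetical value:

def merge_target_slot(all_slot_info):
    # Assumption inferred from the diff above: the final slot is now occupied by
    # Beatrice-JVS, so the merge output is written one slot earlier than before
    # (len(...) - 2 instead of len(...) - 1).
    total = len(all_slot_info)
    reserved_for_beatrice_jvs = 1
    return total - 1 - reserved_for_beatrice_jvs  # == total - 2

# Example: with 201 slots (indices 0..200), the merged model goes to slot 199
# and slot 200 stays reserved for Beatrice-JVS.
assert merge_target_slot(list(range(201))) == 199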

View File

@ -3,45 +3,45 @@ import inspect
from typing import Dict, List
class Timer(object):
storedSecs: Dict[str, Dict[str, List[float]]] = {} # Class variable
# class Timer(object):
# storedSecs: Dict[str, Dict[str, List[float]]] = {} # Class variable
def __init__(self, title: str, enalbe: bool = True):
self.title = title
self.enable = enalbe
self.secs = 0
self.msecs = 0
self.avrSecs = 0
# def __init__(self, title: str, enalbe: bool = True):
# self.title = title
# self.enable = enalbe
# self.secs = 0
# self.msecs = 0
# self.avrSecs = 0
if self.enable is False:
return
# if self.enable is False:
# return
self.maxStores = 10
# self.maxStores = 10
current_frame = inspect.currentframe()
caller_frame = inspect.getouterframes(current_frame, 2)
frame = caller_frame[1]
filename = frame.filename
line_number = frame.lineno
self.key = f"{title}_{filename}_{line_number}"
if self.key not in self.storedSecs:
self.storedSecs[self.key] = {}
# current_frame = inspect.currentframe()
# caller_frame = inspect.getouterframes(current_frame, 2)
# frame = caller_frame[1]
# filename = frame.filename
# line_number = frame.lineno
# self.key = f"{title}_{filename}_{line_number}"
# if self.key not in self.storedSecs:
# self.storedSecs[self.key] = {}
def __enter__(self):
if self.enable is False:
return
self.start = time.time()
return self
# def __enter__(self):
# if self.enable is False:
# return
# self.start = time.time()
# return self
def __exit__(self, *_):
if self.enable is False:
return
self.end = time.time()
self.secs = self.end - self.start
self.msecs = self.secs * 1000 # millisecs
self.storedSecs[self.key].append(self.secs)
self.storedSecs[self.key] = self.storedSecs[self.key][-self.maxStores :]
self.avrSecs = sum(self.storedSecs[self.key]) / len(self.storedSecs[self.key])
# def __exit__(self, *_):
# if self.enable is False:
# return
# self.end = time.time()
# self.secs = self.end - self.start
# self.msecs = self.secs * 1000 # millisecs
# self.storedSecs[self.key].append(self.secs)
# self.storedSecs[self.key] = self.storedSecs[self.key][-self.maxStores :]
# self.avrSecs = sum(self.storedSecs[self.key]) / len(self.storedSecs[self.key])
class Timer2(object):
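The diff view cuts off at the Timer2 declaration. For reference only, a minimal sketch of what a Timer2-style context manager could look like, inferred from the call sites above (a title plus an enable flag, elapsed time exposed as .secs); this is an assumption, not the repository's actual implementation:

import time

class Timer2Sketch:
    # Hypothetical stand-in for voice_changer.utils.Timer.Timer2. Call sites in this
    # diff do `with Timer2("pre-process", False) as t:` and then read `t.secs`, so the
    # sketch always measures wall-clock time; the enable flag is assumed to gate only
    # the optional per-key history that the old (now commented-out) Timer kept.
    def __init__(self, title: str, enable: bool = True):
        self.title = title
        self.enable = enable
        self.secs = 0.0
        self.msecs = 0.0

    def __enter__(self):
        self.start = time.time()
        return self

    def __exit__(self, *_):
        self.secs = time.time() - self.start
        self.msecs = self.secs * 1000  # milliseconds

# Usage mirroring ServerDevice._processDataWithTime:
#   with Timer2Sketch("all_inference_time", False) as t:
#       out_wav, times = process(indata)
#   performance = [t.secs] + times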