mirror of
https://github.com/svc-develop-team/so-vits-svc.git
synced 2025-01-09 04:27:31 +08:00
Update TTS in webUI
This commit is contained in:
parent
8cf44b0a56
commit
2b5fb886b1
47
edgetts/tts.py
Normal file
47
edgetts/tts.py
Normal file
@ -0,0 +1,47 @@
|
|||||||
|
"""Standalone edge-tts helper, invoked by webUI.py as a subprocess.

Usage: python tts.py <text> <lang-or-"Auto"> <rate> <volume> [gender]
Synthesizes <text> to speech and writes it to tts.wav in the CWD.
"""
import asyncio
import random
import sys

import edge_tts
from edge_tts import VoicesManager
from langdetect import DetectorFactory, detect

# Make langdetect deterministic across runs.
DetectorFactory.seed = 0

TEXT = sys.argv[1]
# "Auto" -> detect the language from the text itself.
LANG = detect(TEXT) if sys.argv[2] == "Auto" else sys.argv[2]
if LANG == "zh-cn" or LANG == "zh-tw":
    # langdetect reports lowercase codes; edge-tts locales look like "zh-CN".
    LOCALE = LANG[:-2] + LANG[-2:].upper()
RATE = sys.argv[3]    # signed percent string, e.g. "+10%"
VOLUME = sys.argv[4]  # signed percent string, e.g. "-5%"
# A gender is only passed when the caller wants a random voice of that gender;
# otherwise LANG already carries a full voice name.
GENDER = sys.argv[5] if len(sys.argv) == 6 else None
OUTPUT_FILE = "tts.wav"

print("Running TTS...")
print(f"Text: {TEXT}, Language: {LANG}, Gender: {GENDER}, Rate: {RATE}, Volume: {VOLUME}")

async def _main() -> None:
    """Pick a voice and synthesize TEXT into OUTPUT_FILE."""
    voices = await VoicesManager.create()
    if GENDER is not None:  # idiomatic form of `not GENDER is None`
        # Chinese variants are matched by Locale; everything else by Language.
        if LANG == "zh-cn" or LANG == "zh-tw":
            voice = voices.find(Gender=GENDER, Locale=LOCALE)
        else:
            voice = voices.find(Gender=GENDER, Language=LANG)
        VOICE = random.choice(voice)["Name"]
        print(f"Using random {LANG} voice: {VOICE}")
    else:
        # No gender given: the caller passed a concrete voice name in LANG.
        VOICE = LANG
    communicate = edge_tts.Communicate(text = TEXT, voice = VOICE, rate = RATE, volume = VOLUME)
    await communicate.save(OUTPUT_FILE)

if __name__ == "__main__":
    if sys.platform.startswith("win"):
        # NOTE(review): selector policy presumably required by edge-tts/aiohttp
        # on Windows — confirm before removing.
        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
        asyncio.run(_main())
    else:
        loop = asyncio.get_event_loop_policy().get_event_loop()
        try:
            loop.run_until_complete(_main())
        finally:
            loop.close()
|
306
edgetts/tts_voices.py
Normal file
306
edgetts/tts_voices.py
Normal file
@ -0,0 +1,306 @@
|
|||||||
|
# List of supported voices for edge_TTS.
#
# Every edge-tts voice name follows the pattern "<locale>-<speaker>Neural",
# so the flat SUPPORTED_VOICES mapping is expanded from this compact
# locale -> speakers table (insertion order is preserved).
_VOICES_BY_LOCALE = {
    'zh-CN': ('Xiaoxiao', 'Xiaoyi', 'Yunjian', 'Yunxi', 'Yunxia', 'Yunyang'),
    'zh-HK': ('HiuGaai', 'HiuMaan', 'WanLung'),
    'zh-TW': ('HsiaoChen', 'YunJhe', 'HsiaoYu'),
    'af-ZA': ('Adri', 'Willem'),
    'am-ET': ('Ameha', 'Mekdes'),
    'ar-AE': ('Fatima', 'Hamdan'),
    'ar-BH': ('Ali', 'Laila'),
    'ar-DZ': ('Amina', 'Ismael'),
    'ar-EG': ('Salma', 'Shakir'),
    'ar-IQ': ('Bassel', 'Rana'),
    'ar-JO': ('Sana', 'Taim'),
    'ar-KW': ('Fahed', 'Noura'),
    'ar-LB': ('Layla', 'Rami'),
    'ar-LY': ('Iman', 'Omar'),
    'ar-MA': ('Jamal', 'Mouna'),
    'ar-OM': ('Abdullah', 'Aysha'),
    'ar-QA': ('Amal', 'Moaz'),
    'ar-SA': ('Hamed', 'Zariyah'),
    'ar-SY': ('Amany', 'Laith'),
    'ar-TN': ('Hedi', 'Reem'),
    'ar-YE': ('Maryam', 'Saleh'),
    'az-AZ': ('Babek', 'Banu'),
    'bg-BG': ('Borislav', 'Kalina'),
    'bn-BD': ('Nabanita', 'Pradeep'),
    'bn-IN': ('Bashkar', 'Tanishaa'),
    'bs-BA': ('Goran', 'Vesna'),
    'ca-ES': ('Enric', 'Joana'),
    'cs-CZ': ('Antonin', 'Vlasta'),
    'cy-GB': ('Aled', 'Nia'),
    'da-DK': ('Christel', 'Jeppe'),
    'de-AT': ('Ingrid', 'Jonas'),
    'de-CH': ('Jan', 'Leni'),
    'de-DE': ('Amala', 'Conrad', 'Katja', 'Killian'),
    'el-GR': ('Athina', 'Nestoras'),
    'en-AU': ('Natasha', 'William'),
    'en-CA': ('Clara', 'Liam'),
    'en-GB': ('Libby', 'Maisie', 'Ryan', 'Sonia', 'Thomas'),
    'en-HK': ('Sam', 'Yan'),
    'en-IE': ('Connor', 'Emily'),
    'en-IN': ('Neerja', 'Prabhat'),
    'en-KE': ('Asilia', 'Chilemba'),
    'en-NG': ('Abeo', 'Ezinne'),
    'en-NZ': ('Mitchell', 'Molly'),
    'en-PH': ('James', 'Rosa'),
    'en-SG': ('Luna', 'Wayne'),
    'en-TZ': ('Elimu', 'Imani'),
    'en-US': ('Ana', 'Aria', 'Christopher', 'Eric', 'Guy', 'Jenny', 'Michelle'),
    'en-ZA': ('Leah', 'Luke'),
    'es-AR': ('Elena', 'Tomas'),
    'es-BO': ('Marcelo', 'Sofia'),
    'es-CL': ('Catalina', 'Lorenzo'),
    'es-CO': ('Gonzalo', 'Salome'),
    'es-CR': ('Juan', 'Maria'),
    'es-CU': ('Belkys', 'Manuel'),
    'es-DO': ('Emilio', 'Ramona'),
    'es-EC': ('Andrea', 'Luis'),
    'es-ES': ('Alvaro', 'Elvira', 'ManuelEsCU'),
    'es-GQ': ('Javier', 'Teresa'),
    'es-GT': ('Andres', 'Marta'),
    'es-HN': ('Carlos', 'Karla'),
    'es-MX': ('Dalia', 'Jorge', 'LorenzoEsCL'),
    'es-NI': ('Federico', 'Yolanda'),
    'es-PA': ('Margarita', 'Roberto'),
    'es-PE': ('Alex', 'Camila'),
    'es-PR': ('Karina', 'Victor'),
    'es-PY': ('Mario', 'Tania'),
    'es-SV': ('Lorena', 'Rodrigo'),
    'es-US': ('Alonso', 'Paloma'),
    'es-UY': ('Mateo', 'Valentina'),
    'es-VE': ('Paola', 'Sebastian'),
    'et-EE': ('Anu', 'Kert'),
    'fa-IR': ('Dilara', 'Farid'),
    'fi-FI': ('Harri', 'Noora'),
    'fil-PH': ('Angelo', 'Blessica'),
    'fr-BE': ('Charline', 'Gerard'),
    'fr-CA': ('Antoine', 'Jean', 'Sylvie'),
    'fr-CH': ('Ariane', 'Fabrice'),
    'fr-FR': ('Denise', 'Eloise', 'Henri'),
    'ga-IE': ('Colm', 'Orla'),
    'gl-ES': ('Roi', 'Sabela'),
    'gu-IN': ('Dhwani', 'Niranjan'),
    'he-IL': ('Avri', 'Hila'),
    'hi-IN': ('Madhur', 'Swara'),
    'hr-HR': ('Gabrijela', 'Srecko'),
    'hu-HU': ('Noemi', 'Tamas'),
    'id-ID': ('Ardi', 'Gadis'),
    'is-IS': ('Gudrun', 'Gunnar'),
    'it-IT': ('Diego', 'Elsa', 'Isabella'),
    'ja-JP': ('Keita', 'Nanami'),
    'jv-ID': ('Dimas', 'Siti'),
    'ka-GE': ('Eka', 'Giorgi'),
    'kk-KZ': ('Aigul', 'Daulet'),
    'km-KH': ('Piseth', 'Sreymom'),
    'kn-IN': ('Gagan', 'Sapna'),
    'ko-KR': ('InJoon', 'SunHi'),
    'lo-LA': ('Chanthavong', 'Keomany'),
    'lt-LT': ('Leonas', 'Ona'),
    'lv-LV': ('Everita', 'Nils'),
    'mk-MK': ('Aleksandar', 'Marija'),
    'ml-IN': ('Midhun', 'Sobhana'),
    'mn-MN': ('Bataa', 'Yesui'),
    'mr-IN': ('Aarohi', 'Manohar'),
    'ms-MY': ('Osman', 'Yasmin'),
    'mt-MT': ('Grace', 'Joseph'),
    'my-MM': ('Nilar', 'Thiha'),
    'nb-NO': ('Finn', 'Pernille'),
    'ne-NP': ('Hemkala', 'Sagar'),
    'nl-BE': ('Arnaud', 'Dena'),
    'nl-NL': ('Colette', 'Fenna', 'Maarten'),
    'pl-PL': ('Marek', 'Zofia'),
    'ps-AF': ('GulNawaz', 'Latifa'),
    'pt-BR': ('Antonio', 'Francisca'),
    'pt-PT': ('Duarte', 'Raquel'),
    'ro-RO': ('Alina', 'Emil'),
    'ru-RU': ('Dmitry', 'Svetlana'),
    'si-LK': ('Sameera', 'Thilini'),
    'sk-SK': ('Lukas', 'Viktoria'),
    'sl-SI': ('Petra', 'Rok'),
    'so-SO': ('Muuse', 'Ubax'),
    'sq-AL': ('Anila', 'Ilir'),
    'sr-RS': ('Nicholas', 'Sophie'),
    'su-ID': ('Jajang', 'Tuti'),
    'sv-SE': ('Mattias', 'Sofie'),
    'sw-KE': ('Rafiki', 'Zuri'),
    'sw-TZ': ('Daudi', 'Rehema'),
    'ta-IN': ('Pallavi', 'Valluvar'),
    'ta-LK': ('Kumar', 'Saranya'),
    'ta-MY': ('Kani', 'Surya'),
    'ta-SG': ('Anbu', 'Venba'),
    'te-IN': ('Mohan', 'Shruti'),
    'th-TH': ('Niwat', 'Premwadee'),
    'tr-TR': ('Ahmet', 'Emel'),
    'uk-UA': ('Ostap', 'Polina'),
    'ur-IN': ('Gul', 'Salman'),
    'ur-PK': ('Asad', 'Uzma'),
    'uz-UZ': ('Madina', 'Sardor'),
    'vi-VN': ('HoaiMy', 'NamMinh'),
    'zu-ZA': ('Thando', 'Themba'),
}

# Flat voice-name -> locale mapping, e.g. 'zh-CN-XiaoxiaoNeural': 'zh-CN'.
SUPPORTED_VOICES = {
    f'{locale}-{speaker}Neural': locale
    for locale, speakers in _VOICES_BY_LOCALE.items()
    for speaker in speakers
}

# Choices for the webUI language dropdown: "Auto" (detect language from the
# input text) followed by every concrete voice name.
SUPPORTED_LANGUAGES = ["Auto", *SUPPORTED_VOICES.keys()]
|
185
webUI.py
185
webUI.py
@ -5,7 +5,9 @@ import re
|
|||||||
import subprocess
|
import subprocess
|
||||||
import time
|
import time
|
||||||
import traceback
|
import traceback
|
||||||
|
|
||||||
from itertools import chain
|
from itertools import chain
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
# os.system("wget -P cvec/ https://huggingface.co/spaces/innnky/nanami/resolve/main/checkpoint_best_legacy_500.pt")
|
# os.system("wget -P cvec/ https://huggingface.co/spaces/innnky/nanami/resolve/main/checkpoint_best_legacy_500.pt")
|
||||||
import gradio as gr
|
import gradio as gr
|
||||||
@ -19,6 +21,7 @@ from scipy.io import wavfile
|
|||||||
from compress_model import removeOptimizer
|
from compress_model import removeOptimizer
|
||||||
from inference.infer_tool import Svc
|
from inference.infer_tool import Svc
|
||||||
from utils import mix_model
|
from utils import mix_model
|
||||||
|
from edgetts.tts_voices import SUPPORTED_LANGUAGES
|
||||||
|
|
||||||
logging.getLogger('numba').setLevel(logging.WARNING)
|
logging.getLogger('numba').setLevel(logging.WARNING)
|
||||||
logging.getLogger('markdown_it').setLevel(logging.WARNING)
|
logging.getLogger('markdown_it').setLevel(logging.WARNING)
|
||||||
@ -81,7 +84,6 @@ def modelAnalysis(model_path,config_path,cluster_model_path,device,enhance,diff_
|
|||||||
device = cuda[device] if "CUDA" in device else device
|
device = cuda[device] if "CUDA" in device else device
|
||||||
cluster_filepath = os.path.split(cluster_model_path.name) if cluster_model_path is not None else "no_cluster"
|
cluster_filepath = os.path.split(cluster_model_path.name) if cluster_model_path is not None else "no_cluster"
|
||||||
fr = ".pkl" in cluster_filepath[1]
|
fr = ".pkl" in cluster_filepath[1]
|
||||||
#model = Svc(model_path.name, config_path.name, device=device if device!="Auto" else None, cluster_model_path = cluster_model_path.name if cluster_model_path != None else "",nsf_hifigan_enhance=enhance)
|
|
||||||
model = Svc(model_path.name,
|
model = Svc(model_path.name,
|
||||||
config_path.name,
|
config_path.name,
|
||||||
device=device if device != "Auto" else None,
|
device=device if device != "Auto" else None,
|
||||||
@ -126,108 +128,107 @@ def modelUnload():
|
|||||||
model = None
|
model = None
|
||||||
torch.cuda.empty_cache()
|
torch.cuda.empty_cache()
|
||||||
return sid.update(choices = [],value=""),"模型卸载完毕!"
|
return sid.update(choices = [],value=""),"模型卸载完毕!"
|
||||||
|
|
||||||
|
def vc_infer(output_format, sid, audio_path, truncated_basename, vc_transform, auto_f0, cluster_ratio, slice_db, noise_scale, pad_seconds, cl_num, lg_num, lgr_num, f0_predictor, enhancer_adaptive_key, cr_threshold, k_step, use_spk_mix, second_encoding, loudness_envelope_adjustment):
    """Run slice inference with the loaded global model and save the result.

    Writes the converted audio under results/ with a descriptive file name
    and returns that path. Assumes `model` has already been loaded.
    """
    global model
    _audio = model.slice_inference(
        audio_path,
        sid,
        vc_transform,
        slice_db,
        cluster_ratio,
        auto_f0,
        noise_scale,
        pad_seconds,
        cl_num,
        lg_num,
        lgr_num,
        f0_predictor,
        enhancer_adaptive_key,
        cr_threshold,
        k_step,
        use_spk_mix,
        second_encoding,
        loudness_envelope_adjustment
    )
    model.clear_empty()
    # Build the output path and save into the results folder.
    # exist_ok=True replaces the racy exists()+makedirs pair; the dead
    # `str(int(time.time()))` statement (value discarded) was removed.
    os.makedirs("results", exist_ok=True)
    key = "auto" if auto_f0 else f"{int(vc_transform)}key"
    cluster = "_" if cluster_ratio == 0 else f"_{cluster_ratio}_"
    # only_diffusion takes precedence over shallow_diffusion, matching the
    # original assignment order (shallow first, then overwritten by only).
    if model.only_diffusion:
        isdiffusion = "diff"
    elif model.shallow_diffusion:
        isdiffusion = "sovdiff"
    else:
        isdiffusion = "sovits"
    output_file_name = 'result_'+truncated_basename+f'_{sid}_{key}{cluster}{isdiffusion}.{output_format}'
    output_file = os.path.join("results", output_file_name)
    soundfile.write(output_file, _audio, model.target_sample, format=output_format)
    return output_file
|
||||||
|
|
||||||
|
def vc_fn(sid, input_audio, output_format, vc_transform, auto_f0,cluster_ratio, slice_db, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,f0_predictor,enhancer_adaptive_key,cr_threshold,k_step,use_spk_mix,second_encoding,loudness_envelope_adjustment):
    """Gradio callback: voice-convert an uploaded audio file.

    Returns a (status message, output file path or None) tuple; errors are
    re-raised as gr.Error so the UI can display them.
    """
    global model
    try:
        if input_audio is None:
            return "You need to upload an audio", None
        if model is None:
            return "You need to upload an model", None
        if getattr(model, 'cluster_model', None) is None and model.feature_retrieval is False:
            if cluster_ratio != 0:
                return "You need to upload an cluster model or feature retrieval model before assigning cluster ratio!", None
        #print(input_audio)
        audio, sampling_rate = soundfile.read(input_audio)
        #print(audio.shape,sampling_rate)
        # Normalize integer PCM to float32 in [-1, 1].
        if np.issubdtype(audio.dtype, np.integer):
            audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
        #print(audio.dtype)
        if len(audio.shape) > 1:
            # Mix multi-channel input down to mono.
            audio = librosa.to_mono(audio.transpose(1, 0))
        # For unknown reasons Gradio upload paths carry a fixed odd suffix; strip it.
        truncated_basename = Path(input_audio).stem[:-6]
        # Make sure the target directory exists before writing into it.
        os.makedirs("raw", exist_ok=True)
        processed_audio = os.path.join("raw", f"{truncated_basename}.wav")
        soundfile.write(processed_audio, audio, sampling_rate, format="wav")
        output_file = vc_infer(output_format, sid, processed_audio, truncated_basename, vc_transform, auto_f0, cluster_ratio, slice_db, noise_scale, pad_seconds, cl_num, lg_num, lgr_num, f0_predictor, enhancer_adaptive_key, cr_threshold, k_step, use_spk_mix, second_encoding, loudness_envelope_adjustment)
        return "Success", output_file
    except Exception as e:
        if debug:
            traceback.print_exc()
        raise gr.Error(e)
|
||||||
|
|
||||||
def tts_func(_text,_rate,_voice):
|
|
||||||
#使用edge-tts把文字转成音频
|
|
||||||
# voice = "zh-CN-XiaoyiNeural"#女性,较高音
|
|
||||||
# voice = "zh-CN-YunxiNeural"#男性
|
|
||||||
voice = "zh-CN-YunxiNeural"#男性
|
|
||||||
if ( _voice == "女" ) :
|
|
||||||
voice = "zh-CN-XiaoyiNeural"
|
|
||||||
output_file = _text[0:10]+".wav"
|
|
||||||
# communicate = edge_tts.Communicate(_text, voice)
|
|
||||||
# await communicate.save(output_file)
|
|
||||||
if _rate>=0:
|
|
||||||
ratestr="+{:.0%}".format(_rate)
|
|
||||||
elif _rate<0:
|
|
||||||
ratestr="{:.0%}".format(_rate)#减号自带
|
|
||||||
|
|
||||||
p=subprocess.Popen("edge-tts "+
|
|
||||||
" --text "+_text+
|
|
||||||
" --write-media "+output_file+
|
|
||||||
" --voice "+voice+
|
|
||||||
" --rate="+ratestr
|
|
||||||
,shell=True,
|
|
||||||
stdout=subprocess.PIPE,
|
|
||||||
stdin=subprocess.PIPE)
|
|
||||||
p.wait()
|
|
||||||
return output_file
|
|
||||||
|
|
||||||
def text_clear(text):
    """Remove newlines, commas, parentheses and spaces from *text*."""
    # Same character set as the original regex [\n\,\(\) ], done in one
    # C-level pass with str.translate instead of re.sub.
    return text.translate(str.maketrans('', '', '\n,() '))
|
||||||
|
|
||||||
def vc_fn2(sid, input_audio, vc_transform, auto_f0,cluster_ratio, slice_db, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,text2tts,tts_rate,tts_voice,f0_predictor,enhancer_adaptive_key,cr_threshold):
|
def vc_fn2(_text, _lang, _gender, _rate, _volume, sid, output_format, vc_transform, auto_f0,cluster_ratio, slice_db, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,f0_predictor,enhancer_adaptive_key,cr_threshold, k_step,use_spk_mix,second_encoding,loudness_envelope_adjustment):
    """Gradio callback: synthesize _text with edge-tts, then voice-convert it.

    Returns a (status message, output file path or None) tuple; errors are
    re-raised as gr.Error so the UI can display them.
    """
    global model
    try:
        if model is None:
            return "You need to upload an model", None
        if getattr(model, 'cluster_model', None) is None and model.feature_retrieval is False:
            if cluster_ratio != 0:
                return "You need to upload an cluster model or feature retrieval model before assigning cluster ratio!", None
        # edge-tts expects signed percent strings, e.g. "+10%" / "-5%".
        _rate = f"+{int(_rate*100)}%" if _rate >= 0 else f"{int(_rate*100)}%"
        _volume = f"+{int(_volume*100)}%" if _volume >= 0 else f"{int(_volume*100)}%"
        # NOTE(review): interpreter path is hard-coded to a Windows venv and
        # breaks elsewhere — consider sys.executable.
        # check=True makes a failed TTS run raise immediately (surfaced via
        # gr.Error below) instead of failing later with a cryptic load error.
        if _lang == "Auto":
            # Auto mode picks a random voice by gender; map the UI label.
            _gender = "Male" if _gender == "男" else "Female"
            subprocess.run([r"workenv\python.exe", "edgetts/tts.py", _text, _lang, _rate, _volume, _gender], check=True)
        else:
            subprocess.run([r"workenv\python.exe", "edgetts/tts.py", _text, _lang, _rate, _volume], check=True)
        # Resample the synthesized speech to the model's expected 44.1 kHz.
        target_sr = 44100
        y, sr = librosa.load("tts.wav")
        resampled_y = librosa.resample(y, orig_sr=sr, target_sr=target_sr)
        soundfile.write("tts.wav", resampled_y, target_sr, subtype = "PCM_16")
        input_audio = "tts.wav"
        #audio, _ = soundfile.read(input_audio)
        output_file_path = vc_infer(output_format, sid, input_audio, "tts", vc_transform, auto_f0, cluster_ratio, slice_db, noise_scale, pad_seconds, cl_num, lg_num, lgr_num, f0_predictor, enhancer_adaptive_key, cr_threshold, k_step, use_spk_mix, second_encoding, loudness_envelope_adjustment)
        os.remove("tts.wav")
        return "Success", output_file_path
    except Exception as e:
        if debug: traceback.print_exc()
        raise gr.Error(e)
|
||||||
|
|
||||||
def model_compression(_model):
|
def model_compression(_model):
|
||||||
if _model == "":
|
if _model == "":
|
||||||
@ -291,6 +292,7 @@ with gr.Blocks(
|
|||||||
vc_transform = gr.Number(label="变调(整数,可以正负,半音数量,升高八度就是12)", value=0)
|
vc_transform = gr.Number(label="变调(整数,可以正负,半音数量,升高八度就是12)", value=0)
|
||||||
cluster_ratio = gr.Number(label="聚类模型/特征检索混合比例,0-1之间,0即不启用聚类/特征检索。使用聚类/特征检索能提升音色相似度,但会导致咬字下降(如果使用建议0.5左右)", value=0)
|
cluster_ratio = gr.Number(label="聚类模型/特征检索混合比例,0-1之间,0即不启用聚类/特征检索。使用聚类/特征检索能提升音色相似度,但会导致咬字下降(如果使用建议0.5左右)", value=0)
|
||||||
slice_db = gr.Number(label="切片阈值", value=-40)
|
slice_db = gr.Number(label="切片阈值", value=-40)
|
||||||
|
output_format = gr.Radio(label="音频输出格式", choices=["wav", "flac", "mp3"], value = "wav")
|
||||||
noise_scale = gr.Number(label="noise_scale 建议不要动,会影响音质,玄学参数", value=0.4)
|
noise_scale = gr.Number(label="noise_scale 建议不要动,会影响音质,玄学参数", value=0.4)
|
||||||
k_step = gr.Slider(label="浅扩散步数,只有使用了扩散模型才有效,步数越大越接近扩散模型的结果", value=100, minimum = 1, maximum = 1000)
|
k_step = gr.Slider(label="浅扩散步数,只有使用了扩散模型才有效,步数越大越接近扩散模型的结果", value=100, minimum = 1, maximum = 1000)
|
||||||
with gr.Column():
|
with gr.Column():
|
||||||
@ -305,12 +307,15 @@ with gr.Blocks(
|
|||||||
use_spk_mix = gr.Checkbox(label = "动态声线融合", value = False, interactive = False)
|
use_spk_mix = gr.Checkbox(label = "动态声线融合", value = False, interactive = False)
|
||||||
with gr.Tabs():
|
with gr.Tabs():
|
||||||
with gr.TabItem("音频转音频"):
|
with gr.TabItem("音频转音频"):
|
||||||
vc_input3 = gr.Audio(label="选择音频")
|
vc_input3 = gr.Audio(label="选择音频", type="filepath")
|
||||||
vc_submit = gr.Button("音频转换", variant="primary")
|
vc_submit = gr.Button("音频转换", variant="primary")
|
||||||
with gr.TabItem("文字转音频"):
|
with gr.TabItem("文字转音频"):
|
||||||
text2tts=gr.Textbox(label="在此输入要转译的文字。注意,使用该功能建议打开F0预测,不然会很怪")
|
text2tts=gr.Textbox(label="在此输入要转译的文字。注意,使用该功能建议打开F0预测,不然会很怪")
|
||||||
tts_rate = gr.Number(label="tts语速", value=0)
|
with gr.Row():
|
||||||
tts_voice = gr.Radio(label="性别",choices=["男","女"], value="男")
|
tts_gender = gr.Radio(label = "说话人性别", choices = ["男","女"], value = "男")
|
||||||
|
tts_lang = gr.Dropdown(label = "选择语言,Auto为根据输入文字自动识别", choices=SUPPORTED_LANGUAGES, value = "Auto")
|
||||||
|
tts_rate = gr.Slider(label = "TTS语音变速(倍速相对值)", minimum = -1, maximum = 3, value = 0, step = 0.1)
|
||||||
|
tts_volume = gr.Slider(label = "TTS语音音量(相对值)", minimum = -1, maximum = 1.5, value = 0, step = 0.1)
|
||||||
vc_submit2 = gr.Button("文字转换", variant="primary")
|
vc_submit2 = gr.Button("文字转换", variant="primary")
|
||||||
with gr.Row():
|
with gr.Row():
|
||||||
with gr.Column():
|
with gr.Column():
|
||||||
@ -371,8 +376,8 @@ with gr.Blocks(
|
|||||||
<font size=2> WebUI设置</font>
|
<font size=2> WebUI设置</font>
|
||||||
""")
|
""")
|
||||||
debug_button = gr.Checkbox(label="Debug模式,如果向社区反馈BUG需要打开,打开后控制台可以显示具体错误提示", value=debug)
|
debug_button = gr.Checkbox(label="Debug模式,如果向社区反馈BUG需要打开,打开后控制台可以显示具体错误提示", value=debug)
|
||||||
vc_submit.click(vc_fn, [sid, vc_input3, vc_transform,auto_f0,cluster_ratio, slice_db, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,f0_predictor,enhancer_adaptive_key,cr_threshold,k_step,use_spk_mix,second_encoding,loudness_envelope_adjustment], [vc_output1, vc_output2])
|
vc_submit.click(vc_fn, [sid, vc_input3, output_format, vc_transform,auto_f0,cluster_ratio, slice_db, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,f0_predictor,enhancer_adaptive_key,cr_threshold,k_step,use_spk_mix,second_encoding,loudness_envelope_adjustment], [vc_output1, vc_output2])
|
||||||
vc_submit2.click(vc_fn2, [sid, vc_input3, vc_transform,auto_f0,cluster_ratio, slice_db, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,text2tts,tts_rate,tts_voice,f0_predictor,enhancer_adaptive_key,cr_threshold], [vc_output1, vc_output2])
|
vc_submit2.click(vc_fn2, [text2tts, tts_lang, tts_gender, tts_rate, tts_volume, sid, output_format, vc_transform,auto_f0,cluster_ratio, slice_db, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,f0_predictor,enhancer_adaptive_key,cr_threshold,k_step,use_spk_mix,second_encoding,loudness_envelope_adjustment], [vc_output1, vc_output2])
|
||||||
debug_button.change(debug_change,[],[])
|
debug_button.change(debug_change,[],[])
|
||||||
model_load_button.click(modelAnalysis,[model_path,config_path,cluster_model_path,device,enhance,diff_model_path,diff_config_path,only_diffusion,use_spk_mix],[sid,sid_output])
|
model_load_button.click(modelAnalysis,[model_path,config_path,cluster_model_path,device,enhance,diff_model_path,diff_config_path,only_diffusion,use_spk_mix],[sid,sid_output])
|
||||||
model_unload_button.click(modelUnload,[],[sid,sid_output])
|
model_unload_button.click(modelUnload,[],[sid,sid_output])
|
||||||
|
Loading…
Reference in New Issue
Block a user