import gradio as gr
import os
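
# Runtime dependency bootstrap: piper-tts, the XTTS requirements, and a
# no-deps TTS build are installed at import time; a nightly GPU build of
# onnxruntime is added below only when CUDA is available outside a ZeroGPU
# Space.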
os.system("pip install -q piper-tts==1.2.0")
os.system("pip install -q -r requirements_xtts.txt")
os.system("pip install -q TTS==0.21.1 --no-deps")

import spaces
import torch

if os.environ.get("ZERO_GPU") != "TRUE" and torch.cuda.is_available():
    os.system("pip install ort-nightly-gpu --index-url=https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ort-cuda-12-nightly/pypi/simple/")

import librosa
from soni_translate.logging_setup import (
    logger,
    set_logging_level,
    configure_logging_libs,
)

configure_logging_libs()

import whisperx
from soni_translate.audio_segments import create_translated_audio
from soni_translate.text_to_speech import (
    audio_segmentation_to_voice,
    edge_tts_voices_list,
    coqui_xtts_voices_list,
    piper_tts_voices_list,
    create_wav_file_vc,
    accelerate_segments,
)
from soni_translate.translate_segments import (
    translate_text,
    TRANSLATION_PROCESS_OPTIONS,
    DOCS_TRANSLATION_PROCESS_OPTIONS,
)
from soni_translate.preprocessor import (
    audio_video_preprocessor,
    audio_preprocessor,
)
from soni_translate.postprocessor import (
    OUTPUT_TYPE_OPTIONS,
    DOCS_OUTPUT_TYPE_OPTIONS,
    sound_separate,
    get_no_ext_filename,
    media_out,
    get_subtitle_speaker,
)
from soni_translate.language_configuration import (
    LANGUAGES,
    UNIDIRECTIONAL_L_LIST,
    LANGUAGES_LIST,
    BARK_VOICES_LIST,
    VITS_VOICES_LIST,
    OPENAI_TTS_MODELS,
)
from soni_translate.utils import (
    remove_files,
    download_list,
    upload_model_list,
    download_manager,
    run_command,
    is_audio_file,
    is_subtitle_file,
    copy_files,
    get_valid_files,
    get_link_list,
    remove_directory_contents,
)
from soni_translate.mdx_net import (
    UVR_MODELS,
    MDX_DOWNLOAD_LINK,
    mdxnet_models_dir,
)
from soni_translate.speech_segmentation import (
    ASR_MODEL_OPTIONS,
    COMPUTE_TYPE_GPU,
    COMPUTE_TYPE_CPU,
    find_whisper_models,
    transcribe_speech,
    align_speech,
    diarize_speech,
    diarization_models,
)
from soni_translate.text_multiformat_processor import (
    BORDER_COLORS,
    srt_file_to_segments,
    document_preprocessor,
    determine_chunk_size,
    plain_text_to_segments,
    segments_to_plain_text,
    process_subtitles,
    linguistic_level_segments,
    break_aling_segments,
    doc_to_txtximg_pages,
    page_data_to_segments,
    update_page_data,
    fix_timestamps_docs,
    create_video_from_images,
    merge_video_and_audio,
)
from soni_translate.languages_gui import language_data, news
import copy
import logging
import json
from pydub import AudioSegment
from voice_main import ClassVoices
import argparse
import time
import hashlib
import sys

directories = [
    "downloads",
    "logs",
    "weights",
    "clean_song_output",
    "_XTTS_",
    f"audio2{os.sep}audio",
    "audio",
    "outputs",
]
for directory in directories:
    os.makedirs(directory, exist_ok=True)


class TTS_Info:
    def __init__(self, piper_enabled, xtts_enabled):
        self.list_edge = edge_tts_voices_list()
        self.list_bark = list(BARK_VOICES_LIST.keys())
        self.list_vits = list(VITS_VOICES_LIST.keys())
        self.list_openai_tts = OPENAI_TTS_MODELS
        self.piper_enabled = piper_enabled
        self.list_vits_onnx = (
            piper_tts_voices_list() if self.piper_enabled else []
        )
        self.xtts_enabled = xtts_enabled

    def tts_list(self):
        self.list_coqui_xtts = (
            coqui_xtts_voices_list() if self.xtts_enabled else []
        )
        list_tts = self.list_coqui_xtts + sorted(
            self.list_edge
            + (self.list_bark if os.environ.get("ZERO_GPU") != "TRUE" else [])
            + self.list_vits
            + self.list_openai_tts
            + self.list_vits_onnx
        )
        return list_tts
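

# Intended usage sketch (assumed; TTS_Info itself only aggregates the voice
# names exposed by each engine):
#
#     tts_info = TTS_Info(piper_enabled=True, xtts_enabled=False)
#     voices = tts_info.tts_list()  # XTTS voices first, the rest sorted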


def prog_disp(msg, percent, is_gui, progress=None):
    logger.info(msg)
    if is_gui:
        progress(percent, desc=msg)


def warn_disp(wrn_lang, is_gui):
    logger.warning(wrn_lang)
    if is_gui:
        gr.Warning(wrn_lang)


class SoniTrCache:
    def __init__(self):
        self.cache = {
            'media': [[]],
            'refine_vocals': [],
            'transcript_align': [],
            'break_align': [],
            'diarize': [],
            'translate': [],
            'subs_and_edit': [],
            'tts': [],
            'acc_and_vc': [],
            'mix_aud': [],
            'output': []
        }

        self.cache_data = {
            'media': [],
            'refine_vocals': [],
            'transcript_align': [],
            'break_align': [],
            'diarize': [],
            'translate': [],
            'subs_and_edit': [],
            'tts': [],
            'acc_and_vc': [],
            'mix_aud': [],
            'output': []
        }

        self.cache_keys = list(self.cache.keys())
        self.first_task = self.cache_keys[0]
        self.last_task = self.cache_keys[-1]

        self.pre_step = None
        self.pre_params = []

    def set_variable(self, variable_name, value):
        setattr(self, variable_name, value)

    def task_in_cache(self, step: str, params: list, previous_step_data: dict):

        self.pre_step_cache = None

        if step == self.first_task:
            self.pre_step = None

        if self.pre_step:
            self.cache[self.pre_step] = self.pre_params
            self.cache_data[self.pre_step] = copy.deepcopy(previous_step_data)

        self.pre_params = params

        if params == self.cache[step]:
            logger.debug(f"In cache: {str(step)}")

            for key, value in self.cache_data[step].items():
                self.set_variable(key, copy.deepcopy(value))
                logger.debug(
                    f"Cache load: {str(key)}"
                )

            self.pre_step = step
            return True

        else:
            logger.debug(f"Flush next and caching {str(step)}")
            selected_index = self.cache_keys.index(step)

            for idx, key in enumerate(self.cache.keys()):
                if idx >= selected_index:
                    self.cache[key] = []
                    self.cache_data[key] = {}

            self.pre_step = step
            return False

    def clear_cache(self, media, force=False):

        self.cache["media"] = (
            self.cache["media"] if len(self.cache["media"]) else [[]]
        )

        if media != self.cache["media"][0] or force:
            self.cache = {key: [] for key in self.cache}
            self.cache["media"] = [[]]
            logger.info("Cache flushed")


def get_hash(filepath):
    with open(filepath, 'rb') as f:
        file_hash = hashlib.blake2b()
        while chunk := f.read(8192):
            file_hash.update(chunk)

    return file_hash.hexdigest()[:18]
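
# Only the first 18 hex characters of the BLAKE2b digest are kept: enough to
# tell inputs apart for cache invalidation, not meant as a cryptographic
# integrity check.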


def check_openai_api_key():
    if not os.environ.get("OPENAI_API_KEY"):
        raise ValueError(
            "To use GPT for translation, please set up your OpenAI API key "
            "as an environment variable in Linux as follows: "
            "export OPENAI_API_KEY='your-api-key-here'. Or change the "
            "translation process in Advanced settings."
        )


class SoniTranslate(SoniTrCache):
    def __init__(self, cpu_mode=False):
        super().__init__()
        if cpu_mode:
            os.environ["SONITR_DEVICE"] = "cpu"
        else:
            os.environ["SONITR_DEVICE"] = (
                "cuda" if torch.cuda.is_available() else "cpu"
            )

        self.device = os.environ.get("SONITR_DEVICE")
        self.device = (
            self.device if os.environ.get("ZERO_GPU") != "TRUE" else "cuda"
        )
        self.result_diarize = None
        self.align_language = None
        self.result_source_lang = None
        self.edit_subs_complete = False
        self.voiceless_id = None
        self.burn_subs_id = None

        self.vci = ClassVoices(only_cpu=cpu_mode)

        self.tts_voices = self.get_tts_voice_list()

        logger.info(f"Working in: {self.device}")

    def get_tts_voice_list(self):
        try:
            from piper import PiperVoice

            piper_enabled = True
            logger.info("PIPER TTS enabled")
        except Exception as error:
            logger.debug(str(error))
            piper_enabled = False
            logger.info("PIPER TTS disabled")
        try:
            from TTS.api import TTS

            xtts_enabled = True
            logger.info("Coqui XTTS enabled")
            logger.info(
                "In this app, by using Coqui TTS (text-to-speech), you "
                "acknowledge and agree to the license.\n"
                "You confirm that you have read, understood, and agreed "
                "to the Terms and Conditions specified at the following "
                "link:\nhttps://coqui.ai/cpml.txt."
            )
            os.environ["COQUI_TOS_AGREED"] = "1"
        except Exception as error:
            logger.debug(str(error))
            xtts_enabled = False
            logger.info("Coqui XTTS disabled")

        self.tts_info = TTS_Info(piper_enabled, xtts_enabled)

        return self.tts_info.tts_list()

    def batch_multilingual_media_conversion(self, *kwargs):

        media_file_arg = kwargs[0] if kwargs[0] is not None else []

        link_media_arg = kwargs[1]
        link_media_arg = [x.strip() for x in link_media_arg.split(',')]
        link_media_arg = get_link_list(link_media_arg)

        path_arg = kwargs[2]
        path_arg = [x.strip() for x in path_arg.split(',')]
        path_arg = get_valid_files(path_arg)

        edit_text_arg = kwargs[31]
        get_text_arg = kwargs[32]

        is_gui_arg = kwargs[-1]

        kwargs = kwargs[3:]

        media_batch = media_file_arg + link_media_arg + path_arg
        media_batch = list(filter(lambda x: x != "", media_batch))
        media_batch = media_batch if media_batch else [None]
        logger.debug(str(media_batch))

        remove_directory_contents("outputs")

        if edit_text_arg or get_text_arg:
            return self.multilingual_media_conversion(
                media_batch[0], "", "", *kwargs
            )

        if "SET_LIMIT" == os.getenv("DEMO") or "TRUE" == os.getenv("ZERO_GPU"):
            media_batch = [media_batch[0]]

        result = []
        for media in media_batch:
            output_file = self.multilingual_media_conversion(
                media, "", "", *kwargs
            )

            if isinstance(output_file, str):
                output_file = [output_file]
            result.extend(output_file)

            if is_gui_arg and len(media_batch) > 1:
                gr.Info(f"Done: {os.path.basename(output_file[0])}")

        return result
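
    # Note: this batch wrapper receives the full tuple of GUI inputs
    # positionally, so the hard-coded indices above (0-2 for the media
    # sources, 31/32 for get_translated_text/get_video_from_text_json, -1
    # for is_gui) must stay in sync with the parameter order of
    # multilingual_media_conversion below.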

    def multilingual_media_conversion(
        self,
        media_file=None,
        link_media="",
        directory_input="",
        YOUR_HF_TOKEN="",
        preview=False,
        transcriber_model="large-v3",
        batch_size=4,
        compute_type="auto",
        origin_language="Automatic detection",
        target_language="English (en)",
        min_speakers=1,
        max_speakers=1,
        tts_voice00="en-US-EmmaMultilingualNeural-Female",
        tts_voice01="en-US-AndrewMultilingualNeural-Male",
        tts_voice02="en-US-AvaMultilingualNeural-Female",
        tts_voice03="en-US-BrianMultilingualNeural-Male",
        tts_voice04="de-DE-SeraphinaMultilingualNeural-Female",
        tts_voice05="de-DE-FlorianMultilingualNeural-Male",
        tts_voice06="fr-FR-VivienneMultilingualNeural-Female",
        tts_voice07="fr-FR-RemyMultilingualNeural-Male",
        tts_voice08="en-US-EmmaMultilingualNeural-Female",
        tts_voice09="en-US-AndrewMultilingualNeural-Male",
        tts_voice10="en-US-EmmaMultilingualNeural-Female",
        tts_voice11="en-US-AndrewMultilingualNeural-Male",
        video_output_name="",
        mix_method_audio="Adjusting volumes and mixing audio",
        max_accelerate_audio=2.1,
        acceleration_rate_regulation=False,
        volume_original_audio=0.25,
        volume_translated_audio=1.80,
        output_format_subtitle="srt",
        get_translated_text=False,
        get_video_from_text_json=False,
        text_json="{}",
        avoid_overlap=False,
        vocal_refinement=False,
        literalize_numbers=True,
        segment_duration_limit=15,
        diarization_model="pyannote_2.1",
        translate_process="google_translator_batch",
        subtitle_file=None,
        output_type="video (mp4)",
        voiceless_track=False,
        voice_imitation=False,
        voice_imitation_max_segments=3,
        voice_imitation_vocals_dereverb=False,
        voice_imitation_remove_previous=True,
        voice_imitation_method="freevc",
        dereverb_automatic_xtts=True,
        text_segmentation_scale="sentence",
        divide_text_segments_by="",
        soft_subtitles_to_video=True,
        burn_subtitles_to_video=False,
        enable_cache=True,
        custom_voices=False,
        custom_voices_workers=1,
        is_gui=False,
        progress=gr.Progress(),
    ):
        if not YOUR_HF_TOKEN:
            YOUR_HF_TOKEN = os.getenv("YOUR_HF_TOKEN")
            if diarization_model == "disable" or max_speakers == 1:
                if YOUR_HF_TOKEN is None:
                    YOUR_HF_TOKEN = ""
            elif not YOUR_HF_TOKEN:
                raise ValueError("No valid Hugging Face token")
            else:
                os.environ["YOUR_HF_TOKEN"] = YOUR_HF_TOKEN

        if (
            "gpt" in translate_process
            or transcriber_model == "OpenAI_API_Whisper"
            or "OpenAI-TTS" in tts_voice00
        ):
            check_openai_api_key()

        if media_file is None:
            media_file = (
                directory_input
                if os.path.exists(directory_input)
                else link_media
            )
        media_file = (
            media_file if isinstance(media_file, str) else media_file.name
        )

        if is_subtitle_file(media_file):
            subtitle_file = media_file
            media_file = ""

        if media_file is None:
            media_file = ""

        if not origin_language:
            origin_language = "Automatic detection"

        if origin_language in UNIDIRECTIONAL_L_LIST and not subtitle_file:
            raise ValueError(
                f"The language '{origin_language}' "
                "is not supported for transcription (ASR)."
            )

        if get_translated_text:
            self.edit_subs_complete = False
        if get_video_from_text_json:
            if not self.edit_subs_complete:
                raise ValueError("Generate the transcription first.")

        if (
            ("sound" in output_type or output_type == "raw media")
            and (get_translated_text or get_video_from_text_json)
        ):
            raise ValueError(
                "Please disable 'edit generate subtitles' "
                f"first to acquire the {output_type}."
            )

        TRANSLATE_AUDIO_TO = LANGUAGES[target_language]
        SOURCE_LANGUAGE = LANGUAGES[origin_language]

        if (
            transcriber_model == "OpenAI_API_Whisper"
            and SOURCE_LANGUAGE == "zh-TW"
        ):
            logger.warning(
                "OpenAI API Whisper only supports Chinese (Simplified)."
            )
            SOURCE_LANGUAGE = "zh"

        if (
            text_segmentation_scale in ["word", "character"]
            and "subtitle" not in output_type
        ):
            wrn_lang = (
                "Text segmentation by words or characters is typically"
                " used for generating subtitles. If subtitles are not the"
                " intended output, consider selecting the 'sentence' "
                "segmentation method to ensure optimal results."
            )
            warn_disp(wrn_lang, is_gui)

        if tts_voice00[:2].lower() != TRANSLATE_AUDIO_TO[:2].lower():
            wrn_lang = (
                "Make sure to select a 'TTS Speaker' suitable for"
                " the translation language to avoid errors with the TTS."
            )
            warn_disp(wrn_lang, is_gui)

        if "_XTTS_" in tts_voice00 and voice_imitation:
            wrn_lang = (
                "When you select XTTS, it is advisable "
                "to disable Voice Imitation."
            )
            warn_disp(wrn_lang, is_gui)

        if custom_voices and voice_imitation:
            wrn_lang = (
                "When you use R.V.C. models, it is advisable"
                " to disable Voice Imitation."
            )
            warn_disp(wrn_lang, is_gui)

        if not media_file and not subtitle_file:
            raise ValueError(
                "Specify a media or SRT file in advanced settings"
            )

        if subtitle_file:
            subtitle_file = (
                subtitle_file
                if isinstance(subtitle_file, str)
                else subtitle_file.name
            )

        if subtitle_file and SOURCE_LANGUAGE == "Automatic detection":
            raise Exception(
                "To use an SRT file, you need to specify its "
                "original language (Source language)"
            )

        if not media_file and subtitle_file:
            diarization_model = "disable"
            media_file = "audio_support.wav"
            if not get_video_from_text_json:
                remove_files(media_file)
                srt_data = srt_file_to_segments(subtitle_file)
                total_duration = srt_data["segments"][-1]["end"] + 30.
                support_audio = AudioSegment.silent(
                    duration=int(total_duration * 1000)
                )
                support_audio.export(
                    media_file, format="wav"
                )
                logger.info("Support audio for the SRT file created.")

        if "SET_LIMIT" == os.getenv("DEMO"):
            preview = True
            mix_method_audio = "Adjusting volumes and mixing audio"
            transcriber_model = "medium"
            logger.info(
                "DEMO; set preview=True; Generation is limited to "
                "10 seconds to prevent CPU errors. No limitations with GPU.\n"
                "DEMO; set Adjusting volumes and mixing audio\n"
                "DEMO; set whisper model to medium"
            )

        if self.device == "cpu" and compute_type not in COMPUTE_TYPE_CPU:
            logger.info("Compute type changed to float32")
            compute_type = "float32"

        base_video_file = "Video.mp4"
        base_audio_wav = "audio.wav"
        dub_audio_file = "audio_dub_solo.ogg"
        vocals_audio_file = "audio_Vocals_DeReverb.wav"
        voiceless_audio_file = "audio_Voiceless.wav"
        mix_audio_file = "audio_mix.mp3"
        vid_subs = "video_subs_file.mp4"
        video_output_file = "video_dub.mp4"

        if os.path.exists(media_file):
            media_base_hash = get_hash(media_file)
        else:
            media_base_hash = media_file
        self.clear_cache(media_base_hash, force=(not enable_cache))

        if not get_video_from_text_json:
            self.result_diarize = (
                self.align_language
            ) = self.result_source_lang = None
            if not self.task_in_cache("media", [media_base_hash, preview], {}):
                if is_audio_file(media_file):
                    prog_disp(
                        "Processing audio...", 0.15, is_gui, progress=progress
                    )
                    audio_preprocessor(preview, media_file, base_audio_wav)
                else:
                    prog_disp(
                        "Processing video...", 0.15, is_gui, progress=progress
                    )
                    audio_video_preprocessor(
                        preview, media_file, base_video_file, base_audio_wav
                    )
                logger.debug("Set file complete.")
						|  | if "sound" in output_type: | 
					
						
						|  | prog_disp( | 
					
						
						|  | "Separating sounds in the file...", | 
					
						
						|  | 0.50, | 
					
						
						|  | is_gui, | 
					
						
						|  | progress=progress | 
					
						
						|  | ) | 
					
						
						|  | separate_out = sound_separate(base_audio_wav, output_type) | 
					
						
						|  | final_outputs = [] | 
					
						
						|  | for out in separate_out: | 
					
						
						|  | final_name = media_out( | 
					
						
						|  | media_file, | 
					
						
						|  | f"{get_no_ext_filename(out)}", | 
					
						
						|  | video_output_name, | 
					
						
						|  | "wav", | 
					
						
						|  | file_obj=out, | 
					
						
						|  | ) | 
					
						
						|  | final_outputs.append(final_name) | 
					
						
						|  | logger.info(f"Done: {str(final_outputs)}") | 
					
						
						|  | return final_outputs | 
					
						
						|  |  | 
					
						
						|  | if output_type == "raw media": | 
					
						
						|  | output = media_out( | 
					
						
						|  | media_file, | 
					
						
						|  | "raw_media", | 
					
						
						|  | video_output_name, | 
					
						
						|  | "wav" if is_audio_file(media_file) else "mp4", | 
					
						
						|  | file_obj=base_audio_wav if is_audio_file(media_file) else base_video_file, | 
					
						
						|  | ) | 
					
						
						|  | logger.info(f"Done: {output}") | 
					
						
						|  | return output | 
					
						
						|  |  | 
					
						
						|  | if os.environ.get("IS_DEMO") == "TRUE": | 
					
						
						|  | duration_verify = librosa.get_duration(filename=base_audio_wav) | 
					
						
						|  | logger.info(f"Duration: {duration_verify} seconds") | 
					
						
						|  | if duration_verify > 1500: | 
					
						
						|  | raise RuntimeError( | 
					
						
						|  | "The audio is too long to process in this demo. Alternatively, you" | 
					
						
						|  | " can install the app locally or use the Colab notebook available " | 
					
						
						|  | "in the SoniTranslate repository." | 
					
						
						|  | ) | 
					
						
						|  | elif duration_verify > 300: | 
					
						
						|  | tts_voices_list = [ | 
					
						
						|  | tts_voice00, tts_voice01, tts_voice02, tts_voice03, tts_voice04, | 
					
						
						|  | tts_voice05, tts_voice06, tts_voice07, tts_voice08, tts_voice09, | 
					
						
						|  | tts_voice10, tts_voice11 | 
					
						
						|  | ] | 
					
						
						|  |  | 
					
						
						|  | for tts_voice_ in tts_voices_list: | 
					
						
						|  | if "_XTTS_" in tts_voice_: | 
					
						
						|  | raise RuntimeError( | 
					
						
						|  | "XTTS is too slow to be used for audio longer than 5 " | 
					
						
						|  | "minutes in this demo. Alternatively, you can install " | 
					
						
						|  | "the app locally or use the Colab notebook available in" | 
					
						
						|  | " the SoniTranslate repository." | 
					
						
						|  | ) | 
					
						
						|  |  | 
					
						
						|  | if not self.task_in_cache("refine_vocals", [vocal_refinement], {}): | 
					
						
						|  | self.vocals = None | 
					
						
						|  | if vocal_refinement: | 
					
						
						|  | try: | 
					
						
						|  | from soni_translate.mdx_net import process_uvr_task | 
					
						
						|  | _, _, _, _, file_vocals = process_uvr_task( | 
					
						
						|  | orig_song_path=base_audio_wav, | 
					
						
						|  | main_vocals=False, | 
					
						
						|  | dereverb=True, | 
					
						
						|  | remove_files_output_dir=True, | 
					
						
						|  | ) | 
					
						
						|  | remove_files(vocals_audio_file) | 
					
						
						|  | copy_files(file_vocals, ".") | 
					
						
						|  | self.vocals = vocals_audio_file | 
					
						
						|  | except Exception as error: | 
					
						
						|  | logger.error(str(error)) | 
					
						
						|  |  | 
					
						
						|  | if not self.task_in_cache("transcript_align", [ | 
					
						
						|  | subtitle_file, | 
					
						
						|  | SOURCE_LANGUAGE, | 
					
						
						|  | transcriber_model, | 
					
						
						|  | compute_type, | 
					
						
						|  | batch_size, | 
					
						
						|  | literalize_numbers, | 
					
						
						|  | segment_duration_limit, | 
					
						
						|  | ( | 
					
						
						|  | "l_unit" | 
					
						
						|  | if text_segmentation_scale in ["word", "character"] | 
					
						
						|  | and subtitle_file | 
					
						
						|  | else "sentence" | 
					
						
						|  | ) | 
					
						
						|  | ], {"vocals": self.vocals}): | 
					
						
						|  | if subtitle_file: | 
					
						
						|  | prog_disp( | 
					
						
						|  | "From SRT file...", 0.30, is_gui, progress=progress | 
					
						
						|  | ) | 
					
						
						|  | audio = whisperx.load_audio( | 
					
						
						|  | base_audio_wav if not self.vocals else self.vocals | 
					
						
						|  | ) | 
					
						
						|  | self.result = srt_file_to_segments(subtitle_file) | 
					
						
						|  | self.result["language"] = SOURCE_LANGUAGE | 
					
						
						|  | else: | 
					
						
						|  | prog_disp( | 
					
						
						|  | "Transcribing...", 0.30, is_gui, progress=progress | 
					
						
						|  | ) | 
					
						
						|  | SOURCE_LANGUAGE = ( | 
					
						
						|  | None | 
					
						
						|  | if SOURCE_LANGUAGE == "Automatic detection" | 
					
						
						|  | else SOURCE_LANGUAGE | 
					
						
						|  | ) | 
					
						
						|  | audio, self.result = transcribe_speech( | 
					
						
						|  | base_audio_wav if not self.vocals else self.vocals, | 
					
						
						|  | transcriber_model, | 
					
						
						|  | compute_type, | 
					
						
						|  | batch_size, | 
					
						
						|  | SOURCE_LANGUAGE, | 
					
						
						|  | literalize_numbers, | 
					
						
						|  | segment_duration_limit, | 
					
						
						|  | ) | 
					
						
						|  | logger.debug( | 
					
						
						|  | "Transcript complete, " | 
					
						
						|  | f"segments count {len(self.result['segments'])}" | 
					
						
						|  | ) | 
					
						
						|  |  | 
					
						
						|  | self.align_language = self.result["language"] | 
					
						
						|  | if ( | 
					
						
						|  | not subtitle_file | 
					
						
						|  | or text_segmentation_scale in ["word", "character"] | 
					
						
						|  | ): | 
					
						
						|  | prog_disp("Aligning...", 0.45, is_gui, progress=progress) | 
					
						
						|  | try: | 
					
						
						|  | if self.align_language in ["vi"]: | 
					
						
						|  | logger.info( | 
					
						
						|  | "Deficient alignment for the " | 
					
						
						|  | f"{self.align_language} language, skipping the" | 
					
						
						|  | " process. It is suggested to reduce the " | 
					
						
						|  | "duration of the segments as an alternative." | 
					
						
						|  | ) | 
					
						
						|  | else: | 
					
						
						|  | self.result = align_speech(audio, self.result) | 
					
						
						|  | logger.debug( | 
					
						
						|  | "Align complete, " | 
					
						
						|  | f"segments count {len(self.result['segments'])}" | 
					
						
						|  | ) | 
					
						
						|  | except Exception as error: | 
					
						
						|  | logger.error(str(error)) | 
					
						
						|  |  | 
					
						
						|  | if self.result["segments"] == []: | 
					
						
						|  | raise ValueError("No active speech found in audio") | 
					
						
						|  |  | 
					
						
						|  | if not self.task_in_cache("break_align", [ | 
					
						
						|  | divide_text_segments_by, | 
					
						
						|  | text_segmentation_scale, | 
					
						
						|  | self.align_language | 
					
						
						|  | ], { | 
					
						
						|  | "result": self.result, | 
					
						
						|  | "align_language": self.align_language | 
					
						
						|  | }): | 
					
						
						|  | if self.align_language in ["ja", "zh", "zh-TW"]: | 
					
						
						|  | divide_text_segments_by += "|!|?|...|。" | 
					
						
						|  | if text_segmentation_scale in ["word", "character"]: | 
					
						
						|  | self.result = linguistic_level_segments( | 
					
						
						|  | self.result, | 
					
						
						|  | text_segmentation_scale, | 
					
						
						|  | ) | 
					
						
						|  | elif divide_text_segments_by: | 
					
						
						|  | try: | 
					
						
						|  | self.result = break_aling_segments( | 
					
						
						|  | self.result, | 
					
						
						|  | break_characters=divide_text_segments_by, | 
					
						
						|  | ) | 
					
						
						|  | except Exception as error: | 
					
						
						|  | logger.error(str(error)) | 
					
						
						|  |  | 
					
						
						|  | if not self.task_in_cache("diarize", [ | 
					
						
						|  | min_speakers, | 
					
						
						|  | max_speakers, | 
					
						
						|  | YOUR_HF_TOKEN[:len(YOUR_HF_TOKEN)//2], | 
					
						
						|  | diarization_model | 
					
						
						|  | ], { | 
					
						
						|  | "result": self.result | 
					
						
						|  | }): | 
					
						
						|  | prog_disp("Diarizing...", 0.60, is_gui, progress=progress) | 
					
						
						|  | diarize_model_select = diarization_models[diarization_model] | 
					
						
						|  | self.result_diarize = diarize_speech( | 
					
						
						|  | base_audio_wav if not self.vocals else self.vocals, | 
					
						
						|  | self.result, | 
					
						
						|  | min_speakers, | 
					
						
						|  | max_speakers, | 
					
						
						|  | YOUR_HF_TOKEN, | 
					
						
						|  | diarize_model_select, | 
					
						
						|  | ) | 
					
						
						|  | logger.debug("Diarize complete") | 
					
						
						|  | self.result_source_lang = copy.deepcopy(self.result_diarize) | 
					
						
						|  |  | 
					
						
						|  | if not self.task_in_cache("translate", [ | 
					
						
						|  | TRANSLATE_AUDIO_TO, | 
					
						
						|  | translate_process | 
					
						
						|  | ], { | 
					
						
						|  | "result_diarize": self.result_diarize | 
					
						
						|  | }): | 
					
						
						|  | prog_disp("Translating...", 0.70, is_gui, progress=progress) | 
					
						
						|  | lang_source = ( | 
					
						
						|  | self.align_language | 
					
						
						|  | if self.align_language | 
					
						
						|  | else SOURCE_LANGUAGE | 
					
						
						|  | ) | 
					
						
						|  | self.result_diarize["segments"] = translate_text( | 
					
						
						|  | self.result_diarize["segments"], | 
					
						
						|  | TRANSLATE_AUDIO_TO, | 
					
						
						|  | translate_process, | 
					
						
						|  | chunk_size=1800, | 
					
						
						|  | source=lang_source, | 
					
						
						|  | ) | 
					
						
						|  | logger.debug("Translation complete") | 
					
						
						|  | logger.debug(self.result_diarize) | 
					
						
						|  |  | 
					
						

        if get_translated_text:

            json_data = []
            for segment in self.result_diarize["segments"]:
                start = segment["start"]
                text = segment["text"]
                speaker = int(segment.get("speaker", "SPEAKER_00")[-2:]) + 1
                json_data.append(
                    {"start": start, "text": text, "speaker": speaker}
                )

            json_string = json.dumps(json_data, indent=2)
            logger.info("Done")
            self.edit_subs_complete = True
            return json_string.encode().decode("unicode_escape")
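
        # The editable transcript returned above is a JSON list of objects
        # shaped like {"start": 0.52, "text": "...", "speaker": 1}; the same
        # shape is expected back in `text_json` when
        # get_video_from_text_json is enabled.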

        if get_video_from_text_json:

            if self.result_diarize is None:
                raise ValueError("Generate the transcription first.")

            text_json_loaded = json.loads(text_json)
            for i, segment in enumerate(self.result_diarize["segments"]):
                segment["text"] = text_json_loaded[i]["text"]
                segment["speaker"] = "SPEAKER_{:02d}".format(
                    int(text_json_loaded[i]["speaker"]) - 1
                )
						|  | if not self.task_in_cache("subs_and_edit", [ | 
					
						
						|  | copy.deepcopy(self.result_diarize), | 
					
						
						|  | output_format_subtitle, | 
					
						
						|  | TRANSLATE_AUDIO_TO | 
					
						
						|  | ], { | 
					
						
						|  | "result_diarize": self.result_diarize | 
					
						
						|  | }): | 
					
						
						|  | if output_format_subtitle == "disable": | 
					
						
						|  | self.sub_file = "sub_tra.srt" | 
					
						
						|  | elif output_format_subtitle != "ass": | 
					
						
						|  | self.sub_file = process_subtitles( | 
					
						
						|  | self.result_source_lang, | 
					
						
						|  | self.align_language, | 
					
						
						|  | self.result_diarize, | 
					
						
						|  | output_format_subtitle, | 
					
						
						|  | TRANSLATE_AUDIO_TO, | 
					
						
						|  | ) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | if output_format_subtitle != "srt": | 
					
						
						|  | _ = process_subtitles( | 
					
						
						|  | self.result_source_lang, | 
					
						
						|  | self.align_language, | 
					
						
						|  | self.result_diarize, | 
					
						
						|  | "srt", | 
					
						
						|  | TRANSLATE_AUDIO_TO, | 
					
						
						|  | ) | 
					
						
						|  |  | 
					
						
						|  | if output_format_subtitle == "ass": | 
					
						
						|  | convert_ori = "ffmpeg -i sub_ori.srt sub_ori.ass -y" | 
					
						
						|  | convert_tra = "ffmpeg -i sub_tra.srt sub_tra.ass -y" | 
					
						
						|  | self.sub_file = "sub_tra.ass" | 
					
						
						|  | run_command(convert_ori) | 
					
						
						|  | run_command(convert_tra) | 
					
						
						|  |  | 
					
						
						|  | format_sub = ( | 
					
						
						|  | output_format_subtitle | 
					
						
						|  | if output_format_subtitle != "disable" | 
					
						
						|  | else "srt" | 
					
						
						|  | ) | 
					
						
						|  |  | 
					
						
						|  | if output_type == "subtitle": | 
					
						
						|  |  | 
					
						
						|  | out_subs = [] | 
					
						
						|  | tra_subs = media_out( | 
					
						
						|  | media_file, | 
					
						
						|  | TRANSLATE_AUDIO_TO, | 
					
						
						|  | video_output_name, | 
					
						
						|  | format_sub, | 
					
						
						|  | file_obj=self.sub_file, | 
					
						
						|  | ) | 
					
						
						|  | out_subs.append(tra_subs) | 
					
						
						|  |  | 
					
						
						|  | ori_subs = media_out( | 
					
						
						|  | media_file, | 
					
						
						|  | self.align_language, | 
					
						
						|  | video_output_name, | 
					
						
						|  | format_sub, | 
					
						
						|  | file_obj=f"sub_ori.{format_sub}", | 
					
						
						|  | ) | 
					
						
						|  | out_subs.append(ori_subs) | 
					
						
						|  | logger.info(f"Done: {out_subs}") | 
					
						
						|  | return out_subs | 
					
						
						|  |  | 
					
						
						|  | if output_type == "subtitle [by speaker]": | 
					
						
						|  | output = get_subtitle_speaker( | 
					
						
						|  | media_file, | 
					
						
						|  | result=self.result_diarize, | 
					
						
						|  | language=TRANSLATE_AUDIO_TO, | 
					
						
						|  | extension=format_sub, | 
					
						
						|  | base_name=video_output_name, | 
					
						
						|  | ) | 
					
						
						|  | logger.info(f"Done: {str(output)}") | 
					
						
						|  | return output | 
					
						
						|  |  | 
					
						
						|  | if "video [subtitled]" in output_type: | 
					
						
						|  | output = media_out( | 
					
						
						|  | media_file, | 
					
						
						|  | TRANSLATE_AUDIO_TO + "_subtitled", | 
					
						
						|  | video_output_name, | 
					
						
						|  | "wav" if is_audio_file(media_file) else ( | 
					
						
						|  | "mkv" if "mkv" in output_type else "mp4" | 
					
						
						|  | ), | 
					
						
						|  | file_obj=base_audio_wav if is_audio_file(media_file) else base_video_file, | 
					
						
						|  | soft_subtitles=False if is_audio_file(media_file) else True, | 
					
						
						|  | subtitle_files=output_format_subtitle, | 
					
						
						|  | ) | 
					
						
						|  | msg_out = output[0] if isinstance(output, list) else output | 
					
						
						|  | logger.info(f"Done: {msg_out}") | 
					
						
						|  | return output | 
					
						
						|  |  | 
					
						
						|  | if not self.task_in_cache("tts", [ | 
					
						
						|  | TRANSLATE_AUDIO_TO, | 
					
						
						|  | tts_voice00, | 
					
						
						|  | tts_voice01, | 
					
						
						|  | tts_voice02, | 
					
						
						|  | tts_voice03, | 
					
						
						|  | tts_voice04, | 
					
						
						|  | tts_voice05, | 
					
						
						|  | tts_voice06, | 
					
						
						|  | tts_voice07, | 
					
						
						|  | tts_voice08, | 
					
						
						|  | tts_voice09, | 
					
						
						|  | tts_voice10, | 
					
						
						|  | tts_voice11, | 
					
						
						|  | dereverb_automatic_xtts | 
					
						
						|  | ], { | 
					
						
						|  | "sub_file": self.sub_file | 
					
						
						|  | }): | 
					
						
						|  | prog_disp("Text to speech...", 0.80, is_gui, progress=progress) | 
					
						
						|  | self.valid_speakers = audio_segmentation_to_voice( | 
					
						
						|  | self.result_diarize, | 
					
						
						|  | TRANSLATE_AUDIO_TO, | 
					
						
						|  | is_gui, | 
					
						
						|  | tts_voice00, | 
					
						
						|  | tts_voice01, | 
					
						
						|  | tts_voice02, | 
					
						
						|  | tts_voice03, | 
					
						
						|  | tts_voice04, | 
					
						
						|  | tts_voice05, | 
					
						
						|  | tts_voice06, | 
					
						
						|  | tts_voice07, | 
					
						
						|  | tts_voice08, | 
					
						
						|  | tts_voice09, | 
					
						
						|  | tts_voice10, | 
					
						
						|  | tts_voice11, | 
					
						
						|  | dereverb_automatic_xtts, | 
					
						
						|  | ) | 
					
						
						|  |  | 
					
						
						|  | if not self.task_in_cache("acc_and_vc", [ | 
					
						
						|  | max_accelerate_audio, | 
					
						
						|  | acceleration_rate_regulation, | 
					
						
						|  | voice_imitation, | 
					
						
						|  | voice_imitation_max_segments, | 
					
						
						|  | voice_imitation_remove_previous, | 
					
						
						|  | voice_imitation_vocals_dereverb, | 
					
						
						|  | voice_imitation_method, | 
					
						
						|  | custom_voices, | 
					
						
						|  | custom_voices_workers, | 
					
						
						|  | copy.deepcopy(self.vci.model_config), | 
					
						
						|  | avoid_overlap | 
					
						
						|  | ], { | 
					
						
						|  | "valid_speakers": self.valid_speakers | 
					
						
						|  | }): | 
					
						
						|  | audio_files, speakers_list = accelerate_segments( | 
					
						
						|  | self.result_diarize, | 
					
						
						|  | max_accelerate_audio, | 
					
						
						|  | self.valid_speakers, | 
					
						
						|  | acceleration_rate_regulation, | 
					
						
						|  | ) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | if voice_imitation: | 
					
						
						|  | prog_disp( | 
					
						
						|  | "Voice Imitation...", 0.85, is_gui, progress=progress | 
					
						
						|  | ) | 
					
						
						|  | from soni_translate.text_to_speech import toneconverter | 
					
						
						|  |  | 
					
						
						|  | try: | 
					
						
						|  | toneconverter( | 
					
						
						|  | copy.deepcopy(self.result_diarize), | 
					
						
						|  | voice_imitation_max_segments, | 
					
						
						|  | voice_imitation_remove_previous, | 
					
						
						|  | voice_imitation_vocals_dereverb, | 
					
						
						|  | voice_imitation_method, | 
					
						
						|  | ) | 
					
						
						|  | except Exception as error: | 
					
						
						|  | logger.error(str(error)) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | if custom_voices: | 
					
						
						|  | prog_disp( | 
					
						
						|  | "Applying customized voices...", | 
					
						
						|  | 0.90, | 
					
						
						|  | is_gui, | 
					
						
						|  | progress=progress, | 
					
						
						|  | ) | 
					
						
						|  |  | 
					
						
						|  | try: | 
					
						
						|  | self.vci( | 
					
						
						|  | audio_files, | 
					
						
						|  | speakers_list, | 
					
						
						|  | overwrite=True, | 
					
						
						|  | parallel_workers=custom_voices_workers, | 
					
						
						|  | ) | 
					
						
						|  | self.vci.unload_models() | 
					
						
						|  | except Exception as error: | 
					
						
						|  | logger.error(str(error)) | 
					
						
						|  |  | 
					
						
						|  | prog_disp( | 
					
						
						|  | "Creating final translated video...", | 
					
						
						|  | 0.95, | 
					
						
						|  | is_gui, | 
					
						
						|  | progress=progress, | 
					
						
						|  | ) | 
					
						
						|  | remove_files(dub_audio_file) | 
					
						
						|  | create_translated_audio( | 
					
						
						|  | self.result_diarize, | 
					
						
						|  | audio_files, | 
					
						
						|  | dub_audio_file, | 
					
						
						|  | False, | 
					
						
						|  | avoid_overlap, | 
					
						
						|  | ) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						

        hash_base_audio_wav = get_hash(base_audio_wav)
        if voiceless_track:
            if self.voiceless_id != hash_base_audio_wav:
                from soni_translate.mdx_net import process_uvr_task

                try:
                    remove_files(voiceless_audio_file)
                    uvr_voiceless_audio_wav, _ = process_uvr_task(
                        orig_song_path=base_audio_wav,
                        song_id="voiceless",
                        only_voiceless=True,
                        remove_files_output_dir=False,
                    )
                    copy_files(uvr_voiceless_audio_wav, ".")
                    base_audio_wav = voiceless_audio_file
                    self.voiceless_id = hash_base_audio_wav

                except Exception as error:
                    logger.error(str(error))
            else:
                base_audio_wav = voiceless_audio_file
						|  | if not self.task_in_cache("mix_aud", [ | 
					
						
						|  | mix_method_audio, | 
					
						
						|  | volume_original_audio, | 
					
						
						|  | volume_translated_audio, | 
					
						
						|  | voiceless_track | 
					
						
						|  | ], {}): | 
					
						
						|  |  | 
					
						
						|  | remove_files(mix_audio_file) | 
					
						
						|  | command_volume_mix = f'ffmpeg -y -i {base_audio_wav} -i {dub_audio_file} -filter_complex "[0:0]volume={volume_original_audio}[a];[1:0]volume={volume_translated_audio}[b];[a][b]amix=inputs=2:duration=longest" -c:a libmp3lame {mix_audio_file}' | 
					
						
						|  | command_background_mix = f'ffmpeg -i {base_audio_wav} -i {dub_audio_file} -filter_complex "[1:a]asplit=2[sc][mix];[0:a][sc]sidechaincompress=threshold=0.003:ratio=20[bg]; [bg][mix]amerge[final]" -map [final] {mix_audio_file}' | 
					
						
						|  | if mix_method_audio == "Adjusting volumes and mixing audio": | 
					
						
						|  |  | 
					
						
						|  | run_command(command_volume_mix) | 
					
						
						|  | else: | 
					
						
						|  | try: | 
					
						
						|  |  | 
					
						
						|  | run_command(command_background_mix) | 
					
						
						|  | except Exception as error_mix: | 
					
						
						|  |  | 
					
						
						|  | logger.error(str(error_mix)) | 
					
						
						|  | run_command(command_volume_mix) | 
					
						
						|  |  | 
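        # Audio-only outputs (or audio inputs) have no video to mux, so the
        # mixed track is exported directly and the method returns early.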
						|  | if "audio" in output_type or is_audio_file(media_file): | 
					
						
						|  | output = media_out( | 
					
						
						|  | media_file, | 
					
						
						|  | TRANSLATE_AUDIO_TO, | 
					
						
						|  | video_output_name, | 
					
						
						|  | "wav" if "wav" in output_type else ( | 
					
						
						|  | "ogg" if "ogg" in output_type else "mp3" | 
					
						
						|  | ), | 
					
						
						|  | file_obj=mix_audio_file, | 
					
						
						|  | subtitle_files=output_format_subtitle, | 
					
						
						|  | ) | 
					
						
						|  | msg_out = output[0] if isinstance(output, list) else output | 
					
						
						|  | logger.info(f"Done: {msg_out}") | 
					
						
						|  | return output | 
					
						
						|  |  | 
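        # Hard subtitles: re-encode the video with the subtitles filter. The
        # cache key pairs the video hash with the translated segment texts,
        # so the expensive re-encode is skipped when nothing has changed.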
        hash_base_video_file = get_hash(base_video_file)

        if burn_subtitles_to_video:
            hashvideo_text = [
                hash_base_video_file,
                [seg["text"] for seg in self.result_diarize["segments"]]
            ]
            if self.burn_subs_id != hashvideo_text:
                try:
                    logger.info("Burn subtitles")
                    remove_files(vid_subs)
                    command = f"ffmpeg -i {base_video_file} -y -vf subtitles=sub_tra.srt -max_muxing_queue_size 9999 {vid_subs}"
                    run_command(command)
                    base_video_file = vid_subs
                    self.burn_subs_id = hashvideo_text
                except Exception as error:
                    logger.error(str(error))
            else:
                base_video_file = vid_subs

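        # Final mux: combine the (possibly subtitled) video with the mixed
        # audio using stream copy, so no re-encoding happens at this step.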
						|  | if not self.task_in_cache("output", [ | 
					
						
						|  | hash_base_video_file, | 
					
						
						|  | hash_base_audio_wav, | 
					
						
						|  | burn_subtitles_to_video | 
					
						
						|  | ], {}): | 
					
						
						|  |  | 
					
						
						|  | remove_files(video_output_file) | 
					
						
						|  | run_command( | 
					
						
						|  | f"ffmpeg -i {base_video_file} -i {mix_audio_file} -c:v copy -c:a copy -map 0:v -map 1:a -shortest {video_output_file}" | 
					
						
						|  | ) | 
					
						
						|  |  | 
					
						
						|  | output = media_out( | 
					
						
						|  | media_file, | 
					
						
						|  | TRANSLATE_AUDIO_TO, | 
					
						
						|  | video_output_name, | 
					
						
						|  | "mkv" if "mkv" in output_type else "mp4", | 
					
						
						|  | file_obj=video_output_file, | 
					
						
						|  | soft_subtitles=soft_subtitles_to_video, | 
					
						
						|  | subtitle_files=output_format_subtitle, | 
					
						
						|  | ) | 
					
						
						|  | msg_out = output[0] if isinstance(output, list) else output | 
					
						
						|  | logger.info(f"Done: {msg_out}") | 
					
						
						|  |  | 
					
						
						|  | return output | 
					
						
						|  |  | 
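    # Beta: turns a document (PDF) into a narrated "videobook" by rendering
    # pages as images, translating the extracted text, synthesizing speech,
    # and merging everything into a single video.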
    def hook_beta_processor(
        self,
        document,
        tgt_lang,
        translate_process,
        ori_lang,
        tts,
        name_final_file,
        custom_voices,
        custom_voices_workers,
        output_type,
        chunk_size,
        width,
        height,
        start_page,
        end_page,
        bcolor,
        is_gui,
        progress
    ):
        prog_disp("Processing pages...", 0.10, is_gui, progress=progress)
        doc_data = doc_to_txtximg_pages(
            document, width, height, start_page, end_page, bcolor
        )
        result_diarize = page_data_to_segments(doc_data, 1700)

        prog_disp("Translating...", 0.20, is_gui, progress=progress)
        result_diarize["segments"] = translate_text(
            result_diarize["segments"],
            tgt_lang,
            translate_process,
            chunk_size=0,
            source=ori_lang,
        )
        chunk_size = (
            chunk_size if chunk_size else determine_chunk_size(tts)
        )
        doc_data = update_page_data(result_diarize, doc_data)

        prog_disp("Text to speech...", 0.30, is_gui, progress=progress)
        result_diarize = page_data_to_segments(doc_data, chunk_size)
        valid_speakers = audio_segmentation_to_voice(
            result_diarize,
            tgt_lang,
            is_gui,
            tts,
        )

        audio_files, speakers_list = accelerate_segments(
            result_diarize,
            1.0,
            valid_speakers,
        )

        if custom_voices:
            prog_disp(
                "Applying customized voices...",
                0.60,
                is_gui,
                progress=progress,
            )
            self.vci(
                audio_files,
                speakers_list,
                overwrite=True,
                parallel_workers=custom_voices_workers,
            )
            self.vci.unload_models()

        result_diarize = fix_timestamps_docs(result_diarize, audio_files)
        final_wav_file = "audio_book.wav"
        remove_files(final_wav_file)

        prog_disp("Creating audio file...", 0.70, is_gui, progress=progress)
        create_translated_audio(
            result_diarize, audio_files, final_wav_file, False
        )

        prog_disp("Creating video file...", 0.80, is_gui, progress=progress)
        video_doc = create_video_from_images(
            doc_data,
            result_diarize
        )

        prog_disp("Merging...", 0.90, is_gui, progress=progress)
        vid_out = merge_video_and_audio(video_doc, final_wav_file)

        output = media_out(
            document,
            tgt_lang,
            name_final_file,
            "mkv" if "mkv" in output_type else "mp4",
            file_obj=vid_out,
        )
        logger.info(f"Done: {output}")
        return output

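    # Entry point for document/plain-text narration: optionally translates
    # the text, then synthesizes it as an audio file, or delegates to
    # hook_beta_processor for videobook output.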
    def multilingual_docs_conversion(
        self,
        string_text="",
        document=None,
        directory_input="",
        origin_language="English (en)",
        target_language="English (en)",
        tts_voice00="en-US-EmmaMultilingualNeural-Female",
        name_final_file="",
        translate_process="google_translator",
        output_type="audio",
        chunk_size=None,
        custom_voices=False,
        custom_voices_workers=1,
        start_page=1,
        end_page=99999,
        width=1280,
        height=720,
        bcolor="dynamic",
        is_gui=False,
        progress=gr.Progress(),
    ):
        if "gpt" in translate_process:
            check_openai_api_key()

        SOURCE_LANGUAGE = LANGUAGES[origin_language]
        if translate_process != "disable_translation":
            TRANSLATE_AUDIO_TO = LANGUAGES[target_language]
        else:
            TRANSLATE_AUDIO_TO = SOURCE_LANGUAGE
            logger.info("No translation")
        if tts_voice00[:2].lower() != TRANSLATE_AUDIO_TO[:2].lower():
            logger.debug(
                "Make sure to select a 'TTS Speaker' suitable for the "
                "translation language to avoid errors with the TTS."
            )

        self.clear_cache(string_text, force=True)

        is_string = False
        if document is None:
            if os.path.exists(directory_input):
                document = directory_input
            else:
                document = string_text
                is_string = True
        document = document if isinstance(document, str) else document.name
        if not document:
            raise Exception("No data found")

        if os.environ.get("IS_DEMO") == "TRUE" and not is_string:
            raise RuntimeError(
                "This option is disabled in this demo. "
                "Alternatively, you can install "
                "the app locally or use the Colab notebook available in"
                " the SoniTranslate repository."
            )

        if "videobook" in output_type:
            if not document.lower().endswith(".pdf"):
                raise ValueError(
                    "Videobooks are only compatible with PDF files."
                )

            return self.hook_beta_processor(
                document,
                TRANSLATE_AUDIO_TO,
                translate_process,
                SOURCE_LANGUAGE,
                tts_voice00,
                name_final_file,
                custom_voices,
                custom_voices_workers,
                output_type,
                chunk_size,
                width,
                height,
                start_page,
                end_page,
                bcolor,
                is_gui,
                progress
            )

        final_wav_file = "audio_book.wav"

        prog_disp("Processing text...", 0.15, is_gui, progress=progress)
        result_file_path, result_text = document_preprocessor(
            document, is_string, start_page, end_page
        )

        if (
            output_type == "book (txt)"
            and translate_process == "disable_translation"
        ):
            return result_file_path

        if os.getenv("DEMO") == "SET_LIMIT":
            result_text = result_text[:50]
            logger.info(
                "DEMO; Generation is limited to 50 characters to prevent "
                "CPU errors. No limitations with GPU.\n"
            )

        if translate_process != "disable_translation":
            result_diarize = plain_text_to_segments(result_text, 1700)
            prog_disp("Translating...", 0.30, is_gui, progress=progress)
            result_diarize["segments"] = translate_text(
                result_diarize["segments"],
                TRANSLATE_AUDIO_TO,
                translate_process,
                chunk_size=0,
                source=SOURCE_LANGUAGE,
            )

            txt_file_path, result_text = segments_to_plain_text(result_diarize)

            if output_type == "book (txt)":
                return media_out(
                    result_file_path if is_string else document,
                    TRANSLATE_AUDIO_TO,
                    name_final_file,
                    "txt",
                    file_obj=txt_file_path,
                )

        chunk_size = (
            chunk_size if chunk_size else determine_chunk_size(tts_voice00)
        )
        result_diarize = plain_text_to_segments(result_text, chunk_size)
        logger.debug(result_diarize)

        prog_disp("Text to speech...", 0.45, is_gui, progress=progress)
        valid_speakers = audio_segmentation_to_voice(
            result_diarize,
            TRANSLATE_AUDIO_TO,
            is_gui,
            tts_voice00,
        )

        audio_files, speakers_list = accelerate_segments(
            result_diarize,
            1.0,
            valid_speakers,
        )

        if custom_voices:
            prog_disp(
                "Applying customized voices...",
                0.80,
                is_gui,
                progress=progress,
            )
            self.vci(
                audio_files,
                speakers_list,
                overwrite=True,
                parallel_workers=custom_voices_workers,
            )
            self.vci.unload_models()

        prog_disp(
            "Creating final audio file...", 0.90, is_gui, progress=progress
        )
        remove_files(final_wav_file)
        create_translated_audio(
            result_diarize, audio_files, final_wav_file, True
        )

        output = media_out(
            result_file_path if is_string else document,
            TRANSLATE_AUDIO_TO,
            name_final_file,
            "mp3" if "mp3" in output_type else (
                "ogg" if "ogg" in output_type else "wav"
            ),
            file_obj=final_wav_file,
        )

        logger.info(f"Done: {output}")

        return output


title = "<center><strong><font size='7'>📽️ SoniTranslate 🈷️</font></strong></center>"

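# Builds the Gradio interface. UI strings come from lg_conf (the selected
# interface-language dictionary); SoniTr is assumed to be the SoniTranslate
# instance created earlier in this file.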
def create_gui(theme, logs_in_gui=False):
    with gr.Blocks(theme=theme) as app:
        gr.Markdown(title)
        gr.Markdown(lg_conf["description"])

        if os.environ.get("ZERO_GPU") == "TRUE":
            gr.Markdown(
                """
<details>
<summary style="font-size: 1.5em;">⚠️ Important ⚠️</summary>
<ul>
<li>🚀 This demo uses a zero GPU setup only for the transcription and diarization process. Everything else runs on the CPU. It is recommended to use videos no longer than 15 minutes. ⏳</li>
<li>❗ If you see `queue` when using this, it means another user is currently using it, and you need to wait until they are finished.</li>
<li>🔒 Some functions are disabled, but if you duplicate this with a GPU and set the value in secrets "ZERO_GPU" to FALSE, you can use the app with full GPU acceleration. ⚡</li>
</ul>
</details>
                """
            )

        with gr.Tab(lg_conf["tab_translate"]):
            with gr.Row():
                with gr.Column():
                    input_data_type = gr.Dropdown(
                        ["SUBMIT VIDEO", "URL", "Find Video Path"],
                        value="SUBMIT VIDEO",
                        label=lg_conf["video_source"],
                    )

                    def swap_visibility(data_type):
                        if data_type == "URL":
                            return (
                                gr.update(visible=False, value=None),
                                gr.update(visible=True, value=""),
                                gr.update(visible=False, value=""),
                            )
                        elif data_type == "SUBMIT VIDEO":
                            return (
                                gr.update(visible=True, value=None),
                                gr.update(visible=False, value=""),
                                gr.update(visible=False, value=""),
                            )
                        elif data_type == "Find Video Path":
                            return (
                                gr.update(visible=False, value=None),
                                gr.update(visible=False, value=""),
                                gr.update(visible=True, value=""),
                            )

                    video_input = gr.File(
                        label="VIDEO",
                        file_count="multiple",
                        type="filepath",
                    )
                    blink_input = gr.Textbox(
                        visible=False,
                        label=lg_conf["link_label"],
                        info=lg_conf["link_info"],
                        placeholder=lg_conf["link_ph"],
                    )
                    directory_input = gr.Textbox(
                        visible=False,
                        label=lg_conf["dir_label"],
                        info=lg_conf["dir_info"],
                        placeholder=lg_conf["dir_ph"],
                    )
                    input_data_type.change(
                        fn=swap_visibility,
                        inputs=input_data_type,
                        outputs=[video_input, blink_input, directory_input],
                    )

                    gr.HTML()

                    SOURCE_LANGUAGE = gr.Dropdown(
                        LANGUAGES_LIST,
                        value=LANGUAGES_LIST[0],
                        label=lg_conf["sl_label"],
                        info=lg_conf["sl_info"],
                    )
                    TRANSLATE_AUDIO_TO = gr.Dropdown(
                        LANGUAGES_LIST[1:],
                        value="English (en)",
                        label=lg_conf["tat_label"],
                        info=lg_conf["tat_info"],
                    )

                    gr.HTML("<hr>")

                    gr.Markdown(lg_conf["num_speakers"])
                    MAX_TTS = 12
                    min_speakers = gr.Slider(
                        1,
                        MAX_TTS,
                        value=1,
                        label=lg_conf["min_sk"],
                        step=1,
                        visible=False,
                    )
                    max_speakers = gr.Slider(
                        1,
                        MAX_TTS,
                        value=1,
                        step=1,
                        label=lg_conf["max_sk"],
                    )
                    gr.Markdown(lg_conf["tts_select"])

                    def submit(value):
                        visibility_dict = {
                            f"tts_voice{i:02d}": gr.update(visible=i < value)
                            for i in range(MAX_TTS)
                        }
                        return list(visibility_dict.values())

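                    # One dropdown per possible speaker (MAX_TTS in total);
                    # submit() above shows only the first max_speakers of
                    # them.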
                    tts_voice00 = gr.Dropdown(
                        SoniTr.tts_info.tts_list(),
                        value="en-US-EmmaMultilingualNeural-Female",
                        label=lg_conf["sk1"],
                        visible=True,
                        interactive=True,
                    )
                    tts_voice01 = gr.Dropdown(
                        SoniTr.tts_info.tts_list(),
                        value="en-US-AndrewMultilingualNeural-Male",
                        label=lg_conf["sk2"],
                        visible=False,
                        interactive=True,
                    )
                    tts_voice02 = gr.Dropdown(
                        SoniTr.tts_info.tts_list(),
                        value="en-US-AvaMultilingualNeural-Female",
                        label=lg_conf["sk3"],
                        visible=False,
                        interactive=True,
                    )
                    tts_voice03 = gr.Dropdown(
                        SoniTr.tts_info.tts_list(),
                        value="en-US-BrianMultilingualNeural-Male",
                        label=lg_conf["sk4"],
                        visible=False,
                        interactive=True,
                    )
                    tts_voice04 = gr.Dropdown(
                        SoniTr.tts_info.tts_list(),
                        value="de-DE-SeraphinaMultilingualNeural-Female",
                        label=lg_conf["sk5"],
                        visible=False,
                        interactive=True,
                    )
                    tts_voice05 = gr.Dropdown(
                        SoniTr.tts_info.tts_list(),
                        value="de-DE-FlorianMultilingualNeural-Male",
                        label=lg_conf["sk6"],
                        visible=False,
                        interactive=True,
                    )
                    tts_voice06 = gr.Dropdown(
                        SoniTr.tts_info.tts_list(),
                        value="fr-FR-VivienneMultilingualNeural-Female",
                        label=lg_conf["sk7"],
                        visible=False,
                        interactive=True,
                    )
                    tts_voice07 = gr.Dropdown(
                        SoniTr.tts_info.tts_list(),
                        value="fr-FR-RemyMultilingualNeural-Male",
                        label=lg_conf["sk8"],
                        visible=False,
                        interactive=True,
                    )
                    tts_voice08 = gr.Dropdown(
                        SoniTr.tts_info.tts_list(),
                        value="en-US-EmmaMultilingualNeural-Female",
                        label=lg_conf["sk9"],
                        visible=False,
                        interactive=True,
                    )
                    tts_voice09 = gr.Dropdown(
                        SoniTr.tts_info.tts_list(),
                        value="en-US-AndrewMultilingualNeural-Male",
                        label=lg_conf["sk10"],
                        visible=False,
                        interactive=True,
                    )
                    tts_voice10 = gr.Dropdown(
                        SoniTr.tts_info.tts_list(),
                        value="en-US-EmmaMultilingualNeural-Female",
                        label=lg_conf["sk11"],
                        visible=False,
                        interactive=True,
                    )
                    tts_voice11 = gr.Dropdown(
                        SoniTr.tts_info.tts_list(),
                        value="en-US-AndrewMultilingualNeural-Male",
                        label=lg_conf["sk12"],
                        visible=False,
                        interactive=True,
                    )
                    max_speakers.change(
                        submit,
                        max_speakers,
                        [
                            tts_voice00,
                            tts_voice01,
                            tts_voice02,
                            tts_voice03,
                            tts_voice04,
                            tts_voice05,
                            tts_voice06,
                            tts_voice07,
                            tts_voice08,
                            tts_voice09,
                            tts_voice10,
                            tts_voice11,
                        ],
                    )

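                # Voice imitation settings: clone the original speakers'
                # voices for the dub with OpenVoice, or FreeVC when XTTS is
                # enabled.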
                with gr.Column():
                    with gr.Accordion(
                        lg_conf["vc_title"],
                        open=False,
                    ):
                        gr.Markdown(lg_conf["vc_subtitle"])
                        voice_imitation_gui = gr.Checkbox(
                            False,
                            label=lg_conf["vc_active_label"],
                            info=lg_conf["vc_active_info"],
                        )
                        openvoice_models = ["openvoice", "openvoice_v2"]
                        voice_imitation_method_options = (
                            ["freevc"] + openvoice_models
                            if SoniTr.tts_info.xtts_enabled
                            else openvoice_models
                        )
                        voice_imitation_method_gui = gr.Dropdown(
                            voice_imitation_method_options,
                            value=voice_imitation_method_options[-1],
                            label=lg_conf["vc_method_label"],
                            info=lg_conf["vc_method_info"],
                        )
                        voice_imitation_max_segments_gui = gr.Slider(
                            label=lg_conf["vc_segments_label"],
                            info=lg_conf["vc_segments_info"],
                            value=3,
                            step=1,
                            minimum=1,
                            maximum=10,
                            visible=True,
                            interactive=True,
                        )
                        voice_imitation_vocals_dereverb_gui = gr.Checkbox(
                            False,
                            label=lg_conf["vc_dereverb_label"],
                            info=lg_conf["vc_dereverb_info"],
                        )
                        voice_imitation_remove_previous_gui = gr.Checkbox(
                            True,
                            label=lg_conf["vc_remove_label"],
                            info=lg_conf["vc_remove_info"],
                        )

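                # The XTTS reference-voice creator is only shown when XTTS is
                # available; otherwise a hidden checkbox stands in, presumably
                # so later event wiring can reference it either way.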
                if SoniTr.tts_info.xtts_enabled:
                    with gr.Column():
                        with gr.Accordion(
                            lg_conf["xtts_title"],
                            open=False,
                        ):
                            gr.Markdown(lg_conf["xtts_subtitle"])
                            wav_speaker_file = gr.File(
                                label=lg_conf["xtts_file_label"]
                            )
                            wav_speaker_name = gr.Textbox(
                                label=lg_conf["xtts_name_label"],
                                value="",
                                info=lg_conf["xtts_name_info"],
                                placeholder="default_name",
                                lines=1,
                            )
                            wav_speaker_start = gr.Number(
                                label="Time audio start",
                                value=0,
                                visible=False,
                            )
                            wav_speaker_end = gr.Number(
                                label="Time audio end",
                                value=0,
                                visible=False,
                            )
                            wav_speaker_dir = gr.Textbox(
                                label="Directory save",
                                value="_XTTS_",
                                visible=False,
                            )
                            wav_speaker_dereverb = gr.Checkbox(
                                True,
                                label=lg_conf["xtts_dereverb_label"],
                                info=lg_conf["xtts_dereverb_info"]
                            )
                            wav_speaker_output = gr.HTML()
                            create_xtts_wav = gr.Button(
                                lg_conf["xtts_button"]
                            )
                            gr.Markdown(lg_conf["xtts_footer"])
                else:
                    wav_speaker_dereverb = gr.Checkbox(
                        False,
                        label=lg_conf["xtts_dereverb_label"],
                        info=lg_conf["xtts_dereverb_info"],
                        visible=False
                    )

                with gr.Column():
                    with gr.Accordion(
                        lg_conf["extra_setting"], open=False
                    ):
                        audio_accelerate = gr.Slider(
                            label=lg_conf["acc_max_label"],
                            value=1.9,
                            step=0.1,
                            minimum=1.0,
                            maximum=2.5,
                            visible=True,
                            interactive=True,
                            info=lg_conf["acc_max_info"],
                        )
                        acceleration_rate_regulation_gui = gr.Checkbox(
                            False,
                            label=lg_conf["acc_rate_label"],
                            info=lg_conf["acc_rate_info"],
                        )
                        avoid_overlap_gui = gr.Checkbox(
                            False,
                            label=lg_conf["or_label"],
                            info=lg_conf["or_info"],
                        )

                        gr.HTML("<hr>")

                        audio_mix_options = [
                            "Mixing audio with sidechain compression",
                            "Adjusting volumes and mixing audio",
                        ]
                        AUDIO_MIX = gr.Dropdown(
                            audio_mix_options,
                            value=audio_mix_options[1],
                            label=lg_conf["aud_mix_label"],
                            info=lg_conf["aud_mix_info"],
                        )
                        volume_original_mix = gr.Slider(
                            label=lg_conf["vol_ori"],
                            info="for Adjusting volumes and mixing audio",
                            value=0.25,
                            step=0.05,
                            minimum=0.0,
                            maximum=2.50,
                            visible=True,
                            interactive=True,
                        )
                        volume_translated_mix = gr.Slider(
                            label=lg_conf["vol_tra"],
                            info="for Adjusting volumes and mixing audio",
                            value=1.80,
                            step=0.05,
                            minimum=0.0,
                            maximum=2.50,
                            visible=True,
                            interactive=True,
                        )
                        main_voiceless_track = gr.Checkbox(
                            label=lg_conf["voiceless_tk_label"],
                            info=lg_conf["voiceless_tk_info"],
                        )

                        gr.HTML("<hr>")
                        sub_type_options = [
                            "disable",
                            "srt",
                            "vtt",
                            "ass",
                            "txt",
                            "tsv",
                            "json",
                            "aud",
                        ]

                        sub_type_output = gr.Dropdown(
                            sub_type_options,
                            value=sub_type_options[1],
                            label=lg_conf["sub_type"],
                        )
                        soft_subtitles_to_video_gui = gr.Checkbox(
                            label=lg_conf["soft_subs_label"],
                            info=lg_conf["soft_subs_info"],
                        )
                        burn_subtitles_to_video_gui = gr.Checkbox(
                            label=lg_conf["burn_subs_label"],
                            info=lg_conf["burn_subs_info"],
                        )

                        gr.HTML("<hr>")
                        gr.Markdown(lg_conf["whisper_title"])
                        literalize_numbers_gui = gr.Checkbox(
                            True,
                            label=lg_conf["lnum_label"],
                            info=lg_conf["lnum_info"],
                        )
                        vocal_refinement_gui = gr.Checkbox(
                            False,
                            label=lg_conf["scle_label"],
                            info=lg_conf["scle_info"],
                        )
                        segment_duration_limit_gui = gr.Slider(
                            label=lg_conf["sd_limit_label"],
                            info=lg_conf["sd_limit_info"],
                            value=15,
                            step=1,
                            minimum=1,
                            maximum=30,
                        )
                        whisper_model_default = (
                            "large-v3"
                            if SoniTr.device == "cuda"
                            else "medium"
                        )

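                        # ASR settings: the Whisper model and compute type
                        # default to heavier options on CUDA ("large-v3",
                        # float16) and lighter ones on CPU ("medium",
                        # float32).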
                        WHISPER_MODEL_SIZE = gr.Dropdown(
                            ASR_MODEL_OPTIONS + find_whisper_models(),
                            value=whisper_model_default,
                            label="Whisper ASR model",
                            info=lg_conf["asr_model_info"],
                            allow_custom_value=True,
                        )
                        com_t_opt, com_t_default = (
                            [COMPUTE_TYPE_GPU, "float16"]
                            if SoniTr.device == "cuda"
                            else [COMPUTE_TYPE_CPU, "float32"]
                        )
                        compute_type = gr.Dropdown(
                            com_t_opt,
                            value=com_t_default,
                            label=lg_conf["ctype_label"],
                            info=lg_conf["ctype_info"],
                        )
                        batch_size_value = (
                            8 if os.environ.get("ZERO_GPU") != "TRUE" else 32
                        )
                        batch_size = gr.Slider(
                            minimum=1,
                            maximum=32,
                            value=batch_size_value,
                            label=lg_conf["batchz_label"],
                            info=lg_conf["batchz_info"],
                            step=1,
                        )
                        input_srt = gr.File(
                            label=lg_conf["srt_file_label"],
                            file_types=[".srt", ".ass", ".vtt"],
                            height=130,
                        )

                        gr.HTML("<hr>")
                        text_segmentation_options = [
                            "sentence",
                            "word",
                            "character"
                        ]
                        text_segmentation_scale_gui = gr.Dropdown(
                            text_segmentation_options,
                            value=text_segmentation_options[0],
                            label=lg_conf["tsscale_label"],
                            info=lg_conf["tsscale_info"],
                        )
                        divide_text_segments_by_gui = gr.Textbox(
                            label=lg_conf["divide_text_label"],
                            value="",
                            info=lg_conf["divide_text_info"],
                        )

                        gr.HTML("<hr>")
                        pyannote_models_list = list(
                            diarization_models.keys()
                        )
                        diarization_process_dropdown = gr.Dropdown(
                            pyannote_models_list,
                            value=pyannote_models_list[1],
                            label=lg_conf["diarization_label"],
                        )
                        translate_process_dropdown = gr.Dropdown(
                            TRANSLATION_PROCESS_OPTIONS,
                            value=TRANSLATION_PROCESS_OPTIONS[0],
                            label=lg_conf["tr_process_label"],
                        )

                        gr.HTML("<hr>")
                        main_output_type = gr.Dropdown(
                            OUTPUT_TYPE_OPTIONS,
                            value=OUTPUT_TYPE_OPTIONS[0],
                            label=lg_conf["out_type_label"],
                        )
                        VIDEO_OUTPUT_NAME = gr.Textbox(
                            label=lg_conf["out_name_label"],
                            value="",
                            info=lg_conf["out_name_info"],
                        )
                        play_sound_gui = gr.Checkbox(
                            True,
                            label=lg_conf["task_sound_label"],
                            info=lg_conf["task_sound_info"],
                        )
                        enable_cache_gui = gr.Checkbox(
                            True,
                            label=lg_conf["cache_label"],
                            info=lg_conf["cache_info"],
                        )
                        PREVIEW = gr.Checkbox(
                            label="Preview", info=lg_conf["preview_info"]
                        )
                        is_gui_dummy_check = gr.Checkbox(
                            True, visible=False
                        )

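            # Subtitle editing: when the checkbox is active, a text editor
            # and a secondary button become visible, presumably letting the
            # user correct the transcription before the dubbing continues.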
            with gr.Column(variant="compact"):
                edit_sub_check = gr.Checkbox(
                    label=lg_conf["edit_sub_label"],
                    info=lg_conf["edit_sub_info"],
                    interactive=os.environ.get("IS_DEMO") != "TRUE",
                )
                dummy_false_check = gr.Checkbox(
                    False,
                    visible=False,
                )

                def visible_component_subs(input_bool):
                    return (
                        gr.update(visible=input_bool),
                        gr.update(visible=input_bool),
                    )

                subs_button = gr.Button(
                    lg_conf["button_subs"],
                    variant="primary",
                    visible=False,
                )
                subs_edit_space = gr.Textbox(
                    visible=False,
                    lines=10,
                    label=lg_conf["editor_sub_label"],
                    info=lg_conf["editor_sub_info"],
                    placeholder=lg_conf["editor_sub_ph"],
                )
                edit_sub_check.change(
                    visible_component_subs,
                    [edit_sub_check],
                    [subs_button, subs_edit_space],
                )

            with gr.Row():
                video_button = gr.Button(
                    lg_conf["button_translate"],
                    variant="primary",
                )
            with gr.Row():
                video_output = gr.File(
                    label=lg_conf["output_result_label"],
                    file_count="multiple",
                    interactive=False,
                )

            gr.HTML("<hr>")

            HFKEY = gr.Textbox(
                visible=not os.getenv("YOUR_HF_TOKEN"),
                label="HF Token",
                info=lg_conf["ht_token_info"],
                placeholder=lg_conf["ht_token_ph"],
            )

            gr.Examples(
                examples=[
                    [
                        ["./assets/Video_main.mp4"],
                        "",
                        "",
                        "",
                        False,
                        whisper_model_default,
                        batch_size_value,
                        com_t_default,
                        "Spanish (es)",
                        "English (en)",
                        1,
                        2,
                        "en-US-EmmaMultilingualNeural-Female",
                        "en-US-AndrewMultilingualNeural-Male",
                    ],
                ],
                fn=SoniTr.batch_multilingual_media_conversion,
                inputs=[
                    video_input,
                    blink_input,
                    directory_input,
                    HFKEY,
                    PREVIEW,
                    WHISPER_MODEL_SIZE,
                    batch_size,
                    compute_type,
                    SOURCE_LANGUAGE,
                    TRANSLATE_AUDIO_TO,
                    min_speakers,
                    max_speakers,
                    tts_voice00,
                    tts_voice01,
                ],
                outputs=[video_output],
                cache_examples=False,
            )

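        # Document translation tab: narrate written text, uploaded documents,
        # or a local file path, with optional translation and videobook
        # rendering for PDFs.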
						|  | with gr.Tab(lg_conf["tab_docs"]): | 
					
						
						|  | with gr.Column(): | 
					
						
						|  | with gr.Accordion("Docs", open=True): | 
					
						
						|  | with gr.Column(variant="compact"): | 
					
						
						|  | with gr.Column(): | 
					
						
						|  | input_doc_type = gr.Dropdown( | 
					
						
						|  | [ | 
					
						
						|  | "WRITE TEXT", | 
					
						
						|  | "SUBMIT DOCUMENT", | 
					
						
						|  | "Find Document Path", | 
					
						
						|  | ], | 
					
						
						|  | value="SUBMIT DOCUMENT", | 
					
						
						|  | label=lg_conf["docs_input_label"], | 
					
						
						|  | info=lg_conf["docs_input_info"], | 
					
						
						|  | ) | 
					
						
						|  |  | 
					
						
						|  | def swap_visibility(data_type): | 
					
						
						|  | if data_type == "WRITE TEXT": | 
					
						
						|  | return ( | 
					
						
						|  | gr.update(visible=True, value=""), | 
					
						
						|  | gr.update(visible=False, value=None), | 
					
						
						|  | gr.update(visible=False, value=""), | 
					
						
						|  | ) | 
					
						
						|  | elif data_type == "SUBMIT DOCUMENT": | 
					
						
						|  | return ( | 
					
						
						|  | gr.update(visible=False, value=""), | 
					
						
						|  | gr.update(visible=True, value=None), | 
					
						
						|  | gr.update(visible=False, value=""), | 
					
						
						|  | ) | 
					
						
						|  | elif data_type == "Find Document Path": | 
					
						
						|  | return ( | 
					
						
						|  | gr.update(visible=False, value=""), | 
					
						
						|  | gr.update(visible=False, value=None), | 
					
						
						|  | gr.update(visible=True, value=""), | 
					
						
						|  | ) | 
					
						
						|  |  | 
					
						
						|  | text_docs = gr.Textbox( | 
					
						
						|  | label="Text", | 
					
						
						|  | value="This is an example", | 
					
						
						|  | info="Write a text", | 
					
						
						|  | placeholder="...", | 
					
						
						|  | lines=5, | 
					
						
						|  | visible=False, | 
					
						
						|  | ) | 
					
						
						|  | input_docs = gr.File( | 
					
						
						|  | label="Document", visible=True | 
					
						
						|  | ) | 
					
						
						|  | directory_input_docs = gr.Textbox( | 
					
						
						|  | visible=False, | 
					
						
						|  | label="Document Path", | 
					
						
						|  | info="Example: /home/my_doc.pdf", | 
					
						
						|  | placeholder="Path goes here...", | 
					
						
						|  | ) | 
					
						
						|  | input_doc_type.change( | 
					
						
						|  | fn=swap_visibility, | 
					
						
						|  | inputs=input_doc_type, | 
					
						
						|  | outputs=[ | 
					
						
						|  | text_docs, | 
					
						
						|  | input_docs, | 
					
						
						|  | directory_input_docs, | 
					
						
						|  | ], | 
					
						
						|  | ) | 
					
						
						|  |  | 
					
						
						|  | gr.HTML() | 
					
						
						|  |  | 
					
						
						|  | tts_documents = gr.Dropdown( | 
					
						
						|  | list( | 
					
						
						|  | filter( | 
					
						
						|  | lambda x: x != "_XTTS_/AUTOMATIC.wav", | 
					
						
						|  | SoniTr.tts_info.tts_list(), | 
					
						
						|  | ) | 
					
						
						|  | ), | 
					
						
						|  | value="en-US-EmmaMultilingualNeural-Female", | 
					
						
						|  | label="TTS", | 
					
						
						|  | visible=True, | 
					
						
						|  | interactive=True, | 
					
						
						|  | ) | 
					
						
						|  |  | 
					
						
						|  | gr.HTML() | 
					
						
						|  |  | 
					
						
						|  | docs_SOURCE_LANGUAGE = gr.Dropdown( | 
					
						
						|  | LANGUAGES_LIST[1:], | 
					
						
						|  | value="English (en)", | 
					
						
						|  | label=lg_conf["sl_label"], | 
					
						
						|  | info=lg_conf["docs_source_info"], | 
					
						
						|  | ) | 
					
						
						|  | docs_TRANSLATE_TO = gr.Dropdown( | 
					
						
						|  | LANGUAGES_LIST[1:], | 
					
						
						|  | value="English (en)", | 
					
						
						|  | label=lg_conf["tat_label"], | 
					
						
						|  | info=lg_conf["tat_info"], | 
					
						
						|  | ) | 
					
						
						|  |  | 
					
						
						|  | with gr.Column(): | 
					
						
						|  | with gr.Accordion( | 
					
						
						|  | lg_conf["extra_setting"], open=False | 
					
						
						|  | ): | 
					
						
						|  | docs_translate_process_dropdown = gr.Dropdown( | 
					
						
						|  | DOCS_TRANSLATION_PROCESS_OPTIONS, | 
					
						
						|  | value=DOCS_TRANSLATION_PROCESS_OPTIONS[ | 
					
						
						|  | 0 | 
					
						
						|  | ], | 
					
						
						|  | label="Translation process", | 
					
						
						|  | ) | 
					
						
						|  |  | 
					
						
						|  | gr.HTML("<hr></h2>") | 
					
						
						|  |  | 
					
						
						|  | docs_output_type = gr.Dropdown( | 
					
						
						|  | DOCS_OUTPUT_TYPE_OPTIONS, | 
					
						
						|  | value=DOCS_OUTPUT_TYPE_OPTIONS[2], | 
					
						
						|  | label="Output type", | 
					
						
						|  | ) | 
					
						
						|  | docs_OUTPUT_NAME = gr.Textbox( | 
					
						
						|  | label="Final file name", | 
					
						
						|  | value="", | 
					
						
						|  | info=lg_conf["out_name_info"], | 
					
						
						|  | ) | 
					
						
						|  | docs_chunk_size = gr.Number( | 
					
						
						|  | label=lg_conf["chunk_size_label"], | 
					
						
						|  | value=0, | 
					
						
						|  | visible=True, | 
					
						
						|  | interactive=True, | 
					
						
						|  | info=lg_conf["chunk_size_info"], | 
					
						
						|  | ) | 
					
						
						|  | gr.HTML("<hr></h2>") | 
					
						
						|  | start_page_gui = gr.Number( | 
					
						
						|  | step=1, | 
					
						
						|  | value=1, | 
					
						
						|  | minimum=1, | 
					
						
						|  | maximum=99999, | 
					
						
						|  | label="Start page", | 
					
						
						|  | ) | 
					
						
						|  | end_page_gui = gr.Number( | 
					
						
						|  | step=1, | 
					
						
						|  | value=99999, | 
					
						
						|  | minimum=1, | 
					
						
						|  | maximum=99999, | 
					
						
						|  | label="End page", | 
					
						
						|  | ) | 
					
						
						|  | gr.HTML("<hr>Videobook config</h2>") | 
					
						
						|  | videobook_width_gui = gr.Number( | 
					
						
						|  | step=1, | 
					
						
						|  | value=1280, | 
					
						
						|  | minimum=100, | 
					
						
						|  | maximum=4096, | 
					
						
						|  | label="Width", | 
					
						
						|  | ) | 
					
						
						|  | videobook_height_gui = gr.Number( | 
					
						
						|  | step=1, | 
					
						
						|  | value=720, | 
					
						
						|  | minimum=100, | 
					
						
						|  | maximum=4096, | 
					
						
						|  | label="Height", | 
					
						
						|  | ) | 
					
						
						|  | videobook_bcolor_gui = gr.Dropdown( | 
					
						
						|  | BORDER_COLORS, | 
					
						
						|  | value=BORDER_COLORS[0], | 
					
						
						|  | label="Border color", | 
					
						
						|  | ) | 
					
						
						|  | docs_dummy_check = gr.Checkbox( | 
					
						
						|  | True, visible=False | 
					
						
						|  | ) | 
					
						
						|  |  | 
					
						
						|  | with gr.Row(): | 
					
						
						|  | docs_button = gr.Button( | 
					
						
						|  | lg_conf["docs_button"], | 
					
						
						|  | variant="primary", | 
					
						
						|  | ) | 
					
						
						|  | with gr.Row(): | 
					
						
						|  | docs_output = gr.File( | 
					
						
						|  | label="Result", | 
					
						
						|  | interactive=False, | 
					
						
						|  | ) | 
					
						
						|  |  | 
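        # R.V.C. tab: download voice-conversion models from URLs and assign a
        # model, index file, and pitch settings to each TTS speaker.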
						|  | with gr.Tab("Custom voice R.V.C. (Optional)"): | 
					
						
						|  |  | 
					
						
						|  | with gr.Column(): | 
					
						
						|  | with gr.Accordion("Get the R.V.C. Models", open=True): | 
					
						
						|  | url_links = gr.Textbox( | 
					
						
						|  | label="URLs", | 
					
						
						|  | value="", | 
					
						
						|  | info=lg_conf["cv_url_info"], | 
					
						
						|  | placeholder="urls here...", | 
					
						
						|  | lines=1, | 
					
						
						|  | ) | 
					
						
						|  | download_finish = gr.HTML() | 
					
						
						|  | download_button = gr.Button("DOWNLOAD MODELS") | 
					
						
						|  |  | 
					
						
						|  | def update_models(): | 
					
						
						|  | models_path, index_path = upload_model_list() | 
					
						
						|  |  | 
					
						
						|  | dict_models = { | 
					
						
						|  | f"fmodel{i:02d}": gr.update( | 
					
						
						|  | choices=models_path | 
					
						
						|  | ) | 
					
						
						|  | for i in range(MAX_TTS+1) | 
					
						
						|  | } | 
					
						
						|  | dict_index = { | 
					
						
						|  | f"findex{i:02d}": gr.update( | 
					
						
						|  | choices=index_path, value=None | 
					
						
						|  | ) | 
					
						
						|  | for i in range(MAX_TTS+1) | 
					
						
						|  | } | 
					
						
						|  | dict_changes = {**dict_models, **dict_index} | 
					
						
						|  | return [value for value in dict_changes.values()] | 
					
						
						|  |  | 
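                    # update_models() returns the gr.update objects as one flat
                    # list: all model dropdowns first, then all index dropdowns.
                    # The outputs list wired to download_button at the bottom of
                    # this tab must keep exactly this order.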

            with gr.Column():
                with gr.Accordion(lg_conf["replace_title"], open=False):
                    with gr.Column(variant="compact"):
                        with gr.Column():
                            gr.Markdown(lg_conf["sec1_title"])
                            enable_custom_voice = gr.Checkbox(
                                False,
                                label="ENABLE",
                                info=lg_conf["enable_replace"],
                            )
                            workers_custom_voice = gr.Number(
                                step=1,
                                value=1,
                                minimum=1,
                                maximum=50,
                                label="workers",
                                visible=False,
                            )

                            gr.Markdown(lg_conf["sec2_title"])
                            gr.Markdown(lg_conf["sec2_subtitle"])

                            PITCH_ALGO_OPT = [
                                "pm",
                                "harvest",
                                "crepe",
                                "rmvpe",
                                "rmvpe+",
                            ]
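                            # f0 (pitch) extraction methods supported by
                            # R.V.C.; PITCH_ALGO_OPT[3] ("rmvpe") is used as
                            # the default in pitch_algo_conf() below.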

                            def model_conf():
                                return gr.Dropdown(
                                    models_path,
                                    label="Model",
                                    visible=True,
                                    interactive=True,
                                )

                            def pitch_algo_conf():
                                return gr.Dropdown(
                                    PITCH_ALGO_OPT,
                                    value=PITCH_ALGO_OPT[3],
                                    label="Pitch algorithm",
                                    visible=True,
                                    interactive=True,
                                )

                            def pitch_lvl_conf():
                                return gr.Slider(
                                    label="Pitch level",
                                    minimum=-24,
                                    maximum=24,
                                    step=1,
                                    value=0,
                                    visible=True,
                                    interactive=True,
                                )

                            def index_conf():
                                return gr.Dropdown(
                                    index_path,
                                    value=None,
                                    label="Index",
                                    visible=True,
                                    interactive=True,
                                )

                            def index_inf_conf():
                                return gr.Slider(
                                    minimum=0,
                                    maximum=1,
                                    label="Index influence",
                                    value=0.75,
                                )

                            def respiration_filter_conf():
                                return gr.Slider(
                                    minimum=0,
                                    maximum=7,
                                    label="Respiration median filtering",
                                    value=3,
                                    step=1,
                                    interactive=True,
                                )

                            def envelope_ratio_conf():
                                return gr.Slider(
                                    minimum=0,
                                    maximum=1,
                                    label="Envelope ratio",
                                    value=0.25,
                                    interactive=True,
                                )

                            def consonant_protec_conf():
                                return gr.Slider(
                                    minimum=0,
                                    maximum=0.5,
                                    label="Consonant breath protection",
                                    value=0.5,
                                    interactive=True,
                                )

                            def button_conf(tts_name):
                                return gr.Button(
                                    lg_conf["cv_button_apply"] + " " + tts_name,
                                    variant="primary",
                                )
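                            # Factory helpers rather than shared components: a
                            # Gradio component instance can only occupy one
                            # place in the layout, so each speaker accordion
                            # (and the test section) builds its own copies via
                            # these functions.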

                            TTS_TABS = [
                                "TTS Speaker {:02d}".format(i)
                                for i in range(1, MAX_TTS + 1)
                            ]

                            CV_SUBTITLES = [
                                lg_conf["cv_tts1"],
                                lg_conf["cv_tts2"],
                                lg_conf["cv_tts3"],
                                lg_conf["cv_tts4"],
                                lg_conf["cv_tts5"],
                                lg_conf["cv_tts6"],
                                lg_conf["cv_tts7"],
                                lg_conf["cv_tts8"],
                                lg_conf["cv_tts9"],
                                lg_conf["cv_tts10"],
                                lg_conf["cv_tts11"],
                                lg_conf["cv_tts12"],
                            ]

                            configs_storage = []
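                            # configs_storage keeps references to the
                            # per-speaker model/index dropdowns so the download
                            # handler at the bottom of this tab can refresh all
                            # of them at once.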

                            for i in range(MAX_TTS):
                                with gr.Accordion(CV_SUBTITLES[i], open=False):
                                    gr.Markdown(TTS_TABS[i])
                                    with gr.Column():
                                        tag_gui = gr.Textbox(
                                            value=TTS_TABS[i], visible=False
                                        )
                                        model_gui = model_conf()
                                        pitch_algo_gui = pitch_algo_conf()
                                        pitch_lvl_gui = pitch_lvl_conf()
                                        index_gui = index_conf()
                                        index_inf_gui = index_inf_conf()
                                        rmf_gui = respiration_filter_conf()
                                        er_gui = envelope_ratio_conf()
                                        cbp_gui = consonant_protec_conf()

                                        with gr.Row(variant="compact"):
                                            button_config = button_conf(
                                                TTS_TABS[i]
                                            )

                                            confirm_conf = gr.HTML()

                                        button_config.click(
                                            SoniTr.vci.apply_conf,
                                            inputs=[
                                                tag_gui,
                                                model_gui,
                                                pitch_algo_gui,
                                                pitch_lvl_gui,
                                                index_gui,
                                                index_inf_gui,
                                                rmf_gui,
                                                er_gui,
                                                cbp_gui,
                                            ],
                                            outputs=[confirm_conf],
                                        )

                                        configs_storage.append({
                                            "tag": tag_gui,
                                            "model": model_gui,
                                            "index": index_gui,
                                        })

            with gr.Column():
                with gr.Accordion("Test R.V.C.", open=False):
                    with gr.Row(variant="compact"):
                        text_test = gr.Textbox(
                            label="Text",
                            value="This is an example",
                            info="write a text",
                            placeholder="...",
                            lines=5,
                        )
                        with gr.Column():
                            tts_test = gr.Dropdown(
                                sorted(SoniTr.tts_info.list_edge),
                                value="en-GB-ThomasNeural-Male",
                                label="TTS",
                                visible=True,
                                interactive=True,
                            )
                            model_test = model_conf()
                            index_test = index_conf()
                            pitch_test = pitch_lvl_conf()
                            pitch_alg_test = pitch_algo_conf()
                    with gr.Row(variant="compact"):
                        button_test = gr.Button("Test audio")

                    with gr.Column():
                        with gr.Row():
                            original_ttsvoice = gr.Audio()
                            ttsvoice = gr.Audio()

                    button_test.click(
                        SoniTr.vci.make_test,
                        inputs=[
                            text_test,
                            tts_test,
                            model_test,
                            index_test,
                            pitch_test,
                            pitch_alg_test,
                        ],
                        outputs=[ttsvoice, original_ttsvoice],
                    )
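                    # make_test presumably synthesizes the text with the
                    # selected edge-tts voice and converts it with the chosen
                    # R.V.C. model; per outputs=[ttsvoice, original_ttsvoice],
                    # the two Audio components receive the converted take and
                    # the unconverted original respectively.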

            download_button.click(
                download_list,
                [url_links],
                [download_finish],
                queue=False,
            ).then(
                update_models,
                [],
                [elem["model"] for elem in configs_storage]
                + [model_test]
                + [elem["index"] for elem in configs_storage]
                + [index_test],
            )
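            # Chain: first download the requested models, then rebuild the
            # choices of every dropdown. The outputs list mirrors the order of
            # update_models(): all model dropdowns plus model_test, then all
            # index dropdowns plus index_test.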

        with gr.Tab(lg_conf["tab_help"]):
            gr.Markdown(lg_conf["tutorial"])
            gr.Markdown(news)

        def play_sound_alert(play_sound):
            if not play_sound:
                return None

            sound_alert = "assets/sound_alert.mp3"
            time.sleep(0.25)
            yield None
            time.sleep(0.25)
            yield sound_alert
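        # play_sound_alert is a generator: it first yields None to clear the
        # hidden Audio component and then yields the alert file, apparently so
        # that autoplay re-triggers on every completed run.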

        sound_alert_notification = gr.Audio(
            value=None,
            type="filepath",
            format="mp3",
            autoplay=True,
            visible=False,
        )

        if logs_in_gui:
            logger.info("Displaying logs in the GUI requires a public URL")
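            # Tee-style redirection: everything written to stdout is mirrored
            # into output.log so the Logs accordion below can display it.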
            class Logger:
                def __init__(self, filename):
                    self.terminal = sys.stdout
                    self.log = open(filename, "w")

                def write(self, message):
                    self.terminal.write(message)
                    self.log.write(message)

                def flush(self):
                    self.terminal.flush()
                    self.log.flush()

                def isatty(self):
                    return False

            sys.stdout = Logger("output.log")

            def read_logs():
                sys.stdout.flush()
                with open("output.log", "r") as f:
                    return f.read()

            with gr.Accordion("Logs", open=False):
                logs = gr.Textbox(label=">>>")
                app.load(read_logs, None, logs, every=1)
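            # every=1 makes Gradio re-run read_logs once per second while the
            # app is open, effectively tailing output.log into the textbox.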

        if SoniTr.tts_info.xtts_enabled:

            def update_tts_list():
                update_dict = {
                    f"tts_voice{i:02d}": gr.update(
                        choices=SoniTr.tts_info.tts_list()
                    )
                    for i in range(MAX_TTS)
                }
                update_dict["tts_documents"] = gr.update(
                    choices=list(
                        filter(
                            lambda x: x != "_XTTS_/AUTOMATIC.wav",
                            SoniTr.tts_info.tts_list(),
                        )
                    )
                )
                return list(update_dict.values())
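            # After a new XTTS reference wav is created, every TTS voice
            # dropdown is refreshed so the new speaker shows up; the
            # "_XTTS_/AUTOMATIC.wav" entry is filtered out of the document-TTS
            # list only.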
            create_xtts_wav.click(
                create_wav_file_vc,
                inputs=[
                    wav_speaker_name,
                    wav_speaker_file,
                    wav_speaker_start,
                    wav_speaker_end,
                    wav_speaker_dir,
                    wav_speaker_dereverb,
                ],
                outputs=[wav_speaker_output],
            ).then(
                update_tts_list,
                None,
                [
                    tts_voice00,
                    tts_voice01,
                    tts_voice02,
                    tts_voice03,
                    tts_voice04,
                    tts_voice05,
                    tts_voice06,
                    tts_voice07,
                    tts_voice08,
                    tts_voice09,
                    tts_voice10,
                    tts_voice11,
                    tts_documents,
                ],
            )

        subs_button.click(
            SoniTr.batch_multilingual_media_conversion,
            inputs=[
                video_input,
                blink_input,
                directory_input,
                HFKEY,
                PREVIEW,
                WHISPER_MODEL_SIZE,
                batch_size,
                compute_type,
                SOURCE_LANGUAGE,
                TRANSLATE_AUDIO_TO,
                min_speakers,
                max_speakers,
                tts_voice00,
                tts_voice01,
                tts_voice02,
                tts_voice03,
                tts_voice04,
                tts_voice05,
                tts_voice06,
                tts_voice07,
                tts_voice08,
                tts_voice09,
                tts_voice10,
                tts_voice11,
                VIDEO_OUTPUT_NAME,
                AUDIO_MIX,
                audio_accelerate,
                acceleration_rate_regulation_gui,
                volume_original_mix,
                volume_translated_mix,
                sub_type_output,
                edit_sub_check,
                dummy_false_check,
                subs_edit_space,
                avoid_overlap_gui,
                vocal_refinement_gui,
                literalize_numbers_gui,
                segment_duration_limit_gui,
                diarization_process_dropdown,
                translate_process_dropdown,
                input_srt,
                main_output_type,
                main_voiceless_track,
                voice_imitation_gui,
                voice_imitation_max_segments_gui,
                voice_imitation_vocals_dereverb_gui,
                voice_imitation_remove_previous_gui,
                voice_imitation_method_gui,
                wav_speaker_dereverb,
                text_segmentation_scale_gui,
                divide_text_segments_by_gui,
                soft_subtitles_to_video_gui,
                burn_subtitles_to_video_gui,
                enable_cache_gui,
                enable_custom_voice,
                workers_custom_voice,
                is_gui_dummy_check,
            ],
            outputs=subs_edit_space,
        ).then(
            play_sound_alert, [play_sound_gui], [sound_alert_notification]
        )
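        # video_button below wires the same handler with an almost identical
        # input list; note that the positions of edit_sub_check and
        # dummy_false_check are swapped, and the result goes to video_output
        # instead of the subtitle editor.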
        video_button.click(
            SoniTr.batch_multilingual_media_conversion,
            inputs=[
                video_input,
                blink_input,
                directory_input,
                HFKEY,
                PREVIEW,
                WHISPER_MODEL_SIZE,
                batch_size,
                compute_type,
                SOURCE_LANGUAGE,
                TRANSLATE_AUDIO_TO,
                min_speakers,
                max_speakers,
                tts_voice00,
                tts_voice01,
                tts_voice02,
                tts_voice03,
                tts_voice04,
                tts_voice05,
                tts_voice06,
                tts_voice07,
                tts_voice08,
                tts_voice09,
                tts_voice10,
                tts_voice11,
                VIDEO_OUTPUT_NAME,
                AUDIO_MIX,
                audio_accelerate,
                acceleration_rate_regulation_gui,
                volume_original_mix,
                volume_translated_mix,
                sub_type_output,
                dummy_false_check,
                edit_sub_check,
                subs_edit_space,
                avoid_overlap_gui,
                vocal_refinement_gui,
                literalize_numbers_gui,
                segment_duration_limit_gui,
                diarization_process_dropdown,
                translate_process_dropdown,
                input_srt,
                main_output_type,
                main_voiceless_track,
                voice_imitation_gui,
                voice_imitation_max_segments_gui,
                voice_imitation_vocals_dereverb_gui,
                voice_imitation_remove_previous_gui,
                voice_imitation_method_gui,
                wav_speaker_dereverb,
                text_segmentation_scale_gui,
                divide_text_segments_by_gui,
                soft_subtitles_to_video_gui,
                burn_subtitles_to_video_gui,
                enable_cache_gui,
                enable_custom_voice,
                workers_custom_voice,
                is_gui_dummy_check,
            ],
            outputs=video_output,
            trigger_mode="multiple",
        ).then(
            play_sound_alert, [play_sound_gui], [sound_alert_notification]
        )

        docs_button.click(
            SoniTr.multilingual_docs_conversion,
            inputs=[
                text_docs,
                input_docs,
                directory_input_docs,
                docs_SOURCE_LANGUAGE,
                docs_TRANSLATE_TO,
                tts_documents,
                docs_OUTPUT_NAME,
                docs_translate_process_dropdown,
                docs_output_type,
                docs_chunk_size,
                enable_custom_voice,
                workers_custom_voice,
                start_page_gui,
                end_page_gui,
                videobook_width_gui,
                videobook_height_gui,
                videobook_bcolor_gui,
                docs_dummy_check,
            ],
            outputs=docs_output,
            trigger_mode="multiple",
        ).then(
            play_sound_alert, [play_sound_gui], [sound_alert_notification]
        )

    return app


def get_language_config(language_data, language=None, base_key="english"):
    base_lang = language_data.get(base_key)

    if language not in language_data:
        logger.error(
            f"Language {language} not found, defaulting to {base_key}"
        )
        return base_lang

    lg_conf = language_data.get(language, {})
    lg_conf.update((k, v) for k, v in base_lang.items() if k not in lg_conf)

    return lg_conf
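# Merge semantics of get_language_config, traced with illustrative values:
#   language_data = {"english": {"a": 1, "b": 2}, "spanish": {"a": 10}}
#   get_language_config(language_data, "spanish") -> {"a": 10, "b": 2}
#   get_language_config(language_data, "french")  -> {"a": 1, "b": 2}  (fallback)
# i.e. the selected language wins, and any key it lacks is filled in from the
# base language.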


def create_parser():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        "--theme",
        type=str,
        default="Taithrah/Minimal",
        help=(
            "Specify the theme; find themes at "
            "https://huggingface.co/spaces/gradio/theme-gallery; "
            "Example: --theme aliabid94/new-theme"
        ),
    )
    parser.add_argument(
        "--public_url",
        action="store_true",
        default=False,
        help="Enable public link",
    )
    parser.add_argument(
        "--logs_in_gui",
        action="store_true",
        default=False,
        help="Display the operations performed in Logs",
    )
    parser.add_argument(
        "--verbosity_level",
        type=str,
        default="info",
        help=(
            "Set logger verbosity level: "
            "debug, info, warning, error, or critical"
        ),
    )
    parser.add_argument(
        "--language",
        type=str,
        default="english",
        help="Select the language of the interface: english, spanish",
    )
    parser.add_argument(
        "--cpu_mode",
        action="store_true",
        default=False,
        help="Enable CPU mode to run the program without GPU acceleration.",
    )
    return parser
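# Typical invocations, using the flags defined above (assuming the script is
# saved as app.py):
#   python app.py
#   python app.py --theme aliabid94/new-theme --public_url
#   python app.py --verbosity_level debug --logs_in_gui --cpu_mode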


if __name__ == "__main__":
    parser = create_parser()
    args = parser.parse_args()

    set_logging_level(args.verbosity_level)

    for id_model in UVR_MODELS:
        download_manager(
            os.path.join(MDX_DOWNLOAD_LINK, id_model), mdxnet_models_dir
        )
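    # The loop above prefetches the MDX-Net (UVR) weights used for
    # vocal/instrumental separation, so the first conversion does not stall
    # on a download.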

    models_path, index_path = upload_model_list()

    SoniTr = SoniTranslate(
        cpu_mode=args.cpu_mode if os.environ.get("ZERO_GPU") != "TRUE" else "cpu"
    )

    lg_conf = get_language_config(language_data, language=args.language)

    app = create_gui(args.theme, logs_in_gui=args.logs_in_gui)

    app.queue()
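    # queue() is required here: play_sound_alert is a generator, and Gradio
    # only supports streaming generator outputs with queuing enabled.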
    app.launch(
        max_threads=1,
        share=args.public_url,
        show_error=True,
        quiet=False,
        debug=logger.isEnabledFor(logging.DEBUG),
    )