Spaces:
Running
Running
| { | |
| "models": { | |
| "whisper": [ | |
| // Configuration for the built-in models. You can remove any of these | |
| // if you don't want to use the default models. | |
| { | |
| "name": "tiny", | |
| "url": "tiny" | |
| }, | |
| { | |
| "name": "base", | |
| "url": "base" | |
| }, | |
| { | |
| "name": "small", | |
| "url": "small" | |
| }, | |
| { | |
| "name": "medium", | |
| "url": "medium" | |
| }, | |
| { | |
| "name": "large", | |
| "url": "large" | |
| }, | |
| { | |
| "name": "large-v1", | |
| "url": "large-v1" | |
| }, | |
| { | |
| "name": "large-v2", | |
| "url": "large-v2" | |
| }, | |
| { | |
| "name": "large-v3", | |
| "url": "large-v3" | |
| } | |
| // Uncomment to add custom Japanese models | |
| //{ | |
| // "name": "whisper-large-v2-mix-jp", | |
| // "url": "vumichien/whisper-large-v2-mix-jp", | |
| // // The type of the model. Can be "huggingface" or "whisper" - "whisper" is the default. | |
| // // HuggingFace models are loaded using the HuggingFace transformers library and then converted to Whisper models. | |
| // "type": "huggingface", | |
| //}, | |
| //{ | |
| // "name": "local-model", | |
| // "url": "path/to/local/model", | |
| //}, | |
| //{ | |
| // "name": "remote-model", | |
| // "url": "https://example.com/path/to/model", | |
| //} | |
| ], | |
| "m2m100": [ | |
| { | |
| "name": "m2m100_1.2B-ct2fast/michaelfeil", | |
| "url": "michaelfeil/ct2fast-m2m100_1.2B", | |
| "type": "huggingface", | |
| "tokenizer_url": "facebook/m2m100_1.2B" | |
| }, | |
| { | |
| "name": "m2m100_1.2B/facebook", | |
| "url": "facebook/m2m100_1.2B", | |
| "type": "huggingface" | |
| }, | |
| { | |
| "name": "m2m100_418M-ct2fast/michaelfeil", | |
| "url": "michaelfeil/ct2fast-m2m100_418M", | |
| "type": "huggingface", | |
| "tokenizer_url": "facebook/m2m100_418M" | |
| }, | |
| { | |
| "name": "m2m100_418M/facebook", | |
| "url": "facebook/m2m100_418M", | |
| "type": "huggingface" | |
| }, | |
| { | |
| "name": "m2m100-12B-last-ckpt/facebook", | |
| "url": "facebook/m2m100-12B-last-ckpt", | |
| "type": "huggingface" | |
| }, | |
| { | |
| "name": "m2m100-12B-ct2fast/michaelfeil", | |
| "url": "michaelfeil/ct2fast-m2m100-12B-last-ckpt", | |
| "type": "huggingface", | |
| "tokenizer_url": "facebook/m2m100-12B-last-ckpt" | |
| } | |
| ], | |
| "nllb": [ | |
| { | |
| "name": "nllb-200-distilled-1.3B-ct2:int8/JustFrederik", | |
| "url": "JustFrederik/nllb-200-distilled-1.3B-ct2-int8", | |
| "type": "huggingface", | |
| "tokenizer_url": "facebook/nllb-200-distilled-1.3B" | |
| }, | |
| { | |
| "name": "nllb-200-distilled-1.3B-ct2fast:int8_float16/michaelfeil", | |
| "url": "michaelfeil/ct2fast-nllb-200-distilled-1.3B", | |
| "type": "huggingface", | |
| "tokenizer_url": "facebook/nllb-200-distilled-1.3B" | |
| }, | |
| { | |
| "name": "nllb-200-distilled-1.3B-ct2:float16/JustFrederik", | |
| "url": "JustFrederik/nllb-200-distilled-1.3B-ct2-float16", | |
| "type": "huggingface", | |
| "tokenizer_url": "facebook/nllb-200-distilled-1.3B" | |
| }, | |
| { | |
| "name": "nllb-200-distilled-1.3B-ct2/JustFrederik", | |
| "url": "JustFrederik/nllb-200-distilled-1.3B-ct2", | |
| "type": "huggingface", | |
| "tokenizer_url": "facebook/nllb-200-distilled-1.3B" | |
| }, | |
| { | |
| "name": "nllb-200-1.3B-ct2:int8/JustFrederik", | |
| "url": "JustFrederik/nllb-200-1.3B-ct2-int8", | |
| "type": "huggingface", | |
| "tokenizer_url": "facebook/nllb-200-1.3B" | |
| }, | |
| { | |
| "name": "nllb-200-1.3B-ct2:float16/JustFrederik", | |
| "url": "JustFrederik/nllb-200-1.3B-ct2-float16", | |
| "type": "huggingface", | |
| "tokenizer_url": "facebook/nllb-200-1.3B" | |
| }, | |
| { | |
| "name": "nllb-200-1.3B-ct2/JustFrederik", | |
| "url": "JustFrederik/nllb-200-1.3B-ct2", | |
| "type": "huggingface", | |
| "tokenizer_url": "facebook/nllb-200-1.3B" | |
| }, | |
| { | |
| "name": "nllb-200-distilled-1.3B/facebook", | |
| "url": "facebook/nllb-200-distilled-1.3B", | |
| "type": "huggingface" | |
| }, | |
| { | |
| "name": "nllb-200-1.3B/facebook", | |
| "url": "facebook/nllb-200-1.3B", | |
| "type": "huggingface" | |
| }, | |
| { | |
| "name": "nllb-200-3.3B-ct2fast:int8_float16/michaelfeil", | |
| "url": "michaelfeil/ct2fast-nllb-200-3.3B", | |
| "type": "huggingface", | |
| "tokenizer_url": "facebook/nllb-200-3.3B" | |
| }, | |
| { | |
| "name": "nllb-200-3.3B-ct2:float16/JustFrederik", | |
| "url": "JustFrederik/nllb-200-3.3B-ct2-float16", | |
| "type": "huggingface", | |
| "tokenizer_url": "facebook/nllb-200-3.3B" | |
| }, | |
| { | |
| "name": "nllb-200-3.3B/facebook", | |
| "url": "facebook/nllb-200-3.3B", | |
| "type": "huggingface" | |
| }, | |
| { | |
| "name": "nllb-200-distilled-600M/facebook", | |
| "url": "facebook/nllb-200-distilled-600M", | |
| "type": "huggingface" | |
| }, | |
| { | |
| "name": "nllb-200-distilled-600M-ct2:int8/JustFrederik", | |
| "url": "JustFrederik/nllb-200-distilled-600M-ct2-int8", | |
| "type": "huggingface", | |
| "tokenizer_url": "facebook/nllb-200-distilled-600M" | |
| }, | |
| { | |
| "name": "nllb-200-distilled-600M-ct2:float16/JustFrederik", | |
| "url": "JustFrederik/nllb-200-distilled-600M-ct2-float16", | |
| "type": "huggingface", | |
| "tokenizer_url": "facebook/nllb-200-distilled-600M" | |
| }, | |
| { | |
| "name": "nllb-200-distilled-600M-ct2/JustFrederik", | |
| "url": "JustFrederik/nllb-200-distilled-600M-ct2", | |
| "type": "huggingface", | |
| "tokenizer_url": "facebook/nllb-200-distilled-600M" | |
| } | |
| ], | |
| "mt5": [ | |
| { | |
| "name": "mt5-zh-ja-en-trimmed/K024", | |
| "url": "K024/mt5-zh-ja-en-trimmed", | |
| "type": "huggingface" | |
| }, | |
| { | |
| "name": "mt5-zh-ja-en-trimmed-fine-tuned-v1/engmatic-earth", | |
| "url": "engmatic-earth/mt5-zh-ja-en-trimmed-fine-tuned-v1", | |
| "type": "huggingface" | |
| } | |
| ], | |
| "ALMA": [ | |
| { | |
| "name": "ALMA-7B-GPTQ/TheBloke", | |
| "url": "TheBloke/ALMA-7B-GPTQ", | |
| "type": "huggingface" | |
| }, | |
| { | |
| "name": "ALMA-13B-GPTQ/TheBloke", | |
| "url": "TheBloke/ALMA-13B-GPTQ", | |
| "type": "huggingface" | |
| }, | |
| { | |
| "name": "ALMA-7B-GGUF-Q4_K_M/TheBloke", | |
| "url": "TheBloke/ALMA-7B-GGUF", | |
| "type": "huggingface", | |
| "model_file": "alma-7b.Q4_K_M.gguf", | |
| "tokenizer_url": "haoranxu/ALMA-7B" | |
| }, | |
| { | |
| "name": "ALMA-13B-GGUF-Q4_K_M/TheBloke", | |
| "url": "TheBloke/ALMA-13B-GGUF", | |
| "type": "huggingface", | |
| "model_file": "alma-13b.Q4_K_M.gguf", | |
| "tokenizer_url": "haoranxu/ALMA-13B" | |
| }, | |
| { | |
| "name": "ALMA-7B-ct2:int8_float16/avan", | |
| "url": "avans06/ALMA-7B-ct2-int8_float16", | |
| "type": "huggingface", | |
| "tokenizer_url": "haoranxu/ALMA-7B" | |
| }, | |
| { | |
| "name": "ALMA-13B-ct2:int8_float16/avan", | |
| "url": "avans06/ALMA-13B-ct2-int8_float16", | |
| "type": "huggingface", | |
| "tokenizer_url": "haoranxu/ALMA-13B" | |
| }, | |
| { | |
| "name": "ALMA-7B/haoranxu", | |
| "url": "haoranxu/ALMA-7B", | |
| "type": "huggingface" | |
| }, | |
| { | |
| "name": "ALMA-13B/haoranxu", | |
| "url": "haoranxu/ALMA-13B", | |
| "type": "huggingface" | |
| } | |
| ], | |
| "madlad400": [ | |
| { | |
| "name": "madlad400-3b-mt-ct2-int8_float16/SoybeanMilk", | |
| "url": "SoybeanMilk/madlad400-3b-mt-ct2-int8_float16", | |
| "type": "huggingface", | |
| "tokenizer_url": "jbochi/madlad400-3b-mt" | |
| }, | |
| { | |
| "name": "madlad400-7b-mt-bt-ct2-int8_float16/avan", | |
| "url": "avans06/madlad400-7b-mt-bt-ct2-int8_float16", | |
| "type": "huggingface", | |
| "tokenizer_url": "jbochi/madlad400-7b-mt-bt" | |
| }, | |
| { | |
| "name": "madlad400-10b-mt-ct2-int8_float16/SoybeanMilk", | |
| "url": "SoybeanMilk/madlad400-10b-mt-ct2-int8_float16", | |
| "type": "huggingface", | |
| "tokenizer_url": "jbochi/madlad400-10b-mt" | |
| }, | |
| { | |
| "name": "madlad400-3b-mt/jbochi", | |
| "url": "jbochi/madlad400-3b-mt", | |
| "type": "huggingface" | |
| }, | |
| { | |
| "name": "madlad400-7b-mt-bt/jbochi", | |
| "url": "jbochi/madlad400-7b-mt-bt", | |
| "type": "huggingface" | |
| }, | |
| { | |
| "name": "madlad400-10b-mt/jbochi", | |
| "url": "jbochi/madlad400-10b-mt", | |
| "type": "huggingface" | |
| } | |
| ], | |
| "seamless": [ | |
| //{ | |
| // "name": "hf-seamless-m4t-medium/facebook", | |
| // "url": "facebook/hf-seamless-m4t-medium", | |
| // "type": "huggingface" | |
| //}, | |
| //{ | |
| // "name": "seamless-m4t-large/facebook", | |
| // "url": "facebook/seamless-m4t-large", | |
| // "type": "huggingface" | |
| //}, | |
| { | |
| "name": "seamless-m4t-v2-large/facebook", | |
| "url": "facebook/seamless-m4t-v2-large", | |
| "type": "huggingface" | |
| } | |
| ] | |
| }, | |
| // Configuration options that will be used if they are not specified in the command line arguments. | |
| // * WEBUI options * | |
| // Maximum audio file length in seconds, or -1 for no limit. Ignored by CLI. | |
| "input_audio_max_duration": 1800, | |
| // True to share the app on HuggingFace. | |
| "share": false, | |
| // The host or IP to bind to. If None, bind to localhost. | |
| "server_name": null, | |
| // The port to bind to. | |
| "server_port": 7860, | |
| // The number of workers to use for the web server. Use -1 to disable queueing. | |
| "queue_concurrency_count": 1, | |
| // Whether or not to automatically delete all uploaded files, to save disk space | |
| "delete_uploaded_files": true, | |
| // * General options * | |
| // The default implementation to use for Whisper. Can be "whisper" or "faster-whisper". | |
| // Note that you must either install the requirements for faster-whisper (requirements-fasterWhisper.txt) | |
| // or whisper (requirements.txt) | |
| "whisper_implementation": "faster-whisper", | |
| // The default model name. | |
| "default_model_name": "large-v2", | |
| // The default VAD. | |
| "default_vad": "silero-vad", | |
| // A comma-delimited list of CUDA devices to use for parallel processing. If None, disable parallel processing. | |
| "vad_parallel_devices": "", | |
| // The number of CPU cores to use for VAD pre-processing. | |
| "vad_cpu_cores": 1, | |
| // The number of seconds before inactive processes are terminated. Use 0 to close processes immediately, or None for no timeout. | |
| "vad_process_timeout": 1800, | |
| // True to use all available GPUs and CPU cores for processing. Use vad_cpu_cores/vad_parallel_devices to specify the number of CPU cores/GPUs to use. | |
| "auto_parallel": false, | |
| // Directory to save the outputs (CLI will use the current directory if not specified) | |
| "output_dir": null, | |
| // The path to save model files; uses ~/.cache/whisper by default | |
| "model_dir": null, | |
| // Device to use for PyTorch inference, or Null to use the default device | |
| "device": null, | |
| // Whether to print out the progress and debug messages | |
| "verbose": true, | |
| // Whether to perform X->X speech recognition ('transcribe') or X->English translation ('translate') | |
| "task": "transcribe", | |
| // Language spoken in the audio, specify None to perform language detection | |
| "language": null, | |
| // The window size (in seconds) to merge voice segments | |
| "vad_merge_window": 5, | |
| // The maximum size (in seconds) of a voice segment | |
| "vad_max_merge_size": 90, | |
| // The padding (in seconds) to add to each voice segment | |
| "vad_padding": 1, | |
| // Whether or not to prepend the initial prompt to each VAD segment (prepend_all_segments), or just the first segment (prepend_first_segment) | |
| "vad_initial_prompt_mode": "prepend_first_segment", | |
| // The window size of the prompt to pass to Whisper | |
| "vad_prompt_window": 3, | |
| // Temperature to use for sampling | |
| "temperature": 0, | |
| // Number of candidates when sampling with non-zero temperature | |
| "best_of": 5, | |
| // Number of beams in beam search, only applicable when temperature is zero | |
| "beam_size": 5, | |
| // Optional patience value to use in beam decoding, as in https://arxiv.org/abs/2204.05424, the default (1.0) is equivalent to conventional beam search | |
| "patience": 1, | |
| // Optional token length penalty coefficient (alpha) as in https://arxiv.org/abs/1609.08144, uses simple length normalization by default | |
| "length_penalty": null, | |
| // Comma-separated list of token ids to suppress during sampling; '-1' will suppress most special characters except common punctuations | |
| "suppress_tokens": "-1", | |
| // Optional text to provide as a prompt for the first window | |
| "initial_prompt": null, | |
| // If True, provide the previous output of the model as a prompt for the next window; disabling may make the text inconsistent across windows, but the model becomes less prone to getting stuck in a failure loop | |
| "condition_on_previous_text": true, | |
| // Whether to perform inference in fp16; True by default | |
| "fp16": true, | |
| // The compute type used by faster-whisper. Can be "auto", "int8", "int16" or "float16". | |
| "compute_type": "auto", | |
| // Temperature to increase when falling back when the decoding fails to meet either of the thresholds below | |
| "temperature_increment_on_fallback": 0.2, | |
| // If the gzip compression ratio is higher than this value, treat the decoding as failed | |
| "compression_ratio_threshold": 2.4, | |
| // If the average log probability is lower than this value, treat the decoding as failed | |
| "logprob_threshold": -1.0, | |
| // If the probability of the <no-speech> token is higher than this value AND the decoding has failed due to `logprob_threshold`, consider the segment as silence | |
| "no_speech_threshold": 0.6, | |
| // [faster-whisper] The parameter for repetition penalty. Between 1.0 and infinity. 1.0 means no penalty. Default to 1.0. | |
| "repetition_penalty": 1.0, | |
| // [faster-whisper] The model ensures that a sequence of words of no_repeat_ngram_size isn’t repeated in the output sequence. Set 0 to disable; otherwise it must be a positive integer greater than 1. | |
| "no_repeat_ngram_size": 0, | |
| // (experimental) extract word-level timestamps and refine the results based on them | |
| "word_timestamps": true, | |
| // if word_timestamps is True, merge these punctuation symbols with the next word | |
| "prepend_punctuations": "\"\'“¿([{-", | |
| // if word_timestamps is True, merge these punctuation symbols with the previous word | |
| "append_punctuations": "\"\'.。,,!!??::”)]}、", | |
| // (requires --word_timestamps True) underline each word as it is spoken in srt and vtt | |
| "highlight_words": false, | |
| // Diarization settings | |
| "auth_token": null, | |
| // Whether to perform speaker diarization | |
| "diarization": false, | |
| // The number of speakers to detect | |
| "diarization_speakers": 2, | |
| // The minimum number of speakers to detect | |
| "diarization_min_speakers": 1, | |
| // The maximum number of speakers to detect | |
| "diarization_max_speakers": 8, | |
| // The number of seconds before inactive processes are terminated. Use 0 to close processes immediately, or None for no timeout. | |
| "diarization_process_timeout": 60, | |
| // Whisper Segments Filter | |
| "whisper_segments_filter": false, | |
| "whisper_segments_filters": [ | |
| "avg_logprob < -0.9", | |
| "(durationLen < 1.5 || segment_last), textLen > 5, avg_logprob < -0.4, no_speech_prob > 0.5", | |
| "(durationLen < 1.5 || segment_last), textLen > 5, avg_logprob < -0.4, no_speech_prob > 0.07, compression_ratio < 0.9", | |
| "(durationLen < 1.5 || segment_last), compression_ratio < 0.9, no_speech_prob > 0.1" | |
| ], | |
| // Translation - The maximum batch size. | |
| "translation_batch_size": 2, | |
| // Translation - Prevent repetitions of ngrams with this size (set 0 to disable). | |
| "translation_no_repeat_ngram_size": 4, | |
| // Translation - Beam size (1 for greedy search). | |
| "translation_num_beams": 3, | |
| // Translation - Torch dtype float16: load the float32 translation model as float16 when the system supports GPU (reduces VRAM usage; not applicable to quantized models such as CTranslate2, GPTQ, GGUF). | |
| "translation_torch_dtype_float16": true, | |
| // Translation - Using bitsandbytes: load the float32 translation model as a mixed 8-bit or 4-bit precision quantized model (not applicable to quantized models such as CTranslate2, GPTQ, GGUF). | |
| "translation_using_bitsandbytes": null | |
| } |