Update the dependency package faster-whisper to version 0.10.0
faster-whisper now officially supports the large-v3 model, so update the large-v3 model URL in the config to the official model identifier.
- app.py +9 -3
- config.json5 +1 -2
- requirements-fasterWhisper.txt +1 -1
- requirements.txt +1 -1
- src/whisper/fasterWhisperContainer.py +0 -4
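What 0.10.0 buys us, as a minimal sketch (not part of this commit): `WhisperModel` accepts the short name "large-v3" directly, with no manual feature-extractor patching. The audio path below is a placeholder.

```python
# Minimal sketch, assuming faster-whisper >= 0.10.0 is installed.
# "audio.wav" is a placeholder path, not a file from this repo.
from faster_whisper import WhisperModel

model = WhisperModel("large-v3", device="auto", compute_type="default")
segments, info = model.transcribe("audio.wav")
print("Detected language:", info.language)
for segment in segments:
    print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
```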
app.py
CHANGED
@@ -137,7 +137,7 @@ class WhisperTranscriber:
         vadOptions = VadOptions(vad, vadMergeWindow, vadMaxMergeSize, self.app_config.vad_padding, self.app_config.vad_prompt_window, self.app_config.vad_initial_prompt_mode)
 
         if diarization:
-            if diarization_speakers < 1:
+            if diarization_speakers is not None and diarization_speakers < 1:
                 self.set_diarization(auth_token=self.app_config.auth_token, min_speakers=diarization_min_speakers, max_speakers=diarization_max_speakers)
             else:
                 self.set_diarization(auth_token=self.app_config.auth_token, num_speakers=diarization_speakers, min_speakers=diarization_min_speakers, max_speakers=diarization_max_speakers)
@@ -189,7 +189,7 @@ class WhisperTranscriber:
 
         # Set diarization
         if diarization:
-            if diarization_speakers < 1:
+            if diarization_speakers is not None and diarization_speakers < 1:
                 self.set_diarization(auth_token=self.app_config.auth_token, min_speakers=diarization_min_speakers, max_speakers=diarization_max_speakers)
             else:
                 self.set_diarization(auth_token=self.app_config.auth_token, num_speakers=diarization_speakers, min_speakers=diarization_min_speakers, max_speakers=diarization_max_speakers)
@@ -209,7 +209,8 @@ class WhisperTranscriber:
         try:
             progress(0, desc="init audio sources")
             sources = self.__get_source(urlData, multipleFiles, microphoneData)
-
+            if (len(sources) == 0):
+                raise Exception("init audio sources failed...")
             try:
                 progress(0, desc="init whisper model")
                 whisper_lang = get_language_from_name(languageName)
@@ -361,6 +362,11 @@ class WhisperTranscriber:
 
         except ExceededMaximumDuration as e:
             return [], ("[ERROR]: Maximum remote video length is " + str(e.maxDuration) + "s, file was " + str(e.videoDuration) + "s"), "[ERROR]"
+        except Exception as e:
+            import traceback
+            print(traceback.format_exc())
+            return [], ("Error occurred during transcribe: " + str(e)), ""
+
 
     def transcribe_file(self, model: AbstractWhisperContainer, audio_path: str, language: str, task: str = None,
                         vadOptions: VadOptions = VadOptions(),
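Two of the app.py changes guard against bad inputs: `diarization_speakers` may arrive as `None` from the UI, and Python 3 refuses to order `None` against an `int`, so the old `< 1` check could crash. A standalone illustration, not from the commit:

```python
# In Python 3, ordering None against an int raises TypeError, which is what
# the old `diarization_speakers < 1` check hit whenever no value was supplied.
diarization_speakers = None
try:
    if diarization_speakers < 1:
        print("auto speaker count")
except TypeError as e:
    print(e)  # '<' not supported between instances of 'NoneType' and 'int'
```

The new empty-sources check and the catch-all `except Exception` serve the same goal: fail fast with a readable message in the UI instead of an unhandled traceback.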
config.json5
CHANGED
@@ -28,8 +28,7 @@
     },
     {
         "name": "large-v3",
-        "url": "
-        "type": "huggingface"
+        "url": "large-v3"
     },
     // Uncomment to add custom Japanese models
     //{
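With the pinned release, the short name is all the config needs: faster-whisper resolves "large-v3" to its officially converted checkpoint on the Hugging Face Hub, so the explicit repo URL and the "type": "huggingface" field can go. A hedged sketch of how the entry reaches the loader (reading the config with the `json5` package is an assumption for illustration):

```python
# Hedged sketch: the entry's "url" value is what ends up being passed to
# WhisperModel, which accepts a short model name such as "large-v3" as well
# as a local path or a Hub repo id.
import json5  # assumption: stand-in for however the app parses config.json5
from faster_whisper import WhisperModel

entry = json5.loads('{ "name": "large-v3", "url": "large-v3" }')
model = WhisperModel(entry["url"], device="auto")
```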
requirements-fasterWhisper.txt
CHANGED
@@ -1,6 +1,6 @@
 git+https://github.com/huggingface/transformers
 ctranslate2>=3.21.0
-faster-whisper
+faster-whisper>=0.10.0
 ffmpeg-python==0.2.0
 gradio==3.50.2
 yt-dlp
requirements.txt
CHANGED
@@ -1,6 +1,6 @@
 git+https://github.com/huggingface/transformers
 ctranslate2>=3.21.0
-faster-whisper
+faster-whisper>=0.10.0
 ffmpeg-python==0.2.0
 gradio==3.50.2
 yt-dlp
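Both requirements files get the same version floor, since either may be used to build the environment. An illustrative runtime guard, not part of this commit:

```python
# Fail early if the installed faster-whisper predates native large-v3 support.
# `packaging` is assumed to be available; it is only used for the comparison.
from importlib.metadata import version
from packaging.version import Version

if Version(version("faster-whisper")) < Version("0.10.0"):
    raise RuntimeError("faster-whisper >= 0.10.0 is required for native large-v3 support")
```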
src/whisper/fasterWhisperContainer.py
CHANGED
@@ -55,10 +55,6 @@ class FasterWhisperContainer(AbstractWhisperContainer):
         device = "auto"
 
         model = WhisperModel(model_url, device=device, compute_type=self.compute_type)
-        if "large-v3" in model_url:
-            # Working with Whisper-large-v3
-            # https://github.com/guillaumekln/faster-whisper/issues/547#issuecomment-1797962599
-            model.feature_extractor.mel_filters = model.feature_extractor.get_mel_filters(model.feature_extractor.sampling_rate, model.feature_extractor.n_fft, n_mels=128)
         return model
 
     def create_callback(self, language: str = None, task: str = None,
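The removed branch rebuilt the mel filter bank with 128 bins by hand, the pre-0.10.0 workaround from the linked issue; 0.10.0 configures the large-v3 feature extractor itself. A hedged sanity check under that assumption:

```python
# Assuming faster-whisper >= 0.10.0: large-v3 should come with a 128-bin mel
# feature extractor out of the box, with no manual get_mel_filters patch.
from faster_whisper import WhisperModel

model = WhisperModel("large-v3", device="auto")
print(model.feature_extractor.mel_filters.shape[0])  # expected: 128
```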