Fixed the issue where Silero-VAD version 5 could not recognize speech.
Recently, the developer snakers4 released version 5 of Silero-VAD, which caused speech recognition to fail on some audio files. Until a proper fix is available, we have reverted to version 4 to avoid the issue.
https://github.com/snakers4/silero-vad/issues/515
Also upgrade the project's Gradio version to 5.6.0.
- README.md +1 -1
- app.py +11 -11
- requirements-fasterWhisper.txt +1 -1
- requirements-whisper.txt +1 -1
- requirements.txt +1 -1
- src/vad.py +67 -3
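
For context on how the version pin works: `torch.hub.load` accepts an optional `:ref` suffix on the `owner/repo` string, which selects a branch or tag of the hub repo. A minimal sketch of the workaround applied in src/vad.py below (`trust_repo=True` just suppresses the trust prompt added in newer PyTorch releases):

```python
import torch

# Pin the silero-vad hub repo to the v4.0 tag instead of master,
# per https://github.com/snakers4/silero-vad/issues/515.
model, utils = torch.hub.load(
    repo_or_dir="snakers4/silero-vad:v4.0",  # "owner/repo:ref" selects a tag
    model="silero_vad",
    trust_repo=True,  # skip the interactive trust prompt (PyTorch >= 1.12)
)
(get_speech_timestamps, save_audio, read_audio,
 VADIterator, collect_chunks) = utils
```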
README.md CHANGED

@@ -4,7 +4,7 @@ emoji: ✨
 colorFrom: blue
 colorTo: purple
 sdk: gradio
-sdk_version: 5.
+sdk_version: 5.6.0
 app_file: app.py
 pinned: false
 license: apache-2.0
app.py CHANGED

@@ -1365,8 +1365,8 @@ def create_ui(app_config: ApplicationConfig):
     else:
         print("Queue mode disabled - progress bars will not be shown.")
 
-    demo.launch(inbrowser=app_config.autolaunch, share=app_config.share, server_name=app_config.server_name, server_port=find_free_port()
-                ssr_mode=False) # [Gradio 5.x] ERROR: Exception in ASGI application
+    demo.launch(inbrowser=app_config.autolaunch, share=app_config.share, server_name=app_config.server_name, server_port=find_free_port())
+                # ,ssr_mode=False) # [Gradio 5.x] ERROR: Exception in ASGI application
 
     # Clean up
     ui.close()
@@ -1438,15 +1438,15 @@ if __name__ == '__main__':
 
     updated_config = default_app_config.update(**args)
 
-
-
-
-
-
-
-
-
-
+    updated_config.whisper_implementation = "faster-whisper"
+    updated_config.input_audio_max_duration = -1
+    updated_config.default_model_name = "large-v2"
+    updated_config.output_dir = "output"
+    updated_config.vad_max_merge_size = 90
+    updated_config.merge_subtitle_with_sources = False
+    updated_config.autolaunch = True
+    updated_config.auto_parallel = False
+    updated_config.save_downloaded_files = True
 
     try:
         if torch.cuda.is_available():
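
The launch call above depends on a `find_free_port()` helper whose definition is not part of this diff; a plausible sketch, assuming it simply asks the OS for an unused TCP port:

```python
import socket

def find_free_port() -> int:
    # Bind to port 0 so the OS assigns any free port, then report it.
    # Sketch only: the project's actual helper is not shown in this diff.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.bind(("", 0))
        return s.getsockname()[1]
```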
requirements-fasterWhisper.txt CHANGED

@@ -2,7 +2,7 @@
 ctranslate2>=4.4.0
 faster-whisper>=1.0.3
 ffmpeg-python==0.2.0
-gradio==5.
+gradio==5.6.0
 yt-dlp
 json5
 torch
requirements-whisper.txt CHANGED

@@ -2,7 +2,7 @@
 ctranslate2>=4.4.0
 git+https://github.com/openai/whisper.git
 ffmpeg-python==0.2.0
-gradio==5.
+gradio==5.6.0
 yt-dlp
 json5
 torch
requirements.txt CHANGED

@@ -2,7 +2,7 @@
 ctranslate2>=4.4.0
 faster-whisper>=1.0.3
 ffmpeg-python==0.2.0
-gradio==5.
+gradio==5.6.0
 yt-dlp
 json5
 torch
src/vad.py CHANGED

@@ -508,13 +508,77 @@ class VadSileroTranscription(AbstractTranscription):
         """
         (get_speech_timestamps, save_audio, read_audio, VADIterator, collect_chunks) = utils
         https://github.com/snakers4/silero-vad
+        def get_speech_timestamps(audio: torch.Tensor,
+                                  model,
+                                  threshold: float = 0.5,
+                                  sampling_rate: int = 16000,
+                                  min_speech_duration_ms: int = 250,
+                                  max_speech_duration_s: float = float('inf'),
+                                  min_silence_duration_ms: int = 100,
+                                  speech_pad_ms: int = 30,
+                                  return_seconds: bool = False,
+                                  visualize_probs: bool = False,
+                                  progress_tracking_callback: Callable[[float], None] = None,
+                                  neg_threshold: float = None,
+                                  window_size_samples: int = 512,):
+
+        This method is used for splitting long audios into speech chunks using silero VAD
+
+        Parameters
+        ----------
+        audio: torch.Tensor, one dimensional
+            One dimensional float torch.Tensor, other types are casted to torch if possible
+
+        model: preloaded .jit/.onnx silero VAD model
+
+        threshold: float (default - 0.5)
+            Speech threshold. Silero VAD outputs speech probabilities for each audio chunk, probabilities ABOVE this value are considered as SPEECH.
+            It is better to tune this parameter for each dataset separately, but "lazy" 0.5 is pretty good for most datasets.
+
+        sampling_rate: int (default - 16000)
+            Currently silero VAD models support 8000 and 16000 (or multiply of 16000) sample rates
+
+        min_speech_duration_ms: int (default - 250 milliseconds)
+            Final speech chunks shorter min_speech_duration_ms are thrown out
+
+        max_speech_duration_s: int (default - inf)
+            Maximum duration of speech chunks in seconds
+            Chunks longer than max_speech_duration_s will be split at the timestamp of the last silence that lasts more than 100ms (if any), to prevent agressive cutting.
+            Otherwise, they will be split aggressively just before max_speech_duration_s.
+
+        min_silence_duration_ms: int (default - 100 milliseconds)
+            In the end of each speech chunk wait for min_silence_duration_ms before separating it
+
+        speech_pad_ms: int (default - 30 milliseconds)
+            Final speech chunks are padded by speech_pad_ms each side
+
+        return_seconds: bool (default - False)
+            whether return timestamps in seconds (default - samples)
+
+        visualize_probs: bool (default - False)
+            whether draw prob hist or not
+
+        progress_tracking_callback: Callable[[float], None] (default - None)
+            callback function taking progress in percents as an argument
+
+        neg_threshold: float (default = threshold - 0.15)
+            Negative threshold (noise or exit threshold). If model's current state is SPEECH, values BELOW this value are considered as NON-SPEECH.
+
+        window_size_samples: int (default - 512 samples)
+            !!! DEPRECATED, DOES NOTHING !!!
+
+        Returns
+        ----------
+        speeches: list of dicts
+            list containing ends and beginnings of speech chunks (samples or seconds based on return_seconds)
+        https://github.com/snakers4/silero-vad/blob/master/src/silero_vad/utils_vad.py
         """
         repo_owner = "snakers4"
-        repo_name = "silero-vad"
+        repo_name = "silero-vad:v4.0" # https://github.com/snakers4/silero-vad/issues/515
         ref = "master"
 
         try:
-            model, utils = torch.hub.load(repo_or_dir=f'{repo_owner}/{repo_name}', model='silero_vad')
+            model, utils = torch.hub.load(repo_or_dir=f'{repo_owner}/{repo_name}', model='silero_vad', trust_repo=True)
         except Exception as e:
             hub_dir = torch.hub.get_dir()
             owner_name_branch = '_'.join([repo_owner, repo_name, ref])
@@ -547,7 +611,7 @@ class VadSileroTranscription(AbstractTranscription):
             print("Processing VAD in chunk from {} to {}".format(format_timestamp(chunk_start), format_timestamp(chunk_start + chunk_duration)))
             wav = self.get_audio_segment(audio, str(chunk_start), str(chunk_duration))
 
-            sample_timestamps = self.get_speech_timestamps(wav, self.model, sampling_rate=self.sampling_rate, threshold=SPEECH_TRESHOLD)
+            sample_timestamps = self.get_speech_timestamps(wav, self.model, sampling_rate=self.sampling_rate, threshold=SPEECH_TRESHOLD) #, neg_threshold=0.15, return_seconds=True
             seconds_timestamps = self.multiply_timestamps(sample_timestamps, factor=1 / self.sampling_rate)
             adjusted = self.adjust_timestamp(seconds_timestamps, adjust_seconds=chunk_start, max_source_time=chunk_start + chunk_duration)
 
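
Taken together with the call site above, the flow is: request timestamps in samples, then scale them to seconds. A hedged usage sketch (the `SPEECH_TRESHOLD` constant is defined elsewhere in the project; 0.5 is assumed here to match the documented default):

```python
import torch

SPEECH_TRESHOLD = 0.5  # assumed; the project defines this constant elsewhere
SAMPLING_RATE = 16000

model, utils = torch.hub.load("snakers4/silero-vad:v4.0",
                              model="silero_vad", trust_repo=True)
get_speech_timestamps = utils[0]

wav = torch.zeros(SAMPLING_RATE * 5)  # placeholder: 5 seconds of silence

# v4 returns {"start": ..., "end": ...} offsets in samples by default
sample_timestamps = get_speech_timestamps(
    wav, model, sampling_rate=SAMPLING_RATE, threshold=SPEECH_TRESHOLD)

# Equivalent of multiply_timestamps(..., factor=1 / sampling_rate)
seconds_timestamps = [{"start": ts["start"] / SAMPLING_RATE,
                       "end": ts["end"] / SAMPLING_RATE}
                      for ts in sample_timestamps]
```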