Dominik Macháček
committed on
Commit
·
e6648e4
1
Parent(s):
863242f
fixed silero vad chunk size
Browse files
issues #141 #121 #142 #136 etc.
- silero_vad.py → silero_vad_iterator.py +21 -9
- whisper_online.py +15 -8
silero_vad.py → silero_vad_iterator.py
RENAMED
|
@@ -2,6 +2,7 @@ import torch
|
|
| 2 |
|
| 3 |
# This is copied from silero-vad's vad_utils.py:
|
| 4 |
# https://github.com/snakers4/silero-vad/blob/f6b1294cb27590fb2452899df98fb234dfef1134/utils_vad.py#L340
|
|
|
|
| 5 |
|
| 6 |
# Their licence is MIT, same as ours: https://github.com/snakers4/silero-vad/blob/f6b1294cb27590fb2452899df98fb234dfef1134/LICENSE
|
| 7 |
|
|
@@ -10,8 +11,8 @@ class VADIterator:
|
|
| 10 |
model,
|
| 11 |
threshold: float = 0.5,
|
| 12 |
sampling_rate: int = 16000,
|
| 13 |
-
min_silence_duration_ms: int =
|
| 14 |
-
speech_pad_ms: int =
|
| 15 |
):
|
| 16 |
|
| 17 |
"""
|
|
@@ -95,11 +96,14 @@ class VADIterator:
|
|
| 95 |
return None
|
| 96 |
|
| 97 |
#######################
|
| 98 |
-
#
|
| 99 |
-
# (see https://github.com/ufal/whisper_streaming/issues/116 )
|
| 100 |
|
| 101 |
import numpy as np
|
| 102 |
class FixedVADIterator(VADIterator):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
|
| 104 |
def reset_states(self):
|
| 105 |
super().reset_states()
|
|
@@ -107,11 +111,19 @@ class FixedVADIterator(VADIterator):
|
|
| 107 |
|
| 108 |
def __call__(self, x, return_seconds=False):
|
| 109 |
self.buffer = np.append(self.buffer, x)
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 115 |
|
| 116 |
if __name__ == "__main__":
|
| 117 |
# test/demonstrate the need for FixedVADIterator:
|
|
|
|
| 2 |
|
| 3 |
# This is copied from silero-vad's vad_utils.py:
|
| 4 |
# https://github.com/snakers4/silero-vad/blob/f6b1294cb27590fb2452899df98fb234dfef1134/utils_vad.py#L340
|
| 5 |
+
# (except changed defaults)
|
| 6 |
|
| 7 |
# Their licence is MIT, same as ours: https://github.com/snakers4/silero-vad/blob/f6b1294cb27590fb2452899df98fb234dfef1134/LICENSE
|
| 8 |
|
|
|
|
| 11 |
model,
|
| 12 |
threshold: float = 0.5,
|
| 13 |
sampling_rate: int = 16000,
|
| 14 |
+
min_silence_duration_ms: int = 500, # makes sense on one recording that I checked
|
| 15 |
+
speech_pad_ms: int = 100 # same
|
| 16 |
):
|
| 17 |
|
| 18 |
"""
|
|
|
|
| 96 |
return None
|
| 97 |
|
| 98 |
#######################
|
| 99 |
+
# because Silero now requires exactly 512-sized audio chunks
|
|
|
|
| 100 |
|
| 101 |
import numpy as np
|
| 102 |
class FixedVADIterator(VADIterator):
|
| 103 |
+
'''It fixes VADIterator by allowing to process any audio length, not only exactly 512 frames at once.
|
| 104 |
+
If audio to be processed at once is long and multiple voiced segments detected,
|
| 105 |
+
then __call__ returns the start of the first segment, and end (or middle, which means no end) of the last segment.
|
| 106 |
+
'''
|
| 107 |
|
| 108 |
def reset_states(self):
|
| 109 |
super().reset_states()
|
|
|
|
| 111 |
|
| 112 |
def __call__(self, x, return_seconds=False):
|
| 113 |
self.buffer = np.append(self.buffer, x)
|
| 114 |
+
ret = None
|
| 115 |
+
while len(self.buffer) >= 512:
|
| 116 |
+
r = super().__call__(self.buffer[:512], return_seconds=return_seconds)
|
| 117 |
+
self.buffer = self.buffer[512:]
|
| 118 |
+
if ret is None:
|
| 119 |
+
ret = r
|
| 120 |
+
elif r is not None:
|
| 121 |
+
if 'end' in r:
|
| 122 |
+
ret['end'] = r['end'] # the latter end
|
| 123 |
+
if 'start' in r and 'end' in ret: # there is an earlier start.
|
| 124 |
+
# Remove end, merging this segment with the previous one.
|
| 125 |
+
del ret['end']
|
| 126 |
+
return ret if ret != {} else None
|
| 127 |
|
| 128 |
if __name__ == "__main__":
|
| 129 |
# test/demonstrate the need for FixedVADIterator:
|
whisper_online.py
CHANGED
|
@@ -534,8 +534,8 @@ class VACOnlineASRProcessor(OnlineASRProcessor):
|
|
| 534 |
repo_or_dir='snakers4/silero-vad',
|
| 535 |
model='silero_vad'
|
| 536 |
)
|
| 537 |
-
from
|
| 538 |
-
self.vac = FixedVADIterator(model) # we use
|
| 539 |
|
| 540 |
self.logfile = self.online.logfile
|
| 541 |
self.init()
|
|
@@ -561,24 +561,31 @@ class VACOnlineASRProcessor(OnlineASRProcessor):
|
|
| 561 |
self.audio_buffer = np.append(self.audio_buffer, audio)
|
| 562 |
|
| 563 |
if res is not None:
|
| 564 |
-
frame = list(res.values())[0]
|
| 565 |
if 'start' in res and 'end' not in res:
|
| 566 |
self.status = 'voice'
|
| 567 |
-
send_audio = self.audio_buffer[frame
|
| 568 |
-
self.online.init(offset=frame/self.SAMPLING_RATE)
|
| 569 |
self.online.insert_audio_chunk(send_audio)
|
| 570 |
self.current_online_chunk_buffer_size += len(send_audio)
|
| 571 |
self.clear_buffer()
|
| 572 |
elif 'end' in res and 'start' not in res:
|
| 573 |
self.status = 'nonvoice'
|
| 574 |
-
send_audio = self.audio_buffer[:frame
|
| 575 |
self.online.insert_audio_chunk(send_audio)
|
| 576 |
self.current_online_chunk_buffer_size += len(send_audio)
|
| 577 |
self.is_currently_final = True
|
| 578 |
self.clear_buffer()
|
| 579 |
else:
|
| 580 |
-
|
| 581 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 582 |
else:
|
| 583 |
if self.status == 'voice':
|
| 584 |
self.online.insert_audio_chunk(self.audio_buffer)
|
|
|
|
| 534 |
repo_or_dir='snakers4/silero-vad',
|
| 535 |
model='silero_vad'
|
| 536 |
)
|
| 537 |
+
from silero_vad_iterator import FixedVADIterator
|
| 538 |
+
self.vac = FixedVADIterator(model) # we use the default options there: 500ms silence, 100ms padding, etc.
|
| 539 |
|
| 540 |
self.logfile = self.online.logfile
|
| 541 |
self.init()
|
|
|
|
| 561 |
self.audio_buffer = np.append(self.audio_buffer, audio)
|
| 562 |
|
| 563 |
if res is not None:
|
| 564 |
+
frame = list(res.values())[0]-self.buffer_offset
|
| 565 |
if 'start' in res and 'end' not in res:
|
| 566 |
self.status = 'voice'
|
| 567 |
+
send_audio = self.audio_buffer[frame:]
|
| 568 |
+
self.online.init(offset=(frame+self.buffer_offset)/self.SAMPLING_RATE)
|
| 569 |
self.online.insert_audio_chunk(send_audio)
|
| 570 |
self.current_online_chunk_buffer_size += len(send_audio)
|
| 571 |
self.clear_buffer()
|
| 572 |
elif 'end' in res and 'start' not in res:
|
| 573 |
self.status = 'nonvoice'
|
| 574 |
+
send_audio = self.audio_buffer[:frame]
|
| 575 |
self.online.insert_audio_chunk(send_audio)
|
| 576 |
self.current_online_chunk_buffer_size += len(send_audio)
|
| 577 |
self.is_currently_final = True
|
| 578 |
self.clear_buffer()
|
| 579 |
else:
|
| 580 |
+
beg = res["start"]-self.buffer_offset
|
| 581 |
+
end = res["end"]-self.buffer_offset
|
| 582 |
+
self.status = 'nonvoice'
|
| 583 |
+
send_audio = self.audio_buffer[beg:end]
|
| 584 |
+
self.online.init(offset=(beg+self.buffer_offset)/self.SAMPLING_RATE)
|
| 585 |
+
self.online.insert_audio_chunk(send_audio)
|
| 586 |
+
self.current_online_chunk_buffer_size += len(send_audio)
|
| 587 |
+
self.is_currently_final = True
|
| 588 |
+
self.clear_buffer()
|
| 589 |
else:
|
| 590 |
if self.status == 'voice':
|
| 591 |
self.online.insert_audio_chunk(self.audio_buffer)
|