enhance chunking to handle audio buffer time limits
Browse files
whisperlivekit/whisper_streaming_custom/online_asr.py
CHANGED
|
@@ -216,31 +216,54 @@ class OnlineASRProcessor:
|
|
| 216 |
"""
|
| 217 |
If the committed tokens form at least two sentences, chunk the audio
|
| 218 |
buffer at the end time of the penultimate sentence.
|
|
|
|
| 219 |
"""
|
|
|
|
| 220 |
if not self.committed:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 221 |
return
|
|
|
|
| 222 |
logger.debug("COMPLETED SENTENCE: " + " ".join(token.text for token in self.committed))
|
| 223 |
sentences = self.words_to_sentences(self.committed)
|
| 224 |
for sentence in sentences:
|
| 225 |
logger.debug(f"\tSentence: {sentence.text}")
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 234 |
|
| 235 |
def chunk_completed_segment(self, res):
    """
    Chunk the audio buffer based on segment-end timestamps reported by the ASR.

    The segment end times come from ``self.asr.segments_end_ts(res)``; each
    candidate is shifted by ``self.buffer_time_offset`` before it is compared
    with the end time of the last committed token.

    The preferred cut point is the end of the penultimate segment. If that
    point lies beyond the committed region, earlier segment ends are tried in
    turn until one falls inside it (or only two segments remain).

    Args:
        res: ASR result, passed unchanged to ``self.asr.segments_end_ts``.

    Returns:
        None. When a suitable cut point is found, ``self.chunk_at`` is called
        with it.
    """
    if not self.committed:
        return

    ends = self.asr.segments_end_ts(res)
    last_committed_time = self.committed[-1].end

    if len(ends) <= 1:
        logger.debug("--- Not enough segments to chunk")
        return

    e = ends[-2] + self.buffer_time_offset
    while len(ends) > 2 and e > last_committed_time:
        ends.pop(-1)
        # Re-evaluate the candidate after dropping the last segment; without
        # this the loop would keep comparing the same stale timestamp.
        e = ends[-2] + self.buffer_time_offset

    if e <= last_committed_time:
        logger.debug(f"--- Segment chunked at {e:.2f}")
        self.chunk_at(e)
    else:
        logger.debug("--- Last segment not within committed area")
|
| 255 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 256 |
def chunk_at(self, time: float):
|
| 257 |
"""
|
| 258 |
Trim both the hypothesis and audio buffer at the given time.
|
|
|
|
| 216 |
"""
|
| 217 |
If the committed tokens form at least two sentences, chunk the audio
|
| 218 |
buffer at the end time of the penultimate sentence.
|
| 219 |
+
Also ensures chunking happens if audio buffer exceeds a time limit.
|
| 220 |
"""
|
| 221 |
+
buffer_duration = len(self.audio_buffer) / self.SAMPLING_RATE
|
| 222 |
if not self.committed:
|
| 223 |
+
if buffer_duration > self.buffer_trimming_sec:
|
| 224 |
+
chunk_time = self.buffer_time_offset + (buffer_duration / 2)
|
| 225 |
+
logger.debug(f"--- No speech detected, forced chunking at {chunk_time:.2f}")
|
| 226 |
+
self.chunk_at(chunk_time)
|
| 227 |
return
|
| 228 |
+
|
| 229 |
logger.debug("COMPLETED SENTENCE: " + " ".join(token.text for token in self.committed))
|
| 230 |
sentences = self.words_to_sentences(self.committed)
|
| 231 |
for sentence in sentences:
|
| 232 |
logger.debug(f"\tSentence: {sentence.text}")
|
| 233 |
+
|
| 234 |
+
chunk_done = False
|
| 235 |
+
if len(sentences) >= 2:
|
| 236 |
+
while len(sentences) > 2:
|
| 237 |
+
sentences.pop(0)
|
| 238 |
+
chunk_time = sentences[-2].end
|
| 239 |
+
logger.debug(f"--- Sentence chunked at {chunk_time:.2f}")
|
| 240 |
+
self.chunk_at(chunk_time)
|
| 241 |
+
chunk_done = True
|
| 242 |
+
|
| 243 |
+
if not chunk_done and buffer_duration > self.buffer_trimming_sec:
|
| 244 |
+
last_committed_time = self.committed[-1].end
|
| 245 |
+
logger.debug(f"--- Not enough sentences, chunking at last committed time {last_committed_time:.2f}")
|
| 246 |
+
self.chunk_at(last_committed_time)
|
| 247 |
|
| 248 |
def chunk_completed_segment(self, res):
    """
    Chunk the audio buffer based on segment-end timestamps reported by the ASR.

    Also ensures chunking happens if the audio buffer exceeds a time limit
    (``self.buffer_trimming_sec``, compared against the buffer length in
    seconds, i.e. ``len(self.audio_buffer) / self.SAMPLING_RATE``).

    Behaviour:
        * Nothing committed: if the buffer is over the limit, cut it at its
          midpoint (``self.buffer_time_offset + duration / 2``); otherwise
          do nothing.
        * Something committed: try to cut at a segment end reported by
          ``self.asr.segments_end_ts(res)``. Each candidate is shifted by
          ``self.buffer_time_offset`` and must not lie after the end of the
          last committed token. The penultimate segment end is tried first,
          then earlier ones.
        * If no segment-based cut was made and the buffer is over the limit,
          cut at the end of the last committed token.

    Args:
        res: ASR result, passed unchanged to ``self.asr.segments_end_ts``.

    Returns:
        None. Cuts are performed through ``self.chunk_at``.
    """
    buffer_duration = len(self.audio_buffer) / self.SAMPLING_RATE

    if not self.committed:
        if buffer_duration > self.buffer_trimming_sec:
            chunk_time = self.buffer_time_offset + (buffer_duration / 2)
            logger.debug(f"--- No speech detected, forced chunking at {chunk_time:.2f}")
            self.chunk_at(chunk_time)
        return

    logger.debug("Processing committed tokens for segmenting")
    ends = self.asr.segments_end_ts(res)
    last_committed_time = self.committed[-1].end
    chunk_done = False

    if len(ends) > 1:
        logger.debug("Multiple segments available for chunking")
        e = ends[-2] + self.buffer_time_offset
        while len(ends) > 2 and e > last_committed_time:
            ends.pop(-1)
            # Re-evaluate the candidate after dropping the last segment;
            # without this the loop would keep comparing a stale timestamp.
            e = ends[-2] + self.buffer_time_offset
        if e <= last_committed_time:
            logger.debug(f"--- Segment chunked at {e:.2f}")
            self.chunk_at(e)
            chunk_done = True
        else:
            logger.debug("--- Last segment not within committed area")
    else:
        logger.debug("--- Not enough segments to chunk")

    # Fallback: keep the buffer bounded even when no segment boundary was usable.
    if not chunk_done and buffer_duration > self.buffer_trimming_sec:
        logger.debug(f"--- Buffer too large, chunking at last committed time {last_committed_time:.2f}")
        self.chunk_at(last_committed_time)

    logger.debug("Segment chunking complete")
|
| 285 |
+
|
| 286 |
def chunk_at(self, time: float):
|
| 287 |
"""
|
| 288 |
Trim both the hypothesis and audio buffer at the given time.
|