Refactor PCM conversion into a dedicated function; add chunks to the diarization queue immediately
Files changed:
- README.md +2 -2
- whisper_fastapi_online_server.py +16 -7
README.md

````diff
@@ -30,8 +30,8 @@ This project is based on [Whisper Streaming](https://github.com/ufal/whisper_str
 1. **Clone the Repository**:
 
    ```bash
-   git clone https://github.com/QuentinFuxa/
-   cd
+   git clone https://github.com/QuentinFuxa/WhisperLiveKit
+   cd WhisperLiveKit
    ```
 
 
````
whisper_fastapi_online_server.py

````diff
@@ -190,6 +190,16 @@ app.add_middleware(
 with open("web/live_transcription.html", "r", encoding="utf-8") as f:
     html = f.read()
 
+def convert_pcm_to_float(pcm_buffer):
+    """
+    Converts a PCM buffer in s16le format to a normalized NumPy array.
+    Arg: pcm_buffer. PCM buffer containing raw audio data in s16le format
+    Returns: np.ndarray. NumPy array of float32 type normalized between -1.0 and 1.0
+    """
+    pcm_array = (np.frombuffer(pcm_buffer, dtype=np.int16).astype(np.float32)
+                 / 32768.0)
+    return pcm_array
+
 async def start_ffmpeg_decoder():
     """
     Start an FFmpeg process in async streaming mode that reads WebM from stdin
````
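For reference, here is a minimal standalone sketch (not part of the commit) that exercises the same s16le-to-float32 conversion the new `convert_pcm_to_float` helper performs; the 16 kHz sample rate and the synthetic tone are assumptions made purely for illustration:

```python
import numpy as np

# Hypothetical smoke test: build one second of s16le audio and convert it the
# same way convert_pcm_to_float does (int16 -> float32, scaled by 1/32768).
sample_rate = 16000                        # assumed rate; the server's actual rate may differ
t = np.arange(sample_rate) / sample_rate
tone = (np.sin(2 * np.pi * 440 * t) * 32767).astype(np.int16)
pcm_buffer = bytearray(tone.tobytes())     # raw s16le bytes, like the FFmpeg PCM buffer

pcm_array = np.frombuffer(pcm_buffer, dtype=np.int16).astype(np.float32) / 32768.0
assert pcm_array.dtype == np.float32
assert pcm_array.min() >= -1.0 and pcm_array.max() <= 1.0
```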
````diff
@@ -444,24 +454,23 @@ async def websocket_endpoint(websocket: WebSocket):
                     logger.info("FFmpeg stdout closed.")
                     break
                 pcm_buffer.extend(chunk)
+
+                if args.diarization and diarization_queue:
+                    await diarization_queue.put(convert_pcm_to_float(pcm_buffer).copy())
+
                 if len(pcm_buffer) >= BYTES_PER_SEC:
                     if len(pcm_buffer) > MAX_BYTES_PER_SEC:
                         logger.warning(
                             f"""Audio buffer is too large: {len(pcm_buffer) / BYTES_PER_SEC:.2f} seconds.
                             The model probably struggles to keep up. Consider using a smaller model.
                             """)
-
-                    pcm_array = (
-                        np.frombuffer(pcm_buffer[:MAX_BYTES_PER_SEC], dtype=np.int16).astype(np.float32)
-                        / 32768.0
-                    )
+
+                    pcm_array = convert_pcm_to_float(pcm_buffer[:MAX_BYTES_PER_SEC])
                     pcm_buffer = pcm_buffer[MAX_BYTES_PER_SEC:]
 
                     if args.transcription and transcription_queue:
                         await transcription_queue.put(pcm_array.copy())
 
-                    if args.diarization and diarization_queue:
-                        await diarization_queue.put(pcm_array.copy())
 
                 if not args.transcription and not args.diarization:
                     await asyncio.sleep(0.1)
````