Spaces:

langtech-innovation
/

WhisperLiveKitDiarization

Paused

App Files Files Community

qfuxa committed on Jan 19

Commit

6933483

1 Parent(s): cc68f3b

add diarization (beta). Disabled by default

Browse files

Files changed (2) hide show

src/diarization/diarization_online.py +86 -0
whisper_fastapi_online_server.py +28 -4

src/diarization/diarization_online.py ADDED Viewed

	@@ -0,0 +1,86 @@

+from diart import SpeakerDiarization
+from diart.inference import StreamingInference
+from diart.sources import AudioSource
+from rx.subject import Subject
+import threading
+import numpy as np
+import asyncio
+class WebSocketAudioSource(AudioSource):
+    """
+    Audio source fed by externally pushed PCM chunks.
+
+    read() parks the calling thread until close() is invoked; audio only
+    enters the stream through push_audio().
+    """
+
+    def __init__(self, uri: str = "websocket", sample_rate: int = 16000):
+        super().__init__(uri, sample_rate)
+        self._closed = False
+        self._close_event = threading.Event()
+
+    def read(self):
+        """Block until close() sets the internal event."""
+        self._close_event.wait()
+
+    def close(self):
+        """Complete the stream and release read(); repeated calls are no-ops."""
+        if self._closed:
+            return
+        self._closed = True
+        self.stream.on_completed()
+        self._close_event.set()
+
+    def push_audio(self, chunk: np.ndarray):
+        """Emit `chunk` on the stream with a new leading axis, unless closed."""
+        batched = np.expand_dims(chunk, axis=0)
+        if self._closed:
+            return
+        self.stream.on_next(batched)
+def create_pipeline(SAMPLE_RATE):
+    """
+    Build a streaming diarization inference and the source that feeds it.
+
+    Returns a tuple (inference, ws_source); plotting and the progress
+    display are both switched off.
+    """
+    pipeline = SpeakerDiarization()
+    source = WebSocketAudioSource(uri="websocket_source", sample_rate=SAMPLE_RATE)
+    streaming = StreamingInference(
+        pipeline=pipeline, source=source, do_plot=False, show_progress=False
+    )
+    return streaming, source
+def init_diart(SAMPLE_RATE):
+    """
+    Create the diarization pipeline, wire its results into an asyncio queue
+    and start the inference in the event loop's default executor.
+
+    Args:
+        SAMPLE_RATE: sample rate forwarded to create_pipeline().
+
+    Returns:
+        (inference, l_speakers_queue, ws_source), where l_speakers_queue
+        receives one {"speaker": ..., "segment": ...} dict per label found
+        in each result handed to the hook.
+    """
+    inference, ws_source = create_pipeline(SAMPLE_RATE)
+    l_speakers_queue = asyncio.Queue()
+    loop = asyncio.get_event_loop()
+
+    def diar_hook(result):
+        """
+        Hook called each time Diart processes a chunk.
+        result is (annotation, audio).
+        Every label of the annotation is forwarded to l_speakers_queue
+        together with the string form of what is stored for that label.
+        """
+        annotation, _audio = result
+        # NOTE(review): `_labels` is a private attribute of the annotation
+        # object; consider switching to its public accessors once confirmed.
+        for speaker in annotation._labels:
+            segment = str(annotation._labels[speaker])
+            # Hand the item to the loop instead of calling
+            # asyncio.create_task() here: create_task() needs a running loop
+            # in the *calling* thread and its un-referenced task may be
+            # garbage-collected before it runs. call_soon_threadsafe() works
+            # regardless of which thread invokes the hook, and put_nowait()
+            # cannot fail because the queue is unbounded.
+            loop.call_soon_threadsafe(
+                l_speakers_queue.put_nowait,
+                {"speaker": speaker, "segment": segment},
+            )
+
+    inference.attach_hooks(diar_hook)
+    # Launch Diart in a background thread
+    loop.run_in_executor(None, inference)
+    return inference, l_speakers_queue, ws_source
+class DiartDiarization:
+    """
+    Thin wrapper around init_diart(): pushes audio into the diarization
+    source and hands back whatever speaker entries are already queued.
+    """
+
+    def __init__(self, SAMPLE_RATE):
+        diart_parts = init_diart(SAMPLE_RATE)
+        self.inference, self.l_speakers_queue, self.ws_source = diart_parts
+
+    async def get_speakers(self, pcm_array):
+        """Push `pcm_array`, then drain and return the queued speaker entries."""
+        self.ws_source.push_audio(pcm_array)
+        pending = self.l_speakers_queue
+        collected = []
+        while True:
+            if pending.empty():
+                return collected
+            collected.append(await pending.get())
+
+    def close(self):
+        """Close the underlying audio source."""
+        self.ws_source.close()

whisper_fastapi_online_server.py CHANGED Viewed

@@ -10,7 +10,6 @@ from fastapi.responses import HTMLResponse
 from fastapi.middleware.cors import CORSMiddleware
 from whisper_online import backend_factory, online_factory, add_shared_args
 app = FastAPI()
 app.add_middleware(
     CORSMiddleware,
@@ -37,11 +36,24 @@ parser.add_argument(
     dest="warmup_file",
     help="The path to a speech audio wav file to warm up Whisper so that the very first chunk processing is fast. It can be e.g. https://github.com/ggerganov/whisper.cpp/raw/master/samples/jfk.wav .",
 )
 add_shared_args(parser)
 args = parser.parse_args()
 asr, tokenizer = backend_factory(args)
 # Load demo HTML for the root endpoint
 with open("src/web/live_transcription.html", "r", encoding="utf-8") as f:
     html = f.read()
@@ -89,6 +101,9 @@ async def websocket_endpoint(websocket: WebSocket):
     online = online_factory(args, asr, tokenizer)
     print("Online loaded.")
     # Continuously read decoded PCM from ffmpeg stdout in a background task
     async def ffmpeg_stdout_reader():
         nonlocal pcm_buffer
@@ -136,9 +151,13 @@ async def websocket_endpoint(websocket: WebSocket):
                         buffer in full_transcription
                     ):  # With VAC, the buffer is not updated until the next chunk is processed
                         buffer = ""
-                    await websocket.send_json(
-                        {"transcription": transcription, "buffer": buffer}
-                    )
             except Exception as e:
                 print(f"Exception in ffmpeg_stdout_reader: {e}")
                 break
@@ -174,6 +193,11 @@ async def websocket_endpoint(websocket: WebSocket):
         ffmpeg_process.wait()
         del online

 from fastapi.middleware.cors import CORSMiddleware
 from whisper_online import backend_factory, online_factory, add_shared_args
 app = FastAPI()
 app.add_middleware(
     CORSMiddleware,
     dest="warmup_file",
     help="The path to a speech audio wav file to warm up Whisper so that the very first chunk processing is fast. It can be e.g. https://github.com/ggerganov/whisper.cpp/raw/master/samples/jfk.wav .",
 )
+parser.add_argument(
+    "--diarization",
+    # `type=bool` would call bool() on the raw string, so any non-empty value
+    # (including "False") enabled the feature. Parse the text explicitly;
+    # unrecognised values are treated as False.
+    type=lambda value: value.strip().lower() in {"1", "true", "yes", "y", "on"},
+    nargs="?",  # also accept a bare `--diarization` with no value
+    const=True,  # value used when the flag is given without an argument
+    default=False,
+    help="Whether to enable speaker diarization.",
+)
 add_shared_args(parser)
 args = parser.parse_args()
 asr, tokenizer = backend_factory(args)
+# Import the diarization module only when the feature is requested, so the
+# import is skipped entirely when --diarization is off.
+if args.diarization:
+    from src.diarization.diarization_online import DiartDiarization
 # Load demo HTML for the root endpoint
 with open("src/web/live_transcription.html", "r", encoding="utf-8") as f:
     html = f.read()
     online = online_factory(args, asr, tokenizer)
     print("Online loaded.")
+    if args.diarization:
+        diarization = DiartDiarization(SAMPLE_RATE)
     # Continuously read decoded PCM from ffmpeg stdout in a background task
     async def ffmpeg_stdout_reader():
         nonlocal pcm_buffer
                         buffer in full_transcription
                     ):  # With VAC, the buffer is not updated until the next chunk is processed
                         buffer = ""
+                    response = {"transcription": transcription, "buffer": buffer}
+                    if args.diarization:
+                        speakers = await diarization.get_speakers(pcm_array)
+                        response["speakers"] = speakers
+                    await websocket.send_json(response)
             except Exception as e:
                 print(f"Exception in ffmpeg_stdout_reader: {e}")
                 break
         ffmpeg_process.wait()
         del online
+        if args.diarization:
+            # Stop Diart
+            diarization.close()