Improve speaker handling; update sleep duration and manage speaker transitions more effectively
Browse files
- whisper_fastapi_online_server.py +16 -15
whisper_fastapi_online_server.py
CHANGED
|
@@ -214,10 +214,10 @@ async def websocket_endpoint(websocket: WebSocket):
|
|
| 214 |
else:
|
| 215 |
chunk_history.append({
|
| 216 |
"beg": time() - beg_loop,
|
| 217 |
-
"end": time() - beg_loop +
|
| 218 |
"text": '',
|
| 219 |
})
|
| 220 |
-
sleep(
|
| 221 |
buffer = ''
|
| 222 |
|
| 223 |
if args.diarization:
|
|
@@ -225,28 +225,29 @@ async def websocket_endpoint(websocket: WebSocket):
|
|
| 225 |
diarization.assign_speakers_to_chunks(chunk_history)
|
| 226 |
|
| 227 |
|
| 228 |
-
current_speaker =
|
| 229 |
-
lines = [
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
"speaker"
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
if args.diarization and ch["speaker"] and ch["speaker"] != current_speaker:
|
| 237 |
-
new_speaker = ch["speaker"]
|
| 238 |
lines.append(
|
| 239 |
{
|
| 240 |
-
"speaker":
|
| 241 |
"text": ch['text'],
|
| 242 |
"beg": format_time(ch['beg']),
|
| 243 |
"end": format_time(ch['end']),
|
|
|
|
| 244 |
}
|
| 245 |
)
|
| 246 |
-
current_speaker =
|
| 247 |
-
|
| 248 |
lines[-1]["text"] += ch['text']
|
| 249 |
lines[-1]["end"] = format_time(ch['end'])
|
|
|
|
|
|
|
| 250 |
|
| 251 |
response = {"lines": lines, "buffer": buffer}
|
| 252 |
await websocket.send_json(response)
|
|
|
|
| 214 |
else:
|
| 215 |
chunk_history.append({
|
| 216 |
"beg": time() - beg_loop,
|
| 217 |
+
"end": time() - beg_loop + 1,
|
| 218 |
"text": '',
|
| 219 |
})
|
| 220 |
+
sleep(1)
|
| 221 |
buffer = ''
|
| 222 |
|
| 223 |
if args.diarization:
|
|
|
|
| 225 |
diarization.assign_speakers_to_chunks(chunk_history)
|
| 226 |
|
| 227 |
|
| 228 |
+
current_speaker = 0
|
| 229 |
+
lines = []
|
| 230 |
+
last_end_diarized = 0
|
| 231 |
+
for ind, ch in enumerate(chunk_history):
|
| 232 |
+
speaker = ch.get("speaker", -3)
|
| 233 |
+
if speaker == -1 and ind < len(chunk_history) - 1:
|
| 234 |
+
continue
|
| 235 |
+
elif speaker != current_speaker:
|
|
|
|
|
|
|
| 236 |
lines.append(
|
| 237 |
{
|
| 238 |
+
"speaker": speaker,
|
| 239 |
"text": ch['text'],
|
| 240 |
"beg": format_time(ch['beg']),
|
| 241 |
"end": format_time(ch['end']),
|
| 242 |
+
"diff": round(ch['end'] - last_end_diarized, 2)
|
| 243 |
}
|
| 244 |
)
|
| 245 |
+
current_speaker = speaker
|
| 246 |
+
elif speaker != -1:
|
| 247 |
lines[-1]["text"] += ch['text']
|
| 248 |
lines[-1]["end"] = format_time(ch['end'])
|
| 249 |
+
if speaker != -1:
|
| 250 |
+
last_end_diarized = max(ch['end'], last_end_diarized)
|
| 251 |
|
| 252 |
response = {"lines": lines, "buffer": buffer}
|
| 253 |
await websocket.send_json(response)
|