Mahmoud Elsamadony committed
Commit cf179b4 · Parent: a19727e

UPDATE: fix multiple-speaker detection

Files changed (2):
  1. __pycache__/app.cpython-311.pyc +0 -0
  2. app.py +62 -0
__pycache__/app.cpython-311.pyc CHANGED
Binary files a/__pycache__/app.cpython-311.pyc and b/__pycache__/app.cpython-311.pyc differ
 
app.py CHANGED
@@ -51,6 +51,7 @@ initial_prompt = os.environ.get(
 
 beam_size_default = int(os.environ.get("WHISPER_BEAM_SIZE", 5))
 best_of_default = int(os.environ.get("WHISPER_BEST_OF", 5))
+expected_speakers_default = int(os.environ.get("EXPECTED_SPEAKERS", 2))
 
 # ---------------------------------------------------------------------------
 # Lazy singletons for the heavy models
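The new default is read once, at module load, from the EXPECTED_SPEAKERS environment variable, so it has to be set before app.py is imported (for a Hugging Face Space, a variable in the Space settings works). A quick sketch of the fallback behaviour, with a hypothetical override of 3:

import os

print(int(os.environ.get("EXPECTED_SPEAKERS", 2)))  # -> 2 when the variable is unset
os.environ["EXPECTED_SPEAKERS"] = "3"               # hypothetical override, set before app.py loads
print(int(os.environ.get("EXPECTED_SPEAKERS", 2)))  # -> 3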
 
@@ -147,6 +148,7 @@ def transcribe(
     audio_path: str,
     language: str,
     enable_diarization: bool,
+    expected_speakers: int,
     beam_size: int,
     best_of: int,
 ) -> Dict:
 
@@ -305,6 +307,57 @@ def transcribe(
     # Sort speaker turns by start time
     speaker_turns.sort(key=lambda x: x["start"])
 
+    # Consolidate speakers if we detected more than expected
+    unique_speakers = set(turn["speaker"] for turn in speaker_turns)
+    print(f"[DEBUG] Detected {len(unique_speakers)} unique speakers: {unique_speakers}")
+
+    if expected_speakers > 0 and len(unique_speakers) > expected_speakers:
+        print(f"[DEBUG] Consolidating from {len(unique_speakers)} to {expected_speakers} speakers")
+        # Build a mapping that folds surplus speakers into the kept ones.
+        # Strategy: keep the most active speakers; fold each extra one into the kept speaker closest by first appearance.
+        speaker_stats = {}
+        for turn in speaker_turns:
+            spk = turn["speaker"]
+            if spk not in speaker_stats:
+                speaker_stats[spk] = {"first_appear": turn["start"], "duration": 0, "count": 0}
+            speaker_stats[spk]["duration"] += turn["end"] - turn["start"]
+            speaker_stats[spk]["count"] += 1
+
+        # Sort speakers by total speaking duration (most active first)
+        sorted_speakers = sorted(speaker_stats.items(), key=lambda x: x[1]["duration"], reverse=True)
+        print(f"[DEBUG] Speaker activity: {[(s, round(stats['duration'], 1)) for s, stats in sorted_speakers]}")
+
+        # Keep the top N most active speakers, map the others to them
+        kept_speakers = [s[0] for s in sorted_speakers[:expected_speakers]]
+        speaker_mapping = {}
+
+        for spk, stats in sorted_speakers:
+            if spk in kept_speakers:
+                speaker_mapping[spk] = spk
+            else:
+                # Map this speaker to the kept speaker whose first appearance is closest in time
+                closest_kept = min(kept_speakers,
+                                   key=lambda k: abs(speaker_stats[k]["first_appear"] - stats["first_appear"]))
+                speaker_mapping[spk] = closest_kept
+                print(f"[DEBUG] Mapping {spk} -> {closest_kept}")
+
+        # Apply the mapping
+        for turn in speaker_turns:
+            turn["speaker"] = speaker_mapping[turn["speaker"]]
+
+        # Merge consecutive turns from the same speaker
+        merged_turns = []
+        for turn in speaker_turns:
+            if merged_turns and merged_turns[-1]["speaker"] == turn["speaker"] and \
+                    turn["start"] - merged_turns[-1]["end"] < 1.0:  # less than a one-second gap
+                # Extend the previous turn
+                merged_turns[-1]["end"] = turn["end"]
+            else:
+                merged_turns.append(turn.copy())
+
+        speaker_turns = merged_turns
+        print(f"[DEBUG] After consolidation: {len(speaker_turns)} speaker turns")
+
     # Assign speakers to each transcript segment
     for segment in response["segments"]:
         seg_start = segment["start"]
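The consolidation block above is easy to exercise in isolation. Below is a minimal, self-contained sketch of the same strategy (the function name and sample turns are hypothetical, not from app.py): rank speakers by total speaking time, keep the top `expected`, remap each surplus speaker to the kept one whose first appearance is nearest in time, then merge same-speaker turns less than a second apart.

from typing import Dict, List

def consolidate_speakers(turns: List[Dict], expected: int) -> List[Dict]:
    """Collapse surplus speakers into the `expected` most talkative ones,
    then merge adjacent same-speaker turns, mirroring the hunk above."""
    if expected <= 0:
        return turns  # 0 means "trust the diarizer's own speaker count"

    # Total speaking time and first appearance per speaker.
    stats: Dict[str, Dict[str, float]] = {}
    for t in turns:
        s = stats.setdefault(t["speaker"], {"first": t["start"], "dur": 0.0})
        s["dur"] += t["end"] - t["start"]
    if len(stats) <= expected:
        return turns  # nothing to consolidate

    # Keep the most active speakers; remap each surplus speaker to the
    # kept speaker whose first appearance is closest in time.
    ranked = sorted(stats, key=lambda spk: stats[spk]["dur"], reverse=True)
    kept = ranked[:expected]
    mapping = {
        spk: spk if spk in kept else
        min(kept, key=lambda k: abs(stats[k]["first"] - stats[spk]["first"]))
        for spk in ranked
    }

    # Apply the mapping, then merge same-speaker turns < 1 s apart.
    merged: List[Dict] = []
    for t in sorted(turns, key=lambda x: x["start"]):
        t = {**t, "speaker": mapping[t["speaker"]]}
        if merged and merged[-1]["speaker"] == t["speaker"] \
                and t["start"] - merged[-1]["end"] < 1.0:
            merged[-1]["end"] = t["end"]
        else:
            merged.append(t)
    return merged

if __name__ == "__main__":
    turns = [  # hypothetical diarizer output with one spurious speaker
        {"speaker": "spk0", "start": 0.0, "end": 4.0},
        {"speaker": "spk1", "start": 4.2, "end": 9.0},
        {"speaker": "spk2", "start": 9.1, "end": 9.6},   # 0.5 s blip
        {"speaker": "spk0", "start": 9.8, "end": 14.0},
    ]
    for t in consolidate_speakers(turns, expected=2):
        print(t)

Running it folds the 0.5 s spk2 blip into spk1 (closest by first appearance) and merges the resulting back-to-back spk1 turns, leaving spk0 / spk1 / spk0. Note the design choice mirrored from the hunk: survivors are chosen by speaking time rather than appearance order, so a brief spurious speaker never displaces a real one.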
 
@@ -406,6 +459,14 @@ def build_interface() -> gr.Blocks:
             value=False,
             info="Uses NVIDIA Sortformer model (max 4 speakers, downloads ~700MB on first use).",
         )
+        expected_speakers_slider = gr.Slider(
+            label="Expected Number of Speakers",
+            minimum=0,
+            maximum=4,
+            step=1,
+            value=expected_speakers_default,
+            info="Set to 0 for automatic detection, or specify 2-4 to consolidate speakers.",
+        )
         beam_slider = gr.Slider(
             label="Beam Size",
             minimum=1,
 
@@ -432,6 +493,7 @@ def build_interface() -> gr.Blocks:
             audio_input,
             language_input,
             diarization_toggle,
+            expected_speakers_slider,
             beam_slider,
             best_of_slider,
         ],
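Note how the last two hunks pair up: Gradio passes `inputs` positionally, so `expected_speakers_slider` has to sit in the list at the same position `expected_speakers` occupies in transcribe's signature. A minimal sketch of that wiring pattern (the stub function and component values are illustrative, not the app's actual code):

import gradio as gr

def transcribe_stub(audio_path, language, enable_diarization,
                    expected_speakers, beam_size, best_of):
    # Parameter order here must match the inputs=[...] list below.
    return f"expected_speakers={int(expected_speakers)}"

with gr.Blocks() as demo:
    audio_input = gr.Audio(type="filepath")
    language_input = gr.Textbox(value="auto")
    diarization_toggle = gr.Checkbox(value=False)
    expected_speakers_slider = gr.Slider(minimum=0, maximum=4, step=1, value=2)
    beam_slider = gr.Slider(minimum=1, maximum=10, step=1, value=5)
    best_of_slider = gr.Slider(minimum=1, maximum=10, step=1, value=5)
    output_box = gr.Textbox()
    gr.Button("Transcribe").click(
        transcribe_stub,
        inputs=[
            audio_input,
            language_input,
            diarization_toggle,
            expected_speakers_slider,  # 4th input -> 4th parameter
            beam_slider,
            best_of_slider,
        ],
        outputs=output_box,
    )

An off-by-one in that list silently feeds the wrong widget's value into the wrong parameter, which is why the commit inserts the slider exactly between diarization_toggle and beam_slider.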