Mahmoud Elsamadony committed
Commit · cf179b4
Parent(s): a19727e

UPDATE fixing multiple speakers detections

Files changed:
- __pycache__/app.cpython-311.pyc +0 -0
- app.py +62 -0
__pycache__/app.cpython-311.pyc CHANGED
Binary files a/__pycache__/app.cpython-311.pyc and b/__pycache__/app.cpython-311.pyc differ
app.py CHANGED
@@ -51,6 +51,7 @@ initial_prompt = os.environ.get(
 
 beam_size_default = int(os.environ.get("WHISPER_BEAM_SIZE", 5))
 best_of_default = int(os.environ.get("WHISPER_BEST_OF", 5))
+expected_speakers_default = int(os.environ.get("EXPECTED_SPEAKERS", 2))
 
 # ---------------------------------------------------------------------------
 # Lazy singletons for the heavy models
@@ -147,6 +148,7 @@ def transcribe(
     audio_path: str,
     language: str,
     enable_diarization: bool,
+    expected_speakers: int,
     beam_size: int,
     best_of: int,
 ) -> Dict:
@@ -305,6 +307,57 @@ def transcribe(
     # Sort speaker turns by start time
     speaker_turns.sort(key=lambda x: x["start"])
 
+    # Consolidate speakers if we detected more than expected
+    unique_speakers = set(turn["speaker"] for turn in speaker_turns)
+    print(f"[DEBUG] Detected {len(unique_speakers)} unique speakers: {unique_speakers}")
+
+    if expected_speakers > 0 and len(unique_speakers) > expected_speakers:
+        print(f"[DEBUG] Consolidating from {len(unique_speakers)} to {expected_speakers} speakers")
+        # Create a mapping to merge speakers
+        # Strategy: Merge speakers by order of first appearance, keeping the most active ones
+        speaker_stats = {}
+        for turn in speaker_turns:
+            spk = turn["speaker"]
+            if spk not in speaker_stats:
+                speaker_stats[spk] = {"first_appear": turn["start"], "duration": 0, "count": 0}
+            speaker_stats[spk]["duration"] += turn["end"] - turn["start"]
+            speaker_stats[spk]["count"] += 1
+
+        # Sort speakers by total speaking duration (most active first)
+        sorted_speakers = sorted(speaker_stats.items(), key=lambda x: x[1]["duration"], reverse=True)
+        print(f"[DEBUG] Speaker activity: {[(s, round(stats['duration'], 1)) for s, stats in sorted_speakers]}")
+
+        # Keep the top N most active speakers, map others to them
+        kept_speakers = [s[0] for s in sorted_speakers[:expected_speakers]]
+        speaker_mapping = {}
+
+        for spk, stats in sorted_speakers:
+            if spk in kept_speakers:
+                speaker_mapping[spk] = spk
+            else:
+                # Map this speaker to the closest kept speaker by first appearance time
+                closest_kept = min(kept_speakers,
+                                   key=lambda k: abs(speaker_stats[k]["first_appear"] - stats["first_appear"]))
+                speaker_mapping[spk] = closest_kept
+                print(f"[DEBUG] Mapping {spk} -> {closest_kept}")
+
+        # Apply the mapping
+        for turn in speaker_turns:
+            turn["speaker"] = speaker_mapping[turn["speaker"]]
+
+        # Merge consecutive turns from the same speaker
+        merged_turns = []
+        for turn in speaker_turns:
+            if merged_turns and merged_turns[-1]["speaker"] == turn["speaker"] and \
+               turn["start"] - merged_turns[-1]["end"] < 1.0:  # Less than 1 second gap
+                # Extend the previous turn
+                merged_turns[-1]["end"] = turn["end"]
+            else:
+                merged_turns.append(turn.copy())
+
+        speaker_turns = merged_turns
+        print(f"[DEBUG] After consolidation: {len(speaker_turns)} speaker turns")
+
     # Assign speakers to each transcript segment
     for segment in response["segments"]:
         seg_start = segment["start"]
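The keep-the-most-active-speakers-then-remap strategy in this hunk is easiest to see on toy data. Below is a minimal, self-contained sketch of the same idea; the turn dicts mirror the {"speaker", "start", "end"} shape used above, but the speaker labels and timings are invented for illustration.

# Hypothetical turns: speaker_2 is a short spurious detection.
speaker_turns = [
    {"speaker": "speaker_0", "start": 0.0, "end": 5.0},
    {"speaker": "speaker_1", "start": 5.2, "end": 9.0},
    {"speaker": "speaker_2", "start": 9.1, "end": 9.6},
    {"speaker": "speaker_0", "start": 9.8, "end": 14.0},
]
expected_speakers = 2

# Accumulate first-appearance time and total speaking duration per speaker.
stats = {}
for turn in speaker_turns:
    entry = stats.setdefault(turn["speaker"], {"first_appear": turn["start"], "duration": 0.0})
    entry["duration"] += turn["end"] - turn["start"]

# Keep the two longest-talking speakers; speaker_2 (0.5 s total) is remapped to
# whichever kept speaker first appeared closest to its own first appearance.
kept = [spk for spk, _ in sorted(stats.items(), key=lambda x: x[1]["duration"], reverse=True)[:expected_speakers]]
mapping = {
    spk: spk if spk in kept else min(kept, key=lambda k: abs(stats[k]["first_appear"] - st["first_appear"]))
    for spk, st in stats.items()
}
print(mapping)  # {'speaker_0': 'speaker_0', 'speaker_1': 'speaker_1', 'speaker_2': 'speaker_1'}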
@@ -406,6 +459,14 @@ def build_interface() -> gr.Blocks:
             value=False,
             info="Uses NVIDIA Sortformer model (max 4 speakers, downloads ~700MB on first use).",
         )
+        expected_speakers_slider = gr.Slider(
+            label="Expected Number of Speakers",
+            minimum=0,
+            maximum=4,
+            step=1,
+            value=expected_speakers_default,
+            info="Set to 0 for automatic detection, or specify 2-4 to consolidate speakers.",
+        )
         beam_slider = gr.Slider(
             label="Beam Size",
             minimum=1,
@@ -432,6 +493,7 @@
             audio_input,
             language_input,
             diarization_toggle,
+            expected_speakers_slider,
             beam_slider,
             best_of_slider,
         ],
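The last two hunks change together for a reason: Gradio passes component values to the event callback positionally, so the slider's position in the inputs list must match the new expected_speakers parameter's position in the transcribe() signature. A stripped-down sketch of that wiring follows; the stub callback and the component values other than the new slider's are placeholders, not taken from app.py.

import gradio as gr

def transcribe_stub(audio_path, language, enable_diarization,
                    expected_speakers, beam_size, best_of):
    # Parameter order must line up with the `inputs` list below.
    return f"expected_speakers={expected_speakers}, beam_size={beam_size}"

with gr.Blocks() as demo:
    audio_input = gr.Textbox(label="Audio path")
    language_input = gr.Textbox(label="Language", value="en")
    diarization_toggle = gr.Checkbox(label="Enable diarization", value=False)
    expected_speakers_slider = gr.Slider(
        label="Expected Number of Speakers", minimum=0, maximum=4, step=1, value=2
    )
    beam_slider = gr.Slider(label="Beam Size", minimum=1, maximum=10, step=1, value=5)
    best_of_slider = gr.Slider(label="Best Of", minimum=1, maximum=10, step=1, value=5)
    result = gr.Textbox(label="Result")
    gr.Button("Run").click(
        transcribe_stub,
        inputs=[audio_input, language_input, diarization_toggle,
                expected_speakers_slider, beam_slider, best_of_slider],
        outputs=result,
    )

if __name__ == "__main__":
    demo.launch()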