import gradio as gr
import glob
import os
import subprocess
import sys
import tempfile
import zipfile
# Demo configuration
DEMO_MODE = True
# File size limit removed - no restrictions
# For multi-language audio, use larger models (medium, large-v2, large-v3) for better accuracy
ALLOWED_MODELS = ["tiny.en", "base.en", "small.en", "medium.en", "medium", "large-v2", "large-v3"]
# Password authentication removed for security
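# The app shells out to an external script, diarize1.py, expected to sit next to
# this file. The flags below mirror the ones assembled in run_diarization(); an
# illustrative invocation (paths and values are placeholders):
#   python diarize1.py -a input.wav --whisper-model base.en --device cpu --batch-size 2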
def check_file_size(file_path):
"""Check if file exists and get size info"""
if not file_path:
return False, "No file provided"
try:
file_size = os.path.getsize(file_path) / (1024*1024)
return True, f"File size: {file_size:.1f}MB"
except Exception as e:
return False, f"Error checking file: {str(e)}"
def run_diarization(audio_file, model, language, enable_stemming, suppress_numerals, batch_size, processing_mode, num_speakers):
"""Main diarization function"""
    if not audio_file:
        return None, "", "", "❌ Please upload an audio file."
    size_ok, size_msg = check_file_size(audio_file)
    if not size_ok:
        return None, "", "", f"❌ {size_msg}"
# Log file size for monitoring (no restriction)
print(f"Processing file: {size_msg}")
try:
# Prepare command
cmd = [sys.executable, "diarize1.py", "-a", audio_file]
cmd.extend(["--whisper-model", model])
cmd.extend(["--device", "cpu"])
cmd.extend(["--batch-size", str(batch_size)])
if language and language != "auto":
cmd.extend(["--language", language])
if not enable_stemming:
cmd.append("--no-stem")
if suppress_numerals:
cmd.append("--suppress_numerals")
# Add speaker separation options
if processing_mode == "Speaker Separation":
cmd.extend(["--num-speakers", str(num_speakers)])
        # Run the external script; kill it if it exceeds the 30-minute cap,
        # otherwise the child process would keep running after the timeout
        process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
        try:
            stdout, stderr = process.communicate(timeout=1800)
        except subprocess.TimeoutExpired:
            process.kill()
            return None, "", "", "❌ Processing timed out after 30 minutes."
if process.returncode == 0:
# Look for output files
audio_dir = os.path.dirname(audio_file)
audio_name = os.path.splitext(os.path.basename(audio_file))[0]
output_files = []
transcript_content = ""
for ext in ['.txt', '.srt']:
pattern = f"{audio_name}*{ext}"
matches = glob.glob(os.path.join(audio_dir, pattern))
output_files.extend(matches)
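            # For /tmp/talk.wav this picks up e.g. talk.txt and talk.srt, assuming
            # diarize1.py writes its outputs next to the input file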
if output_files:
# Create ZIP
temp_dir = tempfile.mkdtemp()
zip_path = os.path.join(temp_dir, "results.zip")
with zipfile.ZipFile(zip_path, 'w') as zip_file:
for file_path in output_files:
zip_file.write(file_path, os.path.basename(file_path))
if file_path.endswith('.txt'):
with open(file_path, 'r', encoding='utf-8') as f:
transcript_content = f.read()
# Parse transcript for speaker separation
speaker_1_text, speaker_2_text = parse_speakers(transcript_content)
                return zip_path, speaker_1_text, speaker_2_text, f"✅ Processing complete! Generated {len(output_files)} files."
else:
return None, "", "", "❌ No output files generated."
else:
return None, "", "", f"❌ Processing failed: {stderr}"
except Exception as e:
return None, "", "", f"❌ Error: {str(e)}"
def parse_speakers(transcript_content):
"""Parse transcript content and separate by speakers"""
if not transcript_content:
return "", ""
lines = transcript_content.split('\n')
speaker_1_lines = []
speaker_2_lines = []
for line in lines:
line = line.strip()
if not line:
continue
# Look for speaker labels (common formats)
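        # e.g. "SPEAKER_00: Hello there." or "[SPEAKER_01] Hi." (line shapes assumed
        # from typical diarization output; adjust if diarize1.py formats them differently)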
if line.startswith('SPEAKER_00') or line.startswith('Speaker 0') or line.startswith('[SPEAKER_00]'):
speaker_1_lines.append(line)
elif line.startswith('SPEAKER_01') or line.startswith('Speaker 1') or line.startswith('[SPEAKER_01]'):
speaker_2_lines.append(line)
else:
            # No explicit label: fall back to crude keyword matching
            # (SPEAKER_00 maps to Speaker 1, SPEAKER_01 to Speaker 2)
            if 'speaker' in line.lower():
                if '0' in line or 'one' in line.lower() or 'first' in line.lower():
                    speaker_1_lines.append(line)
                elif '1' in line or 'two' in line.lower() or 'second' in line.lower():
                    speaker_2_lines.append(line)
                else:
                    # Default to Speaker 1 if unclear
                    speaker_1_lines.append(line)
            else:
                # No speaker indication at all: alternate lines to keep both panels balanced
if len(speaker_1_lines) <= len(speaker_2_lines):
speaker_1_lines.append(line)
else:
speaker_2_lines.append(line)
speaker_1_text = '\n'.join(speaker_1_lines) if speaker_1_lines else "No content detected for Speaker 1"
speaker_2_text = '\n'.join(speaker_2_lines) if speaker_2_lines else "No content detected for Speaker 2"
return speaker_1_text, speaker_2_text
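# Illustrative: parse_speakers("SPEAKER_00: hi\nSPEAKER_01: hello")
#   -> ("SPEAKER_00: hi", "SPEAKER_01: hello")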
def update_speaker_visibility(mode):
"""Show/hide speaker count based on processing mode"""
    return gr.update(visible=(mode == "Speaker Separation"))
# Create interface
with gr.Blocks(title="🎤 Whisper Speaker Diarization Demo") as demo:
gr.HTML("""
<div style="background: linear-gradient(90deg, #667eea 0%, #764ba2 100%); color: white; padding: 20px; border-radius: 10px; margin-bottom: 20px; display: flex; justify-content: space-between; align-items: center;">
<div style="text-align: left;">
        <h1>🎤 Whisper Speaker Diarization Demo</h1>
<p>AI-powered speaker identification and transcription</p>
</div>
<div style="text-align: right;">
<img src="https://upload.wikimedia.org/wikipedia/commons/thumb/8/8e/Telecom_Paris_Logo.svg/512px-Telecom_Paris_Logo.svg.png" alt="Telecom Paris" style="height: 80px; width: auto; filter: brightness(0) invert(1);">
</div>
</div>
""")
gr.HTML("""
<div style="background-color: #fff3cd; border: 1px solid #ffeaa7; border-radius: 5px; padding: 15px; margin: 10px 0;">
        <h3>🔬 Processing Information</h3>
        <ul>
            <li>💻 CPU processing only (slower than GPU)</li>
            <li>📦 No file size limits - process audio of any length</li>
            <li>🌍 For multi-language audio, use larger models (medium, large-v2, large-v3)</li>
            <li>⚡ Larger models provide better accuracy but take longer to process</li>
            <li>⚠️ Very large files may take significant time and memory</li>
</ul>
</div>
""")
with gr.Row():
with gr.Column(scale=2):
# Password input removed for security
            audio_input = gr.Audio(label="📁 Upload Audio File (No size limit)", type="filepath", sources=["upload"])
gr.Markdown("*Supported: MP3, WAV, M4A, FLAC, etc.*")
with gr.Row():
model_input = gr.Dropdown(choices=ALLOWED_MODELS, value="base.en", label="🎯 Whisper Model")
language_input = gr.Dropdown(choices=["auto", "en", "es", "fr", "de", "it"], value="auto", label="🌍 Language")
# Processing mode selection
processing_mode = gr.Radio(
choices=["Standard Diarization", "Speaker Separation"],
value="Standard Diarization",
label="🎯 Processing Mode"
)
gr.Markdown("*Standard: Traditional diarization | Separation: Pre-separate speakers*")
num_speakers = gr.Radio(
choices=[2, 3],
value=2,
                label="👥 Number of Speakers",
visible=False,
info="Select how many speakers are in your audio"
)
with gr.Row():
                stemming_input = gr.Checkbox(label="🎵 Audio Enhancement", value=False)
                numerals_input = gr.Checkbox(label="🔢 Suppress Numerals", value=False)
                batch_size_input = gr.Slider(minimum=1, maximum=4, value=2, step=1, label="⚡ Batch Size")
            process_btn = gr.Button("🎙️ Start Demo Transcription", variant="primary", size="lg")
with gr.Column(scale=1):
gr.Markdown("""
            ### 📚 How to Use
1. **Upload audio** (any size)
2. **Choose processing mode**
3. **Configure settings** (optional)
4. **Click process** and wait
5. **Download results**
### 🎯 Processing Modes
- **Standard**: Traditional speaker diarization
- **Speaker Separation**: Pre-separate speakers first
### 🌍 Model Selection
- **tiny.en/base.en/small.en**: Fast, English only
- **medium.en**: Better accuracy, English only
- **medium/large-v2/large-v3**: Best for multi-language audio
### ⚠️ Large File Warning
- Large files will take longer to process
- Monitor system resources during processing
""")
    download_output = gr.File(label="📦 Download Results", visible=False)
# Separate transcript windows for each speaker
with gr.Row(visible=False) as transcript_row:
with gr.Column():
speaker1_output = gr.Textbox(
                label="🗣️ Speaker 1 Transcript",
lines=15,
max_lines=20,
show_copy_button=True,
container=True,
interactive=False
)
with gr.Column():
speaker2_output = gr.Textbox(
                label="🗣️ Speaker 2 Transcript",
lines=15,
max_lines=20,
show_copy_button=True,
container=True,
interactive=False
)
    result_output = gr.Textbox(label="📋 Results", lines=5)
# Wire up mode visibility
processing_mode.change(
fn=update_speaker_visibility,
inputs=[processing_mode],
outputs=[num_speakers]
)
    def process_wrapper(*args):
        """Run diarization and map its results onto the output components."""
        download_file, speaker1_text, speaker2_text, result_text = run_diarization(*args)
        has_transcripts = bool(speaker1_text or speaker2_text)
        return (
            # Set the file value and toggle its visibility in a single update,
            # so download_output appears only once in the outputs list below
            gr.update(value=download_file, visible=download_file is not None),
            speaker1_text or "",
            speaker2_text or "",
            result_text or "",
            gr.update(visible=has_transcripts),
        )

    process_btn.click(
        fn=process_wrapper,
        inputs=[audio_input, model_input, language_input, stemming_input, numerals_input, batch_size_input, processing_mode, num_speakers],
        outputs=[download_output, speaker1_output, speaker2_output, result_output, transcript_row],
    )
if __name__ == "__main__":
demo.launch()
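    # Depending on the Gradio version, demo.queue() before launch() may be needed
    # to serialize long-running CPU jobs (queuing is on by default in newer versions)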