""" API Client for VTT with Diarization Hugging Face Space Usage example for calling the space via Gradio Client API """ from gradio_client import Client import os # Your Hugging Face Space URL SPACE_URL = "MahmoudElsamadony/vtt-with-diariazation" def transcribe_audio( audio_file_path: str, language: str = "ar", enable_diarization: bool = False, beam_size: int = 5, best_of: int = 5, ): """ Transcribe audio file using the Hugging Face Space API Args: audio_file_path: Path to the audio file (mp3, wav, m4a, etc.) language: Language code ("ar", "en", "fr", etc.) or "" for auto-detect enable_diarization: Whether to enable speaker diarization beam_size: Beam size for Whisper (1-10) best_of: Best of parameter for Whisper (1-10) Returns: tuple: (transcript_text, detailed_json) """ # Initialize the client client = Client(SPACE_URL) # Call the transcribe function result = client.predict( audio_path=audio_file_path, language=language, enable_diarization=enable_diarization, beam_size=beam_size, best_of=best_of, api_name="/predict" ) return result def main(): """Example usage of the API client""" # Example 1: Basic transcription (Arabic, no diarization) print("=" * 60) print("Example 1: Basic Arabic transcription") print("=" * 60) # Replace with your actual audio file path audio_file = "path/to/your/audio.mp3" if os.path.exists(audio_file): transcript, details = transcribe_audio( audio_file_path=audio_file, language="ar", enable_diarization=False, ) print(f"\nTranscript:\n{transcript}\n") print(f"Language: {details.get('language')}") print(f"Duration: {details.get('duration')} seconds") print(f"Number of segments: {len(details.get('segments', []))}") else: print(f"Audio file not found: {audio_file}") print("\n" + "=" * 60) print("Example 2: Transcription with speaker diarization") print("=" * 60) # Example 2: Transcription with diarization if os.path.exists(audio_file): transcript, details = transcribe_audio( audio_file_path=audio_file, language="ar", enable_diarization=True, beam_size=5, best_of=5, ) print(f"\nTranscript:\n{transcript}\n") # Print speaker turns if "speakers" in details: print("\nSpeaker turns:") for turn in details["speakers"][:5]: # Show first 5 turns print(f" {turn['speaker']}: {turn['start']:.2f}s - {turn['end']:.2f}s") # Print segments with speakers print("\nSegments with speakers:") for segment in details.get("segments", [])[:3]: # Show first 3 segments speaker = segment.get("speaker", "Unknown") text = segment.get("text", "") start = segment.get("start", 0) print(f" [{start:.2f}s] {speaker}: {text}") else: print(f"Audio file not found: {audio_file}") print("\n" + "=" * 60) print("Example 3: Auto-detect language") print("=" * 60) # Example 3: Auto-detect language if os.path.exists(audio_file): transcript, details = transcribe_audio( audio_file_path=audio_file, language="", # Empty string for auto-detect enable_diarization=False, ) print(f"\nDetected language: {details.get('language')}") print(f"Language probability: {details.get('language_probability'):.2%}") print(f"\nTranscript:\n{transcript}") else: print(f"Audio file not found: {audio_file}") if __name__ == "__main__": # Install gradio_client first: # pip install gradio_client main()