Spaces:
Paused
Paused
| import whisper | |
| import numpy as np | |
| import logging | |
| import io | |
| import librosa | |
| logger = logging.getLogger(__name__) | |
class LanguageDetector:
    """Identify the spoken language of an audio clip using a Whisper model."""

    def __init__(self, model_name="tiny"):
        """
        Initialize the language detector with a Whisper model.

        Args:
            model_name (str): Name of the Whisper model to use. Default is
                "tiny", which is sufficient for language detection.
        """
        self.model = whisper.load_model(model_name)
        logger.info(f"Loaded Whisper model {model_name} for language detection")

    def _detect_from_waveform(self, audio):
        """
        Run Whisper language detection on a mono 16 kHz float32 waveform.

        Args:
            audio (np.ndarray): 1-D float32 waveform sampled at 16 kHz with
                amplitudes in [-1, 1].

        Returns:
            tuple: (detected language code, its probability, dict mapping
                every language code to its probability)
        """
        # Whisper works on fixed-length windows; pad or cut to that length.
        audio = whisper.pad_or_trim(audio)

        # The number of mel bins must match the loaded checkpoint (80 for most
        # models, 128 for large-v3), so read it from the model instead of
        # hard-coding a value that only fits one checkpoint family.
        mel = whisper.log_mel_spectrogram(
            audio, n_mels=self.model.dims.n_mels
        ).to(self.model.device)

        _, probs = self.model.detect_language(mel)
        detected_lang = max(probs, key=probs.get)
        return detected_lang, probs[detected_lang], probs

    def detect_language_from_file(self, audio_file_path):
        """
        Detect language from an audio file.

        Args:
            audio_file_path (str): Path to the audio file

        Returns:
            str: Detected language code (e.g., "en", "fr", etc.)
            float: Confidence score
            dict: All language probabilities

        Raises:
            Exception: Any error raised while loading the audio or running
                the model is logged and re-raised unchanged.
        """
        try:
            audio = whisper.load_audio(audio_file_path)
            return self._detect_from_waveform(audio)
        except Exception as e:
            logger.error(f"Error in language detection: {e}")
            raise

    def detect_language_from_bytes(self, audio_bytes):
        """
        Detect language from audio bytes.

        Args:
            audio_bytes (bytes): Encoded audio data (any container/codec that
                librosa can decode) held in memory

        Returns:
            str: Detected language code (e.g., "en", "fr", etc.)
            float: Confidence score
            dict: All language probabilities

        Raises:
            Exception: Any error raised while decoding the audio or running
                the model is logged and re-raised unchanged.
        """
        try:
            # Decode from memory and resample to the 16 kHz rate Whisper uses.
            audio, _ = librosa.load(io.BytesIO(audio_bytes), sr=16000)

            # Whisper expects float32 samples in [-1, 1] (the same format
            # whisper.load_audio produces). Do NOT rescale to int16: that
            # overflows at full-scale samples and yields an integer tensor
            # the spectrogram code cannot process.
            audio = audio.astype(np.float32, copy=False)

            return self._detect_from_waveform(audio)
        except Exception as e:
            logger.error(f"Error in language detection: {e}")
            raise