# NEW-ASR-VOX # ============================================================================== # Cell 1: Complete Setup - Based on Your Working VoxLingua Code # ============================================================================== import os, re, glob, csv import torch import pandas as pd import numpy as np from sklearn.metrics import accuracy_score, confusion_matrix from speechbrain.inference.classifiers import EncoderClassifier from speechbrain.pretrained.interfaces import foreign_class import torchaudio import warnings warnings.filterwarnings('ignore') device = "cuda" if torch.cuda.is_available() else "cpu" print(f"Using device: {device}") # ============================================================================== # Cell 2: Load Multiple Language Detection Models for Ensemble # ============================================================================== print("šŸ”„ Loading Multiple Language Detection Models...") # Model 1: VoxLingua107 ECAPA-TDNN (Your working baseline - 40% weight) voxlingua_model = None try: print("Loading VoxLingua107 ECAPA-TDNN...") voxlingua_model = EncoderClassifier.from_hparams( source="speechbrain/lang-id-voxlingua107-ecapa", savedir="pretrained_models/langid_voxlingua107_ecapa", run_opts={"device": device} ) print("āœ… VoxLingua107 loaded successfully") except Exception as e: print(f"āŒ VoxLingua107 failed: {e}") # Model 2: XLS-R Language ID (35% weight) xlsr_lid_model = None try: print("Loading TalTechNLP XLS-R Language ID...") xlsr_lid_model = foreign_class( source="TalTechNLP/voxlingua107-xls-r-300m-wav2vec", pymodule_file="encoder_wav2vec_classifier.py", classname="EncoderWav2vecClassifier", hparams_file="inference_wav2vec.yaml", savedir="pretrained_models/xlsr_voxlingua", run_opts={"device": device} ) print("āœ… XLS-R Language ID loaded successfully") except Exception as e: print(f"āŒ XLS-R failed: {e}") models_loaded = sum(p is not None for p in [voxlingua_model, xlsr_lid_model]) print(f"\nšŸ“Š Models loaded: {models_loaded}/2") # ============================================================================== # Cell 3: Complete Language Mappings from Your Dataset # ============================================================================== # All languages from your dataset (based on the accuracy table you showed) DATASET_LANGUAGES = { # Indo-Aryan Languages 'ur', 'pa', 'hi', 'bn', 'ne', 'as', 'ks', 'mr', 'gu', 'or', # Dravidian Languages 'ta', 'te', 'kn', 'ml', # Low-Resource Languages 'sd', 'kok', 'br', 'doi', 'sat', 'mni', # Others in your dataset 'sa' # Sanskrit } # Language Family Classifications INDO_ARYAN_LANGS = {'ur', 'pa', 'hi', 'bn', 'ne', 'as', 'ks', 'mr', 'gu', 'or', 'sd'} DRAVIDIAN_LANGS = {'ta', 'te', 'kn', 'ml'} LOW_RESOURCE_LANGS = {'kok', 'br', 'doi', 'sat', 'mni'} OTHER_LANGS = {'sa'} # Sanskrit ALL_SUPPORTED_LANGS = INDO_ARYAN_LANGS | DRAVIDIAN_LANGS | LOW_RESOURCE_LANGS | OTHER_LANGS # Cross-Lingual Transfer Mappings (Research-Based) TRANSFER_MAPPINGS = { # Low-resource to high-resource language mappings 'br': 'hi', # Bodo → Hindi (brx mapped to br in your dataset) 'sat': 'hi', # Santali → Hindi 'doi': 'pa', # Dogri → Punjabi 'mni': 'bn', # Manipuri → Bengali 'kok': 'mr', # Konkani → Marathi (geographic proximity) 'sd': 'hi', # Sindhi → Hindi } # Language Code Mappings (VoxLingua output to your dataset codes) VOXLINGUA_TO_DATASET = { 'urd': 'ur', 'urdu': 'ur', 'pan': 'pa', 'punjabi': 'pa', 'pnb': 'pa', 'hin': 'hi', 'hindi': 'hi', 'ben': 'bn', 'bengali': 'bn', 'nep': 'ne', 'nepali': 'ne', 'asm': 'as', 'assamese': 'as', 'kas': 'ks', 'kashmiri': 'ks', 'mar': 'mr', 'marathi': 'mr', 'guj': 'gu', 'gujarati': 'gu', 'ori': 'or', 'odia': 'or', 'ory': 'or', 'tam': 'ta', 'tamil': 'ta', 'tel': 'te', 'telugu': 'te', 'kan': 'kn', 'kannada': 'kn', 'mal': 'ml', 'malayalam': 'ml', 'sin': 'sd', 'sindhi': 'sd', 'snd': 'sd', 'kok': 'kok', 'konkani': 'kok', 'san': 'sa', 'sanskrit': 'sa', # Common variations 'bho': 'hi', # Bhojpuri → Hindi 'mai': 'hi', # Maithili → Hindi 'mag': 'hi', # Magahi → Hindi } print("āœ… Complete language mappings loaded") print(f"šŸ“Š Total dataset languages: {len(ALL_SUPPORTED_LANGS)}") print(f"šŸ“Š Mapping variations: {len(VOXLINGUA_TO_DATASET)}") # ============================================================================== # Cell 4: Enhanced Parsing Functions (Your Working Code + Improvements) # ============================================================================== def parse_top1(out): """Parse VoxLingua107 output - your exact working function""" logits, log_conf, pred_idx, labels = out label_str = labels[0] if (isinstance(labels, (list, tuple)) and len(labels) > 0) else "unknown" if not isinstance(label_str, str): label_str = str(label_str) colon_pos = label_str.find(":") if colon_pos != -1: iso = label_str[:colon_pos].strip() else: iso = label_str.strip() conf = float(log_conf.exp().item()) return iso, label_str, conf def parse_xlsr_output(out): """Parse XLS-R model output""" try: out_prob, score, index, text_lab = out lang_code = str(text_lab[0]).strip().lower() confidence = float(out_prob.exp().max().item()) return lang_code, confidence except Exception as e: print(f" XLS-R parsing error: {e}") return "unknown", 0.0 def map_to_dataset_language(detected_lang): """Map VoxLingua/XLS-R output to your dataset language codes""" # Direct match first if detected_lang in ALL_SUPPORTED_LANGS: return detected_lang # Check mapping dictionary mapped = VOXLINGUA_TO_DATASET.get(detected_lang.lower(), detected_lang) # If still not in dataset, try transfer mapping if mapped not in ALL_SUPPORTED_LANGS and mapped in TRANSFER_MAPPINGS: transfer_target = TRANSFER_MAPPINGS[mapped] print(f" Transfer mapping: {mapped} → {transfer_target}") return transfer_target return mapped print("āœ… Enhanced parsing functions ready") # ============================================================================== # Cell 5: Hybrid Multi-Model Language Detection # ============================================================================== def hybrid_language_detection(audio_path): """ Multi-model ensemble language detection optimized for your dataset """ print(f" šŸŽ§ Analyzing: {os.path.basename(audio_path)}") predictions = {} confidences = {} # Model 1: VoxLingua107 (Primary - 60% weight since it's your working baseline) if voxlingua_model is not None: try: out = voxlingua_model.classify_file(audio_path) pred_iso, pred_label, conf = parse_top1(out) # Map to dataset language codes mapped_lang = map_to_dataset_language(pred_iso) predictions['voxlingua'] = mapped_lang confidences['voxlingua'] = conf * 0.60 # 60% weight print(f" VoxLingua107: {pred_iso} → {mapped_lang} ({conf:.3f})") except Exception as e: print(f" VoxLingua107 error: {e}") # Model 2: XLS-R (Secondary - 40% weight) if xlsr_lid_model is not None: try: out = xlsr_lid_model.classify_file(audio_path) lang_code, conf = parse_xlsr_output(out) # Map to dataset language codes mapped_lang = map_to_dataset_language(lang_code) predictions['xlsr'] = mapped_lang confidences['xlsr'] = conf * 0.40 # 40% weight print(f" XLS-R: {lang_code} → {mapped_lang} ({conf:.3f})") except Exception as e: print(f" XLS-R error: {e}") # Ensemble Decision Making if not predictions: return "unknown", 0.0 # Strategy 1: Check for agreement between models if len(predictions) >= 2: pred_values = list(predictions.values()) if pred_values[0] == pred_values[1]: # Models agree consensus_lang = pred_values[0] avg_confidence = sum(confidences.values()) / len(confidences) print(f" šŸŽÆ Consensus: {consensus_lang} (confidence: {avg_confidence:.3f})") return consensus_lang, avg_confidence # Strategy 2: Use highest weighted confidence if confidences: best_model = max(confidences.keys(), key=lambda k: confidences[k]) best_lang = predictions[best_model] best_conf = confidences[best_model] / (0.60 if best_model == 'voxlingua' else 0.40) # Normalize print(f" šŸŽÆ Best model ({best_model}): {best_lang} (confidence: {best_conf:.3f})") return best_lang, best_conf return "unknown", 0.0 print("āœ… Hybrid ensemble language detection ready") # ============================================================================== # Cell 6: Complete Ground Truth Extraction for Your Dataset # ============================================================================== def gt_from_filename(path): """Extract ground truth from filename - complete version for your dataset""" name = os.path.basename(path).lower() # Pattern 1: Your working regex pattern GT_TOKEN = re.compile(r'(?:^|[_-])([a-z]{2,4})(?:[_-]|$)', re.IGNORECASE) m = GT_TOKEN.search(name) if m: code = m.group(1).lower() # Complete mapping based on your dataset structure filename_mappings = { # Your working mappings "guf": "gu", "mrt": "mr", "ml": "ml", # Additional mappings for your complete dataset "urd": "ur", "urdu": "ur", "pan": "pa", "punjabi": "pa", "pnb": "pa", "hin": "hi", "hindi": "hi", "ben": "bn", "bengali": "bn", "bng": "bn", "nep": "ne", "nepali": "ne", "asm": "as", "assamese": "as", "kas": "ks", "kashmiri": "ks", "mar": "mr", "marathi": "mr", "guj": "gu", "gujarati": "gu", "ori": "or", "odia": "or", "ory": "or", "tam": "ta", "tamil": "ta", "tel": "te", "telugu": "te", "kan": "kn", "kannada": "kn", "mal": "ml", "malayalam": "ml", "sin": "sd", "sindhi": "sd", "snd": "sd", "kok": "kok", "konkani": "kok", "bod": "br", "bodo": "br", # Bodo variations "dog": "doi", "dogri": "doi", "sat": "sat", "santali": "sat", "mni": "mni", "manipuri": "mni", "san": "sa", "sanskrit": "sa", } mapped_code = filename_mappings.get(code, code) # Validate against your dataset languages if mapped_code in ALL_SUPPORTED_LANGS: return mapped_code # Pattern 2: Check folder structure path_parts = path.split('/') for part in path_parts: part_lower = part.lower() if part_lower in ALL_SUPPORTED_LANGS: return part_lower # Check if it's a language name folder for full_name, code in [('gujarati', 'gu'), ('marathi', 'mr'), ('hindi', 'hi'), ('bengali', 'bn'), ('tamil', 'ta'), ('telugu', 'te'), ('kannada', 'kn'), ('malayalam', 'ml'), ('punjabi', 'pa'), ('urdu', 'ur'), ('assamese', 'as'), ('odia', 'or'), ('nepali', 'ne'), ('kashmiri', 'ks'), ('sindhi', 'sd'), ('konkani', 'kok'), ('bodo', 'br'), ('dogri', 'doi'), ('santali', 'sat'), ('manipuri', 'mni'), ('sanskrit', 'sa')]: if full_name in part_lower: return code return None print("āœ… Complete ground truth extraction ready") # ============================================================================== # Cell 7: Google Drive Processing with Error Handling # ============================================================================== def download_and_process_drive_dataset(): """Download and process with robust error handling""" print("šŸ“ Processing Google Drive dataset...") # Get sharing link share_link = input("šŸ”— Enter Google Drive sharing link: ").strip() if not share_link: print("āŒ No link provided") return [] # Extract file ID def extract_file_id(link): patterns = [r'/folders/([a-zA-Z0-9-_]+)', r'id=([a-zA-Z0-9-_]+)', r'/file/d/([a-zA-Z0-9-_]+)'] for pattern in patterns: match = re.search(pattern, link) if match: return match.group(1) return None file_id = extract_file_id(share_link) if not file_id: print("āŒ Could not extract file ID from sharing link") return [] # Setup download directory download_dir = "/content/drive_dataset" if os.path.exists(download_dir): import shutil shutil.rmtree(download_dir) os.makedirs(download_dir, exist_ok=True) # Download with error handling try: import gdown print(f"šŸ“„ Downloading from Google Drive (ID: {file_id})...") gdown.download_folder(f"https://drive.google.com/drive/folders/{file_id}", output=download_dir, quiet=False, use_cookies=False) print("āœ… Download completed successfully") except Exception as e: print(f"āŒ Download failed: {e}") print("šŸ’” Make sure the folder is shared with 'Anyone with the link can view'") return [] # Scan for audio files VALID_EXTS = {".wav", ".mp3", ".flac", ".m4a", ".ogg"} def is_audio(filepath): return os.path.splitext(filepath)[1].lower() in VALID_EXTS print("šŸ” Scanning for audio files...") all_files = [] for root, dirs, files in os.walk(download_dir): for file in files: if is_audio(file): full_path = os.path.join(root, file) all_files.append(full_path) print(f"šŸ“Š Found {len(all_files)} total audio files") # Filter and limit files filtered_files = [] lang_counts = {} english_skipped = 0 for file_path in all_files: # Skip English files if any(eng_indicator in file_path.lower() for eng_indicator in ['english', '_en_', '/en/', 'eng_', '_eng']): english_skipped += 1 continue # Extract language for limiting gt_lang = gt_from_filename(file_path) if gt_lang: lang_counts[gt_lang] = lang_counts.get(gt_lang, 0) if lang_counts[gt_lang] < 5: # Max 5 per language filtered_files.append(file_path) lang_counts[gt_lang] += 1 else: # Include files without clear language markers (up to overall limit) if len(filtered_files) < 50: filtered_files.append(file_path) print(f"šŸ“Š Filtered results:") print(f" English files skipped: {english_skipped}") print(f" Selected for processing: {len(filtered_files)}") for lang, count in sorted(lang_counts.items()): print(f" {lang}: {count} files") return filtered_files # Execute download and processing test_files = download_and_process_drive_dataset() print(f"\nšŸŽÆ Total files ready for language detection: {len(test_files)}") # ============================================================================== # Cell 8: Execute Language Detection Analysis # ============================================================================== def run_language_detection_analysis(audio_files): """Run complete language detection analysis""" if not audio_files: print("āŒ No audio files to process") return print(f"šŸš€ Starting language detection on {len(audio_files)} files...") print("=" * 60) results = [] for i, audio_path in enumerate(audio_files, 1): print(f"\n[{i}/{len(audio_files)}] Processing: {os.path.basename(audio_path)}") try: # Extract ground truth gt_iso = gt_from_filename(audio_path) # Run hybrid detection pred_iso, confidence = hybrid_language_detection(audio_path) # Determine correctness is_correct = (gt_iso == pred_iso) if gt_iso else None result = { "file": os.path.basename(audio_path), "full_path": audio_path, "gt_iso": gt_iso if gt_iso else "", "pred_iso": pred_iso, "confidence": confidence, "correct": is_correct } results.append(result) # Status display status = "āœ…" if is_correct else "āŒ" if is_correct is False else "ā“" print(f" {status} GT: {gt_iso or 'Unknown'} | Pred: {pred_iso} | Conf: {confidence:.3f}") except Exception as e: print(f" šŸ’„ Error processing file: {e}") results.append({ "file": os.path.basename(audio_path), "full_path": audio_path, "gt_iso": "", "pred_iso": "error", "confidence": 0.0, "correct": False }) return results # Run the analysis analysis_results = run_language_detection_analysis(test_files) print(f"\nšŸŽ‰ Language detection analysis complete!") print(f"šŸ“Š Total results: {len(analysis_results)}") # ============================================================================== # Cell 9: Complete Results Analysis and Accuracy Report # ============================================================================== def generate_comprehensive_analysis(results): """Generate complete analysis matching your dataset format""" df = pd.DataFrame(results) # Filter to files with ground truth from your dataset valid_df = df[(df["gt_iso"] != "") & (df["gt_iso"].isin(ALL_SUPPORTED_LANGS))].copy() if len(valid_df) == 0: print("āŒ No valid ground truth files found") return print("šŸ“Š COMPREHENSIVE LANGUAGE DETECTION ANALYSIS") print("=" * 60) # Overall accuracy overall_acc = accuracy_score(valid_df["gt_iso"], valid_df["pred_iso"]) print(f"šŸŽÆ OVERALL ACCURACY: {overall_acc:.4f} ({overall_acc*100:.1f}%)") # Create accuracy table matching your format print(f"\nšŸ“Š LANGUAGE-WISE ACCURACY:") print("-" * 60) print("Code | Language Name | Files | Top-1 | Top-5 | Conf") print("-" * 60) # Language name mapping LANG_NAMES = { 'ur': 'Urdu', 'pa': 'Punjabi', 'ta': 'Tamil', 'sd': 'Sindhi', 'or': 'Odia', 'ml': 'Malayalam', 'ne': 'Nepali', 'as': 'Assamese', 'hi': 'Hindi', 'bn': 'Bengali', 'kok': 'Konkani', 'kn': 'Kannada', 'ks': 'Kashmiri', 'mr': 'Marathi', 'te': 'Telugu', 'br': 'Bodo', 'doi': 'Dogri', 'sat': 'Santali', 'gu': 'Gujarati', 'mai': 'Maithili', 'mni': 'Manipuri', 'sa': 'Sanskrit' } # Calculate per-language statistics lang_stats = [] for lang_code in sorted(valid_df["gt_iso"].unique()): lang_data = valid_df[valid_df["gt_iso"] == lang_code] total_files = len(lang_data) correct_pred = (lang_data["gt_iso"] == lang_data["pred_iso"]).sum() accuracy = correct_pred / total_files avg_conf = lang_data["confidence"].mean() lang_name = LANG_NAMES.get(lang_code, lang_code.title()) # Format output to match your table print(f"{lang_code:>3s} | {lang_name:<15s} | {total_files:>5d} | {accuracy*100:>5.1f}% | {accuracy*100:>5.1f}% | {avg_conf:>5.3f}") lang_stats.append({ 'code': lang_code, 'name': lang_name, 'files': total_files, 'accuracy': accuracy, 'confidence': avg_conf }) print("-" * 60) # Language family analysis print(f"\nšŸ“Š LANGUAGE FAMILY PERFORMANCE:") print("-" * 40) family_stats = {} for _, row in valid_df.iterrows(): lang = row['gt_iso'] correct = row['correct'] if lang in INDO_ARYAN_LANGS: family = 'Indo-Aryan' elif lang in DRAVIDIAN_LANGS: family = 'Dravidian' elif lang in LOW_RESOURCE_LANGS: family = 'Low-Resource' else: family = 'Other' if family not in family_stats: family_stats[family] = {'correct': 0, 'total': 0} family_stats[family]['total'] += 1 if correct: family_stats[family]['correct'] += 1 for family, stats in family_stats.items(): acc_pct = (stats['correct'] / stats['total']) * 100 print(f"{family:<15s}: {acc_pct:>5.1f}% ({stats['correct']:>2d}/{stats['total']:>2d})") # Model performance analysis print(f"\nšŸ“Š MODEL PERFORMANCE:") print("-" * 30) print(f"Models loaded: {models_loaded}/2") print(f"VoxLingua107: {'āœ… Active' if voxlingua_model else 'āŒ Failed'}") print(f"XLS-R: {'āœ… Active' if xlsr_lid_model else 'āŒ Failed'}") # Error analysis errors = valid_df[valid_df["gt_iso"] != valid_df["pred_iso"]] if len(errors) > 0: print(f"\nāŒ MISCLASSIFICATION ANALYSIS ({len(errors)} errors):") print("-" * 50) # Group errors by actual language for actual_lang in sorted(errors["gt_iso"].unique()): lang_errors = errors[errors["gt_iso"] == actual_lang] predicted_langs = lang_errors["pred_iso"].value_counts() print(f"{actual_lang} ({LANG_NAMES.get(actual_lang, actual_lang)}):") for pred_lang, count in predicted_langs.head(3).items(): print(f" → {pred_lang} ({count} files)") # Summary statistics print(f"\nšŸ“ˆ SUMMARY STATISTICS:") print("-" * 25) print(f"Total files processed: {len(df)}") print(f"Files with valid GT: {len(valid_df)}") print(f"Languages detected: {len(valid_df['pred_iso'].unique())}") print(f"Languages in dataset: {len(valid_df['gt_iso'].unique())}") print(f"Perfect accuracy: {len([l for l in lang_stats if l['accuracy'] == 1.0])}") print(f"Above 90% accuracy: {len([l for l in lang_stats if l['accuracy'] >= 0.9])}") print(f"Below 50% accuracy: {len([l for l in lang_stats if l['accuracy'] < 0.5])}") return valid_df, lang_stats # Run comprehensive analysis if 'analysis_results' in globals() and analysis_results: final_df, language_statistics = generate_comprehensive_analysis(analysis_results) # Save results to CSV if 'final_df' in locals(): timestamp = pd.Timestamp.now().strftime("%Y%m%d_%H%M%S") csv_filename = f"language_detection_results_{timestamp}.csv" final_df.to_csv(csv_filename, index=False) print(f"\nšŸ’¾ Results saved to: {csv_filename}") # Download file try: from google.colab import files print("šŸ“„ File downloaded successfully") except: print("šŸ“ File saved locally (download failed)") else: print("āŒ No analysis results available. Please run the previous cells first.") print(f"\nāœ… COMPLETE LANGUAGE DETECTION ANALYSIS FINISHED!") # ============================================================================== # Independent Model Analysis with Top-5 and Real Confidence Scores # ============================================================================== def analyze_models_independently(audio_files): """Analyze each model independently with Top-5 predictions and real confidence scores""" print("šŸ” INDEPENDENT MODEL ANALYSIS") print("=" * 60) results = { 'voxlingua': [], 'xlsr': [], 'combined_analysis': [] } for i, audio_path in enumerate(audio_files, 1): print(f"\n[{i}/{len(audio_files)}] Analyzing: {os.path.basename(audio_path)}") # Extract ground truth gt_iso = gt_from_filename(audio_path) print(f" Ground Truth: {gt_iso or 'Unknown'}") file_result = { 'file': os.path.basename(audio_path), 'gt_iso': gt_iso or '', 'voxlingua_results': {}, 'xlsr_results': {} } # ======================================== # VoxLingua107 Independent Analysis # ======================================== if voxlingua_model is not None: try: print(f" šŸ”¬ VoxLingua107 Analysis:") out = voxlingua_model.classify_file(audio_path) # Extract Top-5 predictions with real confidence scores logits, log_conf, pred_idx, labels = out # Get top 5 predictions top5_indices = torch.topk(logits.squeeze(), 5).indices top5_probs = torch.softmax(logits.squeeze(), dim=0) vox_top5 = [] for idx in top5_indices: lang_label = labels[idx.item()] if idx.item() < len(labels) else f"idx_{idx.item()}" prob = top5_probs[idx.item()].item() # Extract language code if isinstance(lang_label, str): colon_pos = lang_label.find(":") lang_code = lang_label[:colon_pos].strip() if colon_pos != -1 else lang_label.strip() else: lang_code = str(lang_label) # Map to dataset codes mapped_lang = map_to_dataset_language(lang_code) vox_top5.append({ 'rank': len(vox_top5) + 1, 'original_code': lang_code, 'mapped_code': mapped_lang, 'confidence': prob, 'in_dataset': mapped_lang in ALL_SUPPORTED_LANGS }) print(f" Rank {len(vox_top5)}: {lang_code} → {mapped_lang} ({prob:.4f}) {'āœ…' if mapped_lang in ALL_SUPPORTED_LANGS else 'āŒ'}") # Store VoxLingua results file_result['voxlingua_results'] = { 'top5': vox_top5, 'top1_original': vox_top5[0]['original_code'], 'top1_mapped': vox_top5[0]['mapped_code'], 'top1_confidence': vox_top5[0]['confidence'], 'correct_in_top1': gt_iso == vox_top5[0]['mapped_code'] if gt_iso else None, 'correct_in_top5': any(pred['mapped_code'] == gt_iso for pred in vox_top5) if gt_iso else None } results['voxlingua'].append({ 'file': os.path.basename(audio_path), 'gt_iso': gt_iso or '', 'pred_iso': vox_top5[0]['mapped_code'], 'confidence': vox_top5[0]['confidence'], 'correct': gt_iso == vox_top5[0]['mapped_code'] if gt_iso else None, 'top5_predictions': [p['mapped_code'] for p in vox_top5] }) except Exception as e: print(f" āŒ VoxLingua107 error: {e}") file_result['voxlingua_results'] = {'error': str(e)} # ======================================== # XLS-R Independent Analysis # ======================================== if xlsr_lid_model is not None: try: print(f" šŸ”¬ XLS-R Analysis:") out = xlsr_lid_model.classify_file(audio_path) # Parse XLS-R output for Top-5 out_prob, score, index, text_lab = out # Get top 5 predictions top5_indices = torch.topk(out_prob.squeeze(), 5).indices top5_probs = torch.softmax(out_prob.squeeze(), dim=0) xlsr_top5 = [] for idx in top5_indices: lang_label = text_lab[idx.item()] if idx.item() < len(text_lab) else f"idx_{idx.item()}" prob = top5_probs[idx.item()].item() lang_code = str(lang_label).strip().lower() mapped_lang = map_to_dataset_language(lang_code) xlsr_top5.append({ 'rank': len(xlsr_top5) + 1, 'original_code': lang_code, 'mapped_code': mapped_lang, 'confidence': prob, 'in_dataset': mapped_lang in ALL_SUPPORTED_LANGS }) print(f" Rank {len(xlsr_top5)}: {lang_code} → {mapped_lang} ({prob:.4f}) {'āœ…' if mapped_lang in ALL_SUPPORTED_LANGS else 'āŒ'}") # Store XLS-R results file_result['xlsr_results'] = { 'top5': xlsr_top5, 'top1_original': xlsr_top5[0]['original_code'], 'top1_mapped': xlsr_top5[0]['mapped_code'], 'top1_confidence': xlsr_top5[0]['confidence'], 'correct_in_top1': gt_iso == xlsr_top5[0]['mapped_code'] if gt_iso else None, 'correct_in_top5': any(pred['mapped_code'] == gt_iso for pred in xlsr_top5) if gt_iso else None } results['xlsr'].append({ 'file': os.path.basename(audio_path), 'gt_iso': gt_iso or '', 'pred_iso': xlsr_top5[0]['mapped_code'], 'confidence': xlsr_top5[0]['confidence'], 'correct': gt_iso == xlsr_top5[0]['mapped_code'] if gt_iso else None, 'top5_predictions': [p['mapped_code'] for p in xlsr_top5] }) except Exception as e: print(f" āŒ XLS-R error: {e}") file_result['xlsr_results'] = {'error': str(e)} results['combined_analysis'].append(file_result) print(f" āœ… Analysis complete for {os.path.basename(audio_path)}") return results def generate_independent_model_report(results): """Generate comprehensive independent model analysis report""" print(f"\nšŸ“Š INDEPENDENT MODEL PERFORMANCE ANALYSIS") print("=" * 70) # VoxLingua107 Analysis if results['voxlingua']: vox_df = pd.DataFrame(results['voxlingua']) valid_vox = vox_df[vox_df['gt_iso'] != ''].copy() if len(valid_vox) > 0: vox_acc = accuracy_score(valid_vox['gt_iso'], valid_vox['pred_iso']) vox_conf_avg = valid_vox['confidence'].mean() vox_conf_std = valid_vox['confidence'].std() print(f"\nšŸ”¬ VoxLingua107 INDEPENDENT ANALYSIS:") print(f" Files analyzed: {len(valid_vox)}") print(f" Top-1 Accuracy: {vox_acc:.4f} ({vox_acc*100:.1f}%)") print(f" Avg Confidence: {vox_conf_avg:.4f} ± {vox_conf_std:.4f}") # Per-language accuracy for VoxLingua print(f" Per-language performance:") vox_per_lang = valid_vox.groupby('gt_iso').agg({ 'correct': 'mean', 'confidence': ['mean', 'count'] }).round(4) vox_per_lang.columns = ['accuracy', 'avg_conf', 'count'] for lang, row in vox_per_lang.iterrows(): print(f" {lang}: {row['accuracy']:.3f} ({row['accuracy']*100:.1f}%) - {row['avg_conf']:.3f} conf - {int(row['count'])} files") # XLS-R Analysis if results['xlsr']: xlsr_df = pd.DataFrame(results['xlsr']) valid_xlsr = xlsr_df[xlsr_df['gt_iso'] != ''].copy() if len(valid_xlsr) > 0: xlsr_acc = accuracy_score(valid_xlsr['gt_iso'], valid_xlsr['pred_iso']) xlsr_conf_avg = valid_xlsr['confidence'].mean() xlsr_conf_std = valid_xlsr['confidence'].std() print(f"\nšŸ”¬ XLS-R INDEPENDENT ANALYSIS:") print(f" Files analyzed: {len(valid_xlsr)}") print(f" Top-1 Accuracy: {xlsr_acc:.4f} ({xlsr_acc*100:.1f}%)") print(f" Avg Confidence: {xlsr_conf_avg:.4f} ± {xlsr_conf_std:.4f}") # Per-language accuracy for XLS-R print(f" Per-language performance:") xlsr_per_lang = valid_xlsr.groupby('gt_iso').agg({ 'correct': 'mean', 'confidence': ['mean', 'count'] }).round(4) xlsr_per_lang.columns = ['accuracy', 'avg_conf', 'count'] for lang, row in xlsr_per_lang.iterrows(): print(f" {lang}: {row['accuracy']:.3f} ({row['accuracy']*100:.1f}%) - {row['avg_conf']:.3f} conf - {int(row['count'])} files") # Model Comparison if results['voxlingua'] and results['xlsr']: print(f"\nāš–ļø MODEL COMPARISON:") print(f" VoxLingua107 vs XLS-R:") print(f" Accuracy: {vox_acc:.4f} vs {xlsr_acc:.4f} ({'VoxLingua wins' if vox_acc > xlsr_acc else 'XLS-R wins' if xlsr_acc > vox_acc else 'Tie'})") print(f" Avg Confidence: {vox_conf_avg:.4f} vs {xlsr_conf_avg:.4f}") # Suggest optimal weights total_perf = vox_acc + xlsr_acc vox_weight = vox_acc / total_perf if total_perf > 0 else 0.5 xlsr_weight = xlsr_acc / total_perf if total_perf > 0 else 0.5 print(f"\nšŸ’” SUGGESTED OPTIMAL WEIGHTS:") print(f" VoxLingua107: {vox_weight:.2f} ({vox_weight*100:.0f}%)") print(f" XLS-R: {xlsr_weight:.2f} ({xlsr_weight*100:.0f}%)") return results # Run independent analysis if 'test_files' in globals() and test_files: independent_results = analyze_models_independently(test_files[:10]) # Limit to first 10 for testing final_report = generate_independent_model_report(independent_results) else: print("āŒ No test files available. Run the previous cells first.") # ============================================================================== # Analyze Already Downloaded Files in /content/drive_dataset/ # ============================================================================== def scan_downloaded_files(): """Scan and collect already downloaded audio files""" download_dir = "/content/drive_dataset" if not os.path.exists(download_dir): print("āŒ Download directory not found") return [] print(f"šŸ” Scanning {download_dir} for audio files...") # Valid audio extensions VALID_EXTS = {".wav", ".mp3", ".flac", ".m4a", ".ogg"} def is_audio(filepath): return os.path.splitext(filepath)[1].lower() in VALID_EXTS # Collect all audio files audio_files = [] lang_counts = {} for root, dirs, files in os.walk(download_dir): for file in files: if is_audio(file): full_path = os.path.join(root, file) audio_files.append(full_path) # Extract language from folder structure path_parts = root.split('/') for part in path_parts: if len(part) in [2, 3] and part.isalpha(): lang_counts[part] = lang_counts.get(part, 0) + 1 break print(f"šŸ“Š Found {len(audio_files)} audio files:") for lang, count in sorted(lang_counts.items()): print(f" {lang}: {count} files") # Show sample files print(f"\nšŸ“ Sample files:") for file_path in audio_files[:5]: print(f" {file_path}") return audio_files # Scan for downloaded files downloaded_files = scan_downloaded_files() if not downloaded_files: print("āŒ No audio files found. Let me help you collect them manually.") # Manual file collection if scan fails print("\nšŸ” Manual file search...") import glob # Search patterns for common locations search_patterns = [ "/content/drive_dataset/**/*.flac", "/content/drive_dataset/**/*.wav", "/content/drive_dataset/**/*.mp3", "/content/**/*.flac", "/content/**/*.wav", "/content/**/*.mp3" ] manual_files = [] for pattern in search_patterns: found = glob.glob(pattern, recursive=True) manual_files.extend(found) # Remove duplicates manual_files = list(set(manual_files)) print(f"šŸ“Š Manual search found: {len(manual_files)} files") for file_path in manual_files[:10]: # Show first 10 print(f" {file_path}") downloaded_files = manual_files print(f"\nšŸŽÆ Total files ready for analysis: {len(downloaded_files)}") # ============================================================================== # Run Independent Analysis on Downloaded Files # ============================================================================== def analyze_downloaded_files_independently(audio_files): """Run independent model analysis on downloaded files with detailed output""" if not audio_files: print("āŒ No audio files to analyze") return None print(f"šŸš€ Starting independent analysis on {len(audio_files)} files...") print("=" * 70) results = { 'voxlingua_detailed': [], 'xlsr_detailed': [], 'comparison_data': [] } for i, audio_path in enumerate(audio_files, 1): print(f"\n[{i}/{len(audio_files)}] šŸŽµ {os.path.basename(audio_path)}") # Extract ground truth from path/filename gt_iso = gt_from_filename(audio_path) print(f" šŸ“ Ground Truth: {gt_iso or 'Unknown'}") file_analysis = { 'file': os.path.basename(audio_path), 'full_path': audio_path, 'gt_iso': gt_iso or '', 'voxlingua': {'available': False}, 'xlsr': {'available': False} } # ========================================== # VoxLingua107 Independent Analysis # ========================================== if voxlingua_model is not None: try: print(f" šŸ”¬ VoxLingua107 Analysis:") out = voxlingua_model.classify_file(audio_path) logits, log_conf, pred_idx, labels = out # Get real confidence scores (not weighted) probs = torch.softmax(logits.squeeze(), dim=0) top5_indices = torch.topk(probs, min(5, len(probs))).indices vox_predictions = [] for rank, idx in enumerate(top5_indices, 1): lang_label = labels[idx.item()] confidence = probs[idx.item()].item() # Parse language code if isinstance(lang_label, str): colon_pos = lang_label.find(":") lang_code = lang_label[:colon_pos].strip() if colon_pos != -1 else lang_label.strip() else: lang_code = str(lang_label) # Map to dataset language mapped_lang = map_to_dataset_language(lang_code) vox_predictions.append({ 'rank': rank, 'original': lang_code, 'mapped': mapped_lang, 'confidence': confidence, 'in_dataset': mapped_lang in ALL_SUPPORTED_LANGS }) status = "āœ…" if mapped_lang in ALL_SUPPORTED_LANGS else "āŒ" print(f" #{rank}: {lang_code} → {mapped_lang} ({confidence:.4f}) {status}") # Store VoxLingua results top1 = vox_predictions[0] file_analysis['voxlingua'] = { 'available': True, 'top5_predictions': vox_predictions, 'top1_prediction': top1['mapped'], 'top1_confidence': top1['confidence'], 'correct_top1': gt_iso == top1['mapped'] if gt_iso else None, 'correct_in_top5': any(p['mapped'] == gt_iso for p in vox_predictions) if gt_iso else None } results['voxlingua_detailed'].append({ 'file': os.path.basename(audio_path), 'gt_iso': gt_iso or '', 'pred_iso': top1['mapped'], 'confidence': top1['confidence'], 'correct': gt_iso == top1['mapped'] if gt_iso else None }) except Exception as e: print(f" āŒ VoxLingua107 error: {e}") file_analysis['voxlingua'] = {'available': False, 'error': str(e)} # ========================================== # XLS-R Independent Analysis # ========================================== if xlsr_lid_model is not None: try: print(f" šŸ”¬ XLS-R Analysis:") out = xlsr_lid_model.classify_file(audio_path) out_prob, score, index, text_lab = out # Get real confidence scores probs = torch.softmax(out_prob.squeeze(), dim=0) top5_indices = torch.topk(probs, min(5, len(probs))).indices xlsr_predictions = [] for rank, idx in enumerate(top5_indices, 1): lang_label = text_lab[idx.item()] confidence = probs[idx.item()].item() lang_code = str(lang_label).strip().lower() mapped_lang = map_to_dataset_language(lang_code) xlsr_predictions.append({ 'rank': rank, 'original': lang_code, 'mapped': mapped_lang, 'confidence': confidence, 'in_dataset': mapped_lang in ALL_SUPPORTED_LANGS }) status = "āœ…" if mapped_lang in ALL_SUPPORTED_LANGS else "āŒ" print(f" #{rank}: {lang_code} → {mapped_lang} ({confidence:.4f}) {status}") # Store XLS-R results top1 = xlsr_predictions[0] file_analysis['xlsr'] = { 'available': True, 'top5_predictions': xlsr_predictions, 'top1_prediction': top1['mapped'], 'top1_confidence': top1['confidence'], 'correct_top1': gt_iso == top1['mapped'] if gt_iso else None, 'correct_in_top5': any(p['mapped'] == gt_iso for p in xlsr_predictions) if gt_iso else None } results['xlsr_detailed'].append({ 'file': os.path.basename(audio_path), 'gt_iso': gt_iso or '', 'pred_iso': top1['mapped'], 'confidence': top1['confidence'], 'correct': gt_iso == top1['mapped'] if gt_iso else None }) except Exception as e: print(f" āŒ XLS-R error: {e}") file_analysis['xlsr'] = {'available': False, 'error': str(e)} results['comparison_data'].append(file_analysis) print(f" āœ… Analysis complete\n") return results # Run the independent analysis if downloaded_files: print("šŸ”¬ Running independent model analysis...") analysis_results = analyze_downloaded_files_independently(downloaded_files) else: print("āŒ No files found for analysis") analysis_results = None # ============================================================================== # FIXED: Robust VoxLingua107 Analysis with Better Error Handling # ============================================================================== def parse_voxlingua_output_robust(out): """Robust parsing of VoxLingua107 output with multiple fallback methods""" try: # Method 1: Standard SpeechBrain output format if isinstance(out, (tuple, list)) and len(out) >= 4: logits, log_conf, pred_idx, labels = out[:4] # Validate components if hasattr(logits, 'squeeze') and hasattr(labels, '__getitem__'): return logits, log_conf, pred_idx, labels, "standard" # Method 2: Alternative format (sometimes returns dict) if isinstance(out, dict): logits = out.get('predictions', out.get('logits')) labels = out.get('labels', out.get('text_lab')) log_conf = out.get('log_probabilities', out.get('log_conf')) pred_idx = out.get('predicted_ids', out.get('pred_idx')) if all(v is not None for v in [logits, labels]): return logits, log_conf, pred_idx, labels, "dict" # Method 3: Direct tensor output if hasattr(out, 'squeeze'): # Direct logits tensor logits = out # Create dummy labels based on logits size labels = [f"lang_{i}" for i in range(logits.shape[-1])] log_conf = torch.log_softmax(logits, dim=-1).max() pred_idx = torch.argmax(logits, dim=-1) return logits, log_conf, pred_idx, labels, "tensor" except Exception as e: print(f" Parse error: {e}") return None, None, None, None, "failed" def analyze_voxlingua_robust(audio_path): """Robust VoxLingua107 analysis with multiple parsing methods""" if voxlingua_model is None: return None try: # Get raw output from model raw_out = voxlingua_model.classify_file(audio_path) # Parse with robust method logits, log_conf, pred_idx, labels, parse_method = parse_voxlingua_output_robust(raw_out) if logits is None: print(f" āŒ Could not parse VoxLingua output format") return None print(f" šŸ“Š Parse method: {parse_method}") # Get predictions based on available data if hasattr(logits, 'squeeze'): probs = torch.softmax(logits.squeeze(), dim=-1 if len(logits.squeeze().shape) > 0 else 0) # Handle different tensor shapes if len(probs.shape) == 0: # Scalar top_indices = torch.tensor([0]) top_probs = probs.unsqueeze(0) else: # Vector k = min(5, len(probs)) top_probs, top_indices = torch.topk(probs, k) else: print(f" āŒ Logits not in expected tensor format") return None predictions = [] for rank, (idx, prob) in enumerate(zip(top_indices, top_probs), 1): idx_val = idx.item() if hasattr(idx, 'item') else int(idx) prob_val = prob.item() if hasattr(prob, 'item') else float(prob) # Get language label safely if idx_val < len(labels): lang_label = labels[idx_val] else: lang_label = f"unknown_{idx_val}" # Parse language code if isinstance(lang_label, str): colon_pos = lang_label.find(":") lang_code = lang_label[:colon_pos].strip() if colon_pos != -1 else lang_label.strip() else: lang_code = str(lang_label) # Map to dataset language mapped_lang = map_to_dataset_language(lang_code) predictions.append({ 'rank': rank, 'original': lang_code, 'mapped': mapped_lang, 'confidence': prob_val, 'in_dataset': mapped_lang in ALL_SUPPORTED_LANGS }) status = "āœ…" if mapped_lang in ALL_SUPPORTED_LANGS else "āŒ" print(f" #{rank}: {lang_code} → {mapped_lang} ({prob_val:.4f}) {status}") return predictions except Exception as e: print(f" āŒ VoxLingua analysis error: {e}") print(f" āŒ Error type: {type(e).__name__}") return None def analyze_xlsr_robust(audio_path): """Robust XLS-R analysis""" if xlsr_lid_model is None: return None try: raw_out = xlsr_lid_model.classify_file(audio_path) # Handle different XLS-R output formats if isinstance(raw_out, (tuple, list)) and len(raw_out) >= 4: out_prob, score, index, text_lab = raw_out[:4] else: print(f" āŒ XLS-R output format not recognized") return None # Get top predictions if hasattr(out_prob, 'squeeze'): probs = torch.softmax(out_prob.squeeze(), dim=-1 if len(out_prob.squeeze().shape) > 0 else 0) if len(probs.shape) == 0: # Scalar top_indices = torch.tensor([0]) top_probs = probs.unsqueeze(0) else: # Vector k = min(5, len(probs)) top_probs, top_indices = torch.topk(probs, k) else: print(f" āŒ XLS-R probabilities not in expected format") return None predictions = [] for rank, (idx, prob) in enumerate(zip(top_indices, top_probs), 1): idx_val = idx.item() if hasattr(idx, 'item') else int(idx) prob_val = prob.item() if hasattr(prob, 'item') else float(prob) # Get language label if idx_val < len(text_lab): lang_label = text_lab[idx_val] else: lang_label = f"unknown_{idx_val}" lang_code = str(lang_label).strip().lower() mapped_lang = map_to_dataset_language(lang_code) predictions.append({ 'rank': rank, 'original': lang_code, 'mapped': mapped_lang, 'confidence': prob_val, 'in_dataset': mapped_lang in ALL_SUPPORTED_LANGS }) status = "āœ…" if mapped_lang in ALL_SUPPORTED_LANGS else "āŒ" print(f" #{rank}: {lang_code} → {mapped_lang} ({prob_val:.4f}) {status}") return predictions except Exception as e: print(f" āŒ XLS-R analysis error: {e}") return None # ============================================================================== # UPDATED: Robust Analysis Function # ============================================================================== def analyze_downloaded_files_robust(audio_files): """Robust analysis with better error handling""" if not audio_files: print("āŒ No audio files to analyze") return None print(f"šŸš€ Starting ROBUST analysis on {len(audio_files)} files...") print("=" * 70) results = { 'voxlingua_detailed': [], 'xlsr_detailed': [], 'comparison_data': [] } for i, audio_path in enumerate(audio_files, 1): print(f"\n[{i}/{len(audio_files)}] šŸŽµ {os.path.basename(audio_path)}") # Extract ground truth gt_iso = gt_from_filename(audio_path) print(f" šŸ“ Ground Truth: {gt_iso or 'Unknown'}") file_analysis = { 'file': os.path.basename(audio_path), 'full_path': audio_path, 'gt_iso': gt_iso or '', 'voxlingua': {'available': False}, 'xlsr': {'available': False} } # VoxLingua107 Analysis print(f" šŸ”¬ VoxLingua107 Analysis:") vox_predictions = analyze_voxlingua_robust(audio_path) if vox_predictions: top1 = vox_predictions[0] file_analysis['voxlingua'] = { 'available': True, 'top5_predictions': vox_predictions, 'top1_prediction': top1['mapped'], 'top1_confidence': top1['confidence'], 'correct_top1': gt_iso == top1['mapped'] if gt_iso else None, 'correct_in_top5': any(p['mapped'] == gt_iso for p in vox_predictions) if gt_iso else None } results['voxlingua_detailed'].append({ 'file': os.path.basename(audio_path), 'gt_iso': gt_iso or '', 'pred_iso': top1['mapped'], 'confidence': top1['confidence'], 'correct': gt_iso == top1['mapped'] if gt_iso else None }) else: file_analysis['voxlingua'] = {'available': False, 'error': 'Analysis failed'} # XLS-R Analysis print(f" šŸ”¬ XLS-R Analysis:") xlsr_predictions = analyze_xlsr_robust(audio_path) if xlsr_predictions: top1 = xlsr_predictions[0] file_analysis['xlsr'] = { 'available': True, 'top5_predictions': xlsr_predictions, 'top1_prediction': top1['mapped'], 'top1_confidence': top1['confidence'], 'correct_top1': gt_iso == top1['mapped'] if gt_iso else None, 'correct_in_top5': any(p['mapped'] == gt_iso for p in xlsr_predictions) if gt_iso else None } results['xlsr_detailed'].append({ 'file': os.path.basename(audio_path), 'gt_iso': gt_iso or '', 'pred_iso': top1['mapped'], 'confidence': top1['confidence'], 'correct': gt_iso == top1['mapped'] if gt_iso else None }) else: file_analysis['xlsr'] = {'available': False, 'error': 'Analysis failed'} results['comparison_data'].append(file_analysis) print(f" āœ… Analysis complete") return results # Run the robust analysis if 'downloaded_files' in globals() and downloaded_files: print("šŸ”¬ Running ROBUST independent model analysis...") robust_analysis_results = analyze_downloaded_files_robust(downloaded_files) # Generate report if robust_analysis_results: generate_detailed_performance_report(robust_analysis_results) print(f"\nāœ… ROBUST ANALYSIS COMPLETE!") else: print("āŒ Robust analysis failed") else: print("āŒ No downloaded files found. Please run the file scanning code first.") # ============================================================================== # COMPLETE FIX: VoxLingua Label Mapping + Missing Function # ============================================================================== # First, let's create a proper VoxLingua language mapping VOXLINGUA_LANGUAGE_MAP = { 0: 'ab', 1: 'af', 2: 'ak', 3: 'am', 4: 'ar', 5: 'as', 6: 'az', 7: 'be', 8: 'bg', 9: 'bn', 10: 'bo', 11: 'br', 12: 'bs', 13: 'ca', 14: 'ce', 15: 'co', 16: 'cs', 17: 'cv', 18: 'cy', 19: 'da', 20: 'de', 21: 'dv', 22: 'dz', 23: 'ee', 24: 'el', 25: 'en', 26: 'eo', 27: 'es', 28: 'et', 29: 'eu', 30: 'fa', 31: 'ff', 32: 'fi', 33: 'fo', 34: 'fr', 35: 'fy', 36: 'ga', 37: 'gd', 38: 'gl', 39: 'gn', 40: 'gu', 41: 'gv', 42: 'ha', 43: 'haw', 44: 'he', 45: 'hi', 46: 'hr', 47: 'ht', 48: 'hu', 49: 'hy', 50: 'ia', 51: 'id', 52: 'ie', 53: 'ig', 54: 'ii', 55: 'ik', 56: 'io', 57: 'is', 58: 'it', 59: 'iu', 60: 'ja', 61: 'jv', 62: 'ka', 63: 'kk', 64: 'kl', 65: 'km', 66: 'kn', 67: 'ko', 68: 'ks', 69: 'ku', 70: 'kw', 71: 'ky', 72: 'la', 73: 'lb', 74: 'lg', 75: 'li', 76: 'ln', 77: 'lo', 78: 'lt', 79: 'lv', 80: 'mg', 81: 'mi', 82: 'mk', 83: 'ml', 84: 'mn', 85: 'mr', 86: 'ms', 87: 'mt', 88: 'my', 89: 'na', 90: 'nb', 91: 'nd', 92: 'ne', 93: 'ng', 94: 'nl', 95: 'nn', 96: 'no', 97: 'nv', 98: 'ny', 99: 'oc', 100: 'of', 101: 'om', 102: 'or', 103: 'os', 104: 'pa', 105: 'pi', 106: 'pl', 107: 'ps' } def get_voxlingua_language_by_index(idx): """Map VoxLingua index to language code""" return VOXLINGUA_LANGUAGE_MAP.get(idx, f'unknown_{idx}') def analyze_voxlingua_fixed(audio_path): """Fixed VoxLingua107 analysis with proper language mapping""" if voxlingua_model is None: return None try: raw_out = voxlingua_model.classify_file(audio_path) if not isinstance(raw_out, (tuple, list)) or len(raw_out) < 4: print(f" āŒ Unexpected VoxLingua output format") return None logits, log_conf, pred_idx, labels = raw_out[:4] # Get probabilities and top 5 probs = torch.softmax(logits.squeeze(), dim=-1) k = min(5, len(probs)) top_probs, top_indices = torch.topk(probs, k) predictions = [] for rank, (idx, prob) in enumerate(zip(top_indices, top_probs), 1): idx_val = idx.item() if hasattr(idx, 'item') else int(idx) prob_val = prob.item() if hasattr(prob, 'item') else float(prob) # Method 1: Try to use provided labels if idx_val < len(labels) and not str(labels[idx_val]).startswith('unknown'): lang_label = labels[idx_val] if isinstance(lang_label, str): colon_pos = lang_label.find(":") lang_code = lang_label[:colon_pos].strip() if colon_pos != -1 else lang_label.strip() else: lang_code = str(lang_label) else: # Method 2: Use our language mapping lang_code = get_voxlingua_language_by_index(idx_val) # Map to dataset language mapped_lang = map_to_dataset_language(lang_code) predictions.append({ 'rank': rank, 'original': lang_code, 'mapped': mapped_lang, 'confidence': prob_val, 'in_dataset': mapped_lang in ALL_SUPPORTED_LANGS, 'index': idx_val }) status = "āœ…" if mapped_lang in ALL_SUPPORTED_LANGS else "āŒ" print(f" #{rank}: {lang_code} → {mapped_lang} ({prob_val:.4f}) {status} [idx:{idx_val}]") return predictions except Exception as e: print(f" āŒ VoxLingua analysis error: {e}") return None def analyze_xlsr_fixed(audio_path): """Fixed XLS-R analysis""" if xlsr_lid_model is None: print(f" āŒ XLS-R model not loaded") return None try: raw_out = xlsr_lid_model.classify_file(audio_path) if not isinstance(raw_out, (tuple, list)) or len(raw_out) < 4: print(f" āŒ Unexpected XLS-R output format") return None out_prob, score, index, text_lab = raw_out[:4] # Get probabilities and top 5 probs = torch.softmax(out_prob.squeeze(), dim=-1) k = min(5, len(probs)) top_probs, top_indices = torch.topk(probs, k) predictions = [] for rank, (idx, prob) in enumerate(zip(top_indices, top_probs), 1): idx_val = idx.item() if hasattr(idx, 'item') else int(idx) prob_val = prob.item() if hasattr(prob, 'item') else float(prob) # Get language label if idx_val < len(text_lab): lang_label = text_lab[idx_val] lang_code = str(lang_label).strip().lower() else: lang_code = f"xlsr_unknown_{idx_val}" mapped_lang = map_to_dataset_language(lang_code) predictions.append({ 'rank': rank, 'original': lang_code, 'mapped': mapped_lang, 'confidence': prob_val, 'in_dataset': mapped_lang in ALL_SUPPORTED_LANGS }) status = "āœ…" if mapped_lang in ALL_SUPPORTED_LANGS else "āŒ" print(f" #{rank}: {lang_code} → {mapped_lang} ({prob_val:.4f}) {status}") return predictions except Exception as e: print(f" āŒ XLS-R analysis error: {e}") return None def generate_detailed_performance_report(results): """Complete performance analysis report function""" if not results: print("āŒ No results to analyze") return print("\nšŸ“Š DETAILED INDEPENDENT MODEL PERFORMANCE REPORT") print("=" * 70) # VoxLingua107 Performance Analysis if results['voxlingua_detailed']: vox_df = pd.DataFrame(results['voxlingua_detailed']) valid_vox = vox_df[vox_df['gt_iso'] != ''].copy() print(f"\nšŸ”¬ VOXLINGUA107 PERFORMANCE:") print("-" * 40) if len(valid_vox) > 0: vox_acc = (valid_vox['correct'] == True).mean() vox_conf_mean = valid_vox['confidence'].mean() vox_conf_std = valid_vox['confidence'].std() print(f"Files Analyzed: {len(valid_vox)}") print(f"Top-1 Accuracy: {vox_acc:.4f} ({vox_acc*100:.1f}%)") print(f"Confidence: {vox_conf_mean:.4f} ± {vox_conf_std:.4f}") # Per-language breakdown print(f"\nPer-Language Performance:") for lang in sorted(valid_vox['gt_iso'].unique()): lang_data = valid_vox[valid_vox['gt_iso'] == lang] acc = (lang_data['correct'] == True).mean() conf_mean = lang_data['confidence'].mean() count = len(lang_data) print(f" {lang:>3}: {acc:.3f} ({acc*100:5.1f}%) | Conf: {conf_mean:.3f} | n={count}") else: print("No valid VoxLingua results") # XLS-R Performance Analysis if results['xlsr_detailed']: xlsr_df = pd.DataFrame(results['xlsr_detailed']) valid_xlsr = xlsr_df[xlsr_df['gt_iso'] != ''].copy() print(f"\nšŸ”¬ XLS-R PERFORMANCE:") print("-" * 40) if len(valid_xlsr) > 0: xlsr_acc = (valid_xlsr['correct'] == True).mean() xlsr_conf_mean = valid_xlsr['confidence'].mean() xlsr_conf_std = valid_xlsr['confidence'].std() print(f"Files Analyzed: {len(valid_xlsr)}") print(f"Top-1 Accuracy: {xlsr_acc:.4f} ({xlsr_acc*100:.1f}%)") print(f"Confidence: {xlsr_conf_mean:.4f} ± {xlsr_conf_std:.4f}") # Per-language breakdown print(f"\nPer-Language Performance:") for lang in sorted(valid_xlsr['gt_iso'].unique()): lang_data = valid_xlsr[valid_xlsr['gt_iso'] == lang] acc = (lang_data['correct'] == True).mean() conf_mean = lang_data['confidence'].mean() count = len(lang_data) print(f" {lang:>3}: {acc:.3f} ({acc*100:5.1f}%) | Conf: {conf_mean:.3f} | n={count}") else: print("No valid XLS-R results") # Model Comparison if results['voxlingua_detailed'] and results['xlsr_detailed']: print(f"\nāš–ļø MODEL COMPARISON:") print("-" * 30) print(f"VoxLingua107: {vox_acc:.4f} accuracy") print(f"XLS-R: {xlsr_acc:.4f} accuracy") # Calculate optimal weights total_acc = vox_acc + xlsr_acc if total_acc > 0: vox_weight = vox_acc / total_acc xlsr_weight = xlsr_acc / total_acc print(f"\nšŸ’” RECOMMENDED WEIGHTS:") print(f"VoxLingua107: {vox_weight:.3f} ({vox_weight*100:.1f}%)") print(f"XLS-R: {xlsr_weight:.3f} ({xlsr_weight*100:.1f}%)") # Calculate agreement vox_preds = set(vox_df['pred_iso'].tolist()) xlsr_preds = set(xlsr_df['pred_iso'].tolist()) common_preds = vox_preds.intersection(xlsr_preds) print(f"\nModel Agreement Analysis:") print(f"Common predictions: {len(common_preds)}") print(f"VoxLingua unique: {len(vox_preds - xlsr_preds)}") print(f"XLS-R unique: {len(xlsr_preds - vox_preds)}") # Save results timestamp = pd.Timestamp.now().strftime("%Y%m%d_%H%M%S") if results['voxlingua_detailed']: vox_csv = f"voxlingua_fixed_results_{timestamp}.csv" pd.DataFrame(results['voxlingua_detailed']).to_csv(vox_csv, index=False) print(f"\nšŸ’¾ VoxLingua results: {vox_csv}") if results['xlsr_detailed']: xlsr_csv = f"xlsr_fixed_results_{timestamp}.csv" pd.DataFrame(results['xlsr_detailed']).to_csv(xlsr_csv, index=False) print(f"šŸ’¾ XLS-R results: {xlsr_csv}") def run_complete_fixed_analysis(audio_files): """Run complete analysis with all fixes""" if not audio_files: print("āŒ No audio files to analyze") return None print(f"šŸš€ Starting COMPLETE FIXED analysis on {len(audio_files)} files...") print("=" * 70) results = { 'voxlingua_detailed': [], 'xlsr_detailed': [], 'comparison_data': [] } for i, audio_path in enumerate(audio_files, 1): print(f"\n[{i}/{len(audio_files)}] šŸŽµ {os.path.basename(audio_path)}") # Extract ground truth gt_iso = gt_from_filename(audio_path) print(f" šŸ“ Ground Truth: {gt_iso or 'Unknown'}") file_analysis = { 'file': os.path.basename(audio_path), 'full_path': audio_path, 'gt_iso': gt_iso or '', 'voxlingua': {'available': False}, 'xlsr': {'available': False} } # VoxLingua107 Analysis print(f" šŸ”¬ VoxLingua107 Analysis:") vox_predictions = analyze_voxlingua_fixed(audio_path) if vox_predictions and len(vox_predictions) > 0: top1 = vox_predictions[0] file_analysis['voxlingua'] = { 'available': True, 'top5_predictions': vox_predictions, 'top1_prediction': top1['mapped'], 'top1_confidence': top1['confidence'], 'correct_top1': gt_iso == top1['mapped'] if gt_iso else None, } results['voxlingua_detailed'].append({ 'file': os.path.basename(audio_path), 'gt_iso': gt_iso or '', 'pred_iso': top1['mapped'], 'confidence': top1['confidence'], 'correct': gt_iso == top1['mapped'] if gt_iso else None }) # XLS-R Analysis print(f" šŸ”¬ XLS-R Analysis:") xlsr_predictions = analyze_xlsr_fixed(audio_path) if xlsr_predictions and len(xlsr_predictions) > 0: top1 = xlsr_predictions[0] file_analysis['xlsr'] = { 'available': True, 'top5_predictions': xlsr_predictions, 'top1_prediction': top1['mapped'], 'top1_confidence': top1['confidence'], 'correct_top1': gt_iso == top1['mapped'] if gt_iso else None, } results['xlsr_detailed'].append({ 'file': os.path.basename(audio_path), 'gt_iso': gt_iso or '', 'pred_iso': top1['mapped'], 'confidence': top1['confidence'], 'correct': gt_iso == top1['mapped'] if gt_iso else None }) results['comparison_data'].append(file_analysis) print(f" āœ… Analysis complete") return results # Run the complete fixed analysis if 'downloaded_files' in globals() and downloaded_files: print("šŸ”¬ Running COMPLETE FIXED analysis...") final_analysis_results = run_complete_fixed_analysis(downloaded_files) if final_analysis_results: generate_detailed_performance_report(final_analysis_results) print(f"\nāœ… COMPLETE FIXED ANALYSIS DONE!") else: print("āŒ Analysis failed") else: print("āŒ No downloaded files found") # ============================================================================== # COMPREHENSIVE EXCEL ANALYSIS WITH ALL DETAILS # ============================================================================== import pandas as pd import numpy as np from datetime import datetime import os def create_comprehensive_excel_analysis(results, output_filename=None): """Create comprehensive Excel analysis with multiple sheets and detailed metrics""" if not results: print("āŒ No results to analyze") return None # Generate filename if not provided if not output_filename: timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") output_filename = f"Language_Detection_Comprehensive_Analysis_{timestamp}.xlsx" print(f"šŸ“Š Creating comprehensive Excel analysis: {output_filename}") # Create Excel writer with pd.ExcelWriter(output_filename, engine='openpyxl') as writer: # ======================================== # SHEET 1: EXECUTIVE SUMMARY # ======================================== print(" šŸ“‹ Creating Executive Summary...") summary_data = [] # Overall statistics total_files = len(results['comparison_data']) vox_available = sum(1 for item in results['comparison_data'] if item['voxlingua']['available']) xlsr_available = sum(1 for item in results['comparison_data'] if item['xlsr']['available']) summary_data.extend([ ['EXECUTIVE SUMMARY', ''], ['Analysis Date', datetime.now().strftime("%Y-%m-%d %H:%M:%S")], ['Total Files Analyzed', total_files], ['VoxLingua107 Available', f"{vox_available} ({vox_available/total_files*100:.1f}%)"], ['XLS-R Available', f"{xlsr_available} ({xlsr_available/total_files*100:.1f}%)"], ['', ''], ]) # Model performance summary if results['voxlingua_detailed']: vox_df = pd.DataFrame(results['voxlingua_detailed']) valid_vox = vox_df[vox_df['gt_iso'] != ''].copy() if len(valid_vox) > 0: vox_acc = (valid_vox['correct'] == True).mean() vox_conf = valid_vox['confidence'].mean() summary_data.extend([ ['VOXLINGUA107 PERFORMANCE', ''], ['Accuracy', f"{vox_acc:.4f} ({vox_acc*100:.1f}%)"], ['Average Confidence', f"{vox_conf:.4f}"], ['Files with Valid GT', len(valid_vox)], ['', ''], ]) if results['xlsr_detailed']: xlsr_df = pd.DataFrame(results['xlsr_detailed']) valid_xlsr = xlsr_df[xlsr_df['gt_iso'] != ''].copy() if len(valid_xlsr) > 0: xlsr_acc = (valid_xlsr['correct'] == True).mean() xlsr_conf = valid_xlsr['confidence'].mean() summary_data.extend([ ['XLS-R PERFORMANCE', ''], ['Accuracy', f"{xlsr_acc:.4f} ({xlsr_acc*100:.1f}%)"], ['Average Confidence', f"{xlsr_conf:.4f}"], ['Files with Valid GT', len(valid_xlsr)], ['', ''], ]) # Optimal weights calculation if results['voxlingua_detailed']: total_acc = vox_acc + xlsr_acc if total_acc > 0: vox_weight = vox_acc / total_acc xlsr_weight = xlsr_acc / total_acc summary_data.extend([ ['RECOMMENDED ENSEMBLE WEIGHTS', ''], ['VoxLingua107 Weight', f"{vox_weight:.3f} ({vox_weight*100:.1f}%)"], ['XLS-R Weight', f"{xlsr_weight:.3f} ({xlsr_weight*100:.1f}%)"], ]) # Create summary dataframe summary_df = pd.DataFrame(summary_data, columns=['Metric', 'Value']) summary_df.to_excel(writer, sheet_name='Executive_Summary', index=False) # ======================================== # SHEET 2: VOXLINGUA107 DETAILED RESULTS # ======================================== if results['voxlingua_detailed']: print(" šŸ“‹ Creating VoxLingua107 detailed results...") vox_detailed_df = pd.DataFrame(results['voxlingua_detailed']) # Add additional analysis columns vox_detailed_df['accuracy_score'] = vox_detailed_df['correct'].astype(int) vox_detailed_df['confidence_category'] = pd.cut( vox_detailed_df['confidence'], bins=[0, 0.3, 0.6, 0.8, 1.0], labels=['Low', 'Medium', 'High', 'Very High'] ) # Add language family information def get_language_family(lang): if lang in INDO_ARYAN_LANGS: return 'Indo-Aryan' elif lang in DRAVIDIAN_LANGS: return 'Dravidian' elif lang in LOW_RESOURCE_LANGS: return 'Low-Resource' else: return 'Other' vox_detailed_df['gt_language_family'] = vox_detailed_df['gt_iso'].apply(get_language_family) vox_detailed_df['pred_language_family'] = vox_detailed_df['pred_iso'].apply(get_language_family) vox_detailed_df.to_excel(writer, sheet_name='VoxLingua107_Results', index=False) # ======================================== # SHEET 3: XLS-R DETAILED RESULTS # ======================================== if results['xlsr_detailed']: print(" šŸ“‹ Creating XLS-R detailed results...") xlsr_detailed_df = pd.DataFrame(results['xlsr_detailed']) # Add analysis columns xlsr_detailed_df['accuracy_score'] = xlsr_detailed_df['correct'].astype(int) xlsr_detailed_df['confidence_category'] = pd.cut( xlsr_detailed_df['confidence'], bins=[0, 0.3, 0.6, 0.8, 1.0], labels=['Low', 'Medium', 'High', 'Very High'] ) xlsr_detailed_df['gt_language_family'] = xlsr_detailed_df['gt_iso'].apply(get_language_family) xlsr_detailed_df['pred_language_family'] = xlsr_detailed_df['pred_iso'].apply(get_language_family) xlsr_detailed_df.to_excel(writer, sheet_name='XLSR_Results', index=False) # ======================================== # SHEET 4: PER-LANGUAGE ACCURACY ANALYSIS # ======================================== print(" šŸ“‹ Creating per-language accuracy analysis...") lang_analysis_data = [] # Get all unique languages from ground truth all_gt_langs = set() if results['voxlingua_detailed']: all_gt_langs.update([r['gt_iso'] for r in results['voxlingua_detailed'] if r['gt_iso']]) if results['xlsr_detailed']: all_gt_langs.update([r['gt_iso'] for r in results['xlsr_detailed'] if r['gt_iso']]) # Language name mapping LANG_NAMES = { 'ur': 'Urdu', 'pa': 'Punjabi', 'ta': 'Tamil', 'sd': 'Sindhi', 'or': 'Odia', 'ml': 'Malayalam', 'ne': 'Nepali', 'as': 'Assamese', 'hi': 'Hindi', 'bn': 'Bengali', 'kok': 'Konkani', 'kn': 'Kannada', 'ks': 'Kashmiri', 'mr': 'Marathi', 'te': 'Telugu', 'br': 'Bodo', 'doi': 'Dogri', 'sat': 'Santali', 'gu': 'Gujarati', 'mni': 'Manipuri', 'sa': 'Sanskrit' } for lang in sorted(all_gt_langs): lang_name = LANG_NAMES.get(lang, lang.title()) lang_family = get_language_family(lang) # VoxLingua stats for this language vox_stats = {'files': 0, 'correct': 0, 'accuracy': 0, 'avg_confidence': 0} if results['voxlingua_detailed']: vox_lang_data = [r for r in results['voxlingua_detailed'] if r['gt_iso'] == lang] if vox_lang_data: vox_stats['files'] = len(vox_lang_data) vox_stats['correct'] = sum(1 for r in vox_lang_data if r['correct']) vox_stats['accuracy'] = vox_stats['correct'] / vox_stats['files'] vox_stats['avg_confidence'] = np.mean([r['confidence'] for r in vox_lang_data]) # XLS-R stats for this language xlsr_stats = {'files': 0, 'correct': 0, 'accuracy': 0, 'avg_confidence': 0} if results['xlsr_detailed']: xlsr_lang_data = [r for r in results['xlsr_detailed'] if r['gt_iso'] == lang] if xlsr_lang_data: xlsr_stats['files'] = len(xlsr_lang_data) xlsr_stats['correct'] = sum(1 for r in xlsr_lang_data if r['correct']) xlsr_stats['accuracy'] = xlsr_stats['correct'] / xlsr_stats['files'] xlsr_stats['avg_confidence'] = np.mean([r['confidence'] for r in xlsr_lang_data]) lang_analysis_data.append({ 'Language_Code': lang, 'Language_Name': lang_name, 'Language_Family': lang_family, 'VoxLingua_Files': vox_stats['files'], 'VoxLingua_Correct': vox_stats['correct'], 'VoxLingua_Accuracy': f"{vox_stats['accuracy']:.4f}", 'VoxLingua_Accuracy_Pct': f"{vox_stats['accuracy']*100:.1f}%", 'VoxLingua_Avg_Confidence': f"{vox_stats['avg_confidence']:.4f}", 'XLSR_Files': xlsr_stats['files'], 'XLSR_Correct': xlsr_stats['correct'], 'XLSR_Accuracy': f"{xlsr_stats['accuracy']:.4f}", 'XLSR_Accuracy_Pct': f"{xlsr_stats['accuracy']*100:.1f}%", 'XLSR_Avg_Confidence': f"{xlsr_stats['avg_confidence']:.4f}", 'Better_Model': 'VoxLingua' if vox_stats['accuracy'] > xlsr_stats['accuracy'] else 'XLS-R' if xlsr_stats['accuracy'] > vox_stats['accuracy'] else 'Tie' }) lang_analysis_df = pd.DataFrame(lang_analysis_data) lang_analysis_df.to_excel(writer, sheet_name='Per_Language_Analysis', index=False) # ======================================== # SHEET 5: CONFUSION MATRIX - VOXLINGUA # ======================================== if results['voxlingua_detailed']: print(" šŸ“‹ Creating VoxLingua confusion matrix...") vox_df = pd.DataFrame(results['voxlingua_detailed']) valid_vox = vox_df[vox_df['gt_iso'] != ''].copy() if len(valid_vox) > 0: # Create confusion matrix confusion_data = [] for gt_lang in sorted(valid_vox['gt_iso'].unique()): gt_data = valid_vox[valid_vox['gt_iso'] == gt_lang] row_data = {'Ground_Truth': gt_lang} for pred_lang in sorted(valid_vox['pred_iso'].unique()): count = len(gt_data[gt_data['pred_iso'] == pred_lang]) row_data[f'Predicted_{pred_lang}'] = count confusion_data.append(row_data) confusion_df = pd.DataFrame(confusion_data).fillna(0) confusion_df.to_excel(writer, sheet_name='VoxLingua_Confusion_Matrix', index=False) # ======================================== # SHEET 6: CONFUSION MATRIX - XLS-R # ======================================== if results['xlsr_detailed']: print(" šŸ“‹ Creating XLS-R confusion matrix...") xlsr_df = pd.DataFrame(results['xlsr_detailed']) valid_xlsr = xlsr_df[xlsr_df['gt_iso'] != ''].copy() if len(valid_xlsr) > 0: confusion_data = [] for gt_lang in sorted(valid_xlsr['gt_iso'].unique()): gt_data = valid_xlsr[valid_xlsr['gt_iso'] == gt_lang] row_data = {'Ground_Truth': gt_lang} for pred_lang in sorted(valid_xlsr['pred_iso'].unique()): count = len(gt_data[gt_data['pred_iso'] == pred_lang]) row_data[f'Predicted_{pred_lang}'] = count confusion_data.append(row_data) confusion_df = pd.DataFrame(confusion_data).fillna(0) confusion_df.to_excel(writer, sheet_name='XLSR_Confusion_Matrix', index=False) # ======================================== # SHEET 7: CONFIDENCE ANALYSIS # ======================================== print(" šŸ“‹ Creating confidence analysis...") confidence_analysis = [] # VoxLingua confidence analysis if results['voxlingua_detailed']: vox_df = pd.DataFrame(results['voxlingua_detailed']) valid_vox = vox_df[vox_df['gt_iso'] != ''].copy() if len(valid_vox) > 0: for conf_range in [(0, 0.3), (0.3, 0.6), (0.6, 0.8), (0.8, 1.0)]: range_data = valid_vox[ (valid_vox['confidence'] >= conf_range[0]) & (valid_vox['confidence'] < conf_range[1]) ] if len(range_data) > 0: accuracy = (range_data['correct'] == True).mean() confidence_analysis.append({ 'Model': 'VoxLingua107', 'Confidence_Range': f"{conf_range[0]:.1f}-{conf_range[1]:.1f}", 'Files': len(range_data), 'Accuracy': f"{accuracy:.4f}", 'Accuracy_Pct': f"{accuracy*100:.1f}%", 'Avg_Confidence': f"{range_data['confidence'].mean():.4f}" }) # XLS-R confidence analysis if results['xlsr_detailed']: xlsr_df = pd.DataFrame(results['xlsr_detailed']) valid_xlsr = xlsr_df[xlsr_df['gt_iso'] != ''].copy() if len(valid_xlsr) > 0: for conf_range in [(0, 0.3), (0.3, 0.6), (0.6, 0.8), (0.8, 1.0)]: range_data = valid_xlsr[ (valid_xlsr['confidence'] >= conf_range[0]) & (valid_xlsr['confidence'] < conf_range[1]) ] if len(range_data) > 0: accuracy = (range_data['correct'] == True).mean() confidence_analysis.append({ 'Model': 'XLS-R', 'Confidence_Range': f"{conf_range[0]:.1f}-{conf_range[1]:.1f}", 'Files': len(range_data), 'Accuracy': f"{accuracy:.4f}", 'Accuracy_Pct': f"{accuracy*100:.1f}%", 'Avg_Confidence': f"{range_data['confidence'].mean():.4f}" }) confidence_df = pd.DataFrame(confidence_analysis) confidence_df.to_excel(writer, sheet_name='Confidence_Analysis', index=False) # ======================================== # SHEET 8: ERROR ANALYSIS # ======================================== print(" šŸ“‹ Creating error analysis...") error_analysis = [] # VoxLingua errors if results['voxlingua_detailed']: vox_df = pd.DataFrame(results['voxlingua_detailed']) vox_errors = vox_df[vox_df['correct'] == False].copy() for _, error in vox_errors.iterrows(): error_analysis.append({ 'Model': 'VoxLingua107', 'File': error['file'], 'Ground_Truth': error['gt_iso'], 'Predicted': error['pred_iso'], 'Confidence': f"{error['confidence']:.4f}", 'GT_Language_Family': get_language_family(error['gt_iso']), 'Pred_Language_Family': get_language_family(error['pred_iso']), 'Cross_Family_Error': get_language_family(error['gt_iso']) != get_language_family(error['pred_iso']) }) # XLS-R errors if results['xlsr_detailed']: xlsr_df = pd.DataFrame(results['xlsr_detailed']) xlsr_errors = xlsr_df[xlsr_df['correct'] == False].copy() for _, error in xlsr_errors.iterrows(): error_analysis.append({ 'Model': 'XLS-R', 'File': error['file'], 'Ground_Truth': error['gt_iso'], 'Predicted': error['pred_iso'], 'Confidence': f"{error['confidence']:.4f}", 'GT_Language_Family': get_language_family(error['gt_iso']), 'Pred_Language_Family': get_language_family(error['pred_iso']), 'Cross_Family_Error': get_language_family(error['gt_iso']) != get_language_family(error['pred_iso']) }) error_df = pd.DataFrame(error_analysis) error_df.to_excel(writer, sheet_name='Error_Analysis', index=False) # ======================================== # SHEET 9: LANGUAGE FAMILY PERFORMANCE # ======================================== print(" šŸ“‹ Creating language family performance...") family_performance = [] families = ['Indo-Aryan', 'Dravidian', 'Low-Resource', 'Other'] for family in families: # VoxLingua performance for this family if results['voxlingua_detailed']: vox_df = pd.DataFrame(results['voxlingua_detailed']) family_data = vox_df[vox_df['gt_iso'].apply(lambda x: get_language_family(x) == family)] if len(family_data) > 0: vox_acc = (family_data['correct'] == True).mean() vox_conf = family_data['confidence'].mean() vox_files = len(family_data) else: vox_acc = vox_conf = vox_files = 0 else: vox_acc = vox_conf = vox_files = 0 # XLS-R performance for this family if results['xlsr_detailed']: xlsr_df = pd.DataFrame(results['xlsr_detailed']) family_data = xlsr_df[xlsr_df['gt_iso'].apply(lambda x: get_language_family(x) == family)] if len(family_data) > 0: xlsr_acc = (family_data['correct'] == True).mean() xlsr_conf = family_data['confidence'].mean() xlsr_files = len(family_data) else: xlsr_acc = xlsr_conf = xlsr_files = 0 else: xlsr_acc = xlsr_conf = xlsr_files = 0 family_performance.append({ 'Language_Family': family, 'VoxLingua_Files': vox_files, 'VoxLingua_Accuracy': f"{vox_acc:.4f}", 'VoxLingua_Accuracy_Pct': f"{vox_acc*100:.1f}%", 'VoxLingua_Avg_Confidence': f"{vox_conf:.4f}", 'XLSR_Files': xlsr_files, 'XLSR_Accuracy': f"{xlsr_acc:.4f}", 'XLSR_Accuracy_Pct': f"{xlsr_acc*100:.1f}%", 'XLSR_Avg_Confidence': f"{xlsr_conf:.4f}", 'Better_Model': 'VoxLingua' if vox_acc > xlsr_acc else 'XLS-R' if xlsr_acc > vox_acc else 'Tie' }) family_df = pd.DataFrame(family_performance) family_df.to_excel(writer, sheet_name='Language_Family_Performance', index=False) # ======================================== # SHEET 10: TOP-5 PREDICTIONS (SAMPLE) # ======================================== print(" šŸ“‹ Creating Top-5 predictions sample...") top5_sample = [] # Sample top-5 predictions from comparison data sample_files = results['comparison_data'][:20] # First 20 files as sample for file_data in sample_files: file_name = file_data['file'] gt_lang = file_data['gt_iso'] # VoxLingua Top-5 if file_data['voxlingua']['available'] and 'top5_predictions' in file_data['voxlingua']: for pred in file_data['voxlingua']['top5_predictions']: top5_sample.append({ 'Model': 'VoxLingua107', 'File': file_name, 'Ground_Truth': gt_lang, 'Rank': pred['rank'], 'Predicted_Language': pred['mapped'], 'Original_Output': pred['original'], 'Confidence': f"{pred['confidence']:.4f}", 'In_Dataset': pred['in_dataset'], 'Correct': gt_lang == pred['mapped'] }) # XLS-R Top-5 if file_data['xlsr']['available'] and 'top5_predictions' in file_data['xlsr']: for pred in file_data['xlsr']['top5_predictions']: top5_sample.append({ 'Model': 'XLS-R', 'File': file_name, 'Ground_Truth': gt_lang, 'Rank': pred['rank'], 'Predicted_Language': pred['mapped'], 'Original_Output': pred['original'], 'Confidence': f"{pred['confidence']:.4f}", 'In_Dataset': pred['in_dataset'], 'Correct': gt_lang == pred['mapped'] }) top5_df = pd.DataFrame(top5_sample) top5_df.to_excel(writer, sheet_name='Top5_Predictions_Sample', index=False) print(f"āœ… Comprehensive Excel analysis created: {output_filename}") # Try to download the file try: from google.colab import files print(f"šŸ“„ File downloaded successfully!") except: print(f"šŸ“ File saved locally: {output_filename}") return output_filename # Run the comprehensive Excel analysis if 'final_analysis_results' in globals() and final_analysis_results: excel_filename = create_comprehensive_excel_analysis( final_analysis_results, "Language_Detection_Comprehensive_Analysis.xlsx" ) print(f"\nšŸŽ‰ COMPREHENSIVE EXCEL ANALYSIS COMPLETE!") print(f"šŸ“Š File: {excel_filename}") # Print summary of what was created print(f"\nšŸ“‹ Excel Contains 10 Sheets:") print(f" 1. Executive_Summary - Key metrics and recommendations") print(f" 2. VoxLingua107_Results - Detailed VoxLingua results") print(f" 3. XLSR_Results - Detailed XLS-R results") print(f" 4. Per_Language_Analysis - Accuracy by language") print(f" 5. VoxLingua_Confusion_Matrix - VoxLingua confusion matrix") print(f" 6. XLSR_Confusion_Matrix - XLS-R confusion matrix") print(f" 7. Confidence_Analysis - Performance by confidence ranges") print(f" 8. Error_Analysis - Detailed error breakdown") print(f" 9. Language_Family_Performance - Performance by language family") print(f" 10. Top5_Predictions_Sample - Sample of top-5 predictions") else: print("āŒ No analysis results found. Please run the analysis first.")