Spaces:

jacob-c
/

largermodel_lyrics_generation

Paused

App Files Files Community

root committed on Mar 24

Commit

bb9a8b1

1 Parent(s): 7dfa01d

ss

Browse files

Files changed (1) hide show

app.py +65 -43

app.py CHANGED Viewed

@@ -4,7 +4,7 @@ import gradio as gr
 import torch
 import numpy as np
 from transformers import (
-    AutoModelForSequenceClassification,
     AutoFeatureExtractor,
     AutoTokenizer,
     pipeline,
@@ -18,7 +18,8 @@ from utils import (
     extract_mfcc_features,
     calculate_lyrics_length,
     format_genre_results,
-    ensure_cuda_availability
 )
 # Login to Hugging Face Hub if token is provided
@@ -33,17 +34,25 @@ SAMPLE_RATE = 22050  # Standard sample rate for audio processing
 # Check CUDA availability (for informational purposes)
 CUDA_AVAILABLE = ensure_cuda_availability()
-# Load genre classification model
 try:
-    # Try to load feature extractor first (for audio models)
-    genre_processor = AutoFeatureExtractor.from_pretrained(GENRE_MODEL_NAME)
-    print(f"Loaded feature extractor for genre classification model: {GENRE_MODEL_NAME}")
 except Exception as e:
-    print(f"Error loading feature extractor, using basic processing: {str(e)}")
-    genre_processor = None
-# Load the model
-genre_model = AutoModelForSequenceClassification.from_pretrained(GENRE_MODEL_NAME)
 # Load LLM with appropriate quantization for T4 GPU
 bnb_config = BitsAndBytesConfig(
@@ -76,48 +85,61 @@ def extract_audio_features(audio_file):
     # Get audio duration in seconds
     duration = extract_audio_duration(y, sr)
-    # Extract MFCCs for genre classification
     mfccs_mean = extract_mfcc_features(y, sr, n_mfcc=20)
     return {
         "features": mfccs_mean,
         "duration": duration,
         "waveform": y,
-        "sample_rate": sr
     }
 def classify_genre(audio_data):
     """Classify the genre of the audio using the loaded model."""
-    if genre_processor is not None:
-        # Use the feature extractor if available
-        inputs = genre_processor(
-            audio_data["waveform"],
-            sampling_rate=audio_data["sample_rate"],
-            return_tensors="pt"
-        )
-    else:
-        # Fallback to basic feature processing
-        # Convert MFCC features to tensor and reshape appropriately
-        features_tensor = torch.tensor(audio_data["features"]).unsqueeze(0)
-        inputs = {"input_features": features_tensor}
-    with torch.no_grad():
-        outputs = genre_model(**inputs)
-        predictions = outputs.logits.softmax(dim=-1)
-    # Get the top 3 genres
-    values, indices = torch.topk(predictions, 3)
-    # Map indices to genre labels
-    genre_labels = genre_model.config.id2label
-    top_genres = []
-    for i, (value, index) in enumerate(zip(values[0], indices[0])):
-        genre = genre_labels[index.item()]
-        confidence = value.item()
-        top_genres.append((genre, confidence))
-    return top_genres
 def generate_lyrics(genre, duration):
     """Generate lyrics based on the genre and with appropriate length."""

 import torch
 import numpy as np
 from transformers import (
+    AutoModelForAudioClassification,
     AutoFeatureExtractor,
     AutoTokenizer,
     pipeline,
     extract_mfcc_features,
     calculate_lyrics_length,
     format_genre_results,
+    ensure_cuda_availability,
+    preprocess_audio_for_model
 )
 # Login to Hugging Face Hub if token is provided
 # Check CUDA availability (for informational purposes)
 CUDA_AVAILABLE = ensure_cuda_availability()
+# Create genre classification pipeline
+print(f"Loading audio classification model: {GENRE_MODEL_NAME}")
 try:
+    genre_classifier = pipeline(
+        "audio-classification",
+        model=GENRE_MODEL_NAME,
+        device=0 if CUDA_AVAILABLE else -1
+    )
+    print("Successfully loaded audio classification pipeline")
 except Exception as e:
+    print(f"Error creating pipeline: {str(e)}")
+    # Fallback to manual loading
+    try:
+        genre_processor = AutoFeatureExtractor.from_pretrained(GENRE_MODEL_NAME)
+        genre_model = AutoModelForAudioClassification.from_pretrained(GENRE_MODEL_NAME)
+        print("Successfully loaded audio classification model and feature extractor")
+    except Exception as e2:
+        print(f"Error loading model components: {str(e2)}")
+        raise RuntimeError(f"Could not load genre classification model: {str(e2)}")
 # Load LLM with appropriate quantization for T4 GPU
 bnb_config = BitsAndBytesConfig(
     # Get audio duration in seconds
     duration = extract_audio_duration(y, sr)
+    # Extract MFCCs for genre classification (may not be needed with the pipeline)
     mfccs_mean = extract_mfcc_features(y, sr, n_mfcc=20)
     return {
         "features": mfccs_mean,
         "duration": duration,
         "waveform": y,
+        "sample_rate": sr,
+        "path": audio_file  # Keep path for the pipeline
     }
 def classify_genre(audio_data):
     """Classify the genre of the audio using the loaded model."""
+    try:
+        # First attempt: Try using the pipeline if available
+        if 'genre_classifier' in globals():
+            results = genre_classifier(audio_data["path"])
+            # Transform pipeline results to our expected format
+            top_genres = [(result["label"], result["score"]) for result in results[:3]]
+            return top_genres
+        # Second attempt: Use manually loaded model components
+        elif 'genre_processor' in globals() and 'genre_model' in globals():
+            # Process audio input with feature extractor
+            inputs = genre_processor(
+                audio_data["waveform"],
+                sampling_rate=audio_data["sample_rate"],
+                return_tensors="pt"
+            )
+            with torch.no_grad():
+                outputs = genre_model(**inputs)
+                predictions = outputs.logits.softmax(dim=-1)
+            # Get the top 3 genres
+            values, indices = torch.topk(predictions, 3)
+            # Map indices to genre labels
+            genre_labels = genre_model.config.id2label
+            top_genres = []
+            for i, (value, index) in enumerate(zip(values[0], indices[0])):
+                genre = genre_labels[index.item()]
+                confidence = value.item()
+                top_genres.append((genre, confidence))
+            return top_genres
+        else:
+            raise ValueError("No genre classification model available")
+    except Exception as e:
+        print(f"Error in genre classification: {str(e)}")
+        # Fallback: return a default genre if everything fails
+        return [("rock", 1.0)]
 def generate_lyrics(genre, duration):
     """Generate lyrics based on the genre and with appropriate length."""