Spaces:

jacob-c
/

syllables_matching_experiment

Paused

App Files Files Community

root committed on May 16

Commit

e3108aa

1 Parent(s): 801647a

ss

Browse files

Files changed (1) hide show

app.py +79 -39

app.py CHANGED Viewed

@@ -38,19 +38,25 @@ SAMPLE_RATE = 22050  # Standard sample rate for audio processing
 # Check CUDA availability (for informational purposes)
 CUDA_AVAILABLE = ensure_cuda_availability()
-# Load models
-@functools.lru_cache(maxsize=1)
-def load_genre_model():
-    print("Loading genre classification model...")
-    return pipeline(
-        "audio-classification",
-        model=GENRE_MODEL_NAME,
-        device=0 if CUDA_AVAILABLE else -1
     )
-@functools.lru_cache(maxsize=1)
-def load_llm_pipeline():
-    print("Loading Qwen LLM model with 4-bit quantization...")
     # Configure 4-bit quantization for better performance
     quantization_config = BitsAndBytesConfig(
         load_in_4bit=True,
@@ -59,17 +65,19 @@ def load_llm_pipeline():
         bnb_4bit_use_double_quant=True
     )
-    return pipeline(
-        "text-generation",
-        model=LLM_MODEL_NAME,
         device_map="auto",
         trust_remote_code=True,
-        model_kwargs={
-            "torch_dtype": torch.float16,
-            "quantization_config": quantization_config,
-            "use_cache": True
-        }
     )
 # Create music analyzer instance
 music_analyzer = MusicAnalyzer()
@@ -95,17 +103,30 @@ def process_audio(audio_file):
         emotion = music_analysis["emotion_analysis"]["primary_emotion"]
         theme = music_analysis["theme_analysis"]["primary_theme"]
-        # Use genre classification pipeline
-        genre_classifier = load_genre_model()
-        # Resample audio to 16000 Hz for the genre model
-        y_16k = librosa.resample(y, orig_sr=sr, target_sr=16000)
-        # Classify genre
-        genre_results = genre_classifier({"raw": y_16k, "sampling_rate": 16000})
-        # Get top genres
-        top_genres = [(genre["label"], genre["score"]) for genre in genre_results]
         # Format genre results for display
         genre_results_text = format_genre_results(top_genres)
@@ -145,8 +166,9 @@ def generate_lyrics(music_analysis, genre, duration):
         emotion = music_analysis["emotion_analysis"]["primary_emotion"]
         theme = music_analysis["theme_analysis"]["primary_theme"]
-        # Load LLM pipeline
-        text_generator = load_llm_pipeline()
         # Construct prompt for the LLM
         prompt = f"""Write lyrics for a {genre} song with these specifications:
@@ -169,17 +191,36 @@ IMPORTANT INSTRUCTIONS:
 - Keep lyrics concise enough to fit the duration when sung at the given tempo
 """
-        # Generate lyrics using the LLM pipeline
-        generation_result = text_generator(
-            prompt,
             max_new_tokens=1024,
             do_sample=True,
             temperature=0.7,
             top_p=0.9,
-            return_full_text=False
         )
-        lyrics = generation_result[0]["generated_text"]
         # Enhanced post-processing to remove ALL structural elements and thinking
         # Remove any lines with section labels using a more comprehensive pattern
@@ -262,5 +303,4 @@ if __name__ == "__main__":
     demo.launch()
 else:
     # For Hugging Face Spaces
-    app = demo

 # Check CUDA availability (for informational purposes)
 CUDA_AVAILABLE = ensure_cuda_availability()
+# Load models at initialization time
+print("Loading genre classification model...")
+try:
+    genre_feature_extractor = AutoFeatureExtractor.from_pretrained(GENRE_MODEL_NAME)
+    genre_model = AutoModelForAudioClassification.from_pretrained(
+        GENRE_MODEL_NAME,
+        device_map="auto" if CUDA_AVAILABLE else None
     )
+    # Convenience accessor that returns the loaded (model, feature extractor) pair
+    def get_genre_model():
+        return genre_model, genre_feature_extractor
+except Exception as e:
+    print(f"Error loading genre model: {str(e)}")
+    genre_model = None
+    genre_feature_extractor = None
+# Load LLM and tokenizer at initialization time
+print("Loading Qwen LLM model with 4-bit quantization...")
+try:
     # Configure 4-bit quantization for better performance
     quantization_config = BitsAndBytesConfig(
         load_in_4bit=True,
         bnb_4bit_use_double_quant=True
     )
+    llm_tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_NAME)
+    llm_model = AutoModelForCausalLM.from_pretrained(
+        LLM_MODEL_NAME,
+        quantization_config=quantization_config,
         device_map="auto",
         trust_remote_code=True,
+        torch_dtype=torch.float16,
+        use_cache=True
     )
+except Exception as e:
+    print(f"Error loading LLM model: {str(e)}")
+    llm_tokenizer = None
+    llm_model = None
 # Create music analyzer instance
 music_analyzer = MusicAnalyzer()
         emotion = music_analysis["emotion_analysis"]["primary_emotion"]
         theme = music_analysis["theme_analysis"]["primary_theme"]
+        # Use genre classification directly instead of pipeline
+        if genre_model is not None and genre_feature_extractor is not None:
+            # Resample audio to 16000 Hz for the genre model
+            y_16k = librosa.resample(y, orig_sr=sr, target_sr=16000)
+            # Extract features
+            inputs = genre_feature_extractor(
+                y_16k,
+                sampling_rate=16000,
+                return_tensors="pt"
+            ).to(genre_model.device)
+            # Classify genre
+            with torch.no_grad():
+                outputs = genre_model(**inputs)
+                logits = outputs.logits
+                probs = torch.nn.functional.softmax(logits, dim=-1)
+            # Get top genres
+            values, indices = torch.topk(probs[0], k=5)
+            top_genres = [(genre_model.config.id2label[idx.item()], val.item()) for val, idx in zip(values, indices)]
+        else:
+            # Fallback if model loading failed
+            top_genres = [("Unknown", 1.0)]
         # Format genre results for display
         genre_results_text = format_genre_results(top_genres)
         emotion = music_analysis["emotion_analysis"]["primary_emotion"]
         theme = music_analysis["theme_analysis"]["primary_theme"]
+        # Verify LLM is loaded
+        if llm_model is None or llm_tokenizer is None:
+            return "Error: LLM model not properly loaded"
         # Construct prompt for the LLM
         prompt = f"""Write lyrics for a {genre} song with these specifications:
 - Keep lyrics concise enough to fit the duration when sung at the given tempo
 """
+        # Generate lyrics using the LLM model directly
+        # Format as chat message
+        messages = [
+            {"role": "user", "content": prompt}
+        ]
+        # Apply chat template
+        text = llm_tokenizer.apply_chat_template(
+            messages,
+            tokenize=False,
+            add_generation_prompt=True
+        )
+        # Tokenize and move to model device
+        model_inputs = llm_tokenizer([text], return_tensors="pt").to(llm_model.device)
+        # Sample a completion (temperature / top-p sampling with a mild repetition penalty)
+        generated_ids = llm_model.generate(
+            **model_inputs,
             max_new_tokens=1024,
             do_sample=True,
             temperature=0.7,
             top_p=0.9,
+            repetition_penalty=1.1,
+            pad_token_id=llm_tokenizer.eos_token_id
         )
+        # Drop the prompt tokens and decode only the newly generated text
+        output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()
+        lyrics = llm_tokenizer.decode(output_ids, skip_special_tokens=True).strip()
         # Enhanced post-processing to remove ALL structural elements and thinking
         # Remove any lines with section labels using a more comprehensive pattern
     demo.launch()
 else:
     # For Hugging Face Spaces
+    app = demo