Spaces:

jacob-c
/

syllables_matching_experiment

Paused

App Files Files Community

root committed on May 2

Commit

4ddd8f4

1 Parent(s): db4c558

syllables trying first

Browse files

Files changed (3) hide show

app.py +509 -52
requirements.txt +2 -1
utils.py +43 -29

app.py CHANGED Viewed

@@ -3,6 +3,8 @@ import io
 import gradio as gr
 import torch
 import numpy as np
 from transformers import (
     AutoModelForAudioClassification,
     AutoFeatureExtractor,
@@ -103,6 +105,41 @@ llm_pipeline = pipeline(
 # Initialize music emotion analyzer
 music_analyzer = MusicAnalyzer()
 def extract_audio_features(audio_file):
     """Extract audio features from an audio file."""
     try:
@@ -228,19 +265,83 @@ def detect_music(audio_data):
         print(f"Error in music detection: {str(e)}")
         return False, []
 def detect_beats(y, sr):
-    """Detect beats in the audio using librosa."""
     # Get tempo and beat frames
     tempo, beat_frames = librosa.beat.beat_track(y=y, sr=sr)
     # Convert beat frames to time in seconds
     beat_times = librosa.frames_to_time(beat_frames, sr=sr)
     return {
         "tempo": tempo,
         "beat_frames": beat_frames,
         "beat_times": beat_times,
-        "beat_count": len(beat_times)
     }
 def detect_sections(y, sr):
@@ -300,6 +401,124 @@ def detect_sections(y, sr):
     return sections
 def estimate_syllables_per_section(beats_info, sections):
     """Estimate the number of syllables needed for each section based on beats."""
     syllables_per_section = []
@@ -314,17 +533,31 @@ def estimate_syllables_per_section(beats_info, sections):
         # Calculate syllables based on section type and beat count
         beat_count = len(section_beats)
-        # Adjust syllable count based on section type and genre conventions
-        if section["type"] == "verse":
-            # Verses typically have more syllables per beat (more words)
-            syllable_count = beat_count * 1.2
-        elif section["type"] == "chorus":
-            # Choruses often have fewer syllables per beat (more sustained notes)
-            syllable_count = beat_count * 0.9
-        elif section["type"] == "bridge":
-            syllable_count = beat_count * 1.0
-        else:  # intro, outro
-            syllable_count = beat_count * 0.5  # Often instrumental or sparse lyrics
         syllables_per_section.append({
             "type": section["type"],
@@ -332,7 +565,8 @@ def estimate_syllables_per_section(beats_info, sections):
             "end": section["end"],
             "duration": section["duration"],
             "beat_count": beat_count,
-            "syllable_count": int(syllable_count)
         })
     return syllables_per_section
@@ -342,40 +576,71 @@ def calculate_detailed_song_structure(audio_data):
     y = audio_data["waveform"]
     sr = audio_data["sample_rate"]
-    # Detect beats
     beats_info = detect_beats(y, sr)
     # Detect sections
     sections = detect_sections(y, sr)
-    # Estimate syllables per section
     syllables_info = estimate_syllables_per_section(beats_info, sections)
     return {
         "beats": beats_info,
         "sections": sections,
-        "syllables": syllables_info
     }
-def generate_lyrics(genre, duration, emotion_results):
-    """Generate lyrics based on the genre and with appropriate length."""
-    # Calculate appropriate lyrics length based on audio duration
-    lines_count = calculate_lyrics_length(duration)
-    # Calculate approximate number of verses and chorus
-    if lines_count <= 6:
-        # Very short song - one verse and chorus
-        verse_lines = 2
-        chorus_lines = 2
-    elif lines_count <= 10:
-        # Medium song - two verses and chorus
-        verse_lines = 3
-        chorus_lines = 2
-    else:
-        # Longer song - two verses, chorus, and bridge
-        verse_lines = 3
-        chorus_lines = 2
     # Extract emotion and theme data from analysis results
     primary_emotion = emotion_results["emotion_analysis"]["primary_emotion"]
     primary_theme = emotion_results["theme_analysis"]["primary_theme"]
@@ -389,8 +654,129 @@ def generate_lyrics(genre, duration, emotion_results):
     key = emotion_results["tonal_analysis"]["key"]
     mode = emotion_results["tonal_analysis"]["mode"]
-    # Create prompt for the LLM
-    prompt = f"""
 You are a talented songwriter who specializes in {genre} music.
 Write original {genre} song lyrics for a song that is {duration:.1f} seconds long.
@@ -400,18 +786,58 @@ Music analysis has detected the following qualities in the music:
 - Primary emotion: {primary_emotion}
 - Primary theme: {primary_theme}
 The lyrics should:
 - Perfectly capture the essence and style of {genre} music
 - Express the {primary_emotion} emotion and {primary_theme} theme
-- Be approximately {lines_count} lines long
-- Have a coherent theme and flow
 - Follow this structure:
   * Verse: {verse_lines} lines
   * Chorus: {chorus_lines} lines
-  * {f'Bridge: 2 lines' if lines_count > 10 else ''}
 - Be completely original
 - Match the song duration of {duration:.1f} seconds
-- Keep each line concise and impactful
 Your lyrics:
 """
@@ -429,19 +855,31 @@ Your lyrics:
     # Extract and clean generated lyrics
     lyrics = response[0]["generated_text"].strip()
-    # Add section labels if they're not present
-    if "Verse" not in lyrics and "Chorus" not in lyrics:
         lines = lyrics.split('\n')
         formatted_lyrics = []
-        current_section = "Verse"
         for i, line in enumerate(lines):
-            if i == 0:
                 formatted_lyrics.append("[Verse]")
-            elif i == verse_lines:
                 formatted_lyrics.append("\n[Chorus]")
-            elif i == verse_lines + chorus_lines and lines_count > 10:
                 formatted_lyrics.append("\n[Bridge]")
             formatted_lyrics.append(line)
         lyrics = '\n'.join(formatted_lyrics)
     return lyrics
@@ -496,10 +934,10 @@ def process_audio(audio_file):
             # Continue with a simpler approach if this fails
             song_structure = None
-        # Generate lyrics based on top genre and emotion analysis
         try:
             primary_genre, _ = top_genres[0]
-            lyrics = generate_lyrics(primary_genre, audio_data["duration"], emotion_results)
         except Exception as e:
             print(f"Error generating lyrics: {str(e)}")
             lyrics = f"Error generating lyrics: {str(e)}"
@@ -555,7 +993,25 @@ with gr.Blocks(title="Music Genre Classifier & Lyrics Generator") as demo:
                     emotion_text += "\n\nSong Structure:\n"
                     for section in song_structure["syllables"]:
                         emotion_text += f"- {section['type'].capitalize()}: {section['start']:.1f}s to {section['end']:.1f}s "
-                        emotion_text += f"({section['duration']:.1f}s, {section['beat_count']} beats, ~{section['syllable_count']} syllables)\n"
                 except Exception as e:
                     print(f"Error displaying song structure: {str(e)}")
                     # Continue without showing structure details
@@ -590,8 +1046,9 @@ with gr.Blocks(title="Music Genre Classifier & Lyrics Generator") as demo:
     2. The system will classify the genre using the dima806/music_genres_classification model
     3. The system will analyze the musical emotion and theme using advanced audio processing
     4. The system will identify the song structure, beats, and timing patterns
-    5. Based on the detected genre, emotion, and structure, it will generate lyrics that match the beats, sections, and flow of the music
-    6. The lyrics will include appropriate section markings and syllable counts to align with the music
     """)
 # Launch the app

 import gradio as gr
 import torch
 import numpy as np
+import re
+import pronouncing  # Add this to requirements.txt for syllable counting
 from transformers import (
     AutoModelForAudioClassification,
     AutoFeatureExtractor,
 # Initialize music emotion analyzer
 music_analyzer = MusicAnalyzer()
# Count syllables in text
def count_syllables(text):
    """Count the syllables in ``text``.

    Words are looked up with ``pronouncing``; the first listed pronunciation
    is used. Words without a dictionary entry are estimated from their vowel
    groups.

    Contractions such as "I'll" or "don't" are looked up as a single word.
    Splitting them at the apostrophe (as a plain ``[a-zA-Z]+`` match does)
    turns "I'll" into "i" + "ll" and counts an extra syllable for the
    vowel-less fragment.

    Args:
        text: Any string; non-letter characters act as word separators.

    Returns:
        Total syllable count as an int (0 when ``text`` has no words).
    """
    # Typographic apostrophes are normalised so contractions written with
    # U+2019 are treated the same as ones written with "'".
    normalized = text.lower().replace("\u2019", "'")
    # An apostrophe only joins a word when letters follow it, so quoting
    # marks around a word are still treated as separators.
    words = re.findall(r"[a-z]+(?:'[a-z]+)*", normalized)
    return sum(_word_syllables(word) for word in words)


def _word_syllables(word):
    """Return the syllable count of one lower-case word."""
    pronunciations = pronouncing.phones_for_word(word)
    if pronunciations:
        return pronouncing.syllable_count(pronunciations[0])
    if "'" in word:
        # Unknown apostrophised word: count each piece on its own, which is
        # what happened to every contraction before they were kept whole.
        return sum(_word_syllables(piece) for piece in word.split("'"))
    return _estimate_syllables(word)


def _estimate_syllables(word):
    """Estimate syllables from vowel groups for a word with no dictionary entry."""
    vowels = "aeiouy"
    count = 0
    prev_is_vowel = False
    for char in word:
        is_vowel = char in vowels
        if is_vowel and not prev_is_vowel:
            count += 1
        prev_is_vowel = is_vowel
    # A trailing "e" is usually silent ...
    if word.endswith('e'):
        count -= 1
    # ... except in a consonant + "le" ending, which forms its own syllable.
    if word.endswith('le') and len(word) > 2 and word[-3] not in vowels:
        count += 1
    # Every word is sung on at least one syllable.
    return max(count, 1)
 def extract_audio_features(audio_file):
     """Extract audio features from an audio file."""
     try:
         print(f"Error in music detection: {str(e)}")
         return False, []
# Beat detection plus a rhythmic map (strengths, intervals, phrases)
def detect_beats(y, sr):
    """Detect beats and build a rhythmic map of the audio.

    Args:
        y: Audio waveform passed straight to librosa.
        sr: Sample rate of ``y``.

    Returns:
        dict with keys:
            "tempo": estimated tempo as a plain float.
            "beat_frames": beat positions as returned by ``beat_track``.
            "beat_times": beat positions converted to seconds.
            "beat_count": number of detected beats.
            "beat_strengths": onset strength at each beat, one per beat.
            "intervals": time gap between each pair of consecutive beats.
            "time_signature": 3 or 4 (heuristic, defaults to 4).
            "phrases": list of lists of beat indices.
    """
    # Get tempo and beat frames
    tempo, beat_frames = librosa.beat.beat_track(y=y, sr=sr)
    # Depending on the librosa release, tempo may come back as a one-element
    # array rather than a scalar; downstream code formats it with "{:.1f}",
    # which an ndarray does not support, so normalise it here.
    tempo = float(np.asarray(tempo).reshape(-1)[0])
    # Convert beat frames to time in seconds
    beat_times = librosa.frames_to_time(beat_frames, sr=sr)

    # Onset strength sampled at each beat distinguishes strong and weak beats.
    onset_env = librosa.onset.onset_strength(y=y, sr=sr)
    beat_strengths = [onset_env[frame] for frame in beat_frames if frame < len(onset_env)]
    if beat_strengths:
        # Beats past the end of the envelope get the average strength so the
        # list stays one-per-beat.
        avg_strength = sum(beat_strengths) / len(beat_strengths)
        beat_strengths.extend([avg_strength] * (len(beat_times) - len(beat_strengths)))
    else:
        beat_strengths = [1.0] * len(beat_times)

    # Time intervals between consecutive beats (for rhythm pattern detection)
    intervals = [later - earlier for earlier, later in zip(beat_times[:-1], beat_times[1:])]

    # Try to detect time signature based on beat pattern
    time_signature = 4  # Default assumption of 4/4 time
    if len(beat_strengths) > 8:
        # Strength ratio of each non-overlapping (even, odd) pair of beats.
        # NOTE(review): this pairwise ratio is used as the 3/4 indicator;
        # confirm the heuristic matches the intended strong-weak-weak check.
        strength_pattern = [
            beat_strengths[i] / (beat_strengths[i + 1] + 0.0001)
            for i in range(0, len(beat_strengths) - 1, 2)
        ]
        if strength_pattern:
            strong_share = sum(1 for r in strength_pattern if r > 1.2) / len(strength_pattern)
            if strong_share > 0.6:
                time_signature = 3

    # Group beats into phrases. The average gap is loop-invariant, so it is
    # computed once instead of once per beat.
    avg_gap = sum(intervals) / len(intervals) if intervals else 0.0
    last_index = len(beat_times) - 1
    phrases = []
    current_phrase = []
    for i in range(len(beat_times)):
        current_phrase.append(i)
        if i >= last_index:
            continue  # nothing follows the last beat
        # A phrase boundary is a clearly stronger next beat or a longer gap.
        is_stronger_next = (
            i < len(beat_strengths) - 1
            and beat_strengths[i + 1] > beat_strengths[i] * 1.2
        )
        # intervals[i] is the gap between beat i and beat i + 1.
        is_longer_gap = intervals[i] > avg_gap * 1.3
        if (is_stronger_next or is_longer_gap) and len(current_phrase) >= 2:
            phrases.append(current_phrase)
            current_phrase = []
    # Add the last phrase if not empty
    if current_phrase:
        phrases.append(current_phrase)

    return {
        "tempo": tempo,
        "beat_frames": beat_frames,
        "beat_times": beat_times,
        "beat_count": len(beat_times),
        "beat_strengths": beat_strengths,
        "intervals": intervals,
        "time_signature": time_signature,
        "phrases": phrases
    }
 def detect_sections(y, sr):
     return sections
# Create flexible syllable templates
def create_flexible_syllable_templates(beats_info):
    """Create a syllable template from beat phrases, without assuming song structure.

    Args:
        beats_info: dict with "beat_times" (required) and optional "tempo"
            (default 120), "beat_strengths" (default 1.0 per beat) and
            "phrases" (lists of indices into ``beat_times``).

    Returns:
        A string of per-phrase syllable counts joined by "-" (for example
        "7-4"). The result is never empty: when no phrase can be formed
        (fewer than two beats), the minimum template "2" is returned so
        callers can always parse it with ``int``.
    """
    beat_times = beats_info["beat_times"]
    beat_strengths = beats_info.get("beat_strengths", [1.0] * len(beat_times))
    phrases = beats_info.get("phrases", [])

    # If no phrases were detected, fall back to 4-beat phrases and drop a
    # trailing group of fewer than 2 beats.
    if not phrases:
        phrases = []
        for first in range(0, len(beat_times), 4):
            stop = min(first + 4, len(beat_times))
            if stop - first >= 2:
                phrases.append(list(range(first, stop)))

    # Base density: 1-2 syllables per beat depending on tempo. The tempo is
    # the same for every phrase, so it is resolved once.
    tempo = beats_info.get("tempo", 120)
    if tempo > 120:  # Fast tempo
        base_syllables_per_beat = 1.0
    elif tempo > 90:  # Medium tempo
        base_syllables_per_beat = 1.5
    else:  # Slow tempo
        base_syllables_per_beat = 2.0

    syllable_templates = []
    for phrase in phrases:
        phrase_strengths = [beat_strengths[i] for i in phrase if i < len(beat_strengths)]
        avg_strength = sum(phrase_strengths) / len(phrase_strengths) if phrase_strengths else 1.0
        # Adjust for beat strength.
        # NOTE(review): the 0.8..1.2 scaling only holds for strengths in
        # [0, 1]; confirm whether callers pass normalised values.
        syllables_per_beat = base_syllables_per_beat * (0.8 + (avg_strength * 0.4))
        # Never ask for fewer than 2 syllables in a phrase.
        phrase_syllables = max(2, int(len(phrase) * syllables_per_beat))
        syllable_templates.append(str(phrase_syllables))

    if not syllable_templates:
        # An empty string would make callers crash on int("").
        return "2"
    return "-".join(syllable_templates)
# Analyze flexible structure
def analyze_flexible_structure(audio_data):
    """Analyze music structure without assuming traditional song sections.

    The audio is split into 3 segments by agglomerative clustering of its
    MFCCs, and each segment gets a syllable template derived from the beats
    that fall inside it.

    Args:
        audio_data: dict with "waveform" and "sample_rate".

    Returns:
        dict with "beats" (the ``detect_beats`` result) and "segments", a
        list of dicts with "start", "end", "duration" and
        "syllable_template".
    """
    y = audio_data["waveform"]
    sr = audio_data["sample_rate"]

    # Enhanced beat detection
    beats_info = detect_beats(y, sr)

    # Identify segments with similar audio features (using MFCC)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    segment_boundaries = librosa.segment.agglomerative(mfcc, 3)
    # agglomerative() returns only the left boundary of each segment, so the
    # last segment needs an explicit right edge (the frame count); otherwise
    # it is silently dropped.
    boundary_frames = list(segment_boundaries) + [mfcc.shape[-1]]
    segment_times = librosa.frames_to_time(boundary_frames, sr=sr)

    all_beat_times = beats_info["beat_times"]
    segments = []
    for start, end in zip(segment_times[:-1], segment_times[1:]):
        # Global indices of the beats that fall inside this segment
        segment_beats = [j for j, beat_time in enumerate(all_beat_times) if start <= beat_time < end]

        if segment_beats:
            # beat_times / beat_strengths below are re-indexed from 0 for the
            # segment, so phrase indices must be translated the same way.
            local_index = {global_idx: pos for pos, global_idx in enumerate(segment_beats)}
            segment_beats_info = {
                "beat_times": [all_beat_times[j] for j in segment_beats],
                "tempo": beats_info.get("tempo", 120)
            }
            if "beat_strengths" in beats_info:
                strengths = beats_info["beat_strengths"]
                segment_beats_info["beat_strengths"] = [
                    strengths[j] for j in segment_beats if j < len(strengths)
                ]
            if "intervals" in beats_info:
                segment_beats_info["intervals"] = beats_info["intervals"]
            if "phrases" in beats_info:
                # Keep only the beats of each phrase that lie in this segment,
                # expressed as segment-local indices.
                segment_phrases = []
                for phrase in beats_info["phrases"]:
                    local_phrase = [local_index[b] for b in phrase if b in local_index]
                    if len(local_phrase) >= 2:
                        segment_phrases.append(local_phrase)
                segment_beats_info["phrases"] = segment_phrases
            syllable_template = create_flexible_syllable_templates(segment_beats_info)
        else:
            syllable_template = "4"  # Default fallback

        segments.append({
            "start": start,
            "end": end,
            "duration": end - start,
            "syllable_template": syllable_template
        })

    return {
        "beats": beats_info,
        "segments": segments
    }
+# Enhanced estimate_syllables_per_section function
 def estimate_syllables_per_section(beats_info, sections):
     """Estimate the number of syllables needed for each section based on beats."""
     syllables_per_section = []
         # Calculate syllables based on section type and beat count
         beat_count = len(section_beats)
+        # Extract beat strengths for this section if available
+        section_beat_strengths = []
+        if "beat_strengths" in beats_info:
+            for i, beat_time in enumerate(beats_info["beat_times"]):
+                if section["start"] <= beat_time < section["end"] and i < len(beats_info["beat_strengths"]):
+                    section_beat_strengths.append(beats_info["beat_strengths"][i])
+        # Create a segment-specific beat info structure for template creation
+        segment_beats_info = {
+            "beat_times": section_beats,
+            "tempo": beats_info.get("tempo", 120)
+        }
+        if section_beat_strengths:
+            segment_beats_info["beat_strengths"] = section_beat_strengths
+        if "intervals" in beats_info:
+            segment_beats_info["intervals"] = beats_info["intervals"]
+        # Create a detailed syllable template for this section
+        syllable_template = create_flexible_syllable_templates(segment_beats_info)
+        # Calculate estimated syllable count
+        expected_counts = [int(count) for count in syllable_template.split("-")]
+        total_syllables = sum(expected_counts)
         syllables_per_section.append({
             "type": section["type"],
             "end": section["end"],
             "duration": section["duration"],
             "beat_count": beat_count,
+            "syllable_count": total_syllables,
+            "syllable_template": syllable_template
         })
     return syllables_per_section
     y = audio_data["waveform"]
     sr = audio_data["sample_rate"]
+    # Enhanced beat detection
     beats_info = detect_beats(y, sr)
     # Detect sections
     sections = detect_sections(y, sr)
+    # Create enhanced syllable info per section
     syllables_info = estimate_syllables_per_section(beats_info, sections)
+    # Get flexible structure analysis as an alternative approach
+    try:
+        flexible_structure = analyze_flexible_structure(audio_data)
+    except Exception as e:
+        print(f"Warning: Flexible structure analysis failed: {str(e)}")
+        flexible_structure = None
     return {
         "beats": beats_info,
         "sections": sections,
+        "syllables": syllables_info,
+        "flexible_structure": flexible_structure
     }
# Verify syllable counts
def verify_flexible_syllable_counts(lyrics, templates):
    """Check generated lyric lines against their syllable templates.

    The n-th lyric line is compared with the n-th template. Blank lines and
    bracketed section labels such as "[Verse]" are not lyric lines and do
    not consume a template. A template that cannot be parsed (empty string,
    non-numeric part, unsupported type) is skipped instead of aborting the
    whole verification.

    Args:
        lyrics: Generated lyrics, lines separated by newlines.
        templates: Sequence of templates; each is either a "7-4" style
            string or a dict carrying one under "syllable_template".

    Returns:
        ``lyrics`` unchanged when every checked line is within 2 syllables
        of its template total; otherwise ``lyrics`` with a note block
        appended that lists the mismatching lines.
    """
    lyric_lines = [
        stripped
        for stripped in (raw.strip() for raw in lyrics.split("\n"))
        if stripped and not _is_section_label(stripped)
    ]

    verification_notes = []
    # zip() stops at the shorter sequence: surplus lines have no template.
    for line_number, (line, template) in enumerate(zip(lyric_lines, templates), start=1):
        total_expected = _expected_syllables(template)
        if total_expected is None:
            continue
        actual_count = count_syllables(line)
        if abs(actual_count - total_expected) > 2:  # Allow small differences
            verification_notes.append(
                f"Line {line_number}: Expected {total_expected} syllables, got {actual_count}"
            )

    # If we found issues, add them as notes at the end of the lyrics
    if verification_notes:
        lyrics += "\n\n[Note: Potential rhythm mismatches in these lines:]\n"
        lyrics += "\n".join(verification_notes)
        lyrics += "\n[You may want to adjust these lines to match the music's rhythm better]"
    return lyrics


def _is_section_label(line):
    """Return True for a bracketed label line such as "[Chorus]"."""
    return line.startswith("[") and line.endswith("]")


def _expected_syllables(template):
    """Return the total syllables a template asks for, or None if unparseable."""
    if isinstance(template, dict):
        template = template.get("syllable_template")
    if not isinstance(template, str):
        return None
    try:
        return sum(int(count) for count in template.split("-"))
    except ValueError:
        return None
+# Modified generate_lyrics function
+def generate_lyrics(genre, duration, emotion_results, song_structure=None):
+    """Generate lyrics based on the genre, emotion, and structure analysis."""
     # Extract emotion and theme data from analysis results
     primary_emotion = emotion_results["emotion_analysis"]["primary_emotion"]
     primary_theme = emotion_results["theme_analysis"]["primary_theme"]
     key = emotion_results["tonal_analysis"]["key"]
     mode = emotion_results["tonal_analysis"]["mode"]
+    # Format syllable templates for the prompt
+    syllable_guidance = ""
+    templates_for_verification = []
+    if song_structure:
+        # Try to use flexible structure if available
+        if "flexible_structure" in song_structure and song_structure["flexible_structure"]:
+            flexible = song_structure["flexible_structure"]
+            if "segments" in flexible and flexible["segments"]:
+                syllable_guidance = "Follow these exact syllable patterns for each line:\n"
+                for i, segment in enumerate(flexible["segments"]):
+                    if i < 15:  # Limit to 15 lines to keep prompt manageable
+                        syllable_guidance += f"Line {i+1}: {segment['syllable_template']} syllables\n"
+                        templates_for_verification.append(segment["syllable_template"])
+        # Fallback to traditional sections if needed
+        elif "syllables" in song_structure and song_structure["syllables"]:
+            syllable_guidance = "Follow these syllable patterns for each section:\n"
+            for section in song_structure["syllables"]:
+                if "syllable_template" in section:
+                    syllable_guidance += f"[{section['type'].capitalize()}]: {section['syllable_template']} syllables per line\n"
+                elif "syllable_count" in section:
+                    syllable_guidance += f"[{section['type'].capitalize()}]: ~{section['syllable_count']} syllables total\n"
+                if "syllable_template" in section:
+                    templates_for_verification.append(section)
+    # If we couldn't get specific templates, use general guidance
+    if not syllable_guidance:
+        syllable_guidance = "Make sure each line has an appropriate syllable count for singing:\n"
+        syllable_guidance += "- For faster sections (tempo > 120 BPM): 4-6 syllables per line\n"
+        syllable_guidance += "- For medium tempo sections: 6-8 syllables per line\n"
+        syllable_guidance += "- For slower sections (tempo < 90 BPM): 8-10 syllables per line\n"
+    # Add examples of syllable counting
+    syllable_guidance += "\nExamples of syllable counting:\n"
+    syllable_guidance += "- 'I can see the light' = 4 syllables\n"
+    syllable_guidance += "- 'When it fades a-way' = 4 syllables\n"
+    syllable_guidance += "- 'The sun is shin-ing bright to-day' = 8 syllables\n"
+    syllable_guidance += "- 'I'll be wait-ing for you' = 6 syllables\n"
+    # Determine if we should use traditional sections or not
+    use_sections = True
+    if song_structure and "flexible_structure" in song_structure and song_structure["flexible_structure"]:
+        # If we have more than 4 segments, it's likely not a traditional song structure
+        if "segments" in song_structure["flexible_structure"]:
+            segments = song_structure["flexible_structure"]["segments"]
+            if len(segments) > 4:
+                use_sections = False
+    # Create enhanced prompt for the LLM
+    if use_sections:
+        # Traditional approach with sections
+        # Calculate appropriate lyrics length and section distribution
+        try:
+            if song_structure and "beats" in song_structure:
+                beats_info = song_structure["beats"]
+                tempo = beats_info.get("tempo", 120)
+                time_signature = beats_info.get("time_signature", 4)
+                lines_structure = calculate_lyrics_length(duration, tempo, time_signature)
+                # Handle both possible return types
+                if isinstance(lines_structure, dict):
+                    total_lines = lines_structure["lines_count"]
+                    # Extract section line counts if available
+                    verse_lines = 0
+                    chorus_lines = 0
+                    bridge_lines = 0
+                    for section in lines_structure["sections"]:
+                        if section["type"] == "verse":
+                            verse_lines = section["lines"]
+                        elif section["type"] == "chorus":
+                            chorus_lines = section["lines"]
+                        elif section["type"] == "bridge":
+                            bridge_lines = section["lines"]
+                else:
+                    # The function returned just an integer (old behavior)
+                    total_lines = lines_structure
+                    # Default section distribution based on total lines
+                    if total_lines <= 6:
+                        verse_lines = 2
+                        chorus_lines = 2
+                        bridge_lines = 0
+                    elif total_lines <= 10:
+                        verse_lines = 3
+                        chorus_lines = 2
+                        bridge_lines = 0
+                    else:
+                        verse_lines = 3
+                        chorus_lines = 2
+                        bridge_lines = 2
+            else:
+                # Fallback to simple calculation
+                total_lines = max(4, int(duration / 10))
+                # Default section distribution
+                if total_lines <= 6:
+                    verse_lines = 2
+                    chorus_lines = 2
+                    bridge_lines = 0
+                elif total_lines <= 10:
+                    verse_lines = 3
+                    chorus_lines = 2
+                    bridge_lines = 0
+                else:
+                    verse_lines = 3
+                    chorus_lines = 2
+                    bridge_lines = 2
+        except Exception as e:
+            print(f"Error calculating lyrics length: {str(e)}")
+            total_lines = max(4, int(duration / 10))
+            # Default section distribution
+            verse_lines = 3
+            chorus_lines = 2
+            bridge_lines = 0
+        prompt = f"""
 You are a talented songwriter who specializes in {genre} music.
 Write original {genre} song lyrics for a song that is {duration:.1f} seconds long.
 - Primary emotion: {primary_emotion}
 - Primary theme: {primary_theme}
+IMPORTANT: The lyrics must match the rhythm of the music exactly!
+{syllable_guidance}
+When writing the lyrics:
+1. Count syllables carefully for each line to match the specified pattern
+2. Ensure words fall naturally on the beat
+3. Place stressed syllables on strong beats
+4. Create a coherent theme throughout the lyrics
 The lyrics should:
 - Perfectly capture the essence and style of {genre} music
 - Express the {primary_emotion} emotion and {primary_theme} theme
+- Be approximately {total_lines} lines long
 - Follow this structure:
   * Verse: {verse_lines} lines
   * Chorus: {chorus_lines} lines
+  * {f'Bridge: {bridge_lines} lines' if bridge_lines > 0 else ''}
 - Be completely original
 - Match the song duration of {duration:.1f} seconds
+Your lyrics:
+"""
+    else:
+        # Flexible approach without traditional sections
+        prompt = f"""
+You are a talented songwriter who specializes in {genre} music.
+Write original lyrics that match the rhythm of a {genre} music segment that is {duration:.1f} seconds long.
+Music analysis has detected the following qualities:
+- Tempo: {tempo:.1f} BPM
+- Key: {key} {mode}
+- Primary emotion: {primary_emotion}
+- Primary theme: {primary_theme}
+IMPORTANT: The lyrics must match the rhythm of the music exactly!
+{syllable_guidance}
+When writing the lyrics:
+1. Count syllables carefully for each line to match the specified pattern
+2. Ensure words fall naturally on the beat
+3. Place stressed syllables on strong beats
+4. Create coherent lyrics that would work for this music segment
+The lyrics should:
+- Perfectly capture the essence and style of {genre} music
+- Express the {primary_emotion} emotion and {primary_theme} theme
+- Be completely original
+- Maintain a consistent theme throughout
+- Match the audio segment duration of {duration:.1f} seconds
+DON'T include any section labels like [Verse] or [Chorus] unless specifically instructed.
+Instead, write lyrics that flow naturally and match the music's rhythm.
 Your lyrics:
 """
     # Extract and clean generated lyrics
     lyrics = response[0]["generated_text"].strip()
+    # Verify syllable counts if we have templates
+    if templates_for_verification:
+        lyrics = verify_flexible_syllable_counts(lyrics, templates_for_verification)
+    # Add section labels if they're not present and we're using the traditional approach
+    if use_sections and "Verse" not in lyrics and "Chorus" not in lyrics:
         lines = lyrics.split('\n')
         formatted_lyrics = []
+        line_count = 0
         for i, line in enumerate(lines):
+            if not line.strip():
+                formatted_lyrics.append(line)
+                continue
+            if line_count == 0:
                 formatted_lyrics.append("[Verse]")
+            elif line_count == verse_lines:
                 formatted_lyrics.append("\n[Chorus]")
+            elif line_count == verse_lines + chorus_lines and bridge_lines > 0:
                 formatted_lyrics.append("\n[Bridge]")
             formatted_lyrics.append(line)
+            line_count += 1
         lyrics = '\n'.join(formatted_lyrics)
     return lyrics
             # Continue with a simpler approach if this fails
             song_structure = None
+        # Generate lyrics based on top genre, emotion analysis, and song structure
         try:
             primary_genre, _ = top_genres[0]
+            lyrics = generate_lyrics(primary_genre, audio_data["duration"], emotion_results, song_structure)
         except Exception as e:
             print(f"Error generating lyrics: {str(e)}")
             lyrics = f"Error generating lyrics: {str(e)}"
                     emotion_text += "\n\nSong Structure:\n"
                     for section in song_structure["syllables"]:
                         emotion_text += f"- {section['type'].capitalize()}: {section['start']:.1f}s to {section['end']:.1f}s "
+                        emotion_text += f"({section['duration']:.1f}s, {section['beat_count']} beats, "
+                        if "syllable_template" in section:
+                            emotion_text += f"template: {section['syllable_template']})\n"
+                        else:
+                            emotion_text += f"~{section['syllable_count']} syllables)\n"
+                    # Add flexible structure info if available
+                    if "flexible_structure" in song_structure and song_structure["flexible_structure"]:
+                        flexible = song_structure["flexible_structure"]
+                        if "segments" in flexible and flexible["segments"]:
+                            emotion_text += "\nDetailed Rhythm Analysis:\n"
+                            for i, segment in enumerate(flexible["segments"][:5]):  # Show first 5 segments
+                                emotion_text += f"- Segment {i+1}: {segment['start']:.1f}s to {segment['end']:.1f}s, "
+                                emotion_text += f"pattern: {segment['syllable_template']}\n"
+                            if len(flexible["segments"]) > 5:
+                                emotion_text += f"  (+ {len(flexible['segments']) - 5} more segments)\n"
                 except Exception as e:
                     print(f"Error displaying song structure: {str(e)}")
                     # Continue without showing structure details
     2. The system will classify the genre using the dima806/music_genres_classification model
     3. The system will analyze the musical emotion and theme using advanced audio processing
     4. The system will identify the song structure, beats, and timing patterns
+    5. The system will create syllable templates that precisely match the rhythm of the music
+    6. Based on the detected genre, emotion, and syllable templates, it will generate lyrics that align perfectly with the beats
+    7. The system verifies syllable counts to ensure the generated lyrics can be sung naturally with the music
     """)
 # Launch the app

requirements.txt CHANGED Viewed

@@ -11,4 +11,5 @@ sentencepiece>=0.1.99
 safetensors>=0.4.1
 scipy>=1.12.0
 soundfile>=0.12.1
-matplotlib>=3.7.0

 safetensors>=0.4.1
 scipy>=1.12.0
 soundfile>=0.12.1
+matplotlib>=3.7.0
+pronouncing>=0.2.0

utils.py CHANGED Viewed

@@ -37,39 +37,53 @@ def extract_mfcc_features(y, sr, n_mfcc=20):
         # Return a fallback feature vector if extraction fails
         return np.zeros(n_mfcc)
-def calculate_lyrics_length(duration):
-    """
-    Calculate appropriate lyrics length based on audio duration.
-    Uses a more conservative calculation that generates shorter lyrics:
-    - Average words per line (8-10 words)
-    - Reduced words per minute (45 words instead of 135)
-    - Simplified song structure
-    """
-    # Convert duration to minutes
-    duration_minutes = duration / 60
-    # Calculate total words based on duration
-    # Using 45 words per minute (reduced from 135)
-    total_words = int(duration_minutes * 90)
-    # Calculate number of lines
-    # Assuming 8-10 words per line
-    words_per_line = 9  # average
-    total_lines = total_words // words_per_line
-    # Adjust for song structure with shorter lengths
-    if total_lines < 6:
-        # Very short song - keep it simple
-        return max(2, total_lines)
-    elif total_lines < 10:
-        # Short song - one verse and chorus
-        return min(6, total_lines)
-    elif total_lines < 15:
-        # Medium song - two verses and chorus
-        return min(10, total_lines)
     else:
-        # Longer song - two verses, chorus, and bridge
-        return min(15, total_lines)
 def format_genre_results(top_genres):
     """Format genre classification results for display."""

         # Return a fallback feature vector if extraction fails
         return np.zeros(n_mfcc)
def calculate_lyrics_length(duration, tempo=100, time_signature=4):
    """Calculate a lyrics line count and, when possible, a section layout.

    Args:
        duration: Audio length in seconds.
        tempo: Tempo in beats per minute.
        time_signature: Number of beats per measure.

    Returns:
        The plain line count, ``max(4, int(duration / 10))``, when ``tempo``
        or ``time_signature`` is not an ``int``/``float``, or when
        ``time_signature`` is not positive (no measure count can be derived).

        Otherwise a dict with the keys ``"total_measures"``,
        ``"lines_count"`` and ``"sections"``; each section is a dict with
        ``"type"``, ``"lines"`` and ``"measures"``.

    Note:
        Both defaults are numeric, so a call that passes only ``duration``
        returns the dict, not the bare integer.
    """
    # Simple duration-based estimate: one line per 10 seconds, at least 4.
    lines_count = max(4, int(duration / 10))

    numeric = (int, float)
    if not isinstance(tempo, numeric) or not isinstance(time_signature, numeric):
        return lines_count
    # A zero time signature would divide by zero below and a negative one
    # yields negative measures; fall back to the plain line count instead.
    if time_signature <= 0:
        return lines_count

    # Order of operations is kept as tempo/60 -> * duration -> / signature
    # so the floating-point results match the previous implementation.
    beats_per_second = tempo / 60
    total_beats = duration * beats_per_second
    total_measures = total_beats / time_signature

    # Lines per section, chosen by overall song length. A bridge is only
    # included for the longest songs.
    if lines_count <= 6:
        line_plan = {"verse": 2, "chorus": 2}
    elif lines_count <= 10:
        line_plan = {"verse": 3, "chorus": 2}
    else:
        line_plan = {"verse": 3, "chorus": 2, "bridge": 2}

    # Fraction of the total measures given to each section type. The shares
    # intentionally do not sum to 1; the remainder is left unassigned.
    measure_share = {"verse": 0.4, "chorus": 0.3, "bridge": 0.2}

    return {
        "total_measures": int(total_measures),
        "lines_count": lines_count,  # Include the original line count
        "sections": [
            {
                "type": section_type,
                "lines": section_lines,
                "measures": int(total_measures * measure_share[section_type]),
            }
            for section_type, section_lines in line_plan.items()
        ],
    }
 def format_genre_results(top_genres):
     """Format genre classification results for display."""