lokman2k5 committed
Commit e3807d4 · 1 parent: 53d2e48

Add application file

Files changed (1): app.py +347 −0
app.py CHANGED
@@ -0,0 +1,347 @@
import spaces

import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), 'amt/src')))
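# The vendored YourMT3 sources are expected under amt/src (assumption based on
# the path above); adding it to sys.path lets html_helper and model_helper
# below resolve from that tree.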

import subprocess
from typing import Dict, Literal  # Tuple and ctypes.ArgumentError were unused

from html_helper import *
from model_helper import *

import torchaudio
import glob
import gradio as gr
from gradio_log import Log
from pathlib import Path

# gradio_log
log_file = 'amt/log.txt'
Path(log_file).touch()

# @title Load Checkpoint
model_name = 'YPTF.MoE+Multi (noPS)'  # @param ["YMT3+", "YPTF+Single (noPS)", "YPTF+Multi (PS)", "YPTF.MoE+Multi (noPS)", "YPTF.MoE+Multi (PS)"]
precision = '16'  # if torch.cuda.is_available() else '32'  # @param ["32", "bf16-mixed", "16"]
project = '2024'

if model_name == "YMT3+":
    checkpoint = "[email protected]"
    args = [checkpoint, '-p', project, '-pr', precision]
elif model_name == "YPTF+Single (noPS)":
    checkpoint = "ptf_all_cross_rebal5_mirst_xk2_edr005_attend_c_full_plus_b100@model.ckpt"
    args = [checkpoint, '-p', project, '-enc', 'perceiver-tf', '-ac', 'spec',
            '-hop', '300', '-atc', '1', '-pr', precision]
elif model_name == "YPTF+Multi (PS)":
    checkpoint = "mc13_256_all_cross_v6_xk5_amp0811_edr005_attend_c_full_plus_2psn_nl26_sb_b26r_800k@model.ckpt"
    args = [checkpoint, '-p', project, '-tk', 'mc13_full_plus_256',
            '-dec', 'multi-t5', '-nl', '26', '-enc', 'perceiver-tf',
            '-ac', 'spec', '-hop', '300', '-atc', '1', '-pr', precision]
elif model_name == "YPTF.MoE+Multi (noPS)":
    checkpoint = "mc13_256_g4_all_v7_mt3f_sqr_rms_moe_wf4_n8k2_silu_rope_rp_b36_nops@last.ckpt"
    args = [checkpoint, '-p', project, '-tk', 'mc13_full_plus_256', '-dec', 'multi-t5',
            '-nl', '26', '-enc', 'perceiver-tf', '-sqr', '1', '-ff', 'moe',
            '-wf', '4', '-nmoe', '8', '-kmoe', '2', '-act', 'silu', '-epe', 'rope',
            '-rp', '1', '-ac', 'spec', '-hop', '300', '-atc', '1', '-pr', precision]
elif model_name == "YPTF.MoE+Multi (PS)":
    checkpoint = "mc13_256_g4_all_v7_mt3f_sqr_rms_moe_wf4_n8k2_silu_rope_rp_b80_ps2@model.ckpt"
    args = [checkpoint, '-p', project, '-tk', 'mc13_full_plus_256', '-dec', 'multi-t5',
            '-nl', '26', '-enc', 'perceiver-tf', '-sqr', '1', '-ff', 'moe',
            '-wf', '4', '-nmoe', '8', '-kmoe', '2', '-act', 'silu', '-epe', 'rope',
            '-rp', '1', '-ac', 'spec', '-hop', '300', '-atc', '1', '-pr', precision]
else:
    raise ValueError(model_name)
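
# Note: these CLI-style flags are parsed by load_model_checkpoint (in the
# vendored amt/src tree). From the pairings above: '-p' project tag, '-pr'
# precision, '-enc'/'-dec' encoder/decoder backbones, '-tk' tokenizer
# vocabulary; the remaining flags configure the Perceiver-TF/MoE architecture
# (assumption: the authoritative definitions live in amt/src).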

model = load_model_checkpoint(args=args, device="cpu")
# model.to("cuda")
# Keep model on CPU for the HuggingFace Spaces free tier
print("Model loaded on CPU for HuggingFace Spaces deployment")
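# Sketch for GPU hardware (not used on the free tier; assumes the checkpoint
# loader returns a standard torch.nn.Module):
# import torch
# if torch.cuda.is_available():
#     model = model.to("cuda")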
# @title Gradio helper


def prepare_media(source_path_or_url: os.PathLike,
                  source_type: Literal['audio_filepath', 'youtube_url'],
                  delete_video: bool = True,
                  simulate: bool = False) -> Dict:
    """Prepare media from a source path or YouTube URL, and return audio info."""
    # Get audio_file
    if source_type == 'audio_filepath':
        audio_file = source_path_or_url
    elif source_type == 'youtube_url':
        # Remove any stale download from a previous request
        # (was '/download/yt_audio.mp3', which never matched the target below)
        if os.path.exists('./downloaded/yt_audio.mp3'):
            os.remove('./downloaded/yt_audio.mp3')
        # Download from YouTube
        with open(log_file, 'w') as lf:
            audio_file = './downloaded/yt_audio'
            command = ['yt-dlp', '-x', source_path_or_url, '-f', 'bestaudio',
                       '-o', audio_file, '--audio-format', 'mp3', '--restrict-filenames',
                       '--extractor-retries', '10',
                       '--force-overwrites', '--username', 'oauth2', '--password', '', '-v']
            if simulate:
                command = command + ['-s']
            process = subprocess.Popen(command,
                                       stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)

            for line in iter(process.stdout.readline, ''):
                # Forward only the OAuth device-code prompt and final status lines to the UI log
                print(line)
                if "www.google.com/device" in line:
                    # Highlight the device-login URL (yellow) and the user code (bright red)
                    hl_text = line.replace("https://www.google.com/device",
                                           "\033[93mhttps://www.google.com/device\x1b[0m").split()
                    hl_text[-1] = "\x1b[31;1m" + hl_text[-1] + "\x1b[0m"
                    lf.write(' '.join(hl_text)); lf.flush()
                elif "Authorization successful" in line or "Video unavailable" in line:
                    lf.write(line); lf.flush()
            process.stdout.close()
            process.wait()

        audio_file += '.mp3'
    else:
        raise ValueError(source_type)

    # Create info
    info = torchaudio.info(audio_file)
    return {
        "filepath": audio_file,
        "track_name": os.path.basename(audio_file).split('.')[0],
        "sample_rate": int(info.sample_rate),
        "bits_per_sample": int(info.bits_per_sample),
        "num_channels": int(info.num_channels),
        "num_frames": int(info.num_frames),
        "duration": int(info.num_frames / info.sample_rate),
        "encoding": str.lower(info.encoding),
    }
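
# Example (hypothetical local file; values depend on the actual audio):
#   prepare_media('examples/piano.mp3', source_type='audio_filepath')
#   -> {'filepath': 'examples/piano.mp3', 'track_name': 'piano',
#       'sample_rate': 44100, 'num_channels': 2, 'duration': 30, ...}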

@spaces.GPU(duration=120)  # 2-minute budget per transcription request
def process_audio(audio_filepath, instrument_hint=None):
    if audio_filepath is None:
        return None
    try:
        print(f"Processing audio: {audio_filepath}")
        if instrument_hint and instrument_hint != "Auto (detect all instruments)":
            print(f"Using instrument hint: {instrument_hint}")

        audio_info = prepare_media(audio_filepath, source_type='audio_filepath')
        midifile = transcribe(model, audio_info, instrument_hint)
        midifile = to_data_url(midifile)
        return create_html_from_midi(midifile)  # html midiplayer
    except Exception as e:
        print(f"Error in process_audio: {e}")
        import traceback
        traceback.print_exc()
        return f"<p style='color: red;'>Error processing audio: {str(e)}</p>"

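# Shared pipeline (process_audio above, process_video below): source ->
# prepare_media() audio metadata -> transcribe() to a MIDI file ->
# to_data_url() -> create_html_from_midi() for the embedded MIDI player.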
# @spaces.GPU  # Comment out for Colab
def process_audio_yt_temp(youtube_url):
    # Serve pre-transcribed MIDI files for the known example URLs
    # (live YouTube downloads are blocked on this Space).
    if youtube_url is None:
        return None
    elif youtube_url == "https://youtu.be/5vJBhdjvVcE?si=s3NFG_SlVju0Iklg":
        midifile = "./mid/Free Jazz Intro Music - Piano Sway (Intro B - 10 seconds) - OurMusicBox.mid"
    elif youtube_url == "https://youtu.be/mw5VIEIvuMI?si=Dp9UFVw00Tl8CXe2":
        midifile = "./mid/Naomi Scott Speechless from Aladdin Official Video Sony vevo Music.mid"
    elif youtube_url == "https://youtu.be/OXXRoa1U6xU?si=dpYMun4LjZHNydSb":
        midifile = "./mid/Mozart_Sonata_for_Piano_and_Violin_(getmp3.pro).mid"
    else:
        return None  # no cached transcription for this URL (avoids UnboundLocalError)
    midifile = to_data_url(midifile)
    return create_html_from_midi(midifile)  # html midiplayer


@spaces.GPU(duration=120)
def process_video(youtube_url, instrument_hint=None):
    if 'youtu' not in youtube_url:
        return None
    audio_info = prepare_media(youtube_url, source_type='youtube_url')
    midifile = transcribe(model, audio_info, instrument_hint)
    midifile = to_data_url(midifile)
    return create_html_from_midi(midifile)  # html midiplayer


def play_video(youtube_url):
    if 'youtu' not in youtube_url:
        return None
    return create_html_youtube_player(youtube_url)

# def oauth_google():
#     return create_html_oauth()

AUDIO_EXAMPLES = glob.glob('examples/*.*', recursive=True)
YOUTUBE_EXAMPLES = ["https://youtu.be/5vJBhdjvVcE?si=s3NFG_SlVju0Iklg",
                    "https://youtu.be/mw5VIEIvuMI?si=Dp9UFVw00Tl8CXe2",
                    "https://youtu.be/OXXRoa1U6xU?si=dpYMun4LjZHNydSb"]
# YOUTUBE_EXAMPLES = ["https://youtu.be/5vJBhdjvVcE?si=s3NFG_SlVju0Iklg",
#                     "https://www.youtube.com/watch?v=vMboypSkj3c",
#                     "https://youtu.be/vRd5KEjX8vw?si=b-qw633ZjaX6Uxy5",
#                     "https://youtu.be/bnS-HK_lTHA?si=PQLVAab3QHMbv0S3https://youtu.be/zJB0nnOc7bM?si=EA1DN8nHWJcpQWp_",
#                     "https://youtu.be/7mjQooXt28o?si=qqmMxCxwqBlLPDI2",
#                     "https://youtu.be/mIWYTg55h10?si=WkbtKfL6NlNquvT8"]

theme = gr.Theme.from_hub("gradio/dracula_revamped")
theme.text_md = '10px'
theme.text_lg = '12px'

theme.body_background_fill_dark = '#060a1c'  # '#372037' '#a17ba5' '#73d3ac'
theme.border_color_primary_dark = '#45507328'
theme.block_background_fill_dark = '#3845685c'

theme.body_text_color_dark = 'white'
theme.block_title_text_color_dark = 'black'
theme.body_text_color_subdued_dark = '#e4e9e9'

css = """
.gradio-container {
  background: linear-gradient(-45deg, #ee7752, #e73c7e, #23a6d5, #23d5ab);
  background-size: 400% 400%;
  animation: gradient 15s ease infinite;
  height: 100vh;
}
@keyframes gradient {
  0% {background-position: 0% 50%;}
  50% {background-position: 100% 50%;}
  100% {background-position: 0% 50%;}
}
#mylog {font-size: 12pt; line-height: 1.2; min-height: 2em; max-height: 4em;}
"""

with gr.Blocks(theme=theme, css=css) as demo:

    with gr.Row():
        with gr.Column(scale=10):
            gr.Markdown(
                f"""
## 🎶YourMT3+: Multi-instrument Music Transcription with Enhanced Transformer Architectures and Cross-dataset Stem Augmentation
- Model name: `{model_name}`
<details>
<summary>▶model details◀</summary>

| **Component**         | **Details**                                               |
|-----------------------|-----------------------------------------------------------|
| Encoder backbone      | Perceiver-TF + Mixture of Experts (2/8)                   |
| Decoder backbone      | Multi-channel T5-small                                    |
| Tokenizer             | MT3 tokens with Singing extension                         |
| Dataset               | YourMT3 dataset                                           |
| Augmentation strategy | Intra-/cross-dataset stem augmentation, no pitch-shifting |
| FP Precision          | BF16-mixed for training, FP16 for inference               |
</details>

## Caution:
- For academic reproduction purposes, we strongly recommend using the [Colab Demo](https://colab.research.google.com/drive/1AgOVEBfZknDkjmSRA7leoa81a2vrnhBG?usp=sharing) with multiple checkpoints.

## YouTube transcription (Sorry!! YouTube blocked the HuggingFace IP, so we show a few pre-transcribed examples below!):
- Select one from the `Examples`, click `Get Audio from YouTube`, and then press `Transcribe`.

<div style="display: inline-block;">
  <a href="https://arxiv.org/abs/2407.04822">
    <img src="https://img.shields.io/badge/arXiv:2407.04822-B31B1B?logo=arxiv&logoColor=fff&style=plastic" alt="arXiv Badge"/>
  </a>
</div>
<div style="display: inline-block;">
  <a href="https://github.com/mimbres/YourMT3">
    <img src="https://img.shields.io/badge/GitHub-181717?logo=github&logoColor=fff&style=plastic" alt="GitHub Badge"/>
  </a>
</div>
<div style="display: inline-block;">
  <a href="https://colab.research.google.com/drive/1AgOVEBfZknDkjmSRA7leoa81a2vrnhBG?usp=sharing">
    <img src="https://img.shields.io/badge/Google%20Colab-F9AB00?logo=googlecolab&logoColor=fff&style=plastic" alt="Colab Badge"/>
  </a>
</div>
""")

    with gr.Group():

        with gr.Tab("From YouTube"):
            with gr.Column(scale=4):
                # Input URL
                youtube_url = gr.Textbox(label="YouTube Link URL",
                                         placeholder="https://youtu.be/...")
                # Display examples
                gr.Examples(examples=YOUTUBE_EXAMPLES, inputs=youtube_url)
                # Play button
                play_video_button = gr.Button("Get Audio from YouTube", variant="primary")
                # Play youtube
                youtube_player = gr.HTML(render=True)

            with gr.Column(scale=4):
                # Instrument selection for YouTube
                youtube_instrument_selector = gr.Dropdown(
                    choices=["Auto (detect all instruments)", "Vocals/Singing", "Guitar", "Piano",
                             "Violin", "Drums", "Bass", "Saxophone", "Flute"],
                    value="Auto (detect all instruments)",
                    label="Target Instrument",
                    info="Choose the specific instrument you want to transcribe"
                )
                with gr.Row():
                    # Submit button
                    transcribe_video_button = gr.Button("Transcribe", variant="primary")
                    # OAuth button
                    oauth_button = gr.Button("google.com/device", variant="primary",
                                             link="https://www.google.com/device")

            with gr.Column(scale=1):
                # Transcribe
                output_tab2 = gr.HTML(render=True)
                # video_output = gr.Text(label="Video Info")

            def process_youtube_with_instrument(url, instrument_choice):
                # Map UI choices to internal instrument hints
                instrument_map = {
                    "Auto (detect all instruments)": None,
                    "Vocals/Singing": "vocals",
                    "Guitar": "guitar",
                    "Piano": "piano",
                    "Violin": "violin",
                    "Drums": "drums",
                    "Bass": "bass",
                    "Saxophone": "saxophone",
                    "Flute": "flute"
                }
                instrument_hint = instrument_map.get(instrument_choice, None)
                # For now, using the temp function; the hint is unused until then
                return process_audio_yt_temp(url)  # TODO: Replace with process_video(url, instrument_hint)

            transcribe_video_button.click(process_youtube_with_instrument,
                                          inputs=[youtube_url, youtube_instrument_selector],
                                          outputs=output_tab2)
            # transcribe_video_button.click(process_video, inputs=youtube_url, outputs=output_tab2)
            # Play
            play_video_button.click(play_video, inputs=youtube_url, outputs=youtube_player)
            with gr.Column(scale=1):
                Log(log_file, dark=True, xterm_font_size=12, elem_id='mylog')

        with gr.Tab("Upload audio"):
            # Input
            audio_input = gr.Audio(label="Record Audio", type="filepath",
                                   show_share_button=True, show_download_button=True)

            # Instrument selection
            instrument_selector = gr.Dropdown(
                choices=["Auto (detect all instruments)", "Vocals/Singing", "Guitar", "Piano",
                         "Violin", "Drums", "Bass", "Saxophone", "Flute"],
                value="Auto (detect all instruments)",
                label="Target Instrument",
                info="Choose the specific instrument you want to transcribe, or 'Auto' for all instruments"
            )

            # Display examples
            gr.Examples(examples=AUDIO_EXAMPLES, inputs=audio_input)
            # Submit button
            transcribe_audio_button = gr.Button("Transcribe", variant="primary")
            # Transcribe
            output_tab1 = gr.HTML()

            def process_with_instrument(audio_file, instrument_choice):
                # Map UI choices to internal instrument hints
                instrument_map = {
                    "Auto (detect all instruments)": None,
                    "Vocals/Singing": "vocals",
                    "Guitar": "guitar",
                    "Piano": "piano",
                    "Violin": "violin",
                    "Drums": "drums",
                    "Bass": "bass",
                    "Saxophone": "saxophone",
                    "Flute": "flute"
                }
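                # Same mapping as in the YouTube tab; hoisting it to a
                # module-level constant would keep the two copies in sync.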
                instrument_hint = instrument_map.get(instrument_choice, None)
                print(f"UI choice: {instrument_choice} -> instrument_hint: {instrument_hint}")
                return process_audio(audio_file, instrument_hint)

            transcribe_audio_button.click(process_with_instrument,
                                          inputs=[audio_input, instrument_selector],
                                          outputs=output_tab1)

# Launch for HuggingFace Spaces
demo.launch(debug=True)
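# When running locally instead of on Spaces, a public share link can be
# requested (sketch): demo.launch(debug=True, share=True)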