Spaces:
Running
on
Zero
Running
on
Zero
| """preprocess_maestro.py""" | |
| import os | |
| import glob | |
| import re | |
| import json | |
| from typing import Dict, List, Tuple | |
| import numpy as np | |
| from utils.audio import get_audio_file_info | |
| from utils.midi import midi2note, note_event2midi | |
| from utils.note2event import note2note_event, note_event2event | |
| from utils.event2note import event2note_event | |
| from utils.note_event_dataclasses import Note, NoteEvent | |
| from utils.utils import note_event2token2note_event_sanity_check | |
| from utils.utils import assert_note_events_almost_equal | |
def create_note_event_and_note_from_midi(mid_file: str,
                                         id: str,
                                         ignore_pedal: bool = False) -> Tuple[Dict, Dict]:
    """Parse a MIDI file into a notes record and a note-events record.

    Args:
        mid_file: path of the MIDI file handed to `midi2note`.
        id: identifier stored under the 'maps_id' key of both records.
        ignore_pedal: forwarded unchanged to `midi2note`.

    Returns:
        notes (dict): metadata plus the parsed notes under 'notes'.
        note_events (dict): the same metadata plus `note2note_event(notes)`
            under 'note_events'.
    """
    midi2note_options = dict(
        binary_velocity=True,
        ch_9_as_drum=False,
        force_all_drum=False,
        force_all_program_to=0,  # always piano
        trim_overlap=True,
        fix_offset=True,
        quantize=True,
        verbose=0,
        minimum_offset_sec=0.01,
        drum_offset_sec=0.01,
        ignore_pedal=ignore_pedal,
    )
    parsed_notes, midi_dur_sec = midi2note(mid_file, **midi2note_options)

    def _record(payload_key, payload):
        # Build a fresh dict (with its own list objects) for every record so
        # the two returned records never share mutable metadata.
        return {
            'maps_id': id,
            'program': [0],
            'is_drum': [0],
            'duration_sec': midi_dur_sec + 0.01,
            payload_key: payload,
        }

    notes_record = _record('notes', parsed_notes)
    note_events_record = _record('note_events', note2note_event(parsed_notes))
    return notes_record, note_events_record
def note_event2event_sanity_check(note_events: List[NoteEvent]):
    """Verify that note events survive a round trip through events.

    The note events are encoded with `note_event2event`, decoded again with
    `event2note_event`, and the decoded list is compared against the input
    using `assert_note_events_almost_equal`.
    """
    encoded_events = note_event2event(note_events, None)
    # Only the first of the three returned values is compared.
    decoded_note_events, _, _ = event2note_event(encoded_events)
    assert_note_events_almost_equal(note_events, decoded_note_events)
def preprocess_maestro16k(data_home: os.PathLike,
                          dataset_name='maestro',
                          ignore_pedal=False,
                          sanity_check=False) -> None:
    """Preprocess the 16 kHz MAESTRO dataset and write per-split file indexes.

    Reads `maestro-v3.0.0.json` from `{data_home}/{dataset_name}_yourmt3_16k`,
    renames every `.midi` file to `.mid` in place, converts each MIDI file to
    `*_notes.npy` / `*_note_events.npy` saved next to it, and writes one JSON
    index per split into `{data_home}/yourmt3_indexes`.

    Args:
        data_home: root directory that contains the dataset directory and
            receives the `yourmt3_indexes` directory.
        dataset_name: prefix of the dataset directory and of the index files.
        ignore_pedal: forwarded to `create_note_event_and_note_from_midi`.
        sanity_check: if True, run `note_event2token2note_event_sanity_check`
            on every file before saving.

    Raises:
        AssertionError: if the file counts, file names, audio format or
            audio length do not match what is expected below.

    Splits:
        - train: 962 files
        - validation: 137 files
        - test: 177 files
        - all: 1276 files

    Writes:
        - {dataset_name}_{split}_file_list.json: a dictionary with the following keys:
        {
            index:
            {
                'maestro_id': maestro_id,
                'n_frames': (int),
                'mix_audio_file': 'path/to/mix.wav',
                'notes_file': 'path/to/notes.npy',
                'note_events_file': 'path/to/note_events.npy',
                'midi_file': 'path/to/midi.mid',
                'program': List[int],
                'is_drum': List[int], # 0 or 1
            }
        }
    """
    # Directory and file paths
    base_dir = os.path.join(data_home, dataset_name + '_yourmt3_16k')
    output_index_dir = os.path.join(data_home, 'yourmt3_indexes')
    os.makedirs(output_index_dir, exist_ok=True)

    # Get metadata. JSON is UTF-8; do not depend on the locale's default encoding.
    metadata_file = os.path.join(base_dir, 'maestro-v3.0.0.json')
    with open(metadata_file, 'r', encoding='utf-8') as f:
        _metadata = json.load(f)

    # The metadata is column-oriented: _metadata[column][str(row_index)].
    metadata = {}
    ids_all = list(range(len(_metadata['canonical_composer'])))
    assert len(ids_all) == 1276, f'expected 1276 files, found {len(ids_all)}'
    for i in ids_all:
        metadata[i] = {}
        for key in ['split', 'midi_filename', 'audio_filename', 'duration']:
            metadata[i][key] = _metadata[key][str(i)]

    # Collect ids and prepend base_dir to filenames
    ids = {'all': ids_all, 'train': [], 'validation': [], 'test': []}
    for i in ids_all:
        m = metadata[i]
        ids[m['split']].append(i)

        # Prepend base_dir
        m['midi_filename'] = os.path.join(base_dir, m['midi_filename'])
        m['audio_filename'] = os.path.join(base_dir, m['audio_filename'])

        # Rename '.midi' to '.mid'. Only the real extension is touched, so a
        # '.midi' substring elsewhere in the path (e.g. in data_home) is kept.
        midi_root, midi_ext = os.path.splitext(m['midi_filename'])
        if midi_ext == '.midi':
            mid_path = midi_root + '.mid'
            # Skip the rename when a previous run already produced the '.mid'.
            if not os.path.exists(mid_path):
                os.rename(m['midi_filename'], mid_path)
            m['midi_filename'] = mid_path

        # File sanity check
        assert os.path.exists(m['midi_filename']) and '.mid' == m['midi_filename'][-4:], \
            f"missing or misnamed MIDI file: {m['midi_filename']}"
        assert os.path.exists(m['audio_filename']) and '.wav' in m['audio_filename'], \
            f"missing or misnamed audio file: {m['audio_filename']}"

    assert len(ids['train']) == 962
    assert len(ids['validation']) == 137
    assert len(ids['test']) == 177

    # Create 'all' filelist, and process MIDI
    file_list = {}
    for i in ids['all']:
        m = metadata[i]
        mix_audio_file = m['audio_filename']
        fs, n_frames, n_channels = get_audio_file_info(mix_audio_file)
        assert fs == 16000 and n_channels == 1, \
            f'expected 16 kHz mono audio: {mix_audio_file}'
        # Use the shorter of the metadata duration and the actual audio length.
        n_frames = min(int(m['duration'] * 16000), n_frames)
        assert n_frames > 32001, f'audio too short: {mix_audio_file}'

        midi_file = m['midi_filename']
        # Derive the output names from the path stem rather than replacing
        # every '.mid' substring in the full path.
        midi_stem = os.path.splitext(midi_file)[0]
        notes_file = midi_stem + '_notes.npy'
        note_events_file = midi_stem + '_note_events.npy'
        file_list[i] = {
            'maestro_id': i,
            'n_frames': n_frames,
            'mix_audio_file': mix_audio_file,
            'notes_file': notes_file,
            'note_events_file': note_events_file,
            'midi_file': midi_file,
            'program': [0],
            'is_drum': [0],
        }

        # Process MIDI
        notes, note_events = create_note_event_and_note_from_midi(
            mid_file=midi_file, id=i, ignore_pedal=ignore_pedal)

        if sanity_check:
            # sanity check
            print(f'Sanity check for {i}: {midi_file}')
            note_event2token2note_event_sanity_check(note_events['note_events'], notes['notes'])

        # The records are dicts, so they are stored as pickled object arrays.
        np.save(notes_file, notes, allow_pickle=True, fix_imports=False)
        print(f'Created {notes_file}')
        np.save(note_events_file, note_events, allow_pickle=True, fix_imports=False)
        print(f'Created {note_events_file}')

    # Save index: each split is re-indexed from 0 in the order of its ids.
    for split in ['all', 'train', 'validation', 'test']:
        fl = {}
        for i, maestro_id in enumerate(ids[split]):
            fl[i] = file_list[maestro_id]
        output_index_file = os.path.join(output_index_dir, f'{dataset_name}_{split}_file_list.json')
        with open(output_index_file, 'w', encoding='utf-8') as f:
            json.dump(fl, f, indent=4)
        print(f'Created {output_index_file}')