Spaces:
Running
on
Zero
Running
on
Zero
| """preprocess_rwc_pop.py""" | |
| import os | |
| import json | |
| import csv | |
| from typing import Dict, List, Tuple | |
| import numpy as np | |
| from utils.audio import get_audio_file_info, load_audio_file | |
| from utils.midi import midi2note, note_event2midi | |
| from utils.note2event import note2note_event, sort_notes, validate_notes, trim_overlapping_notes, extract_program_from_notes | |
| from utils.event2note import event2note_event | |
| from utils.note_event_dataclasses import Note, NoteEvent | |
| from utils.utils import note_event2token2note_event_sanity_check | |
| from mido import Message, MidiFile | |
# IDs of the files listed as having no bass. Not referenced by the code visible
# in this file. NOTE(review): presumably these 10 exclusions are why the
# preprocessing step expects exactly 90 entries -- confirm.
ID_NO_BASS = ['071', '072', '073', '074', '075', '076', '077', '078', '079', '080']  # 10 files
def check_file_existence(file: str) -> bool:
    """Return True when `file` exists and is not too short.

    Args:
        file: Path to the audio file to check.

    Returns:
        False if the path does not exist, or if the second value reported by
        `get_audio_file_info` is below ``10 * 16000`` (a message is printed
        in that case); True otherwise.
    """
    if not os.path.exists(file):
        return False
    # NOTE(review): 10 * 16000 is presumably 10 seconds at 16 kHz -- confirm.
    if get_audio_file_info(file)[1] < 10 * 16000:
        print(f'File {file} is too short.')
        return False
    return True
def create_note_event_and_note_from_midi(mid_file: str,
                                         id: str,
                                         ignore_pedal: bool = True) -> Tuple[Dict, Dict]:
    """Parse a MIDI file into a notes dict and a note-events dict.

    Args:
        mid_file: Path of the MIDI file passed to `midi2note`.
        id: Identifier stored under the 'rwc_pop_id' key of both results.
        ignore_pedal: Forwarded to `midi2note`.

    Returns:
        notes (dict): metadata plus the parsed notes under 'notes'.
        note_events (dict): the same metadata plus the result of
            `note2note_event(notes)` under 'note_events'.
    """
    midi2note_options = dict(
        binary_velocity=True,
        ch_9_as_drum=True,
        trim_overlap=True,
        fix_offset=True,
        quantize=True,
        verbose=0,
        minimum_offset_sec=0.01,
        drum_offset_sec=0.01,
        ignore_pedal=ignore_pedal,
        return_programs=True,
    )
    notes, dur_sec, programs = midi2note(mid_file, **midi2note_options)

    # One flag per entry of `programs`; slot 9 is set when any note is a drum.
    # NOTE(review): slot 9 presumably corresponds to the MIDI drum channel
    # selected by ch_9_as_drum -- confirm against midi2note.
    has_drum = any(note.is_drum for note in notes)
    is_drum = [0] * len(programs)
    if has_drum:
        is_drum[9] = 1

    # Metadata shared by both returned dictionaries.
    metadata = {
        'rwc_pop_id': id,
        'program': programs,
        'is_drum': is_drum,
        'duration_sec': dur_sec,
    }
    notes_dict = {**metadata, 'notes': notes}
    note_events_dict = {**metadata, 'note_events': note2note_event(notes)}
    return notes_dict, note_events_dict
def preprocess_rwc_pop16k(data_home: os.PathLike, dataset_name='rwc_pop') -> None:
    """Create note/note-event files and the bass index for the 16 kHz RWC-Pop data.

    Reads ``wav_to_midi_filename_mapping.csv`` from
    ``<data_home>/<dataset_name>_yourmt3_16k`` and pairs each mix wav with
    ``MIDI-Bass-Octave-fixed-v2/<id>_bass.mid``. For every pair it writes
    ``<id>_bass_notes.npy``, ``<id>_bass_note_events.npy`` and
    ``<id>_bass_bpm120.mid`` next to the MIDI file. Finally it writes
    ``rwc_pop_bass_file_list.json`` into ``<data_home>/yourmt3_indexes``.

    Args:
        data_home: Root data directory (a str or path-like object).
        dataset_name: Prefix of the dataset directory under `data_home`.

    Raises:
        ValueError: If the mapping CSV has no header row.
        AssertionError: If a mix audio file is missing or too short, or if
            the number of collected entries differs from the expected 90.
    """
    expected_num_files = 90

    # Directory and file paths
    base_dir = os.path.join(data_home, dataset_name + '_yourmt3_16k')
    output_index_dir = os.path.join(data_home, 'yourmt3_indexes')
    os.makedirs(output_index_dir, exist_ok=True)

    # Load CSV: construct id to midi/wav dictionary
    csv_file = os.path.join(base_dir, 'wav_to_midi_filename_mapping.csv')
    rwc_bass = {}
    # newline='' lets the csv module handle line endings itself.
    with open(csv_file, 'r', newline='') as f:
        reader = csv.reader(f)
        headers = next(reader, None)
        if headers is None:
            raise ValueError(f'{csv_file} is empty.')
        for row in reader:
            if not row:
                continue  # tolerate blank lines, e.g. a trailing newline
            song_id = row[2]

            # Header cells are used as the prefix of the directory name and
            # the suffix of the wav file name.
            mix_audio_file = os.path.join(base_dir, headers[0] + row[0],
                                          row[1] + ' ' + headers[1] + '.wav')
            # Raised explicitly (instead of `assert`) so that the check still
            # runs under `python -O`; the exception type is kept unchanged.
            if not check_file_existence(mix_audio_file):
                raise AssertionError(
                    f'Mix audio file {mix_audio_file} is missing or too short.')

            mid_file = os.path.join(base_dir, 'MIDI-Bass-Octave-fixed-v2',
                                    song_id + '_bass.mid')
            if not os.path.exists(mid_file):
                print(mid_file, "does not exist")
                continue

            # Strip only the extension; str.replace would also rewrite any
            # other '.mid' occurring earlier in the path.
            mid_stem = os.path.splitext(mid_file)[0]
            rwc_bass[song_id] = {
                'rwc_pop_id': song_id,
                'n_frames': get_audio_file_info(mix_audio_file)[1],
                'mix_audio_file': mix_audio_file,
                'notes_file': mid_stem + '_notes.npy',
                'note_events_file': mid_stem + '_note_events.npy',
                'midi_file': mid_file,
                'program': None,
                'is_drum': None,
            }

    if len(rwc_bass) != expected_num_files:
        raise AssertionError(
            f'Expected {expected_num_files} files, but found {len(rwc_bass)}.')

    # Create note and note_event files
    for song_id, entry in rwc_bass.items():
        midi_file = entry['midi_file']
        notes_file = entry['notes_file']
        note_events_file = entry['note_events_file']

        notes, note_events = create_note_event_and_note_from_midi(
            midi_file, song_id, ignore_pedal=True)

        # Update programs and is_drum
        entry['program'] = notes['program']
        entry['is_drum'] = notes['is_drum']

        # Save note and note_event files
        np.save(notes_file, notes, allow_pickle=True, fix_imports=False)
        print(f'Created {notes_file}')
        np.save(note_events_file, note_events, allow_pickle=True, fix_imports=False)
        print(f'Created {note_events_file}')

        # Saving bpm 120 midi files
        bpm120_midi_file = os.path.splitext(midi_file)[0] + '_bpm120.mid'
        note_event2midi(note_events['note_events'], bpm120_midi_file)
        print(f'Created {bpm120_midi_file}')

    # Save index file: entries are keyed by their position in the CSV order.
    split = 'bass'
    output_index_file = os.path.join(output_index_dir, f'rwc_pop_{split}_file_list.json')
    file_list = dict(enumerate(rwc_bass.values()))
    with open(output_index_file, 'w') as f:
        json.dump(file_list, f, indent=4)
    print(f'Created {output_index_file}')