Spaces:
Running
on
Zero
Running
on
Zero
| import os, sys | |
| import subprocess, tqdm | |
| from concurrent.futures import ThreadPoolExecutor | |
def align_folders(audio_root, subfolder, subsubfolder):
    """Run ``mfa align`` on one ``<audio_root>/<subfolder>/<subsubfolder>`` folder.

    Alignments are written as CSV files to
    ``<parent of audio_root>/alignment/<subfolder>/<subsubfolder>``; that
    directory is created when it does not exist yet.

    Args:
        audio_root: Directory containing the ``<subfolder>/<subsubfolder>``
            audio tree. A trailing path separator is tolerated.
        subfolder: Name of the first-level directory under ``audio_root``.
        subsubfolder: Name of the second-level directory under ``subfolder``.

    Raises:
        subprocess.CalledProcessError: If ``mfa`` exits with a non-zero status.
        FileNotFoundError: If the ``mfa`` executable cannot be found.
    """
    # Strip trailing separators before taking the parent: os.path.dirname of
    # "/data/audio/" is "/data/audio", which would place the alignments inside
    # the audio tree instead of next to it.
    trimmed_root = audio_root.rstrip(os.sep) or audio_root
    file_root = os.path.dirname(trimmed_root)

    # os.path.join keeps the result relative when file_root is empty (a bare
    # relative audio_root); plain "/" concatenation would yield "/alignment/...".
    out_folder = os.path.join(file_root, "alignment", subfolder, subsubfolder)
    os.makedirs(out_folder, exist_ok=True)

    corpus_folder = os.path.join(audio_root, subfolder, subsubfolder)
    command = [
        "mfa", "align", "--single_speaker", "-j", "8", "--clean",
        corpus_folder, "english_us_arpa", "english_us_arpa",
        out_folder, "--beam", "50", "--retry_beam", "400", "--output_format", "csv",
    ]

    # check=True surfaces a failed alignment as CalledProcessError.
    subprocess.run(command, check=True)
def _parse_partition(partition):
    """Turn a 1-based ``"index/total"`` spec into a ``(start, step)`` slice pair."""
    try:
        index_text, total_text = str(partition).split("/")
        index, total = int(index_text), int(total_text)
    except ValueError as err:
        raise ValueError(
            f"partition must look like 'index/total' (e.g. '1/10'), got {partition!r}"
        ) from err
    if total < 1 or not 1 <= index <= total:
        raise ValueError(
            f"partition must satisfy 1 <= index <= total, got {partition!r}"
        )
    return index - 1, total


def _collect_speaker_chunks(audio_root, max_spk):
    """Return ``(subfolder, speaker_dirs)`` chunks of at most ``max_spk`` folders each."""
    chunks = []
    # Sorted listings make the chunk order reproducible, so jobs started with
    # different ``partition`` values see the same chunk list and do not overlap.
    for subfolder in sorted(os.listdir(audio_root)):
        subfolder_path = os.path.join(audio_root, subfolder)
        if not os.path.isdir(subfolder_path):
            continue
        candidates = (
            os.path.join(subfolder_path, name)
            for name in sorted(os.listdir(subfolder_path))
        )
        speaker_dirs = [path for path in candidates if os.path.isdir(path)]
        chunks.extend(
            (subfolder, speaker_dirs[i:i + max_spk])
            for i in range(0, len(speaker_dirs), max_spk)
        )
    return chunks


def main(file_root = "/data/scratch/pyp/datasets/librilight/librilight_example_preprocessed", max_parallel_jobs=10, max_spk=100, partition="1/10", n_workers=64):
    """Align one partition of ``<file_root>/audio`` with ``mfa align``.

    Second-level folders under ``<file_root>/audio/<subfolder>/`` are grouped
    per subfolder into chunks of at most ``max_spk`` folders. The chunks that
    belong to ``partition`` are processed one by one: unreadable ``.flac``
    files are deleted, the chunk's folders are symlinked into a scratch
    directory under ``<file_root>/softlink_audio``, ``mfa align`` is run on
    that directory, and the scratch directory is removed again. Results go to
    ``<file_root>/alignment/<subfolder>``. A chunk is skipped when every one
    of its output folders already exists.

    Args:
        file_root: Dataset root containing the ``audio`` directory.
        max_parallel_jobs: Unused; kept so existing callers keep working.
        max_spk: Maximum number of second-level folders per ``mfa`` run.
        partition: 1-based ``"index/total"`` selector; chunk ``k`` (0-based)
            is processed when ``k % total == index - 1``.
        n_workers: Worker count for both the corruption scan and ``mfa -j``.

    Raises:
        ValueError: If ``partition`` is malformed or out of range, or if
            ``max_spk`` is smaller than 1.
        FileNotFoundError: If the ``mfa`` executable cannot be found.
    """
    # Validate the cheap arguments before touching the filesystem.
    start, step = _parse_partition(partition)
    if max_spk < 1:
        raise ValueError(f"max_spk must be at least 1, got {max_spk!r}")

    audio_root = os.path.join(file_root, "audio")
    cur_tasks = _collect_speaker_chunks(audio_root, max_spk)[start::step]

    # Heavy / third-party imports stay local, as in the original script.
    import glob
    import secrets
    import shutil
    import string

    import soundfile
    from joblib import Parallel, delayed

    def delete_corrupted(fn):
        """Delete ``fn`` when soundfile cannot read it."""
        try:
            soundfile.read(fn)
        except Exception:
            # Not a bare ``except``: an interrupt must never delete a good file.
            print(f"removing corrupted file: {fn}")
            os.remove(fn)

    alphabet = string.ascii_letters + string.digits
    for sub, task in tqdm.tqdm(cur_tasks):
        out_folder = os.path.join(file_root, "alignment", sub)

        # Skip finished chunks before creating any scratch directory, so
        # re-runs do not leave empty folders behind.
        if all(
            os.path.isdir(os.path.join(out_folder, os.path.basename(item)))
            for item in task
        ):
            continue

        # Audio files are looked up two directory levels below each folder.
        all_audio_files = [
            audiofile
            for item in task
            for audiofile in glob.glob(item + "/*/*.flac")
        ]
        Parallel(n_jobs=n_workers)(
            delayed(delete_corrupted)(audiofn) for audiofn in all_audio_files
        )

        random_string = "".join(secrets.choice(alphabet) for _ in range(10))
        temp_folder = os.path.join(file_root, "softlink_audio", random_string)
        os.makedirs(temp_folder, exist_ok=True)
        try:
            # Link the chunk's folders into one directory so a single mfa run
            # covers all of them while writing into the real output folder.
            for item in task:
                os.symlink(item, os.path.join(temp_folder, os.path.basename(item)))

            # An argument list (no shell) keeps paths with spaces or shell
            # metacharacters intact.
            command = [
                "mfa", "align", "-j", str(n_workers),
                temp_folder, "english_us_arpa", "english_us_arpa", out_folder,
                "--beam", "50", "--retry_beam", "200",
                "--output_format", "csv", "--quiet", "--use_mp",
                # NOTE(review): this mfa working directory is never removed
                # here, matching the original behavior — confirm it is wanted.
                "--temporary_directory", f"{temp_folder}_temp",
            ]
            # A failed chunk is reported but does not abort the remaining ones.
            result = subprocess.run(command, check=False)
            if result.returncode != 0:
                print(f"mfa align exited with status {result.returncode} for {out_folder}")
        finally:
            # Only the symlinks are removed; rmtree does not follow them.
            shutil.rmtree(temp_folder, ignore_errors=True)
if __name__ == "__main__":
    # Script entry point: hand main() to python-fire.
    from fire import Fire

    Fire(main)