Spaces:
Running
on
Zero
Running
on
Zero
| # for each each audio segment, find the non-overlapping neighboring segments | |
| from importlib.resources import path | |
| import pathlib | |
| import soundfile as sf | |
| import numpy as np | |
| import json | |
| import multiprocessing | |
| import argparse | |
| import tqdm | |
| import gzip | |
| import time | |
| import os | |
| from tokenizer import TextTokenizer, tokenize_text | |
| import glob | |
| import sys | |
| import os, random, numpy as np, socket | |
| import json | |
| import tqdm | |
| import json | |
| import tqdm | |
| def write_jsonl(data, fn): | |
| with open(fn, "w") as file: | |
| for entry in data: | |
| file.write(json.dumps(entry, ensure_ascii=False) + "\n") | |
| def read_jsonl(file_path): | |
| cur_data = [] | |
| with open(file_path, 'r', encoding='utf-8-sig') as file: | |
| for line in file: | |
| cur_data.append(json.loads(line.strip())) | |
| return cur_data | |
| from collections import defaultdict | |
| # Function to create a defaultdict recursively | |
| def nested_defaultdict(levels, inner_type): | |
| if levels <= 1: | |
| return defaultdict(inner_type) | |
| return defaultdict(lambda: nested_defaultdict(levels-1, inner_type)) | |
| def find_neighbor(args): | |
| split2manifest = { | |
| "train": [ | |
| "libriheavy_cuts_small.jsonl", | |
| "libriheavy_cuts_medium.jsonl", | |
| "libriheavy_cuts_large.jsonl", | |
| "libriheavy_long_cuts_small.jsonl", | |
| "libriheavy_long_cuts_medium.jsonl", | |
| "libriheavy_long_cuts_large.jsonl" | |
| ], | |
| "valid": [ | |
| "libriheavy_cuts_dev.jsonl", | |
| "libriheavy_long_cuts_dev.jsonl" | |
| ], | |
| "test": [ | |
| "libriheavy_cuts_test_clean.jsonl", | |
| "libriheavy_cuts_test_other.jsonl", | |
| "libriheavy_long_cuts_test_clean.jsonl", | |
| "libriheavy_long_cuts_test_other.jsonl" | |
| ] | |
| } | |
| stime = time.time() | |
| organized_data = nested_defaultdict(4, list) | |
| for mani_fn in split2manifest[args.split]: | |
| # data = open_mani(os.path.join(mani_dir, mani_fn)) | |
| mani_full_fn = os.path.join(args.manifest_dir, mani_fn) | |
| data = read_jsonl(mani_full_fn) | |
| for item in data: | |
| file_id = item['supervisions'][0]['id'] + '.flac' | |
| recording_id = item['recording']['id'] + '.flac' | |
| sizeSplit, spk, book, flac = recording_id.split("/") # e.g. 'medium/100/emerald_city_librivox_64kb_mp3/emeraldcity_01_baum_64kb' | |
| if os.path.isfile(os.path.join(args.audio_dir, recording_id)): | |
| vad = (item['start'], item['start']+item['duration']) | |
| text = item['supervisions'][0]['custom']['texts'][0] | |
| file_id = file_id.replace(".flac", "") + f"_{vad[0]:.2f}_{vad[1]:.2f}.flac" | |
| organized_data[sizeSplit][spk][book][recording_id].append({"file_id": file_id, "vad":vad, "text": text}) | |
| # # for each recording_id, find the non-overlapping neighboring segments based on vad | |
| # for sizeSplit in organized_data: | |
| # for spk in organized_data[sizeSplit]: | |
| # for book in organized_data[sizeSplit][spk]: | |
| # for recording_id in organized_data[sizeSplit][spk][book]: | |
| # segments = organized_data[sizeSplit][spk][book][recording_id] | |
| # segments.sort(key=lambda x: x['vad'][0]) | |
| # for i in range(len(segments)): | |
| # # for segment i, find the non-overlapping neighboring segments | |
| # write_fn = os.path.join(args.output_dir, f"{segments[i]['file_id'].replace('.flac', '.txt')}") | |
| # neighbors = [] | |
| # distance = [] | |
| # for j in range(len(segments)): | |
| # if segments[i]['vad'][1] < segments[j]['vad'][0] or segments[i]['vad'][0] > segments[j]['vad'][0]: | |
| # neighbors.append(segments[j]['file_id'].replace('.flac', '.txt')) | |
| # distance.append(min(abs(segments[i]['vad'][1] - segments[j]['vad'][0]), abs(segments[i]['vad'][0] - segments[j]['vad'][1]))) | |
| # # order neighbors by distance | |
| # neighbors_distance = [[x, dist] for dist, x in sorted(zip(distance, neighbors))] | |
| # os.makedirs(os.path.dirname(write_fn), exist_ok=True) | |
| # with open(write_fn, "w") as f: | |
| # # note that there might be no neighbors, in which case the file is empty | |
| # for neighbor, dist in neighbors_distance: | |
| # f.write(f"{neighbor}\t{dist}\n") | |
| # use multiprocessing.Pool for the above | |
| segments = [organized_data[sizeSplit][spk][book][recording_id] for sizeSplit in organized_data for spk in organized_data[sizeSplit] for book in organized_data[sizeSplit][spk] for recording_id in organized_data[sizeSplit][spk][book]] | |
| # only keep those that are exist | |
| print(f"originally total {len(segments)} segments") | |
| segments = [seg for seg in segments if os.path.isfile(os.path.join("/".join(args.output_dir.split("/")[:-1]),"audio", seg[0]['file_id']))] | |
| print(f"after check existance, total {len(segments)} segments") | |
| print(f"organizing took {(time.time()-stime)/60:.2f} minutes") | |
| with multiprocessing.Pool(processes=args.n_workers) as pool: | |
| for _ in tqdm.tqdm(pool.imap_unordered(find_neighbor_each, segments), total=len(segments)): | |
| pass | |
| # audio_root = "/data/scratch/pyp/datasets/librilight/preprocessed/audio" | |
| def find_neighbor_each(segments): | |
| # for each recording_id, find the non-overlapping neighboring segments based on vad | |
| # only keep segments that have audio files | |
| # actually only keep segments that have ipa_alignment files | |
| segments = [seg for seg in segments if os.path.isfile(os.path.join("/".join(args.output_dir.split("/")[:-1]),"ipa_alignment", seg['file_id'].replace(".flac", ".txt")))] | |
| if len(segments) <= 1: | |
| return | |
| for i in range(len(segments)): | |
| # for segment i, find the non-overlapping neighboring segments | |
| write_fn = os.path.join(args.output_dir, f"{segments[i]['file_id'].replace('.flac', '.txt')}") | |
| neighbors = [] | |
| distance = [] | |
| for j in range(len(segments)): | |
| if segments[i]['vad'][1] < segments[j]['vad'][0] or segments[i]['vad'][0] > segments[j]['vad'][0]: | |
| neighbors.append(segments[j]) | |
| distance.append(min(abs(segments[i]['vad'][1] - segments[j]['vad'][0]), abs(segments[i]['vad'][0] - segments[j]['vad'][1]))) | |
| if len(neighbors) == 0: | |
| continue | |
| # order neighbors by distance | |
| index = np.argsort(distance) | |
| neighbors_distance = [[neighbors[ind], distance[ind]] for ind in index] | |
| os.makedirs(os.path.dirname(write_fn), exist_ok=True) | |
| with open(write_fn, "w") as f: | |
| # note that there might be no neighbors, in which case the file is empty | |
| for neighbor, dist in neighbors_distance: | |
| f.write(f"{neighbor['file_id'].replace('.flac', '.txt')}\t{dist}\t{neighbor['vad'][1] - neighbor['vad'][0]}\n") # file_id, distance, duration | |
| def parse_args(): | |
| parser = argparse.ArgumentParser(description="Cut a dataset in small " | |
| "sequences using VAD files") | |
| parser.add_argument('--split', type=str, default='train', choices=['train', 'valid', 'test'], help="train = libriheavy_cuts_{small,medium,large}.jsonl.gz, valid = libriheavy_cuts_dev_{clean,other}.jsonl.gz, test = libriheavy_cuts_test_{clean,other}.jsonl.gz") | |
| parser.add_argument('--audio_dir', type=str, default="/data/scratch/pyp/datasets/librilight_example", | |
| help="Path to the audio directory") | |
| parser.add_argument('--manifest_dir', type=str, default="/data/scratch/pyp/datasets/librilight/libriheavy", help="path to the transcription file's dir, can be downloaded https://huggingface.co/datasets/pkufool/libriheavy/tree/main/v0.1") | |
| parser.add_argument('--output_dir', type=str, default="/data/scratch/pyp/datasets/librilight/librilight_example_preprocessed/neighbors", | |
| help="Path to the output directory") | |
| parser.add_argument('--n_workers', type=int, default=16, | |
| help="Number of parallel worker processes") | |
| return parser.parse_args() | |
| if __name__ == "__main__": | |
| args = parse_args() | |
| pathlib.Path(args.output_dir).mkdir(exist_ok=True, parents=True) | |
| find_neighbor(args) |