Spaces:
Running
on
Zero
Running
on
Zero
# Collect the split / speaker / book prefix of every cut listed in
# libriheavy_cuts_dev.jsonl, libriheavy_cuts_test_clean.jsonl and
# libriheavy_cuts_test_other.jsonl. That prefix is taken from the first three
# "/"-separated components of each cut's "id" field.
| import sys | |
| import os, random, numpy as np, socket | |
| import json | |
| import tqdm | |
def write_jsonl(data, fn):
    """Write entries to ``fn`` in JSON Lines format (one JSON document per line).

    Any existing file at ``fn`` is overwritten.

    Args:
        data: Iterable of JSON-serializable objects.
        fn: Path of the output file.
    """
    # ensure_ascii=False emits non-ASCII characters verbatim, so the file
    # encoding is pinned to UTF-8 instead of relying on the platform default
    # (which may be unable to encode them). This also matches read_jsonl,
    # which decodes files as UTF-8.
    with open(fn, "w", encoding="utf-8") as file:
        for entry in data:
            file.write(json.dumps(entry, ensure_ascii=False) + "\n")
def read_jsonl(file_path):
    """Read a JSON Lines file and return its parsed documents as a list.

    The file is decoded as UTF-8; a leading byte-order mark, if present, is
    dropped by the ``utf-8-sig`` codec. Lines that are empty or contain only
    whitespace are skipped instead of being handed to the JSON parser.

    Args:
        file_path: Path of the JSON Lines file to read.

    Returns:
        A list with one parsed object per non-empty line, in file order.

    Raises:
        json.JSONDecodeError: If a non-empty line is not valid JSON.
    """
    cur_data = []
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        for line in file:
            line = line.strip()
            if not line:
                # A blank line (commonly a trailing one) carries no record.
                continue
            cur_data.append(json.loads(line))
    return cur_data
| import os | |
# Dataset root comes from the environment; a missing DATAROOT raises KeyError.
dataroot = os.environ["DATAROOT"]
manifestroot = os.path.join(dataroot, "libriheavy")

# NOTE: tgt_names and id2split are not referenced anywhere below; they are
# kept so the module-level names stay available.
tgt_names = ['libriheavy_cuts_dev.jsonl', 'libriheavy_cuts_test_clean.jsonl', 'libriheavy_cuts_test_other.jsonl']
orig_names = ['libriheavy_long_original_cuts_small.jsonl', 'libriheavy_long_original_cuts_medium.jsonl', 'libriheavy_long_original_cuts_large.jsonl']
id2split = {}


def _id_prefix(cut):
    """Return the first three "/"-separated components of the cut's "id"."""
    return "/".join(cut['id'].split("/")[:3])


def _prefixes_in(manifest_name):
    """Return the set of id prefixes found in a manifest under manifestroot."""
    cuts = read_jsonl(os.path.join(manifestroot, manifest_name))
    return {_id_prefix(cut) for cut in cuts}


# Id prefixes that occur in each held-out manifest.
dev_ids = _prefixes_in("libriheavy_cuts_dev.jsonl")
test_clean_ids = _prefixes_in("libriheavy_cuts_test_clean.jsonl")
test_other_ids = _prefixes_in("libriheavy_cuts_test_other.jsonl")

# Long-form cuts whose prefix matches a held-out manifest, accumulated across
# all of the original manifests.
long_dev, long_test_clean, long_test_other = [], [], []

# Consulted in this order; the first set containing the prefix claims the cut.
_routes = (
    (dev_ids, long_dev),
    (test_clean_ids, long_test_clean),
    (test_other_ids, long_test_other),
)

for orig_name in orig_names:
    keep = []  # cuts of this manifest that match none of the held-out sets
    data = read_jsonl(os.path.join(manifestroot, orig_name))
    for cut in tqdm.tqdm(data):
        prefix = _id_prefix(cut)
        bucket = next((dest for ids, dest in _routes if prefix in ids), keep)
        bucket.append(cut)
    # The filtered manifest drops "_original" from the source file name.
    write_jsonl(keep, os.path.join(manifestroot, orig_name.replace("_original", "")))

for suffix, cuts in (
    ("dev", long_dev),
    ("test_clean", long_test_clean),
    ("test_other", long_test_other),
):
    write_jsonl(cuts, os.path.join(manifestroot, "libriheavy_long_cuts_" + suffix + ".jsonl"))