Spaces: Running on Zero
# We have raw transcripts at /data/scratch/pyp/datasets/librilight/preprocessed/audio,
# word and ARPA alignments at /data/scratch/pyp/datasets/librilight/preprocessed/alignment,
# and a manifest at /data/scratch/pyp/datasets/librilight/preprocessed/manifest_mimi,
# where each row looks like:
#   large/10022/essayoncriticism_1505_librivox_64kb_mp3/essayoncriticism_01_pope_64kb_5_610.32_630.08.flac 19.76
# We want to create IPA alignments from the raw transcripts and word alignments using
# phonemizer, saved at /data/scratch/pyp/datasets/librilight/preprocessed/ipa_alignment.
# Since phonemized IPA results are not 1-to-1 with words (10 words might yield an IPA
# sequence of 7 phonemes), we have to run the phonemizer on each prefix of the word sequence.
| import os, string, csv, random, tqdm, glob | |
| from tokenizer import TextTokenizer, tokenize_text | |
def remove_punctuation(input_string):
    """Return *input_string* with every ASCII punctuation character removed."""
    # A deletion-only translation table strips all punctuation in a single C-level pass.
    return input_string.translate(str.maketrans('', '', string.punctuation))
def create_alignment(fn, trans_dir, align_dir, audio_ext, trans_ext, arpa_ext, text_tokenizer, use_prob, ipa_alignment_fn, save=False, prompt_dur=30):
    """Build IPA alignments for one utterance from its raw transcript and word alignment.

    For each aligned word boundary ending before ``prompt_dur`` seconds, with
    probability ``use_prob`` the transcript prefix up to (and including) that word
    is phonemized and recorded as ``[0, end, phonemes]`` — the start is always 0
    because every prefix begins at the start of the sentence.

    Parameters:
        fn: utterance path relative to ``trans_dir``/``align_dir``, ending in ``audio_ext``.
        trans_dir / align_dir: roots holding raw transcripts / alignment CSVs.
        audio_ext, trans_ext, arpa_ext: extensions used to derive sibling file paths.
        text_tokenizer: tokenizer handed through to ``tokenize_text``.
        use_prob: probability of emitting an entry at each word boundary.
        ipa_alignment_fn: output path (tab-separated ``start  end  phonemes`` lines).
        save: when True, additionally write a non-empty result to ``ipa_alignment_fn``.
        prompt_dur: word boundaries past this many seconds are not considered.

    Returns:
        (ipa_alignment, flag): ``flag`` is True when the transcript or alignment file
        is missing or the alignment words diverge from the transcript words,
        False otherwise.
    """
    os.makedirs(os.path.dirname(ipa_alignment_fn), exist_ok=True)
    trans_fn = os.path.join(trans_dir, fn.replace(audio_ext, trans_ext))
    if not os.path.isfile(trans_fn):
        return [], True
    align_fn = os.path.join(align_dir, fn.replace(audio_ext, arpa_ext))
    if not os.path.isfile(align_fn):
        return [], True
    # Raw transcript: one whitespace-separated word per alignment row is expected.
    with open(trans_fn, 'r') as f:
        transcript = f.read().strip()
    raw_word_list = transcript.split(" ")
    # Keep only word-tier rows (column 3 == 'words'); the len() guard skips
    # empty/short rows that would otherwise raise IndexError.
    with open(align_fn, 'r') as f:
        word_alignment = [row for row in csv.reader(f) if len(row) > 3 and row[3] == 'words']
    ipa_alignment = []
    for j, (item, raw_word) in enumerate(zip(word_alignment, raw_word_list)):
        start, end, word = float(item[0]), float(item[1]), item[2]
        if end > prompt_dur:
            break
        # The aligner's word and the transcript's word must agree (ignoring case and
        # punctuation); otherwise the two sources are out of sync — flag the utterance.
        if remove_punctuation(word).lower() != remove_punctuation(raw_word).lower():
            return ipa_alignment, True
        if random.random() < use_prob:
            cur_words = " ".join(raw_word_list[:j + 1])
            phn = tokenize_text(text_tokenizer, cur_words)
            if len(phn) == 0:
                continue
            # Start is always 0: the phonemized span covers the sentence prefix.
            ipa_alignment.append([0, end, " ".join(phn)])
    # BUGFIX: the original only returned a value when save=False (the write branch
    # fell off the end and returned None). Now saving is a side effect and the
    # result tuple is returned on every path.
    if save and ipa_alignment:
        with open(ipa_alignment_fn, 'w') as f:
            for start, end, phn in ipa_alignment:
                f.write(f"{start}\t{end}\t{phn}\n")
    return ipa_alignment, False
def main(
    data_root: str = '/data/scratch/pyp/datasets/librilight/preprocessed',
    audio_ext: str = '.flac',
    arpa_ext: str = '.csv',
    trans_ext: str = '.txt',
    split: str = 'valid',
    use_prob: float = 0.5,
    max_dur: float = 30., # do not consider utterance longer than this
    prompt_dur: float = 30., # do not consider prompt longer than this
):
    """Phonemize word-aligned transcripts for one split into IPA alignment files."""
    text_tokenizer = TextTokenizer()
    trans_dir = f'{data_root}/audio'
    align_dir = f'{data_root}/alignment'
    target_dir = f'{data_root}/ipa_alignment'
    os.makedirs(target_dir, exist_ok=True)
    encodec_sr = 50  # manifest durations are encodec frame counts; frames / 50 = seconds
    # Collect every utterance shorter than max_dur from all manifests of this split.
    manifest = []
    for manifest_fn in glob.glob(f"{data_root}/manifest_final_encodec/{split}*=*.txt"):
        with open(manifest_fn, 'r') as f:
            rows = [line.strip().split("\t") for line in f]
        manifest += [row[0] + audio_ext for row in rows if float(row[1]) / encodec_sr < max_dur]
    # Sequential processing; counters are kept for ad-hoc inspection while debugging.
    n_flags = 0    # utterances whose transcript/alignment was missing or mismatched
    zero_words = 0  # utterances that produced no IPA entries
    for fn in tqdm.tqdm(manifest):
        ipa_alignment_fn = os.path.join(target_dir, fn.replace(audio_ext, '.txt'))
        ipa_alignment, flag = create_alignment(
            fn, trans_dir, align_dir, audio_ext, trans_ext, arpa_ext,
            text_tokenizer, use_prob, ipa_alignment_fn, prompt_dur=prompt_dur,
        )
        n_flags += flag
        if ipa_alignment:
            with open(ipa_alignment_fn, 'w') as f:
                for seg_start, seg_end, seg_phn in ipa_alignment:
                    f.write(f"{seg_start}\t{seg_end}\t{seg_phn}\n")
        else:
            zero_words += 1

if __name__ == "__main__":
    import fire
    fire.Fire(main)