Spaces:
Runtime error
Runtime error
| import re | |
| from num2words import num2words | |
# Regex character class for one character that may directly follow a token:
# whitespace, comma, period, question mark, exclamation mark, slash,
# closing parenthesis, apostrophe, closing square bracket or '>'.
punctuation = r'[\s,.?!/)\'\]>]'

# Spoken form of every uppercase letter. Each value is padded with a space on
# both sides so spelled-out letters never run into their neighbours; the extra
# whitespace is compacted again at the end of preprocessing.
alphabet_map = {
    "A": " Ei ", "B": " Bee ", "C": " See ", "D": " Dee ",
    "E": " Eee ", "F": " Eff ", "G": " Jee ", "H": " Eich ",
    "I": " Eye ", "J": " Jay ", "K": " Kay ", "L": " El ",
    "M": " Emm ", "N": " Enn ", "O": " Ohh ", "P": " Pee ",
    "Q": " Queue ", "R": " Are ", "S": " Ess ", "T": " Tee ",
    "U": " You ", "V": " Vee ", "W": " Double You ", "X": " Ex ",
    "Y": " Why ",
    # "Zed" rather than "Zee": the original author (da3dsoul) is American, but
    # most of the voice models sound British, so this matches them better.
    "Z": " Zed ",
}
def preprocess(string):
    """Rewrite raw text into a form that is easier to speak aloud.

    Markup and quotes are stripped, numbers are expanded to words and short
    abbreviations are spelled out letter by letter. Returns the cleaned
    string with whitespace compacted.
    """
    # Drop asterisk-delimited spans (or keep only image alt text) before
    # anything else looks at the content.
    text = remove_surrounded_chars(string)

    # Remove the straight double quote, the right/left curly quotes and the
    # italic-looking reversed quote, then flatten newlines into spaces.
    for quote in ('"', '\u201D', '\u201C', '\u201F'):
        text = text.replace(quote, '')
    text = text.replace('\n', ' ')

    # The order of these steps matters: for example, thousands separators
    # must be removed from numbers before the numbers are expanded to words.
    #
    # TODO Try to use a ML predictor to expand abbreviations. It is hard,
    # context dependent, and whether to say or spell an abbreviation is not
    # agreed upon. For now abbreviations are expanded to pronunciations; the
    # abbreviation steps add a lot of extra whitespace to ensure separation.
    steps = (
        convert_num_locale,
        replace_negative,
        replace_roman,
        hyphen_range_to,
        num_to_words,
        replace_abbreviations,
        replace_lowercase_abbreviations,
    )
    for step in steps:
        text = step(text)

    # Whitespace cleanup: drop whitespace sitting in front of punctuation,
    # trim both ends, then collapse every remaining run to a single space.
    text = re.sub(rf'\s+({punctuation})', r'\1', text).strip()
    return ' '.join(text.split())
def remove_surrounded_chars(string):
    """Strip asterisk-delimited spans, reducing image markup to its alt text.

    If the input contains text sitting between an ``alt=`` and a ``style=``
    marker (which would correspond to the alt text of an embedded image),
    only that text is kept for further processing. Afterwards every span
    running from an asterisk to the next asterisk, or from an asterisk to the
    end of the string, is removed.
    """
    alt_text = re.search(r'(?<=alt=)(.*)(?=style=)', string, re.DOTALL)
    if alt_text is not None:
        string = alt_text.group(0)

    # As few characters as possible between two asterisks, or between an
    # asterisk and the end of the string.
    return re.sub(r'\*[^*]*?(\*|$)', '', string)
def convert_num_locale(text):
    """Normalise number formatting to plain American style without separators.

    Numbers written with dots as thousands separators and a decimal comma
    (``1.234,56``) are rewritten with a decimal point and no separators
    (``1234.56``). Afterwards any comma sitting between two digits is removed,
    which strips thousands separators from American-style numbers.
    """
    european = re.compile(r'(?:\s|^)\d{1,3}(?:\.\d{3})+(,\d+)(?:\s|$)')
    converted = text

    # Search again from the start after every replacement: a match consumes
    # its surrounding whitespace, so a single substitution pass would miss a
    # number that shares a space with the previous one.
    hit = european.search(converted)
    while hit is not None:
        before = converted[:hit.start()]
        after = converted[hit.end():]
        number = hit.group(0).replace('.', '').replace(',', '.')
        converted = before + number + after
        hit = european.search(converted)

    # Remove comma separators from existing American numbers.
    return re.sub(r'(\d),(\d)', r'\1\2', converted)
def replace_negative(string):
    """Replace a leading minus sign on a number with the word "negative".

    ``-5`` becomes ``negative 5``, which a later step expands to
    "negative five". The minus sign must follow whitespace or start the
    string, and the digits must be followed by a punctuation/whitespace
    character or end the string, so hyphenated ranges such as ``3-5`` are
    left alone.

    Returns the rewritten string.
    """
    # Anchoring on ^ and $ as well as on surrounding characters makes a
    # number at the very start or end of the input convert too; requiring a
    # neighbouring character on both sides used to skip those.
    return re.sub(rf'(^|\s)-(\d+)({punctuation}|$)', r'\1negative \2\3', string)
def replace_roman(string):
    """Replace Roman numerals in the text with their decimal value.

    A candidate is a run of two or more of the letters ``IVXLCDM`` that is
    preceded by whitespace and followed by a punctuation/whitespace
    character. Single letters are skipped on purpose, to avoid capturing
    "I" and single-character abbreviations such as initials in names.

    Only well-formed numerals are converted. Letter runs that merely happen
    to use Roman-numeral letters (for example "DVD" or "LCD") are left
    untouched so that they can be spelled out as abbreviations later instead
    of being read as a meaningless number.

    Returns the rewritten string.
    """
    # Standard subtractive notation, also tolerating the additive forms with
    # four repeated symbols (e.g. IIII) and any number of leading M's.
    well_formed = re.compile(r'M*(CM|CD|D?C{0,4})(XC|XL|L?X{0,4})(IX|IV|V?I{0,4})')

    # The trailing character is only looked at, not consumed, so it can
    # still serve as the leading whitespace of the next numeral.
    candidate = re.compile(rf'(\s)([IVXLCDM]{{2,}})(?={punctuation})')

    def to_digits(match):
        numeral = match.group(2)
        if well_formed.fullmatch(numeral) is None:
            return match.group(0)
        return match.group(1) + str(roman_to_int(numeral))

    return candidate.sub(to_digits, string)
def roman_to_int(s):
    """Convert a string of uppercase Roman numeral letters to an integer.

    A symbol that is immediately followed by a larger one is subtracted;
    every other symbol is added. The input is not validated: any sequence of
    the letters IVXLCDM yields a number, the empty string yields 0, and any
    other character raises ``KeyError``.
    """
    rom_val = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}
    values = [rom_val[symbol] for symbol in s]

    total = 0
    # Pair every value with its successor; the trailing 0 makes the last
    # symbol always count as an addition.
    for current, following in zip(values, values[1:] + [0]):
        if following > current:
            total -= current
        else:
            total += current
    return total
def hyphen_range_to(text):
    """Rewrite numeric ranges such as ``3-5`` as ``3 to 5``.

    Both the ASCII hyphen and the en dash are recognised as the range
    separator between two runs of digits.
    """
    return re.sub(r'(\d+)[-–](\d+)', r'\1 to \2', text)
def num_to_words(text):
    """Spell out every number in the text as words.

    Matches decimals such as ``10.23`` as well as plain digit runs such as
    ``1000``; each match is parsed as a float and handed to ``num2words``.
    """
    def spell_out(match):
        return num2words(float(match.group()))

    # The decimal alternative comes first so "10.23" is read as one number.
    return re.sub(r'\d+\.\d+|\d+', spell_out, text)
def replace_abbreviations(string):
    """Spell out uppercase abbreviations of one to four letters.

    An abbreviation must start the string or follow whitespace or one of
    ``( . ' [ <``, and must be followed by a punctuation/whitespace character
    or end the string. Single letters such as "A" and "I" are matched too,
    but those are pronounced like their letter anyway.
    """
    abbreviation = re.compile(rf'(^|[\s(.\'\[<])([A-Z]{{1,4}})({punctuation}|$)')
    text = string

    # Search again from the start after each replacement: a match includes
    # its neighbouring characters, which the next abbreviation may need as
    # its own leading delimiter.
    found = abbreviation.search(text)
    while found is not None:
        spelled = replace_abbreviation(found.group(0))
        text = text[:found.start()] + spelled + text[found.end():]
        found = abbreviation.search(text)
    return text
def replace_lowercase_abbreviations(string):
    """Spell out dotted lowercase abbreviations such as ``i.e.`` or ``e.g.``.

    Matches one to four lowercase letters that are each followed by a dot.
    The run must start the string or follow whitespace or one of
    ``( . ' [ <``, and must be followed by a punctuation/whitespace character
    or end the string. The matched text is uppercased before its letters are
    replaced by their spoken form.
    """
    dotted = re.compile(rf'(^|[\s(.\'\[<])(([a-z]\.){{1,4}})({punctuation}|$)')
    text = string

    # Search again from the start after each replacement so that delimiters
    # consumed by one match remain available to the next one.
    found = dotted.search(text)
    while found is not None:
        spelled = replace_abbreviation(found.group(0).upper())
        text = text[:found.start()] + spelled + text[found.end():]
        found = dotted.search(text)
    return text
def replace_abbreviation(string):
    """Return the string with every character passed through ``match_mapping``.

    The per-character results are concatenated in their original order.
    """
    return "".join(match_mapping(char) for char in string)
def match_mapping(char):
    """Return the spoken form of a character.

    Characters that are keys of ``alphabet_map`` are replaced by their mapped
    value; anything else is returned unchanged.
    """
    return alphabet_map.get(char, char)
def __main__(args):
    """Preprocess the text in ``args[1]`` and print the result.

    ``args`` is an argv-style sequence: index 0 is skipped and index 1 holds
    the text to process.
    """
    text = args[1]
    cleaned = preprocess(text)
    print(cleaned)
# Script entry point: when run directly, hand the command-line arguments to
# __main__; nothing happens on import.
if __name__ == "__main__":
    import sys

    __main__(sys.argv)