Update Preprocessing/TextFrontend.py
Preprocessing/TextFrontend.py CHANGED
@@ -4,6 +4,7 @@
 import json
 import logging
 import re
+from pathlib import Path
 
 import torch
 from dragonmapper.transcriptions import pinyin_to_ipa
@@ -848,7 +849,7 @@ class ArticulatoryCombinedTextFrontend:
         # languages use different tones denoted by different numbering
         # systems. At this point in the script, it is attempted to unify
         # them all to the tones in the IPA standard.
-        if self.g2p_lang == "vi":
+        if self.g2p_lang == "vi" or self.g2p_lang == "vi-vn-x-central" or self.g2p_lang == "vi-vn-x-south":
             phones = phones.replace('1', "˧")
             phones = phones.replace('2', "˨˩")
             phones = phones.replace('ɜ', "˧˥")  # I'm fairly certain that this is a bug in espeak and ɜ is meant to be 3
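
Note (not part of the commit): the new condition can also be read as a membership test over the espeak Vietnamese variants. A minimal sketch of the same tone unification, table-driven; the helper name unify_vi_tones is invented here, and only the three replacements visible in this hunk are included (the real method likely maps more tones):

    VI_DIALECTS = ("vi", "vi-vn-x-central", "vi-vn-x-south")
    VI_TONE_MAP = {'1': '˧', '2': '˨˩', 'ɜ': '˧˥'}  # values copied from the hunk above

    def unify_vi_tones(phones, g2p_lang):
        # Membership test instead of three chained == comparisons.
        if g2p_lang in VI_DIALECTS:
            for source, ipa_tone in VI_TONE_MAP.items():
                phones = phones.replace(source, ipa_tone)
        return phones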
@@ -1052,11 +1053,49 @@ def english_text_expansion(text):
     _abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in
                       [('Mrs.', 'misess'), ('Mr.', 'mister'), ('Dr.', 'doctor'), ('St.', 'saint'), ('Co.', 'company'), ('Jr.', 'junior'), ('Maj.', 'major'),
                        ('Gen.', 'general'), ('Drs.', 'doctors'), ('Rev.', 'reverend'), ('Lt.', 'lieutenant'), ('Hon.', 'honorable'), ('Sgt.', 'sergeant'),
-                       ('Capt.', 'captain'), ('Esq.', 'esquire'), ('Ltd.', 'limited'), ('Col.', 'colonel'), ('Ft.', 'fort')]]
+                       ('Capt.', 'captain'), ('Esq.', 'esquire'), ('Ltd.', 'limited'), ('Col.', 'colonel'), ('Ft.', 'fort'), ('e.g.', ', for example, '), ('TTS', 'text to speech')]]
     for regex, replacement in _abbreviations:
         text = re.sub(regex, replacement, text)
     return text
 
+def chinese_number_conversion(text):
+    # https://gist.github.com/gumblex/0d65cad2ba607fd14de7?permalink_comment_id=4063512#gistcomment-4063512
+    import bisect
+    zhdigits = '零一二三四五六七八九'
+    zhplaces = {
+        0: '',
+        1: '十',
+        2: '百',
+        3: '千',
+        4: '万',
+        8: '亿',
+    }
+    zhplace_keys = sorted(zhplaces.keys())
+
+    def numdigits(n):
+        return len(str(abs(n)))
+
+    def _zhnum(n):
+        if n < 10:
+            return zhdigits[n]
+        named_place_len = zhplace_keys[bisect.bisect_right(zhplace_keys,
+                                                           numdigits(n) - 1) - 1]
+        left_part, right_part = n // 10 ** named_place_len, n % 10 ** named_place_len
+        return (_zhnum(left_part) +
+                zhplaces[named_place_len] +
+                ((zhdigits[0] if numdigits(right_part) != named_place_len else '') +
+                 _zhnum(right_part)
+                 if right_part else ''))
+
+    def zhnum(n):
+        answer = ('负' if n < 0 else '') + _zhnum(abs(n))
+        answer = re.sub(r'^一十', '十', answer)
+        answer = re.sub(r'(?<![零十])二(?=[千万亿])', r'两', answer)
+        return answer
+
+
+    return re.sub(r'\d+', lambda x: zhnum(int(x.group())), text)
+
 
 def remove_french_spacing(text):
     text = text.replace(" »", '"').replace("« ", '"')
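
Note (not part of the commit): a quick sanity check of the helper added above; expected outputs are hand-computed from the algorithm, and the import assumes the script is run from the repo root:

    from Preprocessing.TextFrontend import chinese_number_conversion

    print(chinese_number_conversion("19423"))     # 一万九千四百二十三
    print(chinese_number_conversion("10"))        # 十 (the ^一十 rule strips the leading 一)
    print(chinese_number_conversion("第2000名"))  # 第两千名 (二 becomes 两 before 千/万/亿)

Since \d+ never matches a sign, the 负 branch in zhnum is effectively unreachable from this entry point; negative numbers keep their "-" prefix.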
@@ -1066,6 +1105,7 @@ def remove_french_spacing(text):
 
 
 def convert_kanji_to_pinyin_mandarin(text):
+    text = chinese_number_conversion(text)
     return " ".join([x[0] for x in pinyin(text)])
 
 
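
Note (not part of the commit): the ordering matters here, because pypinyin passes Arabic digits through untranscribed, so numbers must be rewritten to hanzi before romanization. A hedged illustration; the exact syllables depend on pypinyin's default tone-marked style:

    from pypinyin import pinyin  # the same call used above

    from Preprocessing.TextFrontend import chinese_number_conversion

    text = chinese_number_conversion("共19423人")   # -> "共一万九千四百二十三人"
    print(" ".join([x[0] for x in pinyin(text)]))   # e.g. "gòng yī wàn jiǔ qiān sì bǎi èr shí sān rén"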
@@ -1074,7 +1114,7 @@ def get_language_id(language):
         iso_codes_to_ids = load_json_from_path("Preprocessing/multilinguality/iso_lookup.json")[-1]
     except FileNotFoundError:
         try:
-            iso_codes_to_ids = load_json_from_path("multilinguality/iso_lookup.json")[-1]
+            iso_codes_to_ids = load_json_from_path(str(Path(__file__).parent / "multilinguality/iso_lookup.json"))[-1]
         except FileNotFoundError:
             iso_codes_to_ids = load_json_from_path("iso_lookup.json")[-1]
     if language not in iso_codes_to_ids:
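
Note (not part of the commit): anchoring the second fallback to Path(__file__).parent makes it resolve relative to the module instead of the process working directory. A small illustrative script, assuming it lives next to a multilinguality/ folder:

    import os
    import tempfile
    from pathlib import Path

    os.chdir(tempfile.gettempdir())  # simulate launching from somewhere else

    # Old fallback: resolved against the current working directory, so it breaks here.
    print(Path("multilinguality/iso_lookup.json").resolve())

    # New fallback: anchored to the directory containing this file, so it still resolves.
    print((Path(__file__).parent / "multilinguality/iso_lookup.json").resolve())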
@@ -1090,7 +1130,7 @@ if __name__ == '__main__':
 
     print("\n\nChinese Test")
     tf = ArticulatoryCombinedTextFrontend(language="cmn")
-    tf.string_to_tensor("这是一个复杂的句子,它甚至包含一个停顿。", view=True)
+    tf.string_to_tensor("这是一个复杂的句子,19423 它甚至包含一个停顿。", view=True)
     tf.string_to_tensor("李绅 《悯农》 锄禾日当午, 汗滴禾下土。 谁知盘中餐, 粒粒皆辛苦。", view=True)
     tf.string_to_tensor("巴 拔 把 爸 吧", view=True)