Spaces:
Sleeping
Sleeping
text
Browse files
- text/__init__.py +23 -23
- text/__pycache__/__init__.cpython-310.pyc +0 -0
- text/__pycache__/cantonese.cpython-310.pyc +0 -0
- text/__pycache__/cleaners.cpython-310.pyc +0 -0
- text/__pycache__/english.cpython-310.pyc +0 -0
- text/__pycache__/japanese.cpython-310.pyc +0 -0
- text/__pycache__/korean.cpython-310.pyc +0 -0
- text/__pycache__/mandarin.cpython-310.pyc +0 -0
- text/__pycache__/ngu_dialect.cpython-310.pyc +0 -0
- text/__pycache__/shanghainese.cpython-310.pyc +0 -0
- text/cantonese.py +1 -19
- text/cleaners.py +14 -0
- text/mandarin.py +18 -3
- text/shanghainese.py +1 -19
text/__init__.py
CHANGED
|
@@ -2,31 +2,31 @@
|
|
| 2 |
from text import cleaners
|
| 3 |
|
| 4 |
|
| 5 |
-
def text_to_sequence(text, symbols, cleaner_names):
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
_symbol_to_id = {s: i for i, s in enumerate(symbols)}
|
| 14 |
|
| 15 |
-
|
| 16 |
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
|
|
|
| 24 |
|
| 25 |
|
| 26 |
def _clean_text(text, cleaner_names):
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
|
|
|
| 2 |
from text import cleaners
|
| 3 |
|
| 4 |
|
| 5 |
+
def text_to_sequence(text, symbols, cleaner_names, bert_embedding=False):
    '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.

    Args:
      text: string to convert to a sequence
      symbols: ordered collection of symbols; a symbol's ID is its position
        in this collection
      cleaner_names: names of the cleaner functions to run the text through
      bert_embedding: when true, the cleaners are expected to produce a
        (cleaned_text, char_embeds) pair and the cleaned text is split on
        whitespace into symbols; when false, the cleaned text is read one
        character at a time
    Returns:
      List of integers corresponding to the symbols in the text. When
      bert_embedding is true, a (sequence, char_embeds) tuple instead.
    Raises:
      KeyError: in bert_embedding mode, if a token is not in symbols.
    '''
    _symbol_to_id = {s: i for i, s in enumerate(symbols)}

    if bert_embedding:
        cleaned_text, char_embeds = _clean_text(text, cleaner_names)
        # Unlike the plain path below, unknown tokens are not skipped here:
        # the direct lookup raises KeyError.
        sequence = [_symbol_to_id[symbol] for symbol in cleaned_text.split()]
        return sequence, char_embeds

    cleaned_text = _clean_text(text, cleaner_names)
    # Characters that are not in the symbol table are silently dropped.
    sequence = [_symbol_to_id[symbol] for symbol in cleaned_text if symbol in _symbol_to_id]
    return sequence


def _clean_text(text, cleaner_names):
    '''Runs text through each named cleaner from the cleaners module, in order.

    Args:
      text: value handed to the first cleaner
      cleaner_names: attribute names to look up on the cleaners module
    Returns:
      Whatever the last cleaner returned (text itself if cleaner_names is empty)
    Raises:
      Exception: if a name does not resolve to a cleaner
    '''
    for name in cleaner_names:
        # Default to None: a bare getattr would raise AttributeError for an
        # unknown name and the explicit check below would never be reached.
        cleaner = getattr(cleaners, name, None)
        if not cleaner:
            raise Exception('Unknown cleaner: %s' % name)
        text = cleaner(text)
    return text
|
text/__pycache__/__init__.cpython-310.pyc
DELETED
|
Binary file (1.21 kB)
|
|
|
text/__pycache__/cantonese.cpython-310.pyc
DELETED
|
Binary file (2.34 kB)
|
|
|
text/__pycache__/cleaners.cpython-310.pyc
DELETED
|
Binary file (11 kB)
|
|
|
text/__pycache__/english.cpython-310.pyc
DELETED
|
Binary file (4.69 kB)
|
|
|
text/__pycache__/japanese.cpython-310.pyc
DELETED
|
Binary file (4.13 kB)
|
|
|
text/__pycache__/korean.cpython-310.pyc
DELETED
|
Binary file (5.58 kB)
|
|
|
text/__pycache__/mandarin.cpython-310.pyc
DELETED
|
Binary file (6.53 kB)
|
|
|
text/__pycache__/ngu_dialect.cpython-310.pyc
DELETED
|
Binary file (1.17 kB)
|
|
|
text/__pycache__/shanghainese.cpython-310.pyc
DELETED
|
Binary file (2.51 kB)
|
|
|
text/cantonese.py
CHANGED
|
@@ -35,25 +35,6 @@ _latin_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [
|
|
| 35 |
('Z', 'iː˨sɛːt̚˥')
|
| 36 |
]]
|
| 37 |
|
| 38 |
-
# Ordered (compiled pattern, replacement) rules that spell out arithmetic
# symbols in Chinese. Patterns are raw strings so that regex escapes such as
# \. \+ \* are not parsed as (invalid) Python string escapes.
_symbols_to_chinese = [(re.compile(pattern), replacement) for pattern, replacement in [
    (r'([0-9]+(?:\.?[0-9]+)?)%', r'百分之\1'),
    (r'([0-9]+)/([0-9]+)', r'\2分之\1'),
    (r'\+', r'加'),
    (r'([0-9]+)-([0-9]+)', r'\1减\2'),
    (r'×', r'乘以'),
    (r'([0-9]+)x([0-9]+)', r'\1乘以\2'),
    (r'([0-9]+)\*([0-9]+)', r'\1乘以\2'),
    (r'÷', r'除以'),
    (r'=', r'等于'),
    (r'≠', r'不等于'),
]]


def symbols_to_chinese(text):
    '''Apply every rule in _symbols_to_chinese to text, in table order.

    Args:
      text: string that may contain %, fractions and arithmetic symbols
    Returns:
      The string with each matched symbol replaced by its Chinese wording
    '''
    for regex, replacement in _symbols_to_chinese:
        text = regex.sub(replacement, text)
    return text
|
| 56 |
-
|
| 57 |
|
| 58 |
def number_to_cantonese(text):
    '''Replace each digit run in text (optionally with a decimal part) by cn2an.an2cn of it.'''
    def _spell_out(match):
        return cn2an.an2cn(match.group())

    return re.sub(r'\d+(?:\.?\d+)?', _spell_out, text)
|
|
@@ -66,6 +47,7 @@ def latin_to_ipa(text):
|
|
| 66 |
|
| 67 |
|
| 68 |
def cantonese_to_ipa(text):
|
|
|
|
| 69 |
text = symbols_to_chinese(text)
|
| 70 |
text = number_to_cantonese(text.upper())
|
| 71 |
text = converter.convert(text).replace('-', '').replace('$', ' ')
|
|
|
|
| 35 |
('Z', 'iː˨sɛːt̚˥')
|
| 36 |
]]
|
| 37 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
|
| 39 |
def number_to_cantonese(text):
    '''Replace each digit run in text (optionally with a decimal part) by cn2an.an2cn of it.'''
    def _spell_out(match):
        return cn2an.an2cn(match.group())

    return re.sub(r'\d+(?:\.?\d+)?', _spell_out, text)
|
|
|
|
| 47 |
|
| 48 |
|
| 49 |
def cantonese_to_ipa(text):
|
| 50 |
+
from text.mandarin import symbols_to_chinese
|
| 51 |
text = symbols_to_chinese(text)
|
| 52 |
text = number_to_cantonese(text.upper())
|
| 53 |
text = converter.convert(text).replace('-', '').replace('$', ' ')
|
text/cleaners.py
CHANGED
|
@@ -247,3 +247,17 @@ def chinese_dialect_cleaners(text):
|
|
| 247 |
text = re.sub(r'\s+$', '', text)
|
| 248 |
text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
|
| 249 |
return text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 247 |
text = re.sub(r'\s+$', '', text)
|
| 248 |
text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
|
| 249 |
return text
|
| 250 |
+
|
| 251 |
+
|
| 252 |
+
def bert_chinese_cleaners(text):
    '''Clean [ZH]-tagged text and convert it to phonemes plus character embeddings.

    Only the text enclosed in [ZH]...[ZH] pairs is kept (the pieces are
    concatenated); a trailing "." is appended when the result does not already
    end in one of the accepted punctuation marks. Symbols and numbers are then
    rewritten by the mandarin helpers before phoneme conversion.

    Args:
      text: input string containing [ZH]...[ZH] spans
    Returns:
      The (cleaned_text, char_embeds) pair produced by
      tts_front.chinese_to_phonemes
    '''
    from text import mandarin
    matches = re.findall(r"\[ZH\](.*?)\[ZH\]", text)
    text = "".join(matches)
    # endswith instead of text[-1]: input without any [ZH]...[ZH] span leaves
    # text empty, and indexing it would raise IndexError.
    # NOTE(review): the original list held "," twice; one entry may have been
    # a full-width comma before extraction — confirm against the repository.
    if not text.endswith((".", "。", ",")):
        text += "."
    text = mandarin.symbols_to_chinese(text)
    text = mandarin.number_transform_to_chinese(text)
    # The front end is built on first use and kept on the function object so
    # later calls reuse it.
    if not hasattr(bert_chinese_cleaners, "tts_front"):
        bert_chinese_cleaners.tts_front = mandarin.VITS_PinYin_model()
    tts_front = bert_chinese_cleaners.tts_front
    cleaned_text, char_embeds = tts_front.chinese_to_phonemes(text)
    return cleaned_text, char_embeds
|
text/mandarin.py
CHANGED
|
@@ -262,6 +262,11 @@ def number_to_chinese(text):
|
|
| 262 |
return text
|
| 263 |
|
| 264 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 265 |
def chinese_to_bopomofo(text):
|
| 266 |
text = text.replace('、', ',').replace(';', ',').replace(':', ',')
|
| 267 |
words = jieba.lcut(text, cut_all=False)
|
|
@@ -305,7 +310,7 @@ def bopomofo_to_ipa2(text):
|
|
| 305 |
|
| 306 |
def chinese_to_romaji(text):
|
| 307 |
text = symbols_to_chinese(text)
|
| 308 |
-
text =
|
| 309 |
text = chinese_to_bopomofo(text)
|
| 310 |
text = latin_to_bopomofo(text)
|
| 311 |
text = bopomofo_to_romaji(text)
|
|
@@ -326,7 +331,7 @@ def chinese_to_lazy_ipa(text):
|
|
| 326 |
|
| 327 |
def chinese_to_ipa(text):
|
| 328 |
text = symbols_to_chinese(text)
|
| 329 |
-
text =
|
| 330 |
text = chinese_to_bopomofo(text)
|
| 331 |
text = latin_to_bopomofo(text)
|
| 332 |
text = bopomofo_to_ipa(text)
|
|
@@ -340,7 +345,7 @@ def chinese_to_ipa(text):
|
|
| 340 |
|
| 341 |
def chinese_to_ipa2(text):
|
| 342 |
text = symbols_to_chinese(text)
|
| 343 |
-
text =
|
| 344 |
text = chinese_to_bopomofo(text)
|
| 345 |
text = latin_to_bopomofo(text)
|
| 346 |
text = bopomofo_to_ipa2(text)
|
|
@@ -349,3 +354,13 @@ def chinese_to_ipa2(text):
|
|
| 349 |
text = re.sub(r'([ʂɹ]ʰ?)([˩˨˧˦˥ ]+|$)', r'\1ʅ\2', text)
|
| 350 |
text = re.sub(r'(sʰ?)([˩˨˧˦˥ ]+|$)', r'\1ɿ\2', text)
|
| 351 |
return text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 262 |
return text
|
| 263 |
|
| 264 |
|
| 265 |
+
def number_transform_to_chinese(text):
    '''Return text after passing it through cn2an.transform in "an2cn" mode.'''
    return cn2an.transform(text, "an2cn")
|
| 268 |
+
|
| 269 |
+
|
| 270 |
def chinese_to_bopomofo(text):
|
| 271 |
text = text.replace('、', ',').replace(';', ',').replace(':', ',')
|
| 272 |
words = jieba.lcut(text, cut_all=False)
|
|
|
|
| 310 |
|
| 311 |
def chinese_to_romaji(text):
|
| 312 |
text = symbols_to_chinese(text)
|
| 313 |
+
text = number_transform_to_chinese(text)
|
| 314 |
text = chinese_to_bopomofo(text)
|
| 315 |
text = latin_to_bopomofo(text)
|
| 316 |
text = bopomofo_to_romaji(text)
|
|
|
|
| 331 |
|
| 332 |
def chinese_to_ipa(text):
|
| 333 |
text = symbols_to_chinese(text)
|
| 334 |
+
text = number_transform_to_chinese(text)
|
| 335 |
text = chinese_to_bopomofo(text)
|
| 336 |
text = latin_to_bopomofo(text)
|
| 337 |
text = bopomofo_to_ipa(text)
|
|
|
|
| 345 |
|
| 346 |
def chinese_to_ipa2(text):
|
| 347 |
text = symbols_to_chinese(text)
|
| 348 |
+
text = number_transform_to_chinese(text)
|
| 349 |
text = chinese_to_bopomofo(text)
|
| 350 |
text = latin_to_bopomofo(text)
|
| 351 |
text = bopomofo_to_ipa2(text)
|
|
|
|
| 354 |
text = re.sub(r'([ʂɹ]ʰ?)([˩˨˧˦˥ ]+|$)', r'\1ʅ\2', text)
|
| 355 |
text = re.sub(r'(sʰ?)([˩˨˧˦˥ ]+|$)', r'\1ɿ\2', text)
|
| 356 |
return text
|
| 357 |
+
|
| 358 |
+
|
| 359 |
+
def VITS_PinYin_model():
    '''Construct and return a VITS_PinYin front end.

    The instance is created with the path "<config.ABS_PATH>/bert" and a torch
    device: "cuda" when torch reports CUDA as available, otherwise "cpu".
    torch, config and vits_pinyin are imported inside the function, so they
    are only resolved when it is called.
    '''
    import torch
    import config
    from vits_pinyin import VITS_PinYin

    device_name = "cuda" if torch.cuda.is_available() else "cpu"
    device = torch.device(device_name)
    # pinyin
    return VITS_PinYin(f"{config.ABS_PATH}/bert", device)
|
text/shanghainese.py
CHANGED
|
@@ -35,25 +35,6 @@ _latin_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [
|
|
| 35 |
('Z', 'zᴇ')
|
| 36 |
]]
|
| 37 |
|
| 38 |
-
# Ordered (compiled pattern, replacement) rules that spell out arithmetic
# symbols in Chinese. Patterns are raw strings so that regex escapes such as
# \. \+ \* are not parsed as (invalid) Python string escapes.
_symbols_to_chinese = [(re.compile(pattern), replacement) for pattern, replacement in [
    (r'([0-9]+(?:\.?[0-9]+)?)%', r'百分之\1'),
    (r'([0-9]+)/([0-9]+)', r'\2分之\1'),
    (r'\+', r'加'),
    (r'([0-9]+)-([0-9]+)', r'\1减\2'),
    (r'×', r'乘以'),
    (r'([0-9]+)x([0-9]+)', r'\1乘以\2'),
    (r'([0-9]+)\*([0-9]+)', r'\1乘以\2'),
    (r'÷', r'除以'),
    (r'=', r'等于'),
    (r'≠', r'不等于'),
]]


def symbols_to_chinese(text):
    '''Apply every rule in _symbols_to_chinese to text, in table order.

    Args:
      text: string that may contain %, fractions and arithmetic symbols
    Returns:
      The string with each matched symbol replaced by its Chinese wording
    '''
    for regex, replacement in _symbols_to_chinese:
        text = regex.sub(replacement, text)
    return text
|
| 56 |
-
|
| 57 |
|
| 58 |
def _number_to_shanghainese(num):
|
| 59 |
num = cn2an.an2cn(num).replace('一十', '十').replace('二十', '廿').replace('二', '两')
|
|
@@ -71,6 +52,7 @@ def latin_to_ipa(text):
|
|
| 71 |
|
| 72 |
|
| 73 |
def shanghainese_to_ipa(text):
|
|
|
|
| 74 |
text = symbols_to_chinese(text)
|
| 75 |
text = number_to_shanghainese(text.upper())
|
| 76 |
text = converter.convert(text).replace('-', '').replace('$', ' ')
|
|
|
|
| 35 |
('Z', 'zᴇ')
|
| 36 |
]]
|
| 37 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
|
| 39 |
def _number_to_shanghainese(num):
|
| 40 |
num = cn2an.an2cn(num).replace('一十', '十').replace('二十', '廿').replace('二', '两')
|
|
|
|
| 52 |
|
| 53 |
|
| 54 |
def shanghainese_to_ipa(text):
|
| 55 |
+
from text.mandarin import symbols_to_chinese
|
| 56 |
text = symbols_to_chinese(text)
|
| 57 |
text = number_to_shanghainese(text.upper())
|
| 58 |
text = converter.convert(text).replace('-', '').replace('$', ' ')
|