Spaces:
Running
on
Zero
Running
on
Zero
| import logging | |
| from pathlib import Path | |
| import re | |
| from typing import Iterable | |
| from typing import List | |
| from typing import Optional | |
| from typing import Union | |
| import warnings | |
| # import g2p_en | |
| import jamo | |
| from funasr_detach.tokenizer.abs_tokenizer import AbsTokenizer | |
| g2p_classes = [ | |
| None, | |
| "g2p_en", | |
| "g2p_en_no_space", | |
| "pyopenjtalk", | |
| "pyopenjtalk_kana", | |
| "pyopenjtalk_accent", | |
| "pyopenjtalk_accent_with_pause", | |
| "pyopenjtalk_prosody", | |
| "pypinyin_g2p", | |
| "pypinyin_g2p_phone", | |
| "espeak_ng_arabic", | |
| "espeak_ng_german", | |
| "espeak_ng_french", | |
| "espeak_ng_spanish", | |
| "espeak_ng_russian", | |
| "espeak_ng_greek", | |
| "espeak_ng_finnish", | |
| "espeak_ng_hungarian", | |
| "espeak_ng_dutch", | |
| "espeak_ng_english_us_vits", | |
| "espeak_ng_hindi", | |
| "g2pk", | |
| "g2pk_no_space", | |
| "korean_jaso", | |
| "korean_jaso_no_space", | |
| ] | |
| def split_by_space(text) -> List[str]: | |
| if " " in text: | |
| text = text.replace(" ", " <space> ") | |
| return [c.replace("<space>", " ") for c in text.split(" ")] | |
| else: | |
| return text.split(" ") | |
| def pyopenjtalk_g2p(text) -> List[str]: | |
| import pyopenjtalk | |
| # phones is a str object separated by space | |
| phones = pyopenjtalk.g2p(text, kana=False) | |
| phones = phones.split(" ") | |
| return phones | |
| def pyopenjtalk_g2p_accent(text) -> List[str]: | |
| import pyopenjtalk | |
| import re | |
| phones = [] | |
| for labels in pyopenjtalk.run_frontend(text)[1]: | |
| p = re.findall(r"\-(.*?)\+.*?\/A:([0-9\-]+).*?\/F:.*?_([0-9]+)", labels) | |
| if len(p) == 1: | |
| phones += [p[0][0], p[0][2], p[0][1]] | |
| return phones | |
| def pyopenjtalk_g2p_accent_with_pause(text) -> List[str]: | |
| import pyopenjtalk | |
| import re | |
| phones = [] | |
| for labels in pyopenjtalk.run_frontend(text)[1]: | |
| if labels.split("-")[1].split("+")[0] == "pau": | |
| phones += ["pau"] | |
| continue | |
| p = re.findall(r"\-(.*?)\+.*?\/A:([0-9\-]+).*?\/F:.*?_([0-9]+)", labels) | |
| if len(p) == 1: | |
| phones += [p[0][0], p[0][2], p[0][1]] | |
| return phones | |
| def pyopenjtalk_g2p_kana(text) -> List[str]: | |
| import pyopenjtalk | |
| kanas = pyopenjtalk.g2p(text, kana=True) | |
| return list(kanas) | |
| def pyopenjtalk_g2p_prosody(text: str, drop_unvoiced_vowels: bool = True) -> List[str]: | |
| """Extract phoneme + prosoody symbol sequence from input full-context labels. | |
| The algorithm is based on `Prosodic features control by symbols as input of | |
| sequence-to-sequence acoustic modeling for neural TTS`_ with some r9y9's tweaks. | |
| Args: | |
| text (str): Input text. | |
| drop_unvoiced_vowels (bool): whether to drop unvoiced vowels. | |
| Returns: | |
| List[str]: List of phoneme + prosody symbols. | |
| Examples: | |
| >>> from funasr_detach.tokenizer.phoneme_tokenizer import pyopenjtalk_g2p_prosody | |
| >>> pyopenjtalk_g2p_prosody("こんにちは。") | |
| ['^', 'k', 'o', '[', 'N', 'n', 'i', 'ch', 'i', 'w', 'a', '$'] | |
| .. _`Prosodic features control by symbols as input of sequence-to-sequence acoustic | |
| modeling for neural TTS`: https://doi.org/10.1587/transinf.2020EDP7104 | |
| """ | |
| import pyopenjtalk | |
| labels = pyopenjtalk.run_frontend(text)[1] | |
| N = len(labels) | |
| phones = [] | |
| for n in range(N): | |
| lab_curr = labels[n] | |
| # current phoneme | |
| p3 = re.search(r"\-(.*?)\+", lab_curr).group(1) | |
| # deal unvoiced vowels as normal vowels | |
| if drop_unvoiced_vowels and p3 in "AEIOU": | |
| p3 = p3.lower() | |
| # deal with sil at the beginning and the end of text | |
| if p3 == "sil": | |
| assert n == 0 or n == N - 1 | |
| if n == 0: | |
| phones.append("^") | |
| elif n == N - 1: | |
| # check question form or not | |
| e3 = _numeric_feature_by_regex(r"!(\d+)_", lab_curr) | |
| if e3 == 0: | |
| phones.append("$") | |
| elif e3 == 1: | |
| phones.append("?") | |
| continue | |
| elif p3 == "pau": | |
| phones.append("_") | |
| continue | |
| else: | |
| phones.append(p3) | |
| # accent type and position info (forward or backward) | |
| a1 = _numeric_feature_by_regex(r"/A:([0-9\-]+)\+", lab_curr) | |
| a2 = _numeric_feature_by_regex(r"\+(\d+)\+", lab_curr) | |
| a3 = _numeric_feature_by_regex(r"\+(\d+)/", lab_curr) | |
| # number of mora in accent phrase | |
| f1 = _numeric_feature_by_regex(r"/F:(\d+)_", lab_curr) | |
| a2_next = _numeric_feature_by_regex(r"\+(\d+)\+", labels[n + 1]) | |
| # accent phrase border | |
| if a3 == 1 and a2_next == 1 and p3 in "aeiouAEIOUNcl": | |
| phones.append("#") | |
| # pitch falling | |
| elif a1 == 0 and a2_next == a2 + 1 and a2 != f1: | |
| phones.append("]") | |
| # pitch rising | |
| elif a2 == 1 and a2_next == 2: | |
| phones.append("[") | |
| return phones | |
| def _numeric_feature_by_regex(regex, s): | |
| match = re.search(regex, s) | |
| if match is None: | |
| return -50 | |
| return int(match.group(1)) | |
| def pypinyin_g2p(text) -> List[str]: | |
| from pypinyin import pinyin | |
| from pypinyin import Style | |
| phones = [phone[0] for phone in pinyin(text, style=Style.TONE3)] | |
| return phones | |
| def pypinyin_g2p_phone(text) -> List[str]: | |
| from pypinyin import pinyin | |
| from pypinyin import Style | |
| from pypinyin.style._utils import get_finals | |
| from pypinyin.style._utils import get_initials | |
| phones = [ | |
| p | |
| for phone in pinyin(text, style=Style.TONE3) | |
| for p in [ | |
| get_initials(phone[0], strict=True), | |
| get_finals(phone[0], strict=True), | |
| ] | |
| if len(p) != 0 | |
| ] | |
| return phones | |
| class G2p_en: | |
| """On behalf of g2p_en.G2p. | |
| g2p_en.G2p isn't pickalable and it can't be copied to the other processes | |
| via multiprocessing module. | |
| As a workaround, g2p_en.G2p is instantiated upon calling this class. | |
| """ | |
| def __init__(self, no_space: bool = False): | |
| self.no_space = no_space | |
| self.g2p = None | |
| def __call__(self, text) -> List[str]: | |
| if self.g2p is None: | |
| self.g2p = g2p_en.G2p() | |
| phones = self.g2p(text) | |
| if self.no_space: | |
| # remove space which represents word serapater | |
| phones = list(filter(lambda s: s != " ", phones)) | |
| return phones | |
| class G2pk: | |
| """On behalf of g2pk.G2p. | |
| g2pk.G2p isn't pickalable and it can't be copied to the other processes | |
| via multiprocessing module. | |
| As a workaround, g2pk.G2p is instantiated upon calling this class. | |
| """ | |
| def __init__( | |
| self, descritive=False, group_vowels=False, to_syl=False, no_space=False | |
| ): | |
| self.descritive = descritive | |
| self.group_vowels = group_vowels | |
| self.to_syl = to_syl | |
| self.no_space = no_space | |
| self.g2p = None | |
| def __call__(self, text) -> List[str]: | |
| if self.g2p is None: | |
| import g2pk | |
| self.g2p = g2pk.G2p() | |
| phones = list( | |
| self.g2p( | |
| text, | |
| descriptive=self.descritive, | |
| group_vowels=self.group_vowels, | |
| to_syl=self.to_syl, | |
| ) | |
| ) | |
| if self.no_space: | |
| # remove space which represents word serapater | |
| phones = list(filter(lambda s: s != " ", phones)) | |
| return phones | |
| class Jaso: | |
| PUNC = "!'(),-.:;?" | |
| SPACE = " " | |
| JAMO_LEADS = "".join([chr(_) for _ in range(0x1100, 0x1113)]) | |
| JAMO_VOWELS = "".join([chr(_) for _ in range(0x1161, 0x1176)]) | |
| JAMO_TAILS = "".join([chr(_) for _ in range(0x11A8, 0x11C3)]) | |
| VALID_CHARS = JAMO_LEADS + JAMO_VOWELS + JAMO_TAILS + PUNC + SPACE | |
| def __init__(self, space_symbol=" ", no_space=False): | |
| self.space_symbol = space_symbol | |
| self.no_space = no_space | |
| def _text_to_jaso(self, line: str) -> List[str]: | |
| jasos = list(jamo.hangul_to_jamo(line)) | |
| return jasos | |
| def _remove_non_korean_characters(self, tokens): | |
| new_tokens = [token for token in tokens if token in self.VALID_CHARS] | |
| return new_tokens | |
| def __call__(self, text) -> List[str]: | |
| graphemes = [x for x in self._text_to_jaso(text)] | |
| graphemes = self._remove_non_korean_characters(graphemes) | |
| if self.no_space: | |
| graphemes = list(filter(lambda s: s != " ", graphemes)) | |
| else: | |
| graphemes = [x if x != " " else self.space_symbol for x in graphemes] | |
| return graphemes | |
| class Phonemizer: | |
| """Phonemizer module for various languages. | |
| This is wrapper module of https://github.com/bootphon/phonemizer. | |
| You can define various g2p modules by specifying options for phonemizer. | |
| See available options: | |
| https://github.com/bootphon/phonemizer/blob/master/phonemizer/phonemize.py#L32 | |
| """ | |
| def __init__( | |
| self, | |
| backend, | |
| word_separator: Optional[str] = None, | |
| syllable_separator: Optional[str] = None, | |
| phone_separator: Optional[str] = " ", | |
| strip=False, | |
| split_by_single_token: bool = False, | |
| **phonemizer_kwargs, | |
| ): | |
| # delayed import | |
| from phonemizer.backend import BACKENDS | |
| from phonemizer.separator import Separator | |
| self.separator = Separator( | |
| word=word_separator, | |
| syllable=syllable_separator, | |
| phone=phone_separator, | |
| ) | |
| # define logger to suppress the warning in phonemizer | |
| logger = logging.getLogger("phonemizer") | |
| logger.setLevel(logging.ERROR) | |
| self.phonemizer = BACKENDS[backend]( | |
| **phonemizer_kwargs, | |
| logger=logger, | |
| ) | |
| self.strip = strip | |
| self.split_by_single_token = split_by_single_token | |
| def __call__(self, text) -> List[str]: | |
| tokens = self.phonemizer.phonemize( | |
| [text], | |
| separator=self.separator, | |
| strip=self.strip, | |
| njobs=1, | |
| )[0] | |
| if not self.split_by_single_token: | |
| return tokens.split() | |
| else: | |
| # "a: ab" -> ["a", ":", "<space>", "a", "b"] | |
| # TODO(kan-bayashi): space replacement should be dealt in PhonemeTokenizer | |
| return [c.replace(" ", "<space>") for c in tokens] | |
| class PhonemeTokenizer(AbsTokenizer): | |
| def __init__( | |
| self, | |
| g2p_type: Union[None, str], | |
| non_linguistic_symbols: Union[Path, str, Iterable[str]] = None, | |
| space_symbol: str = "<space>", | |
| remove_non_linguistic_symbols: bool = False, | |
| ): | |
| if g2p_type is None: | |
| self.g2p = split_by_space | |
| elif g2p_type == "g2p_en": | |
| self.g2p = G2p_en(no_space=False) | |
| elif g2p_type == "g2p_en_no_space": | |
| self.g2p = G2p_en(no_space=True) | |
| elif g2p_type == "pyopenjtalk": | |
| self.g2p = pyopenjtalk_g2p | |
| elif g2p_type == "pyopenjtalk_kana": | |
| self.g2p = pyopenjtalk_g2p_kana | |
| elif g2p_type == "pyopenjtalk_accent": | |
| self.g2p = pyopenjtalk_g2p_accent | |
| elif g2p_type == "pyopenjtalk_accent_with_pause": | |
| self.g2p = pyopenjtalk_g2p_accent_with_pause | |
| elif g2p_type == "pyopenjtalk_prosody": | |
| self.g2p = pyopenjtalk_g2p_prosody | |
| elif g2p_type == "pypinyin_g2p": | |
| self.g2p = pypinyin_g2p | |
| elif g2p_type == "pypinyin_g2p_phone": | |
| self.g2p = pypinyin_g2p_phone | |
| elif g2p_type == "espeak_ng_arabic": | |
| self.g2p = Phonemizer( | |
| language="ar", | |
| backend="espeak", | |
| with_stress=True, | |
| preserve_punctuation=True, | |
| ) | |
| elif g2p_type == "espeak_ng_german": | |
| self.g2p = Phonemizer( | |
| language="de", | |
| backend="espeak", | |
| with_stress=True, | |
| preserve_punctuation=True, | |
| ) | |
| elif g2p_type == "espeak_ng_french": | |
| self.g2p = Phonemizer( | |
| language="fr-fr", | |
| backend="espeak", | |
| with_stress=True, | |
| preserve_punctuation=True, | |
| ) | |
| elif g2p_type == "espeak_ng_spanish": | |
| self.g2p = Phonemizer( | |
| language="es", | |
| backend="espeak", | |
| with_stress=True, | |
| preserve_punctuation=True, | |
| ) | |
| elif g2p_type == "espeak_ng_russian": | |
| self.g2p = Phonemizer( | |
| language="ru", | |
| backend="espeak", | |
| with_stress=True, | |
| preserve_punctuation=True, | |
| ) | |
| elif g2p_type == "espeak_ng_greek": | |
| self.g2p = Phonemizer( | |
| language="el", | |
| backend="espeak", | |
| with_stress=True, | |
| preserve_punctuation=True, | |
| ) | |
| elif g2p_type == "espeak_ng_finnish": | |
| self.g2p = Phonemizer( | |
| language="fi", | |
| backend="espeak", | |
| with_stress=True, | |
| preserve_punctuation=True, | |
| ) | |
| elif g2p_type == "espeak_ng_hungarian": | |
| self.g2p = Phonemizer( | |
| language="hu", | |
| backend="espeak", | |
| with_stress=True, | |
| preserve_punctuation=True, | |
| ) | |
| elif g2p_type == "espeak_ng_dutch": | |
| self.g2p = Phonemizer( | |
| language="nl", | |
| backend="espeak", | |
| with_stress=True, | |
| preserve_punctuation=True, | |
| ) | |
| elif g2p_type == "espeak_ng_hindi": | |
| self.g2p = Phonemizer( | |
| language="hi", | |
| backend="espeak", | |
| with_stress=True, | |
| preserve_punctuation=True, | |
| ) | |
| elif g2p_type == "g2pk": | |
| self.g2p = G2pk(no_space=False) | |
| elif g2p_type == "g2pk_no_space": | |
| self.g2p = G2pk(no_space=True) | |
| elif g2p_type == "espeak_ng_english_us_vits": | |
| # VITS official implementation-like processing | |
| # Reference: https://github.com/jaywalnut310/vits | |
| self.g2p = Phonemizer( | |
| language="en-us", | |
| backend="espeak", | |
| with_stress=True, | |
| preserve_punctuation=True, | |
| strip=True, | |
| word_separator=" ", | |
| phone_separator="", | |
| split_by_single_token=True, | |
| ) | |
| elif g2p_type == "korean_jaso": | |
| self.g2p = Jaso(space_symbol=space_symbol, no_space=False) | |
| elif g2p_type == "korean_jaso_no_space": | |
| self.g2p = Jaso(no_space=True) | |
| else: | |
| raise NotImplementedError(f"Not supported: g2p_type={g2p_type}") | |
| self.g2p_type = g2p_type | |
| self.space_symbol = space_symbol | |
| if non_linguistic_symbols is None: | |
| self.non_linguistic_symbols = set() | |
| elif isinstance(non_linguistic_symbols, (Path, str)): | |
| non_linguistic_symbols = Path(non_linguistic_symbols) | |
| try: | |
| with non_linguistic_symbols.open("r", encoding="utf-8") as f: | |
| self.non_linguistic_symbols = set(line.rstrip() for line in f) | |
| except FileNotFoundError: | |
| warnings.warn(f"{non_linguistic_symbols} doesn't exist.") | |
| self.non_linguistic_symbols = set() | |
| else: | |
| self.non_linguistic_symbols = set(non_linguistic_symbols) | |
| self.remove_non_linguistic_symbols = remove_non_linguistic_symbols | |
| def __repr__(self): | |
| return ( | |
| f"{self.__class__.__name__}(" | |
| f'g2p_type="{self.g2p_type}", ' | |
| f'space_symbol="{self.space_symbol}", ' | |
| f'non_linguistic_symbols="{self.non_linguistic_symbols}"' | |
| ")" | |
| ) | |
| def text2tokens(self, line: str) -> List[str]: | |
| tokens = [] | |
| while len(line) != 0: | |
| for w in self.non_linguistic_symbols: | |
| if line.startswith(w): | |
| if not self.remove_non_linguistic_symbols: | |
| tokens.append(line[: len(w)]) | |
| line = line[len(w) :] | |
| break | |
| else: | |
| t = line[0] | |
| tokens.append(t) | |
| line = line[1:] | |
| line = "".join(tokens) | |
| tokens = self.g2p(line) | |
| return tokens | |
| def tokens2text(self, tokens: Iterable[str]) -> str: | |
| # phoneme type is not invertible | |
| return "".join(tokens) | |