Spaces:
Running
on
Zero
Running
on
Zero
| from pathlib import Path | |
| from typing import Iterable | |
| from typing import Union | |
| from funasr_detach.tokenizer.abs_tokenizer import AbsTokenizer | |
| from funasr_detach.tokenizer.char_tokenizer import CharTokenizer | |
| from funasr_detach.tokenizer.phoneme_tokenizer import PhonemeTokenizer | |
| from funasr_detach.tokenizer.sentencepiece_tokenizer import SentencepiecesTokenizer | |
| from funasr_detach.tokenizer.word_tokenizer import WordTokenizer | |
def build_tokenizer(
    token_type: str,
    bpemodel: Union[Path, str, Iterable[str], None] = None,
    non_linguistic_symbols: Union[Path, str, Iterable[str], None] = None,
    remove_non_linguistic_symbols: bool = False,
    space_symbol: str = "<space>",
    delimiter: Union[str, None] = None,
    g2p_type: Union[str, None] = None,
) -> AbsTokenizer:
    """Instantiate the tokenizer selected by ``token_type``.

    Args:
        token_type: Which tokenizer to build. One of ``"bpe"``, ``"word"``,
            ``"char"`` or ``"phn"``.
        bpemodel: Passed to ``SentencepiecesTokenizer``. Required when
            ``token_type == "bpe"``; not used by the other branches.
        non_linguistic_symbols: Forwarded to the word, char and phoneme
            tokenizers. Not used by the ``"bpe"`` branch.
        remove_non_linguistic_symbols: Forwarded to the char and phoneme
            tokenizers as-is. For ``"word"`` it only takes effect when
            ``non_linguistic_symbols`` is also given. Not supported for
            ``"bpe"``.
        space_symbol: Forwarded to the char and phoneme tokenizers.
        delimiter: Forwarded to ``WordTokenizer``.
        g2p_type: Forwarded to ``PhonemeTokenizer``.

    Returns:
        The tokenizer instance matching ``token_type``.

    Raises:
        ValueError: If ``token_type`` is ``"bpe"`` and ``bpemodel`` is
            ``None``, or if ``token_type`` is not a supported value.
        RuntimeError: If ``token_type`` is ``"bpe"`` and
            ``remove_non_linguistic_symbols`` is true.
    """
    if token_type == "bpe":
        if bpemodel is None:
            raise ValueError('bpemodel is required if token_type = "bpe"')
        if remove_non_linguistic_symbols:
            raise RuntimeError(
                "remove_non_linguistic_symbols is not implemented for token_type=bpe"
            )
        return SentencepiecesTokenizer(bpemodel)

    if token_type == "word":
        # Symbol removal is only wired up when a symbol list is supplied;
        # otherwise the request is ignored and a plain word tokenizer is
        # returned.
        if remove_non_linguistic_symbols and non_linguistic_symbols is not None:
            return WordTokenizer(
                delimiter=delimiter,
                non_linguistic_symbols=non_linguistic_symbols,
                remove_non_linguistic_symbols=True,
            )
        return WordTokenizer(delimiter=delimiter)

    if token_type == "char":
        return CharTokenizer(
            non_linguistic_symbols=non_linguistic_symbols,
            space_symbol=space_symbol,
            remove_non_linguistic_symbols=remove_non_linguistic_symbols,
        )

    if token_type == "phn":
        return PhonemeTokenizer(
            g2p_type=g2p_type,
            non_linguistic_symbols=non_linguistic_symbols,
            space_symbol=space_symbol,
            remove_non_linguistic_symbols=remove_non_linguistic_symbols,
        )

    # Name the real parameter (token_type) so the message points the caller
    # at the argument they actually passed.
    raise ValueError(
        f"token_type must be one of bpe, word, char or phn: {token_type}"
    )