Spaces:
Sleeping
Sleeping
| """Updated version of core.py from | |
| https://github.com/yamatt/homoglyphs/tree/main/homoglyphs_fork | |
| for modern python3 | |
| """ | |
| from collections import defaultdict | |
| import json | |
| from itertools import product | |
| import os | |
| import unicodedata | |
# What to do with a character that is not in the current alphabet.
STRATEGY_LOAD = 1  # load the alphabet this character belongs to
STRATEGY_IGNORE = 2  # keep the character in the result unchanged
STRATEGY_REMOVE = 3  # drop the character from the result

# Code points 0..127.
ASCII_RANGE = range(0, 128)

# The JSON data files are looked up in "homoglyph_data" next to this module.
CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
DATA_LOCATION = os.path.join(CURRENT_DIR, "homoglyph_data")
class Categories:
    """
    Work with aliases from ISO 15924.
    https://en.wikipedia.org/wiki/ISO_15924#List_of_codes

    The data is read from ``categories.json``. The code below uses two keys
    of that file: ``"aliases"`` (the known category names) and ``"points"``
    (entries of the form ``[start code, end code, category]``).
    """

    fpath = os.path.join(DATA_LOCATION, "categories.json")

    @classmethod
    def _load(cls):
        """Read ``cls.fpath`` and return the parsed JSON document."""
        with open(cls.fpath, encoding="utf-8") as f:
            return json.load(f)

    @classmethod
    def _get_ranges(cls, categories):
        """
        Yield the code point ranges that belong to the given categories.

        This is a generator, so the validation below only runs once the
        result is iterated.

        :param categories: iterable of category names
        :return: iter: (start code, end code)
        :raises ValueError: if a category is not listed in ``"aliases"``
        """
        data = cls._load()
        # Materialize once: the names are walked for validation and then used
        # for membership tests, which would exhaust a one-shot iterable.
        wanted = set(categories)

        for category in wanted:
            if category not in data["aliases"]:
                raise ValueError(f"Invalid category: {category}")

        for point in data["points"]:
            if point[2] in wanted:
                yield point[:2]

    @classmethod
    def get_alphabet(cls, categories):
        """
        :param categories: iterable of category names
        :return: set of chars in alphabet by categories list
        :rtype: set
        :raises ValueError: if a category is not listed in ``"aliases"``
        """
        alphabet = set()
        for start, end in cls._get_ranges(categories):
            # Ranges are inclusive on both ends.
            alphabet.update(chr(code) for code in range(start, end + 1))
        return alphabet

    @classmethod
    def detect(cls, char):
        """
        Find the category of a single character.

        :param char: a single character
        :return: category, or None when no category matches
        :rtype: str
        """
        data = cls._load()

        # First try the leading word of the Unicode character name.
        try:
            category = unicodedata.name(char).split()[0]
        except (TypeError, ValueError):
            # unicodedata.name raises ValueError for characters without a
            # name and TypeError for arguments that are not a single char.
            pass
        else:
            if category in data["aliases"]:
                return category

        # Fall back to the code point ranges from the JSON file.
        code = ord(char)
        for point in data["points"]:
            if point[0] <= code <= point[1]:
                return point[2]
        return None

    @classmethod
    def get_all(cls):
        """
        :return: all category names listed under ``"aliases"``
        :rtype: set
        """
        return set(cls._load()["aliases"])
class Languages:
    """
    Work with per-language alphabets read from ``languages.json``.

    The file is used as a mapping from language code to the characters of
    that language's alphabet.
    """

    fpath = os.path.join(DATA_LOCATION, "languages.json")

    @classmethod
    def _load(cls):
        """Read ``cls.fpath`` and return the parsed JSON document."""
        with open(cls.fpath, encoding="utf-8") as f:
            return json.load(f)

    @classmethod
    def get_alphabet(cls, languages):
        """
        :param languages: iterable of language codes
        :return: set of chars in alphabet by languages list
        :rtype: set
        :raises ValueError: if a language code is not present in the file
        """
        data = cls._load()
        alphabet = set()
        for lang in languages:
            if lang not in data:
                raise ValueError(f"Invalid language code: {lang}")
            alphabet.update(data[lang])
        return alphabet

    @classmethod
    def detect(cls, char):
        """
        :param char: character to look up
        :return: set of languages which alphabet contains passed char.
        :rtype: set
        """
        data = cls._load()
        return {lang for lang, alphabet in data.items() if char in alphabet}

    @classmethod
    def get_all(cls):
        """
        :return: all language codes present in the file
        :rtype: set
        """
        return set(cls._load())
class Homoglyphs:
    """
    Look up homoglyphs (confusable characters) inside a chosen alphabet.

    The alphabet is the union of the explicit ``alphabet`` characters, the
    characters of the given ``categories`` and those of the given
    ``languages``. When none of the three is given, the categories
    ``("LATIN", "COMMON")`` are used.

    :param categories: iterable of category names accepted by ``Categories``
    :param languages: iterable of language codes accepted by ``Languages``
    :param alphabet: iterable of extra characters
    :param strategy: what to do with a character outside the alphabet; one
        of ``STRATEGY_LOAD``, ``STRATEGY_IGNORE``, ``STRATEGY_REMOVE``
    :param ascii_strategy: only consulted by the ASCII conversion; when it is
        ``STRATEGY_IGNORE`` and a character has no variant inside
        ``ascii_range``, no result is produced for the whole text; with any
        other value such a character is left out of the results
    :param ascii_range: container of code points treated as ASCII
    :raises ValueError: if ``strategy`` is not one of the three strategies
    """

    def __init__(
        self,
        categories=None,
        languages=None,
        alphabet=None,
        strategy=STRATEGY_IGNORE,
        ascii_strategy=STRATEGY_IGNORE,
        ascii_range=ASCII_RANGE,
    ):
        # strategies
        if strategy not in (STRATEGY_LOAD, STRATEGY_IGNORE, STRATEGY_REMOVE):
            raise ValueError("Invalid strategy")
        self.strategy = strategy
        self.ascii_strategy = ascii_strategy
        self.ascii_range = ascii_range

        # Homoglyphs must be initialized by any alphabet for correct work
        if not categories and not languages and not alphabet:
            categories = ("LATIN", "COMMON")

        # cats and langs
        self.categories = set(categories or [])
        self.languages = set(languages or [])

        # alphabet
        self.alphabet = set(alphabet or [])
        if self.categories:
            self.alphabet.update(Categories.get_alphabet(self.categories))
        if self.languages:
            self.alphabet.update(Languages.get_alphabet(self.languages))
        self.table = self.get_table(self.alphabet)

    @staticmethod
    def get_table(alphabet):
        """
        Build the homoglyph table for characters within one alphabet.

        :param alphabet: container of characters
        :return: mapping char -> set of homoglyphs, both from ``alphabet``
        :rtype: defaultdict
        """
        # Same lookup as the restricted table with source == target.
        return Homoglyphs.get_restricted_table(alphabet, alphabet)

    @staticmethod
    def get_restricted_table(source_alphabet, target_alphabet):
        """
        Build a homoglyph table from one alphabet into another.

        :param source_alphabet: characters to find homoglyphs for
        :param target_alphabet: container of characters allowed as homoglyphs
        :return: mapping source char -> set of homoglyphs from the target
            alphabet; characters without any homoglyph get no entry
        :rtype: defaultdict
        """
        table = defaultdict(set)
        path = os.path.join(DATA_LOCATION, "confusables_sept2022.json")
        # Explicit encoding, like the other data files, so decoding does not
        # depend on the locale's default encoding.
        with open(path, encoding="utf-8") as f:
            data = json.load(f)
        for char in source_alphabet:
            for homoglyph in data.get(char, ()):
                if homoglyph in target_alphabet:
                    table[char].add(homoglyph)
        return table

    @staticmethod
    def uniq_and_sort(data):
        """
        :param data: iterable of strings
        :return: unique items, longest first, ties in ascending order
        :rtype: list
        """
        return sorted(set(data), key=lambda x: (-len(x), x))

    def _update_alphabet(self, char):
        """
        Extend the alphabet with the languages or category of ``char``.

        :return: False when neither a language nor a category is found for
            the character, True otherwise
        :rtype: bool
        """
        # try detect languages
        langs = Languages.detect(char)
        if langs:
            self.languages.update(langs)
            self.alphabet.update(Languages.get_alphabet(langs))
        else:
            # try detect categories
            category = Categories.detect(char)
            if category is None:
                return False
            self.categories.add(category)
            self.alphabet.update(Categories.get_alphabet([category]))
        # update table for new alphabet
        self.table = self.get_table(self.alphabet)
        return True

    def _get_char_variants(self, char):
        """
        Return ``char`` together with its homoglyphs and their homoglyphs.

        For a character outside the alphabet the result depends on
        ``self.strategy``: the alphabet is extended (``STRATEGY_LOAD``), the
        character is returned alone (``STRATEGY_IGNORE``) or nothing is
        returned (``STRATEGY_REMOVE``).

        :rtype: list
        """
        if char not in self.alphabet:
            if self.strategy == STRATEGY_LOAD:
                if not self._update_alphabet(char):
                    return []
            elif self.strategy == STRATEGY_IGNORE:
                return [char]
            elif self.strategy == STRATEGY_REMOVE:
                return []

        # Work on a copy: updating the set stored in self.table in place
        # would make every later call for this char return a larger result.
        direct = self.table.get(char, set())
        variants = set(direct)
        # add the alternatives of every direct alternative
        for alt_char in direct:
            variants |= self.table.get(alt_char, set())
        # add current char to alternatives
        variants.add(char)

        # uniq, sort and return
        return self.uniq_and_sort(variants)

    def _get_combinations(self, text, ascii=False):
        """
        Yield every string obtained by replacing each character of ``text``
        with one of its variants.

        :param ascii: when true, keep only variants whose code point is in
            ``self.ascii_range``
        """
        variations = []
        for char in text:
            alt_chars = self._get_char_variants(char)

            if ascii:
                alt_chars = [c for c in alt_chars if ord(c) in self.ascii_range]
                # No ASCII variant for this char: give up on the whole text.
                if not alt_chars and self.ascii_strategy == STRATEGY_IGNORE:
                    return

            # A char without variants is left out of the results.
            if alt_chars:
                variations.append(alt_chars)

        if variations:
            for variant in product(*variations):
                yield "".join(variant)

    def get_combinations(self, text):
        """
        :return: all homoglyph variations of ``text``
        :rtype: list
        """
        return list(self._get_combinations(text))

    def _to_ascii(self, text):
        """Yield the variations of ``text`` that lie within the ASCII range."""
        for variant in self._get_combinations(text, ascii=True):
            if max(map(ord, variant)) in self.ascii_range:
                yield variant

    def to_ascii(self, text):
        """
        :return: unique ASCII variations of ``text``, longest first
        :rtype: list
        """
        return self.uniq_and_sort(self._to_ascii(text))