Spaces:
Running
Running
Commit
·
46fe50f
1
Parent(s):
c45b82d
detect the script automatically, if not specified
Browse files- app.py +9 -6
- myv_translit.py +36 -1
- test_translit.py +10 -1
app.py
CHANGED
|
@@ -1,12 +1,17 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
|
| 3 |
-
from myv_translit import lat2cyr, cyr2lat
|
| 4 |
|
|
|
|
| 5 |
|
| 6 |
-
|
|
|
|
| 7 |
first_e_with_hacek = not not_first_e_with_hacek
|
| 8 |
soft_l_after_vowels = not not_soft_l_after_vowels
|
| 9 |
-
if
|
|
|
|
|
|
|
|
|
|
| 10 |
result = cyr2lat(input_text, joint_acute=joint_acute, first_e_with_hacek=first_e_with_hacek, soft_l_after_vowels=soft_l_after_vowels)
|
| 11 |
else:
|
| 12 |
result = lat2cyr(input_text, joint_acute=joint_acute, first_e_with_hacek=first_e_with_hacek, soft_l_after_vowels=soft_l_after_vowels)
|
|
@@ -21,14 +26,12 @@ article = """
|
|
| 21 |
- http://valks.erzja.info/2020/04/30/эрзянский-алфавит/
|
| 22 |
"""
|
| 23 |
|
| 24 |
-
directions = ['lat -> кир', 'кир -> lat']
|
| 25 |
-
|
| 26 |
|
| 27 |
interface = gr.Interface(
|
| 28 |
transliterator,
|
| 29 |
[
|
| 30 |
gr.Textbox(label="Текст", lines=2, placeholder='text to transliterate'),
|
| 31 |
-
gr.Radio(choices=
|
| 32 |
gr.Checkbox(value=True, label='L + ́ -> Ĺ'),
|
| 33 |
gr.Checkbox(value=False, label='ěrzä -> erzä'),
|
| 34 |
gr.Checkbox(value=False, label='peĺks -> pelks'),
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
|
| 3 |
+
from myv_translit import lat2cyr, cyr2lat, detect_script
|
| 4 |
|
| 5 |
+
DIRECTIONS = ['lat -> кир', 'кир -> lat']
|
| 6 |
|
| 7 |
+
|
| 8 |
+
def transliterator(input_text, direction, joint_acute=True, not_first_e_with_hacek=False, not_soft_l_after_vowels=True):
|
| 9 |
first_e_with_hacek = not not_first_e_with_hacek
|
| 10 |
soft_l_after_vowels = not not_soft_l_after_vowels
|
| 11 |
+
if direction is None:
|
| 12 |
+
code = detect_script(input_text)
|
| 13 |
+
direction = DIRECTIONS[int(code != 'lat')]
|
| 14 |
+
if direction == DIRECTIONS[1]:
|
| 15 |
result = cyr2lat(input_text, joint_acute=joint_acute, first_e_with_hacek=first_e_with_hacek, soft_l_after_vowels=soft_l_after_vowels)
|
| 16 |
else:
|
| 17 |
result = lat2cyr(input_text, joint_acute=joint_acute, first_e_with_hacek=first_e_with_hacek, soft_l_after_vowels=soft_l_after_vowels)
|
|
|
|
| 26 |
- http://valks.erzja.info/2020/04/30/эрзянский-алфавит/
|
| 27 |
"""
|
| 28 |
|
|
|
|
|
|
|
| 29 |
|
| 30 |
interface = gr.Interface(
|
| 31 |
transliterator,
|
| 32 |
[
|
| 33 |
gr.Textbox(label="Текст", lines=2, placeholder='text to transliterate'),
|
| 34 |
+
gr.Radio(choices=DIRECTIONS, type="value", interactive=True, label='Направление'),
|
| 35 |
gr.Checkbox(value=True, label='L + ́ -> Ĺ'),
|
| 36 |
gr.Checkbox(value=False, label='ěrzä -> erzä'),
|
| 37 |
gr.Checkbox(value=False, label='peĺks -> pelks'),
|
myv_translit.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
import re
|
| 2 |
-
|
| 3 |
|
| 4 |
_cyr2lat = [
|
| 5 |
{'find_what': 'А', 'replacer': 'A', 're': False},
|
|
@@ -265,3 +265,38 @@ def cyr2lat(text, joint_acute=True, first_e_with_hacek=True, soft_l_after_vowels
|
|
| 265 |
def lat2cyr(text, joint_acute=True, first_e_with_hacek=True, soft_l_after_vowels=True):
|
| 266 |
# todo: support all the optional settings
|
| 267 |
return transliterate_with_rules(text, _lat2cyr)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import re
|
| 2 |
+
from collections import Counter
|
| 3 |
|
| 4 |
_cyr2lat = [
|
| 5 |
{'find_what': 'А', 'replacer': 'A', 're': False},
|
|
|
|
| 265 |
def lat2cyr(text, joint_acute=True, first_e_with_hacek=True, soft_l_after_vowels=True):
|
| 266 |
# todo: support all the optional settings
|
| 267 |
return transliterate_with_rules(text, _lat2cyr)
|
| 268 |
+
|
| 269 |
+
|
| 270 |
+
# Lowercase letters used to classify every character of a text by script.
# Both alphabets must be complete: a missing letter is silently counted as
# "other" and skews the detection towards 'unk' or the opposite script.
CYR_CHARS = 'абвгдеёжзийклмнопрстуфхцчшщъыьэюя'
BASIC_LAT_CHARS = 'abcdefghijklmnopqrstuvwxyz'
# NOTE(review): only the accented letters listed here are treated as Latin;
# confirm this covers every letter the Latin transliteration can produce.
ACCENT_LAT_CHARS = 'ěäüöśźćńŕťďĺ'
LAT_CHARS = BASIC_LAT_CHARS + ACCENT_LAT_CHARS


def detect_script(text: str, min_prevalence: float = 2.0, min_detectable: float = 0.1) -> str:
    """Detect the script of the text.

    Every character of the lowercased text (including whitespace, digits and
    punctuation) is counted as Cyrillic, Latin or "other".

    Args:
        text: The text to classify.
        min_prevalence: How many times more frequent one script must be than
            the other in order to be reported as the script of the text.
        min_detectable: Minimal share of Cyrillic + Latin letters among all
            characters; below it the script is reported as unknown.

    Returns:
        One of the codes:
        - cyr - Cyrillic
        - lat - Latin
        - mix - Mixed Cyrillic and Latin script
        - unk - Unknown script (probably neither Latin nor Cyrillic)
    """
    char_cnt = Counter(text.lower())
    cyr = sum(cnt for char, cnt in char_cnt.items() if char in CYR_CHARS)
    lat = sum(cnt for char, cnt in char_cnt.items() if char in LAT_CHARS)
    total = sum(char_cnt.values())
    letters = cyr + lat

    # Without any recognizable letter there is nothing to decide on; checking
    # this explicitly also keeps `min_detectable=0` from reporting 'cyr'
    # for texts that contain no letters at all (0 >= 0 * min_prevalence).
    if not letters:
        return 'unk'
    if letters / total < min_detectable:
        return 'unk'
    if cyr >= lat * min_prevalence:
        return 'cyr'
    if lat >= cyr * min_prevalence:
        return 'lat'
    return 'mix'
|
test_translit.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
from myv_translit import cyr2lat
|
| 2 |
|
| 3 |
|
| 4 |
def test_join_acute():
|
|
@@ -18,6 +18,15 @@ def test_soft_l():
|
|
| 18 |
assert cyr2lat('пелькс', soft_l_after_vowels=False) == 'pelks'
|
| 19 |
|
| 20 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
# todo: test on a larger corpus
|
| 22 |
# todo: test cyclical consistency
|
| 23 |
|
|
|
|
| 1 |
+
from myv_translit import cyr2lat, detect_script
|
| 2 |
|
| 3 |
|
| 4 |
def test_join_acute():
|
|
|
|
| 18 |
assert cyr2lat('пелькс', soft_l_after_vowels=False) == 'pelks'
|
| 19 |
|
| 20 |
|
| 21 |
+
def test_detection():
    """Check detect_script on unknown, Latin, Cyrillic and mixed inputs."""
    # (input text, expected script code) pairs.
    expectations = [
        # Mostly digits and punctuation: too few letters to decide.
        ('123 456?? 8743 098543 ???...,.! @%%&&& хз', 'unk'),
        ('ěrzä', 'lat'),
        # A single foreign letter must not outweigh the dominant script.
        ('ěrzä ю', 'lat'),
        ('ЭРЗЯ', 'cyr'),
        ('ЭРЗЯ d', 'cyr'),
        # Comparable amounts of both scripts.
        ('ěrzä эрзянь', 'mix'),
    ]
    for sample, expected_code in expectations:
        assert detect_script(sample) == expected_code
|
| 28 |
+
|
| 29 |
+
|
| 30 |
# todo: test on a larger corpus
|
| 31 |
# todo: test cyclical consistency
|
| 32 |
|