Spaces:
Running
Running
Commit
·
7a5c5b0
1
Parent(s):
6f08302
apply the "less diacritics" rules for lat -> cyr
Browse files
- myv_translit.py +20 -1
- test_translit.py +17 -0
myv_translit.py
CHANGED
|
@@ -138,12 +138,28 @@ _cyr2lat_first_e = [
|
|
| 138 |
{'find_what': '\\bě', 'replacer': 'e', 're': True},
|
| 139 |
]
|
| 140 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 141 |
_cyr2lat_soft_l_after_vowels = [
|
| 142 |
# joint acutes | disjoint acutes
|
| 143 |
{'find_what': '([iěeIĚE])(Ĺ|Ĺ)', 'replacer': '\\1L', 're': True},
|
| 144 |
{'find_what': '([iěeIĚE])(ĺ|ĺ)', 'replacer': '\\1l', 're': True},
|
| 145 |
]
|
| 146 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 147 |
_lat2cyr = [
|
| 148 |
{'find_what': 'Ŕ', 'replacer': 'Ŕ', 're': False},
|
| 149 |
{'find_what': 'Ĺ', 'replacer': 'Ĺ', 're': False},
|
|
@@ -299,8 +315,11 @@ def cyr2lat(text, joint_acute=True, first_e_with_hacek=True, soft_l_after_vowels
|
|
| 299 |
|
| 300 |
|
| 301 |
def lat2cyr(text, joint_acute=True, first_e_with_hacek=True, soft_l_after_vowels=True):
|
| 302 |
-
|
|
|
|
| 303 |
text = transliterate_with_rules(text, _lat2cyr)
|
|
|
|
|
|
|
| 304 |
text = transliterate_with_rules(text, _lat2cyr_special_cases)
|
| 305 |
return text
|
| 306 |
|
|
|
|
| 138 |
{'find_what': '\\bě', 'replacer': 'e', 're': True},
|
| 139 |
]
|
| 140 |
|
| 141 |
+
_lat2cyr_first_e = [
|
| 142 |
+
{'find_what': '\\bE', 'replacer': 'Ě', 're': True},
|
| 143 |
+
{'find_what': '\\be', 'replacer': 'ě', 're': True},
|
| 144 |
+
]
|
| 145 |
+
|
| 146 |
_cyr2lat_soft_l_after_vowels = [
|
| 147 |
# joint acutes | disjoint acutes
|
| 148 |
{'find_what': '([iěeIĚE])(Ĺ|Ĺ)', 'replacer': '\\1L', 're': True},
|
| 149 |
{'find_what': '([iěeIĚE])(ĺ|ĺ)', 'replacer': '\\1l', 're': True},
|
| 150 |
]
|
| 151 |
|
| 152 |
+
_lat2cyr_soft_l_after_vowels = [
|
| 153 |
+
# add the soft sign, but only if the next letter is not softening
|
| 154 |
+
{'find_what': '([иэеИЭЕ])(Л)\\b', 'replacer': '\\1Ль', 're': True},
|
| 155 |
+
{'find_what': '([иэеИЭЕ])(л)\\b', 'replacer': '\\1ль', 're': True},
|
| 156 |
+
{'find_what': '([иэеИЭЕ])(Л)([^ьъиеюяю])', 'replacer': '\\1ЛЬ\\3', 're': True},
|
| 157 |
+
{'find_what': '([иэеИЭЕ])(л)([^ьъиеюяю])', 'replacer': '\\1ль\\3', 're': True},
|
| 158 |
+
# special cases when L is still hard
|
| 159 |
+
# todo: fix all the exclusions from the list in https://t.me/ravo_club/9776
|
| 160 |
+
{'find_what': '([иэеИЭЕ][Лл])([Ьь])(ГАД|ГАВТ|гад|гавт)', 'replacer': '\\1\\3', 're': True},
|
| 161 |
+
]
|
| 162 |
+
|
| 163 |
_lat2cyr = [
|
| 164 |
{'find_what': 'Ŕ', 'replacer': 'Ŕ', 're': False},
|
| 165 |
{'find_what': 'Ĺ', 'replacer': 'Ĺ', 're': False},
|
|
|
|
| 315 |
|
| 316 |
|
| 317 |
def lat2cyr(text, joint_acute=True, first_e_with_hacek=True, soft_l_after_vowels=True):
    """Transliterate Latin-script text to Cyrillic by applying rule tables in order.

    Args:
        text: the Latin-script input string.
        joint_acute: accepted for symmetry with cyr2lat; this function body
            does not read it.
        first_e_with_hacek: when False, the `_lat2cyr_first_e` rules are
            applied before the main table.
        soft_l_after_vowels: when False, the `_lat2cyr_soft_l_after_vowels`
            rules are applied right after the main table.

    Returns:
        The text after all selected rule tables have been applied.
    """
    # Build the ordered pipeline first, then run it: optional pre-pass, main
    # table, optional soft-L pass, and finally the special cases.
    pipeline = []
    if not first_e_with_hacek:
        pipeline.append(_lat2cyr_first_e)
    pipeline.append(_lat2cyr)
    if not soft_l_after_vowels:
        pipeline.append(_lat2cyr_soft_l_after_vowels)
    pipeline.append(_lat2cyr_special_cases)

    for rule_table in pipeline:
        text = transliterate_with_rules(text, rule_table)
    return text
|
| 325 |
|
test_translit.py
CHANGED
|
@@ -53,6 +53,23 @@ def test_consistency():
|
|
| 53 |
line_cyr2 = lat2cyr(line_lat)
|
| 54 |
assert line_cyr == line_cyr2
|
| 55 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
|
| 57 |
def test_zontik():
|
| 58 |
with open('examples/zontik_cyr.txt', 'r') as f:
|
|
|
|
| 53 |
line_cyr2 = lat2cyr(line_lat)
|
| 54 |
assert line_cyr == line_cyr2
|
| 55 |
|
| 56 |
+
for line_cyr in lines:
|
| 57 |
+
line_lat = cyr2lat(line_cyr, joint_acute=False)
|
| 58 |
+
line_cyr2 = lat2cyr(line_lat, joint_acute=False)
|
| 59 |
+
assert line_cyr == line_cyr2
|
| 60 |
+
|
| 61 |
+
for line_cyr in lines:
|
| 62 |
+
line_lat = cyr2lat(line_cyr, soft_l_after_vowels=False)
|
| 63 |
+
line_cyr2 = lat2cyr(line_lat, soft_l_after_vowels=False)
|
| 64 |
+
assert line_cyr == line_cyr2
|
| 65 |
+
|
| 66 |
+
for line_cyr in lines:
|
| 67 |
+
if ' ежос' in line_cyr: # normally, this does not happen in the Erzya language
|
| 68 |
+
continue
|
| 69 |
+
line_lat = cyr2lat(line_cyr, first_e_with_hacek=False)
|
| 70 |
+
line_cyr2 = lat2cyr(line_lat, first_e_with_hacek=False)
|
| 71 |
+
assert line_cyr == line_cyr2
|
| 72 |
+
|
| 73 |
|
| 74 |
def test_zontik():
|
| 75 |
with open('examples/zontik_cyr.txt', 'r') as f:
|