Spaces:
Sleeping
Sleeping
Mel Seto
committed on
Commit
·
2fc8513
1
Parent(s):
f162916
adjust formatting
Browse files
- run.py +5 -1
- scripts/chid-dataset-processor.py +8 -6
- tests/test_utils.py +10 -5
- utils/utils.py +2 -1
run.py
CHANGED
|
@@ -1,10 +1,14 @@
|
|
| 1 |
from multiprocessing import freeze_support
|
|
|
|
| 2 |
from watchfiles import run_process
|
|
|
|
| 3 |
import app # Import app module
|
| 4 |
|
|
|
|
| 5 |
def start_app():
|
| 6 |
app.launch_app()
|
| 7 |
|
|
|
|
| 8 |
if __name__ == "__main__":
|
| 9 |
freeze_support()
|
| 10 |
-
run_process(
|
|
|
|
| 1 |
from multiprocessing import freeze_support
|
| 2 |
+
|
| 3 |
from watchfiles import run_process
|
| 4 |
+
|
| 5 |
import app # Import app module
|
| 6 |
|
| 7 |
+
|
| 8 |
def start_app():
|
| 9 |
app.launch_app()
|
| 10 |
|
| 11 |
+
|
| 12 |
if __name__ == "__main__":
|
| 13 |
freeze_support()
|
| 14 |
+
run_process(".", target=start_app)
|
scripts/chid-dataset-processor.py
CHANGED
|
@@ -1,17 +1,20 @@
|
|
| 1 |
# Code generated with ChatGPT
|
| 2 |
|
| 3 |
import json
|
| 4 |
-
|
| 5 |
from datasets import load_dataset
|
|
|
|
| 6 |
|
| 7 |
# --- Load ChID dataset ---
|
| 8 |
dataset = load_dataset("thu-coai/chid")
|
| 9 |
|
|
|
|
| 10 |
# --- Helper to extract ground truth idioms ---
|
| 11 |
def get_ground_truth(example):
|
| 12 |
data = json.loads(example["text"])
|
| 13 |
# Take the first candidate for each content passage
|
| 14 |
-
return [c[0] for c in data["candidates"][:len(data["content"])]]
|
|
|
|
| 15 |
|
| 16 |
# --- Collect all ground truth idioms ---
|
| 17 |
ground_truths = []
|
|
@@ -23,18 +26,17 @@ idiom_set = {idiom for idiom in ground_truths if len(idiom) == 4}
|
|
| 23 |
|
| 24 |
print(f"Filtered 4-character idioms: {len(idiom_set)}")
|
| 25 |
|
|
|
|
| 26 |
# --- Generate pinyin for each idiom ---
|
| 27 |
def generate_pinyin(idiom):
|
| 28 |
pinyin_list = pinyin(idiom, style=Style.TONE)
|
| 29 |
return " ".join([s[0] for s in pinyin_list])
|
| 30 |
|
|
|
|
| 31 |
# --- Build reference dataset ---
|
| 32 |
reference_data = []
|
| 33 |
for idiom in idiom_set:
|
| 34 |
-
reference_data.append({
|
| 35 |
-
"idiom": idiom,
|
| 36 |
-
"pinyin": generate_pinyin(idiom)
|
| 37 |
-
})
|
| 38 |
|
| 39 |
# --- Save to JSON ---
|
| 40 |
with open("chid_idiom_reference_with_pinyin.json", "w", encoding="utf-8") as f:
|
|
|
|
| 1 |
# Code generated with ChatGPT
|
| 2 |
|
| 3 |
import json
|
| 4 |
+
|
| 5 |
from datasets import load_dataset
|
| 6 |
+
from pypinyin import Style, pinyin
|
| 7 |
|
| 8 |
# --- Load ChID dataset ---
|
| 9 |
dataset = load_dataset("thu-coai/chid")
|
| 10 |
|
| 11 |
+
|
| 12 |
# --- Helper to extract ground truth idioms ---
|
| 13 |
def get_ground_truth(example):
|
| 14 |
data = json.loads(example["text"])
|
| 15 |
# Take the first candidate for each content passage
|
| 16 |
+
return [c[0] for c in data["candidates"][: len(data["content"])]]
|
| 17 |
+
|
| 18 |
|
| 19 |
# --- Collect all ground truth idioms ---
|
| 20 |
ground_truths = []
|
|
|
|
| 26 |
|
| 27 |
print(f"Filtered 4-character idioms: {len(idiom_set)}")
|
| 28 |
|
| 29 |
+
|
| 30 |
# --- Generate pinyin for each idiom ---
|
| 31 |
def generate_pinyin(idiom):
|
| 32 |
pinyin_list = pinyin(idiom, style=Style.TONE)
|
| 33 |
return " ".join([s[0] for s in pinyin_list])
|
| 34 |
|
| 35 |
+
|
| 36 |
# --- Build reference dataset ---
|
| 37 |
reference_data = []
|
| 38 |
for idiom in idiom_set:
|
| 39 |
+
reference_data.append({"idiom": idiom, "pinyin": generate_pinyin(idiom)})
|
|
|
|
|
|
|
|
|
|
| 40 |
|
| 41 |
# --- Save to JSON ---
|
| 42 |
with open("chid_idiom_reference_with_pinyin.json", "w", encoding="utf-8") as f:
|
tests/test_utils.py
CHANGED
|
@@ -1,10 +1,15 @@
|
|
| 1 |
import pytest
|
|
|
|
| 2 |
from utils.utils import get_pinyin
|
| 3 |
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
def test_get_pinyin_accent(text, expected):
|
| 10 |
assert get_pinyin(text) == expected
|
|
|
|
| 1 |
import pytest
|
| 2 |
+
|
| 3 |
from utils.utils import get_pinyin
|
| 4 |
|
| 5 |
+
|
| 6 |
+
@pytest.mark.parametrize(
|
| 7 |
+
"text, expected",
|
| 8 |
+
[
|
| 9 |
+
("举棋不定", "jǔ qí bù dìng"),
|
| 10 |
+
("风", "fēng"),
|
| 11 |
+
("不怕慢,就怕站", "bù pà màn , jiù pà zhàn"),
|
| 12 |
+
],
|
| 13 |
+
)
|
| 14 |
def test_get_pinyin_accent(text, expected):
|
| 15 |
assert get_pinyin(text) == expected
|
utils/utils.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
-
from pypinyin import
|
|
|
|
| 2 |
|
| 3 |
def get_pinyin(text: str):
|
| 4 |
"""Convert Chinese characters to pinyin with tones."""
|
|
|
|
| 1 |
+
from pypinyin import Style, pinyin
|
| 2 |
+
|
| 3 |
|
| 4 |
def get_pinyin(text: str):
|
| 5 |
"""Convert Chinese characters to pinyin with tones."""
|