idiom-finder

Sleeping

Mel Seto committed on Sep 4

Commit

2fc8513

1 Parent(s): f162916

adjust formatting

Files changed (4) hide show

run.py CHANGED Viewed

@@ -1,10 +1,14 @@
 from multiprocessing import freeze_support
 from watchfiles import run_process
 import app  # Import app module
 def start_app():
     app.launch_app()
 if __name__ == "__main__":
     freeze_support()
-    run_process('.', target=start_app)

 from multiprocessing import freeze_support
 from watchfiles import run_process
 import app  # Import app module
 def start_app():
     app.launch_app()
 if __name__ == "__main__":
     freeze_support()
+    run_process(".", target=start_app)

scripts/chid-dataset-processor.py CHANGED Viewed

@@ -1,17 +1,20 @@
 # Code generated with ChatGPT
 import json
-from pypinyin import pinyin, Style
 from datasets import load_dataset
 # --- Load ChID dataset ---
 dataset = load_dataset("thu-coai/chid")
 # --- Helper to extract ground truth idioms ---
 def get_ground_truth(example):
     data = json.loads(example["text"])
     # Take the first candidate for each content passage
-    return [c[0] for c in data["candidates"][:len(data["content"])]]
 # --- Collect all ground truth idioms ---
 ground_truths = []
@@ -23,18 +26,17 @@ idiom_set = {idiom for idiom in ground_truths if len(idiom) == 4}
 print(f"Filtered 4-character idioms: {len(idiom_set)}")
 # --- Generate pinyin for each idiom ---
 def generate_pinyin(idiom):
     pinyin_list = pinyin(idiom, style=Style.TONE)
     return " ".join([s[0] for s in pinyin_list])
 # --- Build reference dataset ---
 reference_data = []
 for idiom in idiom_set:
-    reference_data.append({
-        "idiom": idiom,
-        "pinyin": generate_pinyin(idiom)
-    })
 # --- Save to JSON ---
 with open("chid_idiom_reference_with_pinyin.json", "w", encoding="utf-8") as f:

 # Code generated with ChatGPT
 import json
 from datasets import load_dataset
+from pypinyin import Style, pinyin
 # --- Load ChID dataset ---
 dataset = load_dataset("thu-coai/chid")
 # --- Helper to extract ground truth idioms ---
 def get_ground_truth(example):
     data = json.loads(example["text"])
     # Take the first candidate for each content passage
+    return [c[0] for c in data["candidates"][: len(data["content"])]]
 # --- Collect all ground truth idioms ---
 ground_truths = []
 print(f"Filtered 4-character idioms: {len(idiom_set)}")
 # --- Generate pinyin for each idiom ---
 def generate_pinyin(idiom):
     pinyin_list = pinyin(idiom, style=Style.TONE)
     return " ".join([s[0] for s in pinyin_list])
 # --- Build reference dataset ---
 reference_data = []
 for idiom in idiom_set:
+    reference_data.append({"idiom": idiom, "pinyin": generate_pinyin(idiom)})
 # --- Save to JSON ---
 with open("chid_idiom_reference_with_pinyin.json", "w", encoding="utf-8") as f:

tests/test_utils.py CHANGED Viewed

@@ -1,10 +1,15 @@
 import pytest
 from utils.utils import get_pinyin
-@pytest.mark.parametrize("text, expected", [
-    ("举棋不定", "jǔ qí bù dìng"),
-    ("风", "fēng"),
-    ("不怕慢，就怕站", "bù pà màn ， jiù pà zhàn"),
-])
 def test_get_pinyin_accent(text, expected):
     assert get_pinyin(text) == expected

 import pytest
 from utils.utils import get_pinyin
+@pytest.mark.parametrize(
+    "text, expected",
+    [
+        ("举棋不定", "jǔ qí bù dìng"),
+        ("风", "fēng"),
+        ("不怕慢，就怕站", "bù pà màn ， jiù pà zhàn"),
+    ],
+)
 def test_get_pinyin_accent(text, expected):
     assert get_pinyin(text) == expected

utils/utils.py CHANGED Viewed

@@ -1,4 +1,5 @@
-from pypinyin import pinyin, Style
 def get_pinyin(text: str):
     """Convert Chinese characters to pinyin with tones."""

+from pypinyin import Style, pinyin
 def get_pinyin(text: str):
     """Convert Chinese characters to pinyin with tones."""