Mel Seto committed on
Commit
2fc8513
·
1 Parent(s): f162916

adjust formatting

Browse files
run.py CHANGED
@@ -1,10 +1,14 @@
1
  from multiprocessing import freeze_support
 
2
  from watchfiles import run_process
 
3
  import app # Import app module
4
 
 
5
  def start_app():
6
  app.launch_app()
7
 
 
8
  if __name__ == "__main__":
9
  freeze_support()
10
- run_process('.', target=start_app)
 
1
"""Development entry point: run the app under watchfiles so it restarts on source changes."""

from multiprocessing import freeze_support

from watchfiles import run_process

import app  # Import app module


def start_app():
    # Target callable handed to run_process; re-run on every detected change.
    app.launch_app()


if __name__ == "__main__":
    # No-op except in a frozen (e.g. PyInstaller) Windows build, where it is
    # required before any multiprocessing machinery starts.
    freeze_support()
    # Watch the current directory and restart start_app when files change.
    run_process(".", target=start_app)
scripts/chid-dataset-processor.py CHANGED
@@ -1,17 +1,20 @@
1
  # Code generated with ChatGPT
2
 
3
  import json
4
- from pypinyin import pinyin, Style
5
  from datasets import load_dataset
 
6
 
7
  # --- Load ChID dataset ---
8
  dataset = load_dataset("thu-coai/chid")
9
 
 
10
  # --- Helper to extract ground truth idioms ---
11
  def get_ground_truth(example):
12
  data = json.loads(example["text"])
13
  # Take the first candidate for each content passage
14
- return [c[0] for c in data["candidates"][:len(data["content"])]]
 
15
 
16
  # --- Collect all ground truth idioms ---
17
  ground_truths = []
@@ -23,18 +26,17 @@ idiom_set = {idiom for idiom in ground_truths if len(idiom) == 4}
23
 
24
  print(f"Filtered 4-character idioms: {len(idiom_set)}")
25
 
 
26
  # --- Generate pinyin for each idiom ---
27
  def generate_pinyin(idiom):
28
  pinyin_list = pinyin(idiom, style=Style.TONE)
29
  return " ".join([s[0] for s in pinyin_list])
30
 
 
31
  # --- Build reference dataset ---
32
  reference_data = []
33
  for idiom in idiom_set:
34
- reference_data.append({
35
- "idiom": idiom,
36
- "pinyin": generate_pinyin(idiom)
37
- })
38
 
39
  # --- Save to JSON ---
40
  with open("chid_idiom_reference_with_pinyin.json", "w", encoding="utf-8") as f:
 
1
  # Code generated with ChatGPT
2
 
3
  import json
4
+
5
  from datasets import load_dataset
6
+ from pypinyin import Style, pinyin
7
 
8
  # --- Load ChID dataset ---
9
  dataset = load_dataset("thu-coai/chid")
10
 
11
+
12
  # --- Helper to extract ground truth idioms ---
13
  def get_ground_truth(example):
14
  data = json.loads(example["text"])
15
  # Take the first candidate for each content passage
16
+ return [c[0] for c in data["candidates"][: len(data["content"])]]
17
+
18
 
19
  # --- Collect all ground truth idioms ---
20
  ground_truths = []
 
26
 
27
  print(f"Filtered 4-character idioms: {len(idiom_set)}")
28
 
29
+
30
# --- Generate pinyin for each idiom ---
def generate_pinyin(idiom):
    """Return the tone-marked pinyin of *idiom*, syllables joined by spaces."""
    syllables = pinyin(idiom, style=Style.TONE)
    # pypinyin returns one list of readings per character; take the first of each.
    return " ".join(syllable[0] for syllable in syllables)
34
 
35
+
36
# --- Build reference dataset ---
# One {"idiom", "pinyin"} record per filtered 4-character idiom.
# Comprehension instead of a manual append loop (same order, same contents).
reference_data = [
    {"idiom": idiom, "pinyin": generate_pinyin(idiom)} for idiom in idiom_set
]
 
 
 
40
 
41
  # --- Save to JSON ---
42
  with open("chid_idiom_reference_with_pinyin.json", "w", encoding="utf-8") as f:
tests/test_utils.py CHANGED
@@ -1,10 +1,15 @@
1
  import pytest
 
2
  from utils.utils import get_pinyin
3
 
4
- @pytest.mark.parametrize("text, expected", [
5
- ("举棋不定", "jǔ qí bù dìng"),
6
- ("风", "fēng"),
7
- ("不怕慢,就怕站", "bù pà màn , jiù pà zhàn"),
8
- ])
 
 
 
 
9
  def test_get_pinyin_accent(text, expected):
10
  assert get_pinyin(text) == expected
 
1
  import pytest
2
+
3
  from utils.utils import get_pinyin
4
 
5
+
6
+ @pytest.mark.parametrize(
7
+ "text, expected",
8
+ [
9
+ ("举棋不定", "jǔ qí bù dìng"),
10
+ ("风", "fēng"),
11
+ ("不怕慢,就怕站", "bù pà màn , jiù pà zhàn"),
12
+ ],
13
+ )
14
  def test_get_pinyin_accent(text, expected):
15
  assert get_pinyin(text) == expected
utils/utils.py CHANGED
@@ -1,4 +1,5 @@
1
- from pypinyin import pinyin, Style
 
2
 
3
  def get_pinyin(text: str):
4
  """Convert Chinese characters to pinyin with tones."""
 
1
+ from pypinyin import Style, pinyin
2
+
3
 
4
  def get_pinyin(text: str):
5
  """Convert Chinese characters to pinyin with tones."""