Mel Seto committed
Commit 021749e · 1 Parent(s): c1e01a2

add traditional character idiom

Files changed (3):
  1. pyproject.toml +1 -0
  2. src/app.py +27 -10
  3. uv.lock +13 -0
pyproject.toml CHANGED
@@ -13,6 +13,7 @@ dependencies = [
     "pypinyin>=0.55.0",
     "sentence-transformers>=2.2.2",
     "numpy>=1.26.0",
+    "opencc>=1.1.9",
 ]
 
 [dependency-groups]
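
Not part of the commit, only an illustration: a minimal sketch of how the newly pinned opencc dependency can be exercised, assuming it has been installed into the project environment (e.g. after `uv sync`). The idiom is the one used by find_idiom_mock() in src/app.py below.

    # Sketch only — assumes the opencc>=1.1.9 dependency added above is installed.
    from opencc import OpenCC

    converter = OpenCC('s2t')            # simplified -> traditional, as in src/app.py
    print(converter.convert("对症下药"))  # expected output: 對症下藥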
src/app.py CHANGED
@@ -1,11 +1,10 @@
-import json
 import os
 
 import gradio as gr
 from cerebras.cloud.sdk import Cerebras
 from dotenv import load_dotenv
+from opencc import OpenCC
 
-from retrieval.retriever import retrieve_idiom
 from utils.utils import get_pinyin
 from verification.verifier import verify_idiom_exists
 
@@ -17,6 +16,8 @@ load_dotenv()
 MODEL = "gpt-oss-120b"
 USE_MOCK = False  # ✅ Toggle between mock and real API
 
+# simplified to traditional Chinese character converter
+char_converter = OpenCC('s2t')
 
 # ======================
 # Instantiate client (if not mocking)
@@ -31,9 +32,11 @@ if not USE_MOCK:
 # ======================
 def find_idiom_mock():
     idiom = "对症下药"
+    trad_idiom = char_converter.convert(idiom)
     explanation = """duì zhèng xià yào<br><br>
 To prescribe the right medicine; to take the right approach to a problem."""
-    return idiom, explanation
+    idiom_output = f"{idiom}<br>{trad_idiom}"
+    return idiom_output, explanation
 
 
 # ======================
@@ -46,7 +49,7 @@ EXAMPLE_CACHE = {}
 
 def find_idiom(situation: str, max_attempts: int = 3):
     """
-    Generate a verified Chinese idiom for a given situation.
+    Find a verified Chinese idiom for a given situation.
 
     Uses verify_idiom_exists() to confirm idiom validity.
     """
@@ -58,7 +61,7 @@ def find_idiom(situation: str, max_attempts: int = 3):
 1. A Chinese idiom (includes 成語、俗語、諺語),
 written in simplified Chinese characters,
 that conveys the idea of the given situation.
-2. Its literal English translation
+2. Its literal English translationx
 3. Explain idiom in English. Keep explanation to 2-3 concise sentences.
 
 Format:
@@ -78,6 +81,7 @@ Answer:"""
         lines = [line.strip() for line in generated_text.split("\n") if line.strip()]
 
         llm_idiom = lines[0] if lines else generated_text
+        trad_idiom = char_converter.convert(llm_idiom) if char_converter else None
 
         # 2️⃣ Verify idiom using CC-CEDICT + Wiktionary
         if verify_idiom_exists(llm_idiom):
@@ -86,14 +90,27 @@ Answer:"""
             if len(lines) >= 3:
                 translation = lines[1]
                 meaning = " ".join(lines[2:])
-                explanation = f"{pinyin_text}<br><br>{translation}<br><br>{meaning}"
             else:
-                explanation = f"{pinyin_text}<br><br>{' '.join(lines[1:])}"
-
+                translation = ""
+                meaning = " ".join(lines[1:])
+
+            explanation = f"""
+            <div style="line-height: 1.6;">
+                <p style="margin: 0;">
+                    {pinyin_text}
+                </p>
+                <hr style="border: none; border-top: 1px solid #ddd; margin: 8px 0;">
+                <p style="margin: 0;">
+                    <i>{translation}</i><br>
+                    {meaning}
+                </p>
+            </div>
+            """
             EXAMPLE_CACHE[situation] = (llm_idiom, explanation)
-            return llm_idiom, explanation
+            idiom_output = f"{llm_idiom}<br>{trad_idiom}"
+            return idiom_output, explanation
         else:
-            print(f"Attempt {attempt}: '{llm_idiom}' failed verification, retrying...")
+            print(f"Attempt {attempt}: '{idiom_output}' failed verification, retrying...")
 
     # Fallback if no verified idiom found
     fallback_idiom = "未找到成语"
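
A reading note, not part of the commit: find_idiom() guards the conversion with `trad_idiom = char_converter.convert(llm_idiom) if char_converter else None`, yet char_converter is constructed unconditionally at import time, so the fallback never triggers. Below is a hedged sketch of module-level setup that would make the guard meaningful, assuming opencc is treated as an optional dependency (names mirror src/app.py; this is not what the commit does).

    # Sketch only (assumption): guard the OpenCC import so that the
    # `if char_converter else None` check in find_idiom() has an effect.
    try:
        from opencc import OpenCC
        char_converter = OpenCC('s2t')   # simplified -> traditional converter
    except ImportError:                  # opencc not installed
        char_converter = None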
uv.lock CHANGED
@@ -325,6 +325,7 @@ dependencies = [
     { name = "gradio" },
     { name = "numpy" },
     { name = "ollama" },
+    { name = "opencc" },
     { name = "pycccedict" },
     { name = "pypinyin" },
     { name = "sentence-transformers" },
@@ -347,6 +348,7 @@ requires-dist = [
     { name = "gradio", specifier = ">=4.44.1" },
     { name = "numpy", specifier = ">=1.26.0" },
     { name = "ollama", specifier = ">=0.5.3" },
+    { name = "opencc", specifier = ">=1.1.9" },
     { name = "pycccedict", specifier = ">=1.2.0" },
     { name = "pypinyin", specifier = ">=0.55.0" },
     { name = "sentence-transformers", specifier = ">=2.2.2" },
@@ -1093,6 +1095,17 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/be/f6/2091e50b8b6c3e6901f6eab283d5efd66fb71c86ddb1b4d68766c3eeba0f/ollama-0.5.3-py3-none-any.whl", hash = "sha256:a8303b413d99a9043dbf77ebf11ced672396b59bec27e6d5db67c88f01b279d2", size = 13490, upload-time = "2025-08-07T21:44:09.353Z" },
 ]
 
+[[package]]
+name = "opencc"
+version = "1.1.9"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/e6/a2/0e86df1284143c389a3a6e33b159394da34b500a62b2b9c918949a2e6438/opencc-1.1.9.tar.gz", hash = "sha256:8ad72283732951303390fae33a1ceda98ac9b03368a8f2912edc934d74077e4a", size = 3409025, upload-time = "2024-08-08T04:55:32.483Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/8c/58/d1f270e9d329d4f4f7c1963f9700aa9c9d6f0c5042c641005da6369b4c8e/OpenCC-1.1.9-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:48bc3e37942b91a9cf51f525631792f79378e5332bdba9e10c05f6e7fe9036ca", size = 1482516, upload-time = "2024-08-08T05:03:36.889Z" },
+    { url = "https://files.pythonhosted.org/packages/23/f0/ddd3522a142ebb66b30c7d30509de940979c2fb30a9edbf417fdfc37278d/OpenCC-1.1.9-cp312-cp312-manylinux2014_x86_64.whl", hash = "sha256:1c5d1489bdaf9dc2865f0ea30eb565093253e73c1868d9c19554c8a044b545d4", size = 1653167, upload-time = "2024-08-08T04:55:26.395Z" },
+    { url = "https://files.pythonhosted.org/packages/2c/67/fb4fb43c1502fd9f14646211d9643ef67e8123455e176af6668265d2f875/OpenCC-1.1.9-cp312-cp312-win_amd64.whl", hash = "sha256:64f8d22c8505b65e8ee2d6e73241cbc92785d38b3c93885b423d7c4fcd31c679", size = 1756337, upload-time = "2024-08-08T04:58:34.798Z" },
+]
+
 [[package]]
 name = "orjson"
 version = "3.11.3"