Mel Seto
committed on
Commit
·
021749e
1
Parent(s):
c1e01a2
add traditional character idiom
Browse files
- pyproject.toml +1 -0
- src/app.py +27 -10
- uv.lock +13 -0
pyproject.toml
CHANGED
|
@@ -13,6 +13,7 @@ dependencies = [
|
|
| 13 |
"pypinyin>=0.55.0",
|
| 14 |
"sentence-transformers>=2.2.2",
|
| 15 |
"numpy>=1.26.0",
|
|
|
|
| 16 |
]
|
| 17 |
|
| 18 |
[dependency-groups]
|
|
|
|
| 13 |
"pypinyin>=0.55.0",
|
| 14 |
"sentence-transformers>=2.2.2",
|
| 15 |
"numpy>=1.26.0",
|
| 16 |
+
"opencc>=1.1.9",
|
| 17 |
]
|
| 18 |
|
| 19 |
[dependency-groups]
|
src/app.py
CHANGED
|
@@ -1,11 +1,10 @@
|
|
| 1 |
-
import json
|
| 2 |
import os
|
| 3 |
|
| 4 |
import gradio as gr
|
| 5 |
from cerebras.cloud.sdk import Cerebras
|
| 6 |
from dotenv import load_dotenv
|
|
|
|
| 7 |
|
| 8 |
-
from retrieval.retriever import retrieve_idiom
|
| 9 |
from utils.utils import get_pinyin
|
| 10 |
from verification.verifier import verify_idiom_exists
|
| 11 |
|
|
@@ -17,6 +16,8 @@ load_dotenv()
|
|
| 17 |
MODEL = "gpt-oss-120b"
|
| 18 |
USE_MOCK = False # ✅ Toggle between mock and real API
|
| 19 |
|
|
|
|
|
|
|
| 20 |
|
| 21 |
# ======================
|
| 22 |
# Instantiate client (if not mocking)
|
|
@@ -31,9 +32,11 @@ if not USE_MOCK:
|
|
| 31 |
# ======================
|
| 32 |
def find_idiom_mock():
|
| 33 |
idiom = "对症下药"
|
|
|
|
| 34 |
explanation = """duì zhèng xià yào<br><br>
|
| 35 |
To prescribe the right medicine; to take the right approach to a problem."""
|
| 36 |
-
|
|
|
|
| 37 |
|
| 38 |
|
| 39 |
# ======================
|
|
@@ -46,7 +49,7 @@ EXAMPLE_CACHE = {}
|
|
| 46 |
|
| 47 |
def find_idiom(situation: str, max_attempts: int = 3):
|
| 48 |
"""
|
| 49 |
-
|
| 50 |
|
| 51 |
Uses verify_idiom_exists() to confirm idiom validity.
|
| 52 |
"""
|
|
@@ -58,7 +61,7 @@ def find_idiom(situation: str, max_attempts: int = 3):
|
|
| 58 |
1. A Chinese idiom (includes 成語、俗語、諺語),
|
| 59 |
written in simplified Chinese characters,
|
| 60 |
that conveys the idea of the given situation.
|
| 61 |
-
2. Its literal English
|
| 62 |
3. Explain idiom in English. Keep explanation to 2-3 concise sentences.
|
| 63 |
|
| 64 |
Format:
|
|
@@ -78,6 +81,7 @@ Answer:"""
|
|
| 78 |
lines = [line.strip() for line in generated_text.split("\n") if line.strip()]
|
| 79 |
|
| 80 |
llm_idiom = lines[0] if lines else generated_text
|
|
|
|
| 81 |
|
| 82 |
# 2️⃣ Verify idiom using CC-CEDICT + Wiktionary
|
| 83 |
if verify_idiom_exists(llm_idiom):
|
|
@@ -86,14 +90,27 @@ Answer:"""
|
|
| 86 |
if len(lines) >= 3:
|
| 87 |
translation = lines[1]
|
| 88 |
meaning = " ".join(lines[2:])
|
| 89 |
-
explanation = f"{pinyin_text}<br><br>{translation}<br><br>{meaning}"
|
| 90 |
else:
|
| 91 |
-
|
| 92 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
EXAMPLE_CACHE[situation] = (llm_idiom, explanation)
|
| 94 |
-
|
|
|
|
| 95 |
else:
|
| 96 |
-
print(f"Attempt {attempt}: '{
|
| 97 |
|
| 98 |
# Fallback if no verified idiom found
|
| 99 |
fallback_idiom = "未找到成语"
|
|
|
|
|
|
|
| 1 |
import os
|
| 2 |
|
| 3 |
import gradio as gr
|
| 4 |
from cerebras.cloud.sdk import Cerebras
|
| 5 |
from dotenv import load_dotenv
|
| 6 |
+
from opencc import OpenCC
|
| 7 |
|
|
|
|
| 8 |
from utils.utils import get_pinyin
|
| 9 |
from verification.verifier import verify_idiom_exists
|
| 10 |
|
|
|
|
| 16 |
MODEL = "gpt-oss-120b"
|
| 17 |
USE_MOCK = False # ✅ Toggle between mock and real API
|
| 18 |
|
| 19 |
+
# simplified to traditional Chinese character converter
|
| 20 |
+
char_converter = OpenCC('s2t')
|
| 21 |
|
| 22 |
# ======================
|
| 23 |
# Instantiate client (if not mocking)
|
|
|
|
| 32 |
# ======================
|
| 33 |
def find_idiom_mock():
|
| 34 |
idiom = "对症下药"
|
| 35 |
+
trad_idiom = char_converter.convert(idiom)
|
| 36 |
explanation = """duì zhèng xià yào<br><br>
|
| 37 |
To prescribe the right medicine; to take the right approach to a problem."""
|
| 38 |
+
idiom_output = f"{idiom}<br>{trad_idiom}"
|
| 39 |
+
return idiom_output, explanation
|
| 40 |
|
| 41 |
|
| 42 |
# ======================
|
|
|
|
| 49 |
|
| 50 |
def find_idiom(situation: str, max_attempts: int = 3):
|
| 51 |
"""
|
| 52 |
+
Find a verified Chinese idiom for a given situation.
|
| 53 |
|
| 54 |
Uses verify_idiom_exists() to confirm idiom validity.
|
| 55 |
"""
|
|
|
|
| 61 |
1. A Chinese idiom (includes 成語、俗語、諺語),
|
| 62 |
written in simplified Chinese characters,
|
| 63 |
that conveys the idea of the given situation.
|
| 64 |
+
2. Its literal English translationx
|
| 65 |
3. Explain idiom in English. Keep explanation to 2-3 concise sentences.
|
| 66 |
|
| 67 |
Format:
|
|
|
|
| 81 |
lines = [line.strip() for line in generated_text.split("\n") if line.strip()]
|
| 82 |
|
| 83 |
llm_idiom = lines[0] if lines else generated_text
|
| 84 |
+
trad_idiom = char_converter.convert(llm_idiom) if char_converter else None
|
| 85 |
|
| 86 |
# 2️⃣ Verify idiom using CC-CEDICT + Wiktionary
|
| 87 |
if verify_idiom_exists(llm_idiom):
|
|
|
|
| 90 |
if len(lines) >= 3:
|
| 91 |
translation = lines[1]
|
| 92 |
meaning = " ".join(lines[2:])
|
|
|
|
| 93 |
else:
|
| 94 |
+
translation = ""
|
| 95 |
+
meaning = " ".join(lines[1:])
|
| 96 |
+
|
| 97 |
+
explanation = f"""
|
| 98 |
+
<div style="line-height: 1.6;">
|
| 99 |
+
<p style="margin: 0;">
|
| 100 |
+
{pinyin_text}
|
| 101 |
+
</p>
|
| 102 |
+
<hr style="border: none; border-top: 1px solid #ddd; margin: 8px 0;">
|
| 103 |
+
<p style="margin: 0;">
|
| 104 |
+
<i>{translation}</i><br>
|
| 105 |
+
{meaning}
|
| 106 |
+
</p>
|
| 107 |
+
</div>
|
| 108 |
+
"""
|
| 109 |
EXAMPLE_CACHE[situation] = (llm_idiom, explanation)
|
| 110 |
+
idiom_output = f"{llm_idiom}<br>{trad_idiom}"
|
| 111 |
+
return idiom_output, explanation
|
| 112 |
else:
|
| 113 |
+
print(f"Attempt {attempt}: '{idiom_output}' failed verification, retrying...")
|
| 114 |
|
| 115 |
# Fallback if no verified idiom found
|
| 116 |
fallback_idiom = "未找到成语"
|
uv.lock
CHANGED
|
@@ -325,6 +325,7 @@ dependencies = [
|
|
| 325 |
{ name = "gradio" },
|
| 326 |
{ name = "numpy" },
|
| 327 |
{ name = "ollama" },
|
|
|
|
| 328 |
{ name = "pycccedict" },
|
| 329 |
{ name = "pypinyin" },
|
| 330 |
{ name = "sentence-transformers" },
|
|
@@ -347,6 +348,7 @@ requires-dist = [
|
|
| 347 |
{ name = "gradio", specifier = ">=4.44.1" },
|
| 348 |
{ name = "numpy", specifier = ">=1.26.0" },
|
| 349 |
{ name = "ollama", specifier = ">=0.5.3" },
|
|
|
|
| 350 |
{ name = "pycccedict", specifier = ">=1.2.0" },
|
| 351 |
{ name = "pypinyin", specifier = ">=0.55.0" },
|
| 352 |
{ name = "sentence-transformers", specifier = ">=2.2.2" },
|
|
@@ -1093,6 +1095,17 @@ wheels = [
|
|
| 1093 |
{ url = "https://files.pythonhosted.org/packages/be/f6/2091e50b8b6c3e6901f6eab283d5efd66fb71c86ddb1b4d68766c3eeba0f/ollama-0.5.3-py3-none-any.whl", hash = "sha256:a8303b413d99a9043dbf77ebf11ced672396b59bec27e6d5db67c88f01b279d2", size = 13490, upload-time = "2025-08-07T21:44:09.353Z" },
|
| 1094 |
]
|
| 1095 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1096 |
[[package]]
|
| 1097 |
name = "orjson"
|
| 1098 |
version = "3.11.3"
|
|
|
|
| 325 |
{ name = "gradio" },
|
| 326 |
{ name = "numpy" },
|
| 327 |
{ name = "ollama" },
|
| 328 |
+
{ name = "opencc" },
|
| 329 |
{ name = "pycccedict" },
|
| 330 |
{ name = "pypinyin" },
|
| 331 |
{ name = "sentence-transformers" },
|
|
|
|
| 348 |
{ name = "gradio", specifier = ">=4.44.1" },
|
| 349 |
{ name = "numpy", specifier = ">=1.26.0" },
|
| 350 |
{ name = "ollama", specifier = ">=0.5.3" },
|
| 351 |
+
{ name = "opencc", specifier = ">=1.1.9" },
|
| 352 |
{ name = "pycccedict", specifier = ">=1.2.0" },
|
| 353 |
{ name = "pypinyin", specifier = ">=0.55.0" },
|
| 354 |
{ name = "sentence-transformers", specifier = ">=2.2.2" },
|
|
|
|
| 1095 |
{ url = "https://files.pythonhosted.org/packages/be/f6/2091e50b8b6c3e6901f6eab283d5efd66fb71c86ddb1b4d68766c3eeba0f/ollama-0.5.3-py3-none-any.whl", hash = "sha256:a8303b413d99a9043dbf77ebf11ced672396b59bec27e6d5db67c88f01b279d2", size = 13490, upload-time = "2025-08-07T21:44:09.353Z" },
|
| 1096 |
]
|
| 1097 |
|
| 1098 |
+
[[package]]
|
| 1099 |
+
name = "opencc"
|
| 1100 |
+
version = "1.1.9"
|
| 1101 |
+
source = { registry = "https://pypi.org/simple" }
|
| 1102 |
+
sdist = { url = "https://files.pythonhosted.org/packages/e6/a2/0e86df1284143c389a3a6e33b159394da34b500a62b2b9c918949a2e6438/opencc-1.1.9.tar.gz", hash = "sha256:8ad72283732951303390fae33a1ceda98ac9b03368a8f2912edc934d74077e4a", size = 3409025, upload-time = "2024-08-08T04:55:32.483Z" }
|
| 1103 |
+
wheels = [
|
| 1104 |
+
{ url = "https://files.pythonhosted.org/packages/8c/58/d1f270e9d329d4f4f7c1963f9700aa9c9d6f0c5042c641005da6369b4c8e/OpenCC-1.1.9-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:48bc3e37942b91a9cf51f525631792f79378e5332bdba9e10c05f6e7fe9036ca", size = 1482516, upload-time = "2024-08-08T05:03:36.889Z" },
|
| 1105 |
+
{ url = "https://files.pythonhosted.org/packages/23/f0/ddd3522a142ebb66b30c7d30509de940979c2fb30a9edbf417fdfc37278d/OpenCC-1.1.9-cp312-cp312-manylinux2014_x86_64.whl", hash = "sha256:1c5d1489bdaf9dc2865f0ea30eb565093253e73c1868d9c19554c8a044b545d4", size = 1653167, upload-time = "2024-08-08T04:55:26.395Z" },
|
| 1106 |
+
{ url = "https://files.pythonhosted.org/packages/2c/67/fb4fb43c1502fd9f14646211d9643ef67e8123455e176af6668265d2f875/OpenCC-1.1.9-cp312-cp312-win_amd64.whl", hash = "sha256:64f8d22c8505b65e8ee2d6e73241cbc92785d38b3c93885b423d7c4fcd31c679", size = 1756337, upload-time = "2024-08-08T04:58:34.798Z" },
|
| 1107 |
+
]
|
| 1108 |
+
|
| 1109 |
[[package]]
|
| 1110 |
name = "orjson"
|
| 1111 |
version = "3.11.3"
|