Mel Seto committed on
Commit
456f698
·
1 Parent(s): 5fd5661

Add verification module that verifies LLM results using CC-CEDICT and Wiktionary

Browse files
pytest.ini CHANGED
@@ -1,2 +1,6 @@
1
  [pytest]
2
  pythonpath = src
 
 
 
 
 
1
  [pytest]
2
  pythonpath = src
3
+
4
+ markers =
5
+ integration: mark test as an integration test that uses real data or external APIs
6
+
src/singletons.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from pycccedict.cccedict import CcCedict
2
+
3
+ CC_DICT = CcCedict()
src/utils/utils.py CHANGED
@@ -1,7 +1,29 @@
 
1
  from pypinyin import Style, pinyin
2
 
 
3
 
4
  def get_pinyin(text: str):
5
  """Convert Chinese characters to pinyin with tones."""
6
  py_list = pinyin(text, style=Style.TONE, heteronym=False)
7
  return " ".join([syllable[0] for syllable in py_list])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
  from pypinyin import Style, pinyin
3
 
4
+ from singletons import CC_DICT
5
 
6
def get_pinyin(text: str):
    """Return the tone-marked pinyin of ``text`` as space-separated syllables."""
    # heteronym=False makes pypinyin yield one reading per entry; take it.
    readings = pinyin(text, style=Style.TONE, heteronym=False)
    first_choices = (candidates[0] for candidates in readings)
    return " ".join(first_choices)
10
+
11
+
12
def verify_idiom_exists(idiom: str) -> bool:
    """Verify an idiom first via CC-CEDICT, then via the Wiktionary API.

    Args:
        idiom: The Chinese idiom to look up.

    Returns:
        True if CC-CEDICT has at least one definition for ``idiom`` or the
        Chinese Wiktionary reports an existing page titled ``idiom``.
        False otherwise, including when the Wiktionary request fails or
        returns a payload without page information.
    """
    # Step 1: local CC-CEDICT lookup (no network needed).
    if CC_DICT.get_definitions(idiom):
        return True

    # Step 2: Wiktionary fallback.
    try:
        response = requests.get(
            "https://zh.wiktionary.org/w/api.php",
            params={"action": "query", "titles": idiom, "format": "json"},
            timeout=3,
        )
        # Treat HTTP errors as "unknown" rather than parsing an error page.
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError):
        # Network failure, HTTP error, or non-JSON body: assume unknown.
        return False

    query = data.get("query") if isinstance(data, dict) else None
    pages = query.get("pages") if isinstance(query, dict) else None
    # A missing title is reported under the page id "-1". An absent or empty
    # "pages" mapping (e.g. an API error payload) must not count as a hit.
    return isinstance(pages, dict) and bool(pages) and "-1" not in pages
29
+
{utils → src/verification}/__init__.py RENAMED
File without changes
src/verification/verifier.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional
2
+ from singletons import CC_DICT
3
+ from verification.wiktionary_client import WiktionaryClient
4
+
5
+
6
def verify_idiom_exists(idiom: str, wiktionary_client: Optional[WiktionaryClient] = None) -> bool:
    """Verify that an idiom exists via CC-CEDICT, falling back to Wiktionary.

    Args:
        idiom: The Chinese idiom to look up.
        wiktionary_client: Client used for the fallback lookup. When omitted
            (``None``), a default ``WiktionaryClient`` is created.

    Returns:
        True if CC-CEDICT has definitions for ``idiom``; otherwise whatever
        the client's ``exists(idiom)`` reports.
    """
    # Step 1: local CC-CEDICT lookup.
    if CC_DICT.get_definitions(idiom):
        return True

    # Step 2: Wiktionary fallback. Compare against None explicitly so that an
    # injected client is always honoured, even if it evaluates as falsy.
    if wiktionary_client is None:
        wiktionary_client = WiktionaryClient()
    return wiktionary_client.exists(idiom)
src/verification/wiktionary_client.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+
3
+
4
+ _WIKTIONARY_CACHE = {}
5
+
6
class WiktionaryClient:
    """Simple wrapper for existence checks against the Chinese Wiktionary API.

    Successful lookups (both "exists" and "does not exist") are memoised in
    the module-level ``_WIKTIONARY_CACHE``. Failed requests are never cached,
    so a transient outage cannot permanently mark a term as missing.
    """

    BASE_URL = "https://zh.wiktionary.org/w/api.php"
    HEADERS = {"User-Agent": "MyChineseIdiomApp/1.0 ([email protected])"}

    @staticmethod
    def _page_exists(data) -> bool:
        """Interpret a decoded ``action=query`` JSON payload.

        Returns True only when the payload has a non-empty ``query.pages``
        mapping that does not contain the "-1" entry used for a missing
        title. Any other shape (error payload, empty mapping, non-dict
        JSON) is treated as "not found".
        """
        if not isinstance(data, dict):
            return False
        query = data.get("query")
        if not isinstance(query, dict):
            return False
        pages = query.get("pages")
        if not isinstance(pages, dict) or not pages:
            return False
        return "-1" not in pages

    def exists(self, term: str) -> bool:
        """Return whether ``term`` has a page on the Chinese Wiktionary.

        Args:
            term: Page title to look up.

        Returns:
            The cached answer if ``term`` was resolved before; otherwise the
            result of a live query. Returns False (without caching) when the
            request fails, the server answers with an HTTP error, or the
            body is not valid JSON.
        """
        if term in _WIKTIONARY_CACHE:
            return _WIKTIONARY_CACHE[term]

        try:
            response = requests.get(
                self.BASE_URL,
                params={"action": "query", "titles": term, "format": "json"},
                headers=self.HEADERS,
                timeout=5,
            )
            response.raise_for_status()
            data = response.json()
        except (requests.RequestException, ValueError):
            # Best-effort lookup: report "not found" but do not cache it, so
            # the next call retries once the service is reachable again.
            return False

        found = self._page_exists(data)
        _WIKTIONARY_CACHE[term] = found
        return found
tests/conftest.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ import pytest
2
+ from verification.wiktionary_client import WiktionaryClient
3
+
4
@pytest.fixture
def wiktionary_client():
    """Provide a real (network-backed) WiktionaryClient to integration tests."""
    client = WiktionaryClient()
    return client
tests/test_utils.py CHANGED
@@ -1,6 +1,5 @@
1
  import pytest
2
-
3
- from utils.utils import get_pinyin
4
 
5
 
6
  @pytest.mark.parametrize(
@@ -12,4 +11,4 @@ from utils.utils import get_pinyin
12
  ],
13
  )
14
  def test_get_pinyin_accent(text, expected):
15
- assert get_pinyin(text) == expected
 
1
  import pytest
2
+ from utils import utils
 
3
 
4
 
5
  @pytest.mark.parametrize(
 
11
  ],
12
  )
13
  def test_get_pinyin_accent(text, expected):
14
+ assert utils.get_pinyin(text) == expected
tests/test_verifier_integration.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytest
2
+ from verification.verifier import verify_idiom_exists
3
+ from singletons import CC_DICT
4
+
5
+
6
@pytest.mark.integration
def test_real_idiom_in_cc_cedict():
    """A well-known idiom is accepted by the verifier."""
    # Expected to be present in the bundled CC-CEDICT data.
    found = verify_idiom_exists("山珍海味")
    assert found is True
11
+
12
@pytest.mark.integration
def test_fake_idiom_not_in_cc_cedict():
    """A made-up idiom is rejected by the verifier."""
    found = verify_idiom_exists("不存在的成语")
    assert found is False
17
+
18
@pytest.mark.integration
def test_real_idiom_wiktionary(wiktionary_client):
    """A known term is accepted when the real Wiktionary client is injected."""
    found = verify_idiom_exists("成语", wiktionary_client=wiktionary_client)
    assert found is True
23
+
24
@pytest.mark.integration
def test_fake_idiom_wiktionary(wiktionary_client):
    """A made-up idiom is rejected when the real Wiktionary client is injected."""
    found = verify_idiom_exists("不存在的成语", wiktionary_client=wiktionary_client)
    assert found is False
tests/test_verifier_unit.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytest
2
+ from unittest.mock import MagicMock
3
+ import verification.verifier as verifier_module
4
+ from verification.wiktionary_client import WiktionaryClient
5
+
6
class FakeClient:
    """Test double for the Wiktionary client with a canned ``exists`` answer."""

    def __init__(self, return_value: bool):
        # Nothing has been looked up yet.
        self.called_with = None
        self.return_value = return_value

    def exists(self, idiom: str) -> bool:
        """Record the queried idiom and return the preconfigured answer."""
        self.called_with = idiom
        answer = self.return_value
        return answer
14
+
15
def test_returns_true_if_in_cc_cedict(monkeypatch):
    """A CC-CEDICT hit alone makes verification succeed."""
    idiom = "山珍海味"
    fake_dict = MagicMock()
    fake_dict.get_definitions.return_value = ["dummy definition"]

    # Swap the dictionary the verifier module actually looks up.
    monkeypatch.setattr(verifier_module, "CC_DICT", fake_dict)

    outcome = verifier_module.verify_idiom_exists(idiom)

    assert outcome is True
    fake_dict.get_definitions.assert_called_once_with(idiom)
26
+
27
+
28
def test_returns_false_if_not_in_cc_cedict(monkeypatch):
    """Return False when neither CC-CEDICT nor the Wiktionary client knows the idiom."""
    mock_CC_DICT = MagicMock()
    mock_CC_DICT.get_definitions.return_value = []

    monkeypatch.setattr(verifier_module, "CC_DICT", mock_CC_DICT)

    # Inject a stub client: without it the verifier would build a real
    # WiktionaryClient and this unit test would depend on the network.
    wik_client = FakeClient(False)

    result = verifier_module.verify_idiom_exists("不存在的成语", wik_client)
    assert result is False
    mock_CC_DICT.get_definitions.assert_called_once_with("不存在的成语")
    # The fallback must have been consulted with the same idiom.
    assert wik_client.called_with == "不存在的成语"
38
+
39
+
40
def test_returns_true_with_wiktionary_fallback(monkeypatch):
    """A CC-CEDICT miss is rescued by a Wiktionary client that finds the idiom."""
    empty_dict = MagicMock()
    empty_dict.get_definitions.return_value = []
    monkeypatch.setattr(verifier_module, "CC_DICT", empty_dict)

    stub_client = FakeClient(True)
    outcome = verifier_module.verify_idiom_exists("非数据集成语", stub_client)

    assert outcome is True
utils/utils.py DELETED
@@ -1,7 +0,0 @@
1
- from pypinyin import Style, pinyin
2
-
3
-
4
- def get_pinyin(text: str):
5
- """Convert Chinese characters to pinyin with tones."""
6
- py_list = pinyin(text, style=Style.TONE, heteronym=False)
7
- return " ".join([syllable[0] for syllable in py_list])