Mel Seto committed
Commit 73d6bd0 · unverified · 2 Parent(s): 80dc191 f04a87b

Merge pull request #3 from mel-seto/remove-rag

src/app.py CHANGED
@@ -103,30 +103,11 @@ Answer:"""
 # ======================
 # UI Wrapper
 # ======================
-def update_ui(situation, mode):
-    if mode == "RAG":
-        top_idioms = retrieve_idiom(situation, top_k=2)
-        formatted_idioms = []
-        for idiom_entry in top_idioms:
-            # Split "<Chinese>: <English>" format
-            if ": " in idiom_entry:
-                chinese, english = idiom_entry.split(": ", 1)
-            else:
-                chinese, english = idiom_entry, ""
-            pinyin_text = get_pinyin(chinese)
-            formatted_idioms.append(f"<div class='idiom-entry'><b>{chinese}</b><br>{pinyin_text}<br>{english}</div>")
-
-        # Combine all entries with horizontal separators
-        idiom = "<hr>".join(formatted_idioms)
-        explanation = "Retrieved using embeddings (RAG)."
-    elif mode == "LLM":
-        if USE_MOCK:
-            idiom, explanation = generate_idiom_mock()
-        else:
-            idiom, explanation = generate_idiom(situation)
+def update_ui(situation):
+    if USE_MOCK:
+        idiom, explanation = generate_idiom_mock()
     else:
-        idiom = "Unknown mode"
-        explanation = ""
+        idiom, explanation = generate_idiom(situation)
 
     return (
         f"<div class='idiom-output'>{idiom}</div>",
@@ -148,11 +129,6 @@ def launch_app():
             lines=2,
             placeholder="e.g., When facing a big challenge",
         )
-        mode_dropdown = gr.Dropdown(
-            ["LLM", "RAG"],
-            label="Mode",
-            value="RAG",
-        )
         generate_btn = gr.Button("✨ Find Idiom")
 
         # ✅ Example situations
@@ -174,7 +150,7 @@ def launch_app():
         # pylint: disable=no-member
         generate_btn.click(
             fn=update_ui,
-            inputs=[situation, mode_dropdown],
+            inputs=[situation],
             outputs=[idiom_output, explanation_output],
         )
 
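A minimal sketch of the Gradio wiring that remains after this change, assuming generate_idiom(situation) returns an (idiom, explanation) pair of strings as the diff implies; the generator is stubbed out here and the real app has more components and styling than shown:

import gradio as gr

# Hypothetical stand-in for the real generator in src/app.py.
def generate_idiom(situation):
    return "骑虎难下", "Riding a tiger and unable to dismount: committed to seeing something through."

def update_ui(situation):
    idiom, explanation = generate_idiom(situation)
    return f"<div class='idiom-output'>{idiom}</div>", explanation

with gr.Blocks() as demo:
    situation = gr.Textbox(lines=2, placeholder="e.g., When facing a big challenge")
    generate_btn = gr.Button("✨ Find Idiom")
    idiom_output = gr.HTML()
    explanation_output = gr.Textbox(label="Explanation")
    # Single text input now that the mode dropdown is gone.
    generate_btn.click(fn=update_ui, inputs=[situation], outputs=[idiom_output, explanation_output])

demo.launch()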
src/retrieval/__init__.py DELETED
File without changes
src/retrieval/constants.py DELETED
@@ -1 +0,0 @@
-EMBEDDING_MODEL = "intfloat/multilingual-e5-small"
src/retrieval/embed_corpus.py DELETED
@@ -1,26 +0,0 @@
-"""
-This script needs to be re-run each time EMBEDDING_MODEL is updated.
-"""
-
-import json
-import numpy as np
-from sentence_transformers import SentenceTransformer
-
-from constants import EMBEDDING_MODEL
-
-
-INPUT_FILE = "data/idioms-and-definitions.json"
-EMBED_FILE = "data/idiom_embeddings.npy"
-
-embedder = SentenceTransformer(EMBEDDING_MODEL)
-
-# Load idioms
-with open(INPUT_FILE, "r", encoding="utf-8") as f:
-    corpus = json.load(f)
-
-# Compute embeddings
-embeddings = embedder.encode(corpus, convert_to_tensor=False, show_progress_bar=True)
-
-# Save to disk
-np.save(EMBED_FILE, embeddings)
-
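A small sanity-check sketch for the artifacts the deleted embed_corpus.py produced: the saved .npy matrix is expected to line up row-for-row with the idiom list (same file paths as in the script above; this assumes the files were already generated):

import json
import numpy as np

# Artifacts written by the (now deleted) embed_corpus.py script.
with open("data/idioms-and-definitions.json", "r", encoding="utf-8") as f:
    corpus = json.load(f)
embeddings = np.load("data/idiom_embeddings.npy")

# One embedding row per idiom entry, so list indices and matrix rows stay aligned.
assert embeddings.shape[0] == len(corpus)
print(embeddings.shape)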
src/retrieval/retriever.py DELETED
@@ -1,51 +0,0 @@
-import json
-import numpy as np
-import requests
-from sentence_transformers import SentenceTransformer
-import os
-
-from .constants import EMBEDDING_MODEL
-
-
-# HF Dataset URL for the embeddings
-EMBED_URL = "https://huggingface.co/datasets/chinese-enthusiasts/idiom-embeddings/resolve/main/idiom_embeddings.npy"
-JSON_URL = "https://huggingface.co/datasets/chinese-enthusiasts/idiom-definitions/resolve/main/idioms-and-definitions.json"
-
-# Ensure 'data/' exists
-os.makedirs("data", exist_ok=True)
-EMBED_FILE = "data/idiom_embeddings.npy"
-JSON_FILE = "data/idioms-and-definitions.json"
-
-# Download embeddings if not present
-if not os.path.exists(EMBED_FILE):
-    print("Downloading embeddings...")
-    r = requests.get(EMBED_URL)
-    with open(EMBED_FILE, "wb") as f:
-        f.write(r.content)
-    print("Done.")
-
-# Download idioms JSON if not present
-if not os.path.exists(JSON_FILE):
-    print("Downloading idioms JSON...")
-    r = requests.get(JSON_URL)
-    with open(JSON_FILE, "wb") as f:
-        f.write(r.content)
-    print("Done.")
-
-# Load embeddings
-corpus_embeddings = np.load(EMBED_FILE)
-
-# Load idioms
-with open(JSON_FILE, "r", encoding="utf-8") as f:
-    corpus = json.load(f)
-
-# Initialize embedder
-embedder = SentenceTransformer(EMBEDDING_MODEL)
-
-def retrieve_idiom(situation: str, top_k=5):
-    query_emb = embedder.encode([situation], convert_to_tensor=False)
-    similarities = np.dot(corpus_embeddings, query_emb[0]) / (
-        np.linalg.norm(corpus_embeddings, axis=1) * np.linalg.norm(query_emb[0])
-    )
-    top_idx = np.argsort(similarities)[::-1][:top_k]
-    return [corpus[i] for i in top_idx]
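The core of the deleted retriever is a plain cosine-similarity ranking between the query embedding and the precomputed corpus embeddings. A self-contained toy sketch of that ranking step, with made-up vectors in place of real model output:

import numpy as np

# Toy stand-ins for the real corpus and its precomputed embeddings.
corpus = [
    "一石二鸟: kill two birds with one stone",
    "塞翁失马: a setback may turn out to be a blessing in disguise",
]
corpus_embeddings = np.array([[0.1, 0.9, 0.2], [0.8, 0.1, 0.3]])
query_emb = np.array([0.7, 0.2, 0.4])  # pretend embedding of the user's situation

# Same ranking as the deleted retrieve_idiom(): cosine similarity, then take the top-k entries.
similarities = corpus_embeddings @ query_emb / (
    np.linalg.norm(corpus_embeddings, axis=1) * np.linalg.norm(query_emb)
)
top_idx = np.argsort(similarities)[::-1][:1]
print([corpus[i] for i in top_idx])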