guipenedo HF Staff committed on
Commit
24d5c2f
·
1 Parent(s): 62a0a98

clean up design

Browse files
Files changed (1) hide show
  1. app.py +69 -125
app.py CHANGED
@@ -1,4 +1,5 @@
1
  import gradio as gr
 
2
  from datatrove.pipeline.readers import ParquetReader
3
  from default_wiki_pipeline import _parse_and_clean_wikicode, mwparserfromhell
4
 
@@ -23,7 +24,15 @@ def _build_header_markdown(doc) -> str:
23
  header += f"\n\n[{url}]({url})"
24
  return header
25
 
26
- def matches_filters(doc, require_has_math: bool | None, require_has_infobox: bool | None) -> bool:
 
 
 
 
 
 
 
 
27
  meta = doc.metadata or {}
28
  if require_has_math and not bool(meta.get("has_math")):
29
  return False
@@ -32,22 +41,37 @@ def matches_filters(doc, require_has_math: bool | None, require_has_infobox: boo
32
  return True
33
 
34
 
35
- def find_next_matching_from(docs_cache, reader_iter, start_idx: int, require_has_math: bool | None, require_has_infobox: bool | None):
36
- # Scan cache first
 
 
 
 
 
 
 
 
37
  i = max(-1, start_idx)
38
  while i + 1 < len(docs_cache):
39
  i += 1
40
- if matches_filters(docs_cache[i], require_has_math, require_has_infobox):
41
- return i, docs_cache, reader_iter
 
 
 
42
  # Stream until found or exhausted
43
  while True:
44
  prev_len = len(docs_cache)
45
  docs_cache, reader_iter = _ensure_until_index(docs_cache, reader_iter, prev_len)
46
  if len(docs_cache) == prev_len:
47
  break
48
- if matches_filters(docs_cache[-1], require_has_math, require_has_infobox):
49
- return len(docs_cache) - 1, docs_cache, reader_iter
50
- return -1, docs_cache, reader_iter
 
 
 
 
51
 
52
  def render_iframe(url: str, height: int = 800) -> str:
53
  safe_url = url or "about:blank"
@@ -87,7 +111,7 @@ def _ensure_until_index(docs_cache, reader_iter, target_idx: int):
87
  return docs_cache, reader_iter
88
 
89
 
90
- def on_select_language(lang: str, require_has_math: bool, require_has_infobox: bool):
91
  """Load documents for the selected language from HF Parquet and display."""
92
  language = (lang or "").strip()
93
  if not language:
@@ -101,81 +125,16 @@ def on_select_language(lang: str, require_has_math: bool, require_has_infobox: b
101
  docs_cache, reader_iter = _ensure_until_index(docs_cache, reader_iter, 0)
102
  if not docs_cache:
103
  return (-1, [], reader_iter, "No documents found.", {}, "", [], render_iframe(""))
104
- # Find first doc matching filters (starting before 0)
105
- idx, docs_cache, reader_iter = find_next_matching_from(docs_cache, reader_iter, -1, require_has_math, require_has_infobox)
 
 
106
  if idx == -1:
107
- return (-1, docs_cache, reader_iter, "No documents match filters.", {}, "", [], render_iframe(""))
108
- left, left_meta, md, info, right, header = render_idx(docs_cache, idx)
109
- return (idx, docs_cache, reader_iter, left, left, left_meta, header, md, info, right)
110
 
111
 
112
- def on_find(docs_cache, idx: int, reader_iter, id_query: str, require_has_math: bool, require_has_infobox: bool):
113
- query = (id_query or "").strip()
114
- if not docs_cache and reader_iter is None:
115
- return -1, docs_cache, reader_iter, "No documents loaded.", {}, "", [], render_iframe("")
116
- if not query:
117
- docs_cache, reader_iter = _ensure_until_index(docs_cache, reader_iter, 0)
118
- new_idx, docs_cache, reader_iter = find_next_matching_from(docs_cache, reader_iter, -1, require_has_math, require_has_infobox)
119
- if new_idx == -1:
120
- return (-1, docs_cache, reader_iter, "No documents match filters.", {}, "", [], render_iframe(""))
121
- left, left_meta, md, info, right, header = render_idx(docs_cache, new_idx)
122
- return new_idx, docs_cache, reader_iter, left, left_meta, header, md, info, right
123
- # Exact match in cache
124
- for i, doc in enumerate(docs_cache):
125
- meta = (getattr(doc, "metadata", None) or {})
126
- doc_id = (getattr(doc, "id", None) or "")
127
- url = meta.get("url") or ""
128
- if doc_id == query or meta.get("wikidata_id") == query or url == query:
129
- left, left_meta, md, info, right, header = render_idx(docs_cache, i)
130
- if matches_filters(doc, require_has_math, require_has_infobox):
131
- return i, docs_cache, reader_iter, left, left_meta, header, md, info, right
132
- new_idx, docs_cache, reader_iter = find_next_matching_from(docs_cache, reader_iter, i, require_has_math, require_has_infobox)
133
- if new_idx == -1:
134
- return (-1, docs_cache, reader_iter, "No documents match filters.", {}, "", [], render_iframe(""))
135
- left, left_meta, md, info, right, header = render_idx(docs_cache, new_idx)
136
- return new_idx, docs_cache, reader_iter, left, left_meta, header, md, info, right
137
- # Suffix match in cache
138
- for i, doc in enumerate(docs_cache):
139
- doc_id = (getattr(doc, "id", None) or "")
140
- meta = (getattr(doc, "metadata", None) or {})
141
- url = meta.get("url") or ""
142
- if doc_id.endswith(f"/{query}") or url.endswith(query):
143
- left, left_meta, md, info, right, header = render_idx(docs_cache, i)
144
- if matches_filters(doc, require_has_math, require_has_infobox):
145
- return i, docs_cache, reader_iter, left, left_meta, header, md, info, right
146
- new_idx, docs_cache, reader_iter = find_next_matching_from(docs_cache, reader_iter, i, require_has_math, require_has_infobox)
147
- if new_idx == -1:
148
- return (-1, docs_cache, reader_iter, "No documents match filters.", {}, "", [], render_iframe(""))
149
- left, left_meta, md, info, right, header = render_idx(docs_cache, new_idx)
150
- return new_idx, docs_cache, reader_iter, left, left_meta, header, md, info, right
151
- # Stream forward until found or exhausted
152
- found_idx = None
153
- while True:
154
- prev_len = len(docs_cache)
155
- docs_cache, reader_iter = _ensure_until_index(docs_cache, reader_iter, prev_len)
156
- if len(docs_cache) == prev_len:
157
- break
158
- doc = docs_cache[-1]
159
- meta = (getattr(doc, "metadata", None) or {})
160
- doc_id = (getattr(doc, "id", None) or "")
161
- url = meta.get("url") or ""
162
- if doc_id == query or meta.get("wikidata_id") == query or url.endswith(query) or url == query or doc_id.endswith(f"/{query}"):
163
- found_idx = len(docs_cache) - 1
164
- break
165
- if found_idx is not None:
166
- new_idx, docs_cache, reader_iter = find_next_matching_from(docs_cache, reader_iter, found_idx - 1, require_has_math, require_has_infobox)
167
- if new_idx == -1:
168
- return (-1, docs_cache, reader_iter, "No documents match filters.", {}, "", [], render_iframe(""))
169
- left, left_meta, md, info, right, header = render_idx(docs_cache, new_idx)
170
- return new_idx, docs_cache, reader_iter, left, left_meta, header, md, info, right
171
- target_idx = 0 if docs_cache else -1
172
- if target_idx == -1:
173
- return -1, docs_cache, reader_iter, "No documents found.", {}, "", [], render_iframe("")
174
- new_idx, docs_cache, reader_iter = find_next_matching_from(docs_cache, reader_iter, target_idx - 1, require_has_math, require_has_infobox)
175
- if new_idx == -1:
176
- return (-1, docs_cache, reader_iter, "No documents match filters.", {}, "", [], render_iframe(""))
177
- left, left_meta, md, info, right, header = render_idx(docs_cache, new_idx)
178
- return new_idx, docs_cache, reader_iter, left, left_meta, header, md, info, right
179
 
180
 
181
  def show_doc(doc):
@@ -199,49 +158,38 @@ def render_idx(docs, idx: int):
199
  return left, left_meta, md, info, right, header
200
 
201
 
202
- def on_prev(docs_cache, idx: int, reader_iter, require_has_math: bool, require_has_infobox: bool):
203
  if not docs_cache:
204
  # Try to ensure at least first doc is loaded
205
  docs_cache, reader_iter = _ensure_until_index(docs_cache, reader_iter, 0)
206
  if not docs_cache:
207
  return idx, docs_cache, reader_iter, "No documents.", {}, "", [], render_iframe("")
208
  new_idx = max(0, idx - 1)
209
- # Apply filters going backwards by scanning from start to new_idx
210
- filtered_idx = new_idx
211
- if new_idx >= 0:
212
- for i in range(new_idx, -1, -1):
213
- if matches_filters(docs_cache[i], require_has_math, require_has_infobox):
214
- filtered_idx = i
215
- break
216
- left, left_meta, md, info, right, header = render_idx(docs_cache, filtered_idx)
217
- return filtered_idx, docs_cache, reader_iter, left, left, left_meta, header, md, info, right
218
-
219
-
220
- def on_next(docs_cache, idx: int, reader_iter, require_has_math: bool, require_has_infobox: bool):
 
221
  target_idx = idx + 1 if idx >= 0 else 0
222
  docs_cache, reader_iter = _ensure_until_index(docs_cache, reader_iter, target_idx)
223
  if not docs_cache:
224
  return idx, docs_cache, reader_iter, "No documents.", {}, "", [], render_iframe("")
225
- new_idx = min(len(docs_cache) - 1, target_idx)
226
- # Apply filters forward
227
- new_idx, docs_cache, reader_iter = find_next_matching_from(docs_cache, reader_iter, idx, require_has_math, require_has_infobox)
 
228
  if new_idx == -1:
229
- return idx, docs_cache, reader_iter, "No documents match filters.", {}, "", [], render_iframe("")
230
- left, left_meta, md, info, right, header = render_idx(docs_cache, new_idx)
231
- return new_idx, docs_cache, reader_iter, left, left, left_meta, header, md, info, right
232
-
233
-
234
- SCROLL_TO_TOP_JS = """
235
- () => {
236
- const ids = ["left_text_box", "right_markdown_box"];
237
- for (const id of ids) {
238
- const root = document.getElementById(id);
239
- if (!root) continue;
240
- const ta = root.querySelector('textarea');
241
- if (ta) ta.scrollTop = 0;
242
- }
243
- }
244
- """
245
 
246
  with gr.Blocks() as demo:
247
  idx_state = gr.State(value=-1, time_to_live=900)
@@ -264,6 +212,8 @@ with gr.Blocks() as demo:
264
  with gr.Column(scale=1):
265
  require_has_math = gr.Checkbox(label="Has math", value=False)
266
  require_has_infobox = gr.Checkbox(label="Has infobox", value=False)
 
 
267
  with gr.Row():
268
  with gr.Column():
269
  with gr.Tab("FineWiki raw"):
@@ -285,21 +235,15 @@ with gr.Blocks() as demo:
285
  right_markdown = gr.Textbox(label="wikimedia/wikipedia extraction", lines=30, elem_id="right_markdown_box")
286
 
287
 
288
- _ev1 = language_select.change(on_select_language, inputs=[language_select, require_has_math, require_has_infobox], outputs=[idx_state, docs_state, iter_state, left_text_raw, left_text_md, left_meta, header_md, right_markdown, right_infoboxes, right_iframe], concurrency_limit=1)
289
- _ev1.then(js=SCROLL_TO_TOP_JS)
290
- _ev2 = demo.load(on_select_language, inputs=[language_select, require_has_math, require_has_infobox], outputs=[idx_state, docs_state, iter_state, left_text_raw, left_text_md, left_meta, header_md, right_markdown, right_infoboxes, right_iframe], concurrency_limit=1)
291
- _ev2.then(js=SCROLL_TO_TOP_JS)
292
  # find_btn.click(on_find, inputs=[docs_state, idx_state, iter_state, id_input, require_has_math, require_has_infobox], outputs=[idx_state, docs_state, iter_state, left_text, left_meta, header_md, right_markdown, right_infoboxes, right_iframe], concurrency_limit=1)
293
 
294
  # Visibility toggles driven directly by checkbox changes
295
- _ev4 = prev_btn.click(on_prev, inputs=[docs_state, idx_state, iter_state, require_has_math, require_has_infobox], outputs=[idx_state, docs_state, iter_state, left_text_raw, left_text_md, left_meta, header_md, right_markdown, right_infoboxes, right_iframe], concurrency_limit=1)
296
- _ev4.then(js=SCROLL_TO_TOP_JS)
297
- _ev5 = next_btn.click(on_next, inputs=[docs_state, idx_state, iter_state, require_has_math, require_has_infobox], outputs=[idx_state, docs_state, iter_state, left_text_raw, left_text_md, left_meta, header_md, right_markdown, right_infoboxes, right_iframe], concurrency_limit=1)
298
- _ev5.then(js=SCROLL_TO_TOP_JS)
299
- _ev4 = prev_btn2.click(on_prev, inputs=[docs_state, idx_state, iter_state, require_has_math, require_has_infobox], outputs=[idx_state, docs_state, iter_state, left_text_raw, left_text_md, left_meta, header_md, right_markdown, right_infoboxes, right_iframe], concurrency_limit=1)
300
- _ev4.then(js=SCROLL_TO_TOP_JS)
301
- _ev5 = next_btn2.click(on_next, inputs=[docs_state, idx_state, iter_state, require_has_math, require_has_infobox], outputs=[idx_state, docs_state, iter_state, left_text_raw, left_text_md, left_meta, header_md, right_markdown, right_infoboxes, right_iframe], concurrency_limit=1)
302
- _ev5.then(js=SCROLL_TO_TOP_JS)
303
 
304
  # Enable global queue to coordinate concurrent requests safely
305
  demo.queue(default_concurrency_limit=1, max_size=128)
 
1
  import gradio as gr
2
+ import re
3
  from datatrove.pipeline.readers import ParquetReader
4
  from default_wiki_pipeline import _parse_and_clean_wikicode, mwparserfromhell
5
 
 
24
  header += f"\n\n[{url}]({url})"
25
  return header
26
 
27
+ def has_markdown_table(md_text: str) -> bool:
28
+ return bool(re.search(r"(?m)^\s*\|.+\|\s*\n\s*\|?\s*:?-{3,}.*$", md_text or ""))
29
+
30
+
31
+ def has_code_fence(md_text: str) -> bool:
32
+ return "```" in (md_text or "")
33
+
34
+
35
+ def matches_prefilters(doc, require_has_math: bool | None, require_has_infobox: bool | None) -> bool:
36
  meta = doc.metadata or {}
37
  if require_has_math and not bool(meta.get("has_math")):
38
  return False
 
41
  return True
42
 
43
 
44
+ def postfilters_ok(md_text: str, require_has_table: bool | None, require_has_code: bool | None) -> bool:
45
+ if require_has_table and not has_markdown_table(md_text):
46
+ return False
47
+ if require_has_code and not has_code_fence(md_text):
48
+ return False
49
+ return True
50
+
51
+
52
+ def find_next_valid(docs_cache, reader_iter, start_idx: int, require_has_math: bool | None, require_has_infobox: bool | None, require_has_table: bool | None, require_has_code: bool | None):
53
+ # Scan cache first (forward from start_idx)
54
  i = max(-1, start_idx)
55
  while i + 1 < len(docs_cache):
56
  i += 1
57
+ if not matches_prefilters(docs_cache[i], require_has_math, require_has_infobox):
58
+ continue
59
+ left, left_meta, md, info, right, header = render_idx(docs_cache, i)
60
+ if postfilters_ok(left, require_has_table, require_has_code):
61
+ return i, docs_cache, reader_iter, left, md, left_meta, header, md, info, right
62
  # Stream until found or exhausted
63
  while True:
64
  prev_len = len(docs_cache)
65
  docs_cache, reader_iter = _ensure_until_index(docs_cache, reader_iter, prev_len)
66
  if len(docs_cache) == prev_len:
67
  break
68
+ if not matches_prefilters(docs_cache[-1], require_has_math, require_has_infobox):
69
+ continue
70
+ left, left_meta, md, info, right, header = render_idx(docs_cache, len(docs_cache) - 1)
71
+ if postfilters_ok(left, require_has_table, require_has_code):
72
+ idx = len(docs_cache) - 1
73
+ return idx, docs_cache, reader_iter, left, md, left_meta, header, md, info, right
74
+ return -1, docs_cache, reader_iter, "No documents match filters.", "", {}, "", "", [], render_iframe("")
75
 
76
  def render_iframe(url: str, height: int = 800) -> str:
77
  safe_url = url or "about:blank"
 
111
  return docs_cache, reader_iter
112
 
113
 
114
+ def on_select_language(lang: str, require_has_math: bool, require_has_infobox: bool, require_has_table: bool, require_has_code: bool):
115
  """Load documents for the selected language from HF Parquet and display."""
116
  language = (lang or "").strip()
117
  if not language:
 
125
  docs_cache, reader_iter = _ensure_until_index(docs_cache, reader_iter, 0)
126
  if not docs_cache:
127
  return (-1, [], reader_iter, "No documents found.", {}, "", [], render_iframe(""))
128
+ # Find first doc matching pre- and post-filters
129
+ idx, docs_cache, reader_iter, left, md, left_meta, header, right_md, info, right = find_next_valid(
130
+ docs_cache, reader_iter, -1, require_has_math, require_has_infobox, require_has_table, require_has_code
131
+ )
132
  if idx == -1:
133
+ return (-1, docs_cache, reader_iter, "No documents match filters.", "", {}, "", "", [], render_iframe(""))
134
+ return (idx, docs_cache, reader_iter, left, md, left_meta, header, right_md, info, right)
 
135
 
136
 
137
+ # on_find removed per user request
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
 
139
 
140
  def show_doc(doc):
 
158
  return left, left_meta, md, info, right, header
159
 
160
 
161
+ def on_prev(docs_cache, idx: int, reader_iter, require_has_math: bool, require_has_infobox: bool, require_has_table: bool, require_has_code: bool):
162
  if not docs_cache:
163
  # Try to ensure at least first doc is loaded
164
  docs_cache, reader_iter = _ensure_until_index(docs_cache, reader_iter, 0)
165
  if not docs_cache:
166
  return idx, docs_cache, reader_iter, "No documents.", {}, "", [], render_iframe("")
167
  new_idx = max(0, idx - 1)
168
+ # Apply prefilters going backwards by scanning from start to new_idx; evaluate postfilters on candidate
169
+ filtered_idx = -1
170
+ for i in range(new_idx, -1, -1):
171
+ if not matches_prefilters(docs_cache[i], require_has_math, require_has_infobox):
172
+ continue
173
+ left, left_meta, md, info, right, header = render_idx(docs_cache, i)
174
+ if postfilters_ok(md, require_has_table, require_has_code):
175
+ filtered_idx = i
176
+ return filtered_idx, docs_cache, reader_iter, left, md, left_meta, header, md, info, right
177
+ return idx, docs_cache, reader_iter, "No documents match filters.", "", {}, "", "", [], render_iframe("")
178
+
179
+
180
+ def on_next(docs_cache, idx: int, reader_iter, require_has_math: bool, require_has_infobox: bool, require_has_table: bool, require_has_code: bool):
181
  target_idx = idx + 1 if idx >= 0 else 0
182
  docs_cache, reader_iter = _ensure_until_index(docs_cache, reader_iter, target_idx)
183
  if not docs_cache:
184
  return idx, docs_cache, reader_iter, "No documents.", {}, "", [], render_iframe("")
185
+ # Apply filters forward using new finder
186
+ new_idx, docs_cache, reader_iter, left, md, left_meta, header, right_md, info, right = find_next_valid(
187
+ docs_cache, reader_iter, idx, require_has_math, require_has_infobox, require_has_table, require_has_code
188
+ )
189
  if new_idx == -1:
190
+ return idx, docs_cache, reader_iter, "No documents match filters.", "", {}, "", "", [], render_iframe("")
191
+ return new_idx, docs_cache, reader_iter, left, md, left_meta, header, right_md, info, right
192
+
 
 
 
 
 
 
 
 
 
 
 
 
 
193
 
194
  with gr.Blocks() as demo:
195
  idx_state = gr.State(value=-1, time_to_live=900)
 
212
  with gr.Column(scale=1):
213
  require_has_math = gr.Checkbox(label="Has math", value=False)
214
  require_has_infobox = gr.Checkbox(label="Has infobox", value=False)
215
+ require_has_table = gr.Checkbox(label="Has table", value=False)
216
+ require_has_code = gr.Checkbox(label="Has pre/code", value=False)
217
  with gr.Row():
218
  with gr.Column():
219
  with gr.Tab("FineWiki raw"):
 
235
  right_markdown = gr.Textbox(label="wikimedia/wikipedia extraction", lines=30, elem_id="right_markdown_box")
236
 
237
 
238
+ language_select.change(on_select_language, inputs=[language_select, require_has_math, require_has_infobox, require_has_table, require_has_code], outputs=[idx_state, docs_state, iter_state, left_text_raw, left_text_md, left_meta, header_md, right_markdown, right_infoboxes, right_iframe], concurrency_limit=1)
239
+ demo.load(on_select_language, inputs=[language_select, require_has_math, require_has_infobox, require_has_table, require_has_code], outputs=[idx_state, docs_state, iter_state, left_text_raw, left_text_md, left_meta, header_md, right_markdown, right_infoboxes, right_iframe], concurrency_limit=1)
 
 
240
  # find_btn.click(on_find, inputs=[docs_state, idx_state, iter_state, id_input, require_has_math, require_has_infobox], outputs=[idx_state, docs_state, iter_state, left_text, left_meta, header_md, right_markdown, right_infoboxes, right_iframe], concurrency_limit=1)
241
 
242
  # Visibility toggles driven directly by checkbox changes
243
+ prev_btn.click(on_prev, inputs=[docs_state, idx_state, iter_state, require_has_math, require_has_infobox, require_has_table, require_has_code], outputs=[idx_state, docs_state, iter_state, left_text_raw, left_text_md, left_meta, header_md, right_markdown, right_infoboxes, right_iframe], concurrency_limit=1)
244
+ next_btn.click(on_next, inputs=[docs_state, idx_state, iter_state, require_has_math, require_has_infobox, require_has_table, require_has_code], outputs=[idx_state, docs_state, iter_state, left_text_raw, left_text_md, left_meta, header_md, right_markdown, right_infoboxes, right_iframe], concurrency_limit=1)
245
+ prev_btn2.click(on_prev, inputs=[docs_state, idx_state, iter_state, require_has_math, require_has_infobox, require_has_table, require_has_code], outputs=[idx_state, docs_state, iter_state, left_text_raw, left_text_md, left_meta, header_md, right_markdown, right_infoboxes, right_iframe], concurrency_limit=1)
246
+ next_btn2.click(on_next, inputs=[docs_state, idx_state, iter_state, require_has_math, require_has_infobox, require_has_table, require_has_code], outputs=[idx_state, docs_state, iter_state, left_text_raw, left_text_md, left_meta, header_md, right_markdown, right_infoboxes, right_iframe], concurrency_limit=1)
 
 
 
 
247
 
248
  # Enable global queue to coordinate concurrent requests safely
249
  demo.queue(default_concurrency_limit=1, max_size=128)