fix em highlights
app.py CHANGED
@@ -12,7 +12,9 @@ from huggingface_hub import HfApi
 hf_api = HfApi()
 roots_datasets = {
     dset.id.split("/")[-1]: dset
-    for dset in hf_api.list_datasets(author="bigscience-data", use_auth_token=os.environ.get("bigscience_data_token"))
+    for dset in hf_api.list_datasets(
+        author="bigscience-data", use_auth_token=os.environ.get("bigscience_data_token")
+    )
 }
 
 
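This hunk only re-wraps the dataset-listing comprehension. A minimal standalone sketch of the same pattern; querying the public bigscience-data namespace without a token is an assumption here (the app passes use_auth_token):

    from huggingface_hub import HfApi

    hf_api = HfApi()
    # Map the short dataset name (the part after "bigscience-data/") to its info object.
    roots_datasets = {
        dset.id.split("/")[-1]: dset
        for dset in hf_api.list_datasets(author="bigscience-data")
    }
    print(len(roots_datasets), "datasets indexed")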
@@ -64,7 +66,9 @@ def process_pii(text):
     for tag in PII_TAGS:
         text = text.replace(
             PII_PREFIX + tag,
-            """<b><mark style="background: Fuchsia; color: Lime;">REDACTED {}</mark></b>""".format(tag),
+            """<b><mark style="background: Fuchsia; color: Lime;">REDACTED {}</mark></b>""".format(
+                tag
+            ),
         )
     return text
 
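The replacement markup is unchanged; only the .format(tag) call is re-wrapped. A runnable sketch of the behavior, with hypothetical PII_PREFIX and PII_TAGS values (the app defines its own):

    PII_PREFIX = "my_"                  # hypothetical placeholder prefix
    PII_TAGS = ["EMAIL", "IP_ADDRESS"]  # hypothetical tag set

    def process_pii(text):
        # Swap each PII placeholder for a visible, highlighted REDACTED marker.
        for tag in PII_TAGS:
            text = text.replace(
                PII_PREFIX + tag,
                """<b><mark style="background: Fuchsia; color: Lime;">REDACTED {}</mark></b>""".format(
                    tag
                ),
            )
        return text

    print(process_pii("contact: my_EMAIL"))
    # contact: <b><mark style="background: Fuchsia; color: Lime;">REDACTED EMAIL</mark></b>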
@@ -99,17 +103,11 @@ def format_result(result, highlight_terms, exact_search, datasets_filter=None):
         return ""
 
     if exact_search:
-        tokens = text.split()
-        tokens_html = []
-        for token in tokens:
-            norm_token = normalize(token)
-            if norm_token in highlight_terms:
-                tokens_html.append("<b>{}</b>".format(token))
-            else:
-                tokens_html.append(token)
-        tokens_html = " ".join(tokens_html)
+        query_start = text.find(highlight_terms)
+        query_end = query_start + len(highlight_terms)
+        tokens_html = text[0:query_start]
+        tokens_html += "<b>{}</b>".format(text[query_start:query_end])
+        tokens_html += text[query_end:]
     else:
         tokens = text.split()
         tokens_html = []
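This hunk is the fix the commit title refers to: in exact-search mode, highlight_terms is the full query phrase (a string), so the old per-token <b> wrapping, which only matched normalized whole tokens, is replaced by bolding the literal matched span. A sketch of the new logic as a standalone function; str.find returns -1 when the phrase is absent, so this assumes the phrase occurs in text (exact search should guarantee that upstream):

    def highlight_exact(text, phrase):
        # Bold the first literal occurrence of the query phrase.
        query_start = text.find(phrase)
        query_end = query_start + len(phrase)
        tokens_html = text[0:query_start]
        tokens_html += "<b>{}</b>".format(text[query_start:query_end])
        tokens_html += text[query_end:]
        return tokens_html

    print(highlight_exact("the quick brown fox", "quick brown"))
    # the <b>quick brown</b> fox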
@@ -154,7 +152,9 @@ def format_result(result, highlight_terms, exact_search, datasets_filter=None):
     return "<p>" + result_html + "</p>"
 
 
-def format_result_page(language, results, highlight_terms, num_results, exact_search, datasets_filter=None) -> gr.HTML:
+def format_result_page(
+    language, results, highlight_terms, num_results, exact_search, datasets_filter=None
+) -> gr.HTML:
     filtered_num_results = 0
     header_html = ""
 
@@ -179,7 +179,9 @@ def format_result_page(language, results, highlight_terms, num_results, exact_se
             continue
         results_for_lang_html = ""
         for result in results_for_lang:
-            result_html = format_result(result, highlight_terms, exact_search, datasets_filter)
+            result_html = format_result(
+                result, highlight_terms, exact_search, datasets_filter
+            )
             if result_html != "":
                 filtered_num_results += 1
                 results_for_lang_html += result_html
@@ -221,7 +223,9 @@ def extract_results_from_payload(query, language, payload, exact_search):
         text = result["text"]
         url = (
             result["meta"]["url"]
-            if "meta" in result and result["meta"] is not None and "url" in result["meta"]
+            if "meta" in result
+            and result["meta"] is not None
+            and "url" in result["meta"]
             else None
         )
         docid = result["docid"]
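The reflowed condition tolerates three shapes of result: no meta key, meta set to None, and meta without a url. A quick check of the guard:

    def extract_url(result):
        # Same guard as above: fall back to None whenever meta/url is unusable.
        return (
            result["meta"]["url"]
            if "meta" in result
            and result["meta"] is not None
            and "url" in result["meta"]
            else None
        )

    print(extract_url({"meta": {"url": "https://example.com"}}))  # https://example.com
    print(extract_url({"meta": None}))                            # None
    print(extract_url({}))                                        # None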
@@ -259,7 +263,11 @@ def request_payload(query, language, exact_search, num_results=10, received_resu
     post_data = {"query": query, "k": num_results, "received_results": received_results}
     if language != "detect_language":
         post_data["lang"] = language
-    address = os.environ.get("address_exact_search") if exact_search else os.environ.get("address")
+    address = (
+        os.environ.get("address_exact_search")
+        if exact_search
+        else os.environ.get("address")
+    )
     output = requests.post(
         address,
         headers={"Content-type": "application/json"},
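address now wraps a conditional expression that routes exact-phrase queries to a separate backend endpoint; both addresses live in environment variables (deployment secrets), so os.environ.get yields None when they are unset. A sketch using the same variable names:

    import os

    def backend_address(exact_search):
        # Exact search is served by a dedicated endpoint; everything else
        # goes to the default search backend.
        return (
            os.environ.get("address_exact_search")
            if exact_search
            else os.environ.get("address")
        )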
@@ -270,7 +278,9 @@
     return payload
 
 
-title = """<p style="text-align: center; font-size:28px"> 🌸 🔎 ROOTS search tool 🔎 🌸 </p>"""
+title = (
+    """<p style="text-align: center; font-size:28px"> 🌸 🔎 ROOTS search tool 🔎 🌸 </p>"""
+)
 description = """
 
 The ROOTS corpus was developed during the [BigScience workshop](https://bigscience.huggingface.co/) for the purpose
@@ -389,7 +399,9 @@ if __name__ == "__main__":
             payload,
             exact_search,
         )
-        result_page = format_result_page(lang, processed_results, highlight_terms, num_results, exact_search)
+        result_page = format_result_page(
+            lang, processed_results, highlight_terms, num_results, exact_search
+        )
         return (
             processed_results,
             highlight_terms,
@@ -410,13 +422,19 @@ if __name__ == "__main__":
             datasets,
         ) = run_query(query, lang, k, dropdown_input, 0)
         has_more_results = exact_search and (num_results > k)
-        current_results = len(next(iter(processed_results.values()))) if len(processed_results) > 0 else 0
+        current_results = (
+            len(next(iter(processed_results.values())))
+            if len(processed_results) > 0
+            else 0
+        )
         return [
             processed_results,
             highlight_terms,
             num_results,
             exact_search,
-            gr.update(visible=True) if current_results > 0 else gr.update(visible=False),
+            gr.update(visible=True)
+            if current_results > 0
+            else gr.update(visible=False),
             gr.Dropdown.update(choices=datasets, value=datasets),
             gr.update(visible=has_more_results),
             current_results,
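processed_results appears to be a dict of result lists keyed by language, and a first-page query fills a single bucket, so the count is read from the first value; next(iter(...)) does this without materializing the values. A sketch of the two counting strategies used by the first-page and load-more callbacks:

    def count_first_bucket(processed_results):
        # First page: size of the first (only) language bucket, 0 when empty.
        return (
            len(next(iter(processed_results.values())))
            if len(processed_results) > 0
            else 0
        )

    def count_all_buckets(processed_results):
        # Load-more page: total across all language buckets.
        return sum(len(results) for results in processed_results.values())

    page = {"en": [1, 2, 3]}
    print(count_first_bucket(page), count_all_buckets(page))  # 3 3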
@@ -439,8 +457,12 @@ if __name__ == "__main__":
             result_page,
             datasets,
         ) = run_query(query, lang, k, dropdown_input, received_results)
-        current_results = sum(len(results) for results in processed_results.values())
-        has_more_results = exact_search and (received_results + current_results < num_results)
+        current_results = sum(
+            len(results) for results in processed_results.values()
+        )
+        has_more_results = exact_search and (
+            received_results + current_results < num_results
+        )
         print("received_results", received_results)
         print("current_results", current_results)
         print("has_more_results", has_more_results)
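The recomputed has_more_results keeps the load-more control alive while the total reported by the backend (num_results) still exceeds what has been fetched so far. A toy check of the arithmetic:

    def more_to_fetch(received_results, current_results, num_results, exact_search):
        # Only exact search paginates; compare fetched-so-far to the total hits.
        return exact_search and (received_results + current_results < num_results)

    print(more_to_fetch(10, 10, 35, True))  # True: 20 of 35 fetched
    print(more_to_fetch(30, 5, 35, True))   # False: all 35 fetched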
@@ -449,7 +471,9 @@ if __name__ == "__main__":
             highlight_terms,
             num_results,
             exact_search,
-            gr.update(visible=True) if current_results > 0 else gr.update(visible=False),
+            gr.update(visible=True)
+            if current_results > 0
+            else gr.update(visible=False),
             gr.Dropdown.update(choices=datasets, value=datasets),
             gr.update(visible=current_results >= k and has_more_results),
             received_results + current_results,