Spaces:
Runtime error
Runtime error
readd exact
Browse files
app.py
CHANGED
|
@@ -71,6 +71,23 @@ def extract_lang_from_docid(docid):
|
|
| 71 |
return docid.split("_")[1]
|
| 72 |
|
| 73 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
def format_result(result, highlight_terms, exact_search, datasets_filter=None):
|
| 75 |
text, url, docid = result
|
| 76 |
if datasets_filter is not None:
|
|
@@ -80,11 +97,17 @@ def format_result(result, highlight_terms, exact_search, datasets_filter=None):
|
|
| 80 |
return ""
|
| 81 |
|
| 82 |
if exact_search:
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
tokens_html
|
| 87 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
else:
|
| 89 |
tokens = text.split()
|
| 90 |
tokens_html = []
|
|
@@ -247,8 +270,6 @@ def request_payload(query, language, exact_search, num_results=10, received_resu
|
|
| 247 |
|
| 248 |
title = """<p style="text-align: center; font-size:28px"> 🌸 🔎 ROOTS search tool 🔎 🌸 </p>"""
|
| 249 |
description = """
|
| 250 |
-
## We're running maintenance works on the exact search index, so it may not work properly until the end of the day, Thursday 30th of March.
|
| 251 |
-
|
| 252 |
|
| 253 |
The ROOTS corpus was developed during the [BigScience workshop](https://bigscience.huggingface.co/) for the purpose
|
| 254 |
of training the Multilingual Large Language Model [BLOOM](https://huggingface.co/bigscience/bloom). The ROOTS Search
|
|
|
|
| 71 |
return docid.split("_")[1]
|
| 72 |
|
| 73 |
|
| 74 |
+
# Built once at import time: normalize() is called for every token of every
# search result, so per-call construction of these objects is wasted work.
_PUNCTUATION_DELETE_TABLE = str.maketrans("", "", string.punctuation)
_ARTICLES_PATTERN = re.compile(r"\b(a|an|the)\b")


def normalize(document):
    """Return a canonical form of *document* for term matching.

    The steps are applied in this order:

    1. lowercase the text;
    2. delete every character listed in ``string.punctuation``;
    3. replace the standalone words ``a``, ``an`` and ``the`` with a space;
    4. collapse all whitespace runs to single spaces and strip both ends.

    Args:
        document: The string to normalize (a whole query or a single token).

    Returns:
        The normalized string. It may be empty, e.g. when the input consists
        only of punctuation, whitespace or articles.
    """
    lowered = document.lower()
    # Punctuation is removed *before* the article pass, so "the," is still
    # recognised as an article once the comma is gone.
    without_punctuation = lowered.translate(_PUNCTUATION_DELETE_TABLE)
    without_articles = _ARTICLES_PATTERN.sub(" ", without_punctuation)
    # split() with no argument drops leading/trailing whitespace and treats
    # any run of whitespace as one separator.
    return " ".join(without_articles.split())
|
| 89 |
+
|
| 90 |
+
|
| 91 |
def format_result(result, highlight_terms, exact_search, datasets_filter=None):
|
| 92 |
text, url, docid = result
|
| 93 |
if datasets_filter is not None:
|
|
|
|
| 97 |
return ""
|
| 98 |
|
| 99 |
if exact_search:
|
| 100 |
+
highlight_terms = normalize(highlight_terms).split()
|
| 101 |
+
print("highlight_terms", highlight_terms)
|
| 102 |
+
tokens = text.split()
|
| 103 |
+
tokens_html = []
|
| 104 |
+
for token in tokens:
|
| 105 |
+
norm_token = normalize(token)
|
| 106 |
+
if norm_token in highlight_terms:
|
| 107 |
+
tokens_html.append("<b>{}</b>".format(token))
|
| 108 |
+
else:
|
| 109 |
+
tokens_html.append(token)
|
| 110 |
+
tokens_html = " ".join(tokens_html)
|
| 111 |
else:
|
| 112 |
tokens = text.split()
|
| 113 |
tokens_html = []
|
|
|
|
| 270 |
|
| 271 |
title = """<p style="text-align: center; font-size:28px"> 🌸 🔎 ROOTS search tool 🔎 🌸 </p>"""
|
| 272 |
description = """
|
|
|
|
|
|
|
| 273 |
|
| 274 |
The ROOTS corpus was developed during the [BigScience workshop](https://bigscience.huggingface.co/) for the purpose
|
| 275 |
of training the Multilingual Large Language Model [BLOOM](https://huggingface.co/bigscience/bloom). The ROOTS Search
|