Spaces:
				
			
			
	
			
			
		Runtime error
		
	
	
	
			
			
	
	
	
	
		
		
		Runtime error
		
	maintenance message
Browse files
    	
        app.py
    CHANGED
    
    | 
         @@ -10,9 +10,7 @@ from huggingface_hub import HfApi 
     | 
|
| 10 | 
         
             
            hf_api = HfApi()
         
     | 
| 11 | 
         
             
            roots_datasets = {
         
     | 
| 12 | 
         
             
                dset.id.split("/")[-1]: dset
         
     | 
| 13 | 
         
            -
                for dset in hf_api.list_datasets(
         
     | 
| 14 | 
         
            -
                    author="bigscience-data", use_auth_token=os.environ.get("bigscience_data_token")
         
     | 
| 15 | 
         
            -
                )
         
     | 
| 16 | 
         
             
            }
         
     | 
| 17 | 
         | 
| 18 | 
         | 
| 
         @@ -64,9 +62,7 @@ def process_pii(text): 
     | 
|
| 64 | 
         
             
                for tag in PII_TAGS:
         
     | 
| 65 | 
         
             
                    text = text.replace(
         
     | 
| 66 | 
         
             
                        PII_PREFIX + tag,
         
     | 
| 67 | 
         
            -
                        """<b><mark style="background: Fuchsia; color: Lime;">REDACTED {}</mark></b>""".format(
         
     | 
| 68 | 
         
            -
                            tag
         
     | 
| 69 | 
         
            -
                        ),
         
     | 
| 70 | 
         
             
                    )
         
     | 
| 71 | 
         
             
                return text
         
     | 
| 72 | 
         | 
| 
         @@ -133,9 +129,7 @@ def format_result(result, highlight_terms, exact_search, datasets_filter=None): 
     | 
|
| 133 | 
         
             
                return "<p>" + result_html + "</p>"
         
     | 
| 134 | 
         | 
| 135 | 
         | 
| 136 | 
         
            -
            def format_result_page(
         
     | 
| 137 | 
         
            -
                language, results, highlight_terms, num_results, exact_search, datasets_filter=None
         
     | 
| 138 | 
         
            -
            ) -> gr.HTML:
         
     | 
| 139 | 
         
             
                filtered_num_results = 0
         
     | 
| 140 | 
         
             
                header_html = ""
         
     | 
| 141 | 
         | 
| 
         @@ -160,9 +154,7 @@ def format_result_page( 
     | 
|
| 160 | 
         
             
                        continue
         
     | 
| 161 | 
         
             
                    results_for_lang_html = ""
         
     | 
| 162 | 
         
             
                    for result in results_for_lang:
         
     | 
| 163 | 
         
            -
                        result_html = format_result(
         
     | 
| 164 | 
         
            -
                            result, highlight_terms, exact_search, datasets_filter
         
     | 
| 165 | 
         
            -
                        )
         
     | 
| 166 | 
         
             
                        if result_html != "":
         
     | 
| 167 | 
         
             
                            filtered_num_results += 1
         
     | 
| 168 | 
         
             
                        results_for_lang_html += result_html
         
     | 
| 
         @@ -204,9 +196,7 @@ def extract_results_from_payload(query, language, payload, exact_search): 
     | 
|
| 204 | 
         
             
                        text = result["text"]
         
     | 
| 205 | 
         
             
                        url = (
         
     | 
| 206 | 
         
             
                            result["meta"]["url"]
         
     | 
| 207 | 
         
            -
                            if "meta" in result
         
     | 
| 208 | 
         
            -
                            and result["meta"] is not None
         
     | 
| 209 | 
         
            -
                            and "url" in result["meta"]
         
     | 
| 210 | 
         
             
                            else None
         
     | 
| 211 | 
         
             
                        )
         
     | 
| 212 | 
         
             
                        docid = result["docid"]
         
     | 
| 
         @@ -244,11 +234,7 @@ def request_payload(query, language, exact_search, num_results=10, received_resu 
     | 
|
| 244 | 
         
             
                post_data = {"query": query, "k": num_results, "received_results": received_results}
         
     | 
| 245 | 
         
             
                if language != "detect_language":
         
     | 
| 246 | 
         
             
                    post_data["lang"] = language
         
     | 
| 247 | 
         
            -
                address = (
         
     | 
| 248 | 
         
            -
                    os.environ.get("address_exact_search")
         
     | 
| 249 | 
         
            -
                    if exact_search
         
     | 
| 250 | 
         
            -
                    else os.environ.get("address")
         
     | 
| 251 | 
         
            -
                )
         
     | 
| 252 | 
         
             
                output = requests.post(
         
     | 
| 253 | 
         
             
                    address,
         
     | 
| 254 | 
         
             
                    headers={"Content-type": "application/json"},
         
     | 
| 
         @@ -259,10 +245,12 @@ def request_payload(query, language, exact_search, num_results=10, received_resu 
     | 
|
| 259 | 
         
             
                return payload
         
     | 
| 260 | 
         | 
| 261 | 
         | 
| 262 | 
         
            -
            title = (
     | 
| 263 | 
         
            -
                """<p style="text-align: center; font-size:28px"> 🌸 🔎 ROOTS search tool 🔍 🌸 </p>"""
         
     | 
| 264 | 
         
            -
            )
         
     | 
| 265 | 
         
             
            description = """
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 266 | 
         
             
            The ROOTS corpus was developed during the [BigScience workshop](https://bigscience.huggingface.co/) for the purpose
         
     | 
| 267 | 
         
             
            of training the Multilingual Large Language Model [BLOOM](https://huggingface.co/bigscience/bloom). The ROOTS Search
         
     | 
| 268 | 
         
             
            Tool allows you to search through the ROOTS corpus. We serve a BM25 index for each language or group of languages
         
     | 
| 
         @@ -379,9 +367,7 @@ if __name__ == "__main__": 
     | 
|
| 379 | 
         
             
                            payload,
         
     | 
| 380 | 
         
             
                            exact_search,
         
     | 
| 381 | 
         
             
                        )
         
     | 
| 382 | 
         
            -
                        result_page = format_result_page(
         
     | 
| 383 | 
         
            -
                            lang, processed_results, highlight_terms, num_results, exact_search
         
     | 
| 384 | 
         
            -
                        )
         
     | 
| 385 | 
         
             
                        return (
         
     | 
| 386 | 
         
             
                            processed_results,
         
     | 
| 387 | 
         
             
                            highlight_terms,
         
     | 
| 
         @@ -402,19 +388,13 @@ if __name__ == "__main__": 
     | 
|
| 402 | 
         
             
                            datasets,
         
     | 
| 403 | 
         
             
                        ) = run_query(query, lang, k, dropdown_input, 0)
         
     | 
| 404 | 
         
             
                        has_more_results = exact_search and (num_results > k)
         
     | 
| 405 | 
         
            -
                        current_results = (
         
     | 
| 406 | 
         
            -
                            len(next(iter(processed_results.values())))
         
     | 
| 407 | 
         
            -
                            if len(processed_results) > 0
         
     | 
| 408 | 
         
            -
                            else 0
         
     | 
| 409 | 
         
            -
                        )
         
     | 
| 410 | 
         
             
                        return [
         
     | 
| 411 | 
         
             
                            processed_results,
         
     | 
| 412 | 
         
             
                            highlight_terms,
         
     | 
| 413 | 
         
             
                            num_results,
         
     | 
| 414 | 
         
             
                            exact_search,
         
     | 
| 415 | 
         
            -
                            gr.update(visible=True)
         
     | 
| 416 | 
         
            -
                            if current_results > 0
         
     | 
| 417 | 
         
            -
                            else gr.update(visible=False),
         
     | 
| 418 | 
         
             
                            gr.Dropdown.update(choices=datasets, value=datasets),
         
     | 
| 419 | 
         
             
                            gr.update(visible=has_more_results),
         
     | 
| 420 | 
         
             
                            current_results,
         
     | 
| 
         @@ -437,12 +417,8 @@ if __name__ == "__main__": 
     | 
|
| 437 | 
         
             
                            result_page,
         
     | 
| 438 | 
         
             
                            datasets,
         
     | 
| 439 | 
         
             
                        ) = run_query(query, lang, k, dropdown_input, received_results)
         
     | 
| 440 | 
         
            -
                        current_results = sum(
         
     | 
| 441 | 
         
            -
                            len(results) for results in processed_results.values()
     | 
| 442 | 
         
            -
                        )
         
     | 
| 443 | 
         
            -
                        has_more_results = exact_search and (
         
     | 
| 444 | 
         
            -
                            received_results + current_results < num_results
         
     | 
| 445 | 
         
            -
                        )
         
     | 
| 446 | 
         
             
                        print("received_results", received_results)
         
     | 
| 447 | 
         
             
                        print("current_results", current_results)
         
     | 
| 448 | 
         
             
                        print("has_more_results", has_more_results)
         
     | 
| 
         @@ -451,9 +427,7 @@ if __name__ == "__main__": 
     | 
|
| 451 | 
         
             
                            highlight_terms,
         
     | 
| 452 | 
         
             
                            num_results,
         
     | 
| 453 | 
         
             
                            exact_search,
         
     | 
| 454 | 
         
            -
                            gr.update(visible=True)
         
     | 
| 455 | 
         
            -
                            if current_results > 0
         
     | 
| 456 | 
         
            -
                            else gr.update(visible=False),
         
     | 
| 457 | 
         
             
                            gr.Dropdown.update(choices=datasets, value=datasets),
         
     | 
| 458 | 
         
             
                            gr.update(visible=current_results >= k and has_more_results),
         
     | 
| 459 | 
         
             
                            received_results + current_results,
         
     | 
| 
         | 
|
| 10 | 
         
             
            hf_api = HfApi()
         
     | 
| 11 | 
         
             
            roots_datasets = {
         
     | 
| 12 | 
         
             
                dset.id.split("/")[-1]: dset
         
     | 
| 13 | 
         
            +
                for dset in hf_api.list_datasets(author="bigscience-data", use_auth_token=os.environ.get("bigscience_data_token"))
         
     | 
| 
         | 
|
| 
         | 
|
| 14 | 
         
             
            }
         
     | 
| 15 | 
         | 
| 16 | 
         | 
| 
         | 
|
| 62 | 
         
             
                for tag in PII_TAGS:
         
     | 
| 63 | 
         
             
                    text = text.replace(
         
     | 
| 64 | 
         
             
                        PII_PREFIX + tag,
         
     | 
| 65 | 
         
            +
                        """<b><mark style="background: Fuchsia; color: Lime;">REDACTED {}</mark></b>""".format(tag),
         
     | 
| 
         | 
|
| 
         | 
|
| 66 | 
         
             
                    )
         
     | 
| 67 | 
         
             
                return text
         
     | 
| 68 | 
         | 
| 
         | 
|
| 129 | 
         
             
                return "<p>" + result_html + "</p>"
         
     | 
| 130 | 
         | 
| 131 | 
         | 
| 132 | 
         
            +
            def format_result_page(language, results, highlight_terms, num_results, exact_search, datasets_filter=None) -> gr.HTML:
         
     | 
| 
         | 
|
| 
         | 
|
| 133 | 
         
             
                filtered_num_results = 0
         
     | 
| 134 | 
         
             
                header_html = ""
         
     | 
| 135 | 
         | 
| 
         | 
|
| 154 | 
         
             
                        continue
         
     | 
| 155 | 
         
             
                    results_for_lang_html = ""
         
     | 
| 156 | 
         
             
                    for result in results_for_lang:
         
     | 
| 157 | 
         
            +
                        result_html = format_result(result, highlight_terms, exact_search, datasets_filter)
         
     | 
| 
         | 
|
| 
         | 
|
| 158 | 
         
             
                        if result_html != "":
         
     | 
| 159 | 
         
             
                            filtered_num_results += 1
         
     | 
| 160 | 
         
             
                        results_for_lang_html += result_html
         
     | 
| 
         | 
|
| 196 | 
         
             
                        text = result["text"]
         
     | 
| 197 | 
         
             
                        url = (
         
     | 
| 198 | 
         
             
                            result["meta"]["url"]
         
     | 
| 199 | 
         
            +
                            if "meta" in result and result["meta"] is not None and "url" in result["meta"]
         
     | 
| 
         | 
|
| 
         | 
|
| 200 | 
         
             
                            else None
         
     | 
| 201 | 
         
             
                        )
         
     | 
| 202 | 
         
             
                        docid = result["docid"]
         
     | 
| 
         | 
|
| 234 | 
         
             
                post_data = {"query": query, "k": num_results, "received_results": received_results}
         
     | 
| 235 | 
         
             
                if language != "detect_language":
         
     | 
| 236 | 
         
             
                    post_data["lang"] = language
         
     | 
| 237 | 
         
            +
                address = os.environ.get("address_exact_search") if exact_search else os.environ.get("address")
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 238 | 
         
             
                output = requests.post(
         
     | 
| 239 | 
         
             
                    address,
         
     | 
| 240 | 
         
             
                    headers={"Content-type": "application/json"},
         
     | 
| 
         | 
|
| 245 | 
         
             
                return payload
         
     | 
| 246 | 
         | 
| 247 | 
         | 
| 248 | 
         
            +
            title = """<p style="text-align: center; font-size:28px"> 🌸 🔎 ROOTS search tool 🔍 🌸 </p>"""
         
     | 
| 
         | 
|
| 
         | 
|
| 249 | 
         
             
            description = """
         
     | 
| 250 | 
         
            +
            # We're running maintenance works on the exact search index, so it may not work properly until the end of the day,
         
     | 
| 251 | 
         
            +
            Monday 27th of March.
         
     | 
| 252 | 
         
            +
             
     | 
| 253 | 
         
            +
             
     | 
| 254 | 
         
             
            The ROOTS corpus was developed during the [BigScience workshop](https://bigscience.huggingface.co/) for the purpose
         
     | 
| 255 | 
         
             
            of training the Multilingual Large Language Model [BLOOM](https://huggingface.co/bigscience/bloom). The ROOTS Search
         
     | 
| 256 | 
         
             
            Tool allows you to search through the ROOTS corpus. We serve a BM25 index for each language or group of languages
         
     | 
| 
         | 
|
| 367 | 
         
             
                            payload,
         
     | 
| 368 | 
         
             
                            exact_search,
         
     | 
| 369 | 
         
             
                        )
         
     | 
| 370 | 
         
            +
                        result_page = format_result_page(lang, processed_results, highlight_terms, num_results, exact_search)
         
     | 
| 
         | 
|
| 
         | 
|
| 371 | 
         
             
                        return (
         
     | 
| 372 | 
         
             
                            processed_results,
         
     | 
| 373 | 
         
             
                            highlight_terms,
         
     | 
| 
         | 
|
| 388 | 
         
             
                            datasets,
         
     | 
| 389 | 
         
             
                        ) = run_query(query, lang, k, dropdown_input, 0)
         
     | 
| 390 | 
         
             
                        has_more_results = exact_search and (num_results > k)
         
     | 
| 391 | 
         
            +
                        current_results = len(next(iter(processed_results.values()))) if len(processed_results) > 0 else 0
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 392 | 
         
             
                        return [
         
     | 
| 393 | 
         
             
                            processed_results,
         
     | 
| 394 | 
         
             
                            highlight_terms,
         
     | 
| 395 | 
         
             
                            num_results,
         
     | 
| 396 | 
         
             
                            exact_search,
         
     | 
| 397 | 
         
            +
                            gr.update(visible=True) if current_results > 0 else gr.update(visible=False),
         
     | 
| 
         | 
|
| 
         | 
|
| 398 | 
         
             
                            gr.Dropdown.update(choices=datasets, value=datasets),
         
     | 
| 399 | 
         
             
                            gr.update(visible=has_more_results),
         
     | 
| 400 | 
         
             
                            current_results,
         
     | 
| 
         | 
|
| 417 | 
         
             
                            result_page,
         
     | 
| 418 | 
         
             
                            datasets,
         
     | 
| 419 | 
         
             
                        ) = run_query(query, lang, k, dropdown_input, received_results)
         
     | 
| 420 | 
         
            +
                        current_results = sum(len(results) for results in processed_results.values())
         
     | 
| 421 | 
         
            +
                        has_more_results = exact_search and (received_results + current_results < num_results)
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 422 | 
         
             
                        print("received_results", received_results)
         
     | 
| 423 | 
         
             
                        print("current_results", current_results)
         
     | 
| 424 | 
         
             
                        print("has_more_results", has_more_results)
         
     | 
| 
         | 
|
| 427 | 
         
             
                            highlight_terms,
         
     | 
| 428 | 
         
             
                            num_results,
         
     | 
| 429 | 
         
             
                            exact_search,
         
     | 
| 430 | 
         
            +
                            gr.update(visible=True) if current_results > 0 else gr.update(visible=False),
         
     | 
| 
         | 
|
| 
         | 
|
| 431 | 
         
             
                            gr.Dropdown.update(choices=datasets, value=datasets),
         
     | 
| 432 | 
         
             
                            gr.update(visible=current_results >= k and has_more_results),
         
     | 
| 433 | 
         
             
                            received_results + current_results,
         
     |