Spaces:
				
			
			
	
			
			
		Runtime error
		
	
	
	
			
			
	
	
	
	
		
		
		Runtime error
		
	Update tool - fix
Browse files
- README.md +6 -6
- app.py +484 -278
- spaces.code-workspace +0 -8
    	
        README.md
    CHANGED
    
    | @@ -1,13 +1,13 @@ | |
| 1 | 
             
            ---
         | 
| 2 | 
            -
            title:  | 
| 3 | 
            -
            emoji:  | 
| 4 | 
             
            colorFrom: blue
         | 
| 5 | 
            -
            colorTo:  | 
| 6 | 
            -
            sdk:  | 
| 7 | 
            -
            sdk_version:  | 
| 8 | 
             
            app_file: app.py
         | 
| 9 | 
             
            pinned: false
         | 
|  | |
| 10 | 
             
            ---
         | 
| 11 |  | 
| 12 | 
             
            Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
         | 
| 13 | 
            -
             | 
|  | |
| 1 | 
             
            ---
         | 
| 2 | 
            +
            title: Roots Search Tool - dev tier
         | 
| 3 | 
            +
            emoji: π
         | 
| 4 | 
             
            colorFrom: blue
         | 
| 5 | 
            +
            colorTo: indigo
         | 
| 6 | 
            +
            sdk: gradio
         | 
| 7 | 
            +
            sdk_version: 3.18.0
         | 
| 8 | 
             
            app_file: app.py
         | 
| 9 | 
             
            pinned: false
         | 
| 10 | 
            +
            license: apache-2.0
         | 
| 11 | 
             
            ---
         | 
| 12 |  | 
| 13 | 
             
            Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
         | 
|  | 
    	
        app.py
    CHANGED
    
    | @@ -1,153 +1,59 @@ | |
| 1 | 
             
            import json
         | 
| 2 | 
             
            import os
         | 
| 3 | 
            -
            import  | 
|  | |
| 4 |  | 
| 5 | 
            -
            import  | 
| 6 | 
            -
            import streamlit.components.v1 as components
         | 
| 7 | 
             
            import requests
         | 
|  | |
| 8 |  | 
| 9 | 
            -
             | 
| 10 | 
            -
             | 
| 11 | 
            -
             | 
| 12 | 
            -
             | 
| 13 | 
            -
             | 
| 14 | 
            -
             | 
| 15 | 
            -
            st.set_page_config(page_title="Gaia Search ππ", layout="wide")
         | 
| 16 | 
            -
             | 
| 17 | 
            -
            os.makedirs(os.path.join(os.getcwd(), ".streamlit"), exist_ok=True)
         | 
| 18 | 
            -
            with open(os.path.join(os.getcwd(), ".streamlit/config.toml"), "w") as file:
         | 
| 19 | 
            -
                file.write('[theme]\nbase="light"')
         | 
| 20 | 
            -
             | 
| 21 | 
            -
             | 
| 22 | 
            -
            corpus_name_map = {
         | 
| 23 | 
            -
                "LAION": "laion",
         | 
| 24 | 
            -
                "ROOTS": "roots",
         | 
| 25 | 
            -
                "The Pile": "pile",
         | 
| 26 | 
            -
                "C4": "c4",
         | 
| 27 | 
             
            }
         | 
| 28 |  | 
| 29 | 
            -
            st.sidebar.markdown(
         | 
| 30 | 
            -
                """
         | 
| 31 | 
            -
                <style>
         | 
| 32 | 
            -
                .aligncenter {
         | 
| 33 | 
            -
                    text-align: center;
         | 
| 34 | 
            -
                    font-weight: bold;
         | 
| 35 | 
            -
                    font-size: 36px;
         | 
| 36 | 
            -
                }
         | 
| 37 | 
            -
                </style>
         | 
| 38 | 
            -
                <p class="aligncenter">Gaia Search ππ</p>
         | 
| 39 | 
            -
                <p>A search engine for large scale texual
         | 
| 40 | 
            -
                corpora. Most of the datasets included in the tool are based on Common
         | 
| 41 | 
            -
                Crawl. By using the tool, you are also bound by the Common Crawl terms
         | 
| 42 | 
            -
                of use in respect of the content contained in the datasets.
         | 
| 43 | 
            -
                </p>
         | 
| 44 | 
            -
                """,
         | 
| 45 | 
            -
                unsafe_allow_html=True,
         | 
| 46 | 
            -
            )
         | 
| 47 |  | 
| 48 | 
            -
             | 
| 49 | 
            -
                "" | 
| 50 | 
            -
                 | 
| 51 | 
            -
                 | 
| 52 | 
            -
             | 
| 53 | 
            -
                 | 
| 54 | 
            -
             | 
| 55 | 
            -
             | 
| 56 | 
            -
             | 
| 57 | 
            -
             | 
| 58 | 
            -
             | 
| 59 | 
            -
             | 
| 60 | 
            -
             | 
| 61 | 
            -
             | 
| 62 | 
            -
             | 
| 63 | 
            -
             | 
| 64 | 
            -
             | 
| 65 | 
            -
             | 
| 66 | 
            -
             | 
| 67 | 
            -
             | 
| 68 | 
            -
             | 
| 69 | 
            -
            query = st.sidebar.text_input(label="Query", placeholder="Type your query here")
         | 
| 70 | 
            -
            corpus = st.sidebar.selectbox(
         | 
| 71 | 
            -
                "Corpus",
         | 
| 72 | 
            -
                tuple(corpus_name_map.keys()),
         | 
| 73 | 
            -
                index=2,
         | 
| 74 | 
            -
            )
         | 
| 75 | 
            -
            max_results = st.sidebar.slider(
         | 
| 76 | 
            -
                "Max Results",
         | 
| 77 | 
            -
                min_value=1,
         | 
| 78 | 
            -
                max_value=100,
         | 
| 79 | 
            -
                step=1,
         | 
| 80 | 
            -
                value=10,
         | 
| 81 | 
            -
                help="Max Number of Documents to return",
         | 
| 82 | 
            -
            )
         | 
| 83 | 
            -
             | 
| 84 | 
            -
            # dark_mode_toggle = """
         | 
| 85 | 
            -
            #     <script>
         | 
| 86 | 
            -
            #         function load_image(id){
         | 
| 87 | 
            -
            #             console.log(id)
         | 
| 88 | 
            -
            #             var x = document.getElementById(id);
         | 
| 89 | 
            -
            #             console.log(x)
         | 
| 90 | 
            -
            #             if (x.style.display === "none") {
         | 
| 91 | 
            -
            #                 x.style.display = "block";
         | 
| 92 | 
            -
            #             } else {
         | 
| 93 | 
            -
            #                 x.style.display = "none";
         | 
| 94 | 
            -
            #             }
         | 
| 95 | 
            -
            #         };
         | 
| 96 | 
            -
            #         function myFunction() {
         | 
| 97 | 
            -
            #         var element = document.body;
         | 
| 98 | 
            -
            #         element.classList.toggle("dark-mode");
         | 
| 99 | 
            -
            #         }
         | 
| 100 | 
            -
            #     </script>
         | 
| 101 | 
            -
            #     <button onclick="myFunction()">Toggle dark mode</button>
         | 
| 102 | 
            -
            # """
         | 
| 103 | 
            -
            # st.sidebar.markdown(dark_mode_toggle, unsafe_allow_html=True)
         | 
| 104 | 
            -
             | 
| 105 | 
            -
             | 
| 106 | 
            -
            footer = """
         | 
| 107 | 
            -
                <style>
         | 
| 108 | 
            -
                    .footer {
         | 
| 109 | 
            -
                        position: fixed;
         | 
| 110 | 
            -
                        left: 0;
         | 
| 111 | 
            -
                        bottom: 0;
         | 
| 112 | 
            -
                        width: 100%;
         | 
| 113 | 
            -
                        background-color: white;
         | 
| 114 | 
            -
                        color: black;
         | 
| 115 | 
            -
                        text-align: center;
         | 
| 116 | 
            -
                    }
         | 
| 117 | 
            -
                </style>
         | 
| 118 | 
            -
                <div class="footer">
         | 
| 119 | 
            -
                <p>Powered by <a href="https://huggingface.co/" >HuggingFace 🤗</a> and <a href="https://github.com/castorini/pyserini" >Pyserini 🦆</a></p>
         | 
| 120 | 
            -
                </div>
         | 
| 121 | 
            -
            """
         | 
| 122 | 
            -
            st.sidebar.markdown(footer, unsafe_allow_html=True)
         | 
| 123 | 
            -
             | 
| 124 | 
            -
             | 
| 125 | 
            -
            def scisearch(query, corpus, num_results=10):
         | 
| 126 | 
            -
                try:
         | 
| 127 | 
            -
                    print(query, corpus, num_results)
         | 
| 128 | 
            -
                    query = query.strip()
         | 
| 129 | 
            -
                    if query == "" or query is None:
         | 
| 130 | 
            -
                        return
         | 
| 131 | 
            -
             | 
| 132 | 
            -
                    post_data = {"query": query, "corpus": corpus, "k": num_results, "lang": "all"}
         | 
| 133 | 
            -
                    address = (
         | 
| 134 | 
            -
                        os.environ.get("address")
         | 
| 135 | 
            -
                        if corpus != "roots"
         | 
| 136 | 
            -
                        else "http://34.116.206.238:8080"
         | 
| 137 | 
             
                    )
         | 
| 138 | 
            -
             | 
| 139 | 
            -
                     | 
| 140 | 
            -
             | 
| 141 | 
            -
                         | 
| 142 | 
            -
                         | 
| 143 | 
            -
                         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 144 | 
             
                    )
         | 
| 145 | 
            -
             | 
| 146 | 
            -
                    payload = json.loads(output.text)
         | 
| 147 | 
            -
                    return payload["results"], payload["highlight_terms"]
         | 
| 148 | 
            -
             | 
| 149 | 
            -
                except Exception as e:
         | 
| 150 | 
            -
                    print(e)
         | 
| 151 |  | 
| 152 |  | 
| 153 | 
             
            PII_TAGS = {"KEY", "EMAIL", "USER", "IP_ADDRESS", "ID", "IPv4", "IPv6"}
         | 
| @@ -165,55 +71,103 @@ def process_pii(text): | |
| 165 | 
             
                return text
         | 
| 166 |  | 
| 167 |  | 
| 168 | 
            -
            def highlight_string(paragraph: str, highlight_terms: list) -> str:
         | 
| 169 | 
            -
                tokens = paragraph.split()
         | 
| 170 | 
            -
                tokens_html = []
         | 
| 171 | 
            -
                for token in tokens:
         | 
| 172 | 
            -
                    if token in highlight_terms:
         | 
| 173 | 
            -
                        tokens_html.append("<b>{}</b>".format(token))
         | 
| 174 | 
            -
                    else:
         | 
| 175 | 
            -
                        tokens_html.append(token)
         | 
| 176 | 
            -
                tokens_html = " ".join(tokens_html)
         | 
| 177 | 
            -
                return process_pii(tokens_html)
         | 
| 178 | 
            -
             | 
| 179 | 
            -
             | 
| 180 | 
             
            def extract_lang_from_docid(docid):
         | 
| 181 | 
             
                return docid.split("_")[1]
         | 
| 182 |  | 
| 183 |  | 
| 184 | 
            -
            def format_result(result, highlight_terms):
         | 
| 185 | 
            -
                text = result | 
| 186 | 
            -
                 | 
| 187 | 
            -
             | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 188 | 
             
                language = extract_lang_from_docid(docid)
         | 
| 189 | 
            -
                result_html = """
         | 
| 190 | 
             
                    <span style='font-size:14px; font-family: Arial; color:MediumAquaMarine'>Language: {} | </span>
         | 
| 191 | 
            -
                    <span style='font-size:14px; font-family: Arial; color:#7978FF; text-align: left;'>Document ID: {} | </span | 
|  | |
|  | |
|  | |
| 192 | 
             
                    <span style='font-family: Arial;'>{}</span><br>
         | 
| 193 | 
             
                    <br>
         | 
| 194 | 
             
                """.format(
         | 
| 195 | 
            -
                    language,  | 
| 196 | 
             
                )
         | 
| 197 | 
             
                return "<p>" + result_html + "</p>"
         | 
| 198 |  | 
| 199 |  | 
| 200 | 
            -
            def  | 
| 201 | 
            -
                 | 
|  | |
|  | |
|  | |
|  | |
| 202 |  | 
| 203 | 
            -
                if  | 
| 204 | 
            -
                     | 
| 205 | 
            -
             | 
| 206 | 
            -
                         | 
| 207 | 
            -
             | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 208 | 
             
                            result_page_html += """<div style='font-family: Arial; color:Silver; text-align: left; line-height: 3em'>
         | 
| 209 | 
             
                                No results for language: <b>{}</b></div>""".format(
         | 
| 210 | 
             
                                lang
         | 
| 211 | 
             
                            )
         | 
| 212 | 
            -
             | 
| 213 | 
            -
             | 
| 214 | 
            -
             | 
| 215 | 
            -
             | 
| 216 | 
            -
                             | 
|  | |
|  | |
|  | |
|  | |
|  | |
| 217 | 
             
                        results_for_lang_html = f"""
         | 
| 218 | 
             
                            <details>
         | 
| 219 | 
             
                                <summary style='font-family: Arial; color:MediumAquaMarine; text-align: left; line-height: 3em'>
         | 
| @@ -221,117 +175,369 @@ def process_results(corpus: str, hits: Union[list, dict], highlight_terms: list) | |
| 221 | 
             
                                </summary>
         | 
| 222 | 
             
                                {results_for_lang_html}
         | 
| 223 | 
             
                            </details>"""
         | 
| 224 | 
            -
             | 
| 225 | 
            -
             | 
| 226 | 
            -
             | 
| 227 | 
            -
             | 
| 228 | 
            -
             | 
| 229 | 
            -
                         | 
| 230 | 
            -
             | 
| 231 | 
            -
             | 
| 232 | 
            -
             | 
| 233 | 
            -
             | 
| 234 | 
            -
             | 
| 235 | 
            -
             | 
| 236 | 
            -
             | 
| 237 | 
            -
             | 
| 238 | 
            -
             | 
| 239 | 
            -
             | 
| 240 | 
            -
             | 
| 241 | 
            -
             | 
| 242 | 
            -
             | 
| 243 | 
            -
             | 
| 244 | 
            -
             | 
| 245 | 
            -
             | 
| 246 | 
            -
             | 
| 247 | 
            -
             | 
| 248 | 
            -
             | 
| 249 | 
            -
             | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 250 | 
             
                        )
         | 
| 251 | 
            -
             | 
| 252 | 
            -
             | 
|  | |
|  | |
|  | |
|  | |
| 253 |  | 
| 254 |  | 
| 255 | 
            -
             | 
|  | |
|  | |
|  | |
|  | |
| 256 |  | 
| 257 | 
            -
             | 
| 258 | 
            -
             | 
| 259 | 
            -
                if  | 
| 260 | 
            -
                     | 
| 261 | 
            -
             | 
| 262 | 
            -
                         | 
|  | |
|  | |
| 263 | 
             
                        </p><br><hr><br>"""
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 264 | 
             
                    )
         | 
| 265 | 
            -
             | 
| 266 | 
            -
             | 
| 267 | 
            -
             | 
| 268 | 
            -
             | 
| 269 | 
            -
             | 
| 270 | 
            -
                             | 
| 271 | 
            -
                             | 
| 272 | 
            -
                             | 
| 273 | 
            -
             | 
| 274 | 
            -
             | 
| 275 | 
            -
             | 
| 276 | 
            -
             | 
| 277 | 
            -
             | 
| 278 | 
            -
             | 
| 279 | 
            -
                     | 
| 280 | 
            -
             | 
| 281 | 
            -
                     | 
| 282 | 
            -
             | 
| 283 | 
            -
             | 
| 284 | 
            -
             | 
| 285 | 
            -
             | 
| 286 | 
            -
             | 
| 287 | 
            -
             | 
| 288 | 
            -
             | 
| 289 | 
            -
             | 
| 290 | 
            -
             | 
| 291 | 
            -
             | 
| 292 | 
            -
             | 
| 293 | 
            -
             | 
| 294 | 
            -
             | 
| 295 | 
            -
             | 
| 296 | 
            -
             | 
| 297 | 
            -
             | 
| 298 | 
            -
             | 
| 299 | 
            -
             | 
| 300 | 
            -
             | 
| 301 | 
            -
             | 
| 302 | 
            -
                     | 
| 303 | 
            -
             | 
| 304 | 
            -
                     | 
| 305 | 
            -
                         | 
| 306 | 
            -
                         | 
| 307 | 
            -
             | 
| 308 | 
            -
             | 
| 309 | 
            -
             | 
| 310 | 
            -
                             | 
| 311 | 
            -
             | 
| 312 | 
            -
                             | 
| 313 | 
            -
             | 
| 314 | 
            -
             | 
| 315 | 
            -
                                color: gray;
         | 
| 316 | 
            -
                            }
         | 
| 317 | 
            -
                            .searchresult h2 {
         | 
| 318 | 
            -
                                font-size: 19px;
         | 
| 319 | 
            -
                                line-height: 18px;
         | 
| 320 | 
            -
                                font-weight: normal;
         | 
| 321 | 
            -
                                color: rgb(7, 111, 222);
         | 
| 322 | 
            -
                                margin-bottom: 0px;
         | 
| 323 | 
            -
                                margin-top: 25px;
         | 
| 324 | 
            -
                                color: #7978FF;"
         | 
| 325 | 
            -
                            }
         | 
| 326 | 
            -
                            .searchresult a {
         | 
| 327 | 
            -
                                font-size: 12px;
         | 
| 328 | 
            -
                                line-height: 12px;
         | 
| 329 | 
            -
                                color: green;
         | 
| 330 | 
            -
                                margin-bottom: 0px;
         | 
| 331 | 
            -
                            }
         | 
| 332 | 
            -
                        </style>
         | 
| 333 | 
            -
                        """
         | 
| 334 | 
            -
                        + rendered_results,
         | 
| 335 | 
            -
                        height=800,
         | 
| 336 | 
            -
                        scrolling=True,
         | 
| 337 | 
             
                    )
         | 
|  | 
|  | |
| 1 | 
             
            import json
         | 
| 2 | 
             
            import os
         | 
| 3 | 
            +
            import traceback
         | 
| 4 | 
            +
            from typing import List, Tuple
         | 
| 5 |  | 
| 6 | 
            +
            import gradio as gr
         | 
|  | |
| 7 | 
             
            import requests
         | 
| 8 | 
            +
            from huggingface_hub import HfApi
         | 
| 9 |  | 
| 10 | 
            +
            hf_api = HfApi()
         | 
| 11 | 
            +
            roots_datasets = {
         | 
| 12 | 
            +
                dset.id.split("/")[-1]: dset
         | 
| 13 | 
            +
                for dset in hf_api.list_datasets(
         | 
| 14 | 
            +
                    author="bigscience-data", use_auth_token=os.environ.get("bigscience_data_token")
         | 
| 15 | 
            +
                )
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 16 | 
             
            }
         | 
| 17 |  | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 18 |  | 
| 19 | 
            +
            def get_docid_html(docid):
         | 
| 20 | 
            +
                data_org, dataset, docid = docid.split("/")
         | 
| 21 | 
            +
                metadata = roots_datasets[dataset]
         | 
| 22 | 
            +
                locked_color = "LightGray"
         | 
| 23 | 
            +
                open_color = "#7978FF"
         | 
| 24 | 
            +
                if metadata.private:
         | 
| 25 | 
            +
                    docid_html = """
         | 
| 26 | 
            +
                    <a title="This dataset is private. See the introductory text for more information"
         | 
| 27 | 
            +
                        style="color:{locked_color}; font-weight: bold; text-decoration:none"
         | 
| 28 | 
            +
                        onmouseover="style='color:{locked_color}; font-weight: bold; text-decoration:underline'"
         | 
| 29 | 
            +
                        onmouseout="style='color:{locked_color}; font-weight: bold; text-decoration:none'"
         | 
| 30 | 
            +
                        href="https://huggingface.co/datasets/bigscience-data/{dataset}"
         | 
| 31 | 
            +
                        target="_blank">
         | 
| 32 | 
            +
                        🔒{dataset}
         | 
| 33 | 
            +
                    </a>
         | 
| 34 | 
            +
                    <span style="color:{open_color}; ">/{docid}</span>""".format(
         | 
| 35 | 
            +
                        dataset=dataset,
         | 
| 36 | 
            +
                        docid=docid,
         | 
| 37 | 
            +
                        locked_color=locked_color,
         | 
| 38 | 
            +
                        open_color=open_color,
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 39 | 
             
                    )
         | 
| 40 | 
            +
                else:
         | 
| 41 | 
            +
                    docid_html = """
         | 
| 42 | 
            +
                    <a title="This dataset is licensed {metadata}"
         | 
| 43 | 
            +
                        style="color:{open_color}; font-weight: bold; text-decoration:none"
         | 
| 44 | 
            +
                        onmouseover="style='color:{open_color}; font-weight: bold; text-decoration:underline'"
         | 
| 45 | 
            +
                        onmouseout="style='color:{open_color}; font-weight: bold; text-decoration:none'"
         | 
| 46 | 
            +
                        href="https://huggingface.co/datasets/bigscience-data/{dataset}"
         | 
| 47 | 
            +
                        target="_blank">
         | 
| 48 | 
            +
                        {dataset}
         | 
| 49 | 
            +
                    </a>
         | 
| 50 | 
            +
                    <span style="color:{open_color}; ">/{docid}</span>""".format(
         | 
| 51 | 
            +
                        metadata=metadata.tags[0].split(":")[-1],
         | 
| 52 | 
            +
                        dataset=dataset,
         | 
| 53 | 
            +
                        docid=docid,
         | 
| 54 | 
            +
                        open_color=open_color,
         | 
| 55 | 
             
                    )
         | 
| 56 | 
            +
                return docid_html
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
| 57 |  | 
| 58 |  | 
| 59 | 
             
            PII_TAGS = {"KEY", "EMAIL", "USER", "IP_ADDRESS", "ID", "IPv4", "IPv6"}
         | 
|  | |
| 71 | 
             
                return text
         | 
| 72 |  | 
| 73 |  | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 74 | 
             
            def extract_lang_from_docid(docid):
         | 
| 75 | 
             
                return docid.split("_")[1]
         | 
| 76 |  | 
| 77 |  | 
| 78 | 
            +
            def format_result(result, highlight_terms, exact_search, datasets_filter=None):
         | 
| 79 | 
            +
                text, url, docid = result
         | 
| 80 | 
            +
                if datasets_filter is not None:
         | 
| 81 | 
            +
                    datasets_filter = set(datasets_filter)
         | 
| 82 | 
            +
                    dataset = docid.split("/")[1]
         | 
| 83 | 
            +
                    if not dataset in datasets_filter:
         | 
| 84 | 
            +
                        return ""
         | 
| 85 | 
            +
             | 
| 86 | 
            +
                if exact_search:
         | 
| 87 | 
            +
                    query_start = text.find(highlight_terms)
         | 
| 88 | 
            +
                    query_end = query_start + len(highlight_terms)
         | 
| 89 | 
            +
                    tokens_html = text[0:query_start]
         | 
| 90 | 
            +
                    tokens_html += "<b>{}</b>".format(text[query_start:query_end])
         | 
| 91 | 
            +
                    tokens_html += text[query_end:]
         | 
| 92 | 
            +
                else:
         | 
| 93 | 
            +
                    tokens = text.split()
         | 
| 94 | 
            +
                    tokens_html = []
         | 
| 95 | 
            +
                    for token in tokens:
         | 
| 96 | 
            +
                        if token in highlight_terms:
         | 
| 97 | 
            +
                            tokens_html.append("<b>{}</b>".format(token))
         | 
| 98 | 
            +
                        else:
         | 
| 99 | 
            +
                            tokens_html.append(token)
         | 
| 100 | 
            +
                    tokens_html = " ".join(tokens_html)
         | 
| 101 | 
            +
                tokens_html = process_pii(tokens_html)
         | 
| 102 | 
            +
             | 
| 103 | 
            +
                url_html = (
         | 
| 104 | 
            +
                    """
         | 
| 105 | 
            +
                    <span style='font-size:12px; font-family: Arial; color:Silver; text-align: left;'>
         | 
| 106 | 
            +
                        <a style='text-decoration:none; color:Silver;'
         | 
| 107 | 
            +
                            onmouseover="style='text-decoration:underline; color:Silver;'"
         | 
| 108 | 
            +
                            onmouseout="style='text-decoration:none; color:Silver;'"
         | 
| 109 | 
            +
                            href='{url}'
         | 
| 110 | 
            +
                            target="_blank">
         | 
| 111 | 
            +
                            {url}
         | 
| 112 | 
            +
                        </a>
         | 
| 113 | 
            +
                    </span><br>
         | 
| 114 | 
            +
                """.format(
         | 
| 115 | 
            +
                        url=url
         | 
| 116 | 
            +
                    )
         | 
| 117 | 
            +
                    if url is not None
         | 
| 118 | 
            +
                    else ""
         | 
| 119 | 
            +
                )
         | 
| 120 | 
            +
                docid_html = get_docid_html(docid)
         | 
| 121 | 
             
                language = extract_lang_from_docid(docid)
         | 
| 122 | 
            +
                result_html = """{}
         | 
| 123 | 
             
                    <span style='font-size:14px; font-family: Arial; color:MediumAquaMarine'>Language: {} | </span>
         | 
| 124 | 
            +
                    <span style='font-size:14px; font-family: Arial; color:#7978FF; text-align: left;'>Document ID: {} | </span>
         | 
| 125 | 
            +
                    <a href="https://forms.gle/AdBLLwRApqcLkHYA8" target="_blank">
         | 
| 126 | 
            +
                        <button style="color:#ffcdf8; ">π΄ββ οΈ Flag result π΄ββ οΈ</button>
         | 
| 127 | 
            +
                    </a><br>
         | 
| 128 | 
             
                    <span style='font-family: Arial;'>{}</span><br>
         | 
| 129 | 
             
                    <br>
         | 
| 130 | 
             
                """.format(
         | 
| 131 | 
            +
                    url_html, language, docid_html, tokens_html
         | 
| 132 | 
             
                )
         | 
| 133 | 
             
                return "<p>" + result_html + "</p>"
         | 
| 134 |  | 
| 135 |  | 
| 136 | 
            +
            def format_result_page(
         | 
| 137 | 
            +
                language, results, highlight_terms, num_results, exact_search, datasets_filter=None
         | 
| 138 | 
            +
            ) -> gr.HTML:
         | 
| 139 | 
            +
             | 
| 140 | 
            +
                filtered_num_results = 0
         | 
| 141 | 
            +
                header_html = ""
         | 
| 142 |  | 
| 143 | 
            +
                if language == "detect_language" and not exact_search:
         | 
| 144 | 
            +
                    header_html += """<div style='font-family: Arial; color:MediumAquaMarine; text-align: center; line-height: 3em'>
         | 
| 145 | 
            +
                        Detected language: <b style='color:MediumAquaMarine'>{}</b></div>""".format(
         | 
| 146 | 
            +
                        list(results.keys())[0]
         | 
| 147 | 
            +
                    )
         | 
| 148 | 
            +
             | 
| 149 | 
            +
                result_page_html = ""
         | 
| 150 | 
            +
                for lang, results_for_lang in results.items():
         | 
| 151 | 
            +
                    print("Processing language", lang)
         | 
| 152 | 
            +
                    if len(results_for_lang) == 0:
         | 
| 153 | 
            +
                        if exact_search:
         | 
| 154 | 
            +
                            result_page_html += """<div style='font-family: Arial; color:Silver; text-align: left; line-height: 3em'>
         | 
| 155 | 
            +
                                No results found.</div>"""
         | 
| 156 | 
            +
                        else:
         | 
| 157 | 
             
                            result_page_html += """<div style='font-family: Arial; color:Silver; text-align: left; line-height: 3em'>
         | 
| 158 | 
             
                                No results for language: <b>{}</b></div>""".format(
         | 
| 159 | 
             
                                lang
         | 
| 160 | 
             
                            )
         | 
| 161 | 
            +
                        continue
         | 
| 162 | 
            +
                    results_for_lang_html = ""
         | 
| 163 | 
            +
                    for result in results_for_lang:
         | 
| 164 | 
            +
                        result_html = format_result(
         | 
| 165 | 
            +
                            result, highlight_terms, exact_search, datasets_filter
         | 
| 166 | 
            +
                        )
         | 
| 167 | 
            +
                        if result_html != "":
         | 
| 168 | 
            +
                            filtered_num_results += 1
         | 
| 169 | 
            +
                        results_for_lang_html += result_html
         | 
| 170 | 
            +
                    if language == "all" and not exact_search:
         | 
| 171 | 
             
                        results_for_lang_html = f"""
         | 
| 172 | 
             
                            <details>
         | 
| 173 | 
             
                                <summary style='font-family: Arial; color:MediumAquaMarine; text-align: left; line-height: 3em'>
         | 
|  | |
| 175 | 
             
                                </summary>
         | 
| 176 | 
             
                                {results_for_lang_html}
         | 
| 177 | 
             
                            </details>"""
         | 
| 178 | 
            +
                    result_page_html += results_for_lang_html
         | 
| 179 | 
            +
             | 
| 180 | 
            +
                if num_results is not None:
         | 
| 181 | 
            +
                    header_html += """<div style='font-family: Arial; color:MediumAquaMarine; text-align: center; line-height: 3em'>
         | 
| 182 | 
            +
                        Total number of matches: <b style='color:MediumAquaMarine'>{}</b></div>""".format(
         | 
| 183 | 
            +
                        num_results
         | 
| 184 | 
            +
                    )
         | 
| 185 | 
            +
                return header_html + result_page_html
         | 
| 186 | 
            +
             | 
| 187 | 
            +
             | 
| 188 | 
            +
            def extract_results_from_payload(query, language, payload, exact_search):
                """Flatten a backend response into the tuples used for rendering.

                Args:
                    query: The query string; returned as the highlight target when
                        ``exact_search`` is true.
                    language: Accepted for interface compatibility; not read here.
                    payload: Decoded response. Must contain ``"results"``, plus
                        ``"num_results"`` for exact search or ``"highlight_terms"``
                        otherwise.
                    exact_search: Selects which of the two payload layouts is read.

                Returns:
                    A 4-tuple ``(processed_results, highlight_terms, num_results,
                    datasets)``. ``processed_results`` maps each result key to a list
                    of ``(text, url, docid)`` tuples (``url`` is ``None`` when the hit
                    has no usable ``meta["url"]``). ``num_results`` is ``None`` for
                    non-exact search. ``datasets`` lists the distinct dataset names
                    taken from the docids, in no particular order.
                """
                raw_results = payload["results"]
                if exact_search:
                    highlight_terms = query
                    num_results = payload["num_results"]
                    # Exact search results are wrapped under a single placeholder key
                    # so both layouts can be walked by the same loop below.
                    raw_results = {"dummy": raw_results}
                else:
                    highlight_terms = payload["highlight_terms"]
                    num_results = None

                processed_results = {}
                datasets = set()
                for lang, hits in raw_results.items():
                    entries = []
                    processed_results[lang] = entries
                    for hit in hits:
                        text = hit["text"]
                        url = None
                        if "meta" in hit and hit["meta"] is not None:
                            meta = hit["meta"]
                            if "url" in meta:
                                url = meta["url"]
                        docid = hit["docid"]
                        # docid is split into exactly three "/"-separated parts; the
                        # middle one is taken as the dataset name.
                        _, dataset, _ = docid.split("/")
                        datasets.add(dataset)
                        entries.append((text, url, docid))

                return processed_results, highlight_terms, num_results, list(datasets)
         | 
| 219 |  | 
| 220 |  | 
| 221 | 
            +
            def no_query_error_message():
                """Return the HTML snippet shown when the submitted query is empty."""
                message = """
                    <p style='font-size:18px; font-family: Arial; color:MediumVioletRed; text-align: center;'>
                    Please provide a non-empty query.
                    </p><br><hr><br>"""
                return message
         | 
| 226 |  | 
| 227 | 
            +
             | 
| 228 | 
            +
            def process_error(error_type, payload):
                """Render a backend error as an HTML message for the results pane.

                Args:
                    error_type: The error type string taken from the payload.
                    payload: Decoded backend response. For ``"unsupported_lang"`` the
                        detected language is read from
                        ``payload["err"]["meta"]["detected_lang"]``.

                Returns:
                    An HTML string. ``"unsupported_lang"`` gets a dedicated message;
                    any other error type gets a generic message naming the type, so
                    the caller never receives ``None``.
                """
                # Local import so this block does not depend on the file's import list.
                from html import escape

                if error_type == "unsupported_lang":
                    # The value comes from the backend response; escape it before
                    # embedding it in markup.
                    detected_lang = escape(str(payload["err"]["meta"]["detected_lang"]))
                    return f"""
                        <p style='font-size:18px; font-family: Arial; color:MediumVioletRed; text-align: center;'>
                        Detected language <b>{detected_lang}</b> is not supported.<br>
                        Please choose a language from the dropdown or type another query.
                        </p><br><hr><br>"""

                # Fallback for error types without a dedicated message.
                shown_type = escape(str(error_type))
                return f"""
                        <p style='font-size:18px; font-family: Arial; color:MediumVioletRed; text-align: center;'>
                        The search backend reported an error: <b>{shown_type}</b>.<br>
                        Please try again or type another query.
                        </p><br><hr><br>"""
         | 
| 236 | 
            +
             | 
| 237 | 
            +
             | 
| 238 | 
            +
            def extract_error_from_payload(payload):
                """Return the error type reported in *payload*, or ``None`` if absent.

                The type is read from ``payload["err"]["type"]`` whenever an ``"err"``
                entry is present.
                """
                return payload["err"]["type"] if "err" in payload else None
         | 
| 242 | 
            +
             | 
| 243 | 
            +
             | 
| 244 | 
            +
            def request_payload(query, language, exact_search, num_results=10, received_results=0):
                """POST a search request to the backend and return the decoded JSON.

                Args:
                    query: Query string to send.
                    language: Language code; sent as ``"lang"`` unless it is
                        ``"detect_language"``, in which case the field is omitted.
                    exact_search: When true the request goes to the hard-coded exact
                        search address; otherwise to the address in the ``address``
                        environment variable.
                    num_results: Sent as ``"k"``.
                    received_results: Sent as ``"received_results"``.

                Returns:
                    The response body parsed with ``json.loads``.
                """
                if exact_search:
                    address = "http://34.105.160.81:8080"
                else:
                    address = os.environ.get("address")

                body = {"query": query, "k": num_results, "received_results": received_results}
                if language != "detect_language":
                    body["lang"] = language

                response = requests.post(
                    address,
                    headers={"Content-type": "application/json"},
                    data=json.dumps(body),
                    timeout=60,
                )
                return json.loads(response.text)
         | 
| 257 | 
            +
             | 
| 258 | 
            +
             | 
| 259 | 
            +
            title = (
         | 
| 260 | 
            +
                """<p style="text-align: center; font-size:28px"> πΈ π ROOTS search tool π πΈ </p>"""
         | 
| 261 | 
            +
            )
         | 
| 262 | 
            +
            # Introductory Markdown text for the demo; passed to gr.Markdown in the
            # Blocks layout below the title row.
            description = """
            The ROOTS corpus was developed during the [BigScience workshop](https://bigscience.huggingface.co/) for the purpose
            of training the Multilingual Large Language Model [BLOOM](https://huggingface.co/bigscience/bloom). This tool allows
            you to search through the ROOTS corpus. We serve a BM25 index for each language or group of languages included in
            ROOTS. You can read more about the details of the tool design
            [here](https://huggingface.co/spaces/bigscience-data/scisearch/blob/main/roots_search_tool_specs.pdf). For more
            information and instructions on how to access the full corpus check [this form](https://forms.gle/qyYswbEL5kA23Wu99)."""
         | 
| 269 | 
            +
             | 
| 270 | 
            +
             | 
| 271 | 
            +
            if __name__ == "__main__":
         | 
| 272 | 
            +
                demo = gr.Blocks(css=".underline-on-hover:hover { text-decoration: underline; }")
         | 
| 273 | 
            +
             | 
| 274 | 
            +
                with demo:
         | 
| 275 | 
            +
                    processed_results_state = gr.State([])
         | 
| 276 | 
            +
                    highlight_terms_state = gr.State([])
         | 
| 277 | 
            +
                    num_results_state = gr.State(0)
         | 
| 278 | 
            +
                    exact_search_state = gr.State(False)
         | 
| 279 | 
            +
                    received_results_state = gr.State(0)
         | 
| 280 | 
            +
             | 
| 281 | 
            +
                    with gr.Row():
         | 
| 282 | 
            +
                        gr.Markdown(value=title)
         | 
| 283 | 
            +
                    with gr.Row():
         | 
| 284 | 
            +
                        gr.Markdown(value=description)
         | 
| 285 | 
            +
                    with gr.Row():
         | 
| 286 | 
            +
                        query = gr.Textbox(
         | 
| 287 | 
            +
                            lines=1,
         | 
| 288 | 
            +
                            max_lines=1,
         | 
| 289 | 
            +
                            placeholder="Put your query in double quotes for exact search.",
         | 
| 290 | 
            +
                            label="Query",
         | 
| 291 | 
            +
                        )
         | 
| 292 | 
            +
                    with gr.Row():
         | 
| 293 | 
            +
                        lang = gr.Dropdown(
         | 
| 294 | 
            +
                            choices=[
         | 
| 295 | 
            +
                                "ar",
         | 
| 296 | 
            +
                                "ca",
         | 
| 297 | 
            +
                                "code",
         | 
| 298 | 
            +
                                "en",
         | 
| 299 | 
            +
                                "es",
         | 
| 300 | 
            +
                                "eu",
         | 
| 301 | 
            +
                                "fr",
         | 
| 302 | 
            +
                                "id",
         | 
| 303 | 
            +
                                "indic",
         | 
| 304 | 
            +
                                "nigercongo",
         | 
| 305 | 
            +
                                "pt",
         | 
| 306 | 
            +
                                "vi",
         | 
| 307 | 
            +
                                "zh",
         | 
| 308 | 
            +
                                "detect_language",
         | 
| 309 | 
            +
                                "all",
         | 
| 310 | 
            +
                            ],
         | 
| 311 | 
            +
                            value="en",
         | 
| 312 | 
            +
                            label="Language",
         | 
| 313 | 
            +
                        )
         | 
| 314 | 
            +
                        k = gr.Slider(
         | 
| 315 | 
            +
                            1,
         | 
| 316 | 
            +
                            100,
         | 
| 317 | 
            +
                            value=10,
         | 
| 318 | 
            +
                            step=1,
         | 
| 319 | 
            +
                            label="Max Results in fuzzy search or Max Results per page in exact search",
         | 
| 320 | 
            +
                        )
         | 
| 321 | 
            +
                    with gr.Row():
         | 
| 322 | 
            +
                        submit_btn = gr.Button("Submit")
         | 
| 323 | 
            +
                    with gr.Row(visible=False) as datasets_filter:
         | 
| 324 | 
            +
                        available_datasets = gr.Dropdown(
         | 
| 325 | 
            +
                            type="value",
         | 
| 326 | 
            +
                            choices=[],
         | 
| 327 | 
            +
                            value=[],
         | 
| 328 | 
            +
                            label="Datasets Filter",
         | 
| 329 | 
            +
                            multiselect=True,
         | 
| 330 | 
            +
                        )
         | 
| 331 | 
            +
                    with gr.Row():
         | 
| 332 | 
            +
                        result_page_html = gr.HTML(label="Results")
         | 
| 333 | 
            +
             | 
| 334 | 
            +
                    with gr.Row(visible=False) as pagination:
         | 
| 335 | 
            +
                        next_page_btn = gr.Button("Next Page")
         | 
| 336 | 
            +
             | 
| 337 | 
            +
                    def run_query(query, lang, k, dropdown_input, received_results):
                        """Normalise the query, call the backend and format one page.

                        A query wrapped in double quotes is treated as an exact search
                        (the quotes are removed); otherwise runs of whitespace are
                        collapsed to single spaces.

                        Args:
                            query: Raw text from the query box.
                            lang: Selected language option.
                            k: Number of results requested from the backend.
                            dropdown_input: Current dataset filter value; not read here.
                            received_results: Offset forwarded to the backend.

                        Returns:
                            A 6-tuple ``(processed_results, highlight_terms,
                            num_results, exact_search, result_page, datasets)``. On an
                            empty query or a backend error, ``processed_results`` is an
                            empty dict, ``num_results`` is 0, ``exact_search`` is
                            ``False`` and ``result_page`` holds the error HTML.
                        """
                        # Tolerate a missing textbox value instead of failing on .strip().
                        query = (query or "").strip()
                        exact_search = False
                        if query.startswith('"') and query.endswith('"') and len(query) >= 2:
                            exact_search = True
                            query = query[1:-1]
                        else:
                            query = " ".join(query.split())

                        # Error results use an empty dict (not a list) so that code
                        # calling .values()/.items() on the stored results keeps working.
                        if query == "":
                            return ({}, [], 0, False, no_query_error_message(), [])

                        payload = request_payload(query, lang, exact_search, k, received_results)
                        err = extract_error_from_payload(payload)
                        if err is not None:
                            return ({}, [], 0, False, process_error(err, payload), [])

                        (
                            processed_results,
                            highlight_terms,
                            num_results,
                            ds,
                        ) = extract_results_from_payload(
                            query,
                            lang,
                            payload,
                            exact_search,
                        )
                        result_page = format_result_page(
                            lang, processed_results, highlight_terms, num_results, exact_search
                        )
                        return (
                            processed_results,
                            highlight_terms,
                            num_results,
                            exact_search,
                            result_page,
                            ds,
                        )
         | 
| 389 | 
            +
             | 
| 390 | 
            +
                    def submit(query, lang, k, dropdown_input):
                        """Run a fresh search from offset 0 and build the UI updates.

                        Args:
                            query: Raw text from the query box.
                            lang: Selected language option.
                            k: Requested number of results (page size for exact search).
                            dropdown_input: Current dataset filter value, forwarded to
                                ``run_query``.

                        Returns:
                            A list matching the event's ``outputs``: processed results,
                            highlight terms, total match count, exact-search flag,
                            visibility update for the dataset filter row, dropdown
                            update with the datasets found, visibility update for the
                            pagination row, number of results received so far, and the
                            rendered results HTML.
                        """
                        print("submitting", query, lang, k)
                        (
                            processed_results,
                            highlight_terms,
                            num_results,
                            exact_search,
                            result_page,
                            datasets,
                        ) = run_query(query, lang, k, dropdown_input, 0)
                        has_more_results = exact_search and (num_results > k)
                        # Count hits across every language rather than only the first
                        # one, so the dataset filter is shown whenever anything was
                        # found. An empty container (error path) counts as zero.
                        current_results = (
                            sum(len(hits) for hits in processed_results.values())
                            if processed_results
                            else 0
                        )
                        return [
                            processed_results,
                            highlight_terms,
                            num_results,
                            exact_search,
                            gr.update(visible=current_results > 0),
                            gr.Dropdown.update(choices=datasets, value=datasets),
                            gr.update(visible=has_more_results),
                            current_results,
                            result_page,
                        ]
         | 
| 419 | 
            +
             | 
| 420 | 
            +
                    def next_page(
                        query,
                        lang,
                        k,
                        dropdown_input,
                        received_results,
                        processed_results,
                    ):
                        """Fetch the next page of results and build the UI updates.

                        Args:
                            query: Raw text from the query box.
                            lang: Selected language option.
                            k: Page size requested from the backend.
                            dropdown_input: Current dataset filter value, forwarded to
                                ``run_query``.
                            received_results: Number of results already received; used
                                as the offset for this request.
                            processed_results: Previously stored results; replaced by
                                the new page.

                        Returns:
                            A list matching the event's ``outputs``, in the same order
                            as ``submit``; the received-results entry is the previous
                            count plus the size of this page.
                        """
                        (
                            processed_results,
                            highlight_terms,
                            num_results,
                            exact_search,
                            result_page,
                            datasets,
                        ) = run_query(query, lang, k, dropdown_input, received_results)
                        # run_query may hand back an empty container on its error paths
                        # (empty query, backend error); treat that as zero hits instead
                        # of calling .values() on something that may not be a dict.
                        current_results = (
                            sum(len(results) for results in processed_results.values())
                            if processed_results
                            else 0
                        )
                        has_more_results = exact_search and (
                            received_results + current_results < num_results
                        )
                        print("received_results", received_results)
                        print("current_results", current_results)
                        print("has_more_results", has_more_results)
                        return [
                            processed_results,
                            highlight_terms,
                            num_results,
                            exact_search,
                            gr.update(visible=current_results > 0),
                            gr.Dropdown.update(choices=datasets, value=datasets),
                            # A short page means the backend has nothing further to give.
                            gr.update(visible=current_results >= k and has_more_results),
                            received_results + current_results,
                            result_page,
                        ]
         | 
| 458 | 
            +
             | 
| 459 | 
            +
                    def filter_datasets(
                        lang,
                        processed_results,
                        highlight_terms,
                        num_results,
                        exact_search,
                        datasets_filter,
                    ):
                        """Re-render the stored results using the selected dataset filter.

                        All arguments are forwarded unchanged to ``format_result_page``;
                        its HTML output is returned.
                        """
                        return format_result_page(
                            language=lang,
                            results=processed_results,
                            highlight_terms=highlight_terms,
                            num_results=num_results,
                            exact_search=exact_search,
                            datasets_filter=datasets_filter,
                        )
         | 
| 476 | 
            +
             | 
| 477 | 
            +
                    query.submit(
         | 
| 478 | 
            +
                        fn=submit,
         | 
| 479 | 
            +
                        inputs=[query, lang, k, available_datasets],
         | 
| 480 | 
            +
                        outputs=[
         | 
| 481 | 
            +
                            processed_results_state,
         | 
| 482 | 
            +
                            highlight_terms_state,
         | 
| 483 | 
            +
                            num_results_state,
         | 
| 484 | 
            +
                            exact_search_state,
         | 
| 485 | 
            +
                            datasets_filter,
         | 
| 486 | 
            +
                            available_datasets,
         | 
| 487 | 
            +
                            pagination,
         | 
| 488 | 
            +
                            received_results_state,
         | 
| 489 | 
            +
                            result_page_html,
         | 
| 490 | 
            +
                        ],
         | 
| 491 | 
             
                    )
         | 
| 492 | 
            +
                    submit_btn.click(
         | 
| 493 | 
            +
                        submit,
         | 
| 494 | 
            +
                        inputs=[query, lang, k, available_datasets],
         | 
| 495 | 
            +
                        outputs=[
         | 
| 496 | 
            +
                            processed_results_state,
         | 
| 497 | 
            +
                            highlight_terms_state,
         | 
| 498 | 
            +
                            num_results_state,
         | 
| 499 | 
            +
                            exact_search_state,
         | 
| 500 | 
            +
                            datasets_filter,
         | 
| 501 | 
            +
                            available_datasets,
         | 
| 502 | 
            +
                            pagination,
         | 
| 503 | 
            +
                            received_results_state,
         | 
| 504 | 
            +
                            result_page_html,
         | 
| 505 | 
            +
                        ],
         | 
| 506 | 
            +
                    )
         | 
| 507 | 
            +
             | 
| 508 | 
            +
                    next_page_btn.click(
         | 
| 509 | 
            +
                        next_page,
         | 
| 510 | 
            +
                        inputs=[
         | 
| 511 | 
            +
                            query,
         | 
| 512 | 
            +
                            lang,
         | 
| 513 | 
            +
                            k,
         | 
| 514 | 
            +
                            available_datasets,
         | 
| 515 | 
            +
                            received_results_state,
         | 
| 516 | 
            +
                            processed_results_state,
         | 
| 517 | 
            +
                        ],
         | 
| 518 | 
            +
                        outputs=[
         | 
| 519 | 
            +
                            processed_results_state,
         | 
| 520 | 
            +
                            highlight_terms_state,
         | 
| 521 | 
            +
                            num_results_state,
         | 
| 522 | 
            +
                            exact_search_state,
         | 
| 523 | 
            +
                            datasets_filter,
         | 
| 524 | 
            +
                            available_datasets,
         | 
| 525 | 
            +
                            pagination,
         | 
| 526 | 
            +
                            received_results_state,
         | 
| 527 | 
            +
                            result_page_html,
         | 
| 528 | 
            +
                        ],
         | 
| 529 | 
            +
                    )
         | 
| 530 | 
            +
             | 
| 531 | 
            +
                    available_datasets.change(
         | 
| 532 | 
            +
                        filter_datasets,
         | 
| 533 | 
            +
                        inputs=[
         | 
| 534 | 
            +
                            lang,
         | 
| 535 | 
            +
                            processed_results_state,
         | 
| 536 | 
            +
                            highlight_terms_state,
         | 
| 537 | 
            +
                            num_results_state,
         | 
| 538 | 
            +
                            exact_search_state,
         | 
| 539 | 
            +
                            available_datasets,
         | 
| 540 | 
            +
                        ],
         | 
| 541 | 
            +
                        outputs=result_page_html,
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 542 | 
             
                    )
         | 
| 543 | 
            +
                demo.launch(enable_queue=True, debug=True)
         | 
    	
        spaces.code-workspace
    DELETED
    
    | @@ -1,8 +0,0 @@ | |
| 1 | 
            -
            {
         | 
| 2 | 
            -
            	"folders": [
         | 
| 3 | 
            -
            		{
         | 
| 4 | 
            -
            			"path": ".."
         | 
| 5 | 
            -
            		}
         | 
| 6 | 
            -
            	],
         | 
| 7 | 
            -
            	"settings": {}
         | 
| 8 | 
            -
            }
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 

