Spaces:
				
			
			
	
			
			
		Runtime error
		
	
	
	
			
			
	
	
	
	
		
		
		Runtime error
		
	Update Tool
Browse files- .gitignore +1 -0
 - README.md +7 -7
 - app.py +307 -230
 
    	
        .gitignore
    ADDED
    
    | 
         @@ -0,0 +1 @@ 
     | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            *.code-workspace
         
     | 
    	
        README.md
    CHANGED
    
    | 
         @@ -1,13 +1,13 @@ 
     | 
|
| 1 | 
         
             
            ---
         
     | 
| 2 | 
         
            -
            title:  
     | 
| 3 | 
         
            -
            emoji:  
     | 
| 4 | 
         
            -
            colorFrom:  
     | 
| 5 | 
         
            -
            colorTo:  
     | 
| 6 | 
         
            -
            sdk:  
     | 
| 7 | 
         
            -
            sdk_version:  
     | 
| 8 | 
         
             
            app_file: app.py
         
     | 
| 9 | 
         
             
            pinned: false
         
     | 
| 10 | 
         
            -
            license: apache-2.0
         
     | 
| 11 | 
         
             
            ---
         
     | 
| 12 | 
         | 
| 13 | 
         
             
            Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
         
     | 
| 
         | 
| 
         | 
|
| 1 | 
         
             
            ---
         
     | 
| 2 | 
         
            +
            title: GΓA / gaia / gΓ¦a
         
     | 
| 3 | 
         
            +
            emoji: ππ
         
     | 
| 4 | 
         
            +
            colorFrom: blue
         
     | 
| 5 | 
         
            +
            colorTo: red
         
     | 
| 6 | 
         
            +
            sdk: streamlit
         
     | 
| 7 | 
         
            +
            sdk_version: 1.18.1
         
     | 
| 8 | 
         
             
            app_file: app.py
         
     | 
| 9 | 
         
             
            pinned: false
         
     | 
| 
         | 
|
| 10 | 
         
             
            ---
         
     | 
| 11 | 
         | 
| 12 | 
         
             
            Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
         
     | 
| 13 | 
         
            +
             
     | 
    	
        app.py
    CHANGED
    
    | 
         @@ -1,260 +1,337 @@ 
     | 
|
| 1 | 
         
            -
            import http.client as http_client
         
     | 
| 2 | 
         
             
            import json
         
     | 
| 3 | 
         
            -
            import logging
         
     | 
| 4 | 
         
             
            import os
         
     | 
| 5 | 
         
            -
            import  
     | 
| 6 | 
         
            -
            import string
         
     | 
| 7 | 
         
            -
            import traceback
         
     | 
| 8 | 
         | 
| 9 | 
         
            -
            import  
     | 
| 
         | 
|
| 10 | 
         
             
            import requests
         
     | 
| 11 | 
         
            -
            from huggingface_hub import HfApi
         
     | 
| 12 | 
         
            -
             
     | 
| 13 | 
         
            -
            hf_api = HfApi()
         
     | 
| 14 | 
         
            -
            roots_datasets = {dset.id.split("/")[-1]:dset for dset in hf_api.list_datasets(author="bigscience-data", use_auth_token=os.environ.get("bigscience_data_token"))}
         
     | 
| 15 | 
         
            -
             
     | 
| 16 | 
         
            -
            def get_docid_html(docid):
         
     | 
| 17 | 
         
            -
                data_org, dataset, docid = docid.split("/")
         
     | 
| 18 | 
         
            -
                metadata = roots_datasets[dataset]
         
     | 
| 19 | 
         
            -
                if metadata.private:
         
     | 
| 20 | 
         
            -
                    docid_html = (
         
     | 
| 21 | 
         
            -
                        f"<a "
         
     | 
| 22 | 
         
            -
                        f'class="underline-on-hover"'
         
     | 
| 23 | 
         
            -
                        f'title="This dataset is private. See the introductory text for more information"'
         
     | 
| 24 | 
         
            -
                        f'style="color:#AA4A44;"'
         
     | 
| 25 | 
         
            -
                        f'href="https://huggingface.co/datasets/bigscience-data/{dataset}"'
         
     | 
| 26 | 
         
            -
                        f'target="_blank"><b>π{dataset}</b></a><span style="color: #7978FF;">/{docid}</span>'
         
     | 
| 27 | 
         
            -
                    )
         
     | 
| 28 | 
         
            -
                else:
         
     | 
| 29 | 
         
            -
                    docid_html = (
         
     | 
| 30 | 
         
            -
                        f"<a "
         
     | 
| 31 | 
         
            -
                        f'class="underline-on-hover"'
         
     | 
| 32 | 
         
            -
                        f'title="This dataset is licensed {metadata.tags[0].split(":")[-1]}"'
         
     | 
| 33 | 
         
            -
                        f'style="color:#2D31FA;"'
         
     | 
| 34 | 
         
            -
                        f'href="https://huggingface.co/datasets/bigscience-data/{dataset}"'
         
     | 
| 35 | 
         
            -
                        f'target="_blank"><b>{dataset}</b></a><span style="color: #7978FF;">/{docid}</span>'
         
     | 
| 36 | 
         
            -
                    )        
         
     | 
| 37 | 
         
            -
                return docid_html
         
     | 
| 38 | 
         
            -
             
     | 
| 39 | 
         
            -
             
     | 
| 40 | 
         
            -
            PII_TAGS = {"KEY", "EMAIL", "USER", "IP_ADDRESS", "ID", "IPv4", "IPv6"}
         
     | 
| 41 | 
         
            -
            PII_PREFIX = "PI:"
         
     | 
| 42 | 
         
            -
             
     | 
| 43 | 
         
            -
             
     | 
| 44 | 
         
            -
            def process_pii(text):
         
     | 
| 45 | 
         
            -
                for tag in PII_TAGS:
         
     | 
| 46 | 
         
            -
                    text = text.replace(
         
     | 
| 47 | 
         
            -
                        PII_PREFIX + tag,
         
     | 
| 48 | 
         
            -
                        """<b><mark style="background: Fuchsia; color: Lime;">REDACTED {}</mark></b>""".format(tag),
         
     | 
| 49 | 
         
            -
                    )
         
     | 
| 50 | 
         
            -
                return text
         
     | 
| 51 | 
         | 
| 52 | 
         
            -
             
     | 
| 53 | 
         
            -
             
     | 
| 54 | 
         
            -
             
     | 
| 55 | 
         
            -
             
     | 
| 56 | 
         
            -
             
     | 
| 57 | 
         
            -
             
     | 
| 58 | 
         
            -
             
     | 
| 59 | 
         
            -
             
     | 
| 60 | 
         
            -
             
     | 
| 61 | 
         
            -
             
     | 
| 62 | 
         
            -
             
     | 
| 63 | 
         
            -
             
     | 
| 64 | 
         
            -
             
     | 
| 65 | 
         
            -
             
     | 
| 66 | 
         
            -
             
     | 
| 67 | 
         
            -
             
     | 
| 68 | 
         
            -
             
     | 
| 69 | 
         
            -
             
     | 
| 70 | 
         
            -
             
     | 
| 71 | 
         
            -
             
     | 
| 72 | 
         
            -
             
     | 
| 73 | 
         
            -
             
     | 
| 74 | 
         
            -
             
     | 
| 75 | 
         
            -
             
     | 
| 76 | 
         
            -
             
     | 
| 77 | 
         
            -
                     
     | 
| 78 | 
         
            -
                     
     | 
| 79 | 
         
            -
             
     | 
| 80 | 
         
            -
             
     | 
| 81 | 
         
            -
             
     | 
| 82 | 
         
            -
             
     | 
| 83 | 
         
            -
             
     | 
| 84 | 
         
            -
             
     | 
| 85 | 
         
            -
             
     | 
| 86 | 
         
            -
             
     | 
| 87 | 
         
            -
                 
     | 
| 88 | 
         
            -
             
     | 
| 89 | 
         
            -
             
     | 
| 90 | 
         
            -
             
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 91 | 
         
             
                try:
         
     | 
| 92 | 
         
            -
                    query  
     | 
| 
         | 
|
| 93 | 
         
             
                    if query == "" or query is None:
         
     | 
| 94 | 
         
            -
                        return 
     | 
| 95 | 
         | 
| 96 | 
         
            -
                    post_data = {"query": query, "k": num_results}
         
     | 
| 97 | 
         
            -
                     
     | 
| 98 | 
         
            -
                         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 99 | 
         | 
| 100 | 
         
             
                    output = requests.post(
         
     | 
| 101 | 
         
            -
                         
     | 
| 102 | 
         
             
                        headers={"Content-type": "application/json"},
         
     | 
| 103 | 
         
             
                        data=json.dumps(post_data),
         
     | 
| 104 | 
         
             
                        timeout=60,
         
     | 
| 105 | 
         
             
                    )
         
     | 
| 106 | 
         | 
| 107 | 
         
             
                    payload = json.loads(output.text)
         
     | 
| 108 | 
         
            -
             
     | 
| 109 | 
         
            -
                    if "err" in payload:
         
     | 
| 110 | 
         
            -
                        if payload["err"]["type"] == "unsupported_lang":
         
     | 
| 111 | 
         
            -
                            detected_lang = payload["err"]["meta"]["detected_lang"]
         
     | 
| 112 | 
         
            -
                            return f"""
         
     | 
| 113 | 
         
            -
                                <p style='font-size:18px; font-family: Arial; color:MediumVioletRed; text-align: center;'>
         
     | 
| 114 | 
         
            -
                                Detected language <b>{detected_lang}</b> is not supported.<br>
         
     | 
| 115 | 
         
            -
                                Please choose a language from the dropdown or type another query.
         
     | 
| 116 | 
         
            -
                                </p><br><hr><br>"""
         
     | 
| 117 | 
         
            -
             
     | 
| 118 | 
         
            -
                    results = payload["results"]
         
     | 
| 119 | 
         
            -
                    highlight_terms = payload["highlight_terms"]
         
     | 
| 120 | 
         
            -
             
     | 
| 121 | 
         
            -
                    if language == "detect_language":
         
     | 
| 122 | 
         
            -
                        results = list(results.values())[0]
         
     | 
| 123 | 
         
            -
                        return (
         
     | 
| 124 | 
         
            -
                            (
         
     | 
| 125 | 
         
            -
                                f"""<p style='font-family: Arial; color:MediumAquaMarine; text-align: center; line-height: 3em'>
         
     | 
| 126 | 
         
            -
                            Detected language: <b>{results[0]["lang"]}</b></p><br><hr><br>"""
         
     | 
| 127 | 
         
            -
                                if len(results) > 0 and language == "detect_language"
         
     | 
| 128 | 
         
            -
                                else ""
         
     | 
| 129 | 
         
            -
                            )
         
     | 
| 130 | 
         
            -
                            + process_results(results, highlight_terms)
         
     | 
| 131 | 
         
            -
                        )
         
     | 
| 132 | 
         
            -
             
     | 
| 133 | 
         
            -
                    if language == "all":
         
     | 
| 134 | 
         
            -
                        results_html = ""
         
     | 
| 135 | 
         
            -
                        for lang, results_for_lang in results.items():
         
     | 
| 136 | 
         
            -
                            if len(results_for_lang) == 0:
         
     | 
| 137 | 
         
            -
                                results_html += f"""<p style='font-family: Arial; color:Silver; text-align: left; line-height: 3em'>
         
     | 
| 138 | 
         
            -
                                        No results for language: <b>{lang}</b><hr></p>"""
         
     | 
| 139 | 
         
            -
                                continue
         
     | 
| 140 | 
         
            -
             
     | 
| 141 | 
         
            -
                            collapsible_results = f"""
         
     | 
| 142 | 
         
            -
                                <details>
         
     | 
| 143 | 
         
            -
                                    <summary style='font-family: Arial; color:MediumAquaMarine; text-align: left; line-height: 3em'>
         
     | 
| 144 | 
         
            -
                                        Results for language: <b>{lang}</b><hr>
         
     | 
| 145 | 
         
            -
                                    </summary>
         
     | 
| 146 | 
         
            -
                                    {process_results(results_for_lang, highlight_terms)}
         
     | 
| 147 | 
         
            -
                                </details>"""
         
     | 
| 148 | 
         
            -
                            results_html += collapsible_results
         
     | 
| 149 | 
         
            -
                        return results_html
         
     | 
| 150 | 
         
            -
             
     | 
| 151 | 
         
            -
                    results = list(results.values())[0]
         
     | 
| 152 | 
         
            -
                    return process_results(results, highlight_terms)
         
     | 
| 153 | 
         | 
| 154 | 
         
             
                except Exception as e:
         
     | 
| 155 | 
         
            -
                    results_html = f"""
         
     | 
| 156 | 
         
            -
                            <p style='font-size:18px; font-family: Arial; color:MediumVioletRed; text-align: center;'>
         
     | 
| 157 | 
         
            -
                            Raised {type(e).__name__}</p>
         
     | 
| 158 | 
         
            -
                            <p style='font-size:14px; font-family: Arial; '>
         
     | 
| 159 | 
         
            -
                            Check if a relevant discussion already exists in the Community tab. If not, please open a discussion.
         
     | 
| 160 | 
         
            -
                            </p>
         
     | 
| 161 | 
         
            -
                        """
         
     | 
| 162 | 
         
             
                    print(e)
         
     | 
| 163 | 
         
            -
                    print(traceback.format_exc())
         
     | 
| 164 | 
         | 
| 165 | 
         
            -
                return results_html
         
     | 
| 166 | 
         | 
| 
         | 
|
| 
         | 
|
| 167 | 
         | 
| 168 | 
         
            -
            def flag(query, language, num_results, issue_description):
         
     | 
| 169 | 
         
            -
                try:
         
     | 
| 170 | 
         
            -
                    post_data = {"query": query, "k": num_results, "flag": True, "description": issue_description}
         
     | 
| 171 | 
         
            -
                    if language != "detect_language":
         
     | 
| 172 | 
         
            -
                        post_data["lang"] = language
         
     | 
| 173 | 
         | 
| 174 | 
         
            -
             
     | 
| 175 | 
         
            -
             
     | 
| 176 | 
         
            -
             
     | 
| 177 | 
         
            -
                         
     | 
| 178 | 
         
            -
                         
     | 
| 
         | 
|
| 
         | 
|
| 179 | 
         
             
                    )
         
     | 
| 180 | 
         
            -
             
     | 
| 181 | 
         
            -
                    results = json.loads(output.text)
         
     | 
| 182 | 
         
            -
                except:
         
     | 
| 183 | 
         
            -
                    print("Error flagging")
         
     | 
| 184 | 
         
            -
                return ""
         
     | 
| 185 | 
         | 
| 186 | 
         | 
| 187 | 
         
            -
             
     | 
| 188 | 
         
            -
             
     | 
| 189 | 
         
            -
             
     | 
| 190 | 
         
            -
             
     | 
| 191 | 
         
            -
             
     | 
| 192 | 
         
            -
             
     | 
| 193 | 
         
            -
             
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 194 | 
         | 
| 195 | 
         | 
| 196 | 
         
            -
             
     | 
| 197 | 
         
            -
                 
     | 
| 198 | 
         
            -
                    css=".underline-on-hover:hover { text-decoration: underline; } .flagging { font-size:12px; color:Silver; }"
         
     | 
| 199 | 
         
            -
                )
         
     | 
| 200 | 
         | 
| 201 | 
         
            -
                 
     | 
| 202 | 
         
            -
                     
     | 
| 203 | 
         
            -
             
     | 
| 204 | 
         
            -
             
     | 
| 205 | 
         
            -
                         
     | 
| 206 | 
         
            -
             
     | 
| 207 | 
         
            -
             
     | 
| 208 | 
         
            -
             
     | 
| 209 | 
         
            -
             
     | 
| 210 | 
         
            -
             
     | 
| 211 | 
         
            -
             
     | 
| 212 | 
         
            -
             
     | 
| 213 | 
         
            -
             
     | 
| 214 | 
         
            -
             
     | 
| 215 | 
         
            -
             
     | 
| 216 | 
         
            -
             
     | 
| 217 | 
         
            -
                                 
     | 
| 218 | 
         
            -
             
     | 
| 219 | 
         
            -
                                 
     | 
| 220 | 
         
            -
                                 
     | 
| 221 | 
         
            -
             
     | 
| 222 | 
         
            -
             
     | 
| 223 | 
         
            -
             
     | 
| 224 | 
         
            -
             
     | 
| 225 | 
         
            -
             
     | 
| 226 | 
         
            -
             
     | 
| 227 | 
         
            -
                        )
         
     | 
| 228 | 
         
            -
             
     | 
| 229 | 
         
            -
             
     | 
| 230 | 
         
            -
             
     | 
| 231 | 
         
            -
             
     | 
| 232 | 
         
            -
             
     | 
| 233 | 
         
            -
                         
     | 
| 234 | 
         
            -
             
     | 
| 235 | 
         
            -
             
     | 
| 236 | 
         
            -
             
     | 
| 237 | 
         
            -
             
     | 
| 238 | 
         
            -
             
     | 
| 239 | 
         
            -
                         
     | 
| 240 | 
         
            -
                             
     | 
| 241 | 
         
            -
                             
     | 
| 242 | 
         
            -
             
     | 
| 243 | 
         
            -
             
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 244 | 
         
             
                        )
         
     | 
| 245 | 
         
            -
             
     | 
| 246 | 
         
            -
             
     | 
| 247 | 
         
            -
             
     | 
| 248 | 
         
            -
             
     | 
| 249 | 
         
            -
             
     | 
| 250 | 
         
            -
             
     | 
| 251 | 
         
            -
             
     | 
| 252 | 
         
            -
             
     | 
| 253 | 
         
            -
             
     | 
| 254 | 
         
            -
             
     | 
| 255 | 
         
            -
                         
     | 
| 256 | 
         
            -
             
     | 
| 257 | 
         
            -
             
     | 
| 258 | 
         
            -
                     
     | 
| 259 | 
         
            -
             
     | 
| 260 | 
         
            -
             
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 
         | 
|
| 1 | 
         
             
            import json
         
     | 
| 
         | 
|
| 2 | 
         
             
            import os
         
     | 
| 3 | 
         
            +
            import pprint
         
     | 
| 
         | 
|
| 
         | 
|
| 4 | 
         | 
| 5 | 
         
            +
            import streamlit as st
         
     | 
| 6 | 
         
            +
            import streamlit.components.v1 as components
         
     | 
| 7 | 
         
             
            import requests
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 8 | 
         | 
| 9 | 
         
            +
            from typing import Union
         
     | 
| 10 | 
         
            +
             
     | 
| 11 | 
         
            +
            pp = pprint.PrettyPrinter(indent=2)
         
     | 
| 12 | 
         
            +
             
     | 
| 13 | 
         
            +
            os.environ["address"] = "http://34.79.83.149:8080"
         
     | 
| 14 | 
         
            +
             
     | 
| 15 | 
         
            +
            st.set_page_config(page_title="Gaia Search ππ", layout="wide")
         
     | 
| 16 | 
         
            +
             
     | 
| 17 | 
         
            +
            os.makedirs(os.path.join(os.getcwd(), ".streamlit"), exist_ok=True)
         
     | 
| 18 | 
         
            +
            with open(os.path.join(os.getcwd(), ".streamlit/config.toml"), "w") as file:
         
     | 
| 19 | 
         
            +
                file.write('[theme]\nbase="light"')
         
     | 
| 20 | 
         
            +
             
     | 
| 21 | 
         
            +
             
     | 
| 22 | 
         
            +
            corpus_name_map = {
         
     | 
| 23 | 
         
            +
                "LAION": "laion",
         
     | 
| 24 | 
         
            +
                "ROOTS": "roots",
         
     | 
| 25 | 
         
            +
                "The Pile": "pile",
         
     | 
| 26 | 
         
            +
                "C4": "c4",
         
     | 
| 27 | 
         
            +
            }
         
     | 
| 28 | 
         
            +
             
     | 
| 29 | 
         
            +
            st.sidebar.markdown(
         
     | 
| 30 | 
         
            +
                """
         
     | 
| 31 | 
         
            +
                <style>
         
     | 
| 32 | 
         
            +
                .aligncenter {
         
     | 
| 33 | 
         
            +
                    text-align: center;
         
     | 
| 34 | 
         
            +
                    font-weight: bold;
         
     | 
| 35 | 
         
            +
                    font-size: 36px;
         
     | 
| 36 | 
         
            +
                }
         
     | 
| 37 | 
         
            +
                </style>
         
     | 
| 38 | 
         
            +
                <p class="aligncenter">Gaia Search ππ</p>
         
     | 
| 39 | 
         
            +
                <p>A search engine for large scale texual
         
     | 
| 40 | 
         
            +
                corpora. Most of the datasets included in the tool are based on Common
         
     | 
| 41 | 
         
            +
                Crawl. By using the tool, you are also bound by the Common Crawl terms
         
     | 
| 42 | 
         
            +
                of use in respect of the content contained in the datasets.
         
     | 
| 43 | 
         
            +
                </p>
         
     | 
| 44 | 
         
            +
                """,
         
     | 
| 45 | 
         
            +
                unsafe_allow_html=True,
         
     | 
| 46 | 
         
            +
            )
         
     | 
| 47 | 
         
            +
             
     | 
| 48 | 
         
            +
            st.sidebar.markdown(
         
     | 
| 49 | 
         
            +
                """
         
     | 
| 50 | 
         
            +
                <style>
         
     | 
| 51 | 
         
            +
                .aligncenter {
         
     | 
| 52 | 
         
            +
                    text-align: center;
         
     | 
| 53 | 
         
            +
                }
         
     | 
| 54 | 
         
            +
                </style>
         
     | 
| 55 | 
         
            +
                <p style='text-align: center'>
         
     | 
| 56 | 
         
            +
                <a href="" style="color:#7978FF;">GitHub</a> | <a href="" style="color:#7978FF;" >Project Report</a> | <a href="" style="color:#7978FF;" >Colab</a> 
         
     | 
| 57 | 
         
            +
                </p>
         
     | 
| 58 | 
         
            +
                """,
         
     | 
| 59 | 
         
            +
                unsafe_allow_html=True,
         
     | 
| 60 | 
         
            +
            )
         
     | 
| 61 | 
         
            +
             
     | 
| 62 | 
         
            +
            # <p class="aligncenter">
         
     | 
| 63 | 
         
            +
            #     <a href="" target="_blank">
         
     | 
| 64 | 
         
            +
            #         <img src="https://colab.research.google.com/assets/colab-badge.svg"/>
         
     | 
| 65 | 
         
            +
            #     </a>
         
     | 
| 66 | 
         
            +
            # </p>
         
     | 
| 67 | 
         
            +
             
     | 
| 68 | 
         
            +
             
     | 
| 69 | 
         
            +
            # Sidebar search controls, created in display order:
            #   query       - free-text search string
            #   corpus      - one of the keys of corpus_name_map
            #   max_results - number of documents to request (1..100, default 10)
            query = st.sidebar.text_input("Query", placeholder="Type your query here")
            corpus = st.sidebar.selectbox(
                label="Corpus",
                options=tuple(corpus_name_map.keys()),
                index=2,  # preselects the third key of corpus_name_map
            )
            max_results = st.sidebar.slider(
                label="Max Results",
                value=10,
                min_value=1,
                max_value=100,
                step=1,
                help="Max Number of Documents to return",
            )
         
     | 
| 83 | 
         
            +
             
     | 
| 84 | 
         
            +
            # dark_mode_toggle = """
         
     | 
| 85 | 
         
            +
            #     <script>
         
     | 
| 86 | 
         
            +
            #         function load_image(id){
         
     | 
| 87 | 
         
            +
            #             console.log(id)
         
     | 
| 88 | 
         
            +
            #             var x = document.getElementById(id);
         
     | 
| 89 | 
         
            +
            #             console.log(x)
         
     | 
| 90 | 
         
            +
            #             if (x.style.display === "none") {
         
     | 
| 91 | 
         
            +
            #                 x.style.display = "block";
         
     | 
| 92 | 
         
            +
            #             } else {
         
     | 
| 93 | 
         
            +
            #                 x.style.display = "none";
         
     | 
| 94 | 
         
            +
            #             }
         
     | 
| 95 | 
         
            +
            #         };
         
     | 
| 96 | 
         
            +
            #         function myFunction() {
         
     | 
| 97 | 
         
            +
            #         var element = document.body;
         
     | 
| 98 | 
         
            +
            #         element.classList.toggle("dark-mode");
         
     | 
| 99 | 
         
            +
            #         }
         
     | 
| 100 | 
         
            +
            #     </script>
         
     | 
| 101 | 
         
            +
            #     <button onclick="myFunction()">Toggle dark mode</button>
         
     | 
| 102 | 
         
            +
            # """
         
     | 
| 103 | 
         
            +
            # st.sidebar.markdown(dark_mode_toggle, unsafe_allow_html=True)
         
     | 
| 104 | 
         
            +
             
     | 
| 105 | 
         
            +
             
     | 
| 106 | 
         
            +
            # Footer crediting the libraries behind the app. The CSS positions it fixed at the
            # bottom-left corner at full width; it is injected through the sidebar as raw HTML.
            # NOTE(review): the two emoji were restored from mis-encoded bytes. The HuggingFace
            # one is the hugging face; the Pyserini one is assumed to be the duck - confirm.
            footer = """
                <style>
                    .footer {
                        position: fixed;
                        left: 0;
                        bottom: 0;
                        width: 100%;
                        background-color: white;
                        color: black;
                        text-align: center;
                    }
                </style>
                <div class="footer">
                <p>Powered by <a href="https://huggingface.co/" >HuggingFace 🤗</a> and <a href="https://github.com/castorini/pyserini" >Pyserini 🦆</a></p>
                </div>
            """
            st.sidebar.markdown(footer, unsafe_allow_html=True)
         
     | 
| 123 | 
         
            +
             
     | 
| 124 | 
         
            +
             
     | 
| 125 | 
         
            +
            def scisearch(query, corpus, num_results=10):
                """Send a search request to the backend and return ``(results, highlight_terms)``.

                Args:
                    query: Search string; surrounding whitespace is stripped before sending.
                    corpus: Backend corpus identifier. ``"roots"`` is sent to a fixed address,
                        every other corpus to the URL in the ``address`` environment variable.
                    num_results: Value sent as ``k`` in the request payload.

                Returns:
                    A 2-tuple ``(results, highlight_terms)`` taken from the JSON response. When
                    the query is empty/``None`` or the request fails, an empty pair is returned
                    instead of ``None`` so callers can always unpack the result: ``({}, [])``
                    for ``"roots"`` (whose results are consumed as a mapping) and ``([], [])``
                    otherwise.
                """
                # Empty result shaped like a real one for the requested corpus.
                no_hits = ({} if corpus == "roots" else [], [])
                try:
                    print(query, corpus, num_results)
                    # Test for None before stripping; stripping first made the None test
                    # unreachable and sent a None query straight into the except handler.
                    if query is None:
                        return no_hits
                    query = query.strip()
                    if not query:
                        return no_hits

                    post_data = {"query": query, "corpus": corpus, "k": num_results, "lang": "all"}
                    address = (
                        os.environ.get("address")
                        if corpus != "roots"
                        else "http://34.116.206.238:8080"
                    )

                    output = requests.post(
                        address,
                        headers={"Content-type": "application/json"},
                        data=json.dumps(post_data),
                        timeout=60,
                    )

                    payload = json.loads(output.text)
                    return payload["results"], payload["highlight_terms"]

                except Exception as e:
                    # Best-effort search: report the problem and fall back to an empty result
                    # rather than returning None, which the caller cannot unpack.
                    print(e)
                    return no_hits
         
     | 
| 
         | 
|
| 151 | 
         | 
| 
         | 
|
| 152 | 
         | 
| 153 | 
         
            +
            # Tag names that can follow PII_PREFIX in a document where personal data was masked.
            PII_TAGS = {"KEY", "EMAIL", "USER", "IP_ADDRESS", "ID", "IPv4", "IPv6"}
            PII_PREFIX = "PI:"


            def process_pii(text):
                """Replace every ``PI:<TAG>`` marker in ``text`` with a highlighted HTML badge.

                Each known tag is substituted by a bold ``<mark>`` element reading
                ``REDACTED <TAG>``. Text without markers is returned unchanged.
                """
                redacted = text
                for label in PII_TAGS:
                    marker = f"{PII_PREFIX}{label}"
                    badge = f"""<b><mark style="background: Fuchsia; color: Lime;">REDACTED {label}</mark></b>"""
                    redacted = redacted.replace(marker, badge)
                return redacted
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 166 | 
         | 
| 167 | 
         | 
| 168 | 
         
            +
            def highlight_string(paragraph: str, highlight_terms: list) -> str:
                """Bold the whitespace-separated tokens of ``paragraph`` found in ``highlight_terms``.

                The paragraph is split on whitespace, matching tokens are wrapped in ``<b>`` tags,
                and the tokens are re-joined with single spaces. The result is then passed through
                ``process_pii`` so PII markers are rendered as redaction badges.
                """
                marked_up = " ".join(
                    f"<b>{word}</b>" if word in highlight_terms else word
                    for word in paragraph.split()
                )
                return process_pii(marked_up)
         
     | 
| 178 | 
         
            +
             
     | 
| 179 | 
         
            +
             
     | 
| 180 | 
         
            +
            def extract_lang_from_docid(docid):
                """Return the second underscore-separated field of ``docid``.

                ``format_result`` displays this field as the document's language. Raises
                ``IndexError`` when ``docid`` contains no underscore.
                """
                # Only the first two separators matter for picking out field 1.
                fields = docid.split("_", 2)
                return fields[1]
         
     | 
| 182 | 
         
            +
             
     | 
| 183 | 
         
            +
             
     | 
| 184 | 
         
            +
            def format_result(result, highlight_terms):
                """Render one search result as an HTML paragraph.

                Reads ``result["text"]`` and ``result["docid"]``, highlights the text with
                ``highlight_string``, derives the language label from the docid, and returns a
                ``<p>`` element showing the language, the document id and the highlighted text.
                """
                raw_text = result["text"]
                doc_id = result["docid"]
                body_html = highlight_string(raw_text, highlight_terms)
                lang = extract_lang_from_docid(doc_id)
                snippet = f"""
                    <span style='font-size:14px; font-family: Arial; color:MediumAquaMarine'>Language: {lang} | </span>
                    <span style='font-size:14px; font-family: Arial; color:#7978FF; text-align: left;'>Document ID: {doc_id} | </span><br>
                    <span style='font-family: Arial;'>{body_html}</span><br>
                    <br>
                """
                return f"<p>{snippet}</p>"
         
     | 
| 198 | 
         | 
| 199 | 
         | 
| 200 | 
         
            +
            def process_results(corpus: str, hits: Union[list, dict], highlight_terms: list) -> str:
                """Render search hits as an HTML fragment.

                For the ``"roots"`` corpus, ``hits`` is iterated with ``.items()`` as a mapping of
                language to a list of results. Languages without results get a short "No results"
                line; the others get a collapsible ``<details>`` section built with
                ``format_result``.

                For every other corpus, ``hits`` is iterated as a sequence of dicts providing
                ``"docid"``, ``"score"`` and ``"text"``. ``"laion"`` hits additionally list the
                URLs found under ``hit["meta"]["docs"]`` as links.

                Args:
                    corpus: Backend corpus identifier selecting the layout.
                    hits: Search results as described above.
                    highlight_terms: Terms to embolden in the result text.

                Returns:
                    The rendered HTML; an empty string when there are no hits.
                """
                import html  # local import: used to escape URLs placed inside markup

                if corpus == "roots":
                    sections = []
                    for lang, results_for_lang in hits.items():
                        print("Processing language", lang)
                        if not results_for_lang:
                            sections.append(
                                """<div style='font-family: Arial; color:Silver; text-align: left; line-height: 3em'>
                                No results for language: <b>{}</b></div>""".format(
                                    lang
                                )
                            )
                            continue
                        formatted = "".join(
                            format_result(result, highlight_terms) for result in results_for_lang
                        )
                        sections.append(
                            f"""
                            <details>
                                <summary style='font-family: Arial; color:MediumAquaMarine; text-align: left; line-height: 3em'>
                                    Results for language: <b>{lang}</b>
                                </summary>
                                {formatted}
                            </details>"""
                        )
                    return "".join(sections)

                rendered_hits = []
                for hit in hits:
                    parts = [
                        f"""
                        <p class="searchresult" style="color: #7978FF;">Document ID: {hit['docid']} | Score: {round(hit['score'], 2)}</p>
                        """
                    ]
                    if corpus == "laion":
                        parts.append(
                            f"""
                            <p style="color: #7978FF;">Caption:</p>
                            <p>{highlight_string(hit['text'], highlight_terms)}</p>
                        """
                        )
                        meta = hit["meta"] if "meta" in hit else None
                        if meta is not None and "docs" in meta and len(meta["docs"]) > 0:
                            parts.append("""<p style="color: #7978FF;"> Image links:</p><ul>""")
                            for subhit in meta["docs"]:
                                # Quote and escape the URL: an unquoted href breaks on spaces or
                                # ">" and lets crawled data inject markup.
                                url = html.escape(str(subhit["URL"]), quote=True)
                                parts.append(
                                    f"""<li><a href="{url}" target="_blank" style="color:#ffcdf8; ">{url}</a></li>"""
                                )
                            parts.append("</ul>")
                        parts.append("<hr>")
                    else:
                        # No closing </div> here: nothing in a hit opens one, and a stray one
                        # would close the wrapper the caller puts around all results.
                        parts.append(
                            f"""<p>{highlight_string(hit['text'], highlight_terms)}</p><hr>"""
                        )
                    rendered_hits.append("".join(parts))
                return " ".join(rendered_hits)
         
     | 
| 253 | 
         
            +
             
     | 
| 254 | 
         
            +
             
     | 
| 255 | 
         
            +
            submit_button = st.sidebar.button("Search", type="primary")

            # Run a search when the button was pressed or the query box holds text.
            if submit_button or query:
                query = query.strip()
                if not query:
                    components.html(
                        """<p style='font-size:18px; font-family: Arial; color:MediumVioletRed; text-align: center;'>
                        Please provide a non-empty query.
                        </p><br><hr><br>"""
                    )
                else:
                    search_output = scisearch(query, corpus_name_map[corpus], max_results)
                    if search_output is None:
                        # Guard against scisearch yielding no (hits, highlight_terms) pair;
                        # unpacking None would abort the script with a TypeError.
                        components.html(
                            """<p style='font-size:18px; font-family: Arial; color:MediumVioletRed; text-align: center;'>
                            The search request failed. Please try again later.
                            </p><br><hr><br>"""
                        )
                    else:
                        hits, highlight_terms = search_output
                        html_results = process_results(corpus_name_map[corpus], hits, highlight_terms)
                        rendered_results = f"""
                            <div id="searchresultsarea">
                                <br>
                                <p id="searchresultsnumber">About {max_results} results</p>
                                {html_results}
                             </div>"""
                        # st.markdown(
                        #     """
                        #     <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.0.2/dist/css/bootstrap.min.css" rel="stylesheet"
                        #         integrity="sha384-EVSTQN3/azprG1Anm3QDgpJLIm9Nao0Yz1ztcQTwFspd3yD65VohhpuuCOmLASjC" crossorigin="anonymous">
                        #     """,
                        #     unsafe_allow_html=True,
                        # )
                        # st.markdown(
                        #     """
                        #     <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css">
                        #     """,
                        #     unsafe_allow_html=True,
                        # )
                        # st.markdown(
                        #     f"""
                        #     <div class="row no-gutters mt-3 align-items-center">
                        #         Gaia Search ππ
                        #         <div class="col col-md-4">
                        #             <input class="form-control border-secondary rounded-pill pr-5" type="search" value="{query}" id="example-search-input2">
                        #         </div>
                        #         <div class="col-auto">
                        #             <button class="btn btn-outline-light text-dark border-0 rounded-pill ml-n5" type="button">
                        #                 <i class="fa fa-search"></i>
                        #             </button>
                        #         </div>
                        #     </div>
                        #     """,
                        #     unsafe_allow_html=True,
                        # )
                        # .bk-root{position:relative;width:auto;height:auto;box-sizing:border-box;font-family:Helvetica, Arial, sans-serif;font-size:13px;}.bk-root .bk,.bk-root .bk:before,.bk-root .bk:after{box-sizing:inherit;margin:0;border:0;padding:0;background-image:none;font-family:inherit;font-size:100%;line-height:1.42857143;}.bk-root pre.bk{font-family:Courier, monospace;}
                        # The stylesheet is sent together with the results because both are
                        # passed to components.html as one document.
                        components.html(
                            """
                            <head>
                            <link href='https://fonts.googleapis.com/css?family=Source+Sans+Pro' rel='stylesheet' type='text/css'>
                            </head>
                            <style>
                                #searchresultsarea {
                                    font-family: "Source Sans Pro", sans-serif;
                                }
                                #searchresultsnumber {
                                    font-size: 0.8rem;
                                    color: gray;
                                }
                                .searchresult h2 {
                                    font-size: 19px;
                                    line-height: 18px;
                                    font-weight: normal;
                                    color: rgb(7, 111, 222);
                                    margin-bottom: 0px;
                                    margin-top: 25px;
                                    color: #7978FF;
                                }
                                .searchresult a {
                                    font-size: 12px;
                                    line-height: 12px;
                                    color: green;
                                    margin-bottom: 0px;
                                }
                            </style>
                            """
                            + rendered_results,
                            height=800,
                            scrolling=True,
                        )
         
     |