Spaces:

mantisnlp
/

SearchMesh

Sleeping

App Files Community

Nick Sorros committed on Jun 23, 2022

Commit

b493a01

1 Parent(s): cacf814

Tag more grants and implement most common

Browse files

Files changed (4) hide show

app.py +18 -11
preprocess.py +8 -1
tag.py +2 -2
tagged_grants.jsonl +0 -0

app.py CHANGED Viewed

@@ -1,39 +1,46 @@
 import streamlit as st
 import srsly
 def search(query):
     results = []
     for grant in grants:
         if query in grant["tags"]:
-            results.append({
-                "title": grant["title"],
-                "tags": grant["tags"]
-            })
     st.session_state["results"] = results
 st.header("Search 🔎 grants using MeSH 🔖")
 st.sidebar.header("Information ℹ")
-st.sidebar.write("A complete list of MeSH tags can be found here https://meshb.nlm.nih.gov/treeView")
 st.sidebar.write("The grants data can be found https://www.threesixtygiving.org/")
-st.sidebar.write("The model used to tag grants is https://huggingface.co/Wellcome/WellcomeBertMesh")
 if "grants" not in st.session_state:
     st.session_state["grants"] = list(srsly.read_jsonl("tagged_grants.jsonl"))
 grants = st.session_state["grants"]
-query = st.text_input("", value="Humans")
 st.button("Search 🔎", on_click=search, kwargs={"query": query})
 if "results" in st.session_state:
     st.caption("Related MeSH terms")
-    unique_tags = list(set(list([tag for res in st.session_state["results"] for tag in res["tags"]])))
     columns = st.columns(5)
     for row_i in range(3):
         for col_i, col in enumerate(columns):
             with col:
-                tag_i = row_i*5 + col_i
-                if tag_i < len(unique_tags):
-                    tag = unique_tags[tag_i]
                     st.button(tag, on_click=search, kwargs={"query": tag})
     st.table(st.session_state["results"])

+from collections import Counter
 import streamlit as st
 import srsly
 def search(query):
     results = []
     for grant in grants:
         if query in grant["tags"]:
+            results.append({"title": grant["title"], "tags": grant["tags"]})
     st.session_state["results"] = results
 st.header("Search 🔎 grants using MeSH 🔖")
 st.sidebar.header("Information ℹ")
+st.sidebar.write(
+    "A complete list of MeSH tags can be found here https://meshb.nlm.nih.gov/treeView"
+)
 st.sidebar.write("The grants data can be found https://www.threesixtygiving.org/")
+st.sidebar.write(
+    "The model used to tag grants is https://huggingface.co/Wellcome/WellcomeBertMesh"
+)
 if "grants" not in st.session_state:
     st.session_state["grants"] = list(srsly.read_jsonl("tagged_grants.jsonl"))
 grants = st.session_state["grants"]
+query = st.text_input("", value="Malaria")
 st.button("Search 🔎", on_click=search, kwargs={"query": query})
 if "results" in st.session_state:
     st.caption("Related MeSH terms")
+    retrieved_tags = [tag for res in st.session_state["results"] for tag in res["tags"]]
+    most_common_tags = [tag for tag, _ in Counter(retrieved_tags).most_common(20)]
     columns = st.columns(5)
     for row_i in range(3):
         for col_i, col in enumerate(columns):
             with col:
+                tag_i = row_i * 5 + col_i
+                if tag_i < len(most_common_tags):
+                    tag = most_common_tags[tag_i]
                     st.button(tag, on_click=search, kwargs={"query": tag})
     st.table(st.session_state["results"])

preprocess.py CHANGED Viewed

@@ -3,14 +3,21 @@ import json
 from tqdm import tqdm
 import typer
 def preprocess(data_path, processed_data_path):
     with open(data_path) as f:
         data = json.loads(f.read())
     with open(processed_data_path, "w") as f:
         for grant in tqdm(data["grants"]):
-            if any([org["name"] == "The Wellcome Trust" for org in grant["fundingOrganization"]]):
                 f.write(json.dumps(grant) + "\n")
 if __name__ == "__main__":
     typer.run(preprocess)

 from tqdm import tqdm
 import typer
 def preprocess(data_path, processed_data_path):
     with open(data_path) as f:
         data = json.loads(f.read())
     with open(processed_data_path, "w") as f:
         for grant in tqdm(data["grants"]):
+            if any(
+                [
+                    org["name"] == "The Wellcome Trust"
+                    for org in grant["fundingOrganization"]
+                ]
+            ):
                 f.write(json.dumps(grant) + "\n")
 if __name__ == "__main__":
     typer.run(preprocess)

tag.py CHANGED Viewed

@@ -24,13 +24,13 @@ def tag(data_path, tagged_data_path, sample_size: int = 10):
     texts = [grant["title_and_description"] for grant in data]
     for batch_index in tqdm(range(0, len(texts), 10)):
-        batch_texts = texts[batch_index:batch_index+10]
         inputs = tokenizer(batch_texts, padding="max_length")
         labels = model(**inputs, return_labels=True)
         for i, tags in enumerate(labels):
-            data[batch_index+i]["tags"] = tags
     srsly.write_jsonl(tagged_data_path, data)

     texts = [grant["title_and_description"] for grant in data]
     for batch_index in tqdm(range(0, len(texts), 10)):
+        batch_texts = texts[batch_index : batch_index + 10]
         inputs = tokenizer(batch_texts, padding="max_length")
         labels = model(**inputs, return_labels=True)
         for i, tags in enumerate(labels):
+            data[batch_index + i]["tags"] = tags
     srsly.write_jsonl(tagged_data_path, data)

tagged_grants.jsonl CHANGED Viewed

The diff for this file is too large to render. See raw diff