Spaces:
Sleeping
Sleeping
Nick Sorros
committed on
Commit
·
b493a01
1
Parent(s):
cacf814
Tag more grants and implement most common
Browse files
- app.py +18 -11
- preprocess.py +8 -1
- tag.py +2 -2
- tagged_grants.jsonl +0 -0
app.py
CHANGED
|
@@ -1,39 +1,46 @@
|
|
|
|
|
| 1 |
import streamlit as st
|
| 2 |
import srsly
|
| 3 |
|
|
|
|
| 4 |
def search(query):
|
| 5 |
results = []
|
| 6 |
for grant in grants:
|
| 7 |
if query in grant["tags"]:
|
| 8 |
-
results.append({
|
| 9 |
-
"title": grant["title"],
|
| 10 |
-
"tags": grant["tags"]
|
| 11 |
-
})
|
| 12 |
st.session_state["results"] = results
|
| 13 |
|
|
|
|
| 14 |
st.header("Search π grants using MeSH π")
|
| 15 |
st.sidebar.header("Information ℹ")
|
| 16 |
-
st.sidebar.write(
|
|
|
|
|
|
|
| 17 |
st.sidebar.write("The grants data can be found https://www.threesixtygiving.org/")
|
| 18 |
-
st.sidebar.write(
|
|
|
|
|
|
|
| 19 |
|
| 20 |
if "grants" not in st.session_state:
|
| 21 |
st.session_state["grants"] = list(srsly.read_jsonl("tagged_grants.jsonl"))
|
| 22 |
|
| 23 |
grants = st.session_state["grants"]
|
| 24 |
|
| 25 |
-
query = st.text_input("", value="
|
| 26 |
st.button("Search π", on_click=search, kwargs={"query": query})
|
| 27 |
|
| 28 |
if "results" in st.session_state:
|
| 29 |
st.caption("Related MeSH terms")
|
| 30 |
-
|
|
|
|
|
|
|
|
|
|
| 31 |
columns = st.columns(5)
|
| 32 |
for row_i in range(3):
|
| 33 |
for col_i, col in enumerate(columns):
|
| 34 |
with col:
|
| 35 |
-
tag_i = row_i*5 + col_i
|
| 36 |
-
if tag_i < len(
|
| 37 |
-
tag =
|
| 38 |
st.button(tag, on_click=search, kwargs={"query": tag})
|
| 39 |
st.table(st.session_state["results"])
|
|
|
|
| 1 |
+
from collections import Counter
|
| 2 |
import streamlit as st
|
| 3 |
import srsly
|
| 4 |
|
| 5 |
+
|
| 6 |
def search(query):
|
| 7 |
results = []
|
| 8 |
for grant in grants:
|
| 9 |
if query in grant["tags"]:
|
| 10 |
+
results.append({"title": grant["title"], "tags": grant["tags"]})
|
|
|
|
|
|
|
|
|
|
| 11 |
st.session_state["results"] = results
|
| 12 |
|
| 13 |
+
|
| 14 |
st.header("Search π grants using MeSH π")
|
| 15 |
st.sidebar.header("Information ℹ")
|
| 16 |
+
st.sidebar.write(
|
| 17 |
+
"A complete list of MeSH tags can be found here https://meshb.nlm.nih.gov/treeView"
|
| 18 |
+
)
|
| 19 |
st.sidebar.write("The grants data can be found https://www.threesixtygiving.org/")
|
| 20 |
+
st.sidebar.write(
|
| 21 |
+
"The model used to tag grants is https://huggingface.co/Wellcome/WellcomeBertMesh"
|
| 22 |
+
)
|
| 23 |
|
| 24 |
if "grants" not in st.session_state:
|
| 25 |
st.session_state["grants"] = list(srsly.read_jsonl("tagged_grants.jsonl"))
|
| 26 |
|
| 27 |
grants = st.session_state["grants"]
|
| 28 |
|
| 29 |
+
query = st.text_input("", value="Malaria")
|
| 30 |
st.button("Search π", on_click=search, kwargs={"query": query})
|
| 31 |
|
| 32 |
if "results" in st.session_state:
|
| 33 |
st.caption("Related MeSH terms")
|
| 34 |
+
|
| 35 |
+
retrieved_tags = [tag for res in st.session_state["results"] for tag in res["tags"]]
|
| 36 |
+
most_common_tags = [tag for tag, _ in Counter(retrieved_tags).most_common(20)]
|
| 37 |
+
|
| 38 |
columns = st.columns(5)
|
| 39 |
for row_i in range(3):
|
| 40 |
for col_i, col in enumerate(columns):
|
| 41 |
with col:
|
| 42 |
+
tag_i = row_i * 5 + col_i
|
| 43 |
+
if tag_i < len(most_common_tags):
|
| 44 |
+
tag = most_common_tags[tag_i]
|
| 45 |
st.button(tag, on_click=search, kwargs={"query": tag})
|
| 46 |
st.table(st.session_state["results"])
|
preprocess.py
CHANGED
|
@@ -3,14 +3,21 @@ import json
|
|
| 3 |
from tqdm import tqdm
|
| 4 |
import typer
|
| 5 |
|
|
|
|
| 6 |
def preprocess(data_path, processed_data_path):
|
| 7 |
with open(data_path) as f:
|
| 8 |
data = json.loads(f.read())
|
| 9 |
|
| 10 |
with open(processed_data_path, "w") as f:
|
| 11 |
for grant in tqdm(data["grants"]):
|
| 12 |
-
if any(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
f.write(json.dumps(grant) + "\n")
|
| 14 |
|
|
|
|
| 15 |
if __name__ == "__main__":
|
| 16 |
typer.run(preprocess)
|
|
|
|
| 3 |
from tqdm import tqdm
|
| 4 |
import typer
|
| 5 |
|
| 6 |
+
|
| 7 |
def preprocess(data_path, processed_data_path):
|
| 8 |
with open(data_path) as f:
|
| 9 |
data = json.loads(f.read())
|
| 10 |
|
| 11 |
with open(processed_data_path, "w") as f:
|
| 12 |
for grant in tqdm(data["grants"]):
|
| 13 |
+
if any(
|
| 14 |
+
[
|
| 15 |
+
org["name"] == "The Wellcome Trust"
|
| 16 |
+
for org in grant["fundingOrganization"]
|
| 17 |
+
]
|
| 18 |
+
):
|
| 19 |
f.write(json.dumps(grant) + "\n")
|
| 20 |
|
| 21 |
+
|
| 22 |
if __name__ == "__main__":
|
| 23 |
typer.run(preprocess)
|
tag.py
CHANGED
|
@@ -24,13 +24,13 @@ def tag(data_path, tagged_data_path, sample_size: int = 10):
|
|
| 24 |
|
| 25 |
texts = [grant["title_and_description"] for grant in data]
|
| 26 |
for batch_index in tqdm(range(0, len(texts), 10)):
|
| 27 |
-
batch_texts = texts[batch_index:batch_index+10]
|
| 28 |
|
| 29 |
inputs = tokenizer(batch_texts, padding="max_length")
|
| 30 |
labels = model(**inputs, return_labels=True)
|
| 31 |
|
| 32 |
for i, tags in enumerate(labels):
|
| 33 |
-
data[batch_index+i]["tags"] = tags
|
| 34 |
|
| 35 |
srsly.write_jsonl(tagged_data_path, data)
|
| 36 |
|
|
|
|
| 24 |
|
| 25 |
texts = [grant["title_and_description"] for grant in data]
|
| 26 |
for batch_index in tqdm(range(0, len(texts), 10)):
|
| 27 |
+
batch_texts = texts[batch_index : batch_index + 10]
|
| 28 |
|
| 29 |
inputs = tokenizer(batch_texts, padding="max_length")
|
| 30 |
labels = model(**inputs, return_labels=True)
|
| 31 |
|
| 32 |
for i, tags in enumerate(labels):
|
| 33 |
+
data[batch_index + i]["tags"] = tags
|
| 34 |
|
| 35 |
srsly.write_jsonl(tagged_data_path, data)
|
| 36 |
|
tagged_grants.jsonl
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|