Spaces:
Sleeping
Sleeping
Update src/streamlit_app.py
Browse files- src/streamlit_app.py +113 -32
src/streamlit_app.py
CHANGED
|
@@ -25,34 +25,68 @@ def load_model():
|
|
| 25 |
return SentenceTransformer('all-MiniLM-L6-v2', cache_folder='/tmp/huggingface')
|
| 26 |
|
| 27 |
@st.cache_data(ttl=3600, show_spinner=False)
|
| 28 |
-
def search_openalex_papers(query, num_results=50, country_code=None):
|
| 29 |
"""
|
| 30 |
Search OpenAlex for papers related to the query
|
| 31 |
Optionally filter by author's country
|
|
|
|
| 32 |
|
| 33 |
-
Note: Results are cached for 1 hour based on query, num_results, and
|
|
|
|
| 34 |
"""
|
| 35 |
base_url = "https://api.openalex.org/works"
|
|
|
|
| 36 |
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
"select": "id,title,abstract_inverted_index,authorships,publication_year,cited_by_count,display_name",
|
| 41 |
-
"mailto": "[email protected]" # Polite pool
|
| 42 |
-
}
|
| 43 |
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
|
| 48 |
-
|
| 49 |
-
response = requests.get(base_url, params=params, timeout=30)
|
| 50 |
-
response.raise_for_status()
|
| 51 |
-
data = response.json()
|
| 52 |
-
return data.get("results", [])
|
| 53 |
-
except Exception as e:
|
| 54 |
-
st.error(f"Error fetching papers: {str(e)}")
|
| 55 |
-
return []
|
| 56 |
|
| 57 |
def reconstruct_abstract(inverted_index):
|
| 58 |
"""
|
|
@@ -247,14 +281,39 @@ def main():
|
|
| 247 |
|
| 248 |
country_code = country_options[selected_country]
|
| 249 |
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
step=10
|
| 256 |
)
|
| 257 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 258 |
top_papers_display = st.sidebar.slider(
|
| 259 |
"Top papers to display",
|
| 260 |
min_value=5,
|
|
@@ -291,16 +350,24 @@ def main():
|
|
| 291 |
|
| 292 |
if search_button and query:
|
| 293 |
# Display search parameters
|
| 294 |
-
st.info(f"π Searching: **{query}** | Papers: **{num_papers}** | Country: **{selected_country}** | Min papers
|
| 295 |
|
| 296 |
# Load model
|
| 297 |
with st.spinner("Loading semantic model..."):
|
| 298 |
model = load_model()
|
| 299 |
|
| 300 |
# Search papers
|
| 301 |
-
search_key = f"{query}_{num_papers}_{country_code}"
|
| 302 |
-
|
| 303 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 304 |
|
| 305 |
if not papers:
|
| 306 |
st.warning("No papers found. Try different search terms.")
|
|
@@ -310,14 +377,19 @@ def main():
|
|
| 310 |
|
| 311 |
# Show debug info in expander
|
| 312 |
with st.expander("π Search Details", expanded=False):
|
|
|
|
| 313 |
st.write(f"**Query:** {query}")
|
| 314 |
-
st.write(f"**
|
|
|
|
|
|
|
| 315 |
st.write(f"**Country filter:** {selected_country} ({country_code or 'None'})")
|
| 316 |
-
st.write(f"**Papers returned:** {len(papers)}")
|
| 317 |
st.write(f"**First paper:** {papers[0].get('display_name', 'N/A')[:100]}...")
|
| 318 |
st.write(f"**Last paper:** {papers[-1].get('display_name', 'N/A')[:100]}...")
|
| 319 |
|
| 320 |
# Prepare papers for semantic search
|
|
|
|
|
|
|
|
|
|
| 321 |
with st.spinner("Analyzing papers with semantic search..."):
|
| 322 |
paper_texts = []
|
| 323 |
valid_papers = []
|
|
@@ -339,7 +411,11 @@ def main():
|
|
| 339 |
|
| 340 |
# Generate embeddings
|
| 341 |
query_embedding = model.encode(query, convert_to_tensor=False)
|
| 342 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 343 |
|
| 344 |
# Calculate similarities
|
| 345 |
similarities = calculate_semantic_similarity(query_embedding, paper_embeddings)
|
|
@@ -348,6 +424,11 @@ def main():
|
|
| 348 |
sorted_indices = np.argsort(similarities)[::-1]
|
| 349 |
sorted_papers = [valid_papers[i] for i in sorted_indices]
|
| 350 |
sorted_scores = [similarities[i] for i in sorted_indices]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 351 |
|
| 352 |
# Display top papers
|
| 353 |
st.header(f"π Top {top_papers_display} Most Relevant Papers")
|
|
|
|
| 25 |
return SentenceTransformer('all-MiniLM-L6-v2', cache_folder='/tmp/huggingface')
|
| 26 |
|
| 27 |
@st.cache_data(ttl=3600, show_spinner=False)
def search_openalex_papers(query, num_results=50, country_code=None, use_fulltext=False):
    """
    Search OpenAlex for works related to the query.

    Args:
        query: Search text. Sent as the ``search`` parameter, or as a
            ``fulltext.search`` filter when ``use_fulltext`` is true.
        num_results: Maximum number of works to return. Values below 1
            yield an empty list without contacting the API.
        country_code: Optional value for the ``authorships.countries``
            filter. A falsy value disables the country filter.
        use_fulltext: When true, search title + abstract + full text
            (when available) instead of the standard search.

    Returns:
        A list of at most ``num_results`` entries taken from the
        ``results`` field of each API response, in the order received.
        If a request fails, an error is shown with ``st.error`` and the
        entries collected so far (possibly none) are returned.

    Note:
        Results are cached for 1 hour based on query, num_results,
        country_code, and use_fulltext. A list returned after a failed
        request is cached like any other result.
        Requests for more than 200 works are fetched page by page.
    """
    # Guard: a page size of 0 would make the page-count division below fail.
    if num_results <= 0:
        return []

    base_url = "https://api.openalex.org/works"

    # OpenAlex max per_page is 200, so we need pagination for large requests
    per_page = min(200, num_results)
    num_pages = (num_results + per_page - 1) // per_page  # Ceiling division

    # Everything except the page number is identical for every request,
    # so it is assembled once instead of on every iteration.
    # NOTE(review): the mailto value looks like a placeholder rather than a
    # real address - confirm what the polite pool should receive.
    base_params = {
        "per_page": per_page,
        "select": "id,title,abstract_inverted_index,authorships,publication_year,cited_by_count,display_name",
        "mailto": "[email protected]",  # Polite pool
    }

    filters = []
    if use_fulltext:
        # Full-text search (searches title + abstract + full text when available)
        # NOTE(review): the query is interpolated into the filter string as-is;
        # filters are comma-joined below, so a comma in the query may be read
        # as a filter separator - verify against the OpenAlex filter syntax.
        filters.append(f"fulltext.search:{query}")
    else:
        # Standard search (title + abstract only)
        base_params["search"] = query

    # Add country filter if specified
    if country_code:
        filters.append(f"authorships.countries:{country_code}")

    # Combine filters with comma (AND operation)
    if filters:
        base_params["filter"] = ",".join(filters)

    all_papers = []
    for page in range(1, num_pages + 1):
        params = {**base_params, "page": page}

        try:
            response = requests.get(base_url, params=params, timeout=30)
            response.raise_for_status()
            papers = response.json().get("results", [])
        except Exception as e:
            # Best effort: report the failure in the UI and keep whatever
            # earlier pages already delivered.
            st.error(f"Error fetching papers (page {page}): {str(e)}")
            break

        all_papers.extend(papers)

        # If we got fewer papers than requested, no more pages available
        if len(papers) < per_page:
            break

        # Rate limiting - be nice to OpenAlex
        if page < num_pages:
            time.sleep(0.1)  # 100ms delay between requests

    # The last page may overshoot; return exactly what was requested
    return all_papers[:num_results]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
|
| 91 |
def reconstruct_abstract(inverted_index):
|
| 92 |
"""
|
|
|
|
| 281 |
|
| 282 |
country_code = country_options[selected_country]
|
| 283 |
|
| 284 |
+
# Search mode selection
|
| 285 |
+
search_mode = st.sidebar.radio(
|
| 286 |
+
"Search Mode",
|
| 287 |
+
options=["Quick Search", "Deep Search"],
|
| 288 |
+
help="Quick: 50-100 papers in 30s | Deep: 1,000-5,000 papers in 2-5 min"
|
|
|
|
| 289 |
)
|
| 290 |
|
| 291 |
+
# Full-text search option
|
| 292 |
+
use_fulltext = st.sidebar.checkbox(
|
| 293 |
+
"Include full text (when available)",
|
| 294 |
+
value=False,
|
| 295 |
+
help="Search title + abstract + full text. Full text available for ~10-15% of papers. May find more specific matches."
|
| 296 |
+
)
|
| 297 |
+
|
| 298 |
+
if search_mode == "Quick Search":
|
| 299 |
+
num_papers = st.sidebar.slider(
|
| 300 |
+
"Number of papers to fetch",
|
| 301 |
+
min_value=20,
|
| 302 |
+
max_value=100,
|
| 303 |
+
value=50,
|
| 304 |
+
step=10
|
| 305 |
+
)
|
| 306 |
+
else: # Deep Search
|
| 307 |
+
num_papers = st.sidebar.slider(
|
| 308 |
+
"Number of papers to fetch",
|
| 309 |
+
min_value=500,
|
| 310 |
+
max_value=5000,
|
| 311 |
+
value=1000,
|
| 312 |
+
step=500,
|
| 313 |
+
help="β οΈ Deep search takes 2-5 minutes"
|
| 314 |
+
)
|
| 315 |
+
st.sidebar.warning("β οΈ Deep search will take 2-5 minutes to complete")
|
| 316 |
+
|
| 317 |
top_papers_display = st.sidebar.slider(
|
| 318 |
"Top papers to display",
|
| 319 |
min_value=5,
|
|
|
|
| 350 |
|
| 351 |
if search_button and query:
|
| 352 |
# Display search parameters
|
| 353 |
+
st.info(f"π Searching: **{query}** | Mode: **{search_mode}** | Papers: **{num_papers}** | Country: **{selected_country}** | Full-text: **{'Yes' if use_fulltext else 'No'}** | Min papers/author: **{min_papers_per_author}**")
|
| 354 |
|
| 355 |
# Load model
|
| 356 |
with st.spinner("Loading semantic model..."):
|
| 357 |
model = load_model()
|
| 358 |
|
| 359 |
# Search papers
|
| 360 |
+
search_key = f"{query}_{num_papers}_{country_code}_{use_fulltext}"
|
| 361 |
+
|
| 362 |
+
if search_mode == "Deep Search":
|
| 363 |
+
progress_text = f"π Deep search in progress: Fetching up to {num_papers} papers from OpenAlex..."
|
| 364 |
+
progress_bar = st.progress(0, text=progress_text)
|
| 365 |
+
|
| 366 |
+
with st.spinner(f"Searching OpenAlex for papers about '{query}'{' from ' + selected_country if country_code else ''}{' (including full text)' if use_fulltext else ''}..."):
|
| 367 |
+
papers = search_openalex_papers(query, num_papers, country_code, use_fulltext)
|
| 368 |
+
|
| 369 |
+
if search_mode == "Deep Search":
|
| 370 |
+
progress_bar.progress(33, text="π Papers fetched! Now generating embeddings...")
|
| 371 |
|
| 372 |
if not papers:
|
| 373 |
st.warning("No papers found. Try different search terms.")
|
|
|
|
| 377 |
|
| 378 |
# Show debug info in expander
|
| 379 |
with st.expander("π Search Details", expanded=False):
|
| 380 |
+
st.write(f"**Search Mode:** {search_mode}")
|
| 381 |
st.write(f"**Query:** {query}")
|
| 382 |
+
st.write(f"**Full-text search:** {'Enabled' if use_fulltext else 'Disabled'}")
|
| 383 |
+
st.write(f"**Papers requested:** {num_papers}")
|
| 384 |
+
st.write(f"**Papers fetched:** {len(papers)}")
|
| 385 |
st.write(f"**Country filter:** {selected_country} ({country_code or 'None'})")
|
|
|
|
| 386 |
st.write(f"**First paper:** {papers[0].get('display_name', 'N/A')[:100]}...")
|
| 387 |
st.write(f"**Last paper:** {papers[-1].get('display_name', 'N/A')[:100]}...")
|
| 388 |
|
| 389 |
# Prepare papers for semantic search
|
| 390 |
+
if search_mode == "Deep Search":
|
| 391 |
+
progress_bar.progress(50, text="π§ Generating semantic embeddings...")
|
| 392 |
+
|
| 393 |
with st.spinner("Analyzing papers with semantic search..."):
|
| 394 |
paper_texts = []
|
| 395 |
valid_papers = []
|
|
|
|
| 411 |
|
| 412 |
# Generate embeddings
|
| 413 |
query_embedding = model.encode(query, convert_to_tensor=False)
|
| 414 |
+
|
| 415 |
+
if search_mode == "Deep Search":
|
| 416 |
+
progress_bar.progress(66, text=f"π’ Computing similarity for {len(paper_texts)} papers...")
|
| 417 |
+
|
| 418 |
+
paper_embeddings = model.encode(paper_texts, convert_to_tensor=False, show_progress_bar=False)
|
| 419 |
|
| 420 |
# Calculate similarities
|
| 421 |
similarities = calculate_semantic_similarity(query_embedding, paper_embeddings)
|
|
|
|
| 424 |
sorted_indices = np.argsort(similarities)[::-1]
|
| 425 |
sorted_papers = [valid_papers[i] for i in sorted_indices]
|
| 426 |
sorted_scores = [similarities[i] for i in sorted_indices]
|
| 427 |
+
|
| 428 |
+
if search_mode == "Deep Search":
|
| 429 |
+
progress_bar.progress(100, text="β
Complete!")
|
| 430 |
+
time.sleep(0.5)
|
| 431 |
+
progress_bar.empty()
|
| 432 |
|
| 433 |
# Display top papers
|
| 434 |
st.header(f"π Top {top_papers_display} Most Relevant Papers")
|