INLEXIO committed
Commit 0c2af38 · verified · 1 parent: 71956d4

Update src/streamlit_app.py

Files changed (1)
  1. src/streamlit_app.py +113 -32
src/streamlit_app.py CHANGED
@@ -25,34 +25,68 @@ def load_model():
     return SentenceTransformer('all-MiniLM-L6-v2', cache_folder='/tmp/huggingface')
 
 @st.cache_data(ttl=3600, show_spinner=False)
-def search_openalex_papers(query, num_results=50, country_code=None):
+def search_openalex_papers(query, num_results=50, country_code=None, use_fulltext=False):
     """
     Search OpenAlex for papers related to the query
     Optionally filter by author's country
+    Optionally use full-text search (searches title + abstract + full text when available)
 
-    Note: Results are cached for 1 hour based on query, num_results, and country_code
+    Note: Results are cached for 1 hour based on query, num_results, country_code, and use_fulltext
+    For large requests (>100), uses pagination
     """
     base_url = "https://api.openalex.org/works"
+    all_papers = []
 
-    params = {
-        "search": query,
-        "per_page": num_results,
-        "select": "id,title,abstract_inverted_index,authorships,publication_year,cited_by_count,display_name",
-        "mailto": "[email protected]"  # Polite pool
-    }
-
-    # Add country filter if specified
-    if country_code:
-        params["filter"] = f"authorships.countries:{country_code}"
-
-    try:
-        response = requests.get(base_url, params=params, timeout=30)
-        response.raise_for_status()
-        data = response.json()
-        return data.get("results", [])
-    except Exception as e:
-        st.error(f"Error fetching papers: {str(e)}")
-        return []
+    # OpenAlex max per_page is 200, so we need pagination for large requests
+    per_page = min(200, num_results)
+    num_pages = (num_results + per_page - 1) // per_page  # Ceiling division
+
+    for page in range(1, num_pages + 1):
+        params = {
+            "per_page": per_page,
+            "page": page,
+            "select": "id,title,abstract_inverted_index,authorships,publication_year,cited_by_count,display_name",
+            "mailto": "[email protected]"  # Polite pool
+        }
+
+        # Build filter string
+        filters = []
+
+        if use_fulltext:
+            # Full-text search (searches title + abstract + full text when available)
+            filters.append(f"fulltext.search:{query}")
+        else:
+            # Standard search (title + abstract only)
+            params["search"] = query
+
+        # Add country filter if specified
+        if country_code:
+            filters.append(f"authorships.countries:{country_code}")
+
+        # Combine filters with comma (AND operation)
+        if filters:
+            params["filter"] = ",".join(filters)
+
+        try:
+            response = requests.get(base_url, params=params, timeout=30)
+            response.raise_for_status()
+            data = response.json()
+            papers = data.get("results", [])
+            all_papers.extend(papers)
+
+            # If we got fewer papers than requested, no more pages available
+            if len(papers) < per_page:
+                break
+
+            # Rate limiting - be nice to OpenAlex
+            if page < num_pages:
+                time.sleep(0.1)  # 100ms delay between requests
+
+        except Exception as e:
+            st.error(f"Error fetching papers (page {page}): {str(e)}")
+            break
 
+    return all_papers[:num_results]  # Return exactly what was requested
+
 def reconstruct_abstract(inverted_index):
     """
@@ -247,14 +281,39 @@ def main():
 
     country_code = country_options[selected_country]
 
-    num_papers = st.sidebar.slider(
-        "Number of papers to fetch",
-        min_value=20,
-        max_value=100,
-        value=50,
-        step=10
+    # Search mode selection
+    search_mode = st.sidebar.radio(
+        "Search Mode",
+        options=["Quick Search", "Deep Search"],
+        help="Quick: 50-100 papers in 30s | Deep: 1,000-5,000 papers in 2-5 min"
     )
 
+    # Full-text search option
+    use_fulltext = st.sidebar.checkbox(
+        "Include full text (when available)",
+        value=False,
+        help="Search title + abstract + full text. Full text available for ~10-15% of papers. May find more specific matches."
+    )
+
+    if search_mode == "Quick Search":
+        num_papers = st.sidebar.slider(
+            "Number of papers to fetch",
+            min_value=20,
+            max_value=100,
+            value=50,
+            step=10
+        )
+    else:  # Deep Search
+        num_papers = st.sidebar.slider(
+            "Number of papers to fetch",
+            min_value=500,
+            max_value=5000,
+            value=1000,
+            step=500,
+            help="⚠️ Deep search takes 2-5 minutes"
+        )
+        st.sidebar.warning("⚠️ Deep search will take 2-5 minutes to complete")
+
     top_papers_display = st.sidebar.slider(
         "Top papers to display",
         min_value=5,
@@ -291,16 +350,24 @@ def main():
 
     if search_button and query:
         # Display search parameters
-        st.info(f"🔍 Searching: **{query}** | Papers: **{num_papers}** | Country: **{selected_country}** | Min papers per author: **{min_papers_per_author}**")
+        st.info(f"🔍 Searching: **{query}** | Mode: **{search_mode}** | Papers: **{num_papers}** | Country: **{selected_country}** | Full-text: **{'Yes' if use_fulltext else 'No'}** | Min papers/author: **{min_papers_per_author}**")
 
         # Load model
         with st.spinner("Loading semantic model..."):
            model = load_model()
 
         # Search papers
-        search_key = f"{query}_{num_papers}_{country_code}"
-        with st.spinner(f"Searching OpenAlex for papers about '{query}'{' from ' + selected_country if country_code else ''}..."):
-            papers = search_openalex_papers(query, num_papers, country_code)
+        search_key = f"{query}_{num_papers}_{country_code}_{use_fulltext}"
+
+        if search_mode == "Deep Search":
+            progress_text = f"🔍 Deep search in progress: Fetching up to {num_papers} papers from OpenAlex..."
+            progress_bar = st.progress(0, text=progress_text)
+
+        with st.spinner(f"Searching OpenAlex for papers about '{query}'{' from ' + selected_country if country_code else ''}{' (including full text)' if use_fulltext else ''}..."):
+            papers = search_openalex_papers(query, num_papers, country_code, use_fulltext)
+
+        if search_mode == "Deep Search":
+            progress_bar.progress(33, text="📄 Papers fetched! Now generating embeddings...")
 
         if not papers:
             st.warning("No papers found. Try different search terms.")
@@ -310,14 +377,19 @@ def main():
 
         # Show debug info in expander
         with st.expander("🔍 Search Details", expanded=False):
+            st.write(f"**Search Mode:** {search_mode}")
             st.write(f"**Query:** {query}")
-            st.write(f"**Papers fetched:** {num_papers}")
+            st.write(f"**Full-text search:** {'Enabled' if use_fulltext else 'Disabled'}")
+            st.write(f"**Papers requested:** {num_papers}")
+            st.write(f"**Papers fetched:** {len(papers)}")
             st.write(f"**Country filter:** {selected_country} ({country_code or 'None'})")
-            st.write(f"**Papers returned:** {len(papers)}")
             st.write(f"**First paper:** {papers[0].get('display_name', 'N/A')[:100]}...")
             st.write(f"**Last paper:** {papers[-1].get('display_name', 'N/A')[:100]}...")
 
         # Prepare papers for semantic search
+        if search_mode == "Deep Search":
+            progress_bar.progress(50, text="🧠 Generating semantic embeddings...")
+
         with st.spinner("Analyzing papers with semantic search..."):
             paper_texts = []
             valid_papers = []
@@ -339,7 +411,11 @@ def main():
 
             # Generate embeddings
             query_embedding = model.encode(query, convert_to_tensor=False)
-            paper_embeddings = model.encode(paper_texts, convert_to_tensor=False, show_progress_bar=True)
+
+            if search_mode == "Deep Search":
+                progress_bar.progress(66, text=f"🔒 Computing similarity for {len(paper_texts)} papers...")
+
+            paper_embeddings = model.encode(paper_texts, convert_to_tensor=False, show_progress_bar=False)
 
             # Calculate similarities
             similarities = calculate_semantic_similarity(query_embedding, paper_embeddings)
@@ -348,6 +424,11 @@ def main():
             sorted_indices = np.argsort(similarities)[::-1]
             sorted_papers = [valid_papers[i] for i in sorted_indices]
             sorted_scores = [similarities[i] for i in sorted_indices]
+
+        if search_mode == "Deep Search":
+            progress_bar.progress(100, text="✅ Complete!")
+            time.sleep(0.5)
+            progress_bar.empty()
 
         # Display top papers
         st.header(f"📄 Top {top_papers_display} Most Relevant Papers")