INLEXIO committed on
Commit
2c6e629
Β·
verified Β·
1 Parent(s): 0c2af38

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +94 -95
src/streamlit_app.py CHANGED
@@ -5,12 +5,35 @@ import numpy as np
5
  from collections import defaultdict
6
  import time
7
  import os
 
8
 
9
- # Set cache directory to writable location
10
  os.environ['HF_HOME'] = '/tmp/huggingface'
11
  os.environ['TRANSFORMERS_CACHE'] = '/tmp/huggingface'
12
  os.environ['SENTENCE_TRANSFORMERS_HOME'] = '/tmp/huggingface'
13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  # Page config
15
  st.set_page_config(
16
  page_title="OpenAlex Semantic Search",
@@ -24,14 +47,15 @@ def load_model():
24
  """Load the sentence transformer model"""
25
  return SentenceTransformer('all-MiniLM-L6-v2', cache_folder='/tmp/huggingface')
26
 
27
- @st.cache_data(ttl=3600, show_spinner=False)
 
28
  def search_openalex_papers(query, num_results=50, country_code=None, use_fulltext=False):
29
  """
30
  Search OpenAlex for papers related to the query
31
  Optionally filter by author's country
32
  Optionally use full-text search (searches title + abstract + full text when available)
33
 
34
- Note: Results are cached for 1 hour based on query, num_results, country_code, and use_fulltext
35
  For large requests (>100), uses pagination
36
  """
37
  base_url = "https://api.openalex.org/works"
@@ -105,10 +129,12 @@ def reconstruct_abstract(inverted_index):
105
  words_with_positions.sort(key=lambda x: x[0])
106
  return " ".join([word for _, word in words_with_positions])
107
 
108
- @st.cache_data(ttl=3600)
 
109
  def get_author_details(author_id):
110
  """
111
  Fetch detailed author information from OpenAlex
 
112
  """
113
  base_url = f"https://api.openalex.org/authors/{author_id}"
114
 
@@ -200,24 +226,24 @@ def rank_authors(papers, paper_scores, model, query_embedding, min_papers=2):
200
  normalized_h_index = data['h_index'] / 100.0 # Assume max h-index of 100
201
  normalized_citations = np.log1p(data['total_citations']) / 15.0 # Log scale
202
 
203
- # Composite score: weighted combination
204
  composite_score = (
205
- 0.5 * avg_relevance + # 50% semantic relevance
206
- 0.3 * normalized_h_index + # 30% h-index
207
- 0.2 * normalized_citations # 20% citations
208
  )
209
 
210
  ranked_authors.append({
211
- 'author_id': author_id,
212
  'name': data['name'],
213
- 'institution': data['institution'],
214
  'h_index': data['h_index'],
215
  'total_citations': data['total_citations'],
216
  'works_count': data['works_count'],
217
  'num_relevant_papers': len(data['paper_scores']),
218
  'avg_relevance_score': avg_relevance,
219
  'composite_score': composite_score,
220
- 'openalex_url': f"https://openalex.org/{author_id}"
 
221
  })
222
 
223
  # Sort by composite score
@@ -225,121 +251,94 @@ def rank_authors(papers, paper_scores, model, query_embedding, min_papers=2):
225
 
226
  return ranked_authors
227
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
228
  def main():
 
229
  st.title("πŸ”¬ OpenAlex Semantic Search")
230
  st.markdown("""
231
- Search for academic papers and discover top researchers using semantic search powered by OpenAlex.
232
-
233
- **How it works:**
234
- 1. Enter your search terms (e.g., "machine learning for drug discovery")
235
- 2. Optionally filter by author country
236
- 3. The app finds relevant papers using semantic similarity
237
- 4. Authors are ranked by relevance, h-index, and citation metrics
238
  """)
239
 
240
- # Sidebar controls
241
- st.sidebar.header("Search Settings")
242
-
243
- # Clear cache button
244
- if st.sidebar.button("πŸ”„ Clear Cache", help="Clear cached search results to force fresh data"):
245
- st.cache_data.clear()
246
- st.sidebar.success("Cache cleared!")
247
- st.rerun()
248
-
249
- st.sidebar.divider()
250
-
251
- # Country filter
252
- country_options = {
253
- "Any Country": None,
254
- "United States": "US",
255
- "United Kingdom": "GB",
256
- "Germany": "DE",
257
- "France": "FR",
258
- "Canada": "CA",
259
- "Australia": "AU",
260
- "China": "CN",
261
- "Japan": "JP",
262
- "India": "IN",
263
- "South Korea": "KR",
264
- "Netherlands": "NL",
265
- "Switzerland": "CH",
266
- "Sweden": "SE",
267
- "Italy": "IT",
268
- "Spain": "ES",
269
- "Brazil": "BR",
270
- "Singapore": "SG",
271
- "Israel": "IL",
272
- "Belgium": "BE",
273
- "Austria": "AT",
274
- }
275
-
276
- selected_country = st.sidebar.selectbox(
277
- "Filter by author country",
278
- options=list(country_options.keys()),
279
- help="Filter papers by the country of at least one author. Uses ISO country codes from OpenAlex data."
280
- )
281
-
282
- country_code = country_options[selected_country]
283
 
284
  # Search mode selection
285
  search_mode = st.sidebar.radio(
286
  "Search Mode",
287
- options=["Quick Search", "Deep Search"],
288
- help="Quick: 50-100 papers in 30s | Deep: 1,000-5,000 papers in 2-5 min"
289
- )
290
-
291
- # Full-text search option
292
- use_fulltext = st.sidebar.checkbox(
293
- "Include full text (when available)",
294
- value=False,
295
- help="Search title + abstract + full text. Full text available for ~10-15% of papers. May find more specific matches."
296
  )
297
 
 
298
  if search_mode == "Quick Search":
299
  num_papers = st.sidebar.slider(
300
- "Number of papers to fetch",
301
  min_value=20,
302
  max_value=100,
303
  value=50,
304
- step=10
 
305
  )
306
- else: # Deep Search
307
  num_papers = st.sidebar.slider(
308
- "Number of papers to fetch",
309
- min_value=500,
310
- max_value=5000,
311
- value=1000,
312
- step=500,
313
- help="⚠️ Deep search takes 2-5 minutes"
314
  )
315
- st.sidebar.warning("⚠️ Deep search will take 2-5 minutes to complete")
316
 
317
- top_papers_display = st.sidebar.slider(
318
- "Top papers to display",
319
- min_value=5,
320
- max_value=30,
321
- value=10,
322
- step=5
323
  )
 
324
 
325
- top_authors_display = st.sidebar.slider(
326
- "Top authors to display",
327
- min_value=5,
328
- max_value=50,
329
- value=20,
330
- step=5
331
  )
332
 
 
333
  min_papers_per_author = st.sidebar.slider(
334
  "Minimum papers per author",
335
  min_value=1,
336
  max_value=5,
337
  value=2,
338
- step=1,
339
- help="Minimum number of relevant papers an author must have to be included"
340
  )
341
 
342
- # Main search input
 
 
 
 
 
 
 
 
 
 
 
343
  query = st.text_input(
344
  "Enter your search query:",
345
  placeholder="e.g., 'graph neural networks for protein structure prediction'",
 
5
  from collections import defaultdict
6
  import time
7
  import os
8
+ import shutil
9
 
10
+ # Set cache directory to /tmp (gets cleared on restart)
11
  os.environ['HF_HOME'] = '/tmp/huggingface'
12
  os.environ['TRANSFORMERS_CACHE'] = '/tmp/huggingface'
13
  os.environ['SENTENCE_TRANSFORMERS_HOME'] = '/tmp/huggingface'
14
 
15
# Clear old cache on startup to prevent accumulation
def clear_old_cache(cache_dir='/tmp/huggingface', max_size_mb=5000):
    """Delete *cache_dir* when its total size exceeds *max_size_mb*.

    Walks the directory tree summing file sizes; if the total exceeds the
    threshold, the whole tree is removed and an empty directory recreated.
    Cleanup is best-effort: filesystem errors are swallowed so a cache
    problem never prevents the app from starting.

    Args:
        cache_dir: Directory to inspect (defaults to the HF cache path
            configured via the environment variables above).
        max_size_mb: Size threshold in megabytes before the cache is cleared.
    """
    try:
        if os.path.exists(cache_dir):
            # Total size in MB of every regular file under cache_dir.
            size_mb = sum(
                os.path.getsize(os.path.join(dirpath, filename))
                for dirpath, _dirnames, filenames in os.walk(cache_dir)
                for filename in filenames
            ) / (1024 * 1024)

            if size_mb > max_size_mb:
                shutil.rmtree(cache_dir)
                # exist_ok guards against a concurrent recreation race.
                os.makedirs(cache_dir, exist_ok=True)
    except OSError:
        # Narrowed from a bare `except:` (which also caught KeyboardInterrupt
        # and SystemExit). All expected failures here are OSError subclasses
        # (files vanishing mid-walk, permission errors, rmtree failures).
        pass

# Run cleanup on startup
clear_old_cache()
36
+
37
  # Page config
38
  st.set_page_config(
39
  page_title="OpenAlex Semantic Search",
 
47
  """Load the sentence transformer model"""
48
  return SentenceTransformer('all-MiniLM-L6-v2', cache_folder='/tmp/huggingface')
49
 
50
+ # LIMITED CACHE: Only store 50 recent searches
51
+ @st.cache_data(ttl=3600, max_entries=50, show_spinner=False)
52
  def search_openalex_papers(query, num_results=50, country_code=None, use_fulltext=False):
53
  """
54
  Search OpenAlex for papers related to the query
55
  Optionally filter by author's country
56
  Optionally use full-text search (searches title + abstract + full text when available)
57
 
58
+ Note: Results are cached for 1 hour, max 50 searches stored
59
  For large requests (>100), uses pagination
60
  """
61
  base_url = "https://api.openalex.org/works"
 
129
  words_with_positions.sort(key=lambda x: x[0])
130
  return " ".join([word for _, word in words_with_positions])
131
 
132
+ # LIMITED CACHE: Only store 200 recent author lookups
133
+ @st.cache_data(ttl=3600, max_entries=200)
134
  def get_author_details(author_id):
135
  """
136
  Fetch detailed author information from OpenAlex
137
+ Cache limited to 200 authors to prevent storage issues
138
  """
139
  base_url = f"https://api.openalex.org/authors/{author_id}"
140
 
 
226
  normalized_h_index = data['h_index'] / 100.0 # Assume max h-index of 100
227
  normalized_citations = np.log1p(data['total_citations']) / 15.0 # Log scale
228
 
229
+ # Weighted composite score
230
  composite_score = (
231
+ 0.5 * avg_relevance + # 50% relevance
232
+ 0.3 * min(normalized_h_index, 1.0) + # 30% h-index
233
+ 0.2 * min(normalized_citations, 1.0) # 20% citations
234
  )
235
 
236
  ranked_authors.append({
 
237
  'name': data['name'],
238
+ 'id': author_id,
239
  'h_index': data['h_index'],
240
  'total_citations': data['total_citations'],
241
  'works_count': data['works_count'],
242
  'num_relevant_papers': len(data['paper_scores']),
243
  'avg_relevance_score': avg_relevance,
244
  'composite_score': composite_score,
245
+ 'institution': data['institution'],
246
+ 'openalex_url': f"https://openalex.org/A{author_id}"
247
  })
248
 
249
  # Sort by composite score
 
251
 
252
  return ranked_authors
253
 
254
# Country filter options shown in the sidebar: display name mapped to the
# ISO 3166-1 alpha-2 code expected by the OpenAlex API. The None entry
# ("All Countries") disables the country filter entirely.
COUNTRIES = {
    "All Countries": None,
    "Australia": "AU",
    "Canada": "CA",
    "China": "CN",
    "France": "FR",
    "Germany": "DE",
    "India": "IN",
    "Japan": "JP",
    "United Kingdom": "GB",
    "United States": "US",
}
267
+
268
  def main():
269
+ # Header
270
  st.title("πŸ”¬ OpenAlex Semantic Search")
271
  st.markdown("""
272
+ Search for research papers and discover top researchers using semantic similarity matching.
273
+ This tool searches the OpenAlex database and ranks results by relevance, not just citations.
 
 
 
 
 
274
  """)
275
 
276
+ # Sidebar configuration
277
+ st.sidebar.header("βš™οΈ Search Configuration")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
278
 
279
  # Search mode selection
280
  search_mode = st.sidebar.radio(
281
  "Search Mode",
282
+ ["Quick Search", "Deep Search"],
283
+ help="Quick: 50-100 papers (~30s) | Deep: 500-1,000 papers (2-5 min)"
 
 
 
 
 
 
 
284
  )
285
 
286
+ # Number of papers based on mode
287
  if search_mode == "Quick Search":
288
  num_papers = st.sidebar.slider(
289
+ "Number of papers to analyze",
290
  min_value=20,
291
  max_value=100,
292
  value=50,
293
+ step=10,
294
+ help="More papers = more comprehensive but slower"
295
  )
296
+ else: # Deep Search - LIMIT TO 1000 to prevent storage issues
297
  num_papers = st.sidebar.slider(
298
+ "Number of papers to analyze",
299
+ min_value=100,
300
+ max_value=1000, # REDUCED from 5000
301
+ value=500,
302
+ step=100,
303
+ help="⚠️ Limited to 1000 papers to prevent storage issues. Deep search takes 2-5 minutes."
304
  )
 
305
 
306
+ # Country filter
307
+ selected_country = st.sidebar.selectbox(
308
+ "Filter by author country (optional)",
309
+ options=list(COUNTRIES.keys()),
310
+ help="Only include papers where at least one author is from this country"
 
311
  )
312
+ country_code = COUNTRIES[selected_country]
313
 
314
+ # Full-text search option
315
+ use_fulltext = st.sidebar.checkbox(
316
+ "Include full text (when available)",
317
+ value=False,
318
+ help="Search within full paper text (not just title/abstract). ~10-15% of papers have full text available. Slightly slower."
 
319
  )
320
 
321
+ # Minimum papers per author
322
  min_papers_per_author = st.sidebar.slider(
323
  "Minimum papers per author",
324
  min_value=1,
325
  max_value=5,
326
  value=2,
327
+ help="Filters out authors who appear in fewer than N papers"
 
328
  )
329
 
330
+ # Display settings
331
+ st.sidebar.header("πŸ“Š Display Settings")
332
+ top_papers_display = st.sidebar.slider("Number of top papers to show", 5, 50, 10)
333
+ top_authors_display = st.sidebar.slider("Number of top authors to show", 5, 50, 10)
334
+
335
+ # Storage usage info
336
+ st.sidebar.markdown("---")
337
+ st.sidebar.info("πŸ’Ύ Cache limited to prevent storage issues:\n- Max 50 searches stored\n- Max 200 authors cached\n- Max 1000 papers in Deep Search")
338
+
339
+ # Main search interface
340
+ st.header("πŸ” Search Query")
341
+
342
  query = st.text_input(
343
  "Enter your search query:",
344
  placeholder="e.g., 'graph neural networks for protein structure prediction'",