INLEXIO committed · verified
Commit 2e73ba2 · 1 Parent(s): de16096

Update src/streamlit_app.py

Files changed (1):
  1. src/streamlit_app.py +203 -20
src/streamlit_app.py CHANGED
@@ -52,11 +52,12 @@ def load_model():
 
 # LIMITED CACHE: Only store 50 recent searches
 @st.cache_data(ttl=3600, max_entries=50, show_spinner=False)
-def search_openalex_papers(query, num_results=50, country_code=None, use_fulltext=False):
+def search_openalex_papers(query, num_results=50, country_code=None, use_fulltext=False, year_min=None, year_max=None):
     """
     Search OpenAlex for papers related to the query
     Optionally filter by author's country
     Optionally use full-text search (searches title + abstract + full text when available)
+    Optionally filter by publication year range
 
     Note: Results are cached for 1 hour, max 50 searches stored
     For large requests (>100), uses pagination
@@ -90,6 +91,12 @@ def search_openalex_papers(query, num_results=50, country_code=None, use_fulltext=False):
     if country_code:
         filters.append(f"authorships.countries:{country_code}")
 
+    # Add year range filter if specified
+    if year_min is not None:
+        filters.append(f"publication_year:>{year_min-1}")  # Greater than or equal
+    if year_max is not None:
+        filters.append(f"publication_year:<{year_max+1}")  # Less than or equal
+
     # Combine filters with comma (AND operation)
     if filters:
         params["filter"] = ",".join(filters)
@@ -152,6 +159,66 @@ def get_author_details(author_id):
     except Exception as e:
         return None
 
+# LIMITED CACHE: Only store 200 recent author works lookups
+@st.cache_data(ttl=3600, max_entries=200)
+def get_author_works(author_id, max_works=20):
+    """
+    Fetch author's recent works for validation
+    Returns up to max_works most recent papers by this author
+    """
+    base_url = "https://api.openalex.org/works"
+
+    params = {
+        "filter": f"author.id:A{author_id}",
+        "per_page": max_works,
+        "sort": "cited_by_count:desc",  # Get most cited papers
+        "select": "id,title,abstract_inverted_index,publication_year",
+        "mailto": "[email protected]"
+    }
+
+    try:
+        response = requests.get(base_url, params=params, timeout=10)
+        response.raise_for_status()
+        data = response.json()
+        return data.get("results", [])
+    except Exception as e:
+        return []
+
+def validate_author_relevance(author_id, query_embedding, model, threshold=0.25, max_works=20):
+    """
+    Validate if an author is actually relevant to the search query
+    by checking semantic similarity of their body of work
+
+    Returns: (is_valid, avg_similarity, num_works_checked)
+    """
+    # Fetch author's works
+    works = get_author_works(author_id, max_works)
+
+    if not works:
+        return False, 0.0, 0
+
+    # Generate embeddings for author's works
+    work_texts = []
+    for work in works:
+        title = work.get('title', '') or work.get('display_name', '')
+        abstract = reconstruct_abstract(work.get('abstract_inverted_index', {}))
+        text = f"{title} {title} {abstract}"
+        if text.strip():
+            work_texts.append(text)
+
+    if not work_texts:
+        return False, 0.0, 0
+
+    # Calculate similarity to query
+    work_embeddings = model.encode(work_texts, convert_to_tensor=False, show_progress_bar=False)
+    similarities = calculate_semantic_similarity(query_embedding, work_embeddings)
+    avg_similarity = np.mean(similarities)
+
+    # Author is valid if their average work similarity exceeds threshold
+    is_valid = avg_similarity >= threshold
+
+    return is_valid, avg_similarity, len(work_texts)
+
 def calculate_semantic_similarity(query_embedding, paper_embeddings):
     """
     Calculate cosine similarity between query and papers
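Two reading notes on the new validation helpers. First, `text = f"{title} {title} {abstract}"` repeats the title, which reads like a deliberate trick to up-weight the title relative to the abstract in the embedding. Second, the code calls `reconstruct_abstract`, which is defined elsewhere in this file and not shown in the diff; a typical implementation over OpenAlex's `abstract_inverted_index` looks roughly like the sketch below (an illustration, not the file's actual code):

```python
def reconstruct_abstract(inverted_index):
    """Rebuild abstract text from OpenAlex's inverted index.

    OpenAlex returns abstracts as {word: [positions]}; flattening the
    positions and sorting them recovers the original word order.
    """
    if not inverted_index:
        return ""
    positional = [
        (pos, word)
        for word, positions in inverted_index.items()
        for pos in positions
    ]
    return " ".join(word for _, word in sorted(positional))
```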
@@ -164,12 +231,14 @@ def calculate_semantic_similarity(query_embedding, paper_embeddings):
     similarities = np.dot(paper_norms, query_norm)
     return similarities
 
-def rank_authors(papers, paper_scores, model, query_embedding, min_papers=2):
+def rank_authors(papers, paper_scores, model, query_embedding, min_papers=2, validate_authors=True, validation_threshold=0.25):
     """
     Extract authors from papers and rank them based on:
     - Semantic relevance (average of their paper scores)
     - H-index
     - Total citations
+
+    If validate_authors=True, checks each author's body of work for relevance
     """
     author_data = defaultdict(lambda: {
         'name': '',
@@ -179,7 +248,9 @@ def rank_authors(papers, paper_scores, model, query_embedding, min_papers=2):
         'total_citations': 0,
         'works_count': 0,
         'h_index': 0,
-        'institution': ''
+        'institution': '',
+        'validation_score': 0.0,
+        'validated': False
     })
 
     # Collect author information from papers
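Why the two new template fields matter: the `defaultdict` factory builds this dict the first time an author id is touched, so later code can read `validated` and `validation_score` without key checks even when validation is skipped. A compact demonstration of the mechanism:

```python
from collections import defaultdict

# Abridged version of the template above; the lambda returns a fresh
# dict for each previously unseen key.
author_data = defaultdict(lambda: {
    'institution': '',
    'validation_score': 0.0,
    'validated': False
})
author_data['A123']['validated'] = True      # first access creates the template
print(author_data['A999']['validation_score'])  # -> 0.0, no KeyError
```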
@@ -220,6 +291,35 @@ def rank_authors(papers, paper_scores, model, query_embedding, min_papers=2):
 
     progress_bar.empty()
 
+    # Validate authors if requested
+    if validate_authors:
+        with st.spinner(f"Validating author relevance (checking their body of work)..."):
+            progress_bar = st.progress(0)
+            validated_count = 0
+
+            for idx, (author_id, data) in enumerate(filtered_authors.items()):
+                is_valid, val_score, num_works = validate_author_relevance(
+                    author_id, query_embedding, model, validation_threshold
+                )
+                data['validated'] = is_valid
+                data['validation_score'] = val_score
+                data['num_works_checked'] = num_works
+
+                if is_valid:
+                    validated_count += 1
+
+                progress_bar.progress((idx + 1) / len(filtered_authors))
+                time.sleep(0.1)  # Rate limiting
+
+            progress_bar.empty()
+            st.success(f"✅ {validated_count}/{len(filtered_authors)} authors validated as relevant to your query")
+
+        # Filter to only validated authors
+        filtered_authors = {
+            aid: data for aid, data in filtered_authors.items()
+            if data['validated']
+        }
+
     # Calculate composite score for ranking
     ranked_authors = []
     for author_id, data in filtered_authors.items():
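Note that this loop depends on a module-level `import time` that isn't visible in the diff, and that `get_author_works` is wrapped in `@st.cache_data(max_entries=200)`, so re-running the same search skips the HTTP calls entirely. The first run pays one works request per surviving author plus the 0.1 s politeness sleep; a back-of-envelope cost model with assumed timings:

```python
# Rough cost of the validation pass (all numbers hypothetical).
num_authors = 50
request_time = 0.4   # assumed seconds per OpenAlex round trip
sleep_time = 0.1     # matches time.sleep(0.1) in the loop above
print(f"~{num_authors * (request_time + sleep_time):.0f} s")  # -> ~25 s
```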
@@ -230,11 +330,21 @@ def rank_authors(papers, paper_scores, model, query_embedding, min_papers=2):
         normalized_citations = np.log1p(data['total_citations']) / 15.0  # Log scale
 
         # Weighted composite score
-        composite_score = (
-            0.5 * avg_relevance +                  # 50% relevance
-            0.3 * min(normalized_h_index, 1.0) +   # 30% h-index
-            0.2 * min(normalized_citations, 1.0)   # 20% citations
-        )
+        if validate_authors:
+            # Include validation score in composite
+            composite_score = (
+                0.4 * avg_relevance +                  # 40% relevance in initial papers
+                0.3 * data['validation_score'] +       # 30% validation (their body of work)
+                0.2 * min(normalized_h_index, 1.0) +   # 20% h-index
+                0.1 * min(normalized_citations, 1.0)   # 10% citations
+            )
+        else:
+            # Original scoring without validation
+            composite_score = (
+                0.5 * avg_relevance +                  # 50% relevance
+                0.3 * min(normalized_h_index, 1.0) +   # 30% h-index
+                0.2 * min(normalized_citations, 1.0)   # 20% citations
+            )
 
         ranked_authors.append({
             'name': data['name'],
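The reweighting is worth making concrete: the validated branch funds its 30% validation weight by trimming relevance, h-index, and citations by ten points each (50/30/20 becomes 40/30/20/10). A worked example with hypothetical inputs:

```python
# Hypothetical author: decent paper relevance, moderate body-of-work fit.
avg_relevance, validation_score = 0.62, 0.41
normalized_h_index, normalized_citations = 0.45, 0.80

composite_score = (
    0.4 * avg_relevance +                 # 0.248
    0.3 * validation_score +              # 0.123
    0.2 * min(normalized_h_index, 1.0) +  # 0.090
    0.1 * min(normalized_citations, 1.0)  # 0.080
)
print(round(composite_score, 3))  # -> 0.541
```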
@@ -244,6 +354,8 @@ def rank_authors(papers, paper_scores, model, query_embedding, min_papers=2):
             'works_count': data['works_count'],
             'num_relevant_papers': len(data['paper_scores']),
             'avg_relevance_score': avg_relevance,
+            'validation_score': data['validation_score'],
+            'validated': data['validated'],
             'composite_score': composite_score,
             'institution': data['institution'],
             'openalex_url': f"https://openalex.org/A{author_id}"
@@ -315,6 +427,37 @@ def main():
     )
     country_code = COUNTRIES[selected_country]
 
+    # Year range filter
+    st.sidebar.subheader("📅 Year Range")
+    current_year = 2025
+    use_year_filter = st.sidebar.checkbox(
+        "Limit by publication year",
+        value=False,
+        help="Filter papers by publication year range"
+    )
+
+    if use_year_filter:
+        year_col1, year_col2 = st.sidebar.columns(2)
+        with year_col1:
+            year_min = st.number_input(
+                "From",
+                min_value=1900,
+                max_value=current_year,
+                value=2015,
+                step=1
+            )
+        with year_col2:
+            year_max = st.number_input(
+                "To",
+                min_value=1900,
+                max_value=current_year,
+                value=current_year,
+                step=1
+            )
+    else:
+        year_min = None
+        year_max = None
+
     # Full-text search option
     use_fulltext = st.sidebar.checkbox(
         "Include full text (when available)",
@@ -322,6 +465,26 @@ def main():
         help="Search within full paper text (not just title/abstract). ~10-15% of papers have full text available. Slightly slower."
     )
 
+    # Author validation
+    st.sidebar.subheader("👀 Author Validation")
+    validate_authors = st.sidebar.checkbox(
+        "Validate authors' body of work",
+        value=True,
+        help="Check each author's recent papers to confirm they're actually working in this area. More accurate but slower."
+    )
+
+    if validate_authors:
+        validation_threshold = st.sidebar.slider(
+            "Validation threshold",
+            min_value=0.15,
+            max_value=0.50,
+            value=0.25,
+            step=0.05,
+            help="Minimum average similarity score for author's works. Higher = stricter filter."
+        )
+    else:
+        validation_threshold = 0.25
+
     # Minimum papers per author
     min_papers_per_author = st.sidebar.slider(
         "Minimum papers per author",
@@ -353,21 +516,25 @@ def main():
 
     if search_button and query:
         # Display search parameters
-        st.info(f"🔍 Searching: **{query}** | Mode: **{search_mode}** | Papers: **{num_papers}** | Country: **{selected_country}** | Full-text: **{'Yes' if use_fulltext else 'No'}** | Min papers/author: **{min_papers_per_author}**")
+        year_range_text = f"Years: **{year_min}-{year_max}**" if use_year_filter else "Years: **All**"
+        validation_text = f"Validation: **On (threshold {validation_threshold})**" if validate_authors else "Validation: **Off**"
+
+        st.info(f"🔍 Searching: **{query}** | Mode: **{search_mode}** | Papers: **{num_papers}** | {year_range_text} | Country: **{selected_country}** | Full-text: **{'Yes' if use_fulltext else 'No'}** | {validation_text} | Min papers/author: **{min_papers_per_author}**")
 
         # Load model
         with st.spinner("Loading semantic model..."):
             model = load_model()
 
         # Search papers
-        search_key = f"{query}_{num_papers}_{country_code}_{use_fulltext}"
+        search_key = f"{query}_{num_papers}_{country_code}_{use_fulltext}_{year_min}_{year_max}"
 
         if search_mode == "Deep Search":
             progress_text = f"🔍 Deep search in progress: Fetching up to {num_papers} papers from OpenAlex..."
             progress_bar = st.progress(0, text=progress_text)
 
-        with st.spinner(f"Searching OpenAlex for papers about '{query}'{' from ' + selected_country if country_code else ''}{' (including full text)' if use_fulltext else ''}..."):
-            papers = search_openalex_papers(query, num_papers, country_code, use_fulltext)
+        year_filter_text = f" from {year_min}-{year_max}" if use_year_filter else ""
+        with st.spinner(f"Searching OpenAlex for papers about '{query}'{year_filter_text}{' from ' + selected_country if country_code else ''}{' (including full text)' if use_fulltext else ''}..."):
+            papers = search_openalex_papers(query, num_papers, country_code, use_fulltext, year_min, year_max)
 
         if search_mode == "Deep Search":
             progress_bar.progress(33, text="📄 Papers fetched! Now generating embeddings...")
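Widening `search_key` with the year bounds matters wherever that key is used downstream (it presumably feeds session-level bookkeeping): two searches differing only in year range no longer collide. Note that `@st.cache_data` on `search_openalex_papers` already hashes all of the function's arguments, so the API cache distinguishes year-filtered calls automatically. A sketch of the key with hypothetical inputs:

```python
# Hypothetical inputs; shows the widened cache key.
query, num_papers, country_code = "perovskite solar cells", 200, "US"
use_fulltext, year_min, year_max = False, 2015, 2025
search_key = f"{query}_{num_papers}_{country_code}_{use_fulltext}_{year_min}_{year_max}"
print(search_key)  # -> perovskite solar cells_200_US_False_2015_2025
```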
@@ -383,9 +550,11 @@ def main():
             st.write(f"**Search Mode:** {search_mode}")
             st.write(f"**Query:** {query}")
             st.write(f"**Full-text search:** {'Enabled' if use_fulltext else 'Disabled'}")
+            st.write(f"**Year range:** {year_min}-{year_max}" if use_year_filter else "**Year range:** All years")
             st.write(f"**Papers requested:** {num_papers}")
             st.write(f"**Papers fetched:** {len(papers)}")
             st.write(f"**Country filter:** {selected_country} ({country_code or 'None'})")
+            st.write(f"**Author validation:** {'Enabled (threshold: ' + str(validation_threshold) + ')' if validate_authors else 'Disabled'}")
             st.write(f"**First paper:** {papers[0].get('display_name', 'N/A')[:100]}...")
             st.write(f"**Last paper:** {papers[-1].get('display_name', 'N/A')[:100]}...")
 
@@ -469,7 +638,9 @@ def main():
                 sorted_scores,
                 model,
                 query_embedding,
-                min_papers=min_papers_per_author
+                min_papers=min_papers_per_author,
+                validate_authors=validate_authors,
+                validation_threshold=validation_threshold
             )
 
             if not ranked_authors:
@@ -495,9 +666,15 @@ def main():
                     st.metric("Citations", f"{author['total_citations']:,}")
 
                 with col4:
-                    st.metric("Relevance", f"{author['avg_relevance_score']:.3f}")
+                    if validate_authors:
+                        st.metric("Body Relevance", f"{author['validation_score']:.3f}")
+                    else:
+                        st.metric("Relevance", f"{author['avg_relevance_score']:.3f}")
 
-                st.caption(f"Total works: {author['works_count']} | Relevant papers: {author['num_relevant_papers']}")
+                caption_text = f"Total works: {author['works_count']} | Relevant papers: {author['num_relevant_papers']}"
+                if validate_authors:
+                    caption_text += f" | Paper relevance: {author['avg_relevance_score']:.3f}"
+                st.caption(caption_text)
                 st.divider()
 
         # Download results
@@ -511,14 +688,17 @@ def main():
         csv_writer = csv.writer(csv_buffer)
 
         # Write header
-        csv_writer.writerow([
+        header = [
             'Rank', 'Name', 'Institution', 'H-Index', 'Total Citations',
             'Total Works', 'Relevant Papers', 'Avg Relevance Score', 'Composite Score', 'OpenAlex URL'
-        ])
+        ]
+        if validate_authors:
+            header.insert(-1, 'Body of Work Validation Score')
+        csv_writer.writerow(header)
 
         # Write data
         for idx, author in enumerate(ranked_authors, 1):
-            csv_writer.writerow([
+            row = [
                 idx,
                 author['name'],
                 author['institution'],
@@ -528,8 +708,11 @@ def main():
                 author['num_relevant_papers'],
                 f"{author['avg_relevance_score']:.4f}",
                 f"{author['composite_score']:.4f}",
-                author['openalex_url']
-            ])
+            ]
+            if validate_authors:
+                row.append(f"{author['validation_score']:.4f}")
+            row.append(author['openalex_url'])
+            csv_writer.writerow(row)
 
         csv_data = csv_buffer.getvalue()
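The CSV column ordering in the last two hunks relies on a small trick: `list.insert(-1, ...)` slots the new column second-to-last, just before the URL, and the row writer mirrors this by appending the validation score before `openalex_url`, so header and rows stay aligned. A short demonstration with an abridged header:

```python
# insert(-1, ...) places the new item just before the final element.
header = ['Rank', 'Name', 'Composite Score', 'OpenAlex URL']  # abridged
header.insert(-1, 'Body of Work Validation Score')
print(header)
# -> ['Rank', 'Name', 'Composite Score',
#     'Body of Work Validation Score', 'OpenAlex URL']
```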