Spaces:

INLEXIO
/

semantic-search

Sleeping

App Files Files Community

semantic-search / src /streamlit_app.py

INLEXIO

Update src/streamlit_app.py

2e73ba2 verified 14 days ago

raw

history blame contribute delete

28.6 kB

	import streamlit as st
	import requests
	from sentence_transformers import SentenceTransformer
	import numpy as np
	from collections import defaultdict
	import time
	import os
	import shutil

	# Route every Hugging Face / sentence-transformers cache location to one
	# directory under /tmp, so downloaded model files live outside the app tree.
	os.environ.update(dict.fromkeys(
	    ('HF_HOME', 'TRANSFORMERS_CACHE', 'SENTENCE_TRANSFORMERS_HOME'),
	    '/tmp/huggingface',
	))

	# Clear old cache on startup to prevent accumulation
	def clear_old_cache(cache_dir='/tmp/huggingface', max_size_mb=5000):
	    """Empty the model cache directory once it grows past a size limit.

	    The directory tree is measured and, if it is larger than ``max_size_mb``,
	    removed and recreated empty. The whole operation is best-effort: any
	    filesystem error is tolerated so that cache housekeeping can never stop
	    the app from starting.

	    Args:
	        cache_dir: Directory to measure and, if needed, wipe.
	        max_size_mb: Size limit in MiB (bytes / 1024 / 1024). The cache is
	            cleared only when its size is strictly greater than this.

	    Returns:
	        None.
	    """
	    if not os.path.isdir(cache_dir):
	        return

	    total_bytes = 0
	    for dirpath, _dirnames, filenames in os.walk(cache_dir):
	        for filename in filenames:
	            try:
	                # lstat() does not follow symlinks, so a link into the same
	                # cache (the Hugging Face hub layout links snapshot files to
	                # blobs) is not counted a second time, and a dangling link
	                # cannot abort the whole measurement.
	                total_bytes += os.lstat(os.path.join(dirpath, filename)).st_size
	            except OSError:
	                # File vanished or is unreadable: skip it, keep measuring.
	                continue

	    if total_bytes / (1024 * 1024) <= max_size_mb:
	        return

	    # Best-effort wipe; a partially removed tree is still an improvement.
	    shutil.rmtree(cache_dir, ignore_errors=True)
	    try:
	        os.makedirs(cache_dir, exist_ok=True)
	    except OSError:
	        # The cache directory is recreated on demand by whoever writes to it.
	        pass

	# Trim the model cache whenever this script is executed.
	clear_old_cache()

	# Browser-tab title, icon and wide layout for the Streamlit page.
	_PAGE_SETTINGS = {
	    "page_title": "OpenAlex Semantic Search",
	    "page_icon": "🔬",
	    "layout": "wide",
	}
	st.set_page_config(**_PAGE_SETTINGS)

	# The model is wrapped in st.cache_resource so it is built through this
	# function rather than directly at every call site.
	@st.cache_resource
	def load_model():
	    """Return the SPECTER sentence-transformer used for all embeddings.

	    Original author's notes: SPECTER is trained on scientific papers and is
	    preferred here over general-purpose models; it is larger (~440MB vs ~80MB
	    for MiniLM) and produces 768-dimensional embeddings (vs 384 for MiniLM).
	    """
	    model_name = 'allenai/specter'
	    download_dir = '/tmp/huggingface'
	    return SentenceTransformer(model_name, cache_folder=download_dir)

	def _build_openalex_search_params(query, per_page, country_code, use_fulltext, year_min, year_max):
	    """Build the page-independent query parameters for an OpenAlex works search."""
	    params = {
	        "per_page": per_page,
	        "select": "id,title,abstract_inverted_index,authorships,publication_year,cited_by_count,display_name",
	        # NOTE(review): this address looks like an e-mail-protection
	        # placeholder - confirm the real contact address.
	        "mailto": "[email protected]",  # Polite pool
	    }

	    filters = []

	    if use_fulltext:
	        # Full-text search goes through the filter string. Filters are joined
	        # with commas below, so a comma inside the query would split it into
	        # a bogus extra filter; commas are replaced by spaces.
	        filters.append(f"fulltext.search:{query.replace(',', ' ')}")
	    else:
	        # Standard search (title + abstract only)
	        params["search"] = query

	    if country_code:
	        filters.append(f"authorships.countries:{country_code}")

	    # Inclusive year bounds expressed with strict comparisons.
	    if year_min is not None:
	        filters.append(f"publication_year:>{year_min-1}")
	    if year_max is not None:
	        filters.append(f"publication_year:<{year_max+1}")

	    # Comma-joined filters are combined with AND.
	    if filters:
	        params["filter"] = ",".join(filters)

	    return params

	# LIMITED CACHE: Only store 50 recent searches
	@st.cache_data(ttl=3600, max_entries=50, show_spinner=False)
	def search_openalex_papers(query, num_results=50, country_code=None, use_fulltext=False, year_min=None, year_max=None):
	    """Search OpenAlex for works matching ``query``.

	    Results are fetched page by page (at most 200 per page) until
	    ``num_results`` works have been collected, a short page signals the end
	    of the result set, or a request fails.

	    Args:
	        query: Free-text search string.
	        num_results: Maximum number of works to return. Values <= 0 yield an
	            empty list without any request being made.
	        country_code: Optional country code used in the
	            ``authorships.countries`` filter.
	        use_fulltext: If true, search via the ``fulltext.search`` filter
	            instead of the ``search`` parameter.
	        year_min: Optional inclusive lower bound on publication year.
	        year_max: Optional inclusive upper bound on publication year.

	    Returns:
	        A list of at most ``num_results`` work dicts as returned by the API.
	        On a request/decoding error the error is shown with ``st.error`` and
	        the works collected so far are returned.

	    Note:
	        The decorator caches return values for 1 hour, at most 50 entries.
	        NOTE(review): a partial result returned after an error is presumably
	        cached as well - confirm whether that is acceptable.
	    """
	    if num_results <= 0:
	        # Also avoids a division by zero in the page-count computation.
	        return []

	    base_url = "https://api.openalex.org/works"

	    # OpenAlex max per_page is 200, so larger requests need pagination.
	    per_page = min(200, num_results)
	    num_pages = (num_results + per_page - 1) // per_page  # Ceiling division

	    # Everything except the page number is identical for all requests.
	    base_params = _build_openalex_search_params(
	        query, per_page, country_code, use_fulltext, year_min, year_max
	    )

	    all_papers = []
	    for page in range(1, num_pages + 1):
	        try:
	            response = requests.get(base_url, params={**base_params, "page": page}, timeout=30)
	            response.raise_for_status()
	            papers = response.json().get("results") or []
	        except (requests.RequestException, ValueError) as e:
	            # ValueError covers an undecodable JSON body.
	            st.error(f"Error fetching papers (page {page}): {str(e)}")
	            break

	        all_papers.extend(papers)

	        # A short page means there is nothing left to fetch.
	        if len(papers) < per_page:
	            break

	        # Rate limiting - be nice to OpenAlex
	        if page < num_pages:
	            time.sleep(0.1)  # 100ms delay between requests

	    return all_papers[:num_results]  # Never more than was requested

	def reconstruct_abstract(inverted_index):
	    """Rebuild abstract text from an OpenAlex-style inverted index.

	    ``inverted_index`` maps each word to the positions at which it occurs.
	    Words are laid out in ascending position order and joined with single
	    spaces. A missing or empty index yields an empty string.
	    """
	    if not inverted_index:
	        return ""

	    # Flatten to (position, word) pairs and order them by position only.
	    placed_words = sorted(
	        (
	            (position, token)
	            for token, occurrences in inverted_index.items()
	            for position in occurrences
	        ),
	        key=lambda pair: pair[0],
	    )
	    return " ".join(token for _, token in placed_words)

	# LIMITED CACHE: Only store 200 recent author lookups
	@st.cache_data(ttl=3600, max_entries=200)
	def get_author_details(author_id):
	    """Fetch one author's record from the OpenAlex authors endpoint.

	    Args:
	        author_id: Author identifier appended to the ``/authors/`` URL path.

	    Returns:
	        The decoded JSON response, or ``None`` when ``author_id`` is empty,
	        the request fails, or the body is not valid JSON.

	    Note:
	        The decorator caches return values for 1 hour, at most 200 entries.
	    """
	    if not author_id:
	        # Without an id the URL would no longer address a single author.
	        return None

	    base_url = f"https://api.openalex.org/authors/{author_id}"
	    params = {"mailto": "[email protected]"}

	    try:
	        response = requests.get(base_url, params=params, timeout=10)
	        response.raise_for_status()
	        return response.json()
	    except (requests.RequestException, ValueError):
	        # Best-effort lookup: callers treat None as "no metrics available".
	        return None

	# LIMITED CACHE: Only store 200 recent author works lookups
	@st.cache_data(ttl=3600, max_entries=200)
	def get_author_works(author_id, max_works=20):
	    """Fetch an author's most-cited works for relevance validation.

	    Args:
	        author_id: Author identifier. Accepted forms are the bare number, the
	            ``A``-prefixed id, or a URL ending in the id; all are normalized
	            to the ``A``-prefixed form before building the filter.
	        max_works: Number of works to request (clamped to 1..200, the page
	            size limit this file uses for OpenAlex).

	    Returns:
	        A list of work dicts (``id``, ``title``, ``abstract_inverted_index``,
	        ``publication_year``) sorted by citation count, descending. An empty
	        list is returned for an empty id or on any request/decoding error.
	    """
	    if not author_id:
	        return []

	    # Callers in this file pass ids that already start with "A"; prefixing
	    # unconditionally produced "AA..." and therefore a filter matching nobody.
	    author_key = str(author_id).rstrip("/").split("/")[-1]
	    if not author_key.upper().startswith("A"):
	        author_key = f"A{author_key}"

	    base_url = "https://api.openalex.org/works"

	    params = {
	        "filter": f"author.id:{author_key}",
	        "per_page": max(1, min(int(max_works), 200)),
	        "sort": "cited_by_count:desc",  # Get most cited papers
	        "select": "id,title,abstract_inverted_index,publication_year",
	        "mailto": "[email protected]"
	    }

	    try:
	        response = requests.get(base_url, params=params, timeout=10)
	        response.raise_for_status()
	        return response.json().get("results") or []
	    except (requests.RequestException, ValueError):
	        # Best-effort lookup: callers treat "no works" as "cannot validate".
	        return []

	def validate_author_relevance(author_id, query_embedding, model, threshold=0.25, max_works=20):
	    """Judge an author's relevance from the similarity of their own works.

	    The author's works are fetched, each is turned into a text (title twice,
	    then abstract), the texts are embedded with ``model`` and compared with
	    ``query_embedding``. The author counts as valid when the mean similarity
	    reaches ``threshold``.

	    Returns:
	        ``(is_valid, avg_similarity, num_works_checked)``; ``(False, 0.0, 0)``
	        when no works or no non-blank texts are available.
	    """
	    author_works = get_author_works(author_id, max_works)
	    if not author_works:
	        return False, 0.0, 0

	    def compose(entry):
	        # Title is repeated so it weighs more than the abstract.
	        heading = entry.get('title', '') or entry.get('display_name', '')
	        body = reconstruct_abstract(entry.get('abstract_inverted_index', {}))
	        return f"{heading} {heading} {body}"

	    texts = [candidate for candidate in map(compose, author_works) if candidate.strip()]
	    if not texts:
	        return False, 0.0, 0

	    embeddings = model.encode(texts, convert_to_tensor=False, show_progress_bar=False)
	    mean_score = np.mean(calculate_semantic_similarity(query_embedding, embeddings))

	    return mean_score >= threshold, mean_score, len(texts)

	def calculate_semantic_similarity(query_embedding, paper_embeddings):
	    """Return the cosine similarity between a query and each paper embedding.

	    Args:
	        query_embedding: 1-D array-like of length ``d``.
	        paper_embeddings: 2-D array-like of shape ``(n, d)``; a single 1-D
	            embedding of length ``d`` is treated as one row.

	    Returns:
	        A float64 array of ``n`` similarities. A pair in which either vector
	        has zero norm gets similarity 0.0 instead of NaN, and an empty batch
	        yields an empty array.
	    """
	    query = np.asarray(query_embedding, dtype=float).ravel()
	    papers = np.atleast_2d(np.asarray(paper_embeddings, dtype=float))

	    if papers.size == 0:
	        return np.zeros(0, dtype=float)

	    # cos(q, p) = (p . q) / (|p| * |q|)
	    dot_products = papers @ query
	    norm_products = np.linalg.norm(papers, axis=1) * np.linalg.norm(query)

	    # Entries with a zero denominator keep the 0.0 they were initialized to,
	    # so no division by zero (and no NaN) can occur.
	    similarities = np.zeros(papers.shape[0], dtype=float)
	    np.divide(dot_products, norm_products, out=similarities, where=norm_products > 0)
	    return similarities

	def rank_authors(papers, paper_scores, model, query_embedding, min_papers=2, validate_authors=True, validation_threshold=0.25):
	    """Collect the authors of ``papers`` and rank them by a composite score.

	    Steps:
	      1. Group paper scores by author (only ids starting with ``A``).
	      2. Keep authors with at least ``min_papers`` papers in the result set.
	      3. Fetch h-index, citation count and works count for each of them.
	      4. If ``validate_authors`` is true, check each author's own works
	         against the query and drop those below ``validation_threshold``.
	      5. Compute a weighted composite score and sort descending.

	    Args:
	        papers: Work dicts, aligned with ``paper_scores``.
	        paper_scores: Relevance score of each paper.
	        model: Embedding model passed through to the validation step.
	        query_embedding: Embedding of the search query.
	        min_papers: Minimum number of matching papers per author.
	        validate_authors: Whether to run the body-of-work validation.
	        validation_threshold: Minimum mean similarity for a valid author.

	    Returns:
	        A list of author dicts sorted by ``composite_score``, descending.

	    Side effects:
	        Shows Streamlit spinners, progress bars and a success message, and
	        sleeps 0.1s after each author lookup as rate limiting.
	    """
	    author_data = defaultdict(lambda: {
	        'name': '',
	        'id': '',
	        'paper_scores': [],
	        'paper_ids': [],
	        'total_citations': 0,
	        'works_count': 0,
	        'h_index': 0,
	        'institution': '',
	        'validation_score': 0.0,
	        'validated': False
	    })

	    # Collect author information from papers. Fields may be present but null
	    # in the API response, hence the "or" fallbacks.
	    for paper, score in zip(papers, paper_scores):
	        for authorship in paper.get('authorships') or []:
	            author = authorship.get('author') or {}
	            raw_id = author.get('id')
	            author_id = raw_id.split('/')[-1] if raw_id else None

	            if not (author_id and author_id.startswith('A')):
	                continue

	            record = author_data[author_id]
	            record['name'] = author.get('display_name') or 'Unknown'
	            record['id'] = author_id
	            record['paper_scores'].append(score)
	            record['paper_ids'].append(paper.get('id', ''))

	            # Keep the first institution seen for this author.
	            institutions = authorship.get('institutions') or []
	            if institutions and not record['institution']:
	                record['institution'] = institutions[0].get('display_name') or ''

	    # Filter authors with minimum paper count
	    filtered_authors = {
	        aid: data for aid, data in author_data.items()
	        if len(data['paper_scores']) >= min_papers
	    }

	    # Fetch detailed metrics for each author
	    with st.spinner(f"Fetching metrics for {len(filtered_authors)} authors..."):
	        progress_bar = st.progress(0)
	        for idx, (author_id, data) in enumerate(filtered_authors.items()):
	            author_details = get_author_details(author_id)
	            if author_details:
	                summary_stats = author_details.get('summary_stats') or {}
	                # "or 0" keeps the arithmetic below safe when a value is null.
	                data['h_index'] = summary_stats.get('h_index') or 0
	                data['total_citations'] = author_details.get('cited_by_count') or 0
	                data['works_count'] = author_details.get('works_count') or 0

	            progress_bar.progress((idx + 1) / len(filtered_authors))
	            time.sleep(0.1)  # Rate limiting

	        progress_bar.empty()

	    # Validate authors if requested
	    if validate_authors:
	        with st.spinner("Validating author relevance (checking their body of work)..."):
	            progress_bar = st.progress(0)
	            validated_count = 0

	            for idx, (author_id, data) in enumerate(filtered_authors.items()):
	                is_valid, val_score, num_works = validate_author_relevance(
	                    author_id, query_embedding, model, validation_threshold
	                )
	                data['validated'] = is_valid
	                data['validation_score'] = val_score
	                data['num_works_checked'] = num_works

	                if is_valid:
	                    validated_count += 1

	                progress_bar.progress((idx + 1) / len(filtered_authors))
	                time.sleep(0.1)  # Rate limiting

	            progress_bar.empty()
	            st.success(f"✅ {validated_count}/{len(filtered_authors)} authors validated as relevant to your query")

	        # Filter to only validated authors
	        filtered_authors = {
	            aid: data for aid, data in filtered_authors.items()
	            if data['validated']
	        }

	    # Calculate composite score for ranking
	    ranked_authors = []
	    for author_id, data in filtered_authors.items():
	        avg_relevance = np.mean(data['paper_scores'])

	        # Normalize metrics to [0, 1]: h-index against an assumed maximum of
	        # 100, citations on a log scale against 15.
	        h_index_part = min(data['h_index'] / 100.0, 1.0)
	        citations_part = min(np.log1p(data['total_citations']) / 15.0, 1.0)

	        if validate_authors:
	            # 40% relevance in the initial papers, 30% body-of-work
	            # validation, 20% h-index, 10% citations.
	            composite_score = (
	                0.4 * avg_relevance +
	                0.3 * data['validation_score'] +
	                0.2 * h_index_part +
	                0.1 * citations_part
	            )
	        else:
	            # Without validation: 50% relevance, 30% h-index, 20% citations.
	            composite_score = (
	                0.5 * avg_relevance +
	                0.3 * h_index_part +
	                0.2 * citations_part
	            )

	        ranked_authors.append({
	            'name': data['name'],
	            'id': author_id,
	            'h_index': data['h_index'],
	            'total_citations': data['total_citations'],
	            'works_count': data['works_count'],
	            'num_relevant_papers': len(data['paper_scores']),
	            'avg_relevance_score': avg_relevance,
	            'validation_score': data['validation_score'],
	            'validated': data['validated'],
	            'composite_score': composite_score,
	            'institution': data['institution'],
	            # author_id already carries its "A" prefix (checked above), so it
	            # must not be prefixed again.
	            'openalex_url': f"https://openalex.org/{author_id}"
	        })

	    # Sort by composite score
	    ranked_authors.sort(key=lambda x: x['composite_score'], reverse=True)

	    return ranked_authors

	# Country choices for the sidebar filter, in display order: label shown to
	# the user -> two-letter country code placed in the OpenAlex
	# `authorships.countries` filter. None ("All Countries") disables the filter.
	COUNTRIES = {
	"All Countries": None,
	"Australia": "AU",
	"Canada": "CA",
	"China": "CN",
	"France": "FR",
	"Germany": "DE",
	"India": "IN",
	"Japan": "JP",
	"United Kingdom": "GB",
	"United States": "US",
	}

	def _paper_title(paper, default=''):
	    """Return a paper's display name (or title); ``default`` if both are missing or null."""
	    return paper.get('display_name') or paper.get('title') or default


	def _build_author_csv(ranked_authors, validate_authors):
	    """Serialize the ranked authors to CSV text (validation column only when enabled)."""
	    import csv
	    import io

	    csv_buffer = io.StringIO()
	    csv_writer = csv.writer(csv_buffer)

	    header = [
	        'Rank', 'Name', 'Institution', 'H-Index', 'Total Citations',
	        'Total Works', 'Relevant Papers', 'Avg Relevance Score', 'Composite Score'
	    ]
	    if validate_authors:
	        header.append('Body of Work Validation Score')
	    header.append('OpenAlex URL')
	    csv_writer.writerow(header)

	    for idx, author in enumerate(ranked_authors, 1):
	        row = [
	            idx,
	            author['name'],
	            author['institution'],
	            author['h_index'],
	            author['total_citations'],
	            author['works_count'],
	            author['num_relevant_papers'],
	            f"{author['avg_relevance_score']:.4f}",
	            f"{author['composite_score']:.4f}",
	        ]
	        if validate_authors:
	            row.append(f"{author['validation_score']:.4f}")
	        row.append(author['openalex_url'])
	        csv_writer.writerow(row)

	    return csv_buffer.getvalue()


	def main():
	    """Render the Streamlit page and run a search when the button is pressed.

	    The sidebar collects the search configuration (mode, paper count, country,
	    year range, full-text flag, author validation, display sizes). On search
	    the papers are fetched, embedded and ranked by similarity to the query;
	    the top papers and the ranked authors are shown, and the author ranking
	    is offered as a CSV download.
	    """
	    import datetime

	    # Header
	    st.title("🔬 OpenAlex Semantic Search")
	    st.markdown("""
	    Search for research papers and discover top researchers using semantic similarity matching.
	    This tool uses SPECTER (Scientific Paper Embeddings using Citation-informed TransformERs),
	    a model specifically trained on scientific papers for better relevance matching.
	    """)

	    # Sidebar configuration
	    st.sidebar.header("⚙️ Search Configuration")

	    # Search mode selection
	    search_mode = st.sidebar.radio(
	        "Search Mode",
	        ["Quick Search", "Deep Search"],
	        help="Quick: 50-100 papers (~30s) | Deep: 500-1,000 papers (2-5 min)"
	    )
	    deep_search = search_mode == "Deep Search"

	    # Number of papers based on mode
	    if not deep_search:
	        num_papers = st.sidebar.slider(
	            "Number of papers to analyze",
	            min_value=20,
	            max_value=100,
	            value=50,
	            step=10,
	            help="More papers = more comprehensive but slower"
	        )
	    else:  # Deep Search - LIMIT TO 1000 to prevent storage issues
	        num_papers = st.sidebar.slider(
	            "Number of papers to analyze",
	            min_value=100,
	            max_value=1000,  # REDUCED from 5000
	            value=500,
	            step=100,
	            help="⚠️ Limited to 1000 papers to prevent storage issues. Deep search takes 2-5 minutes."
	        )

	    # Country filter
	    selected_country = st.sidebar.selectbox(
	        "Filter by author country (optional)",
	        options=list(COUNTRIES.keys()),
	        help="Only include papers where at least one author is from this country"
	    )
	    country_code = COUNTRIES[selected_country]

	    # Year range filter
	    st.sidebar.subheader("📅 Year Range")
	    # Taken from the clock so the upper bound does not go stale.
	    current_year = datetime.date.today().year
	    use_year_filter = st.sidebar.checkbox(
	        "Limit by publication year",
	        value=False,
	        help="Filter papers by publication year range"
	    )

	    if use_year_filter:
	        year_col1, year_col2 = st.sidebar.columns(2)
	        with year_col1:
	            year_min = st.number_input(
	                "From",
	                min_value=1900,
	                max_value=current_year,
	                value=2015,
	                step=1
	            )
	        with year_col2:
	            year_max = st.number_input(
	                "To",
	                min_value=1900,
	                max_value=current_year,
	                value=current_year,
	                step=1
	            )
	    else:
	        year_min = None
	        year_max = None

	    # Full-text search option
	    use_fulltext = st.sidebar.checkbox(
	        "Include full text (when available)",
	        value=False,
	        help="Search within full paper text (not just title/abstract). ~10-15% of papers have full text available. Slightly slower."
	    )

	    # Author validation
	    st.sidebar.subheader("👤 Author Validation")
	    validate_authors = st.sidebar.checkbox(
	        "Validate authors' body of work",
	        value=True,
	        help="Check each author's recent papers to confirm they're actually working in this area. More accurate but slower."
	    )

	    if validate_authors:
	        validation_threshold = st.sidebar.slider(
	            "Validation threshold",
	            min_value=0.15,
	            max_value=0.50,
	            value=0.25,
	            step=0.05,
	            help="Minimum average similarity score for author's works. Higher = stricter filter."
	        )
	    else:
	        validation_threshold = 0.25

	    # Minimum papers per author
	    min_papers_per_author = st.sidebar.slider(
	        "Minimum papers per author",
	        min_value=1,
	        max_value=5,
	        value=2,
	        help="Filters out authors who appear in fewer than N papers"
	    )

	    # Display settings
	    st.sidebar.header("📊 Display Settings")
	    top_papers_display = st.sidebar.slider("Number of top papers to show", 5, 50, 10)
	    top_authors_display = st.sidebar.slider("Number of top authors to show", 5, 50, 10)

	    # Storage usage info
	    st.sidebar.markdown("---")
	    st.sidebar.info("💾 Cache limited to prevent storage issues:\n- Max 50 searches stored\n- Max 200 authors cached\n- Max 1000 papers in Deep Search")

	    # Main search interface
	    st.header("🔍 Search Query")

	    query = st.text_input(
	        "Enter your search query:",
	        placeholder="e.g., 'graph neural networks for protein structure prediction'",
	        help="Enter keywords or a description of what you're looking for"
	    )

	    search_button = st.button("🔍 Search", type="primary")

	    if not search_button:
	        return

	    query = (query or "").strip()
	    if not query:
	        st.warning("Please enter a search query.")
	        return

	    if use_year_filter and year_min > year_max:
	        # An inverted range can only ever produce an empty result.
	        st.error(f"Invalid year range: {year_min}-{year_max}. 'From' must not be later than 'To'.")
	        return

	    # Display search parameters
	    year_range_text = f"Years: {year_min}-{year_max}" if use_year_filter else "Years: All"
	    validation_text = f"Validation: On (threshold {validation_threshold})" if validate_authors else "Validation: Off"

	    st.info(f"🔍 Searching: {query} | Mode: {search_mode} | Papers: {num_papers} | {year_range_text} | Country: {selected_country} | Full-text: {'Yes' if use_fulltext else 'No'} | {validation_text} | Min papers/author: {min_papers_per_author}")

	    # Load model
	    with st.spinner("Loading semantic model..."):
	        model = load_model()

	    # Deep Search gets a step-by-step progress bar; Quick Search does not.
	    progress_bar = None
	    if deep_search:
	        progress_text = f"🔍 Deep search in progress: Fetching up to {num_papers} papers from OpenAlex..."
	        progress_bar = st.progress(0, text=progress_text)

	    # Search papers
	    year_filter_text = f" from {year_min}-{year_max}" if use_year_filter else ""
	    country_text = ' from ' + selected_country if country_code else ''
	    fulltext_text = ' (including full text)' if use_fulltext else ''
	    with st.spinner(f"Searching OpenAlex for papers about '{query}'{year_filter_text}{country_text}{fulltext_text}..."):
	        papers = search_openalex_papers(query, num_papers, country_code, use_fulltext, year_min, year_max)

	    if not papers:
	        if progress_bar is not None:
	            progress_bar.empty()  # Do not leave a stale bar on screen
	        st.warning("No papers found. Try different search terms.")
	        return

	    if progress_bar is not None:
	        progress_bar.progress(33, text="📄 Papers fetched! Now generating embeddings...")

	    st.success(f"Found {len(papers)} papers!")

	    # Show debug info in expander
	    with st.expander("🔍 Search Details", expanded=False):
	        st.write(f"Search Mode: {search_mode}")
	        st.write(f"Query: {query}")
	        st.write(f"Full-text search: {'Enabled' if use_fulltext else 'Disabled'}")
	        st.write(f"Year range: {year_min}-{year_max}" if use_year_filter else "Year range: All years")
	        st.write(f"Papers requested: {num_papers}")
	        st.write(f"Papers fetched: {len(papers)}")
	        st.write(f"Country filter: {selected_country} ({country_code or 'None'})")
	        st.write(f"Author validation: {'Enabled (threshold: ' + str(validation_threshold) + ')' if validate_authors else 'Disabled'}")
	        # Titles can be null in the API response; _paper_title guards the slice.
	        st.write(f"First paper: {_paper_title(papers[0], 'N/A')[:100]}...")
	        st.write(f"Last paper: {_paper_title(papers[-1], 'N/A')[:100]}...")

	    # Prepare papers for semantic search
	    if progress_bar is not None:
	        progress_bar.progress(50, text="🧠 Generating semantic embeddings...")

	    with st.spinner("Analyzing papers with semantic search..."):
	        paper_texts = []
	        valid_papers = []

	        for paper in papers:
	            title = _paper_title(paper)
	            abstract = reconstruct_abstract(paper.get('abstract_inverted_index', {}))

	            # Combine title and abstract; the title appears twice for emphasis.
	            text = f"{title} {title} {abstract}"

	            if text.strip():
	                paper_texts.append(text)
	                valid_papers.append(paper)

	        if not paper_texts:
	            if progress_bar is not None:
	                progress_bar.empty()
	            st.error("No valid paper content found.")
	            return

	        # Generate embeddings
	        query_embedding = model.encode(query, convert_to_tensor=False)

	        if progress_bar is not None:
	            progress_bar.progress(66, text=f"🔢 Computing similarity for {len(paper_texts)} papers...")

	        paper_embeddings = model.encode(paper_texts, convert_to_tensor=False, show_progress_bar=False)

	        # Calculate similarities; a NaN score would otherwise sort to the top
	        # of the descending order below.
	        similarities = np.nan_to_num(
	            calculate_semantic_similarity(query_embedding, paper_embeddings), nan=0.0
	        )

	        # Sort papers by similarity, best first
	        sorted_indices = np.argsort(similarities)[::-1]
	        sorted_papers = [valid_papers[i] for i in sorted_indices]
	        sorted_scores = [similarities[i] for i in sorted_indices]

	    if progress_bar is not None:
	        progress_bar.progress(100, text="✅ Complete!")
	        time.sleep(0.5)
	        progress_bar.empty()

	    # Display top papers
	    st.header(f"📄 Top {top_papers_display} Most Relevant Papers")

	    top_papers = zip(sorted_papers[:top_papers_display], sorted_scores[:top_papers_display])
	    for idx, (paper, score) in enumerate(top_papers, 1):
	        with st.expander(f"{idx}. {_paper_title(paper, 'Untitled')} (Relevance: {score:.3f})"):
	            col1, col2 = st.columns([3, 1])

	            with col1:
	                abstract = reconstruct_abstract(paper.get('abstract_inverted_index', {}))
	                if abstract:
	                    st.markdown(f"Abstract: {abstract[:500]}{'...' if len(abstract) > 500 else ''}")
	                else:
	                    st.markdown("No abstract available")

	                # Authors (names may be null, which would break the join)
	                authors = [
	                    (a.get('author') or {}).get('display_name') or 'Unknown'
	                    for a in paper.get('authorships') or []
	                ]
	                if authors:
	                    st.markdown(f"Authors: {', '.join(authors[:5])}{'...' if len(authors) > 5 else ''}")

	            with col2:
	                st.metric("Year", paper.get('publication_year', 'N/A'))
	                st.metric("Citations", paper.get('cited_by_count', 0))

	                paper_id = (paper.get('id') or '').split('/')[-1]
	                if paper_id:
	                    st.markdown(f"[View on OpenAlex](https://openalex.org/{paper_id})")

	    # Rank authors
	    st.header(f"👨‍🔬 Top {top_authors_display} Researchers")

	    ranked_authors = rank_authors(
	        sorted_papers,
	        sorted_scores,
	        model,
	        query_embedding,
	        min_papers=min_papers_per_author,
	        validate_authors=validate_authors,
	        validation_threshold=validation_threshold
	    )

	    if not ranked_authors:
	        st.warning(f"No authors found with at least {min_papers_per_author} relevant papers.")
	        return

	    # Display authors in a table
	    st.markdown(f"Found {len(ranked_authors)} researchers with at least {min_papers_per_author} relevant papers.")

	    for idx, author in enumerate(ranked_authors[:top_authors_display], 1):
	        with st.container():
	            col1, col2, col3, col4 = st.columns([3, 1, 1, 1])

	            with col1:
	                st.markdown(f"{idx}. [{author['name']}]({author['openalex_url']})")
	                if author['institution']:
	                    st.caption(author['institution'])

	            with col2:
	                st.metric("H-Index", author['h_index'])

	            with col3:
	                # "or 0" keeps the thousands-separator format safe for null.
	                st.metric("Citations", f"{author['total_citations'] or 0:,}")

	            with col4:
	                if validate_authors:
	                    st.metric("Body Relevance", f"{author['validation_score']:.3f}")
	                else:
	                    st.metric("Relevance", f"{author['avg_relevance_score']:.3f}")

	            caption_text = f"Total works: {author['works_count']} | Relevant papers: {author['num_relevant_papers']}"
	            if validate_authors:
	                caption_text += f" | Paper relevance: {author['avg_relevance_score']:.3f}"
	            st.caption(caption_text)
	            st.divider()

	    # Download results
	    st.header("📥 Download Results")

	    st.download_button(
	        label="Download Author Rankings (CSV)",
	        data=_build_author_csv(ranked_authors, validate_authors),
	        file_name=f"openalex_authors_{query.replace(' ', '_')[:30]}.csv",
	        mime="text/csv"
	    )


	if __name__ == "__main__":
	    main()