import csv
import io
import time
from collections import defaultdict

import numpy as np
import requests
import streamlit as st
from sentence_transformers import SentenceTransformer

# Page config
st.set_page_config(
    page_title="OpenAlex Semantic Search",
    page_icon="🔬",
    layout="wide"
)


# Cache the model loading
@st.cache_resource
def load_model():
    """Load the sentence transformer model."""
    return SentenceTransformer('all-MiniLM-L6-v2')
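
# Note: all-MiniLM-L6-v2 (loaded above) encodes each text into a
# 384-dimensional vector; it is a compact model that works well for short
# texts such as titles and abstracts.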


@st.cache_data(ttl=3600)
def search_openalex_papers(query, num_results=50):
    """
    Search OpenAlex for papers related to the query
    """
    base_url = "https://api.openalex.org/works"
    params = {
        "search": query,
        "per_page": num_results,
        "select": "id,title,abstract_inverted_index,authorships,publication_year,cited_by_count,display_name",
        "mailto": "[email protected]"  # Polite pool
    }
    try:
        response = requests.get(base_url, params=params, timeout=30)
        response.raise_for_status()
        data = response.json()
        return data.get("results", [])
    except Exception as e:
        st.error(f"Error fetching papers: {str(e)}")
        return []
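
# Example (illustrative): search_openalex_papers("graph neural networks", 25)
# issues GET https://api.openalex.org/works?search=graph+neural+networks&per_page=25
# (plus the select/mailto parameters) and returns the parsed "results" list.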


def reconstruct_abstract(inverted_index):
    """
    Reconstruct abstract from OpenAlex inverted index format
    """
    if not inverted_index:
        return ""
    # Create list of (position, word) tuples
    words_with_positions = []
    for word, positions in inverted_index.items():
        for pos in positions:
            words_with_positions.append((pos, word))
    # Sort by position and join
    words_with_positions.sort(key=lambda x: x[0])
    return " ".join(word for _, word in words_with_positions)


@st.cache_data(ttl=3600)
def get_author_details(author_id):
    """
    Fetch detailed author information from OpenAlex
    """
    base_url = f"https://api.openalex.org/authors/{author_id}"
    params = {
        "mailto": "[email protected]"
    }
    try:
        response = requests.get(base_url, params=params, timeout=10)
        response.raise_for_status()
        return response.json()
    except Exception:
        return None
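
# Example (hypothetical ID): get_author_details("A123456789") fetches
# https://api.openalex.org/authors/A123456789 and returns the author record
# (including summary_stats.h_index and cited_by_count), or None on error.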


def calculate_semantic_similarity(query_embedding, paper_embeddings):
    """
    Calculate cosine similarity between query and papers
    """
    # Normalize embeddings
    query_norm = query_embedding / np.linalg.norm(query_embedding)
    paper_norms = paper_embeddings / np.linalg.norm(paper_embeddings, axis=1, keepdims=True)
    # Calculate cosine similarity
    similarities = np.dot(paper_norms, query_norm)
    return similarities
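
# Worked example: with query embedding [1.0, 0.0] and paper embeddings
# [[1.0, 0.0], [0.0, 1.0]], the function returns [1.0, 0.0] -- a vector
# pointing the same way scores 1.0, an orthogonal one scores 0.0.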


def rank_authors(papers, paper_scores, min_papers=2):
    """
    Extract authors from papers and rank them based on:
    - Semantic relevance (average of their paper scores)
    - H-index
    - Total citations
    """
    author_data = defaultdict(lambda: {
        'name': '',
        'id': '',
        'paper_scores': [],
        'paper_ids': [],
        'total_citations': 0,
        'works_count': 0,
        'h_index': 0,
        'institution': ''
    })

    # Collect author information from papers
    for paper, score in zip(papers, paper_scores):
        for authorship in paper.get('authorships', []):
            author = authorship.get('author', {})
            author_id = author.get('id', '').split('/')[-1] if author.get('id') else None
            if author_id and author_id.startswith('A'):
                author_data[author_id]['name'] = author.get('display_name', 'Unknown')
                author_data[author_id]['id'] = author_id
                author_data[author_id]['paper_scores'].append(score)
                author_data[author_id]['paper_ids'].append(paper.get('id', ''))
                # Get institution (first listed affiliation)
                institutions = authorship.get('institutions', [])
                if institutions and not author_data[author_id]['institution']:
                    author_data[author_id]['institution'] = institutions[0].get('display_name', '')

    # Filter authors with minimum paper count
    filtered_authors = {
        aid: data for aid, data in author_data.items()
        if len(data['paper_scores']) >= min_papers
    }

    # Fetch detailed metrics for each author
    with st.spinner(f"Fetching metrics for {len(filtered_authors)} authors..."):
        progress_bar = st.progress(0)
        for idx, (author_id, data) in enumerate(filtered_authors.items()):
            author_details = get_author_details(author_id)
            if author_details:
                data['h_index'] = author_details.get('summary_stats', {}).get('h_index', 0)
                data['total_citations'] = author_details.get('cited_by_count', 0)
                data['works_count'] = author_details.get('works_count', 0)
            progress_bar.progress((idx + 1) / len(filtered_authors))
            time.sleep(0.1)  # Rate limiting
        progress_bar.empty()

    # Calculate composite score for ranking
    ranked_authors = []
    for author_id, data in filtered_authors.items():
        avg_relevance = np.mean(data['paper_scores'])
        # Normalize metrics to [0, 1]; citations use a log scale and the
        # h-index is capped at an assumed maximum of 100
        normalized_h_index = min(data['h_index'] / 100.0, 1.0)
        normalized_citations = min(np.log1p(data['total_citations']) / 15.0, 1.0)
        # Composite score: weighted combination
        composite_score = (
            0.5 * avg_relevance +       # 50% semantic relevance
            0.3 * normalized_h_index +  # 30% h-index
            0.2 * normalized_citations  # 20% citations
        )
        ranked_authors.append({
            'author_id': author_id,
            'name': data['name'],
            'institution': data['institution'],
            'h_index': data['h_index'],
            'total_citations': data['total_citations'],
            'works_count': data['works_count'],
            'num_relevant_papers': len(data['paper_scores']),
            'avg_relevance_score': avg_relevance,
            'composite_score': composite_score,
            'openalex_url': f"https://openalex.org/{author_id}"
        })

    # Sort by composite score
    ranked_authors.sort(key=lambda x: x['composite_score'], reverse=True)
    return ranked_authors
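
# Worked example for the composite score (illustrative numbers): an author with
# average relevance 0.60, h-index 40, and 10,000 citations scores
#   0.5 * 0.60 + 0.3 * (40 / 100) + 0.2 * (log1p(10000) / 15)
#   = 0.30 + 0.12 + 0.2 * 0.614 ~= 0.543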


def main():
    st.title("🔬 OpenAlex Semantic Search")
    st.markdown("""
    Search for academic papers and discover top researchers using semantic search powered by OpenAlex.

    **How it works:**
    1. Enter your search terms (e.g., "machine learning for drug discovery")
    2. The app finds relevant papers using semantic similarity
    3. Authors are ranked by relevance, h-index, and citation metrics
    """)

    # Sidebar controls
    st.sidebar.header("Search Settings")
    num_papers = st.sidebar.slider(
        "Number of papers to fetch",
        min_value=20,
        max_value=100,
        value=50,
        step=10
    )
    top_papers_display = st.sidebar.slider(
        "Top papers to display",
        min_value=5,
        max_value=30,
        value=10,
        step=5
    )
    top_authors_display = st.sidebar.slider(
        "Top authors to display",
        min_value=5,
        max_value=50,
        value=20,
        step=5
    )
    min_papers_per_author = st.sidebar.slider(
        "Minimum papers per author",
        min_value=1,
        max_value=5,
        value=2,
        step=1,
        help="Minimum number of relevant papers an author must have to be included"
    )

    # Main search input
    query = st.text_input(
        "Enter your search query:",
        placeholder="e.g., 'graph neural networks for protein structure prediction'",
        help="Enter keywords or a description of what you're looking for"
    )
    search_button = st.button("🔍 Search", type="primary")

    if search_button and query:
        # Load model
        with st.spinner("Loading semantic model..."):
            model = load_model()

        # Search papers
        with st.spinner(f"Searching OpenAlex for papers about '{query}'..."):
            papers = search_openalex_papers(query, num_papers)

        if not papers:
            st.warning("No papers found. Try different search terms.")
            return

        st.success(f"Found {len(papers)} papers!")

        # Prepare papers for semantic search
        with st.spinner("Analyzing papers with semantic search..."):
            paper_texts = []
            valid_papers = []
            for paper in papers:
                title = paper.get('display_name', '') or paper.get('title', '')
                abstract = reconstruct_abstract(paper.get('abstract_inverted_index', {}))
                # Combine title and abstract (title appears twice for emphasis)
                text = f"{title} {title} {abstract}"
                if text.strip():
                    paper_texts.append(text)
                    valid_papers.append(paper)

            if not paper_texts:
                st.error("No valid paper content found.")
                return

            # Generate embeddings
            query_embedding = model.encode(query, convert_to_tensor=False)
            paper_embeddings = model.encode(paper_texts, convert_to_tensor=False, show_progress_bar=True)

            # Calculate similarities
            similarities = calculate_semantic_similarity(query_embedding, paper_embeddings)

            # Sort papers by similarity
            sorted_indices = np.argsort(similarities)[::-1]
            sorted_papers = [valid_papers[i] for i in sorted_indices]
            sorted_scores = [similarities[i] for i in sorted_indices]
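
            # Note: np.argsort sorts ascending, so the [::-1] above flips the
            # order to descending similarity; sorted_papers[0] is the closest
            # match to the query.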

        # Display top papers
        st.header(f"📄 Top {top_papers_display} Most Relevant Papers")
        for idx, (paper, score) in enumerate(zip(sorted_papers[:top_papers_display],
                                                 sorted_scores[:top_papers_display])):
            with st.expander(f"**{idx + 1}. {paper.get('display_name', 'Untitled')}** (Relevance: {score:.3f})"):
                col1, col2 = st.columns([3, 1])
                with col1:
                    abstract = reconstruct_abstract(paper.get('abstract_inverted_index', {}))
                    if abstract:
                        st.markdown(f"**Abstract:** {abstract[:500]}{'...' if len(abstract) > 500 else ''}")
                    else:
                        st.markdown("*No abstract available*")
                    # Authors
                    authors = [a.get('author', {}).get('display_name', 'Unknown')
                               for a in paper.get('authorships', [])]
                    if authors:
                        st.markdown(f"**Authors:** {', '.join(authors[:5])}{'...' if len(authors) > 5 else ''}")
                with col2:
                    st.metric("Year", paper.get('publication_year', 'N/A'))
                    st.metric("Citations", paper.get('cited_by_count', 0))
                    paper_id = paper.get('id', '').split('/')[-1]
                    if paper_id:
                        st.markdown(f"[View on OpenAlex](https://openalex.org/{paper_id})")

        # Rank authors
        st.header(f"👨‍🔬 Top {top_authors_display} Researchers")
        ranked_authors = rank_authors(
            sorted_papers,
            sorted_scores,
            min_papers=min_papers_per_author
        )

        if not ranked_authors:
            st.warning(f"No authors found with at least {min_papers_per_author} relevant papers.")
            return

        # Display ranked authors
        st.markdown(f"Found {len(ranked_authors)} researchers with at least {min_papers_per_author} relevant papers.")
        for idx, author in enumerate(ranked_authors[:top_authors_display], 1):
            with st.container():
                col1, col2, col3, col4 = st.columns([3, 1, 1, 1])
                with col1:
                    st.markdown(f"**{idx}. [{author['name']}]({author['openalex_url']})**")
                    if author['institution']:
                        st.caption(author['institution'])
                with col2:
                    st.metric("H-Index", author['h_index'])
                with col3:
                    st.metric("Citations", f"{author['total_citations']:,}")
                with col4:
                    st.metric("Relevance", f"{author['avg_relevance_score']:.3f}")
                st.caption(f"Total works: {author['works_count']} | Relevant papers: {author['num_relevant_papers']}")
                st.divider()

        # Download results
        st.header("📥 Download Results")

        # Prepare CSV data for authors
        csv_buffer = io.StringIO()
        csv_writer = csv.writer(csv_buffer)

        # Write header
        csv_writer.writerow([
            'Rank', 'Name', 'Institution', 'H-Index', 'Total Citations',
            'Total Works', 'Relevant Papers', 'Avg Relevance Score', 'Composite Score', 'OpenAlex URL'
        ])

        # Write data
        for idx, author in enumerate(ranked_authors, 1):
            csv_writer.writerow([
                idx,
                author['name'],
                author['institution'],
                author['h_index'],
                author['total_citations'],
                author['works_count'],
                author['num_relevant_papers'],
                f"{author['avg_relevance_score']:.4f}",
                f"{author['composite_score']:.4f}",
                author['openalex_url']
            ])

        csv_data = csv_buffer.getvalue()
        st.download_button(
            label="Download Author Rankings (CSV)",
            data=csv_data,
            file_name=f"openalex_authors_{query.replace(' ', '_')[:30]}.csv",
            mime="text/csv"
        )


if __name__ == "__main__":
    main()
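
# To run locally (assuming this file lives at src/streamlit_app.py):
#   pip install streamlit requests sentence-transformers numpy
#   streamlit run src/streamlit_app.py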