import csv
import io
import time
from collections import defaultdict

import numpy as np
import requests
import streamlit as st
from sentence_transformers import SentenceTransformer

# Page config
st.set_page_config(
    page_title="OpenAlex Semantic Search",
    page_icon="🔬",
    layout="wide"
)


# Cache the model loading
@st.cache_resource
def load_model():
    """Load the sentence transformer model."""
    return SentenceTransformer('all-MiniLM-L6-v2')
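
# Note: all-MiniLM-L6-v2 (loaded above) encodes each text into a
# 384-dimensional vector; it is a compact model that works well for short
# texts such as titles and abstracts.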


@st.cache_data(ttl=3600)
def search_openalex_papers(query, num_results=50):
    """
    Search OpenAlex for papers related to the query
    """
    base_url = "https://api.openalex.org/works"
    params = {
        "search": query,
        "per_page": num_results,
        "select": "id,title,abstract_inverted_index,authorships,publication_year,cited_by_count,display_name",
        "mailto": "[email protected]"  # Polite pool
    }
    try:
        response = requests.get(base_url, params=params, timeout=30)
        response.raise_for_status()
        data = response.json()
        return data.get("results", [])
    except Exception as e:
        st.error(f"Error fetching papers: {str(e)}")
        return []
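
# Example (illustrative): search_openalex_papers("graph neural networks", 25)
# issues GET https://api.openalex.org/works?search=graph+neural+networks&per_page=25
# (plus the select/mailto parameters) and returns the parsed "results" list.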


def reconstruct_abstract(inverted_index):
    """
    Reconstruct abstract from OpenAlex inverted index format
    """
    if not inverted_index:
        return ""
    # Create list of (position, word) tuples
    words_with_positions = []
    for word, positions in inverted_index.items():
        for pos in positions:
            words_with_positions.append((pos, word))
    # Sort by position and join
    words_with_positions.sort(key=lambda x: x[0])
    return " ".join(word for _, word in words_with_positions)


@st.cache_data(ttl=3600)
def get_author_details(author_id):
    """
    Fetch detailed author information from OpenAlex
    """
    base_url = f"https://api.openalex.org/authors/{author_id}"
    params = {
        "mailto": "[email protected]"
    }
    try:
        response = requests.get(base_url, params=params, timeout=10)
        response.raise_for_status()
        return response.json()
    except Exception:
        return None
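
# Example (hypothetical ID): get_author_details("A123456789") fetches
# https://api.openalex.org/authors/A123456789 and returns the author record
# (including summary_stats.h_index and cited_by_count), or None on error.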


def calculate_semantic_similarity(query_embedding, paper_embeddings):
    """
    Calculate cosine similarity between query and papers
    """
    # Normalize embeddings
    query_norm = query_embedding / np.linalg.norm(query_embedding)
    paper_norms = paper_embeddings / np.linalg.norm(paper_embeddings, axis=1, keepdims=True)
    # Calculate cosine similarity
    similarities = np.dot(paper_norms, query_norm)
    return similarities
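
# Worked example: with query embedding [1.0, 0.0] and paper embeddings
# [[1.0, 0.0], [0.0, 1.0]], the function returns [1.0, 0.0] -- a vector
# pointing the same way scores 1.0, an orthogonal one scores 0.0.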


def rank_authors(papers, paper_scores, min_papers=2):
    """
    Extract authors from papers and rank them based on:
    - Semantic relevance (average of their paper scores)
    - H-index
    - Total citations
    """
    author_data = defaultdict(lambda: {
        'name': '',
        'id': '',
        'paper_scores': [],
        'paper_ids': [],
        'total_citations': 0,
        'works_count': 0,
        'h_index': 0,
        'institution': ''
    })

    # Collect author information from papers
    for paper, score in zip(papers, paper_scores):
        for authorship in paper.get('authorships', []):
            author = authorship.get('author', {})
            author_id = author.get('id', '').split('/')[-1] if author.get('id') else None
            if author_id and author_id.startswith('A'):
                author_data[author_id]['name'] = author.get('display_name', 'Unknown')
                author_data[author_id]['id'] = author_id
                author_data[author_id]['paper_scores'].append(score)
                author_data[author_id]['paper_ids'].append(paper.get('id', ''))
                # Get institution (first listed affiliation)
                institutions = authorship.get('institutions', [])
                if institutions and not author_data[author_id]['institution']:
                    author_data[author_id]['institution'] = institutions[0].get('display_name', '')

    # Filter authors with minimum paper count
    filtered_authors = {
        aid: data for aid, data in author_data.items()
        if len(data['paper_scores']) >= min_papers
    }

    # Fetch detailed metrics for each author
    with st.spinner(f"Fetching metrics for {len(filtered_authors)} authors..."):
        progress_bar = st.progress(0)
        for idx, (author_id, data) in enumerate(filtered_authors.items()):
            author_details = get_author_details(author_id)
            if author_details:
                data['h_index'] = author_details.get('summary_stats', {}).get('h_index', 0)
                data['total_citations'] = author_details.get('cited_by_count', 0)
                data['works_count'] = author_details.get('works_count', 0)
            progress_bar.progress((idx + 1) / len(filtered_authors))
            time.sleep(0.1)  # Rate limiting
        progress_bar.empty()

    # Calculate composite score for ranking
    ranked_authors = []
    for author_id, data in filtered_authors.items():
        avg_relevance = np.mean(data['paper_scores'])
        # Normalize metrics to [0, 1]; citations use a log scale and the
        # h-index is capped at an assumed maximum of 100
        normalized_h_index = min(data['h_index'] / 100.0, 1.0)
        normalized_citations = min(np.log1p(data['total_citations']) / 15.0, 1.0)
        # Composite score: weighted combination
        composite_score = (
            0.5 * avg_relevance +       # 50% semantic relevance
            0.3 * normalized_h_index +  # 30% h-index
            0.2 * normalized_citations  # 20% citations
        )
        ranked_authors.append({
            'author_id': author_id,
            'name': data['name'],
            'institution': data['institution'],
            'h_index': data['h_index'],
            'total_citations': data['total_citations'],
            'works_count': data['works_count'],
            'num_relevant_papers': len(data['paper_scores']),
            'avg_relevance_score': avg_relevance,
            'composite_score': composite_score,
            'openalex_url': f"https://openalex.org/{author_id}"
        })

    # Sort by composite score
    ranked_authors.sort(key=lambda x: x['composite_score'], reverse=True)
    return ranked_authors
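
# Worked example for the composite score (illustrative numbers): an author with
# average relevance 0.60, h-index 40, and 10,000 citations scores
#   0.5 * 0.60 + 0.3 * (40 / 100) + 0.2 * (log1p(10000) / 15)
#   = 0.30 + 0.12 + 0.2 * 0.614 ~= 0.543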


def main():
    st.title("🔬 OpenAlex Semantic Search")
    st.markdown("""
    Search for academic papers and discover top researchers using semantic search powered by OpenAlex.

    **How it works:**
    1. Enter your search terms (e.g., "machine learning for drug discovery")
    2. The app finds relevant papers using semantic similarity
    3. Authors are ranked by relevance, h-index, and citation metrics
    """)

    # Sidebar controls
    st.sidebar.header("Search Settings")
    num_papers = st.sidebar.slider(
        "Number of papers to fetch",
        min_value=20,
        max_value=100,
        value=50,
        step=10
    )
    top_papers_display = st.sidebar.slider(
        "Top papers to display",
        min_value=5,
        max_value=30,
        value=10,
        step=5
    )
    top_authors_display = st.sidebar.slider(
        "Top authors to display",
        min_value=5,
        max_value=50,
        value=20,
        step=5
    )
    min_papers_per_author = st.sidebar.slider(
        "Minimum papers per author",
        min_value=1,
        max_value=5,
        value=2,
        step=1,
        help="Minimum number of relevant papers an author must have to be included"
    )

    # Main search input
    query = st.text_input(
        "Enter your search query:",
        placeholder="e.g., 'graph neural networks for protein structure prediction'",
        help="Enter keywords or a description of what you're looking for"
    )
    search_button = st.button("🔍 Search", type="primary")

    if search_button and query:
        # Load model
        with st.spinner("Loading semantic model..."):
            model = load_model()

        # Search papers
        with st.spinner(f"Searching OpenAlex for papers about '{query}'..."):
            papers = search_openalex_papers(query, num_papers)

        if not papers:
            st.warning("No papers found. Try different search terms.")
            return

        st.success(f"Found {len(papers)} papers!")

        # Prepare papers for semantic search
        with st.spinner("Analyzing papers with semantic search..."):
            paper_texts = []
            valid_papers = []
            for paper in papers:
                title = paper.get('display_name', '') or paper.get('title', '')
                abstract = reconstruct_abstract(paper.get('abstract_inverted_index', {}))
                # Combine title and abstract (title appears twice for emphasis)
                text = f"{title} {title} {abstract}"
                if text.strip():
                    paper_texts.append(text)
                    valid_papers.append(paper)

            if not paper_texts:
                st.error("No valid paper content found.")
                return

            # Generate embeddings
            query_embedding = model.encode(query, convert_to_tensor=False)
            paper_embeddings = model.encode(paper_texts, convert_to_tensor=False, show_progress_bar=True)

            # Calculate similarities
            similarities = calculate_semantic_similarity(query_embedding, paper_embeddings)

            # Sort papers by similarity
            sorted_indices = np.argsort(similarities)[::-1]
            sorted_papers = [valid_papers[i] for i in sorted_indices]
            sorted_scores = [similarities[i] for i in sorted_indices]
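
            # Note: np.argsort sorts ascending, so the [::-1] above flips the
            # order to descending similarity; sorted_papers[0] is the closest
            # match to the query.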

        # Display top papers
        st.header(f"📄 Top {top_papers_display} Most Relevant Papers")
        for idx, (paper, score) in enumerate(zip(sorted_papers[:top_papers_display],
                                                 sorted_scores[:top_papers_display])):
            with st.expander(f"**{idx + 1}. {paper.get('display_name', 'Untitled')}** (Relevance: {score:.3f})"):
                col1, col2 = st.columns([3, 1])
                with col1:
                    abstract = reconstruct_abstract(paper.get('abstract_inverted_index', {}))
                    if abstract:
                        st.markdown(f"**Abstract:** {abstract[:500]}{'...' if len(abstract) > 500 else ''}")
                    else:
                        st.markdown("*No abstract available*")
                    # Authors
                    authors = [a.get('author', {}).get('display_name', 'Unknown')
                               for a in paper.get('authorships', [])]
                    if authors:
                        st.markdown(f"**Authors:** {', '.join(authors[:5])}{'...' if len(authors) > 5 else ''}")
                with col2:
                    st.metric("Year", paper.get('publication_year', 'N/A'))
                    st.metric("Citations", paper.get('cited_by_count', 0))
                    paper_id = paper.get('id', '').split('/')[-1]
                    if paper_id:
                        st.markdown(f"[View on OpenAlex](https://openalex.org/{paper_id})")

        # Rank authors
        st.header(f"👨‍🔬 Top {top_authors_display} Researchers")
        ranked_authors = rank_authors(
            sorted_papers,
            sorted_scores,
            min_papers=min_papers_per_author
        )

        if not ranked_authors:
            st.warning(f"No authors found with at least {min_papers_per_author} relevant papers.")
            return

        # Display ranked authors
        st.markdown(f"Found {len(ranked_authors)} researchers with at least {min_papers_per_author} relevant papers.")
        for idx, author in enumerate(ranked_authors[:top_authors_display], 1):
            with st.container():
                col1, col2, col3, col4 = st.columns([3, 1, 1, 1])
                with col1:
                    st.markdown(f"**{idx}. [{author['name']}]({author['openalex_url']})**")
                    if author['institution']:
                        st.caption(author['institution'])
                with col2:
                    st.metric("H-Index", author['h_index'])
                with col3:
                    st.metric("Citations", f"{author['total_citations']:,}")
                with col4:
                    st.metric("Relevance", f"{author['avg_relevance_score']:.3f}")
                st.caption(f"Total works: {author['works_count']} | Relevant papers: {author['num_relevant_papers']}")
                st.divider()

        # Download results
        st.header("📥 Download Results")

        # Prepare CSV data for authors
        csv_buffer = io.StringIO()
        csv_writer = csv.writer(csv_buffer)

        # Write header
        csv_writer.writerow([
            'Rank', 'Name', 'Institution', 'H-Index', 'Total Citations',
            'Total Works', 'Relevant Papers', 'Avg Relevance Score', 'Composite Score', 'OpenAlex URL'
        ])

        # Write data
        for idx, author in enumerate(ranked_authors, 1):
            csv_writer.writerow([
                idx,
                author['name'],
                author['institution'],
                author['h_index'],
                author['total_citations'],
                author['works_count'],
                author['num_relevant_papers'],
                f"{author['avg_relevance_score']:.4f}",
                f"{author['composite_score']:.4f}",
                author['openalex_url']
            ])

        csv_data = csv_buffer.getvalue()
        st.download_button(
            label="Download Author Rankings (CSV)",
            data=csv_data,
            file_name=f"openalex_authors_{query.replace(' ', '_')[:30]}.csv",
            mime="text/csv"
        )


if __name__ == "__main__":
    main()
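
# To run locally (assuming this file lives at src/streamlit_app.py):
#   pip install streamlit requests sentence-transformers numpy
#   streamlit run src/streamlit_app.py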