import streamlit as st import requests from sentence_transformers import SentenceTransformer import numpy as np from collections import defaultdict import time # Page config st.set_page_config( page_title="OpenAlex Semantic Search", page_icon="🔬", layout="wide" ) # Cache the model loading @st.cache_resource def load_model(): """Load the sentence transformer model""" return SentenceTransformer('all-MiniLM-L6-v2') @st.cache_data(ttl=3600) def search_openalex_papers(query, num_results=50): """ Search OpenAlex for papers related to the query """ base_url = "https://api.openalex.org/works" params = { "search": query, "per_page": num_results, "select": "id,title,abstract_inverted_index,authorships,publication_year,cited_by_count,display_name", "mailto": "user@example.com" # Polite pool } try: response = requests.get(base_url, params=params, timeout=30) response.raise_for_status() data = response.json() return data.get("results", []) except Exception as e: st.error(f"Error fetching papers: {str(e)}") return [] def reconstruct_abstract(inverted_index): """ Reconstruct abstract from OpenAlex inverted index format """ if not inverted_index: return "" # Create list of (position, word) tuples words_with_positions = [] for word, positions in inverted_index.items(): for pos in positions: words_with_positions.append((pos, word)) # Sort by position and join words_with_positions.sort(key=lambda x: x[0]) return " ".join([word for _, word in words_with_positions]) @st.cache_data(ttl=3600) def get_author_details(author_id): """ Fetch detailed author information from OpenAlex """ base_url = f"https://api.openalex.org/authors/{author_id}" params = { "mailto": "user@example.com" } try: response = requests.get(base_url, params=params, timeout=10) response.raise_for_status() return response.json() except Exception as e: return None def calculate_semantic_similarity(query_embedding, paper_embeddings): """ Calculate cosine similarity between query and papers """ # Normalize embeddings query_norm = query_embedding / np.linalg.norm(query_embedding) paper_norms = paper_embeddings / np.linalg.norm(paper_embeddings, axis=1, keepdims=True) # Calculate cosine similarity similarities = np.dot(paper_norms, query_norm) return similarities def rank_authors(papers, paper_scores, model, query_embedding, min_papers=2): """ Extract authors from papers and rank them based on: - Semantic relevance (average of their paper scores) - H-index - Total citations """ author_data = defaultdict(lambda: { 'name': '', 'id': '', 'paper_scores': [], 'paper_ids': [], 'total_citations': 0, 'works_count': 0, 'h_index': 0, 'institution': '' }) # Collect author information from papers for paper, score in zip(papers, paper_scores): for authorship in paper.get('authorships', []): author = authorship.get('author', {}) author_id = author.get('id', '').split('/')[-1] if author.get('id') else None if author_id and author_id.startswith('A'): author_data[author_id]['name'] = author.get('display_name', 'Unknown') author_data[author_id]['id'] = author_id author_data[author_id]['paper_scores'].append(score) author_data[author_id]['paper_ids'].append(paper.get('id', '')) # Get institution institutions = authorship.get('institutions', []) if institutions and not author_data[author_id]['institution']: author_data[author_id]['institution'] = institutions[0].get('display_name', '') # Filter authors with minimum paper count filtered_authors = { aid: data for aid, data in author_data.items() if len(data['paper_scores']) >= min_papers } # Fetch detailed metrics for each author with st.spinner(f"Fetching metrics for {len(filtered_authors)} authors..."): progress_bar = st.progress(0) for idx, (author_id, data) in enumerate(filtered_authors.items()): author_details = get_author_details(author_id) if author_details: data['h_index'] = author_details.get('summary_stats', {}).get('h_index', 0) data['total_citations'] = author_details.get('cited_by_count', 0) data['works_count'] = author_details.get('works_count', 0) progress_bar.progress((idx + 1) / len(filtered_authors)) time.sleep(0.1) # Rate limiting progress_bar.empty() # Calculate composite score for ranking ranked_authors = [] for author_id, data in filtered_authors.items(): avg_relevance = np.mean(data['paper_scores']) # Normalize metrics (using log scale for citations) normalized_h_index = data['h_index'] / 100.0 # Assume max h-index of 100 normalized_citations = np.log1p(data['total_citations']) / 15.0 # Log scale # Composite score: weighted combination composite_score = ( 0.5 * avg_relevance + # 50% semantic relevance 0.3 * normalized_h_index + # 30% h-index 0.2 * normalized_citations # 20% citations ) ranked_authors.append({ 'author_id': author_id, 'name': data['name'], 'institution': data['institution'], 'h_index': data['h_index'], 'total_citations': data['total_citations'], 'works_count': data['works_count'], 'num_relevant_papers': len(data['paper_scores']), 'avg_relevance_score': avg_relevance, 'composite_score': composite_score, 'openalex_url': f"https://openalex.org/{author_id}" }) # Sort by composite score ranked_authors.sort(key=lambda x: x['composite_score'], reverse=True) return ranked_authors def main(): st.title("🔬 OpenAlex Semantic Search") st.markdown(""" Search for academic papers and discover top researchers using semantic search powered by OpenAlex. **How it works:** 1. Enter your search terms (e.g., "machine learning for drug discovery") 2. The app finds relevant papers using semantic similarity 3. Authors are ranked by relevance, h-index, and citation metrics """) # Sidebar controls st.sidebar.header("Search Settings") num_papers = st.sidebar.slider( "Number of papers to fetch", min_value=20, max_value=100, value=50, step=10 ) top_papers_display = st.sidebar.slider( "Top papers to display", min_value=5, max_value=30, value=10, step=5 ) top_authors_display = st.sidebar.slider( "Top authors to display", min_value=5, max_value=50, value=20, step=5 ) min_papers_per_author = st.sidebar.slider( "Minimum papers per author", min_value=1, max_value=5, value=2, step=1, help="Minimum number of relevant papers an author must have to be included" ) # Main search input query = st.text_input( "Enter your search query:", placeholder="e.g., 'graph neural networks for protein structure prediction'", help="Enter keywords or a description of what you're looking for" ) search_button = st.button("🔍 Search", type="primary") if search_button and query: # Load model with st.spinner("Loading semantic model..."): model = load_model() # Search papers with st.spinner(f"Searching OpenAlex for papers about '{query}'..."): papers = search_openalex_papers(query, num_papers) if not papers: st.warning("No papers found. Try different search terms.") return st.success(f"Found {len(papers)} papers!") # Prepare papers for semantic search with st.spinner("Analyzing papers with semantic search..."): paper_texts = [] valid_papers = [] for paper in papers: title = paper.get('display_name', '') or paper.get('title', '') abstract = reconstruct_abstract(paper.get('abstract_inverted_index', {})) # Combine title and abstract (title weighted more) text = f"{title} {title} {abstract}" # Title appears twice for emphasis if text.strip(): paper_texts.append(text) valid_papers.append(paper) if not paper_texts: st.error("No valid paper content found.") return # Generate embeddings query_embedding = model.encode(query, convert_to_tensor=False) paper_embeddings = model.encode(paper_texts, convert_to_tensor=False, show_progress_bar=True) # Calculate similarities similarities = calculate_semantic_similarity(query_embedding, paper_embeddings) # Sort papers by similarity sorted_indices = np.argsort(similarities)[::-1] sorted_papers = [valid_papers[i] for i in sorted_indices] sorted_scores = [similarities[i] for i in sorted_indices] # Display top papers st.header(f"📄 Top {top_papers_display} Most Relevant Papers") for idx, (paper, score) in enumerate(zip(sorted_papers[:top_papers_display], sorted_scores[:top_papers_display])): with st.expander(f"**{idx+1}. {paper.get('display_name', 'Untitled')}** (Relevance: {score:.3f})"): col1, col2 = st.columns([3, 1]) with col1: abstract = reconstruct_abstract(paper.get('abstract_inverted_index', {})) if abstract: st.markdown(f"**Abstract:** {abstract[:500]}{'...' if len(abstract) > 500 else ''}") else: st.markdown("*No abstract available*") # Authors authors = [a.get('author', {}).get('display_name', 'Unknown') for a in paper.get('authorships', [])] if authors: st.markdown(f"**Authors:** {', '.join(authors[:5])}{'...' if len(authors) > 5 else ''}") with col2: st.metric("Year", paper.get('publication_year', 'N/A')) st.metric("Citations", paper.get('cited_by_count', 0)) paper_id = paper.get('id', '').split('/')[-1] if paper_id: st.markdown(f"[View on OpenAlex](https://openalex.org/{paper_id})") # Rank authors st.header(f"👨‍🔬 Top {top_authors_display} Researchers") ranked_authors = rank_authors( sorted_papers, sorted_scores, model, query_embedding, min_papers=min_papers_per_author ) if not ranked_authors: st.warning(f"No authors found with at least {min_papers_per_author} relevant papers.") return # Display authors in a table st.markdown(f"Found {len(ranked_authors)} researchers with at least {min_papers_per_author} relevant papers.") for idx, author in enumerate(ranked_authors[:top_authors_display], 1): with st.container(): col1, col2, col3, col4 = st.columns([3, 1, 1, 1]) with col1: st.markdown(f"**{idx}. [{author['name']}]({author['openalex_url']})**") if author['institution']: st.caption(author['institution']) with col2: st.metric("H-Index", author['h_index']) with col3: st.metric("Citations", f"{author['total_citations']:,}") with col4: st.metric("Relevance", f"{author['avg_relevance_score']:.3f}") st.caption(f"Total works: {author['works_count']} | Relevant papers: {author['num_relevant_papers']}") st.divider() # Download results st.header("📥 Download Results") # Prepare CSV data for authors import io import csv csv_buffer = io.StringIO() csv_writer = csv.writer(csv_buffer) # Write header csv_writer.writerow([ 'Rank', 'Name', 'Institution', 'H-Index', 'Total Citations', 'Total Works', 'Relevant Papers', 'Avg Relevance Score', 'Composite Score', 'OpenAlex URL' ]) # Write data for idx, author in enumerate(ranked_authors, 1): csv_writer.writerow([ idx, author['name'], author['institution'], author['h_index'], author['total_citations'], author['works_count'], author['num_relevant_papers'], f"{author['avg_relevance_score']:.4f}", f"{author['composite_score']:.4f}", author['openalex_url'] ]) csv_data = csv_buffer.getvalue() st.download_button( label="Download Author Rankings (CSV)", data=csv_data, file_name=f"openalex_authors_{query.replace(' ', '_')[:30]}.csv", mime="text/csv" ) if __name__ == "__main__": main()