import streamlit as st
import requests
from sentence_transformers import SentenceTransformer
import numpy as np
from collections import defaultdict
import time
import io
import csv

# Page config
st.set_page_config(
    page_title="OpenAlex Semantic Search",
    page_icon="🔬",
    layout="wide"
)


# Cache the model so it is loaded once per process, not on every rerun
@st.cache_resource
def load_model():
    """Load the sentence transformer model."""
    return SentenceTransformer('all-MiniLM-L6-v2')

def search_openalex_papers(query, num_results=50):
    """Search OpenAlex for papers related to the query."""
    base_url = "https://api.openalex.org/works"
    params = {
        "search": query,
        "per-page": num_results,  # OpenAlex expects the hyphenated parameter name
        "select": "id,title,abstract_inverted_index,authorships,publication_year,cited_by_count,display_name",
        "mailto": "[email protected]"  # Polite pool
    }
    try:
        response = requests.get(base_url, params=params, timeout=30)
        response.raise_for_status()
        data = response.json()
        return data.get("results", [])
    except Exception as e:
        st.error(f"Error fetching papers: {e}")
        return []
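
# For reference, the params above correspond to a raw request of the form
# (illustrative, with placeholders):
#   https://api.openalex.org/works?search=<query>&per-page=50&select=<fields>&mailto=<email>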

def reconstruct_abstract(inverted_index):
    """Reconstruct an abstract from OpenAlex's inverted-index format."""
    if not inverted_index:
        return ""
    # Build a list of (position, word) tuples
    words_with_positions = []
    for word, positions in inverted_index.items():
        for pos in positions:
            words_with_positions.append((pos, word))
    # Sort by position and join back into running text
    words_with_positions.sort(key=lambda x: x[0])
    return " ".join(word for _, word in words_with_positions)

def get_author_details(author_id):
    """Fetch detailed author information from OpenAlex."""
    base_url = f"https://api.openalex.org/authors/{author_id}"
    params = {"mailto": "[email protected]"}
    try:
        response = requests.get(base_url, params=params, timeout=10)
        response.raise_for_status()
        return response.json()
    except Exception:
        return None

def calculate_semantic_similarity(query_embedding, paper_embeddings):
    """Calculate cosine similarity between the query and each paper."""
    # Normalize embeddings to unit length
    query_norm = query_embedding / np.linalg.norm(query_embedding)
    paper_norms = paper_embeddings / np.linalg.norm(paper_embeddings, axis=1, keepdims=True)
    # Cosine similarity of unit vectors reduces to a dot product
    similarities = np.dot(paper_norms, query_norm)
    return similarities
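
# Note: cos(a, b) = (a · b) / (|a| * |b|). Because both sides are unit-normalized
# above, a single matrix-vector product yields every query-paper score at once.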

def rank_authors(papers, paper_scores, min_papers=2):
    """
    Extract authors from papers and rank them based on:
    - semantic relevance (average of their paper scores)
    - h-index
    - total citations
    """
    author_data = defaultdict(lambda: {
        'name': '',
        'id': '',
        'paper_scores': [],
        'paper_ids': [],
        'total_citations': 0,
        'works_count': 0,
        'h_index': 0,
        'institution': ''
    })
    # Collect author information from papers
    for paper, score in zip(papers, paper_scores):
        for authorship in paper.get('authorships', []):
            author = authorship.get('author', {})
            author_id = author.get('id', '').split('/')[-1] if author.get('id') else None
            if author_id and author_id.startswith('A'):
                author_data[author_id]['name'] = author.get('display_name', 'Unknown')
                author_data[author_id]['id'] = author_id
                author_data[author_id]['paper_scores'].append(score)
                author_data[author_id]['paper_ids'].append(paper.get('id', ''))
                # Record the first listed institution
                institutions = authorship.get('institutions', [])
                if institutions and not author_data[author_id]['institution']:
                    author_data[author_id]['institution'] = institutions[0].get('display_name', '')
    # Keep only authors with the minimum number of relevant papers
    filtered_authors = {
        aid: data for aid, data in author_data.items()
        if len(data['paper_scores']) >= min_papers
    }
    # Fetch detailed metrics for each author
    with st.spinner(f"Fetching metrics for {len(filtered_authors)} authors..."):
        progress_bar = st.progress(0)
        for idx, (author_id, data) in enumerate(filtered_authors.items()):
            author_details = get_author_details(author_id)
            if author_details:
                data['h_index'] = author_details.get('summary_stats', {}).get('h_index', 0)
                data['total_citations'] = author_details.get('cited_by_count', 0)
                data['works_count'] = author_details.get('works_count', 0)
            progress_bar.progress((idx + 1) / len(filtered_authors))
            time.sleep(0.1)  # Rate limiting
        progress_bar.empty()
    # Calculate a composite score for ranking
    ranked_authors = []
    for author_id, data in filtered_authors.items():
        avg_relevance = np.mean(data['paper_scores'])
        # Normalize metrics (log scale for citations)
        normalized_h_index = data['h_index'] / 100.0  # Assumes a rough h-index ceiling of 100
        normalized_citations = np.log1p(data['total_citations']) / 15.0  # Log scale
        # Composite score: weighted combination
        composite_score = (
            0.5 * avg_relevance +       # 50% semantic relevance
            0.3 * normalized_h_index +  # 30% h-index
            0.2 * normalized_citations  # 20% citations
        )
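        # Worked example with hypothetical numbers: average relevance 0.62,
        # h-index 40, and 5,000 citations gives roughly
        #     0.5*0.62 + 0.3*0.40 + 0.2*(log1p(5000)/15) ≈ 0.31 + 0.12 + 0.11 ≈ 0.54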
        ranked_authors.append({
            'author_id': author_id,
            'name': data['name'],
            'institution': data['institution'],
            'h_index': data['h_index'],
            'total_citations': data['total_citations'],
            'works_count': data['works_count'],
            'num_relevant_papers': len(data['paper_scores']),
            'avg_relevance_score': avg_relevance,
            'composite_score': composite_score,
            'openalex_url': f"https://openalex.org/{author_id}"
        })
    # Sort by composite score, best first
    ranked_authors.sort(key=lambda x: x['composite_score'], reverse=True)
    return ranked_authors

def main():
    st.title("🔬 OpenAlex Semantic Search")
    st.markdown("""
    Search for academic papers and discover top researchers using semantic search powered by OpenAlex.

    **How it works:**
    1. Enter your search terms (e.g., "machine learning for drug discovery")
    2. The app finds relevant papers using semantic similarity
    3. Authors are ranked by relevance, h-index, and citation metrics
    """)
    # Sidebar controls
    st.sidebar.header("Search Settings")
    num_papers = st.sidebar.slider(
        "Number of papers to fetch",
        min_value=20,
        max_value=100,
        value=50,
        step=10
    )
    top_papers_display = st.sidebar.slider(
        "Top papers to display",
        min_value=5,
        max_value=30,
        value=10,
        step=5
    )
    top_authors_display = st.sidebar.slider(
        "Top authors to display",
        min_value=5,
        max_value=50,
        value=20,
        step=5
    )
    min_papers_per_author = st.sidebar.slider(
        "Minimum papers per author",
        min_value=1,
        max_value=5,
        value=2,
        step=1,
        help="Minimum number of relevant papers an author must have to be included"
    )
    # Main search input
    query = st.text_input(
        "Enter your search query:",
        placeholder="e.g., 'graph neural networks for protein structure prediction'",
        help="Enter keywords or a description of what you're looking for"
    )
    search_button = st.button("🔍 Search", type="primary")
    if search_button and query:
        # Load model
        with st.spinner("Loading semantic model..."):
            model = load_model()
        # Search papers
        with st.spinner(f"Searching OpenAlex for papers about '{query}'..."):
            papers = search_openalex_papers(query, num_papers)
        if not papers:
            st.warning("No papers found. Try different search terms.")
            return
        st.success(f"Found {len(papers)} papers!")
        # Prepare papers for semantic search
        with st.spinner("Analyzing papers with semantic search..."):
            paper_texts = []
            valid_papers = []
            for paper in papers:
                title = paper.get('display_name', '') or paper.get('title', '')
                abstract = reconstruct_abstract(paper.get('abstract_inverted_index', {}))
                # Combine title and abstract; the title appears twice so it
                # carries extra weight in the embedding
                text = f"{title} {title} {abstract}"
                if text.strip():
                    paper_texts.append(text)
                    valid_papers.append(paper)
            if not paper_texts:
                st.error("No valid paper content found.")
                return
            # Generate embeddings
            query_embedding = model.encode(query, convert_to_tensor=False)
            paper_embeddings = model.encode(paper_texts, convert_to_tensor=False, show_progress_bar=True)
            # Calculate similarities
            similarities = calculate_semantic_similarity(query_embedding, paper_embeddings)
        # Sort papers by similarity, most relevant first
        sorted_indices = np.argsort(similarities)[::-1]
        sorted_papers = [valid_papers[i] for i in sorted_indices]
        sorted_scores = [similarities[i] for i in sorted_indices]
        # Display top papers
        st.header(f"📄 Top {top_papers_display} Most Relevant Papers")
        for idx, (paper, score) in enumerate(zip(sorted_papers[:top_papers_display],
                                                 sorted_scores[:top_papers_display])):
            with st.expander(f"**{idx + 1}. {paper.get('display_name', 'Untitled')}** (Relevance: {score:.3f})"):
                col1, col2 = st.columns([3, 1])
                with col1:
                    abstract = reconstruct_abstract(paper.get('abstract_inverted_index', {}))
                    if abstract:
                        st.markdown(f"**Abstract:** {abstract[:500]}{'...' if len(abstract) > 500 else ''}")
                    else:
                        st.markdown("*No abstract available*")
                    # Authors
                    authors = [a.get('author', {}).get('display_name', 'Unknown')
                               for a in paper.get('authorships', [])]
                    if authors:
                        st.markdown(f"**Authors:** {', '.join(authors[:5])}{'...' if len(authors) > 5 else ''}")
                with col2:
                    st.metric("Year", paper.get('publication_year', 'N/A'))
                    st.metric("Citations", paper.get('cited_by_count', 0))
                    paper_id = paper.get('id', '').split('/')[-1]
                    if paper_id:
                        st.markdown(f"[View on OpenAlex](https://openalex.org/{paper_id})")
        # Rank authors
        st.header(f"👨‍🔬 Top {top_authors_display} Researchers")
        ranked_authors = rank_authors(
            sorted_papers,
            sorted_scores,
            min_papers=min_papers_per_author
        )
        if not ranked_authors:
            st.warning(f"No authors found with at least {min_papers_per_author} relevant papers.")
            return
        # Display authors in a table-like layout
        st.markdown(f"Found {len(ranked_authors)} researchers with at least {min_papers_per_author} relevant papers.")
        for idx, author in enumerate(ranked_authors[:top_authors_display], 1):
            with st.container():
                col1, col2, col3, col4 = st.columns([3, 1, 1, 1])
                with col1:
                    st.markdown(f"**{idx}. [{author['name']}]({author['openalex_url']})**")
                    if author['institution']:
                        st.caption(author['institution'])
                with col2:
                    st.metric("H-Index", author['h_index'])
                with col3:
                    st.metric("Citations", f"{author['total_citations']:,}")
                with col4:
                    st.metric("Relevance", f"{author['avg_relevance_score']:.3f}")
                st.caption(f"Total works: {author['works_count']} | Relevant papers: {author['num_relevant_papers']}")
                st.divider()
        # Download results
        st.header("📥 Download Results")
        # Prepare CSV data for authors
        csv_buffer = io.StringIO()
        csv_writer = csv.writer(csv_buffer)
        # Write header
        csv_writer.writerow([
            'Rank', 'Name', 'Institution', 'H-Index', 'Total Citations',
            'Total Works', 'Relevant Papers', 'Avg Relevance Score', 'Composite Score', 'OpenAlex URL'
        ])
        # Write one row per ranked author
        for idx, author in enumerate(ranked_authors, 1):
            csv_writer.writerow([
                idx,
                author['name'],
                author['institution'],
                author['h_index'],
                author['total_citations'],
                author['works_count'],
                author['num_relevant_papers'],
                f"{author['avg_relevance_score']:.4f}",
                f"{author['composite_score']:.4f}",
                author['openalex_url']
            ])
        csv_data = csv_buffer.getvalue()
        st.download_button(
            label="Download Author Rankings (CSV)",
            data=csv_data,
            file_name=f"openalex_authors_{query.replace(' ', '_')[:30]}.csv",
            mime="text/csv"
        )

if __name__ == "__main__":
    main()
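
# To run locally (assuming this file is saved as app.py, a name chosen here
# for illustration):
#   pip install streamlit requests sentence-transformers numpy
#   streamlit run app.py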