# Spaces: INLEXIO / semantic-search (Sleeping)
#
# File size: 14,330 Bytes

import streamlit as st
import requests
from sentence_transformers import SentenceTransformer
import numpy as np
from collections import defaultdict
import time

# Streamlit page setup: page title, page icon and wide layout
st.set_page_config(page_title="OpenAlex Semantic Search", page_icon="🔬", layout="wide")

# Model loading goes through st.cache_resource
@st.cache_resource
def load_model():
    """Build and return the 'all-MiniLM-L6-v2' SentenceTransformer."""
    model_name = 'all-MiniLM-L6-v2'
    return SentenceTransformer(model_name)

@st.cache_data(ttl=3600)
def _fetch_openalex_papers(query, num_results):
    """
    Fetch works matching ``query`` from OpenAlex (successful responses are cached for 1 hour).

    Errors are deliberately NOT handled here: an exception propagates out of
    the cached function, so a transient failure is never stored in the cache.

    Raises:
        requests.RequestException: on network failure or a non-2xx response.
        ValueError: if the response body is not valid JSON.
    """
    base_url = "https://api.openalex.org/works"

    params = {
        "search": query,
        # NOTE(review): OpenAlex documents this parameter as `per-page`;
        # confirm the underscore spelling is honoured by the API.
        "per_page": num_results,
        "select": "id,title,abstract_inverted_index,authorships,publication_year,cited_by_count,display_name",
        # NOTE(review): this value looks like a scrubbed/obfuscated address;
        # replace it with a real contact e-mail for the polite pool.
        "mailto": "[email protected]"  # Polite pool
    }

    response = requests.get(base_url, params=params, timeout=30)
    response.raise_for_status()
    data = response.json()
    # `or []` also covers an explicit JSON null under "results"
    return data.get("results") or []


def search_openalex_papers(query, num_results=50):
    """
    Search OpenAlex for papers related to the query

    Args:
        query: Free-text search string passed to the OpenAlex ``search`` parameter.
        num_results: Number of works requested from the API.

    Returns:
        A list of work dictionaries as returned by OpenAlex, or an empty list
        if the request failed (an error message is shown in the UI).
    """
    # The error handling lives outside the cached helper so that a failed
    # request is reported once and retried on the next search, instead of an
    # empty result being cached for the whole TTL.
    try:
        return _fetch_openalex_papers(query, num_results)
    except Exception as e:  # UI boundary: report and degrade to "no results"
        st.error(f"Error fetching papers: {str(e)}")
        return []

def reconstruct_abstract(inverted_index):
    """
    Turn an OpenAlex inverted index (word -> list of positions) back into text.

    Returns an empty string when the index is missing or empty; otherwise the
    words joined by single spaces in ascending position order.
    """
    if not inverted_index:
        return ""

    # Expand every (word, positions) entry into individual (position, word) pairs
    placed_words = (
        (position, token)
        for token, position_list in inverted_index.items()
        for position in position_list
    )

    # Order by position only (stable for equal positions), then emit the words
    ordered = sorted(placed_words, key=lambda pair: pair[0])
    return " ".join(token for _, token in ordered)

@st.cache_data(ttl=3600)
def _fetch_author_details(author_id):
    """
    Fetch one author record from OpenAlex (successful responses are cached for 1 hour).

    Errors propagate to the caller so that a failed lookup is not stored in
    the cache.

    Raises:
        requests.RequestException: on network failure or a non-2xx response.
        ValueError: if the response body is not valid JSON.
    """
    base_url = f"https://api.openalex.org/authors/{author_id}"

    params = {
        # NOTE(review): this value looks like a scrubbed/obfuscated address;
        # replace it with a real contact e-mail for the polite pool.
        "mailto": "[email protected]"
    }

    response = requests.get(base_url, params=params, timeout=10)
    response.raise_for_status()
    return response.json()


def get_author_details(author_id):
    """
    Fetch detailed author information from OpenAlex

    Args:
        author_id: Short OpenAlex author identifier used as the last URL segment.

    Returns:
        The decoded JSON author record, or ``None`` if the lookup failed.
    """
    # Best-effort lookup: failures are swallowed here (outside the cached
    # helper) so that a temporary error yields None for this run only rather
    # than being cached for the whole TTL.
    try:
        return _fetch_author_details(author_id)
    except Exception:
        return None

def calculate_semantic_similarity(query_embedding, paper_embeddings):
    """
    Calculate cosine similarity between query and papers

    Args:
        query_embedding: 1-D vector for the query.
        paper_embeddings: 2-D array with one embedding per row.

    Returns:
        1-D array with one cosine similarity per row of ``paper_embeddings``.
        Rows with zero norm (and every row, if the query itself has zero
        norm) get a similarity of 0 instead of NaN.
    """
    query = np.asarray(query_embedding)
    papers = np.asarray(paper_embeddings)

    query_length = np.linalg.norm(query)
    paper_lengths = np.linalg.norm(papers, axis=1, keepdims=True)

    # Dividing by a zero norm would yield NaN, which then poisons the
    # argsort-based ranking downstream; dividing by 1 keeps the zero vector
    # unchanged, so its dot product (and similarity) is simply 0.
    safe_query_length = query_length if query_length > 0 else 1.0
    safe_paper_lengths = np.where(paper_lengths > 0, paper_lengths, 1.0)

    # Normalize embeddings
    query_norm = query / safe_query_length
    paper_norms = papers / safe_paper_lengths

    # Calculate cosine similarity
    return np.dot(paper_norms, query_norm)

def rank_authors(papers, paper_scores, model, query_embedding, min_papers=2):
    """
    Extract authors from papers and rank them based on:
    - Semantic relevance (average of their paper scores)
    - H-index
    - Total citations

    Args:
        papers: Work dictionaries; each may carry an ``authorships`` list.
        paper_scores: Relevance score for each paper, aligned with ``papers``.
        model: Unused; kept so existing callers keep working.
        query_embedding: Unused; kept so existing callers keep working.
        min_papers: Minimum number of papers in ``papers`` an author must
            appear on to be ranked.

    Returns:
        A list of author dictionaries sorted by ``composite_score``
        (highest first). Empty if no author meets ``min_papers``.
    """
    author_data = defaultdict(lambda: {
        'name': '',
        'id': '',
        'paper_scores': [],
        'paper_ids': [],
        'total_citations': 0,
        'works_count': 0,
        'h_index': 0,
        'institution': ''
    })

    # Collect author information from papers.
    # `x.get(key) or default` is used throughout because the API payload can
    # contain explicit nulls, which `x.get(key, default)` would pass through.
    for paper, score in zip(papers, paper_scores):
        for authorship in paper.get('authorships') or []:
            author = authorship.get('author') or {}
            raw_id = author.get('id')
            author_id = raw_id.split('/')[-1] if raw_id else None

            # Only keep identifiers whose short form starts with 'A'
            if not (author_id and author_id.startswith('A')):
                continue

            record = author_data[author_id]
            record['name'] = author.get('display_name') or 'Unknown'
            record['id'] = author_id
            record['paper_scores'].append(score)
            record['paper_ids'].append(paper.get('id', ''))

            # Get institution (first one seen for this author wins)
            institutions = authorship.get('institutions') or []
            if institutions and not record['institution']:
                record['institution'] = (institutions[0] or {}).get('display_name') or ''

    # Filter authors with minimum paper count
    filtered_authors = {
        aid: data for aid, data in author_data.items()
        if len(data['paper_scores']) >= min_papers
    }

    # Nothing to fetch or rank: skip the spinner/progress bar entirely
    if not filtered_authors:
        return []

    # Fetch detailed metrics for each author
    total_authors = len(filtered_authors)
    with st.spinner(f"Fetching metrics for {total_authors} authors..."):
        progress_bar = st.progress(0)
        for done, (author_id, data) in enumerate(filtered_authors.items(), start=1):
            author_details = get_author_details(author_id)
            if author_details:
                summary_stats = author_details.get('summary_stats') or {}
                data['h_index'] = summary_stats.get('h_index') or 0
                data['total_citations'] = author_details.get('cited_by_count') or 0
                data['works_count'] = author_details.get('works_count') or 0

            progress_bar.progress(done / total_authors)
            time.sleep(0.1)  # Rate limiting

        progress_bar.empty()

    # Calculate composite score for ranking
    ranked_authors = []
    for author_id, data in filtered_authors.items():
        avg_relevance = np.mean(data['paper_scores'])

        # Normalize metrics (using log scale for citations)
        normalized_h_index = data['h_index'] / 100.0  # Assume max h-index of 100
        normalized_citations = np.log1p(data['total_citations']) / 15.0  # Log scale

        # Composite score: weighted combination
        composite_score = (
            0.5 * avg_relevance +  # 50% semantic relevance
            0.3 * normalized_h_index +  # 30% h-index
            0.2 * normalized_citations  # 20% citations
        )

        ranked_authors.append({
            'author_id': author_id,
            'name': data['name'],
            'institution': data['institution'],
            'h_index': data['h_index'],
            'total_citations': data['total_citations'],
            'works_count': data['works_count'],
            'num_relevant_papers': len(data['paper_scores']),
            'avg_relevance_score': avg_relevance,
            'composite_score': composite_score,
            'openalex_url': f"https://openalex.org/{author_id}"
        })

    # Sort by composite score
    ranked_authors.sort(key=lambda x: x['composite_score'], reverse=True)

    return ranked_authors

def main():
    """
    Render the Streamlit UI and run one search when the button is pressed.

    Flow: read the sidebar settings and the query, fetch papers from OpenAlex,
    re-rank them by embedding similarity to the query, show the top papers,
    rank and show their authors, and offer the author ranking as a CSV
    download.
    """
    import csv
    import io
    import re

    st.title("🔬 OpenAlex Semantic Search")
    st.markdown("""
    Search for academic papers and discover top researchers using semantic search powered by OpenAlex.
    
    **How it works:**
    1. Enter your search terms (e.g., "machine learning for drug discovery")
    2. The app finds relevant papers using semantic similarity
    3. Authors are ranked by relevance, h-index, and citation metrics
    """)

    # Sidebar controls
    st.sidebar.header("Search Settings")

    num_papers = st.sidebar.slider(
        "Number of papers to fetch",
        min_value=20,
        max_value=100,
        value=50,
        step=10
    )

    top_papers_display = st.sidebar.slider(
        "Top papers to display",
        min_value=5,
        max_value=30,
        value=10,
        step=5
    )

    top_authors_display = st.sidebar.slider(
        "Top authors to display",
        min_value=5,
        max_value=50,
        value=20,
        step=5
    )

    min_papers_per_author = st.sidebar.slider(
        "Minimum papers per author",
        min_value=1,
        max_value=5,
        value=2,
        step=1,
        help="Minimum number of relevant papers an author must have to be included"
    )

    # Main search input
    query = st.text_input(
        "Enter your search query:",
        placeholder="e.g., 'graph neural networks for protein structure prediction'",
        help="Enter keywords or a description of what you're looking for"
    )

    search_button = st.button("🔍 Search", type="primary")

    # NOTE(review): st.button is only True on the run triggered by the click,
    # so any later widget interaction (including the download button below)
    # reruns the script without results. Persisting results in
    # st.session_state would fix that — confirm desired UX before changing.
    if not search_button:
        return

    # Tell the user why nothing happens instead of silently ignoring the click
    query = (query or "").strip()
    if not query:
        st.warning("Please enter a search query.")
        return

    # Load model
    with st.spinner("Loading semantic model..."):
        model = load_model()

    # Search papers
    with st.spinner(f"Searching OpenAlex for papers about '{query}'..."):
        papers = search_openalex_papers(query, num_papers)

    if not papers:
        st.warning("No papers found. Try different search terms.")
        return

    st.success(f"Found {len(papers)} papers!")

    # Prepare papers for semantic search
    with st.spinner("Analyzing papers with semantic search..."):
        paper_texts = []
        valid_papers = []

        for paper in papers:
            # `or ''` guards against explicit nulls, which would otherwise be
            # formatted into the text as the literal word "None"
            title = paper.get('display_name') or paper.get('title') or ''
            abstract = reconstruct_abstract(paper.get('abstract_inverted_index', {}))

            # Combine title and abstract (title weighted more)
            text = f"{title} {title} {abstract}"  # Title appears twice for emphasis

            if text.strip():
                paper_texts.append(text)
                valid_papers.append(paper)

        if not paper_texts:
            st.error("No valid paper content found.")
            return

        # Generate embeddings
        query_embedding = model.encode(query, convert_to_tensor=False)
        paper_embeddings = model.encode(paper_texts, convert_to_tensor=False, show_progress_bar=True)

        # Calculate similarities
        similarities = calculate_semantic_similarity(query_embedding, paper_embeddings)

        # Sort papers by similarity (highest first)
        sorted_indices = np.argsort(similarities)[::-1]
        sorted_papers = [valid_papers[i] for i in sorted_indices]
        sorted_scores = [similarities[i] for i in sorted_indices]

    # Display top papers
    st.header(f"📄 Top {top_papers_display} Most Relevant Papers")

    top_papers = zip(sorted_papers[:top_papers_display], sorted_scores[:top_papers_display])
    for idx, (paper, score) in enumerate(top_papers, 1):
        paper_title = paper.get('display_name') or paper.get('title') or 'Untitled'
        with st.expander(f"**{idx}. {paper_title}** (Relevance: {score:.3f})"):
            col1, col2 = st.columns([3, 1])

            with col1:
                abstract = reconstruct_abstract(paper.get('abstract_inverted_index', {}))
                if abstract:
                    st.markdown(f"**Abstract:** {abstract[:500]}{'...' if len(abstract) > 500 else ''}")
                else:
                    st.markdown("*No abstract available*")

                # Authors (a null name would make str.join raise TypeError)
                authors = [
                    (a.get('author') or {}).get('display_name') or 'Unknown'
                    for a in paper.get('authorships') or []
                ]
                if authors:
                    st.markdown(f"**Authors:** {', '.join(authors[:5])}{'...' if len(authors) > 5 else ''}")

            with col2:
                st.metric("Year", paper.get('publication_year') or 'N/A')
                st.metric("Citations", paper.get('cited_by_count') or 0)

                paper_id = (paper.get('id') or '').split('/')[-1]
                if paper_id:
                    st.markdown(f"[View on OpenAlex](https://openalex.org/{paper_id})")

    # Rank authors
    st.header(f"👨‍🔬 Top {top_authors_display} Researchers")

    ranked_authors = rank_authors(
        sorted_papers,
        sorted_scores,
        model,
        query_embedding,
        min_papers=min_papers_per_author
    )

    if not ranked_authors:
        st.warning(f"No authors found with at least {min_papers_per_author} relevant papers.")
        return

    # Display authors in a table
    st.markdown(f"Found {len(ranked_authors)} researchers with at least {min_papers_per_author} relevant papers.")

    for idx, author in enumerate(ranked_authors[:top_authors_display], 1):
        with st.container():
            col1, col2, col3, col4 = st.columns([3, 1, 1, 1])

            with col1:
                st.markdown(f"**{idx}. [{author['name']}]({author['openalex_url']})**")
                if author['institution']:
                    st.caption(author['institution'])

            with col2:
                st.metric("H-Index", author['h_index'])

            with col3:
                st.metric("Citations", f"{author['total_citations']:,}")

            with col4:
                st.metric("Relevance", f"{author['avg_relevance_score']:.3f}")

            st.caption(f"Total works: {author['works_count']} | Relevant papers: {author['num_relevant_papers']}")
            st.divider()

    # Download results
    st.header("📥 Download Results")

    # Prepare CSV data for authors (all ranked authors, not just those displayed)
    csv_buffer = io.StringIO()
    csv_writer = csv.writer(csv_buffer)

    # Write header
    csv_writer.writerow([
        'Rank', 'Name', 'Institution', 'H-Index', 'Total Citations',
        'Total Works', 'Relevant Papers', 'Avg Relevance Score', 'Composite Score', 'OpenAlex URL'
    ])

    # Write data
    for idx, author in enumerate(ranked_authors, 1):
        csv_writer.writerow([
            idx,
            author['name'],
            author['institution'],
            author['h_index'],
            author['total_citations'],
            author['works_count'],
            author['num_relevant_papers'],
            f"{author['avg_relevance_score']:.4f}",
            f"{author['composite_score']:.4f}",
            author['openalex_url']
        ])

    csv_data = csv_buffer.getvalue()

    # The query is free text: collapse anything that is not a word character
    # or hyphen (spaces, slashes, quotes, ...) so the suggested file name is
    # valid on every platform.
    safe_query = re.sub(r'[^\w-]+', '_', query)[:30]

    st.download_button(
        label="Download Author Rankings (CSV)",
        data=csv_data,
        file_name=f"openalex_authors_{safe_query}.csv",
        mime="text/csv"
    )

# Script entry point: build the UI only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()