INLEXIO committed · verified
Commit 2e73ba2 · 1 Parent(s): de16096

Update src/streamlit_app.py

Files changed (1):
  1. src/streamlit_app.py +203 -20
src/streamlit_app.py CHANGED
@@ -52,11 +52,12 @@ def load_model():
 
 # LIMITED CACHE: Only store 50 recent searches
 @st.cache_data(ttl=3600, max_entries=50, show_spinner=False)
-def search_openalex_papers(query, num_results=50, country_code=None, use_fulltext=False):
+def search_openalex_papers(query, num_results=50, country_code=None, use_fulltext=False, year_min=None, year_max=None):
     """
     Search OpenAlex for papers related to the query
     Optionally filter by author's country
     Optionally use full-text search (searches title + abstract + full text when available)
+    Optionally filter by publication year range
 
     Note: Results are cached for 1 hour, max 50 searches stored
     For large requests (>100), uses pagination
@@ -90,6 +91,12 @@ def search_openalex_papers(query, num_results=50, country_code=None, use_fulltext=False):
     if country_code:
         filters.append(f"authorships.countries:{country_code}")
 
+    # Add year range filter if specified
+    if year_min is not None:
+        filters.append(f"publication_year:>{year_min-1}")  # Greater than or equal
+    if year_max is not None:
+        filters.append(f"publication_year:<{year_max+1}")  # Less than or equal
+
     # Combine filters with comma (AND operation)
     if filters:
         params["filter"] = ",".join(filters)
@@ -152,6 +159,66 @@ def get_author_details(author_id):
     except Exception as e:
         return None
 
+# LIMITED CACHE: Only store 200 recent author works lookups
+@st.cache_data(ttl=3600, max_entries=200)
+def get_author_works(author_id, max_works=20):
+    """
+    Fetch author's recent works for validation
+    Returns up to max_works most recent papers by this author
+    """
+    base_url = "https://api.openalex.org/works"
+
+    params = {
+        "filter": f"author.id:A{author_id}",
+        "per_page": max_works,
+        "sort": "cited_by_count:desc",  # Get most cited papers
+        "select": "id,title,abstract_inverted_index,publication_year",
+        "mailto": "[email protected]"
+    }
+
+    try:
+        response = requests.get(base_url, params=params, timeout=10)
+        response.raise_for_status()
+        data = response.json()
+        return data.get("results", [])
+    except Exception as e:
+        return []
+
+def validate_author_relevance(author_id, query_embedding, model, threshold=0.25, max_works=20):
+    """
+    Validate if an author is actually relevant to the search query
+    by checking semantic similarity of their body of work
+
+    Returns: (is_valid, avg_similarity, num_works_checked)
+    """
+    # Fetch author's works
+    works = get_author_works(author_id, max_works)
+
+    if not works:
+        return False, 0.0, 0
+
+    # Generate embeddings for author's works
+    work_texts = []
+    for work in works:
+        title = work.get('title', '') or work.get('display_name', '')
+        abstract = reconstruct_abstract(work.get('abstract_inverted_index', {}))
+        text = f"{title} {title} {abstract}"
+        if text.strip():
+            work_texts.append(text)
+
+    if not work_texts:
+        return False, 0.0, 0
+
+    # Calculate similarity to query
+    work_embeddings = model.encode(work_texts, convert_to_tensor=False, show_progress_bar=False)
+    similarities = calculate_semantic_similarity(query_embedding, work_embeddings)
+    avg_similarity = np.mean(similarities)
+
+    # Author is valid if their average work similarity exceeds threshold
+    is_valid = avg_similarity >= threshold
+
+    return is_valid, avg_similarity, len(work_texts)
+
 def calculate_semantic_similarity(query_embedding, paper_embeddings):
     """
     Calculate cosine similarity between query and papers
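Two reading notes on the new validation helpers. First, `text = f"{title} {title} {abstract}"` repeats the title, which reads like a deliberate trick to up-weight the title relative to the abstract in the embedding. Second, the code calls `reconstruct_abstract`, which is defined elsewhere in this file and not shown in the diff; a typical implementation over OpenAlex's `abstract_inverted_index` looks roughly like the sketch below (an illustration, not the file's actual code):

```python
def reconstruct_abstract(inverted_index):
    """Rebuild abstract text from OpenAlex's inverted index.

    OpenAlex returns abstracts as {word: [positions]}; flattening the
    positions and sorting them recovers the original word order.
    """
    if not inverted_index:
        return ""
    positional = [
        (pos, word)
        for word, positions in inverted_index.items()
        for pos in positions
    ]
    return " ".join(word for _, word in sorted(positional))
```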
@@ -164,12 +231,14 @@ def calculate_semantic_similarity(query_embedding, paper_embeddings):
     similarities = np.dot(paper_norms, query_norm)
     return similarities
 
-def rank_authors(papers, paper_scores, model, query_embedding, min_papers=2):
+def rank_authors(papers, paper_scores, model, query_embedding, min_papers=2, validate_authors=True, validation_threshold=0.25):
     """
     Extract authors from papers and rank them based on:
     - Semantic relevance (average of their paper scores)
     - H-index
     - Total citations
+
+    If validate_authors=True, checks each author's body of work for relevance
     """
     author_data = defaultdict(lambda: {
         'name': '',
@@ -179,7 +248,9 @@ def rank_authors(papers, paper_scores, model, query_embedding, min_papers=2):
         'total_citations': 0,
         'works_count': 0,
         'h_index': 0,
-        'institution': ''
+        'institution': '',
+        'validation_score': 0.0,
+        'validated': False
     })
 
     # Collect author information from papers
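Why the two new template fields matter: the `defaultdict` factory builds this dict the first time an author id is touched, so later code can read `validated` and `validation_score` without key checks even when validation is skipped. A compact demonstration of the mechanism:

```python
from collections import defaultdict

# Abridged version of the template above; the lambda returns a fresh
# dict for each previously unseen key.
author_data = defaultdict(lambda: {
    'institution': '',
    'validation_score': 0.0,
    'validated': False
})
author_data['A123']['validated'] = True      # first access creates the template
print(author_data['A999']['validation_score'])  # -> 0.0, no KeyError
```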
@@ -220,6 +291,35 @@ def rank_authors(papers, paper_scores, model, query_embedding, min_papers=2):
 
     progress_bar.empty()
 
+    # Validate authors if requested
+    if validate_authors:
+        with st.spinner(f"Validating author relevance (checking their body of work)..."):
+            progress_bar = st.progress(0)
+            validated_count = 0
+
+            for idx, (author_id, data) in enumerate(filtered_authors.items()):
+                is_valid, val_score, num_works = validate_author_relevance(
+                    author_id, query_embedding, model, validation_threshold
+                )
+                data['validated'] = is_valid
+                data['validation_score'] = val_score
+                data['num_works_checked'] = num_works
+
+                if is_valid:
+                    validated_count += 1
+
+                progress_bar.progress((idx + 1) / len(filtered_authors))
+                time.sleep(0.1)  # Rate limiting
+
+            progress_bar.empty()
+            st.success(f"✅ {validated_count}/{len(filtered_authors)} authors validated as relevant to your query")
+
+        # Filter to only validated authors
+        filtered_authors = {
+            aid: data for aid, data in filtered_authors.items()
+            if data['validated']
+        }
+
     # Calculate composite score for ranking
     ranked_authors = []
     for author_id, data in filtered_authors.items():
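Note that this loop depends on a module-level `import time` that isn't visible in the diff, and that `get_author_works` is wrapped in `@st.cache_data(max_entries=200)`, so re-running the same search skips the HTTP calls entirely. The first run pays one works request per surviving author plus the 0.1 s politeness sleep; a back-of-envelope cost model with assumed timings:

```python
# Rough cost of the validation pass (all numbers hypothetical).
num_authors = 50
request_time = 0.4   # assumed seconds per OpenAlex round trip
sleep_time = 0.1     # matches time.sleep(0.1) in the loop above
print(f"~{num_authors * (request_time + sleep_time):.0f} s")  # -> ~25 s
```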
@@ -230,11 +330,21 @@ def rank_authors(papers, paper_scores, model, query_embedding, min_papers=2):
         normalized_citations = np.log1p(data['total_citations']) / 15.0  # Log scale
 
         # Weighted composite score
-        composite_score = (
-            0.5 * avg_relevance +                  # 50% relevance
-            0.3 * min(normalized_h_index, 1.0) +   # 30% h-index
-            0.2 * min(normalized_citations, 1.0)   # 20% citations
-        )
+        if validate_authors:
+            # Include validation score in composite
+            composite_score = (
+                0.4 * avg_relevance +                  # 40% relevance in initial papers
+                0.3 * data['validation_score'] +       # 30% validation (their body of work)
+                0.2 * min(normalized_h_index, 1.0) +   # 20% h-index
+                0.1 * min(normalized_citations, 1.0)   # 10% citations
+            )
+        else:
+            # Original scoring without validation
+            composite_score = (
+                0.5 * avg_relevance +                  # 50% relevance
+                0.3 * min(normalized_h_index, 1.0) +   # 30% h-index
+                0.2 * min(normalized_citations, 1.0)   # 20% citations
+            )
 
         ranked_authors.append({
             'name': data['name'],
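The reweighting is worth making concrete: the validated branch funds its 30% validation weight by trimming relevance, h-index, and citations by ten points each (50/30/20 becomes 40/30/20/10). A worked example with hypothetical inputs:

```python
# Hypothetical author: decent paper relevance, moderate body-of-work fit.
avg_relevance, validation_score = 0.62, 0.41
normalized_h_index, normalized_citations = 0.45, 0.80

composite_score = (
    0.4 * avg_relevance +                 # 0.248
    0.3 * validation_score +              # 0.123
    0.2 * min(normalized_h_index, 1.0) +  # 0.090
    0.1 * min(normalized_citations, 1.0)  # 0.080
)
print(round(composite_score, 3))  # -> 0.541
```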
@@ -244,6 +354,8 @@ def rank_authors(papers, paper_scores, model, query_embedding, min_papers=2):
             'works_count': data['works_count'],
             'num_relevant_papers': len(data['paper_scores']),
             'avg_relevance_score': avg_relevance,
+            'validation_score': data['validation_score'],
+            'validated': data['validated'],
             'composite_score': composite_score,
             'institution': data['institution'],
             'openalex_url': f"https://openalex.org/A{author_id}"
@@ -315,6 +427,37 @@ def main():
     )
     country_code = COUNTRIES[selected_country]
 
+    # Year range filter
+    st.sidebar.subheader("📅 Year Range")
+    current_year = 2025
+    use_year_filter = st.sidebar.checkbox(
+        "Limit by publication year",
+        value=False,
+        help="Filter papers by publication year range"
+    )
+
+    if use_year_filter:
+        year_col1, year_col2 = st.sidebar.columns(2)
+        with year_col1:
+            year_min = st.number_input(
+                "From",
+                min_value=1900,
+                max_value=current_year,
+                value=2015,
+                step=1
+            )
+        with year_col2:
+            year_max = st.number_input(
+                "To",
+                min_value=1900,
+                max_value=current_year,
+                value=current_year,
+                step=1
+            )
+    else:
+        year_min = None
+        year_max = None
+
     # Full-text search option
     use_fulltext = st.sidebar.checkbox(
         "Include full text (when available)",
@@ -322,6 +465,26 @@ def main():
         help="Search within full paper text (not just title/abstract). ~10-15% of papers have full text available. Slightly slower."
     )
 
+    # Author validation
+    st.sidebar.subheader("👀 Author Validation")
+    validate_authors = st.sidebar.checkbox(
+        "Validate authors' body of work",
+        value=True,
+        help="Check each author's recent papers to confirm they're actually working in this area. More accurate but slower."
+    )
+
+    if validate_authors:
+        validation_threshold = st.sidebar.slider(
+            "Validation threshold",
+            min_value=0.15,
+            max_value=0.50,
+            value=0.25,
+            step=0.05,
+            help="Minimum average similarity score for author's works. Higher = stricter filter."
+        )
+    else:
+        validation_threshold = 0.25
+
     # Minimum papers per author
     min_papers_per_author = st.sidebar.slider(
         "Minimum papers per author",
@@ -353,21 +516,25 @@ def main():
 
     if search_button and query:
         # Display search parameters
-        st.info(f"🔍 Searching: **{query}** | Mode: **{search_mode}** | Papers: **{num_papers}** | Country: **{selected_country}** | Full-text: **{'Yes' if use_fulltext else 'No'}** | Min papers/author: **{min_papers_per_author}**")
+        year_range_text = f"Years: **{year_min}-{year_max}**" if use_year_filter else "Years: **All**"
+        validation_text = f"Validation: **On (threshold {validation_threshold})**" if validate_authors else "Validation: **Off**"
+
+        st.info(f"🔍 Searching: **{query}** | Mode: **{search_mode}** | Papers: **{num_papers}** | {year_range_text} | Country: **{selected_country}** | Full-text: **{'Yes' if use_fulltext else 'No'}** | {validation_text} | Min papers/author: **{min_papers_per_author}**")
 
         # Load model
         with st.spinner("Loading semantic model..."):
             model = load_model()
 
         # Search papers
-        search_key = f"{query}_{num_papers}_{country_code}_{use_fulltext}"
+        search_key = f"{query}_{num_papers}_{country_code}_{use_fulltext}_{year_min}_{year_max}"
 
         if search_mode == "Deep Search":
             progress_text = f"🔍 Deep search in progress: Fetching up to {num_papers} papers from OpenAlex..."
             progress_bar = st.progress(0, text=progress_text)
 
-        with st.spinner(f"Searching OpenAlex for papers about '{query}'{' from ' + selected_country if country_code else ''}{' (including full text)' if use_fulltext else ''}..."):
-            papers = search_openalex_papers(query, num_papers, country_code, use_fulltext)
+        year_filter_text = f" from {year_min}-{year_max}" if use_year_filter else ""
+        with st.spinner(f"Searching OpenAlex for papers about '{query}'{year_filter_text}{' from ' + selected_country if country_code else ''}{' (including full text)' if use_fulltext else ''}..."):
+            papers = search_openalex_papers(query, num_papers, country_code, use_fulltext, year_min, year_max)
 
         if search_mode == "Deep Search":
             progress_bar.progress(33, text="📄 Papers fetched! Now generating embeddings...")
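Widening `search_key` with the year bounds matters wherever that key is used downstream (it presumably feeds session-level bookkeeping): two searches differing only in year range no longer collide. Note that `@st.cache_data` on `search_openalex_papers` already hashes all of the function's arguments, so the API cache distinguishes year-filtered calls automatically. A sketch of the key with hypothetical inputs:

```python
# Hypothetical inputs; shows the widened cache key.
query, num_papers, country_code = "perovskite solar cells", 200, "US"
use_fulltext, year_min, year_max = False, 2015, 2025
search_key = f"{query}_{num_papers}_{country_code}_{use_fulltext}_{year_min}_{year_max}"
print(search_key)  # -> perovskite solar cells_200_US_False_2015_2025
```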
@@ -383,9 +550,11 @@ def main():
             st.write(f"**Search Mode:** {search_mode}")
             st.write(f"**Query:** {query}")
             st.write(f"**Full-text search:** {'Enabled' if use_fulltext else 'Disabled'}")
+            st.write(f"**Year range:** {year_min}-{year_max}" if use_year_filter else "**Year range:** All years")
             st.write(f"**Papers requested:** {num_papers}")
             st.write(f"**Papers fetched:** {len(papers)}")
             st.write(f"**Country filter:** {selected_country} ({country_code or 'None'})")
+            st.write(f"**Author validation:** {'Enabled (threshold: ' + str(validation_threshold) + ')' if validate_authors else 'Disabled'}")
             st.write(f"**First paper:** {papers[0].get('display_name', 'N/A')[:100]}...")
             st.write(f"**Last paper:** {papers[-1].get('display_name', 'N/A')[:100]}...")
 
@@ -469,7 +638,9 @@ def main():
                 sorted_scores,
                 model,
                 query_embedding,
-                min_papers=min_papers_per_author
+                min_papers=min_papers_per_author,
+                validate_authors=validate_authors,
+                validation_threshold=validation_threshold
             )
 
             if not ranked_authors:
@@ -495,9 +666,15 @@ def main():
                     st.metric("Citations", f"{author['total_citations']:,}")
 
                 with col4:
-                    st.metric("Relevance", f"{author['avg_relevance_score']:.3f}")
+                    if validate_authors:
+                        st.metric("Body Relevance", f"{author['validation_score']:.3f}")
+                    else:
+                        st.metric("Relevance", f"{author['avg_relevance_score']:.3f}")
 
-                st.caption(f"Total works: {author['works_count']} | Relevant papers: {author['num_relevant_papers']}")
+                caption_text = f"Total works: {author['works_count']} | Relevant papers: {author['num_relevant_papers']}"
+                if validate_authors:
+                    caption_text += f" | Paper relevance: {author['avg_relevance_score']:.3f}"
+                st.caption(caption_text)
                 st.divider()
 
         # Download results
@@ -511,14 +688,17 @@ def main():
         csv_writer = csv.writer(csv_buffer)
 
         # Write header
-        csv_writer.writerow([
+        header = [
             'Rank', 'Name', 'Institution', 'H-Index', 'Total Citations',
             'Total Works', 'Relevant Papers', 'Avg Relevance Score', 'Composite Score', 'OpenAlex URL'
-        ])
+        ]
+        if validate_authors:
+            header.insert(-1, 'Body of Work Validation Score')
+        csv_writer.writerow(header)
 
         # Write data
         for idx, author in enumerate(ranked_authors, 1):
-            csv_writer.writerow([
+            row = [
                 idx,
                 author['name'],
                 author['institution'],
@@ -528,8 +708,11 @@ def main():
                 author['num_relevant_papers'],
                 f"{author['avg_relevance_score']:.4f}",
                 f"{author['composite_score']:.4f}",
-                author['openalex_url']
-            ])
+            ]
+            if validate_authors:
+                row.append(f"{author['validation_score']:.4f}")
+            row.append(author['openalex_url'])
+            csv_writer.writerow(row)
 
         csv_data = csv_buffer.getvalue()
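The CSV column ordering in the last two hunks relies on a small trick: `list.insert(-1, ...)` slots the new column second-to-last, just before the URL, and the row writer mirrors this by appending the validation score before `openalex_url`, so header and rows stay aligned. A short demonstration with an abridged header:

```python
# insert(-1, ...) places the new item just before the final element.
header = ['Rank', 'Name', 'Composite Score', 'OpenAlex URL']  # abridged
header.insert(-1, 'Body of Work Validation Score')
print(header)
# -> ['Rank', 'Name', 'Composite Score',
#     'Body of Work Validation Score', 'OpenAlex URL']
```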