Update src/streamlit_app.py
src/streamlit_app.py (+94 −95)
@@ -5,12 +5,35 @@ import numpy as np
 from collections import defaultdict
 import time
 import os
+import shutil

-# Set cache directory to
+# Set cache directory to /tmp (gets cleared on restart)
 os.environ['HF_HOME'] = '/tmp/huggingface'
 os.environ['TRANSFORMERS_CACHE'] = '/tmp/huggingface'
 os.environ['SENTENCE_TRANSFORMERS_HOME'] = '/tmp/huggingface'

+# Clear old cache on startup to prevent accumulation
+def clear_old_cache():
+    """Clear /tmp cache if it gets too large"""
+    cache_dir = '/tmp/huggingface'
+    try:
+        if os.path.exists(cache_dir):
+            size_mb = sum(
+                os.path.getsize(os.path.join(dirpath, filename))
+                for dirpath, dirnames, filenames in os.walk(cache_dir)
+                for filename in filenames
+            ) / (1024 * 1024)
+
+            # If cache > 5GB, clear it
+            if size_mb > 5000:
+                shutil.rmtree(cache_dir)
+                os.makedirs(cache_dir)
+    except OSError:
+        pass
+
+# Run cleanup on startup
+clear_old_cache()
+
 # Page config
 st.set_page_config(
     page_title="OpenAlex Semantic Search",
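As a sanity check on the os.walk sizing pattern used in clear_old_cache() above, here is a minimal self-contained sketch (the temporary directory and file name are hypothetical, not part of the app):

import os
import tempfile

# Verify the os.walk-based directory sizing on a known 1 MiB file.
with tempfile.TemporaryDirectory() as d:
    with open(os.path.join(d, "blob.bin"), "wb") as f:
        f.write(b"\0" * (1024 * 1024))  # write exactly 1 MiB

    size_mb = sum(
        os.path.getsize(os.path.join(dirpath, name))
        for dirpath, _, names in os.walk(d)
        for name in names
    ) / (1024 * 1024)

    assert size_mb == 1.0  # every file is counted exactly once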
@@ -24,14 +47,15 @@ def load_model():
     """Load the sentence transformer model"""
     return SentenceTransformer('all-MiniLM-L6-v2', cache_folder='/tmp/huggingface')

-@st.cache_data(ttl=3600)
+# LIMITED CACHE: Only store 50 recent searches
+@st.cache_data(ttl=3600, max_entries=50, show_spinner=False)
 def search_openalex_papers(query, num_results=50, country_code=None, use_fulltext=False):
     """
     Search OpenAlex for papers related to the query
     Optionally filter by author's country
     Optionally use full-text search (searches title + abstract + full text when available)

-    Note: Results are cached for 1 hour
+    Note: Results are cached for 1 hour, max 50 searches stored
     For large requests (>100), uses pagination
     """
     base_url = "https://api.openalex.org/works"
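For reference: max_entries caps how many distinct argument combinations st.cache_data keeps, evicting the oldest entry once the limit is reached, while ttl expires entries after the given number of seconds. A minimal sketch with a hypothetical function:

import streamlit as st

@st.cache_data(ttl=3600, max_entries=50, show_spinner=False)
def cached_search(query: str) -> str:
    # Body runs only on a cache miss; with max_entries=50, the 51st
    # distinct query evicts the oldest stored result, and ttl=3600
    # drops any entry older than one hour.
    return f"results for {query!r}"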
@@ -105,10 +129,12 @@ def reconstruct_abstract(inverted_index):
     words_with_positions.sort(key=lambda x: x[0])
     return " ".join([word for _, word in words_with_positions])

-@st.cache_data(ttl=3600)
+# LIMITED CACHE: Only store 200 recent author lookups
+@st.cache_data(ttl=3600, max_entries=200)
 def get_author_details(author_id):
     """
     Fetch detailed author information from OpenAlex
+    Cache limited to 200 authors to prevent storage issues
     """
     base_url = f"https://api.openalex.org/authors/{author_id}"

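The endpoint behind get_author_details can be exercised standalone; a rough sketch (the helper and the example ID are hypothetical, and the field names follow the public OpenAlex author schema as I understand it):

import requests

def fetch_author(author_id: str) -> dict:
    # OpenAlex author records include 'display_name', 'works_count',
    # 'cited_by_count', and 'summary_stats' (which carries the h-index).
    resp = requests.get(f"https://api.openalex.org/authors/{author_id}", timeout=30)
    resp.raise_for_status()
    return resp.json()

# e.g. fetch_author("A5023888391")["summary_stats"]["h_index"]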
@@ -200,24 +226,24 @@ def rank_authors(papers, paper_scores, model, query_embedding, min_papers=2):
         normalized_h_index = data['h_index'] / 100.0  # Assume max h-index of 100
         normalized_citations = np.log1p(data['total_citations']) / 15.0  # Log scale

-        #
+        # Weighted composite score
         composite_score = (
-            0.5 * avg_relevance +  # 50%
-            0.3 * normalized_h_index +  # 30% h-index
-            0.2 * normalized_citations  # 20% citations
+            0.5 * avg_relevance +  # 50% relevance
+            0.3 * min(normalized_h_index, 1.0) +  # 30% h-index
+            0.2 * min(normalized_citations, 1.0)  # 20% citations
         )

         ranked_authors.append({
-            'author_id': author_id,
             'name': data['name'],
-            '
+            'id': author_id,
             'h_index': data['h_index'],
             'total_citations': data['total_citations'],
             'works_count': data['works_count'],
             'num_relevant_papers': len(data['paper_scores']),
             'avg_relevance_score': avg_relevance,
             'composite_score': composite_score,
-            '
+            'institution': data['institution'],
+            'openalex_url': f"https://openalex.org/A{author_id}"
         })

     # Sort by composite score
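A worked example of the new clamped score, with hypothetical inputs; the min(..., 1.0) clamp keeps an h-index above the assumed maximum of 100 from dominating:

import numpy as np

avg_relevance = 0.8
h_index, citations = 120, 20_000  # hypothetical author

normalized_h = min(h_index / 100.0, 1.0)             # 1.2 clamped to 1.0
normalized_c = min(np.log1p(citations) / 15.0, 1.0)  # ~9.90 / 15 ≈ 0.66

score = 0.5 * avg_relevance + 0.3 * normalized_h + 0.2 * normalized_c
print(round(score, 3))  # 0.832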
@@ -225,121 +251,94 @@ def rank_authors(papers, paper_scores, model, query_embedding, min_papers=2):

     return ranked_authors

+# Define country codes
+COUNTRIES = {
+    "All Countries": None,
+    "Australia": "AU",
+    "Canada": "CA",
+    "China": "CN",
+    "France": "FR",
+    "Germany": "DE",
+    "India": "IN",
+    "Japan": "JP",
+    "United Kingdom": "GB",
+    "United States": "US",
+}
+
 def main():
+    # Header
     st.title("🔬 OpenAlex Semantic Search")
     st.markdown("""
-    Search for
-
-    **How it works:**
-    1. Enter your search terms (e.g., "machine learning for drug discovery")
-    2. Optionally filter by author country
-    3. The app finds relevant papers using semantic similarity
-    4. Authors are ranked by relevance, h-index, and citation metrics
+    Search for research papers and discover top researchers using semantic similarity matching.
+    This tool searches the OpenAlex database and ranks results by relevance, not just citations.
     """)

-    # Sidebar
-    st.sidebar.header("Search
-
-    # Clear cache button
-    if st.sidebar.button("🔄 Clear Cache", help="Clear cached search results to force fresh data"):
-        st.cache_data.clear()
-        st.sidebar.success("Cache cleared!")
-        st.rerun()
-
-    st.sidebar.divider()
-
-    # Country filter
-    country_options = {
-        "Any Country": None,
-        "United States": "US",
-        "United Kingdom": "GB",
-        "Germany": "DE",
-        "France": "FR",
-        "Canada": "CA",
-        "Australia": "AU",
-        "China": "CN",
-        "Japan": "JP",
-        "India": "IN",
-        "South Korea": "KR",
-        "Netherlands": "NL",
-        "Switzerland": "CH",
-        "Sweden": "SE",
-        "Italy": "IT",
-        "Spain": "ES",
-        "Brazil": "BR",
-        "Singapore": "SG",
-        "Israel": "IL",
-        "Belgium": "BE",
-        "Austria": "AT",
-    }
-
-    selected_country = st.sidebar.selectbox(
-        "Filter by author country",
-        options=list(country_options.keys()),
-        help="Filter papers by the country of at least one author. Uses ISO country codes from OpenAlex data."
-    )
-
-    country_code = country_options[selected_country]
+    # Sidebar configuration
+    st.sidebar.header("⚙️ Search Configuration")

     # Search mode selection
     search_mode = st.sidebar.radio(
         "Search Mode",
-
-        help="Quick: 50-100 papers
-    )
-
-    # Full-text search option
-    use_fulltext = st.sidebar.checkbox(
-        "Include full text (when available)",
-        value=False,
-        help="Search title + abstract + full text. Full text available for ~10-15% of papers. May find more specific matches."
+        ["Quick Search", "Deep Search"],
+        help="Quick: 50-100 papers (~30s) | Deep: 500-1,000 papers (2-5 min)"
     )

+    # Number of papers based on mode
     if search_mode == "Quick Search":
         num_papers = st.sidebar.slider(
-            "Number of papers to
+            "Number of papers to analyze",
             min_value=20,
             max_value=100,
             value=50,
-            step=10
+            step=10,
+            help="More papers = more comprehensive but slower"
         )
-    else: # Deep Search
+    else:  # Deep Search - LIMIT TO 1000 to prevent storage issues
         num_papers = st.sidebar.slider(
-            "Number of papers to
-            min_value=
-            max_value=5000
-            value=
-            step=
-            help="⚠️ Deep search takes 2-5 minutes"
+            "Number of papers to analyze",
+            min_value=100,
+            max_value=1000,  # REDUCED from 5000
+            value=500,
+            step=100,
+            help="⚠️ Limited to 1000 papers to prevent storage issues. Deep search takes 2-5 minutes."
         )
-        st.sidebar.warning("⚠️ Deep search will take 2-5 minutes to complete")

-
-
-
-
-
-            step=5
+    # Country filter
+    selected_country = st.sidebar.selectbox(
+        "Filter by author country (optional)",
+        options=list(COUNTRIES.keys()),
+        help="Only include papers where at least one author is from this country"
     )
+    country_code = COUNTRIES[selected_country]

-
-
-
-
-
-            step=5
+    # Full-text search option
+    use_fulltext = st.sidebar.checkbox(
+        "Include full text (when available)",
+        value=False,
+        help="Search within full paper text (not just title/abstract). ~10-15% of papers have full text available. Slightly slower."
     )

+    # Minimum papers per author
     min_papers_per_author = st.sidebar.slider(
         "Minimum papers per author",
         min_value=1,
         max_value=5,
         value=2,
-
-        help="Minimum number of relevant papers an author must have to be included"
+        help="Filters out authors who appear in fewer than N papers"
     )

-    #
+    # Display settings
+    st.sidebar.header("📊 Display Settings")
+    top_papers_display = st.sidebar.slider("Number of top papers to show", 5, 50, 10)
+    top_authors_display = st.sidebar.slider("Number of top authors to show", 5, 50, 10)
+
+    # Storage usage info
+    st.sidebar.markdown("---")
+    st.sidebar.info("💾 Cache limited to prevent storage issues:\n- Max 50 searches stored\n- Max 200 authors cached\n- Max 1000 papers in Deep Search")
+
+    # Main search interface
+    st.header("🔍 Search Query")
+
     query = st.text_input(
         "Enter your search query:",
         placeholder="e.g., 'graph neural networks for protein structure prediction'",
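A side note on the compact positional sliders added above: st.sidebar.slider(label, 5, 50, 10) is shorthand for the keyword form, which may read more clearly:

import streamlit as st

# Equivalent to: st.sidebar.slider("Number of top papers to show", 5, 50, 10)
top_papers_display = st.sidebar.slider(
    "Number of top papers to show",
    min_value=5,   # slider lower bound
    max_value=50,  # slider upper bound
    value=10,      # default selection
)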