Update src/streamlit_app.py
src/streamlit_app.py  (+203 -20)
@@ -52,11 +52,12 @@ def load_model():
 
 # LIMITED CACHE: Only store 50 recent searches
 @st.cache_data(ttl=3600, max_entries=50, show_spinner=False)
-def search_openalex_papers(query, num_results=50, country_code=None, use_fulltext=False):
+def search_openalex_papers(query, num_results=50, country_code=None, use_fulltext=False, year_min=None, year_max=None):
     """
     Search OpenAlex for papers related to the query
     Optionally filter by author's country
     Optionally use full-text search (searches title + abstract + full text when available)
+    Optionally filter by publication year range
 
     Note: Results are cached for 1 hour, max 50 searches stored
     For large requests (>100), uses pagination
@@ -90,6 +91,12 @@ def search_openalex_papers(query, num_results=50, country_code=None, use_fulltex
     if country_code:
         filters.append(f"authorships.countries:{country_code}")
 
+    # Add year range filter if specified
+    if year_min is not None:
+        filters.append(f"publication_year:>{year_min-1}")  # Greater than or equal
+    if year_max is not None:
+        filters.append(f"publication_year:<{year_max+1}")  # Less than or equal
+
     # Combine filters with comma (AND operation)
     if filters:
         params["filter"] = ",".join(filters)
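Editor's note on the year filter: OpenAlex's `publication_year` filter is used here with strict `>` and `<` comparisons, so the commit widens an inclusive range by one year on each side. A minimal sketch of the resulting filter string, using made-up values for illustration:

```python
# Sketch only: how an inclusive 2015-2020 range becomes OpenAlex filter syntax.
# The country filter value is hypothetical.
year_min, year_max = 2015, 2020
filters = ["authorships.countries:US"]
filters.append(f"publication_year:>{year_min - 1}")  # >2014 means >= 2015
filters.append(f"publication_year:<{year_max + 1}")  # <2021 means <= 2020
print(",".join(filters))
# authorships.countries:US,publication_year:>2014,publication_year:<2021
```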
@@ -152,6 +159,66 @@ def get_author_details(author_id):
     except Exception as e:
         return None
 
+# LIMITED CACHE: Only store 200 recent author works lookups
+@st.cache_data(ttl=3600, max_entries=200)
+def get_author_works(author_id, max_works=20):
+    """
+    Fetch author's recent works for validation
+    Returns up to max_works most recent papers by this author
+    """
+    base_url = "https://api.openalex.org/works"
+
+    params = {
+        "filter": f"author.id:A{author_id}",
+        "per_page": max_works,
+        "sort": "cited_by_count:desc",  # Get most cited papers
+        "select": "id,title,abstract_inverted_index,publication_year",
+        "mailto": "[email protected]"
+    }
+
+    try:
+        response = requests.get(base_url, params=params, timeout=10)
+        response.raise_for_status()
+        data = response.json()
+        return data.get("results", [])
+    except Exception as e:
+        return []
+
+def validate_author_relevance(author_id, query_embedding, model, threshold=0.25, max_works=20):
+    """
+    Validate if an author is actually relevant to the search query
+    by checking semantic similarity of their body of work
+
+    Returns: (is_valid, avg_similarity, num_works_checked)
+    """
+    # Fetch author's works
+    works = get_author_works(author_id, max_works)
+
+    if not works:
+        return False, 0.0, 0
+
+    # Generate embeddings for author's works
+    work_texts = []
+    for work in works:
+        title = work.get('title', '') or work.get('display_name', '')
+        abstract = reconstruct_abstract(work.get('abstract_inverted_index', {}))
+        text = f"{title} {title} {abstract}"
+        if text.strip():
+            work_texts.append(text)
+
+    if not work_texts:
+        return False, 0.0, 0
+
+    # Calculate similarity to query
+    work_embeddings = model.encode(work_texts, convert_to_tensor=False, show_progress_bar=False)
+    similarities = calculate_semantic_similarity(query_embedding, work_embeddings)
+    avg_similarity = np.mean(similarities)
+
+    # Author is valid if their average work similarity exceeds threshold
+    is_valid = avg_similarity >= threshold
+
+    return is_valid, avg_similarity, len(work_texts)
+
 def calculate_semantic_similarity(query_embedding, paper_embeddings):
     """
     Calculate cosine similarity between query and papers
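For orientation, here is a rough standalone sketch of how the new `validate_author_relevance` helper could be exercised outside the app. It assumes the encoder is a sentence-transformers model (which the `model.encode(..., convert_to_tensor=False, show_progress_bar=False)` call above suggests) and that the app's functions are importable; the model name, import path, and author ID below are placeholders, not values from this commit. Note that each work is embedded as the title repeated twice plus the abstract, which appears intended to weight the title more heavily.

```python
# Hypothetical smoke test; IDs, model name, and import path are placeholders.
# from streamlit_app import validate_author_relevance   # illustrative import
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")  # assumed encoder; the app's load_model() may differ
query_embedding = model.encode(
    "perovskite solar cell stability", convert_to_tensor=False, show_progress_bar=False
)

# "1234567890" is a made-up OpenAlex author ID (the helper prefixes it with 'A')
is_valid, avg_sim, n_works = validate_author_relevance(
    "1234567890", query_embedding, model, threshold=0.25, max_works=20
)
print(f"valid={is_valid}  mean similarity={avg_sim:.3f}  works checked={n_works}")
```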
@@ -164,12 +231,14 @@ def calculate_semantic_similarity(query_embedding, paper_embeddings):
     similarities = np.dot(paper_norms, query_norm)
     return similarities
 
-def rank_authors(papers, paper_scores, model, query_embedding, min_papers=2):
+def rank_authors(papers, paper_scores, model, query_embedding, min_papers=2, validate_authors=True, validation_threshold=0.25):
     """
     Extract authors from papers and rank them based on:
     - Semantic relevance (average of their paper scores)
     - H-index
     - Total citations
+
+    If validate_authors=True, checks each author's body of work for relevance
     """
     author_data = defaultdict(lambda: {
         'name': '',
@@ -179,7 +248,9 @@ def rank_authors(papers, paper_scores, model, query_embedding, min_papers=2):
         'total_citations': 0,
         'works_count': 0,
         'h_index': 0,
-        'institution': ''
+        'institution': '',
+        'validation_score': 0.0,
+        'validated': False
     })
 
     # Collect author information from papers
@@ -220,6 +291,35 @@ def rank_authors(papers, paper_scores, model, query_embedding, min_papers=2):
 
     progress_bar.empty()
 
+    # Validate authors if requested
+    if validate_authors:
+        with st.spinner(f"Validating author relevance (checking their body of work)..."):
+            progress_bar = st.progress(0)
+            validated_count = 0
+
+            for idx, (author_id, data) in enumerate(filtered_authors.items()):
+                is_valid, val_score, num_works = validate_author_relevance(
+                    author_id, query_embedding, model, validation_threshold
+                )
+                data['validated'] = is_valid
+                data['validation_score'] = val_score
+                data['num_works_checked'] = num_works
+
+                if is_valid:
+                    validated_count += 1
+
+                progress_bar.progress((idx + 1) / len(filtered_authors))
+                time.sleep(0.1)  # Rate limiting
+
+            progress_bar.empty()
+            st.success(f"✅ {validated_count}/{len(filtered_authors)} authors validated as relevant to your query")
+
+            # Filter to only validated authors
+            filtered_authors = {
+                aid: data for aid, data in filtered_authors.items()
+                if data['validated']
+            }
+
     # Calculate composite score for ranking
     ranked_authors = []
     for author_id, data in filtered_authors.items():
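Rough cost of this validation pass: each candidate author triggers one OpenAlex works request via `get_author_works` plus a 0.1 s sleep, so the extra wall-clock time grows linearly with the number of authors that survive the minimum-papers filter. A back-of-the-envelope estimate, with an assumed per-request latency:

```python
# Illustrative estimate only; the 0.5 s request latency is an assumption.
n_authors = 40
per_author = 0.5 + 0.1          # assumed request time + time.sleep(0.1)
print(f"~{n_authors * per_author:.0f} s of validation overhead")   # ~24 s for 40 authors
```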
@@ -230,11 +330,21 @@ def rank_authors(papers, paper_scores, model, query_embedding, min_papers=2):
         normalized_citations = np.log1p(data['total_citations']) / 15.0  # Log scale
 
         # Weighted composite score
-        composite_score = (
-            0.5 * avg_relevance +                      # 50% relevance
-            0.3 * min(normalized_h_index, 1.0) +       # 30% h-index
-            0.2 * min(normalized_citations, 1.0)       # 20% citations
-        )
+        if validate_authors:
+            # Include validation score in composite
+            composite_score = (
+                0.4 * avg_relevance +                      # 40% relevance in initial papers
+                0.3 * data['validation_score'] +           # 30% validation (their body of work)
+                0.2 * min(normalized_h_index, 1.0) +       # 20% h-index
+                0.1 * min(normalized_citations, 1.0)       # 10% citations
+            )
+        else:
+            # Original scoring without validation
+            composite_score = (
+                0.5 * avg_relevance +                      # 50% relevance
+                0.3 * min(normalized_h_index, 1.0) +       # 30% h-index
+                0.2 * min(normalized_citations, 1.0)       # 20% citations
+            )
 
         ranked_authors.append({
             'name': data['name'],
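To make the re-weighting concrete, a small worked example with made-up inputs (the h-index and citation normalisations are taken as given, already capped at 1.0 upstream):

```python
# Illustrative numbers only
avg_relevance        = 0.62   # mean similarity of the author's matched papers
validation_score     = 0.48   # mean similarity across their wider body of work
normalized_h_index   = 0.30
normalized_citations = 0.60

with_validation    = 0.4*avg_relevance + 0.3*validation_score + 0.2*normalized_h_index + 0.1*normalized_citations
without_validation = 0.5*avg_relevance + 0.3*normalized_h_index + 0.2*normalized_citations
print(with_validation, without_validation)   # approx. 0.512 vs 0.52
```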
@@ -244,6 +354,8 @@ def rank_authors(papers, paper_scores, model, query_embedding, min_papers=2):
             'works_count': data['works_count'],
             'num_relevant_papers': len(data['paper_scores']),
             'avg_relevance_score': avg_relevance,
+            'validation_score': data['validation_score'],
+            'validated': data['validated'],
             'composite_score': composite_score,
             'institution': data['institution'],
             'openalex_url': f"https://openalex.org/A{author_id}"
@@ -315,6 +427,37 @@ def main():
     )
     country_code = COUNTRIES[selected_country]
 
+    # Year range filter
+    st.sidebar.subheader("📅 Year Range")
+    current_year = 2025
+    use_year_filter = st.sidebar.checkbox(
+        "Limit by publication year",
+        value=False,
+        help="Filter papers by publication year range"
+    )
+
+    if use_year_filter:
+        year_col1, year_col2 = st.sidebar.columns(2)
+        with year_col1:
+            year_min = st.number_input(
+                "From",
+                min_value=1900,
+                max_value=current_year,
+                value=2015,
+                step=1
+            )
+        with year_col2:
+            year_max = st.number_input(
+                "To",
+                min_value=1900,
+                max_value=current_year,
+                value=current_year,
+                step=1
+            )
+    else:
+        year_min = None
+        year_max = None
+
     # Full-text search option
     use_fulltext = st.sidebar.checkbox(
         "Include full text (when available)",
@@ -322,6 +465,26 @@ def main():
         help="Search within full paper text (not just title/abstract). ~10-15% of papers have full text available. Slightly slower."
     )
 
+    # Author validation
+    st.sidebar.subheader("👤 Author Validation")
+    validate_authors = st.sidebar.checkbox(
+        "Validate authors' body of work",
+        value=True,
+        help="Check each author's recent papers to confirm they're actually working in this area. More accurate but slower."
+    )
+
+    if validate_authors:
+        validation_threshold = st.sidebar.slider(
+            "Validation threshold",
+            min_value=0.15,
+            max_value=0.50,
+            value=0.25,
+            step=0.05,
+            help="Minimum average similarity score for author's works. Higher = stricter filter."
+        )
+    else:
+        validation_threshold = 0.25
+
     # Minimum papers per author
     min_papers_per_author = st.sidebar.slider(
         "Minimum papers per author",
@@ -353,21 +516,25 @@ def main():
 
     if search_button and query:
         # Display search parameters
-        st.info(f"🔍 Searching: **{query}** | Mode: **{search_mode}** | Papers: **{num_papers}** | Country: **{selected_country}** | Full-text: **{'Yes' if use_fulltext else 'No'}** | Min papers/author: **{min_papers_per_author}**")
+        year_range_text = f"Years: **{year_min}-{year_max}**" if use_year_filter else "Years: **All**"
+        validation_text = f"Validation: **On (threshold {validation_threshold})**" if validate_authors else "Validation: **Off**"
+
+        st.info(f"🔍 Searching: **{query}** | Mode: **{search_mode}** | Papers: **{num_papers}** | {year_range_text} | Country: **{selected_country}** | Full-text: **{'Yes' if use_fulltext else 'No'}** | {validation_text} | Min papers/author: **{min_papers_per_author}**")
 
         # Load model
         with st.spinner("Loading semantic model..."):
             model = load_model()
 
         # Search papers
-        search_key = f"{query}_{num_papers}_{country_code}_{use_fulltext}"
+        search_key = f"{query}_{num_papers}_{country_code}_{use_fulltext}_{year_min}_{year_max}"
 
         if search_mode == "Deep Search":
            progress_text = f"🔍 Deep search in progress: Fetching up to {num_papers} papers from OpenAlex..."
            progress_bar = st.progress(0, text=progress_text)
 
-        with st.spinner(f"Searching OpenAlex for papers about '{query}'{' from ' + selected_country if country_code else ''}{' (including full text)' if use_fulltext else ''}..."):
-            papers = search_openalex_papers(query, num_papers, country_code, use_fulltext)
+        year_filter_text = f" from {year_min}-{year_max}" if use_year_filter else ""
+        with st.spinner(f"Searching OpenAlex for papers about '{query}'{year_filter_text}{' from ' + selected_country if country_code else ''}{' (including full text)' if use_fulltext else ''}..."):
+            papers = search_openalex_papers(query, num_papers, country_code, use_fulltext, year_min, year_max)
 
         if search_mode == "Deep Search":
             progress_bar.progress(33, text="📄 Papers fetched! Now generating embeddings...")
@@ -383,9 +550,11 @@ def main():
             st.write(f"**Search Mode:** {search_mode}")
             st.write(f"**Query:** {query}")
             st.write(f"**Full-text search:** {'Enabled' if use_fulltext else 'Disabled'}")
+            st.write(f"**Year range:** {year_min}-{year_max}" if use_year_filter else "**Year range:** All years")
             st.write(f"**Papers requested:** {num_papers}")
             st.write(f"**Papers fetched:** {len(papers)}")
             st.write(f"**Country filter:** {selected_country} ({country_code or 'None'})")
+            st.write(f"**Author validation:** {'Enabled (threshold: ' + str(validation_threshold) + ')' if validate_authors else 'Disabled'}")
            st.write(f"**First paper:** {papers[0].get('display_name', 'N/A')[:100]}...")
            st.write(f"**Last paper:** {papers[-1].get('display_name', 'N/A')[:100]}...")
 
@@ -469,7 +638,9 @@ def main():
                 sorted_scores,
                 model,
                 query_embedding,
-                min_papers=min_papers_per_author
+                min_papers=min_papers_per_author,
+                validate_authors=validate_authors,
+                validation_threshold=validation_threshold
             )
 
             if not ranked_authors:
@@ -495,9 +666,15 @@ def main():
                     st.metric("Citations", f"{author['total_citations']:,}")
 
                 with col4:
-                    st.metric("Relevance", f"{author['avg_relevance_score']:.3f}")
+                    if validate_authors:
+                        st.metric("Body Relevance", f"{author['validation_score']:.3f}")
+                    else:
+                        st.metric("Relevance", f"{author['avg_relevance_score']:.3f}")
 
-                st.caption(f"Total works: {author['works_count']} | Relevant papers: {author['num_relevant_papers']}")
+                caption_text = f"Total works: {author['works_count']} | Relevant papers: {author['num_relevant_papers']}"
+                if validate_authors:
+                    caption_text += f" | Paper relevance: {author['avg_relevance_score']:.3f}"
+                st.caption(caption_text)
                 st.divider()
 
             # Download results
@@ -511,14 +688,17 @@ def main():
             csv_writer = csv.writer(csv_buffer)
 
             # Write header
-            csv_writer.writerow([
+            header = [
                 'Rank', 'Name', 'Institution', 'H-Index', 'Total Citations',
                 'Total Works', 'Relevant Papers', 'Avg Relevance Score', 'Composite Score', 'OpenAlex URL'
-            ])
+            ]
+            if validate_authors:
+                header.insert(-1, 'Body of Work Validation Score')
+            csv_writer.writerow(header)
 
             # Write data
             for idx, author in enumerate(ranked_authors, 1):
-                csv_writer.writerow([
+                row = [
                     idx,
                     author['name'],
                     author['institution'],
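One subtlety worth noting: `header.insert(-1, ...)` places the validation column before the last header entry ('OpenAlex URL'), which matches the row construction in the next hunk, where the validation score is appended before the URL. A quick illustration of that list behaviour:

```python
header = ['Rank', 'Name', 'Composite Score', 'OpenAlex URL']   # abbreviated for the example
header.insert(-1, 'Body of Work Validation Score')
print(header)
# ['Rank', 'Name', 'Composite Score', 'Body of Work Validation Score', 'OpenAlex URL']
```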
@@ -528,8 +708,11 @@ def main():
                     author['num_relevant_papers'],
                     f"{author['avg_relevance_score']:.4f}",
                     f"{author['composite_score']:.4f}",
-                    author['openalex_url']
-                ])
+                ]
+                if validate_authors:
+                    row.append(f"{author['validation_score']:.4f}")
+                row.append(author['openalex_url'])
+                csv_writer.writerow(row)
 
             csv_data = csv_buffer.getvalue()
 