|
|
""" |
|
|
DocMind - Utility Functions |
|
|
Helper functions for the multi-agent system |
|
|
""" |
|
|
|
|
|
import re
from collections import Counter
from datetime import datetime
from typing import List, Dict
|
|
|
|
|
|
|
|
def clean_text(text: str) -> str: |
|
|
"""Clean and normalize text""" |
|
|
|
|
|
text = re.sub(r'\s+', ' ', text) |
|
|
|
|
|
text = re.sub(r'[^\w\s.,!?;:()\-]', '', text) |
|
|
return text.strip() |
|
|
|
|
|
|
|
|
def truncate_text(text: str, max_length: int = 500) -> str: |
|
|
"""Truncate text to maximum length, ending at sentence boundary""" |
|
|
if len(text) <= max_length: |
|
|
return text |
|
|
|
|
|
|
|
|
truncated = text[:max_length] |
|
|
last_period = truncated.rfind('.') |
|
|
|
|
|
if last_period > 0: |
|
|
return truncated[:last_period + 1] |
|
|
return truncated + "..." |
|
|
|
|
|
|
|
|
def format_authors(authors: List[str], max_authors: int = 3) -> str: |
|
|
"""Format author list for display""" |
|
|
if len(authors) <= max_authors: |
|
|
return ", ".join(authors) |
|
|
else: |
|
|
return ", ".join(authors[:max_authors]) + " et al." |
|
|
|
|
|
|
|
|
def extract_year(date_string: str) -> int: |
|
|
"""Extract year from date string""" |
|
|
try: |
|
|
if isinstance(date_string, str): |
|
|
return int(date_string[:4]) |
|
|
return datetime.now().year |
|
|
except: |
|
|
return datetime.now().year |
|
|
|
|
|
|
|
|
def score_recency(year: int, current_year: int = None) -> float: |
|
|
""" |
|
|
Score paper based on recency |
|
|
|
|
|
Returns: |
|
|
Score from 0-1, where 1 is most recent |
|
|
""" |
|
|
if current_year is None: |
|
|
current_year = datetime.now().year |
|
|
|
|
|
age = current_year - year |
|
|
if age <= 0: |
|
|
return 1.0 |
|
|
elif age <= 1: |
|
|
return 0.9 |
|
|
elif age <= 2: |
|
|
return 0.7 |
|
|
elif age <= 3: |
|
|
return 0.5 |
|
|
else: |
|
|
return max(0.3, 1.0 / (age + 1)) |
|
|
|
|
|
|
|
|
def combine_scores( |
|
|
relevance: float, |
|
|
recency: float, |
|
|
quality: float, |
|
|
weights: Dict[str, float] = None |
|
|
) -> float: |
|
|
""" |
|
|
Combine multiple scores with weights |
|
|
|
|
|
Args: |
|
|
relevance: Relevance score (0-1) |
|
|
recency: Recency score (0-1) |
|
|
quality: Quality score (0-1) |
|
|
weights: Dict with keys 'relevance', 'recency', 'quality' |
|
|
|
|
|
Returns: |
|
|
Combined score (0-1) |
|
|
""" |
|
|
if weights is None: |
|
|
weights = { |
|
|
'relevance': 0.6, |
|
|
'recency': 0.2, |
|
|
'quality': 0.2 |
|
|
} |
|
|
|
|
|
return ( |
|
|
relevance * weights['relevance'] + |
|
|
recency * weights['recency'] + |
|
|
quality * weights['quality'] |
|
|
) |
|
|
|
|
|
|
|
|
def deduplicate_papers(papers: List[Dict]) -> List[Dict]: |
|
|
"""Remove duplicate papers based on arXiv ID""" |
|
|
seen = set() |
|
|
unique = [] |
|
|
|
|
|
for paper in papers: |
|
|
paper_id = paper.get('arxiv_id', '') |
|
|
if paper_id and paper_id not in seen: |
|
|
seen.add(paper_id) |
|
|
unique.append(paper) |
|
|
|
|
|
return unique |
|
|
|
|
|
|
|
|
def format_citation(paper: Dict, style: str = 'apa') -> str: |
|
|
""" |
|
|
Format paper citation |
|
|
|
|
|
Args: |
|
|
paper: Paper dict with title, authors, year, arxiv_id |
|
|
style: Citation style ('apa', 'simple', 'markdown') |
|
|
|
|
|
Returns: |
|
|
Formatted citation string |
|
|
""" |
|
|
authors = format_authors(paper.get('authors', [])) |
|
|
title = paper.get('title', 'Unknown Title') |
|
|
year = extract_year(paper.get('published', '')) |
|
|
arxiv_id = paper.get('arxiv_id', '') |
|
|
|
|
|
if style == 'apa': |
|
|
return f"{authors} ({year}). {title}. arXiv:{arxiv_id}" |
|
|
|
|
|
elif style == 'markdown': |
|
|
return f"**{title}** - {authors} ({year}) - arXiv:[{arxiv_id}](https://arxiv.org/abs/{arxiv_id})" |
|
|
|
|
|
else: |
|
|
return f"{title} ({arxiv_id}, {year})" |
|
|
|
|
|
|
|
|
def extract_keywords(text: str, top_n: int = 5) -> List[str]: |
|
|
""" |
|
|
Extract simple keywords from text (frequency-based) |
|
|
|
|
|
Args: |
|
|
text: Input text |
|
|
top_n: Number of keywords to return |
|
|
|
|
|
Returns: |
|
|
List of top keywords |
|
|
""" |
|
|
|
|
|
|
|
|
stop_words = { |
|
|
'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', |
|
|
'of', 'with', 'by', 'from', 'is', 'are', 'was', 'were', 'be', 'been', |
|
|
'this', 'that', 'these', 'those', 'we', 'our', 'propose', 'show' |
|
|
} |
|
|
|
|
|
|
|
|
words = re.findall(r'\b[a-z]{4,}\b', text.lower()) |
|
|
word_freq = {} |
|
|
|
|
|
for word in words: |
|
|
if word not in stop_words: |
|
|
word_freq[word] = word_freq.get(word, 0) + 1 |
|
|
|
|
|
|
|
|
sorted_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True) |
|
|
return [word for word, freq in sorted_words[:top_n]] |
|
|
|
|
|
|
|
|
class ProgressTracker: |
|
|
"""Simple progress tracker for multi-step processes""" |
|
|
|
|
|
def __init__(self, total_steps: int): |
|
|
self.total_steps = total_steps |
|
|
self.current_step = 0 |
|
|
self.step_names = [] |
|
|
|
|
|
def next_step(self, step_name: str = None): |
|
|
"""Move to next step""" |
|
|
self.current_step += 1 |
|
|
if step_name: |
|
|
self.step_names.append(step_name) |
|
|
|
|
|
def get_progress(self) -> float: |
|
|
"""Get progress as percentage""" |
|
|
return (self.current_step / self.total_steps) * 100 |
|
|
|
|
|
def get_status(self) -> str: |
|
|
"""Get status string""" |
|
|
return f"Step {self.current_step}/{self.total_steps} ({self.get_progress():.1f}%)" |
|
|
|
|
|
|
|
|
def validate_paper_dict(paper: Dict) -> bool: |
|
|
"""Validate that paper dictionary has required fields""" |
|
|
required_fields = ['title', 'abstract', 'arxiv_id', 'authors', 'published'] |
|
|
return all(field in paper for field in required_fields) |
|
|
|
|
|
|
|
|
def safe_get(dictionary: Dict, key: str, default=None): |
|
|
"""Safely get value from dictionary with fallback""" |
|
|
try: |
|
|
return dictionary.get(key, default) |
|
|
except: |
|
|
return default |
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__": |
|
|
|
|
|
sample_paper = { |
|
|
'title': 'Attention Is All You Need', |
|
|
'authors': ['Vaswani', 'Shazeer', 'Parmar', 'Uszkoreit'], |
|
|
'published': '2017-06-12', |
|
|
'arxiv_id': '1706.03762', |
|
|
'abstract': 'The dominant sequence transduction models are based on complex recurrent or convolutional neural networks...' |
|
|
} |
|
|
|
|
|
print("Citation (APA):", format_citation(sample_paper, 'apa')) |
|
|
print("Citation (Markdown):", format_citation(sample_paper, 'markdown')) |
|
|
print("Authors:", format_authors(sample_paper['authors'])) |
|
|
print("Recency score:", score_recency(2017)) |
|
|
print("Keywords:", extract_keywords(sample_paper['abstract'])) |