""" DocMind - Utility Functions Helper functions for the multi-agent system """ from typing import List, Dict import re from datetime import datetime def clean_text(text: str) -> str: """Clean and normalize text""" # Remove extra whitespace text = re.sub(r'\s+', ' ', text) # Remove special characters but keep basic punctuation text = re.sub(r'[^\w\s.,!?;:()\-]', '', text) return text.strip() def truncate_text(text: str, max_length: int = 500) -> str: """Truncate text to maximum length, ending at sentence boundary""" if len(text) <= max_length: return text # Find last sentence boundary before max_length truncated = text[:max_length] last_period = truncated.rfind('.') if last_period > 0: return truncated[:last_period + 1] return truncated + "..." def format_authors(authors: List[str], max_authors: int = 3) -> str: """Format author list for display""" if len(authors) <= max_authors: return ", ".join(authors) else: return ", ".join(authors[:max_authors]) + " et al." def extract_year(date_string: str) -> int: """Extract year from date string""" try: if isinstance(date_string, str): return int(date_string[:4]) return datetime.now().year except: return datetime.now().year def score_recency(year: int, current_year: int = None) -> float: """ Score paper based on recency Returns: Score from 0-1, where 1 is most recent """ if current_year is None: current_year = datetime.now().year age = current_year - year if age <= 0: return 1.0 elif age <= 1: return 0.9 elif age <= 2: return 0.7 elif age <= 3: return 0.5 else: return max(0.3, 1.0 / (age + 1)) def combine_scores( relevance: float, recency: float, quality: float, weights: Dict[str, float] = None ) -> float: """ Combine multiple scores with weights Args: relevance: Relevance score (0-1) recency: Recency score (0-1) quality: Quality score (0-1) weights: Dict with keys 'relevance', 'recency', 'quality' Returns: Combined score (0-1) """ if weights is None: weights = { 'relevance': 0.6, 'recency': 0.2, 'quality': 0.2 } return ( relevance * weights['relevance'] + recency * weights['recency'] + quality * weights['quality'] ) def deduplicate_papers(papers: List[Dict]) -> List[Dict]: """Remove duplicate papers based on arXiv ID""" seen = set() unique = [] for paper in papers: paper_id = paper.get('arxiv_id', '') if paper_id and paper_id not in seen: seen.add(paper_id) unique.append(paper) return unique def format_citation(paper: Dict, style: str = 'apa') -> str: """ Format paper citation Args: paper: Paper dict with title, authors, year, arxiv_id style: Citation style ('apa', 'simple', 'markdown') Returns: Formatted citation string """ authors = format_authors(paper.get('authors', [])) title = paper.get('title', 'Unknown Title') year = extract_year(paper.get('published', '')) arxiv_id = paper.get('arxiv_id', '') if style == 'apa': return f"{authors} ({year}). {title}. arXiv:{arxiv_id}" elif style == 'markdown': return f"**{title}** - {authors} ({year}) - arXiv:[{arxiv_id}](https://arxiv.org/abs/{arxiv_id})" else: # simple return f"{title} ({arxiv_id}, {year})" def extract_keywords(text: str, top_n: int = 5) -> List[str]: """ Extract simple keywords from text (frequency-based) Args: text: Input text top_n: Number of keywords to return Returns: List of top keywords """ # Simple word frequency approach # Remove common words stop_words = { 'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'from', 'is', 'are', 'was', 'were', 'be', 'been', 'this', 'that', 'these', 'those', 'we', 'our', 'propose', 'show' } # Tokenize and count words = re.findall(r'\b[a-z]{4,}\b', text.lower()) word_freq = {} for word in words: if word not in stop_words: word_freq[word] = word_freq.get(word, 0) + 1 # Get top N sorted_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True) return [word for word, freq in sorted_words[:top_n]] class ProgressTracker: """Simple progress tracker for multi-step processes""" def __init__(self, total_steps: int): self.total_steps = total_steps self.current_step = 0 self.step_names = [] def next_step(self, step_name: str = None): """Move to next step""" self.current_step += 1 if step_name: self.step_names.append(step_name) def get_progress(self) -> float: """Get progress as percentage""" return (self.current_step / self.total_steps) * 100 def get_status(self) -> str: """Get status string""" return f"Step {self.current_step}/{self.total_steps} ({self.get_progress():.1f}%)" def validate_paper_dict(paper: Dict) -> bool: """Validate that paper dictionary has required fields""" required_fields = ['title', 'abstract', 'arxiv_id', 'authors', 'published'] return all(field in paper for field in required_fields) def safe_get(dictionary: Dict, key: str, default=None): """Safely get value from dictionary with fallback""" try: return dictionary.get(key, default) except: return default # Example usage if __name__ == "__main__": # Test utilities sample_paper = { 'title': 'Attention Is All You Need', 'authors': ['Vaswani', 'Shazeer', 'Parmar', 'Uszkoreit'], 'published': '2017-06-12', 'arxiv_id': '1706.03762', 'abstract': 'The dominant sequence transduction models are based on complex recurrent or convolutional neural networks...' } print("Citation (APA):", format_citation(sample_paper, 'apa')) print("Citation (Markdown):", format_citation(sample_paper, 'markdown')) print("Authors:", format_authors(sample_paper['authors'])) print("Recency score:", score_recency(2017)) print("Keywords:", extract_keywords(sample_paper['abstract']))