Spaces:

Tanaybh
/

DocMind

Runtime error

File size: 6,496 Bytes

3a5fdfb

"""
DocMind - Utility Functions
Helper functions for the multi-agent system
"""

from typing import List, Dict
import re
from datetime import datetime


def clean_text(text: str) -> str:
    """Clean and normalize text.

    Removes every character that is not a word character, whitespace, or one
    of ``. , ! ? ; : ( ) -``, then collapses each run of whitespace into a
    single space and trims both ends.

    Args:
        text: Raw input text.

    Returns:
        The cleaned, whitespace-normalized text.
    """
    # Strip special characters FIRST: deleting a symbol that sat between two
    # spaces ("a @ b") would otherwise leave a double space in the result.
    text = re.sub(r'[^\w\s.,!?;:()\-]', '', text)
    # Collapse any whitespace run (spaces, tabs, newlines) to one space.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()


def truncate_text(text: str, max_length: int = 500) -> str:
    """Shorten text to roughly ``max_length`` characters.

    Text that already fits is returned unchanged. Otherwise the first
    ``max_length`` characters are taken and cut back to the last period
    found in them (period included). When no period exists past index 0,
    the cut-off prefix is returned with ``"..."`` appended, so in that case
    the result is three characters longer than ``max_length``.
    """
    if len(text) > max_length:
        head = text[:max_length]
        sentence_end = head.rfind('.')
        # rfind gives -1 when there is no period; a period at index 0 is
        # also ignored, so both cases take the ellipsis fallback.
        return head[:sentence_end + 1] if sentence_end > 0 else head + "..."

    return text


def format_authors(authors: List[str], max_authors: int = 3) -> str:
    """Join author names with commas for display.

    When there are more than ``max_authors`` names, only the first
    ``max_authors`` are listed and " et al." is appended.
    """
    too_many = len(authors) > max_authors
    if too_many:
        leading = authors[:max_authors]
        return ", ".join(leading) + " et al."
    return ", ".join(authors)


def extract_year(date_string: str) -> int:
    """Extract year from date string.

    Args:
        date_string: Date text whose first four characters are the year,
            e.g. ``'2017-06-12'``. A ``datetime`` object is also accepted
            and contributes its own ``year``.

    Returns:
        The extracted year. Falls back to the current year when the input
        is neither a string nor a ``datetime``, or when the first four
        characters cannot be parsed as an integer.
    """
    if isinstance(date_string, datetime):
        return date_string.year

    if isinstance(date_string, str):
        try:
            return int(date_string[:4])
        except ValueError:
            # Prefix is not an integer (empty string, "n/a", ...); only this
            # error is expected, so nothing broader is caught.
            return datetime.now().year

    return datetime.now().year


def score_recency(year: int, current_year: int = None) -> float:
    """
    Rate how recent a publication year is.

    Args:
        year: Publication year to score
        current_year: Year to measure against; the current calendar year
            is used when this is None

    Returns:
        1.0 for age <= 0, 0.9 for age <= 1, 0.7 for age <= 2,
        0.5 for age <= 3, and 0.3 for anything older
    """
    reference_year = datetime.now().year if current_year is None else current_year
    years_old = reference_year - year

    # (upper age bound, score) tiers, checked from youngest to oldest.
    tiers = ((0, 1.0), (1, 0.9), (2, 0.7), (3, 0.5))
    for age_limit, tier_score in tiers:
        if years_old <= age_limit:
            return tier_score

    # Past three years the reciprocal is below 0.25, so the 0.3 floor wins.
    return max(0.3, 1.0 / (years_old + 1))


def combine_scores(
        relevance: float,
        recency: float,
        quality: float,
        weights: Dict[str, float] = None
) -> float:
    """
    Compute the weighted sum of three scores

    Args:
        relevance: Relevance score (0-1)
        recency: Recency score (0-1)
        quality: Quality score (0-1)
        weights: Dict with keys 'relevance', 'recency', 'quality';
            defaults to 0.6 / 0.2 / 0.2 when None

    Returns:
        Weighted sum of the three scores (within 0-1 when the inputs are
        in 0-1 and the weights sum to 1, as the defaults do)
    """
    if weights is None:
        weights = {'relevance': 0.6, 'recency': 0.2, 'quality': 0.2}

    weighted_relevance = relevance * weights['relevance']
    weighted_recency = recency * weights['recency']
    weighted_quality = quality * weights['quality']

    return weighted_relevance + weighted_recency + weighted_quality


def deduplicate_papers(papers: List[Dict]) -> List[Dict]:
    """Keep the first paper seen for each arXiv ID, in input order.

    Papers whose 'arxiv_id' is missing or empty are left out of the result.
    """
    kept = []
    known_ids = set()

    for entry in papers:
        identifier = entry.get('arxiv_id', '')
        # Skip entries without an ID as well as repeats of a known ID.
        if not identifier or identifier in known_ids:
            continue
        known_ids.add(identifier)
        kept.append(entry)

    return kept


def format_citation(paper: Dict, style: str = 'apa') -> str:
    """
    Build a citation string for a paper

    Args:
        paper: Paper dict; reads 'authors', 'title', 'published' and
            'arxiv_id', each with a fallback when the key is absent
        style: 'apa' or 'markdown'; any other value gives the simple form

    Returns:
        Formatted citation string
    """
    author_text = format_authors(paper.get('authors', []))
    title = paper.get('title', 'Unknown Title')
    year = extract_year(paper.get('published', ''))
    arxiv_id = paper.get('arxiv_id', '')

    if style == 'apa':
        return f"{author_text} ({year}). {title}. arXiv:{arxiv_id}"

    if style == 'markdown':
        return f"**{title}** - {author_text} ({year}) - arXiv:[{arxiv_id}](https://arxiv.org/abs/{arxiv_id})"

    # Every other style value falls back to the simple form.
    return f"{title} ({arxiv_id}, {year})"


def extract_keywords(text: str, top_n: int = 5) -> List[str]:
    """
    Pick the most frequent words of a text as its keywords

    Args:
        text: Input text
        top_n: Number of keywords to return

    Returns:
        Up to top_n words, most frequent first; words with equal counts
        keep the order in which they first appear in the text
    """
    # Common words that never count as keywords.
    ignored = {
        'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
        'of', 'with', 'by', 'from', 'is', 'are', 'was', 'were', 'be', 'been',
        'this', 'that', 'these', 'those', 'we', 'our', 'propose', 'show'
    }

    # Candidate tokens: lowercase runs of at least four letters a-z.
    counts = {}
    for token in re.findall(r'\b[a-z]{4,}\b', text.lower()):
        if token in ignored:
            continue
        counts[token] = counts.get(token, 0) + 1

    # Stable sort on negated count: highest first, ties in first-seen order.
    ranked = sorted(counts, key=lambda token: -counts[token])
    return ranked[:top_n]


class ProgressTracker:
    """Simple progress tracker for multi-step processes.

    Counts completed steps against a fixed total and records the names of
    the steps that were given one.
    """

    def __init__(self, total_steps: int):
        """Start a tracker at step 0 out of ``total_steps``."""
        self.total_steps = total_steps
        self.current_step = 0
        # Names passed to next_step(), in call order.
        self.step_names = []

    def next_step(self, step_name: str = None):
        """Advance by one step, recording ``step_name`` when one is given."""
        self.current_step += 1
        if step_name:
            self.step_names.append(step_name)

    def get_progress(self) -> float:
        """Get progress as a percentage.

        Returns:
            ``current_step / total_steps * 100``, or 0.0 when
            ``total_steps`` is 0.
        """
        # A tracker created with zero steps would otherwise raise
        # ZeroDivisionError here and in get_status().
        if self.total_steps == 0:
            return 0.0
        return (self.current_step / self.total_steps) * 100

    def get_status(self) -> str:
        """Get a status string such as ``'Step 1/4 (25.0%)'``."""
        return f"Step {self.current_step}/{self.total_steps} ({self.get_progress():.1f}%)"


def validate_paper_dict(paper: Dict) -> bool:
    """Report whether the paper dict contains every required key.

    Only the presence of the keys is checked, not their values.
    """
    for required_key in ('title', 'abstract', 'arxiv_id', 'authors', 'published'):
        if required_key not in paper:
            return False
    return True


def safe_get(dictionary: Dict, key: str, default=None):
    """Safely get value from dictionary with fallback.

    Args:
        dictionary: Object to read from; expected to provide ``.get``.
        key: Key to look up.
        default: Value returned when the key is absent or the lookup fails.

    Returns:
        ``dictionary.get(key, default)``, or ``default`` when that call
        raises an ordinary exception (for example ``dictionary`` is None or
        ``key`` is unhashable).
    """
    try:
        return dictionary.get(key, default)
    except Exception:
        # Best-effort lookup: ordinary errors fall back to the default, but
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        return default


# Demo: exercise a few helpers when this file is run as a script
if __name__ == "__main__":
    # Sample record used by every call below
    demo_paper = {
        'title': 'Attention Is All You Need',
        'authors': ['Vaswani', 'Shazeer', 'Parmar', 'Uszkoreit'],
        'published': '2017-06-12',
        'arxiv_id': '1706.03762',
        'abstract': 'The dominant sequence transduction models are based on complex recurrent or convolutional neural networks...'
    }

    # One citation line per supported display style
    for label, style_name in (("APA", 'apa'), ("Markdown", 'markdown')):
        print(f"Citation ({label}):", format_citation(demo_paper, style_name))

    author_line = format_authors(demo_paper['authors'])
    print("Authors:", author_line)

    recency = score_recency(2017)
    print("Recency score:", recency)

    keywords = extract_keywords(demo_paper['abstract'])
    print("Keywords:", keywords)