DocMind / utils.py
Tanaybh's picture
Upload 4 files
3a5fdfb verified
raw
history blame
6.5 kB
"""
DocMind - Utility Functions
Helper functions for the multi-agent system
"""
import re
from collections import Counter
from datetime import datetime
from typing import Dict, List
def clean_text(text: str) -> str:
    """Normalize whitespace and strip unsupported characters from text."""
    # Collapse any run of whitespace into a single space.
    collapsed = re.sub(r'\s+', ' ', text)
    # Keep word characters, whitespace, and a small set of punctuation marks.
    filtered = re.sub(r'[^\w\s.,!?;:()\-]', '', collapsed)
    return filtered.strip()
def truncate_text(text: str, max_length: int = 500) -> str:
    """Truncate text to at most max_length characters, preferring a sentence boundary.

    Args:
        text: Input text.
        max_length: Maximum number of characters to keep.

    Returns:
        The original text if it already fits; otherwise the longest prefix
        ending at a sentence terminator ('.', '!' or '?'), or the hard-cut
        prefix with a trailing ellipsis when no terminator is found.
    """
    if len(text) <= max_length:
        return text
    truncated = text[:max_length]
    # Consider all sentence terminators, not just '.', so questions and
    # exclamations also end cleanly (as the docstring promises).
    last_boundary = max(truncated.rfind(ch) for ch in '.!?')
    if last_boundary > 0:
        return truncated[:last_boundary + 1]
    return truncated + "..."
def format_authors(authors: List[str], max_authors: int = 3) -> str:
    """Render an author list, abbreviating with 'et al.' past max_authors."""
    if len(authors) > max_authors:
        # Keep only the leading names and mark the rest.
        return ", ".join(authors[:max_authors]) + " et al."
    return ", ".join(authors)
def extract_year(date_string: str) -> int:
    """Extract a four-digit year from an ISO-like date string.

    Args:
        date_string: A string starting with a year, e.g. '2017-06-12'.

    Returns:
        The parsed year, or the current year when the input is not a string
        or its first four characters are not an integer.
    """
    if not isinstance(date_string, str):
        return datetime.now().year
    try:
        return int(date_string[:4])
    except ValueError:
        # Malformed prefix (e.g. '' or 'n/a') — fall back to the current year.
        return datetime.now().year
def score_recency(year: int, current_year: int = None) -> float:
    """Score a paper's recency on a 0-1 scale.

    Args:
        year: Publication year of the paper.
        current_year: Reference year; defaults to the current calendar year.

    Returns:
        Score from 0-1, where 1 is most recent.
    """
    if current_year is None:
        current_year = datetime.now().year
    age = current_year - year
    # Fixed score tiers for the most recent years.
    tiers = ((0, 1.0), (1, 0.9), (2, 0.7), (3, 0.5))
    for max_age, score in tiers:
        if age <= max_age:
            return score
    # Older papers decay toward a floor of 0.3.
    return max(0.3, 1.0 / (age + 1))
def combine_scores(
    relevance: float,
    recency: float,
    quality: float,
    weights: Dict[str, float] = None
) -> float:
    """Combine component scores into one weighted score.

    Args:
        relevance: Relevance score (0-1)
        recency: Recency score (0-1)
        quality: Quality score (0-1)
        weights: Dict with keys 'relevance', 'recency', 'quality';
            defaults to 0.6 / 0.2 / 0.2.

    Returns:
        Combined score (0-1)
    """
    if weights is None:
        weights = {'relevance': 0.6, 'recency': 0.2, 'quality': 0.2}
    # Pair each component with its weight key, then take the weighted sum.
    components = (
        ('relevance', relevance),
        ('recency', recency),
        ('quality', quality),
    )
    return sum(weights[name] * value for name, value in components)
def deduplicate_papers(papers: List[Dict]) -> List[Dict]:
    """Remove duplicate papers based on arXiv ID, preserving first-seen order."""
    seen_ids = set()
    deduped = []
    for entry in papers:
        arxiv_id = entry.get('arxiv_id', '')
        # Skip entries with no ID and entries whose ID was already seen.
        if not arxiv_id or arxiv_id in seen_ids:
            continue
        seen_ids.add(arxiv_id)
        deduped.append(entry)
    return deduped
def format_citation(paper: Dict, style: str = 'apa') -> str:
    """Format a paper citation in the requested style.

    Args:
        paper: Paper dict with title, authors, year, arxiv_id
        style: Citation style ('apa', 'simple', 'markdown')

    Returns:
        Formatted citation string
    """
    authors = format_authors(paper.get('authors', []))
    title = paper.get('title', 'Unknown Title')
    year = extract_year(paper.get('published', ''))
    arxiv_id = paper.get('arxiv_id', '')
    # Known styles are looked up; anything else falls back to the simple form.
    templates = {
        'apa': f"{authors} ({year}). {title}. arXiv:{arxiv_id}",
        'markdown': f"**{title}** - {authors} ({year}) - arXiv:[{arxiv_id}](https://arxiv.org/abs/{arxiv_id})",
    }
    return templates.get(style, f"{title} ({arxiv_id}, {year})")
def extract_keywords(text: str, top_n: int = 5) -> List[str]:
    """Extract simple keywords from text (frequency-based).

    Args:
        text: Input text
        top_n: Number of keywords to return

    Returns:
        List of top keywords, most frequent first (ties keep first-seen order).
    """
    # Common words to exclude from keyword candidates.
    stop_words = {
        'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
        'of', 'with', 'by', 'from', 'is', 'are', 'was', 'were', 'be', 'been',
        'this', 'that', 'these', 'those', 'we', 'our', 'propose', 'show'
    }
    # Only lowercase words of 4+ letters are considered candidates.
    words = re.findall(r'\b[a-z]{4,}\b', text.lower())
    # Counter replaces the hand-rolled frequency dict; most_common sorts by
    # count descending and, like a stable sort, preserves insertion order
    # for equal counts — same tie-breaking as the original.
    counts = Counter(word for word in words if word not in stop_words)
    return [word for word, _ in counts.most_common(top_n)]
class ProgressTracker:
    """Simple progress tracker for multi-step processes."""

    def __init__(self, total_steps: int):
        """
        Args:
            total_steps: Total number of steps in the process.
        """
        self.total_steps = total_steps
        self.current_step = 0      # number of steps completed so far
        self.step_names = []       # names recorded via next_step()

    def next_step(self, step_name: str = None):
        """Move to next step, optionally recording its name."""
        self.current_step += 1
        if step_name:
            self.step_names.append(step_name)

    def get_progress(self) -> float:
        """Get progress as percentage; 0.0 when there are no steps at all."""
        if self.total_steps == 0:
            # Guard against ZeroDivisionError for an empty process.
            return 0.0
        return (self.current_step / self.total_steps) * 100

    def get_status(self) -> str:
        """Get a human-readable 'Step i/N (p%)' status string."""
        return f"Step {self.current_step}/{self.total_steps} ({self.get_progress():.1f}%)"
def validate_paper_dict(paper: Dict) -> bool:
    """Check that a paper dictionary contains every required metadata field."""
    for field in ('title', 'abstract', 'arxiv_id', 'authors', 'published'):
        if field not in paper:
            return False
    return True
def safe_get(dictionary: Dict, key: str, default=None):
    """Safely get a value from a dict-like object with a fallback.

    Args:
        dictionary: Mapping to read from (may be None or a non-mapping).
        key: Key to look up.
        default: Value returned when the key is absent or the lookup fails.

    Returns:
        dictionary[key] if available, otherwise default.
    """
    try:
        return dictionary.get(key, default)
    except (AttributeError, TypeError):
        # Narrowed from a bare except: AttributeError covers non-mappings
        # (e.g. None has no .get), TypeError covers incompatible .get calls.
        return default
# Example usage
if __name__ == "__main__":
    # Exercise the helpers against a well-known paper.
    demo_paper = {
        'title': 'Attention Is All You Need',
        'authors': ['Vaswani', 'Shazeer', 'Parmar', 'Uszkoreit'],
        'published': '2017-06-12',
        'arxiv_id': '1706.03762',
        'abstract': 'The dominant sequence transduction models are based on complex recurrent or convolutional neural networks...'
    }
    print("Citation (APA):", format_citation(demo_paper, 'apa'))
    print("Citation (Markdown):", format_citation(demo_paper, 'markdown'))
    print("Authors:", format_authors(demo_paper['authors']))
    print("Recency score:", score_recency(2017))
    print("Keywords:", extract_keywords(demo_paper['abstract']))