DocMind / utils.py
Tanaybh's picture
Upload 4 files
3a5fdfb verified
raw
history blame
6.5 kB
"""
DocMind - Utility Functions
Helper functions for the multi-agent system
"""
import re
from collections import Counter
from datetime import datetime
from typing import Dict, List
def clean_text(text: str) -> str:
    """Normalize whitespace and strip unsupported characters from text."""
    # Collapse any run of whitespace into a single space.
    collapsed = re.sub(r'\s+', ' ', text)
    # Keep word characters, whitespace, and a small set of punctuation marks.
    filtered = re.sub(r'[^\w\s.,!?;:()\-]', '', collapsed)
    return filtered.strip()
def truncate_text(text: str, max_length: int = 500) -> str:
    """Truncate text to at most max_length characters, preferring a sentence boundary.

    Args:
        text: Input text.
        max_length: Maximum number of characters to keep.

    Returns:
        The original text if it already fits; otherwise the longest prefix
        ending at a sentence terminator ('.', '!' or '?'), or the hard-cut
        prefix with a trailing ellipsis when no terminator is found.
    """
    if len(text) <= max_length:
        return text
    truncated = text[:max_length]
    # Consider all sentence terminators, not just '.', so questions and
    # exclamations also end cleanly (as the docstring promises).
    last_boundary = max(truncated.rfind(ch) for ch in '.!?')
    if last_boundary > 0:
        return truncated[:last_boundary + 1]
    return truncated + "..."
def format_authors(authors: List[str], max_authors: int = 3) -> str:
    """Render an author list, abbreviating with 'et al.' past max_authors."""
    if len(authors) > max_authors:
        # Keep only the leading names and mark the rest.
        return ", ".join(authors[:max_authors]) + " et al."
    return ", ".join(authors)
def extract_year(date_string: str) -> int:
    """Extract a four-digit year from an ISO-like date string.

    Args:
        date_string: A string starting with a year, e.g. '2017-06-12'.

    Returns:
        The parsed year, or the current year when the input is not a string
        or its first four characters are not an integer.
    """
    if not isinstance(date_string, str):
        return datetime.now().year
    try:
        return int(date_string[:4])
    except ValueError:
        # Malformed prefix (e.g. '' or 'n/a') — fall back to the current year.
        return datetime.now().year
def score_recency(year: int, current_year: int = None) -> float:
    """Score a paper's recency on a 0-1 scale.

    Args:
        year: Publication year of the paper.
        current_year: Reference year; defaults to the current calendar year.

    Returns:
        Score from 0-1, where 1 is most recent.
    """
    if current_year is None:
        current_year = datetime.now().year
    age = current_year - year
    # Fixed score tiers for the most recent years.
    tiers = ((0, 1.0), (1, 0.9), (2, 0.7), (3, 0.5))
    for max_age, score in tiers:
        if age <= max_age:
            return score
    # Older papers decay toward a floor of 0.3.
    return max(0.3, 1.0 / (age + 1))
def combine_scores(
    relevance: float,
    recency: float,
    quality: float,
    weights: Dict[str, float] = None
) -> float:
    """Combine component scores into one weighted score.

    Args:
        relevance: Relevance score (0-1)
        recency: Recency score (0-1)
        quality: Quality score (0-1)
        weights: Dict with keys 'relevance', 'recency', 'quality';
            defaults to 0.6 / 0.2 / 0.2.

    Returns:
        Combined score (0-1)
    """
    if weights is None:
        weights = {'relevance': 0.6, 'recency': 0.2, 'quality': 0.2}
    # Pair each component with its weight key, then take the weighted sum.
    components = (
        ('relevance', relevance),
        ('recency', recency),
        ('quality', quality),
    )
    return sum(weights[name] * value for name, value in components)
def deduplicate_papers(papers: List[Dict]) -> List[Dict]:
    """Remove duplicate papers based on arXiv ID, preserving first-seen order."""
    seen_ids = set()
    deduped = []
    for entry in papers:
        arxiv_id = entry.get('arxiv_id', '')
        # Skip entries with no ID and entries whose ID was already seen.
        if not arxiv_id or arxiv_id in seen_ids:
            continue
        seen_ids.add(arxiv_id)
        deduped.append(entry)
    return deduped
def format_citation(paper: Dict, style: str = 'apa') -> str:
    """Format a paper citation in the requested style.

    Args:
        paper: Paper dict with title, authors, year, arxiv_id
        style: Citation style ('apa', 'simple', 'markdown')

    Returns:
        Formatted citation string
    """
    authors = format_authors(paper.get('authors', []))
    title = paper.get('title', 'Unknown Title')
    year = extract_year(paper.get('published', ''))
    arxiv_id = paper.get('arxiv_id', '')
    # Known styles are looked up; anything else falls back to the simple form.
    templates = {
        'apa': f"{authors} ({year}). {title}. arXiv:{arxiv_id}",
        'markdown': f"**{title}** - {authors} ({year}) - arXiv:[{arxiv_id}](https://arxiv.org/abs/{arxiv_id})",
    }
    return templates.get(style, f"{title} ({arxiv_id}, {year})")
def extract_keywords(text: str, top_n: int = 5) -> List[str]:
    """Extract simple keywords from text (frequency-based).

    Args:
        text: Input text
        top_n: Number of keywords to return

    Returns:
        List of top keywords, most frequent first (ties keep first-seen order).
    """
    # Common words to exclude from keyword candidates.
    stop_words = {
        'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
        'of', 'with', 'by', 'from', 'is', 'are', 'was', 'were', 'be', 'been',
        'this', 'that', 'these', 'those', 'we', 'our', 'propose', 'show'
    }
    # Only lowercase words of 4+ letters are considered candidates.
    words = re.findall(r'\b[a-z]{4,}\b', text.lower())
    # Counter replaces the hand-rolled frequency dict; most_common sorts by
    # count descending and, like a stable sort, preserves insertion order
    # for equal counts — same tie-breaking as the original.
    counts = Counter(word for word in words if word not in stop_words)
    return [word for word, _ in counts.most_common(top_n)]
class ProgressTracker:
    """Simple progress tracker for multi-step processes."""

    def __init__(self, total_steps: int):
        """
        Args:
            total_steps: Total number of steps in the process.
        """
        self.total_steps = total_steps
        self.current_step = 0      # number of steps completed so far
        self.step_names = []       # names recorded via next_step()

    def next_step(self, step_name: str = None):
        """Move to next step, optionally recording its name."""
        self.current_step += 1
        if step_name:
            self.step_names.append(step_name)

    def get_progress(self) -> float:
        """Get progress as percentage; 0.0 when there are no steps at all."""
        if self.total_steps == 0:
            # Guard against ZeroDivisionError for an empty process.
            return 0.0
        return (self.current_step / self.total_steps) * 100

    def get_status(self) -> str:
        """Get a human-readable 'Step i/N (p%)' status string."""
        return f"Step {self.current_step}/{self.total_steps} ({self.get_progress():.1f}%)"
def validate_paper_dict(paper: Dict) -> bool:
    """Check that a paper dictionary contains every required metadata field."""
    for field in ('title', 'abstract', 'arxiv_id', 'authors', 'published'):
        if field not in paper:
            return False
    return True
def safe_get(dictionary: Dict, key: str, default=None):
    """Safely get a value from a dict-like object with a fallback.

    Args:
        dictionary: Mapping to read from (may be None or a non-mapping).
        key: Key to look up.
        default: Value returned when the key is absent or the lookup fails.

    Returns:
        dictionary[key] if available, otherwise default.
    """
    try:
        return dictionary.get(key, default)
    except (AttributeError, TypeError):
        # Narrowed from a bare except: AttributeError covers non-mappings
        # (e.g. None has no .get), TypeError covers incompatible .get calls.
        return default
# Example usage
if __name__ == "__main__":
    # Exercise the helpers against a well-known paper.
    demo_paper = {
        'title': 'Attention Is All You Need',
        'authors': ['Vaswani', 'Shazeer', 'Parmar', 'Uszkoreit'],
        'published': '2017-06-12',
        'arxiv_id': '1706.03762',
        'abstract': 'The dominant sequence transduction models are based on complex recurrent or convolutional neural networks...'
    }
    print("Citation (APA):", format_citation(demo_paper, 'apa'))
    print("Citation (Markdown):", format_citation(demo_paper, 'markdown'))
    print("Authors:", format_authors(demo_paper['authors']))
    print("Recency score:", score_recency(2017))
    print("Keywords:", extract_keywords(demo_paper['abstract']))