"""
DocMind - arXiv Data Fetcher

Fetches papers from the arXiv API and saves them for indexing.
"""

import json
import os
from pathlib import Path
from typing import Dict, List, Optional

import arxiv


class ArxivFetcher:
    """Fetch paper metadata from arXiv and persist it as JSON files.

    All files are read from and written to ``data_dir``.
    """

    def __init__(self, data_dir: str = "data/papers"):
        """Set the storage directory, creating it (and parents) if missing.

        Args:
            data_dir: Directory where paper JSON files are stored.
        """
        self.data_dir = Path(data_dir)
        self.data_dir.mkdir(parents=True, exist_ok=True)

    @staticmethod
    def _result_to_paper(result) -> Dict:
        """Convert one ``arxiv.Result`` into a JSON-serialisable dictionary."""
        return {
            # get_short_id() keeps the archive prefix of old-style identifiers
            # (e.g. "quant-ph/0201082v1"); splitting the entry URL on "/" and
            # taking the last piece would truncate those to "0201082v1".
            'arxiv_id': result.get_short_id(),
            'title': result.title,
            'authors': [author.name for author in result.authors],
            'abstract': result.summary,
            'published': result.published.strftime('%Y-%m-%d'),
            'pdf_url': result.pdf_url,
            'categories': result.categories,
        }

    def fetch_papers(
        self,
        query: str = "machine learning",
        max_results: int = 100,
        category: Optional[str] = None,
    ) -> List[Dict]:
        """
        Fetch papers from arXiv API

        Args:
            query: Search query string
            max_results: Maximum number of papers to fetch
            category: arXiv category (e.g., 'cs.AI', 'cs.LG'); when given,
                results are restricted to that category

        Returns:
            List of paper dictionaries with the keys ``arxiv_id``, ``title``,
            ``authors``, ``abstract``, ``published`` (``YYYY-MM-DD``),
            ``pdf_url`` and ``categories``
        """
        print(f"Fetching papers from arXiv: query='{query}', max={max_results}")

        # Build search query. The user query is parenthesised so that an OR
        # inside it cannot escape the category filter:
        # "cat:X AND a OR b" would otherwise not constrain "b" to category X.
        search_query = query
        if category:
            search_query = f"cat:{category} AND ({query})"

        search = arxiv.Search(
            query=search_query,
            max_results=max_results,
            sort_by=arxiv.SortCriterion.SubmittedDate,
        )

        # Search.results() is deprecated; results are fetched through a Client.
        client = arxiv.Client()
        papers = [self._result_to_paper(result) for result in client.results(search)]

        print(f"Successfully fetched {len(papers)} papers")
        return papers

    def save_papers(self, papers: List[Dict], filename: str = "papers.json"):
        """Save papers to a JSON file inside ``data_dir``.

        Args:
            papers: Paper dictionaries to serialise.
            filename: Name of the file to write (overwritten if it exists).
        """
        filepath = self.data_dir / filename

        with open(filepath, 'w', encoding='utf-8') as f:
            # ensure_ascii=False keeps non-ASCII names and titles readable.
            json.dump(papers, f, indent=2, ensure_ascii=False)

        print(f"Saved {len(papers)} papers to {filepath}")

    def load_papers(self, filename: str = "papers.json") -> List[Dict]:
        """Load papers from a JSON file inside ``data_dir``.

        Args:
            filename: Name of the file to read.

        Returns:
            The stored list of paper dictionaries, or an empty list when the
            file does not exist.
        """
        filepath = self.data_dir / filename

        if not filepath.exists():
            print(f"No saved papers found at {filepath}")
            return []

        with open(filepath, 'r', encoding='utf-8') as f:
            papers = json.load(f)

        print(f"Loaded {len(papers)} papers from {filepath}")
        return papers


def main():
    """Example usage: Fetch recent ML and AI papers"""
    fetcher = ArxivFetcher()

    # Fetch recent ML papers
    ml_papers = fetcher.fetch_papers(
        query="machine learning OR deep learning",
        max_results=50,
        category="cs.LG",
    )

    # Fetch recent AI papers
    ai_papers = fetcher.fetch_papers(
        query="artificial intelligence OR neural networks",
        max_results=50,
        category="cs.AI",
    )

    # Combine, keeping only the first occurrence of each paper: a paper
    # cross-listed in both categories would otherwise be saved twice.
    seen_ids = set()
    all_papers = []
    for paper in ml_papers + ai_papers:
        if paper['arxiv_id'] in seen_ids:
            continue
        seen_ids.add(paper['arxiv_id'])
        all_papers.append(paper)

    fetcher.save_papers(all_papers, "arxiv_papers.json")

    # Show sample (guarded: both queries may legitimately return nothing)
    if not all_papers:
        print("No papers fetched; nothing to show.")
        return

    sample = all_papers[0]
    print("\n=== Sample Paper ===")
    print(f"Title: {sample['title']}")
    print(f"Authors: {', '.join(sample['authors'][:3])}")
    print(f"Abstract: {sample['abstract'][:200]}...")


if __name__ == "__main__":
    main()