"""
DocMind - arXiv Data Fetcher
Fetches papers from arXiv API and saves them for indexing
"""
import arxiv
import json
from pathlib import Path
from typing import Dict, List, Optional


class ArxivFetcher:
    def __init__(self, data_dir: str = "data/papers"):
        self.data_dir = Path(data_dir)
        self.data_dir.mkdir(parents=True, exist_ok=True)

    def fetch_papers(
        self,
        query: str = "machine learning",
        max_results: int = 100,
        category: Optional[str] = None,
    ) -> List[Dict]:
"""
Fetch papers from arXiv API
Args:
query: Search query string
max_results: Maximum number of papers to fetch
category: arXiv category (e.g., 'cs.AI', 'cs.LG')
Returns:
List of paper dictionaries
"""
print(f"Fetching papers from arXiv: query='{query}', max={max_results}")
# Build search query
search_query = query
if category:
search_query = f"cat:{category} AND {query}"
search = arxiv.Search(
query=search_query,
max_results=max_results,
sort_by=arxiv.SortCriterion.SubmittedDate
)
        papers = []
        for result in search.results():
            paper = {
                'arxiv_id': result.entry_id.split('/')[-1],
                'title': result.title,
                'authors': [author.name for author in result.authors],
                'abstract': result.summary,
                'published': result.published.strftime('%Y-%m-%d'),
                'pdf_url': result.pdf_url,
                'categories': result.categories
            }
            papers.append(paper)

        print(f"Successfully fetched {len(papers)} papers")
        return papers

    def save_papers(self, papers: List[Dict], filename: str = "papers.json"):
        """Save papers to a JSON file."""
        filepath = self.data_dir / filename
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(papers, f, indent=2, ensure_ascii=False)
        print(f"Saved {len(papers)} papers to {filepath}")

    def load_papers(self, filename: str = "papers.json") -> List[Dict]:
        """Load papers from a JSON file."""
        filepath = self.data_dir / filename
        if not filepath.exists():
            print(f"No saved papers found at {filepath}")
            return []
        with open(filepath, 'r', encoding='utf-8') as f:
            papers = json.load(f)
        print(f"Loaded {len(papers)} papers from {filepath}")
        return papers
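

# Note: newer releases of the `arxiv` package (2.x) deprecate Search.results()
# in favor of an explicit arxiv.Client. A minimal sketch of the same fetch loop
# using that API (not wired into ArxivFetcher; the page_size/delay/retry values
# below are illustrative assumptions, not project settings):
def fetch_with_client(query: str, max_results: int = 100) -> List[Dict]:
    """Sketch: fetch paper metadata via arxiv.Client (arxiv >= 2.0)."""
    client = arxiv.Client(page_size=100, delay_seconds=3.0, num_retries=3)
    search = arxiv.Search(
        query=query,
        max_results=max_results,
        sort_by=arxiv.SortCriterion.SubmittedDate,
    )
    return [
        {
            'arxiv_id': result.entry_id.split('/')[-1],
            'title': result.title,
            'abstract': result.summary,
        }
        for result in client.results(search)
    ]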


def main():
    """Example usage: fetch recent ML and AI papers."""
    fetcher = ArxivFetcher()

    # Fetch recent ML papers
    ml_papers = fetcher.fetch_papers(
        query="machine learning OR deep learning",
        max_results=50,
        category="cs.LG"
    )

    # Fetch recent AI papers
    ai_papers = fetcher.fetch_papers(
        query="artificial intelligence OR neural networks",
        max_results=50,
        category="cs.AI"
    )

    # Combine and save
    all_papers = ml_papers + ai_papers
    fetcher.save_papers(all_papers, "arxiv_papers.json")

    # Show a sample paper, guarding against an empty result set
    if not all_papers:
        print("No papers fetched.")
        return

    print("\n=== Sample Paper ===")
    print(f"Title: {all_papers[0]['title']}")
    print(f"Authors: {', '.join(all_papers[0]['authors'][:3])}")
    print(f"Abstract: {all_papers[0]['abstract'][:200]}...")


if __name__ == "__main__":
    main()