|
|
""" |
|
|
DocMind - arXiv Data Fetcher |
|
|
Fetches papers from arXiv API and saves them for indexing |
|
|
""" |
|
|
|
|
|
import json
import os
from pathlib import Path
from typing import Dict, List, Optional

import arxiv
|
|
|
|
|
|
|
|
class ArxivFetcher:
    """Fetch paper metadata from the arXiv API and persist it as JSON files.

    Papers are stored under ``data_dir``, which is created (with parents)
    on construction if it does not already exist.
    """

    def __init__(self, data_dir: str = "data/papers"):
        """
        Args:
            data_dir: Directory where paper JSON files are stored.
        """
        self.data_dir = Path(data_dir)
        # Create the storage directory up front so save_papers() can never
        # fail on a missing path.
        self.data_dir.mkdir(parents=True, exist_ok=True)

    def fetch_papers(
        self,
        query: str = "machine learning",
        max_results: int = 100,
        category: Optional[str] = None
    ) -> List[Dict]:
        """
        Fetch papers from the arXiv API.

        Args:
            query: Search query string.
            max_results: Maximum number of papers to fetch.
            category: Optional arXiv category filter (e.g., 'cs.AI', 'cs.LG').

        Returns:
            List of paper dictionaries with keys: arxiv_id, title, authors,
            abstract, published (YYYY-MM-DD), pdf_url, categories.
        """
        print(f"Fetching papers from arXiv: query='{query}', max={max_results}")

        # Restrict to a category when one is given; otherwise search broadly.
        search_query = f"cat:{category} AND {query}" if category else query

        search = arxiv.Search(
            query=search_query,
            max_results=max_results,
            sort_by=arxiv.SortCriterion.SubmittedDate
        )

        # Search.results() is deprecated (and removed in arxiv 2.x);
        # the supported API is Client.results(search).
        client = arxiv.Client()
        papers = [
            {
                'arxiv_id': result.entry_id.split('/')[-1],
                'title': result.title,
                'authors': [author.name for author in result.authors],
                'abstract': result.summary,
                'published': result.published.strftime('%Y-%m-%d'),
                'pdf_url': result.pdf_url,
                'categories': result.categories
            }
            for result in client.results(search)
        ]

        print(f"Successfully fetched {len(papers)} papers")
        return papers

    def save_papers(self, papers: List[Dict], filename: str = "papers.json"):
        """Save papers to a JSON file inside data_dir.

        Args:
            papers: Paper dictionaries to serialize.
            filename: Target file name within data_dir.
        """
        filepath = self.data_dir / filename
        # ensure_ascii=False keeps non-ASCII author names readable on disk.
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(papers, f, indent=2, ensure_ascii=False)
        print(f"Saved {len(papers)} papers to {filepath}")

    def load_papers(self, filename: str = "papers.json") -> List[Dict]:
        """Load papers from a JSON file.

        Args:
            filename: File name within data_dir.

        Returns:
            The list of paper dictionaries, or [] if the file does not exist.
        """
        filepath = self.data_dir / filename
        if not filepath.exists():
            print(f"No saved papers found at {filepath}")
            return []

        with open(filepath, 'r', encoding='utf-8') as f:
            papers = json.load(f)
        print(f"Loaded {len(papers)} papers from {filepath}")
        return papers
|
|
|
|
|
|
|
|
def main():
    """Example usage: fetch recent ML and AI papers, save them, show a sample."""
    fetcher = ArxivFetcher()

    ml_papers = fetcher.fetch_papers(
        query="machine learning OR deep learning",
        max_results=50,
        category="cs.LG"
    )

    ai_papers = fetcher.fetch_papers(
        query="artificial intelligence OR neural networks",
        max_results=50,
        category="cs.AI"
    )

    all_papers = ml_papers + ai_papers
    fetcher.save_papers(all_papers, "arxiv_papers.json")

    # Guard against an empty result set (API hiccup, no matches) so the
    # sample printout below cannot raise IndexError on all_papers[0].
    if not all_papers:
        print("No papers fetched; nothing to preview.")
        return

    sample = all_papers[0]
    print("\n=== Sample Paper ===")
    print(f"Title: {sample['title']}")
    print(f"Authors: {', '.join(sample['authors'][:3])}")
    print(f"Abstract: {sample['abstract'][:200]}...")
|
|
|
|
|
|
|
# Run the example fetch only when executed directly as a script.
if __name__ == "__main__":
    main()