"""
DocMind - arXiv Data Fetcher
Fetches papers from the arXiv API and saves them as JSON for indexing
"""

import json
from pathlib import Path
from typing import Dict, List, Optional

import arxiv


class ArxivFetcher:
    """Fetches paper metadata from the arXiv API and caches it as JSON."""

    def __init__(self, data_dir: str = "data/papers"):
        self.data_dir = Path(data_dir)
        self.data_dir.mkdir(parents=True, exist_ok=True)

    def fetch_papers(
            self,
            query: str = "machine learning",
            max_results: int = 100,
            category: Optional[str] = None
    ) -> List[Dict]:
        """
        Fetch papers from arXiv API

        Args:
            query: Search query string
            max_results: Maximum number of papers to fetch
            category: arXiv category (e.g., 'cs.AI', 'cs.LG')

        Returns:
            List of paper dictionaries
        """
        print(f"Fetching papers from arXiv: query='{query}', max={max_results}")

        # Build the search query; parenthesise the user query so a boolean
        # expression like 'a OR b' binds correctly under the category filter
        search_query = query
        if category:
            search_query = f"cat:{category} AND ({query})"

        search = arxiv.Search(
            query=search_query,
            max_results=max_results,
            sort_by=arxiv.SortCriterion.SubmittedDate
        )

        # Search.results() is deprecated in recent releases of the arxiv
        # package; a Client also handles paging and rate limiting for us.
        client = arxiv.Client()

        papers = []
        for result in client.results(search):
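            # result.entry_id is the paper's abstract URL, e.g.
            # 'http://arxiv.org/abs/2301.00001v1'; its last path segment is
            # the arXiv id (with version suffix).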
            paper = {
                'arxiv_id': result.entry_id.split('/')[-1],
                'title': result.title,
                'authors': [author.name for author in result.authors],
                'abstract': result.summary,
                'published': result.published.strftime('%Y-%m-%d'),
                'pdf_url': result.pdf_url,
                'categories': result.categories
            }
            papers.append(paper)

        print(f"Successfully fetched {len(papers)} papers")
        return papers

    def save_papers(self, papers: List[Dict], filename: str = "papers.json"):
        """Save papers to JSON file"""
        filepath = self.data_dir / filename
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(papers, f, indent=2, ensure_ascii=False)
        print(f"Saved {len(papers)} papers to {filepath}")

    def load_papers(self, filename: str = "papers.json") -> List[Dict]:
        """Load papers from JSON file"""
        filepath = self.data_dir / filename
        if not filepath.exists():
            print(f"No saved papers found at {filepath}")
            return []

        with open(filepath, 'r', encoding='utf-8') as f:
            papers = json.load(f)
        print(f"Loaded {len(papers)} papers from {filepath}")
        return papers
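
    def fetch_or_load(self, filename: str = "papers.json", **fetch_kwargs) -> List[Dict]:
        """Hypothetical convenience wrapper (not part of the original API):
        reuse the cached JSON when it exists, otherwise fetch from the API
        and cache the result. Keyword args are forwarded to fetch_papers().
        """
        papers = self.load_papers(filename)
        if not papers:
            papers = self.fetch_papers(**fetch_kwargs)
            self.save_papers(papers, filename)
        return papers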


def main():
    """Example usage: Fetch recent ML and AI papers"""
    fetcher = ArxivFetcher()

    # Fetch recent ML papers
    ml_papers = fetcher.fetch_papers(
        query="machine learning OR deep learning",
        max_results=50,
        category="cs.LG"
    )

    # Fetch recent AI papers
    ai_papers = fetcher.fetch_papers(
        query="artificial intelligence OR neural networks",
        max_results=50,
        category="cs.AI"
    )

    # Combine and save
    all_papers = ml_papers + ai_papers
    fetcher.save_papers(all_papers, "arxiv_papers.json")

    # Show a sample paper (guard against an empty result set)
    if all_papers:
        print("\n=== Sample Paper ===")
        print(f"Title: {all_papers[0]['title']}")
        print(f"Authors: {', '.join(all_papers[0]['authors'][:3])}")
        print(f"Abstract: {all_papers[0]['abstract'][:200]}...")


if __name__ == "__main__":
    main()