|
|
""" |
|
|
DocMind - arXiv Data Fetcher |
|
|
Fetches papers from arXiv API and saves them for indexing |
|
|
""" |
|
|
|
|
|
import json
import os
from pathlib import Path
from typing import Dict, List, Optional

import arxiv
|
|
|
|
|
|
|
|
class ArxivFetcher:
    """Fetch paper metadata from the arXiv API and persist it as JSON files.

    Papers are stored under ``data_dir``, which is created (with parents)
    on construction if it does not already exist.
    """

    def __init__(self, data_dir: str = "data/papers"):
        """
        Args:
            data_dir: Directory where paper JSON files are stored.
        """
        self.data_dir = Path(data_dir)
        # Create the storage directory up front so save_papers() can never
        # fail on a missing path.
        self.data_dir.mkdir(parents=True, exist_ok=True)

    def fetch_papers(
        self,
        query: str = "machine learning",
        max_results: int = 100,
        category: Optional[str] = None
    ) -> List[Dict]:
        """
        Fetch papers from the arXiv API.

        Args:
            query: Search query string.
            max_results: Maximum number of papers to fetch.
            category: Optional arXiv category filter (e.g., 'cs.AI', 'cs.LG').

        Returns:
            List of paper dictionaries with keys: arxiv_id, title, authors,
            abstract, published (YYYY-MM-DD), pdf_url, categories.
        """
        print(f"Fetching papers from arXiv: query='{query}', max={max_results}")

        # Restrict to a category when one is given; otherwise search broadly.
        search_query = f"cat:{category} AND {query}" if category else query

        search = arxiv.Search(
            query=search_query,
            max_results=max_results,
            sort_by=arxiv.SortCriterion.SubmittedDate
        )

        # Search.results() is deprecated (and removed in arxiv 2.x);
        # the supported API is Client.results(search).
        client = arxiv.Client()
        papers = [
            {
                'arxiv_id': result.entry_id.split('/')[-1],
                'title': result.title,
                'authors': [author.name for author in result.authors],
                'abstract': result.summary,
                'published': result.published.strftime('%Y-%m-%d'),
                'pdf_url': result.pdf_url,
                'categories': result.categories
            }
            for result in client.results(search)
        ]

        print(f"Successfully fetched {len(papers)} papers")
        return papers

    def save_papers(self, papers: List[Dict], filename: str = "papers.json"):
        """Save papers to a JSON file inside data_dir.

        Args:
            papers: Paper dictionaries to serialize.
            filename: Target file name within data_dir.
        """
        filepath = self.data_dir / filename
        # ensure_ascii=False keeps non-ASCII author names readable on disk.
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(papers, f, indent=2, ensure_ascii=False)
        print(f"Saved {len(papers)} papers to {filepath}")

    def load_papers(self, filename: str = "papers.json") -> List[Dict]:
        """Load papers from a JSON file.

        Args:
            filename: File name within data_dir.

        Returns:
            The list of paper dictionaries, or [] if the file does not exist.
        """
        filepath = self.data_dir / filename
        if not filepath.exists():
            print(f"No saved papers found at {filepath}")
            return []

        with open(filepath, 'r', encoding='utf-8') as f:
            papers = json.load(f)
        print(f"Loaded {len(papers)} papers from {filepath}")
        return papers
|
|
|
|
|
|
|
|
def main():
    """Example usage: fetch recent ML and AI papers, save them, show a sample."""
    fetcher = ArxivFetcher()

    ml_papers = fetcher.fetch_papers(
        query="machine learning OR deep learning",
        max_results=50,
        category="cs.LG"
    )

    ai_papers = fetcher.fetch_papers(
        query="artificial intelligence OR neural networks",
        max_results=50,
        category="cs.AI"
    )

    all_papers = ml_papers + ai_papers
    fetcher.save_papers(all_papers, "arxiv_papers.json")

    # Guard against an empty result set (API hiccup, no matches) so the
    # sample printout below cannot raise IndexError on all_papers[0].
    if not all_papers:
        print("No papers fetched; nothing to preview.")
        return

    sample = all_papers[0]
    print("\n=== Sample Paper ===")
    print(f"Title: {sample['title']}")
    print(f"Authors: {', '.join(sample['authors'][:3])}")
    print(f"Abstract: {sample['abstract'][:200]}...")
|
|
|
|
|
|
|
# Run the example fetch only when executed directly as a script.
if __name__ == "__main__":
    main()