Spaces:

Nithin89
/

AI_Research_Buddy

Sleeping

AI_Research_Buddy / src /data_ingestion.py

your-github-username

Auto-update from GitHub Actions

d9e62f5 9 months ago

1.28 kB

	import requests
	import xml.etree.ElementTree as ET
	from src.logger import logger

	class DataIngestion:
	    """Fetch paper titles and abstracts from an arXiv-style Atom query API."""

	    # Clark-notation prefix of the Atom namespace; the feed's entry, title and
	    # summary elements are all looked up under it.
	    _ATOM_NS = "{http://www.w3.org/2005/Atom}"

	    def __init__(self, api_url="http://export.arxiv.org/api/query"):
	        """Remember the base URL of the query endpoint.

	        Args:
	            api_url: Endpoint that receives the ``search_query``, ``start``
	                and ``max_results`` query parameters.
	        """
	        self.api_url = api_url

	    def fetch_papers(self, topic, max_results=5):
	        """Fetch papers from arXiv with logging and better error handling.

	        Args:
	            topic: Search term, sent as ``all:<topic>``.
	            max_results: Value of the ``max_results`` query parameter.

	        Returns:
	            A ``(titles, abstracts)`` pair of equally long lists of stripped
	            strings. Both lists are empty when the HTTP request fails or the
	            response body is not well-formed XML; the failure is logged
	            rather than raised.
	        """
	        url = self._build_query_url(topic, max_results)
	        logger.info(f"Fetching papers from: {url}")

	        try:
	            # Bound the wait so a stalled endpoint cannot hang the caller.
	            response = requests.get(url, timeout=10)
	            response.raise_for_status()
	        except requests.exceptions.RequestException as e:
	            logger.error(f"Error fetching papers: {e}")
	            return [], []

	        try:
	            titles, abstracts, skipped = self._parse_feed(response.text)
	        except ET.ParseError as e:
	            # A 200 response can still carry a non-XML body (e.g. an HTML
	            # error page); treat it like any other failed fetch.
	            logger.error(f"Error parsing arXiv response: {e}")
	            return [], []

	        if skipped:
	            logger.info(f"Skipped {skipped} entries without a title or summary.")
	        logger.info(f"Fetched {len(abstracts)} papers.")
	        return titles, abstracts

	    def _build_query_url(self, topic, max_results):
	        """Return the query URL for *topic*, percent-encoding unsafe characters."""
	        from urllib.parse import quote

	        # '&' and '#' would otherwise split the query string or start a
	        # fragment. '+' and '%' are left alone so topics that callers already
	        # encoded (e.g. "deep+learning") are sent unchanged.
	        encoded_topic = quote(str(topic), safe=":+%")
	        return (
	            f"{self.api_url}?search_query=all:{encoded_topic}"
	            f"&start=0&max_results={max_results}"
	        )

	    @staticmethod
	    def _parse_feed(xml_text):
	        """Extract stripped titles and abstracts from an Atom feed document.

	        Entries whose title or summary is missing or blank are skipped so the
	        two returned lists always stay index-aligned.

	        Args:
	            xml_text: The Atom feed as a string.

	        Returns:
	            ``(titles, abstracts, skipped)`` where ``skipped`` counts the
	            entries that were dropped.

	        Raises:
	            xml.etree.ElementTree.ParseError: If *xml_text* is not
	                well-formed XML.
	        """
	        ns = DataIngestion._ATOM_NS
	        root = ET.fromstring(xml_text)

	        titles, abstracts = [], []
	        skipped = 0
	        for entry in root.findall(f"{ns}entry"):
	            # findtext() yields None for an absent element and "" for an
	            # empty one, so neither case can raise AttributeError here.
	            title = (entry.findtext(f"{ns}title") or "").strip()
	            abstract = (entry.findtext(f"{ns}summary") or "").strip()
	            if not title or not abstract:
	                skipped += 1
	                continue
	            titles.append(title)
	            abstracts.append(abstract)

	        return titles, abstracts, skipped