Spaces:
Sleeping
Sleeping
| import requests | |
| import xml.etree.ElementTree as ET | |
| from src.logger import logger | |
class DataIngestion:
    """Fetch paper titles and abstracts from an Atom feed API (arXiv by default)."""

    # Namespace prefix carried by every element of the Atom feed.
    _ATOM_NS = "{http://www.w3.org/2005/Atom}"

    def __init__(self, api_url="http://export.arxiv.org/api/query"):
        """Store the base URL of the query endpoint.

        Args:
            api_url: Endpoint that receives the ``search_query``, ``start``
                and ``max_results`` query parameters.
        """
        self.api_url = api_url

    def fetch_papers(self, topic, max_results=5):
        """Fetch papers matching *topic* and return their titles and abstracts.

        Args:
            topic: Free-text search term; it is searched in all fields
                (``all:<topic>``) and URL-encoded before being sent.
            max_results: Maximum number of entries requested from the API.

        Returns:
            A ``(titles, abstracts)`` tuple of two equally long lists of
            stripped strings. Both lists are empty when the HTTP request
            fails or the response body is not well-formed XML; the failure
            is logged rather than raised.
        """
        url = self._build_url(topic, max_results)
        logger.info(f"Fetching papers from: {url}")

        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            logger.error(f"Error fetching papers: {e}")
            return [], []

        try:
            # Parse the raw bytes so the XML parser applies the encoding
            # declared in the document instead of a guessed text decoding.
            titles, abstracts = self._parse_feed(response.content)
        except ET.ParseError as e:
            # Same "log and return empty" contract as a failed request.
            logger.error(f"Error parsing papers: {e}")
            return [], []

        logger.info(f"Fetched {len(abstracts)} papers.")
        return titles, abstracts

    def _build_url(self, topic, max_results):
        """Return the full query URL with every parameter percent-encoded."""
        # Imported locally so this block does not depend on edits to the
        # module's import header.
        from urllib.parse import urlencode

        # Encoding keeps characters such as '&', '#' or spaces in *topic*
        # from being read as query-string syntax.
        query = urlencode(
            {
                "search_query": f"all:{topic}",
                "start": 0,
                "max_results": max_results,
            }
        )
        return f"{self.api_url}?{query}"

    @classmethod
    def _parse_feed(cls, xml_data):
        """Extract ``(titles, abstracts)`` from Atom XML given as str or bytes.

        Raises:
            xml.etree.ElementTree.ParseError: If *xml_data* is not
                well-formed XML.
        """
        root = ET.fromstring(xml_data)
        titles, abstracts = [], []
        for entry in root.findall(f"{cls._ATOM_NS}entry"):
            # findtext() yields None for a missing element and "" for an
            # empty one; both become "" so the two lists stay index-aligned.
            title = entry.findtext(f"{cls._ATOM_NS}title") or ""
            abstract = entry.findtext(f"{cls._ATOM_NS}summary") or ""
            titles.append(title.strip())
            abstracts.append(abstract.strip())
        return titles, abstracts