import os
import re
import time
import urllib.parse
import xml.etree.ElementTree as ET

import requests
from dotenv import load_dotenv
from tqdm import tqdm

from asg_query import generate_generic_query_qwen, generate_query_qwen

load_dotenv()

PARENT_FOLDER = "arxiv_downloads_new_new_new"
os.makedirs(PARENT_FOLDER, exist_ok=True)


def sanitize_filename(filename):
    # Strip newlines, replace characters that are invalid in file names, truncate, and add ".pdf".
    filename = filename.replace("\n", "").strip()
    filename = re.sub(r'[\/:*?"<>|]', '_', filename)
    return filename[:100] + ".pdf"
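
# A minimal usage sketch for sanitize_filename (the title below is made up for illustration):
#
#   >>> sanitize_filename('A Survey: Large Language Models \nfor Databases')
#   'A Survey_ Large Language Models for Databases.pdf'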


def search_arxiv_papers(topic, max_results=50):
    query_qwen = generate_query_qwen(topic)
    encoded_query = urllib.parse.quote_plus(query_qwen)
    url = f"https://export.arxiv.org/api/query?search_query={encoded_query}&start=0&max_results={max_results}&sortBy=submittedDate"

    # base_url = "http://export.arxiv.org/api/query?"
    # query = f"search_query=all:{topic.replace(' ', '+')}&start=0&max_results={max_results}&sortBy=relevance&sortOrder=descending"
    # url = base_url + query

    response = requests.get(url)
    if response.status_code != 200:
        print(f"Error fetching data for {topic}: {response.status_code}")
        return []

    root = ET.fromstring(response.text)
    entries = root.findall("{http://www.w3.org/2005/Atom}entry")

    papers = []
    for entry in entries:
        title = entry.find("{http://www.w3.org/2005/Atom}title").text
        pdf_link = entry.find("{http://www.w3.org/2005/Atom}id").text.replace("abs", "pdf")
        papers.append({"title": title, "pdf_link": pdf_link})
    return papers


def download_pdf(url, folder, filename):
    file_path = os.path.join(folder, filename)
    response = requests.get(url, stream=True)
    if response.status_code == 200:
        with open(file_path, 'wb') as file:
            for chunk in response.iter_content(chunk_size=1024):
                file.write(chunk)
    else:
        print(f"Failed to download {url}")


def download_arxiv_papers(topic, max_results=50):
    folder_name = os.path.join(PARENT_FOLDER, topic.replace(" ", "_"))
    os.makedirs(folder_name, exist_ok=True)

    papers = search_arxiv_papers(topic, max_results)
    if not papers:
        print(f"No papers found for topic: {topic}")
        return

    print(f"Downloading {len(papers)} papers for topic: {topic}")
    for paper in tqdm(papers, total=len(papers)):
        filename = sanitize_filename(paper['title'])
        pdf_link = paper["pdf_link"]
        download_pdf(pdf_link, folder_name, filename)
        time.sleep(2)
    print(f"Download complete. Papers saved in: {folder_name}")


def search_arxiv_with_query(query, max_results=50):
    """
    Query the arXiv API with a given query string.

    Parameters:
        query (str): The query string (not yet URL-encoded).
        max_results (int): Maximum number of results to request.

    Returns:
        list: A list of dictionaries containing paper metadata.
        (A usage sketch follows this function.)
    """
    encoded_query = urllib.parse.quote_plus(query)
    url = f"https://export.arxiv.org/api/query?search_query={encoded_query}&start=0&max_results={max_results}&sortBy=submittedDate"

    response = requests.get(url)
    if response.status_code != 200:
        print(f"Error fetching data with query: {query} | status code: {response.status_code}")
        return []

    try:
        root = ET.fromstring(response.text)
    except Exception as e:
        print("Error parsing XML:", e)
        return []

    entries = root.findall("{http://www.w3.org/2005/Atom}entry")
    papers = []
    for entry in entries:
        title = entry.find("{http://www.w3.org/2005/Atom}title").text.strip()
        pdf_link = entry.find("{http://www.w3.org/2005/Atom}id").text.replace("abs", "pdf")
        papers.append({"title": title, "pdf_link": pdf_link})
    return papers
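
# A minimal usage sketch for search_arxiv_with_query. The query string below is an
# assumption written in arXiv API search syntax (field prefixes such as all: and the
# AND operator); in this script the real queries come from generate_query_qwen.
#
#   example_query = 'all:"semi-supervised learning" AND all:"image classification"'
#   example_papers = search_arxiv_with_query(example_query, max_results=5)
#   for p in example_papers:
#       print(p["title"], p["pdf_link"])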


def download_arxiv_papers_new(topic, max_results=50, min_results=10):
    """
    Download arXiv papers for a given topic.

    Process:
        1. Use a strict query generated by generate_query_qwen(topic) to query arXiv.
        2. If the number of results is fewer than `min_results`, generate a more generic
           query using generate_generic_query_qwen() and run additional searches.
        3. Combine non-duplicate papers (deduplicated by title) until reaching max_results
           or exhausting attempts.
        4. Download the PDF of each paper.
        (A call sketch follows this function.)

    Parameters:
        topic (str): The research topic.
        max_results (int): Total maximum number of papers to download (default is 50).
        min_results (int): Minimum acceptable number of papers from the first query (default is 10).
    """
    folder_name = os.path.join(PARENT_FOLDER, topic.replace(" ", "_"))
    os.makedirs(folder_name, exist_ok=True)

    # 1. Initial strict query.
    strict_query = generate_query_qwen(topic)
    papers = search_arxiv_with_query(strict_query, max_results=max_results)

    # Use a dict keyed by title to avoid duplicates.
    total_papers = {paper["title"]: paper for paper in papers}
    print(f"[Strict Query] Found {len(total_papers)} papers for topic: {topic}")

    # 2. If the strict query returns fewer than min_results papers,
    #    use a generic query to broaden the search.
    attempts = 0
    MAX_ATTEMPTS = 5  # Limit attempts to avoid infinite loops.
    while len(total_papers) < max_results and len(total_papers) < min_results and attempts < MAX_ATTEMPTS:
        # Generate a less strict (generic) query.
        generic_query = generate_generic_query_qwen(strict_query, topic)
        print(f"[Generic Query Attempt {attempts + 1}] Using generic query: {generic_query}")
        generic_papers = search_arxiv_with_query(generic_query, max_results=max_results)

        new_count = 0
        for paper in generic_papers:
            if paper["title"] not in total_papers:
                total_papers[paper["title"]] = paper
                new_count += 1
            if len(total_papers) >= max_results:
                break

        attempts += 1
        strict_query = generic_query  # Update the query for the next iteration.

    total_paper_list = list(total_papers.values())[:max_results]
    if not total_paper_list:
        print(f"No papers found for topic: {topic}")
        return

    print(f"Downloading {len(total_paper_list)} papers for topic: {topic}")
    for paper in tqdm(total_paper_list, total=len(total_paper_list)):
        filename = sanitize_filename(paper['title'])
        pdf_link = paper["pdf_link"]
        download_pdf(pdf_link, folder_name, filename)
        time.sleep(2)  # Delay to avoid overwhelming the arXiv API.
    print(f"Download complete. Papers saved in: {folder_name}")


first_topics = [
    "quantum computing: bqp, quantum supremacy, and related concepts",
    "fixed-parameter tractability and related concepts in computational complexity",
    "fundamental concepts in computational complexity theory",
    "pcp theorem and its implications in approximation and complexity theory",
    "interconnections in theoretical computer science: seth, 3sum, apsp, and related concepts",
    "nosql database systems for flexible and scalable data management",
    "temporal databases, real-time databases, and data management systems",
    "large language model integration with databases for enhanced data management and survey analysis",
    "ai-driven database management",
    "distributed systems and databases: key concepts and technologies",
    "graph databases and query languages: traversal, indexing, and analytics",
    "graph databases: models, data modeling, and applications",
    "multi-model databases: mongodb, arangodb, and jsonb",
    "time-series data management and analytics",
    "advanced data management and retrieval techniques",
    "vector databases and their role in modern data management and retrieval",
    "content delivery networks: technologies and strategies for optimization",
    "lpwan technologies: lora, zigbee 3.0, 6lowpan, and related protocols in iot",
    "network slicing and emerging technologies in 6g networks",
    "advanced concepts and technologies in software-defined networking and network function virtualization",
    "battery electrolyte formulation in lithium-ion batteries",
    "flow batteries as energy storage systems",
    "internal consistency, self-feedback, and reliability in large language models",
    "attention mechanisms in large language models",
    "controlled text generation with large language models in natural language processing",
    "domain adaptation and specialized nlp applications",
    "evaluation of large language models for natural language processing",
    "information extraction and large language models in natural language processing",
    "techniques for low-resource natural language processing",
    "model compression techniques for transformer models",
    "multi-agent offline policy reinforcement learning: decentralized learning and cooperative policy optimization",
    "multimodal learning and its applications",
    "reasoning capabilities of large language models",
    "transformer models in natural language processing",
]

second_topics = [
    "semi-supervised learning",
    "out-of-distribution detection",
    "in-context learning",
]


if __name__ == '__main__':
    for topic in first_topics:
        print(f"\nProcessing topic (first list): {topic}")
        download_arxiv_papers_new(topic, max_results=50, min_results=20)

    for topic in second_topics:
        print(f"\nProcessing topic (second list): {topic}")
        download_arxiv_papers_new(topic, max_results=50, min_results=20)