Spaces:
Runtime error
Runtime error
gamingflexer
committed on
Commit
·
2f67f06
1
Parent(s):
d1e24cf
Debugged & exceptions added
Browse files
- src/scrapper/arxiv.py +19 -10
- src/scrapper/main.py +13 -11
src/scrapper/arxiv.py
CHANGED
|
@@ -10,6 +10,7 @@ import PyPDF2
|
|
| 10 |
import requests
|
| 11 |
from tqdm.auto import tqdm
|
| 12 |
from decouple import config
|
|
|
|
| 13 |
|
| 14 |
"""
|
| 15 |
Usage : get_paper_id("8-bit matrix multiplication for transformers at scale") -> 2106.09680
|
|
@@ -91,7 +92,7 @@ class Arxiv:
|
|
| 91 |
# initialize the requests session
|
| 92 |
self.session = requests.Session()
|
| 93 |
|
| 94 |
-
def load(self, save: bool = False):
|
| 95 |
"""Load the paper from the ArXiv API or from a local file
|
| 96 |
if it already exists. Stores the paper's text content and
|
| 97 |
meta data in self.content and other attributes.
|
|
@@ -101,6 +102,7 @@ class Arxiv:
|
|
| 101 |
:type save: bool, optional
|
| 102 |
"""
|
| 103 |
# check if pdf already exists
|
|
|
|
| 104 |
if os.path.exists(f'papers/{self.id}.json'):
|
| 105 |
print(f'Loading papers/{self.id}.json from file')
|
| 106 |
with open(f'papers/{self.id}.json', 'r') as fp:
|
|
@@ -108,15 +110,22 @@ class Arxiv:
|
|
| 108 |
for key, value in attributes.items():
|
| 109 |
setattr(self, key, value)
|
| 110 |
else:
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
self.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 120 |
|
| 121 |
def get_refs(self, extractor, text_splitter):
|
| 122 |
"""Get the references for the paper.
|
|
|
|
| 10 |
import requests
|
| 11 |
from tqdm.auto import tqdm
|
| 12 |
from decouple import config
|
| 13 |
+
import uuid
|
| 14 |
|
| 15 |
"""
|
| 16 |
Usage : get_paper_id("8-bit matrix multiplication for transformers at scale") -> 2106.09680
|
|
|
|
| 92 |
# initialize the requests session
|
| 93 |
self.session = requests.Session()
|
| 94 |
|
| 95 |
+
def load(self, path_author : str ,save: bool = False):
|
| 96 |
"""Load the paper from the ArXiv API or from a local file
|
| 97 |
if it already exists. Stores the paper's text content and
|
| 98 |
meta data in self.content and other attributes.
|
|
|
|
| 102 |
:type save: bool, optional
|
| 103 |
"""
|
| 104 |
# check if pdf already exists
|
| 105 |
+
# to_save_path = os.path.join(path_author, str(self.id)+".json")
|
| 106 |
if os.path.exists(f'papers/{self.id}.json'):
|
| 107 |
print(f'Loading papers/{self.id}.json from file')
|
| 108 |
with open(f'papers/{self.id}.json', 'r') as fp:
|
|
|
|
| 110 |
for key, value in attributes.items():
|
| 111 |
setattr(self, key, value)
|
| 112 |
else:
|
| 113 |
+
try:
|
| 114 |
+
res = self.session.get(self.url)
|
| 115 |
+
print(f'Downloading {self.url}')
|
| 116 |
+
# uuid_small = str(uuid.uuid4())[:8]
|
| 117 |
+
temp_pdf_path = f'./temp.pdf'
|
| 118 |
+
with open(temp_pdf_path, 'wb') as fp:
|
| 119 |
+
fp.write(res.content)
|
| 120 |
+
# extract text content
|
| 121 |
+
self._convert_pdf_to_text()
|
| 122 |
+
# get meta for PDF
|
| 123 |
+
self._download_meta()
|
| 124 |
+
if save:
|
| 125 |
+
self.save()
|
| 126 |
+
except Exception as e:
|
| 127 |
+
print(f"Error while downloading paper {self.id}: {e}")
|
| 128 |
+
raise e
|
| 129 |
|
| 130 |
def get_refs(self, extractor, text_splitter):
|
| 131 |
"""Get the references for the paper.
|
src/scrapper/main.py
CHANGED
|
@@ -11,8 +11,8 @@ class ArxivPaper:
|
|
| 11 |
self.author_name = author_name
|
| 12 |
self.extractor, self.text_splitter = init_extractor(template=reference_extraction['template'], openai_api_key=OPENAI_API_KEY)
|
| 13 |
|
| 14 |
-
def get_results_google(self, number_of_results
|
| 15 |
-
result_dict = get_google_scrape(self.author_name
|
| 16 |
paper_links = []
|
| 17 |
for i in result_dict['organic_results']:
|
| 18 |
if "arxiv.org" in i['link']:
|
|
@@ -36,12 +36,14 @@ class ArxivPaper:
|
|
| 36 |
path_author = os.path.join(path, self.author_name.replace(" ", "_"))
|
| 37 |
data = {}
|
| 38 |
for i in tqdm(paper_ids):
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
|
|
|
|
|
|
|
|
| 11 |
self.author_name = author_name
|
| 12 |
self.extractor, self.text_splitter = init_extractor(template=reference_extraction['template'], openai_api_key=OPENAI_API_KEY)
|
| 13 |
|
| 14 |
+
def get_results_google(self, number_of_results = 25):
|
| 15 |
+
result_dict = get_google_scrape(str(self.author_name)+" research papers arxiv.org",num=number_of_results)
|
| 16 |
paper_links = []
|
| 17 |
for i in result_dict['organic_results']:
|
| 18 |
if "arxiv.org" in i['link']:
|
|
|
|
| 36 |
path_author = os.path.join(path, self.author_name.replace(" ", "_"))
|
| 37 |
data = {}
|
| 38 |
for i in tqdm(paper_ids):
|
| 39 |
+
try:
|
| 40 |
+
paper = Arxiv(i)
|
| 41 |
+
paper.load(path_author)
|
| 42 |
+
paper.get_meta()
|
| 43 |
+
refs = paper.get_refs(
|
| 44 |
+
extractor=self.extractor,
|
| 45 |
+
text_splitter=self.text_splitter,)
|
| 46 |
+
paper.chunker()
|
| 47 |
+
paper.save_chunks(include_metadata=True, path=path_author)
|
| 48 |
+
except Exception as e:
|
| 49 |
+
print(f"Error processing paper {i}: {e}")
|