Spaces:
Runtime error
Runtime error
gamingflexer
committed on
Commit
·
7cbc824
1
Parent(s):
2c4c422
Refactor Arxiv class and add new methods
Browse files- src/scrapper/arxiv.py +221 -3
src/scrapper/arxiv.py
CHANGED
|
@@ -1,8 +1,15 @@
|
|
| 1 |
-
import
|
| 2 |
-
|
| 3 |
import logging
|
| 4 |
-
from typing import
|
| 5 |
import re
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
|
| 7 |
"""
|
| 8 |
Usage : get_paper_id("8-bit matrix multiplication for transformers at scale") -> 2106.09680
|
|
@@ -64,3 +71,214 @@ def get_paper_id(query: str, handle_not_found: bool = True):
|
|
| 64 |
# if no paper is found, raise an error
|
| 65 |
raise Exception(f'No paper found for query: {query}')
|
| 66 |
return paper_id
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import json
|
| 3 |
import logging
|
| 4 |
+
from typing import Optional
|
| 5 |
import re
|
| 6 |
+
import requests
|
| 7 |
+
from requests.adapters import HTTPAdapter, Retry
|
| 8 |
+
import arxiv
|
| 9 |
+
import PyPDF2
|
| 10 |
+
import requests
|
| 11 |
+
from tqdm.auto import tqdm
|
| 12 |
+
from decouple import config
|
| 13 |
|
| 14 |
"""
|
| 15 |
Usage : get_paper_id("8-bit matrix multiplication for transformers at scale") -> 2106.09680
|
|
|
|
| 71 |
# if no paper is found, raise an error
|
| 72 |
raise Exception(f'No paper found for query: {query}')
|
| 73 |
return paper_id
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
class Arxiv:
    """Download, parse and persist a single ArXiv paper.

    An instance fetches the paper's PDF, extracts its text, pulls the
    metadata from the ArXiv API and can cache everything as JSON under
    ``papers/`` or export it as JSONL chunks.
    """

    # Splits the extracted text at a standalone "References" heading.
    refs_re = re.compile(r'\n(References|REFERENCES)\n')
    # Class-level fallback only; __init__ gives every instance its own list
    # so reference state is never shared between papers.
    references = []

    # Not referenced by any method of this class; kept because callers
    # may set or read it.
    llm = None

    def __init__(self, paper_id: str):
        """Object to handle the extraction of an ArXiv paper and its
        relevant information.

        :param paper_id: The ID of the paper to extract
        :type paper_id: str
        """
        self.id = paper_id
        self.url = f"https://export.arxiv.org/pdf/{paper_id}.pdf"
        # per-instance list (see the note on the class attribute)
        self.references = []
        # initialize the requests session
        self.session = requests.Session()

    def load(self, save: bool = False) -> None:
        """Load the paper from a local JSON cache if one exists,
        otherwise download the PDF and its metadata. Stores the paper's
        text content and meta data in self.content and other attributes.

        :param save: Whether to save a freshly downloaded paper to a
                     local file, defaults to False
        :type save: bool, optional
        :raises requests.HTTPError: If the PDF download returns an error
                                    status
        """
        cache_path = f'papers/{self.id}.json'
        # check if the paper was already cached
        if os.path.exists(cache_path):
            print(f'Loading papers/{self.id}.json from file')
            with open(cache_path, 'r', encoding='utf-8') as fp:
                attributes = json.load(fp)
            for key, value in attributes.items():
                setattr(self, key, value)
            return
        res = self.session.get(self.url)
        # fail here rather than handing an HTML error page to the PDF parser
        res.raise_for_status()
        with open('temp.pdf', 'wb') as fp:
            fp.write(res.content)
        # extract text content
        self._convert_pdf_to_text()
        # get meta for PDF
        self._download_meta()
        if save:
            self.save()

    def get_refs(self, extractor, text_splitter) -> list:
        """Get the references for the paper, extracting them first if
        none are stored yet.

        :param extractor: The LLMChain extractor model
        :type extractor: LLMChain
        :param text_splitter: The text splitter to use
        :type text_splitter: TokenTextSplitter
        :return: The references for the paper
        :rtype: list
        """
        if not self.references:
            self._download_refs(extractor, text_splitter)
        return self.references

    @staticmethod
    def _parse_reference_lines(raw: str) -> list:
        """Split extractor output into ``(line, fields)`` pairs, keeping
        only lines of the form ``title | authors | year``.

        :param raw: Newline separated extractor output
        :type raw: str
        :return: One ``(original line, [title, authors, year])`` tuple
                 per well-formed line
        :rtype: list
        """
        parsed = []
        for line in raw.split('\n'):
            if line == '':
                continue
            fields = line.split(' | ')
            # in case we're missing some fields
            if len(fields) == 3:
                parsed.append((line, fields))
        return parsed

    def _download_refs(self, extractor, text_splitter) -> None:
        """Download the references for the paper. Stores them in
        the self.references attribute.

        :param extractor: The LLMChain extractor model
        :type extractor: LLMChain
        :param text_splitter: The text splitter to use
        :type text_splitter: TokenTextSplitter
        """
        # get references section of paper
        refs = self.refs_re.split(self.content)[-1]
        # we don't need the full thing, just the first page
        pages = text_splitter.split_text(refs)
        if not pages:
            # nothing to extract from; avoid indexing an empty list
            logging.debug("Extracted %d references", 0)
            self.references = []
            return
        # use LLM extractor to extract references
        out = extractor.run(refs=pages[0])
        meta = []
        # Malformed lines are dropped BEFORE the ID lookup so every ID
        # stays paired with the reference it was looked up for.
        for line, fields in self._parse_reference_lines(out):
            _id = get_paper_id(line)
            if _id is None:
                continue
            meta.append({
                'id': _id,
                'title': fields[0],
                'authors': fields[1],
                'year': fields[2]
            })
        logging.debug("Extracted %d references", len(meta))
        self.references = meta

    def _convert_pdf_to_text(self) -> None:
        """Convert the PDF stored in ``temp.pdf`` to text and store it
        in the self.content attribute.
        """
        with open("temp.pdf", 'rb') as f:
            # create a PDF object
            pdf = PyPDF2.PdfReader(f)
            # extract the text of every page while the file is still open
            pages_text = [page_obj.extract_text() for page_obj in pdf.pages]
        self.content = "\n".join(pages_text)

    def _download_meta(self) -> None:
        """Download the meta information for the paper from the
        ArXiv API and store it in the self attributes.

        :raises ValueError: If the API returns no result for the paper ID
        """
        search = arxiv.Search(
            query=f'id:{self.id}',
            max_results=1,
            sort_by=arxiv.SortCriterion.SubmittedDate
        )
        results = list(search.results())
        if not results:
            raise ValueError(f"No paper found for paper '{self.id}'")
        result = results[0]
        self.authors = [author.name for author in result.authors]
        self.categories = result.categories
        self.comment = result.comment
        self.journal_ref = result.journal_ref
        # remove 'v1', 'v2', etc. from the end of the pdf_url
        self.source = re.sub(r'v\d+$', '', result.pdf_url)
        self.primary_category = result.primary_category
        self.published = result.published.strftime('%Y%m%d')
        self.summary = result.summary
        self.title = result.title
        self.updated = result.updated.strftime('%Y%m%d')
        logging.debug("Downloaded metadata for paper '%s'", self.id)

    def save(self) -> None:
        """Save the paper to a local JSON file under ``papers/``,
        creating the directory if needed.
        """
        os.makedirs('papers', exist_ok=True)
        with open(f'papers/{self.id}.json', 'w', encoding='utf-8') as fp:
            json.dump(self.__dict__(), fp, indent=4)

    def save_chunks(
        self,
        include_metadata: bool = True,
        path: str = "chunks"
    ) -> None:
        """Save the paper's chunks to a local JSONL file.

        If ``chunker`` has not been called yet it is called first. When
        ``include_metadata`` is true the metadata is also merged into
        the chunk dicts held in ``self.dataset``.

        :param include_metadata: Whether to include the paper's
                                 metadata in the chunks, defaults
                                 to True
        :type include_metadata: bool, optional
        :param path: The path to save the file to, defaults to "chunks"
        :type path: str, optional
        """
        os.makedirs(path, exist_ok=True)
        if not hasattr(self, 'dataset'):
            self.chunker()
        # metadata is identical for every chunk, so build it once
        meta = self.get_meta() if include_metadata else None
        with open(f'{path}/{self.id}.jsonl', 'w', encoding='utf-8') as fp:
            for chunk in self.dataset:
                if meta is not None:
                    chunk.update(meta)
                fp.write(json.dumps(chunk) + '\n')
        logging.debug("Saved paper to '%s/%s.jsonl'", path, self.id)

    def get_meta(self) -> dict:
        """Returns the meta information for the paper.

        :return: The meta information for the paper, without the
                 ``content`` field
        :rtype: dict
        """
        fields = self.__dict__()
        # drop content field because it's big
        fields.pop('content')
        return fields

    def chunker(self, chunk_size=300) -> None:
        """Build ``self.dataset`` from the cleaned paper text.

        :param chunk_size: Currently unused; the whole paper is stored
                           as a single chunk
        :type chunk_size: int, optional
        """
        # Single Chunk is made for now
        self.dataset = [{
            'doi': self.id,
            'chunk-id': 1,
            'chunk': self._clean_text(self.content)
        }]

    def _clean_text(self, text: str) -> str:
        """Re-join words that were hyphenated across a line break.

        :param text: The text to clean
        :type text: str
        :return: The text with every ``-\\n`` sequence removed
        :rtype: str
        """
        return re.sub(r'-\n', '', text)

    # NOTE(review): naming this method ``__dict__`` shadows the usual
    # instance-dict descriptor, so ``vars(obj)`` and ``obj.__dict__``
    # return this bound method. Kept as-is because ``obj.__dict__()``
    # is part of the existing interface.
    def __dict__(self) -> dict:
        """Return the paper's serialisable fields as a new dict.

        :return: Metadata, content and references of the paper
        :rtype: dict
        """
        return {
            'id': self.id,
            'title': self.title,
            'summary': self.summary,
            'source': self.source,
            'authors': self.authors,
            'categories': self.categories,
            'comment': self.comment,
            'journal_ref': self.journal_ref,
            'primary_category': self.primary_category,
            'published': self.published,
            'updated': self.updated,
            'content': self.content,
            'references': self.references
        }

    def __repr__(self) -> str:
        return f"Arxiv(paper_id='{self.id}')"
|