# NOTE: captured from a Hugging Face Space page; the "Spaces: Runtime error"
# banner lines were page chrome, not part of the source.
import numpy as np
from pypdf import PdfReader
from urllib.parse import urlparse
import requests
from semanticscholar import SemanticScholar
### Input Formatting Module
## Input formatting for the given paper
# Extract text from a PDF file or a link
def get_text_from_pdf(file_path):
    """
    Extract the text of a PDF, one string per page.

    Parameters
    ----------
    file_path : str
        Path to a local PDF file.

    Returns
    -------
    list
        The extracted text of each page, in page order.
    """
    reader = PdfReader(file_path)
    return [page.extract_text() for page in reader.pages]
def get_text_from_url(url, file_path='paper.pdf'):
    """
    Download the paper PDF behind ``url`` and return its extracted text.

    Only arXiv links are supported: both abstract pages (``/abs/<id>``)
    and direct PDF links (``/pdf/...``) are accepted.

    Parameters
    ----------
    url : str
        Link to an arXiv abstract page or PDF.
    file_path : str
        Local path where the downloaded PDF is saved (default ``'paper.pdf'``).

    Returns
    -------
    list
        One string of extracted text per page.

    Raises
    ------
    ValueError
        If the URL is not a recognized arXiv link, or the download fails.
    """
    url_parts = urlparse(url)
    # Only arXiv URLs are recognized.
    if 'arxiv' not in url_parts.netloc:
        raise ValueError('invalid url')
    # Strip a possible trailing slash so the id split below cannot
    # return an empty string (e.g. ".../abs/1234.5678/").
    path = url_parts.path.rstrip('/')
    if 'abs' in path:
        # Abstract page: rewrite the url to the corresponding PDF link.
        paper_id = path.split('/')[-1]
        url = f'https://www.arxiv.org/pdf/{paper_id}.pdf'
    elif 'pdf' not in path:
        # Neither an abstract page nor a PDF link.
        raise ValueError('invalid url')
    # Fetch the PDF, then extract its text page by page.
    download_pdf(url, file_path)
    return get_text_from_pdf(file_path)
def download_pdf(url, file_name, timeout=30):
    """
    Download the PDF at ``url`` and save it as ``file_name``.

    Parameters
    ----------
    url : str
        Direct link to a PDF file.
    file_name : str
        Local path to write the downloaded bytes to.
    timeout : float
        Seconds to wait for the server before giving up (default 30).
        Backward-compatible addition: without it, a dead server hangs
        the call forever.

    Raises
    ------
    ValueError
        If the server does not return a successful response, so callers
        never silently read a missing or stale file.
    """
    # Send GET request (bounded by a timeout).
    response = requests.get(url, timeout=timeout)
    # Save the PDF.
    if response.status_code == 200:
        with open(file_name, "wb") as f:
            f.write(response.content)
    elif response.status_code == 404:
        raise ValueError('cannot download the file')
    else:
        # Previously other status codes were only printed and the function
        # returned silently, leaving no file behind; fail loudly instead.
        raise ValueError(
            'cannot download the file (HTTP %d)' % response.status_code)
## Input formatting for the given author (reviewer)
# Extract the author's papers from a Semantic Scholar ID or profile URL
def get_text_from_author_id(author_id, max_count=100):
    """
    Fetch an author's name and recent papers from the Semantic Scholar API.

    Parameters
    ----------
    author_id : str or int
        A Semantic Scholar author ID, or a full profile URL such as
        ``https://www.semanticscholar.org/author/<name>/<id>``.
    max_count : int
        Maximum number of papers to return (default 100).

    Returns
    -------
    tuple
        ``(name, papers)`` where ``papers`` is a list of paper records
        with title, abstract and url fields.

    Raises
    ------
    ValueError
        If ``author_id`` is None or the author cannot be found.
    """
    if author_id is None:
        raise ValueError('Input valid author ID')
    aid = str(author_id)
    if 'http' in aid:  # handle semantic scholar url input
        # The numeric id is the last path segment of a profile URL
        # (".../author/<name>/<id>").  Using the last segment — rather than
        # indexing two places past "author" — also works for URLs without a
        # name segment or with a trailing slash.
        aid = aid.rstrip('/').split('/')[-1]
    url = "https://api.semanticscholar.org/graph/v1/author/%s?fields=url,name,paperCount,papers,papers.title,papers.abstract,papers.url" % aid
    r = requests.get(url)
    if r.status_code == 404:
        raise ValueError('Author link not found.')
    data = r.json()
    papers = data['papers'][:max_count]
    name = data['name']
    return name, papers
## TODO: Preprocess extracted texts from PDFs
# Get a portion of the text for the actual task
def get_title(text):
    """Extract the paper title from the extracted page texts. Not implemented yet."""
    pass
def get_abstract(text):
    """Extract the abstract section from the extracted page texts. Not implemented yet."""
    pass
def get_introduction(text):
    """Extract the introduction section from the extracted page texts. Not implemented yet."""
    pass
def get_conclusion(text):
    """Extract the conclusion section from the extracted page texts. Not implemented yet."""
    pass