# NOTE: captured from a Hugging Face Space page; the "Spaces: Runtime error"
# banner lines were page chrome, not part of the source.
import numpy as np
from pypdf import PdfReader
from urllib.parse import urlparse
import requests
from semanticscholar import SemanticScholar
### Input Formatting Module
## Input formatting for the given paper
# Extract text from a PDF file or a link
def get_text_from_pdf(file_path):
    """
    Extract the text of a PDF, one string per page.

    Parameters
    ----------
    file_path : str
        Path to a local PDF file.

    Returns
    -------
    list
        The extracted text of each page, in page order.
    """
    reader = PdfReader(file_path)
    return [page.extract_text() for page in reader.pages]
def get_text_from_url(url, file_path='paper.pdf'):
    """
    Download the paper PDF behind ``url`` and return its extracted text.

    Only arXiv links are supported: both abstract pages (``/abs/<id>``)
    and direct PDF links (``/pdf/...``) are accepted.

    Parameters
    ----------
    url : str
        Link to an arXiv abstract page or PDF.
    file_path : str
        Local path where the downloaded PDF is saved (default ``'paper.pdf'``).

    Returns
    -------
    list
        One string of extracted text per page.

    Raises
    ------
    ValueError
        If the URL is not a recognized arXiv link, or the download fails.
    """
    url_parts = urlparse(url)
    # Only arXiv URLs are recognized.
    if 'arxiv' not in url_parts.netloc:
        raise ValueError('invalid url')
    # Strip a possible trailing slash so the id split below cannot
    # return an empty string (e.g. ".../abs/1234.5678/").
    path = url_parts.path.rstrip('/')
    if 'abs' in path:
        # Abstract page: rewrite the url to the corresponding PDF link.
        paper_id = path.split('/')[-1]
        url = f'https://www.arxiv.org/pdf/{paper_id}.pdf'
    elif 'pdf' not in path:
        # Neither an abstract page nor a PDF link.
        raise ValueError('invalid url')
    # Fetch the PDF, then extract its text page by page.
    download_pdf(url, file_path)
    return get_text_from_pdf(file_path)
def download_pdf(url, file_name, timeout=30):
    """
    Download the PDF at ``url`` and save it as ``file_name``.

    Parameters
    ----------
    url : str
        Direct link to a PDF file.
    file_name : str
        Local path to write the downloaded bytes to.
    timeout : float
        Seconds to wait for the server before giving up (default 30).
        Backward-compatible addition: without it, a dead server hangs
        the call forever.

    Raises
    ------
    ValueError
        If the server does not return a successful response, so callers
        never silently read a missing or stale file.
    """
    # Send GET request (bounded by a timeout).
    response = requests.get(url, timeout=timeout)
    # Save the PDF.
    if response.status_code == 200:
        with open(file_name, "wb") as f:
            f.write(response.content)
    elif response.status_code == 404:
        raise ValueError('cannot download the file')
    else:
        # Previously other status codes were only printed and the function
        # returned silently, leaving no file behind; fail loudly instead.
        raise ValueError(
            'cannot download the file (HTTP %d)' % response.status_code)
## Input formatting for the given author (reviewer)
# Extract the author's papers from a Semantic Scholar ID or profile URL
def get_text_from_author_id(author_id, max_count=100):
    """
    Fetch an author's name and recent papers from the Semantic Scholar API.

    Parameters
    ----------
    author_id : str or int
        A Semantic Scholar author ID, or a full profile URL such as
        ``https://www.semanticscholar.org/author/<name>/<id>``.
    max_count : int
        Maximum number of papers to return (default 100).

    Returns
    -------
    tuple
        ``(name, papers)`` where ``papers`` is a list of paper records
        with title, abstract and url fields.

    Raises
    ------
    ValueError
        If ``author_id`` is None or the author cannot be found.
    """
    if author_id is None:
        raise ValueError('Input valid author ID')
    aid = str(author_id)
    if 'http' in aid:  # handle semantic scholar url input
        # The numeric id is the last path segment of a profile URL
        # (".../author/<name>/<id>").  Using the last segment — rather than
        # indexing two places past "author" — also works for URLs without a
        # name segment or with a trailing slash.
        aid = aid.rstrip('/').split('/')[-1]
    url = "https://api.semanticscholar.org/graph/v1/author/%s?fields=url,name,paperCount,papers,papers.title,papers.abstract,papers.url" % aid
    r = requests.get(url)
    if r.status_code == 404:
        raise ValueError('Author link not found.')
    data = r.json()
    papers = data['papers'][:max_count]
    name = data['name']
    return name, papers
## TODO: Preprocess extracted texts from PDFs
# Get a portion of the text for the actual task
def get_title(text):
    """Extract the paper title from the extracted page texts. Not implemented yet."""
    pass
def get_abstract(text):
    """Extract the abstract section from the extracted page texts. Not implemented yet."""
    pass
def get_introduction(text):
    """Extract the introduction section from the extracted page texts. Not implemented yet."""
    pass
def get_conclusion(text):
    """Extract the conclusion section from the extracted page texts. Not implemented yet."""
    pass