Spaces:
Runtime error
Runtime error
adding flexibility to use different models for sentence-level info.
Browse files- app.py +14 -9
- input_format.py +1 -1
- score.py +45 -14
app.py
CHANGED
|
@@ -14,14 +14,18 @@ from score import *
|
|
| 14 |
# load document scoring model
|
| 15 |
#torch.cuda.is_available = lambda : False # uncomment to test with CPU only
|
| 16 |
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
| 17 |
-
pretrained_model = 'allenai/specter'
|
|
|
|
| 18 |
tokenizer = AutoTokenizer.from_pretrained(pretrained_model)
|
| 19 |
doc_model = AutoModel.from_pretrained(pretrained_model)
|
| 20 |
doc_model.to(device)
|
| 21 |
|
| 22 |
# load sentence model
|
| 23 |
-
sent_model =
|
| 24 |
-
|
|
|
|
|
|
|
|
|
|
| 25 |
|
| 26 |
def get_similar_paper(
|
| 27 |
title_input,
|
|
@@ -84,6 +88,7 @@ def get_similar_paper(
|
|
| 84 |
# Compute sent-level and phrase-level affinity scores for each paper
|
| 85 |
sent_ids, sent_scores, info, top_pairs_info = get_highlight_info(
|
| 86 |
sent_model,
|
|
|
|
| 87 |
abstract_text_input,
|
| 88 |
ab,
|
| 89 |
K=2 # top two sentences from the candidate
|
|
@@ -256,21 +261,21 @@ with gr.Blocks(css='style.css') as demo:
|
|
| 256 |
|
| 257 |
# General instruction
|
| 258 |
general_instruction = """
|
| 259 |
-
# R2P2:
|
| 260 |
|
| 261 |
#### Who is it for?
|
| 262 |
It is for meta-reviewers, area chairs, program chairs, or anyone who oversees the submission-reviewer matching process in peer review for academic conferences, journals, and grants.
|
| 263 |
|
| 264 |
-
<center><img src="file/tool.
|
| 265 |
|
| 266 |
#### How does it help?
|
| 267 |
A typical meta-reviewer workflow lacks supportive information on **what makes the pre-selected candidate reviewers a good fit** for the submission. Only affinity scores between the reviewer and the paper are shown, without additional detail.
|
| 268 |
|
| 269 |
-
R2P2 provides more information about each reviewer. It searches for the most relevant papers among the reviewer's previous publications and highlights relevant parts within them.
|
| 270 |
"""
|
| 271 |
# TODO add instruction video link
|
| 272 |
# More details (video, addendum)
|
| 273 |
-
more_details_instruction = """Check out <a href="", target="_blank">this video</a> for a quick demo of what R2P2 is and how it can help. You can find more details <a href="file/details.html", target="_blank">here</a
|
| 274 |
|
| 275 |
gr.Markdown(general_instruction)
|
| 276 |
gr.HTML(more_details_instruction)
|
|
@@ -298,7 +303,7 @@ R2P2 provides more information about each reviewer. It searches for the most rel
|
|
| 298 |
examples=[[example_title, example_submission, example_reviewer]],
|
| 299 |
inputs=[title_input, abstract_text_input, author_id_input],
|
| 300 |
cache_examples=False,
|
| 301 |
-
label="
|
| 302 |
)
|
| 303 |
|
| 304 |
with gr.Row():
|
|
@@ -417,7 +422,7 @@ R2P2 provides more information about each reviewer. It searches for the most rel
|
|
| 417 |
|
| 418 |
---
|
| 419 |
"""
|
| 420 |
-
|
| 421 |
# show multiple papers in radio check box to select from
|
| 422 |
paper_abstract = gr.Textbox(label='Abstract', interactive=False, visible=False)
|
| 423 |
with gr.Row():
|
|
|
|
| 14 |
# load document scoring model
|
| 15 |
#torch.cuda.is_available = lambda : False # uncomment to test with CPU only
|
| 16 |
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
| 17 |
+
#pretrained_model = 'allenai/specter'
|
| 18 |
+
pretrained_model = 'allenai/specter2'
|
| 19 |
tokenizer = AutoTokenizer.from_pretrained(pretrained_model)
|
| 20 |
doc_model = AutoModel.from_pretrained(pretrained_model)
|
| 21 |
doc_model.to(device)
|
| 22 |
|
| 23 |
# load sentence model
|
| 24 |
+
sent_model = doc_model # have the same model for document and sentence level
|
| 25 |
+
|
| 26 |
+
# OR specify different model for sentence level
|
| 27 |
+
# sent_model = SentenceTransformer('sentence-transformers/gtr-t5-base')
|
| 28 |
+
# sent_model.to(device)
|
| 29 |
|
| 30 |
def get_similar_paper(
|
| 31 |
title_input,
|
|
|
|
| 88 |
# Compute sent-level and phrase-level affinity scores for each paper
|
| 89 |
sent_ids, sent_scores, info, top_pairs_info = get_highlight_info(
|
| 90 |
sent_model,
|
| 91 |
+
tokenizer,
|
| 92 |
abstract_text_input,
|
| 93 |
ab,
|
| 94 |
K=2 # top two sentences from the candidate
|
|
|
|
| 261 |
|
| 262 |
# General instruction
|
| 263 |
general_instruction = """
|
| 264 |
+
# R2P2: An Assistance Tool for Reviewer-Paper Matching in Peer Review
|
| 265 |
|
| 266 |
#### Who is it for?
|
| 267 |
It is for meta-reviewers, area chairs, program chairs, or anyone who oversees the submission-reviewer matching process in peer review for academic conferences, journals, and grants.
|
| 268 |
|
| 269 |
+
<center><img src="file/tool-img.jpeg" width="70%" alt="general workflow"></center>
|
| 270 |
|
| 271 |
#### How does it help?
|
| 272 |
A typical meta-reviewer workflow lacks supportive information on **what makes the pre-selected candidate reviewers a good fit** for the submission. Only affinity scores between the reviewer and the paper are shown, without additional detail.
|
| 273 |
|
| 274 |
+
R2P2 provides more information about each reviewer. It searches for the **most relevant papers** among the reviewer's previous publications and **highlights relevant parts** within them.
|
| 275 |
"""
|
| 276 |
# TODO add instruction video link
|
| 277 |
# More details (video, addendum)
|
| 278 |
+
more_details_instruction = """Check out <a href="", target="_blank">this video</a> for a quick demo of what R2P2 is and how it can help. You can find more details <a href="file/details.html", target="_blank">here</a>, along with our privacy policy and disclaimer."""
|
| 279 |
|
| 280 |
gr.Markdown(general_instruction)
|
| 281 |
gr.HTML(more_details_instruction)
|
|
|
|
| 303 |
examples=[[example_title, example_submission, example_reviewer]],
|
| 304 |
inputs=[title_input, abstract_text_input, author_id_input],
|
| 305 |
cache_examples=False,
|
| 306 |
+
label="Try out the following example input."
|
| 307 |
)
|
| 308 |
|
| 309 |
with gr.Row():
|
|
|
|
| 422 |
|
| 423 |
---
|
| 424 |
"""
|
| 425 |
+
# TODO allow users to change the number of highlights to show?
|
| 426 |
# show multiple papers in radio check box to select from
|
| 427 |
paper_abstract = gr.Textbox(label='Abstract', interactive=False, visible=False)
|
| 428 |
with gr.Row():
|
input_format.py
CHANGED
|
@@ -66,7 +66,7 @@ def download_pdf(url, file_name):
|
|
| 66 |
## Input formatting for the given author (reviewer)
|
| 67 |
# Extracting text from a link
|
| 68 |
|
| 69 |
-
def get_text_from_author_id(author_id, max_count=
|
| 70 |
if author_id is None:
|
| 71 |
raise ValueError('Input valid author ID')
|
| 72 |
aid = str(author_id)
|
|
|
|
| 66 |
## Input formatting for the given author (reviewer)
|
| 67 |
# Extracting text from a link
|
| 68 |
|
| 69 |
+
def get_text_from_author_id(author_id, max_count=150):
|
| 70 |
if author_id is None:
|
| 71 |
raise ValueError('Input valid author ID')
|
| 72 |
aid = str(author_id)
|
score.py
CHANGED
|
@@ -1,20 +1,39 @@
|
|
| 1 |
-
from sentence_transformers import util
|
|
|
|
| 2 |
from nltk.tokenize import sent_tokenize
|
| 3 |
from nltk import word_tokenize, pos_tag
|
| 4 |
import torch
|
| 5 |
import numpy as np
|
| 6 |
import tqdm
|
| 7 |
|
| 8 |
-
def compute_sentencewise_scores(model, query_sents, candidate_sents):
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
return util.cos_sim(q_v, c_v)
|
| 13 |
|
| 14 |
def get_embedding(model, query_sents, candidate_sents):
|
| 15 |
q_v = model.encode(query_sents)
|
| 16 |
c_v = model.encode(candidate_sents)
|
| 17 |
-
|
| 18 |
return q_v, c_v
|
| 19 |
|
| 20 |
def get_top_k(score_mat, K=3):
|
|
@@ -72,10 +91,10 @@ def get_match_phrase(w1, w2, method='pos'):
|
|
| 72 |
pos1 = pos_tag(w1)
|
| 73 |
pos2 = pos_tag(w2)
|
| 74 |
for i, (w, p) in enumerate(pos2):
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
return mask1, mask2
|
| 80 |
|
| 81 |
def remove_spaces(words, attrs):
|
|
@@ -90,14 +109,14 @@ def remove_spaces(words, attrs):
|
|
| 90 |
idx, single_q, double_q = 0, 0, 0
|
| 91 |
while idx < len(words):
|
| 92 |
# stick to the word that appears right before
|
| 93 |
-
if words[idx] in [',', '.', '%', ')', ':', '?', ';', "'s"]:
|
| 94 |
ww = word_out.pop()
|
| 95 |
aa = attr_out.pop()
|
| 96 |
word_out.append(ww + words[idx])
|
| 97 |
attr_out.append(aa)
|
| 98 |
idx += 1
|
| 99 |
# stick to the word that appears right after
|
| 100 |
-
elif words[idx] in ["("]:
|
| 101 |
word_out.append(words[idx] + words[idx+1])
|
| 102 |
attr_out.append(attrs[idx+1])
|
| 103 |
idx += 2
|
|
@@ -141,6 +160,18 @@ def remove_spaces(words, attrs):
|
|
| 141 |
word_out.append(words[idx] + words[idx+1])
|
| 142 |
attr_out.append(attrs[idx+1])
|
| 143 |
idx += 2
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 144 |
else:
|
| 145 |
word_out.append(words[idx])
|
| 146 |
attr_out.append(attrs[idx])
|
|
@@ -193,7 +224,7 @@ def mark_words(query_sents, words, all_words, sent_start_id, sent_ids, sent_scor
|
|
| 193 |
|
| 194 |
return output
|
| 195 |
|
| 196 |
-
def get_highlight_info(model, text1, text2, K=None):
|
| 197 |
"""
|
| 198 |
Get highlight information from two texts
|
| 199 |
"""
|
|
@@ -201,7 +232,7 @@ def get_highlight_info(model, text1, text2, K=None):
|
|
| 201 |
sent2 = sent_tokenize(text2) # candidate
|
| 202 |
if K is None: # if K is not set, select based on the length of the candidate
|
| 203 |
K = int(len(sent2) / 3)
|
| 204 |
-
score_mat = compute_sentencewise_scores(model, sent1, sent2)
|
| 205 |
|
| 206 |
sent_ids, sent_scores = get_top_k(score_mat, K=K)
|
| 207 |
words2, all_words2, sent_start_id2 = get_words(sent2)
|
|
|
|
| 1 |
+
from sentence_transformers import util, SentenceTransformer
|
| 2 |
+
from transformers import BertModel
|
| 3 |
from nltk.tokenize import sent_tokenize
|
| 4 |
from nltk import word_tokenize, pos_tag
|
| 5 |
import torch
|
| 6 |
import numpy as np
|
| 7 |
import tqdm
|
| 8 |
|
| 9 |
+
def compute_sentencewise_scores(model, query_sents, candidate_sents, tokenizer=None):
    """
    Compute the cosine similarity between every query/candidate sentence pair.

    Args:
        model: either a `SentenceTransformer` or a transformers `BertModel`.
        query_sents: list of query sentences (strings).
        candidate_sents: list of candidate sentences (strings).
        tokenizer: tokenizer matching `model`; required when `model` is a
            `BertModel`, unused for `SentenceTransformer` models.

    Returns:
        The result of `util.cos_sim(q_v, c_v)`, where `q_v` holds one
        embedding per query sentence and `c_v` one per candidate sentence.

    Raises:
        ValueError: if the model type is not supported, or if a `BertModel`
            is given without a tokenizer.
    """
    if isinstance(model, SentenceTransformer):
        # if the model is using SentenceTransformer style
        q_v, c_v = get_embedding(model, query_sents, candidate_sents)
    elif isinstance(model, BertModel):
        # if the model is BERT-style model using transformers library
        if tokenizer is None:
            raise ValueError('tokenizer is required for BERT-style models')
        # Encode queries and candidates in a single padded batch; they are
        # split apart again by position below.
        inputs = tokenizer(
            query_sents + candidate_sents,
            padding=True,
            truncation=True,
            return_tensors="pt",
            max_length=512
        )
        inputs = inputs.to(model.device)
        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            result = model(**inputs)
        # Use the hidden state at position 0 of each sequence as the
        # sentence embedding (the [CLS] position for BERT tokenizers).
        embeddings = result.last_hidden_state[:, 0, :].detach().cpu().numpy()
        q_v = embeddings[:len(query_sents)]
        c_v = embeddings[len(query_sents):]
    else:
        raise ValueError('model not supported at the time')
    # Internal sanity checks on the embedding shapes.
    assert q_v.shape[1] == c_v.shape[1]
    assert q_v.shape[0] == len(query_sents)
    assert c_v.shape[0] == len(candidate_sents)
    return util.cos_sim(q_v, c_v)


def get_embedding(model, query_sents, candidate_sents):
    """
    Encode query and candidate sentences with a model exposing `encode`.

    Returns:
        A `(q_v, c_v)` tuple: the outputs of `model.encode` for the query
        sentences and the candidate sentences, respectively.
    """
    q_v = model.encode(query_sents)
    c_v = model.encode(candidate_sents)
    return q_v, c_v
|
| 38 |
|
| 39 |
def get_top_k(score_mat, K=3):
|
|
|
|
| 91 |
pos1 = pos_tag(w1)
|
| 92 |
pos2 = pos_tag(w2)
|
| 93 |
for i, (w, p) in enumerate(pos2):
|
| 94 |
+
for j, (w_, p_) in enumerate(pos1):
|
| 95 |
+
if w.lower() == w_.lower() and p in include:
|
| 96 |
+
mask2[i] = 1
|
| 97 |
+
mask1[j] = 1
|
| 98 |
return mask1, mask2
|
| 99 |
|
| 100 |
def remove_spaces(words, attrs):
|
|
|
|
| 109 |
idx, single_q, double_q = 0, 0, 0
|
| 110 |
while idx < len(words):
|
| 111 |
# stick to the word that appears right before
|
| 112 |
+
if words[idx] in [',', '.', '%', ')', ':', '?', ';', "'s", '”', "''"]:
|
| 113 |
ww = word_out.pop()
|
| 114 |
aa = attr_out.pop()
|
| 115 |
word_out.append(ww + words[idx])
|
| 116 |
attr_out.append(aa)
|
| 117 |
idx += 1
|
| 118 |
# stick to the word that appears right after
|
| 119 |
+
elif words[idx] in ["(", '“']:
|
| 120 |
word_out.append(words[idx] + words[idx+1])
|
| 121 |
attr_out.append(attrs[idx+1])
|
| 122 |
idx += 2
|
|
|
|
| 160 |
word_out.append(words[idx] + words[idx+1])
|
| 161 |
attr_out.append(attrs[idx+1])
|
| 162 |
idx += 2
|
| 163 |
+
elif words[idx] == '``':
|
| 164 |
+
# this is opening quote: stick to the word after, but change to real double quote
|
| 165 |
+
word_out.append('"' + words[idx+1])
|
| 166 |
+
attr_out.append(attrs[idx+1])
|
| 167 |
+
idx += 2
|
| 168 |
+
elif words[idx] == "''":
|
| 169 |
+
# this is closing quote: stick to word before, but change to real double quote
|
| 170 |
+
ww = word_out.pop()
|
| 171 |
+
aa = attr_out.pop()
|
| 172 |
+
word_out.append(ww + '"')
|
| 173 |
+
attr_out.append(aa)
|
| 174 |
+
idx += 1
|
| 175 |
else:
|
| 176 |
word_out.append(words[idx])
|
| 177 |
attr_out.append(attrs[idx])
|
|
|
|
| 224 |
|
| 225 |
return output
|
| 226 |
|
| 227 |
+
def get_highlight_info(model, tokenizer, text1, text2, K=None):
|
| 228 |
"""
|
| 229 |
Get highlight information from two texts
|
| 230 |
"""
|
|
|
|
| 232 |
sent2 = sent_tokenize(text2) # candidate
|
| 233 |
if K is None: # if K is not set, select based on the length of the candidate
|
| 234 |
K = int(len(sent2) / 3)
|
| 235 |
+
score_mat = compute_sentencewise_scores(model, sent1, sent2, tokenizer=tokenizer)
|
| 236 |
|
| 237 |
sent_ids, sent_scores = get_top_k(score_mat, K=K)
|
| 238 |
words2, all_words2, sent_start_id2 = get_words(sent2)
|