Spaces:

jskim
/

paper-matching

Runtime error

App Files Files Community

jskim committed on Mar 24, 2023

Commit

5ee7598

1 Parent(s): e7933f3

Adding flexibility to use different models for sentence-level info.

Browse files

Files changed (3) hide show

app.py +14 -9
input_format.py +1 -1
score.py +45 -14

app.py CHANGED Viewed

@@ -14,14 +14,18 @@ from score import *
 # load document scoring model
 #torch.cuda.is_available = lambda : False  # uncomment to test with CPU only
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-pretrained_model = 'allenai/specter'
 tokenizer = AutoTokenizer.from_pretrained(pretrained_model)
 doc_model = AutoModel.from_pretrained(pretrained_model)
 doc_model.to(device)
 # load sentence model
-sent_model = SentenceTransformer('sentence-transformers/gtr-t5-base')
-sent_model.to(device)
 def get_similar_paper(
     title_input,
@@ -84,6 +88,7 @@ def get_similar_paper(
         # Compute sent-level and phrase-level affinity scores for each papers
         sent_ids, sent_scores, info, top_pairs_info = get_highlight_info(
             sent_model,
             abstract_text_input,
             ab,
             K=2 # top two sentences from the candidate
@@ -256,21 +261,21 @@ with gr.Blocks(css='style.css') as demo:
     # General instruction
     general_instruction = """
-# R2P2: Reviewer TO Paper in Peer review
 #### Who is it for?
 It is for meta-reviewers, area chairs, program chairs, or anyone who oversees the submission-reviewer matching process in peer review for academic conferences, journals, and grants.
-<center><img src="file/tool.gif" width="70%" alt="general workflow"></center>
 #### How does it help?
 A typical meta-reviewer workflow lacks supportive information on **what makes the pre-selected candidate reviewers a good fit** for the submission. Only affinity scores between the reviewer and the paper are shown, without additional detail.
-R2P2 provides more information about each reviewer. It searches for the most relevant papers among the reviewer's previous publications and highlights relevant parts within them.
     """
     # TODO add instruction video link
     # More details (video, addendum)
-    more_details_instruction = """Check out <a href="", target="_blank">this video</a> for a quick demo of what R2P2 is and how it can help. You can find more details <a href="file/details.html", target="_blank">here</a> about R2P2, along with our privacy policy and disclaimer."""
     gr.Markdown(general_instruction)
     gr.HTML(more_details_instruction)
@@ -298,7 +303,7 @@ R2P2 provides more information about each reviewer. It searches for the most rel
         examples=[[example_title, example_submission, example_reviewer]],
         inputs=[title_input, abstract_text_input, author_id_input],
         cache_examples=False,
-        label="Click to try out the example input."
     )
     with gr.Row():
@@ -417,7 +422,7 @@ R2P2 provides more information about each reviewer. It searches for the most rel
     ---
     """
     # show multiple papers in radio check box to select from
     paper_abstract = gr.Textbox(label='Abstract', interactive=False, visible=False)
     with gr.Row():

 # load document scoring model
 #torch.cuda.is_available = lambda : False  # uncomment to test with CPU only
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+#pretrained_model = 'allenai/specter'
+pretrained_model = 'allenai/specter2'
 tokenizer = AutoTokenizer.from_pretrained(pretrained_model)
 doc_model = AutoModel.from_pretrained(pretrained_model)
 doc_model.to(device)
 # load sentence model
+sent_model = doc_model # have the same model for document and sentence level
+# OR specify different model for sentence level
+# sent_model = SentenceTransformer('sentence-transformers/gtr-t5-base')
+# sent_model.to(device)
 def get_similar_paper(
     title_input,
         # Compute sent-level and phrase-level affinity scores for each papers
         sent_ids, sent_scores, info, top_pairs_info = get_highlight_info(
             sent_model,
+            tokenizer,
             abstract_text_input,
             ab,
             K=2 # top two sentences from the candidate
     # General instruction
     general_instruction = """
+# R2P2: An Assistance Tool for Reviewer-Paper Matching in Peer Review
 #### Who is it for?
 It is for meta-reviewers, area chairs, program chairs, or anyone who oversees the submission-reviewer matching process in peer review for academic conferences, journals, and grants.
+<center><img src="file/tool-img.jpeg" width="70%" alt="general workflow"></center>
 #### How does it help?
 A typical meta-reviewer workflow lacks supportive information on **what makes the pre-selected candidate reviewers a good fit** for the submission. Only affinity scores between the reviewer and the paper are shown, without additional detail.
+R2P2 provides more information about each reviewer. It searches for the **most relevant papers** among the reviewer's previous publications and **highlights relevant parts** within them.
     """
     # TODO add instruction video link
     # More details (video, addendum)
+    more_details_instruction = """Check out <a href="", target="_blank">this video</a> for a quick demo of what R2P2 is and how it can help. You can find more details <a href="file/details.html", target="_blank">here</a>, along with our privacy policy and disclaimer."""
     gr.Markdown(general_instruction)
     gr.HTML(more_details_instruction)
         examples=[[example_title, example_submission, example_reviewer]],
         inputs=[title_input, abstract_text_input, author_id_input],
         cache_examples=False,
+        label="Try out the following example input."
     )
     with gr.Row():
     ---
     """
+    # TODO allow users to change the number of highlights to show?
     # show multiple papers in radio check box to select from
     paper_abstract = gr.Textbox(label='Abstract', interactive=False, visible=False)
     with gr.Row():

input_format.py CHANGED Viewed

@@ -66,7 +66,7 @@ def download_pdf(url, file_name):
 ## Input formatting for the given author (reviewer)
 # Extracting text from a link
-def get_text_from_author_id(author_id, max_count=100):
     if author_id is None:
         raise ValueError('Input valid author ID')
     aid = str(author_id)

 ## Input formatting for the given author (reviewer)
 # Extracting text from a link
+def get_text_from_author_id(author_id, max_count=150):
     if author_id is None:
         raise ValueError('Input valid author ID')
     aid = str(author_id)

score.py CHANGED Viewed

@@ -1,20 +1,39 @@
-from sentence_transformers import util
 from nltk.tokenize import sent_tokenize
 from nltk import word_tokenize, pos_tag
 import torch
 import numpy as np
 import tqdm
-def compute_sentencewise_scores(model, query_sents, candidate_sents):
-    # list of sentences from query and candidate
-    q_v, c_v = get_embedding(model, query_sents, candidate_sents)
     return util.cos_sim(q_v, c_v)
 def get_embedding(model, query_sents, candidate_sents):
     q_v = model.encode(query_sents)
     c_v = model.encode(candidate_sents)
     return q_v, c_v
 def get_top_k(score_mat, K=3):
@@ -72,10 +91,10 @@ def get_match_phrase(w1, w2, method='pos'):
         pos1 = pos_tag(w1)
         pos2 = pos_tag(w2)
         for i, (w, p) in enumerate(pos2):
-            if w.lower() in w1 and p in include:
-                j = w1.index(w.lower())
-                mask2[i] = 1
-                mask1[j] = 1
     return mask1, mask2
 def remove_spaces(words, attrs):
@@ -90,14 +109,14 @@ def remove_spaces(words, attrs):
     idx, single_q, double_q = 0, 0, 0
     while idx < len(words):
         # stick to the word that appears right before
-        if words[idx] in [',', '.', '%', ')', ':', '?', ';', "'s"]:
             ww = word_out.pop()
             aa = attr_out.pop()
             word_out.append(ww + words[idx])
             attr_out.append(aa)
             idx += 1
         # stick to the word that appears right after
-        elif words[idx] in ["("]:
             word_out.append(words[idx] + words[idx+1])
             attr_out.append(attrs[idx+1])
             idx += 2
@@ -141,6 +160,18 @@ def remove_spaces(words, attrs):
                     word_out.append(words[idx] + words[idx+1])
                     attr_out.append(attrs[idx+1])
                     idx += 2
         else:
             word_out.append(words[idx])
             attr_out.append(attrs[idx])
@@ -193,7 +224,7 @@ def mark_words(query_sents, words, all_words, sent_start_id, sent_ids, sent_scor
     return output
-def get_highlight_info(model, text1, text2, K=None):
     """
     Get highlight information from two texts
     """
@@ -201,7 +232,7 @@ def get_highlight_info(model, text1, text2, K=None):
     sent2 = sent_tokenize(text2) # candidate
     if K is None: # if K is not set, select based on the length of the candidate
         K = int(len(sent2) / 3)
-    score_mat = compute_sentencewise_scores(model, sent1, sent2)
     sent_ids, sent_scores = get_top_k(score_mat, K=K)
     words2, all_words2, sent_start_id2 = get_words(sent2)

+from sentence_transformers import util, SentenceTransformer
+from transformers import BertModel
 from nltk.tokenize import sent_tokenize
 from nltk import word_tokenize, pos_tag
 import torch
 import numpy as np
 import tqdm
def compute_sentencewise_scores(model, query_sents, candidate_sents, tokenizer=None):
    """
    Compute cosine similarity between every query/candidate sentence pair.

    Two kinds of encoders are supported:
      * SentenceTransformer: sentences are embedded with get_embedding()
        (i.e. model.encode); `tokenizer` is ignored.
      * BertModel (transformers library): query and candidate sentences are
        tokenized together in one batch, and the last-layer hidden state at
        token position 0 of each sentence is used as its embedding;
        `tokenizer` is required.

    Args:
        model: a SentenceTransformer or a BertModel instance.
        query_sents: list of sentences from the query text.
        candidate_sents: list of sentences from the candidate text.
        tokenizer: tokenizer matching `model`; only used for BertModel.

    Returns:
        The result of util.cos_sim(q_v, c_v) on the query and candidate
        embeddings.

    Raises:
        ValueError: if the model type is not supported, or if a BertModel
            is given without a tokenizer.
    """
    if isinstance(model, SentenceTransformer):
        # SentenceTransformer-style model: encode() takes raw sentences
        q_v, c_v = get_embedding(model, query_sents, candidate_sents)
    elif isinstance(model, BertModel):
        # BERT-style model from the transformers library
        if tokenizer is None:
            # fail early with a clear message instead of "'NoneType' is not callable"
            raise ValueError('tokenizer is required when model is a BertModel')
        num_query = len(query_sents)
        # embed everything in one batch; split back into query/candidate below
        inputs = tokenizer(
            query_sents + candidate_sents,
            padding=True,
            truncation=True,
            return_tensors="pt",
            max_length=512
        )
        inputs = inputs.to(model.device)
        # inference only: skip building the autograd graph
        with torch.no_grad():
            result = model(**inputs)
        # hidden state at position 0 of every sequence in the batch
        embeddings = result.last_hidden_state[:, 0, :].cpu().numpy()
        q_v = embeddings[:num_query]
        c_v = embeddings[num_query:]
    else:
        raise ValueError('model not supported at the time')
    # sanity checks: same embedding size, one embedding per input sentence
    assert q_v.shape[1] == c_v.shape[1]
    assert q_v.shape[0] == len(query_sents)
    assert c_v.shape[0] == len(candidate_sents)
    return util.cos_sim(q_v, c_v)
 def get_embedding(model, query_sents, candidate_sents):
     """
     Encode query and candidate sentences with the same model.

     Calls model.encode() once on `query_sents` and then once on
     `candidate_sents`, returning the two results as a
     (query, candidate) pair.
     """
     query_vecs, candidate_vecs = (
         model.encode(sents) for sents in (query_sents, candidate_sents)
     )
     return query_vecs, candidate_vecs
 def get_top_k(score_mat, K=3):
         pos1 = pos_tag(w1)
         pos2 = pos_tag(w2)
         for i, (w, p) in enumerate(pos2):
+            for j, (w_, p_) in enumerate(pos1):
+                if w.lower() == w_.lower() and p in include:
+                    mask2[i] = 1
+                    mask1[j] = 1
     return mask1, mask2
 def remove_spaces(words, attrs):
     idx, single_q, double_q = 0, 0, 0
     while idx < len(words):
         # stick to the word that appears right before
+        if words[idx] in [',', '.', '%', ')', ':', '?', ';', "'s", '”', "''"]:
             ww = word_out.pop()
             aa = attr_out.pop()
             word_out.append(ww + words[idx])
             attr_out.append(aa)
             idx += 1
         # stick to the word that appears right after
+        elif words[idx] in ["(", '“']:
             word_out.append(words[idx] + words[idx+1])
             attr_out.append(attrs[idx+1])
             idx += 2
                     word_out.append(words[idx] + words[idx+1])
                     attr_out.append(attrs[idx+1])
                     idx += 2
+        elif words[idx] == '``':
+            # this is opening quote: stick to the word after, but change to real double quote
+            word_out.append('"' + words[idx+1])
+            attr_out.append(attrs[idx+1])
+            idx += 2
+        elif words[idx] == "''":
+            # this is closing quote: stick to word before, but change to real double quote
+            ww = word_out.pop()
+            aa = attr_out.pop()
+            word_out.append(ww + '"')
+            attr_out.append(aa)
+            idx += 1
         else:
             word_out.append(words[idx])
             attr_out.append(attrs[idx])
     return output
+def get_highlight_info(model, tokenizer, text1, text2, K=None):
     """
     Get highlight information from two texts
     """
     sent2 = sent_tokenize(text2) # candidate
     if K is None: # if K is not set, select based on the length of the candidate
         K = int(len(sent2) / 3)
+    score_mat = compute_sentencewise_scores(model, sent1, sent2, tokenizer=tokenizer)
     sent_ids, sent_scores = get_top_k(score_mat, K=K)
     words2, all_words2, sent_start_id2 = get_words(sent2)