Spaces:
Runtime error
Runtime error
Visualize more direct information upfront, leaving the interactive parts as the next step.
Browse files
app.py
CHANGED
|
@@ -43,7 +43,7 @@ def get_similar_paper(
|
|
| 43 |
name, papers = get_text_from_author_id(author_id_input)
|
| 44 |
|
| 45 |
# Compute Doc-level affinity scores for the Papers
|
| 46 |
-
print('computing scores...')
|
| 47 |
# TODO detect duplicate papers?
|
| 48 |
titles, abstracts, doc_scores = compute_document_score(
|
| 49 |
doc_model,
|
|
@@ -72,40 +72,77 @@ def get_similar_paper(
|
|
| 72 |
start = time.time()
|
| 73 |
input_sentences = sent_tokenize(abstract_text_input)
|
| 74 |
num_sents = len(input_sentences)
|
|
|
|
|
|
|
| 75 |
for aa, (tt, ab, ds) in enumerate(zip(titles, abstracts, doc_scores)):
|
| 76 |
# Compute sent-level and phrase-level affinity scores for each paper
|
| 77 |
-
sent_ids, sent_scores, info = get_highlight_info(
|
| 78 |
sent_model,
|
| 79 |
abstract_text_input,
|
| 80 |
ab,
|
| 81 |
K=2
|
| 82 |
)
|
| 83 |
-
|
| 84 |
-
word_scores = dict()
|
| 85 |
|
| 86 |
-
#
|
|
|
|
| 87 |
for i in range(num_sents):
|
| 88 |
word_scores[str(i)] = {
|
| 89 |
"original": ab,
|
| 90 |
"interpretation": list(zip(info['all_words'], info[i]['scores']))
|
| 91 |
-
}
|
| 92 |
|
| 93 |
tmp[display_title[aa]] = {
|
| 94 |
'title': tt,
|
| 95 |
'abstract': ab,
|
| 96 |
'doc_score': ds,
|
| 97 |
'source_sentences': input_sentences,
|
| 98 |
-
'highlight': word_scores
|
|
|
|
| 99 |
}
|
| 100 |
-
|
|
|
|
|
|
|
| 101 |
end = time.time()
|
| 102 |
-
print('done in [%0.2f] seconds'%(end - start))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 104 |
return (
|
| 105 |
-
gr.update(
|
| 106 |
-
gr.update(
|
| 107 |
-
gr.update(visible=True),
|
| 108 |
-
gr.update(visible=True),
|
| 109 |
)
|
| 110 |
|
| 111 |
def update_name(author_id_input):
|
|
@@ -147,6 +184,7 @@ with gr.Blocks() as demo:
|
|
| 147 |
# Text description about the app and disclaimer
|
| 148 |
### TEXT Description
|
| 149 |
# TODO add instruction video link
|
|
|
|
| 150 |
gr.Markdown(
|
| 151 |
"""
|
| 152 |
# Paper Matching Helper
|
|
@@ -186,9 +224,93 @@ Below we describe how to use the tool. Also feel free to check out the [video]()
|
|
| 186 |
author_id_input.change(fn=update_name, inputs=author_id_input, outputs=name)
|
| 187 |
with gr.Row():
|
| 188 |
compute_btn = gr.Button('What Makes This a Good Match?')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 189 |
|
| 190 |
### PAPER INFORMATION
|
| 191 |
-
|
| 192 |
# show multiple papers in radio check box to select from
|
| 193 |
with gr.Row():
|
| 194 |
selected_papers_radio = gr.Radio(
|
|
@@ -205,9 +327,7 @@ Below we describe how to use the tool. Also feel free to check out the [video]()
|
|
| 205 |
affinity= gr.Number(label='Affinity', interactive=False, value=0)
|
| 206 |
with gr.Row():
|
| 207 |
paper_abstract = gr.Textbox(label='Abstract', interactive=False, visible=False)
|
| 208 |
-
|
| 209 |
-
## TODO consider adding more direct information feeding to the users before giving them options for interactions.
|
| 210 |
-
|
| 211 |
### RELEVANT PARTS (HIGHLIGHTS)
|
| 212 |
with gr.Row():
|
| 213 |
with gr.Column(scale=2): # text from submission
|
|
@@ -221,7 +341,7 @@ Below we describe how to use the tool. Also feel free to check out the [video]()
|
|
| 221 |
|
| 222 |
### EVENT LISTENERS
|
| 223 |
|
| 224 |
-
# retrieve similar papers
|
| 225 |
compute_btn.click(
|
| 226 |
fn=get_similar_paper,
|
| 227 |
inputs=[
|
|
@@ -229,13 +349,60 @@ Below we describe how to use the tool. Also feel free to check out the [video]()
|
|
| 229 |
pdf_file_input,
|
| 230 |
author_id_input
|
| 231 |
],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 232 |
outputs=[
|
| 233 |
selected_papers_radio,
|
| 234 |
source_sentences,
|
| 235 |
title_row,
|
| 236 |
paper_abstract
|
| 237 |
]
|
| 238 |
-
)
|
| 239 |
|
| 240 |
# change highlight based on selected sentences from submission
|
| 241 |
source_sentences.change(
|
|
|
|
| 43 |
name, papers = get_text_from_author_id(author_id_input)
|
| 44 |
|
| 45 |
# Compute Doc-level affinity scores for the Papers
|
| 46 |
+
print('computing document scores...')
|
| 47 |
# TODO detect duplicate papers?
|
| 48 |
titles, abstracts, doc_scores = compute_document_score(
|
| 49 |
doc_model,
|
|
|
|
| 72 |
start = time.time()
|
| 73 |
input_sentences = sent_tokenize(abstract_text_input)
|
| 74 |
num_sents = len(input_sentences)
|
| 75 |
+
|
| 76 |
+
summary_info = dict() # elements to visualize upfront
|
| 77 |
for aa, (tt, ab, ds) in enumerate(zip(titles, abstracts, doc_scores)):
|
| 78 |
# Compute sent-level and phrase-level affinity scores for each paper
|
| 79 |
+
sent_ids, sent_scores, info, top_pairs_info = get_highlight_info(
|
| 80 |
sent_model,
|
| 81 |
abstract_text_input,
|
| 82 |
ab,
|
| 83 |
K=2
|
| 84 |
)
|
|
|
|
|
|
|
| 85 |
|
| 86 |
+
# get scores for each word in the format for Gradio Interpretation component
|
| 87 |
+
word_scores = dict()
|
| 88 |
for i in range(num_sents):
|
| 89 |
word_scores[str(i)] = {
|
| 90 |
"original": ab,
|
| 91 |
"interpretation": list(zip(info['all_words'], info[i]['scores']))
|
| 92 |
+
}
|
| 93 |
|
| 94 |
tmp[display_title[aa]] = {
|
| 95 |
'title': tt,
|
| 96 |
'abstract': ab,
|
| 97 |
'doc_score': ds,
|
| 98 |
'source_sentences': input_sentences,
|
| 99 |
+
'highlight': word_scores,
|
| 100 |
+
'top_pairs': top_pairs_info
|
| 101 |
}
|
| 102 |
+
|
| 103 |
+
# TODO better ways of saving intermediate results? user identifiers per session?
|
| 104 |
+
pickle.dump(tmp, open('info.pkl', 'wb'))
|
| 105 |
end = time.time()
|
| 106 |
+
print('done in [%0.2f] seconds'%(end - start))
|
| 107 |
+
|
| 108 |
+
# set up elements to show
|
| 109 |
+
out = [
|
| 110 |
+
gr.update(choices=display_title, interactive=True, visible=False), # set of papers (radio)
|
| 111 |
+
gr.update(choices=input_sentences, interactive=True, visible=False) # submission sentences
|
| 112 |
+
]
|
| 113 |
|
| 114 |
+
# set up elements to visualize upfront
|
| 115 |
+
top_papers_show = 3 # number of top papers to show upfront
|
| 116 |
+
top_num_info_show = 2 # number of sentence pairs from each paper to show upfront
|
| 117 |
+
summary_out = []
|
| 118 |
+
for i in range(top_papers_show):
|
| 119 |
+
out_tmp = [
|
| 120 |
+
gr.update(value=titles[i], visible=True),
|
| 121 |
+
gr.update(value=doc_scores[i], visible=True)
|
| 122 |
+
]
|
| 123 |
+
tp = tmp[display_title[i]]['top_pairs']
|
| 124 |
+
for j in range(top_num_info_show):
|
| 125 |
+
out_tmp += [
|
| 126 |
+
gr.update(value=tp[j]['score'], visible=True),
|
| 127 |
+
tp[j]['query']['original'],
|
| 128 |
+
tp[j]['query'],
|
| 129 |
+
tp[j]['candidate']['original'],
|
| 130 |
+
tp[j]['candidate']
|
| 131 |
+
]
|
| 132 |
+
summary_out += out_tmp
|
| 133 |
+
|
| 134 |
+
# add updates to the show more button
|
| 135 |
+
out = out + summary_out + [gr.update(visible=True)] # show more button
|
| 136 |
+
assert(len(out) == (top_num_info_show * 5 + 2) * top_papers_show + 3)
|
| 137 |
+
|
| 138 |
+
return tuple(out)
|
| 139 |
+
|
| 140 |
+
def show_more():
|
| 141 |
return (
|
| 142 |
+
gr.update(visible=True), # set of papers
|
| 143 |
+
gr.update(visible=True), # submission sentences
|
| 144 |
+
gr.update(visible=True), # title row
|
| 145 |
+
gr.update(visible=True), # abstract row
|
| 146 |
)
|
| 147 |
|
| 148 |
def update_name(author_id_input):
|
|
|
|
| 184 |
# Text description about the app and disclaimer
|
| 185 |
### TEXT Description
|
| 186 |
# TODO add instruction video link
|
| 187 |
+
# TODO update instructions based on new changes
|
| 188 |
gr.Markdown(
|
| 189 |
"""
|
| 190 |
# Paper Matching Helper
|
|
|
|
| 224 |
author_id_input.change(fn=update_name, inputs=author_id_input, outputs=name)
|
| 225 |
with gr.Row():
|
| 226 |
compute_btn = gr.Button('What Makes This a Good Match?')
|
| 227 |
+
|
| 228 |
+
|
| 229 |
+
### OVERVIEW
|
| 230 |
+
# Paper title, score, and top-ranking sentence pairs -- two sentence pairs per paper, three papers
|
| 231 |
+
# TODO group similar components into reusable blocks and simplify
|
| 232 |
+
## ONE BLOCK OF INFO FOR A SINGLE PAPER
|
| 233 |
+
## PAPER1
|
| 234 |
+
with gr.Row():
|
| 235 |
+
with gr.Column(scale=3):
|
| 236 |
+
paper_title1 = gr.Textbox(label="From the reviewer's paper:", interactive=False, visible=False)
|
| 237 |
+
with gr.Column(scale=1):
|
| 238 |
+
affinity1 = gr.Number(label='Affinity', interactive=False, value=0, visible=False)
|
| 239 |
+
with gr.Row() as rel1_1:
|
| 240 |
+
with gr.Column(scale=1):
|
| 241 |
+
sent_pair_score1_1 = gr.Number(label='Sentence Relevance', interactive=False, value=0, visible=False)
|
| 242 |
+
with gr.Column(scale=4):
|
| 243 |
+
sent_pair_source1_1 = gr.Textbox(label='Sentence from Submission', visible=False)
|
| 244 |
+
sent_pair_source1_1_hl = gr.components.Interpretation(sent_pair_source1_1)
|
| 245 |
+
with gr.Column(scale=4):
|
| 246 |
+
sent_pair_candidate1_1 = gr.Textbox(label='Sentence from Paper', visible=False)
|
| 247 |
+
sent_pair_candidate1_1_hl = gr.components.Interpretation(sent_pair_candidate1_1)
|
| 248 |
+
with gr.Row() as rel1_2:
|
| 249 |
+
with gr.Column(scale=1):
|
| 250 |
+
sent_pair_score1_2 = gr.Number(label='Sentence Relevance', interactive=False, value=0, visible=False)
|
| 251 |
+
with gr.Column(scale=4):
|
| 252 |
+
sent_pair_source1_2 = gr.Textbox(label='Sentence from Submission', visible=False)
|
| 253 |
+
sent_pair_source1_2_hl = gr.components.Interpretation(sent_pair_source1_2)
|
| 254 |
+
with gr.Column(scale=4):
|
| 255 |
+
sent_pair_candidate1_2 = gr.Textbox(label='Sentence from Paper', visible=False)
|
| 256 |
+
sent_pair_candidate1_2_hl = gr.components.Interpretation(sent_pair_candidate1_2)
|
| 257 |
+
|
| 258 |
+
## PAPER 2
|
| 259 |
+
with gr.Row():
|
| 260 |
+
with gr.Column(scale=3):
|
| 261 |
+
paper_title2 = gr.Textbox(label="From the reviewer's paper:", interactive=False, visible=False)
|
| 262 |
+
with gr.Column(scale=1):
|
| 263 |
+
affinity2 = gr.Number(label='Affinity', interactive=False, value=0, visible=False)
|
| 264 |
+
with gr.Row() as rel2_1:
|
| 265 |
+
with gr.Column(scale=1):
|
| 266 |
+
sent_pair_score2_1 = gr.Number(label='Sentence Relevance', interactive=False, value=0, visible=False)
|
| 267 |
+
with gr.Column(scale=4):
|
| 268 |
+
sent_pair_source2_1 = gr.Textbox(label='Sentence from Submission', visible=False)
|
| 269 |
+
sent_pair_source2_1_hl = gr.components.Interpretation(sent_pair_source2_1)
|
| 270 |
+
with gr.Column(scale=4):
|
| 271 |
+
sent_pair_candidate2_1 = gr.Textbox(label='Sentence from Submission', visible=False)
|
| 272 |
+
sent_pair_candidate2_1_hl = gr.components.Interpretation(sent_pair_candidate2_1)
|
| 273 |
+
with gr.Row() as rel2_2:
|
| 274 |
+
with gr.Column(scale=1):
|
| 275 |
+
sent_pair_score2_2 = gr.Number(label='Sentence Relevance', interactive=False, value=0, visible=False)
|
| 276 |
+
with gr.Column(scale=4):
|
| 277 |
+
sent_pair_source2_2 = gr.Textbox(label='Sentence from Submission', visible=False)
|
| 278 |
+
sent_pair_source2_2_hl = gr.components.Interpretation(sent_pair_source2_2)
|
| 279 |
+
with gr.Column(scale=4):
|
| 280 |
+
sent_pair_candidate2_2 = gr.Textbox(label='Sentence from Submission', visible=False)
|
| 281 |
+
sent_pair_candidate2_2_hl = gr.components.Interpretation(sent_pair_candidate2_2)
|
| 282 |
+
|
| 283 |
+
## PAPER 3
|
| 284 |
+
with gr.Row():
|
| 285 |
+
with gr.Column(scale=3):
|
| 286 |
+
paper_title3 = gr.Textbox(label="From the reviewer's paper:", interactive=False, visible=False)
|
| 287 |
+
with gr.Column(scale=1):
|
| 288 |
+
affinity3 = gr.Number(label='Affinity', interactive=False, value=0, visible=False)
|
| 289 |
+
with gr.Row() as rel3_1:
|
| 290 |
+
with gr.Column(scale=1):
|
| 291 |
+
sent_pair_score3_1 = gr.Number(label='Sentence Relevance', interactive=False, value=0, visible=False)
|
| 292 |
+
with gr.Column(scale=4):
|
| 293 |
+
sent_pair_source3_1 = gr.Textbox(label='Sentence from Submission', visible=False)
|
| 294 |
+
sent_pair_source3_1_hl = gr.components.Interpretation(sent_pair_source3_1)
|
| 295 |
+
with gr.Column(scale=4):
|
| 296 |
+
sent_pair_candidate3_1 = gr.Textbox(label='Sentence from Submission', visible=False)
|
| 297 |
+
sent_pair_candidate3_1_hl = gr.components.Interpretation(sent_pair_candidate3_1)
|
| 298 |
+
with gr.Row() as rel3_2:
|
| 299 |
+
with gr.Column(scale=1):
|
| 300 |
+
sent_pair_score3_2 = gr.Number(label='Sentence Relevance', interactive=False, value=0, visible=False)
|
| 301 |
+
with gr.Column(scale=4):
|
| 302 |
+
sent_pair_source3_2 = gr.Textbox(label='Sentence from Submission', visible=False)
|
| 303 |
+
sent_pair_source3_2_hl = gr.components.Interpretation(sent_pair_source3_2)
|
| 304 |
+
with gr.Column(scale=4):
|
| 305 |
+
sent_pair_candidate3_2 = gr.Textbox(label='Sentence from Submission', visible=False)
|
| 306 |
+
sent_pair_candidate3_2_hl = gr.components.Interpretation(sent_pair_candidate3_2)
|
| 307 |
+
|
| 308 |
+
## Show more button
|
| 309 |
+
with gr.Row():
|
| 310 |
+
see_more_rel_btn = gr.Button('See more relevant parts from papers', visible=False)
|
| 311 |
|
| 312 |
### PAPER INFORMATION
|
| 313 |
+
|
| 314 |
# show multiple papers in radio check box to select from
|
| 315 |
with gr.Row():
|
| 316 |
selected_papers_radio = gr.Radio(
|
|
|
|
| 327 |
affinity= gr.Number(label='Affinity', interactive=False, value=0)
|
| 328 |
with gr.Row():
|
| 329 |
paper_abstract = gr.Textbox(label='Abstract', interactive=False, visible=False)
|
| 330 |
+
|
|
|
|
|
|
|
| 331 |
### RELEVANT PARTS (HIGHLIGHTS)
|
| 332 |
with gr.Row():
|
| 333 |
with gr.Column(scale=2): # text from submission
|
|
|
|
| 341 |
|
| 342 |
### EVENT LISTENERS
|
| 343 |
|
| 344 |
+
# retrieve similar papers and show top results
|
| 345 |
compute_btn.click(
|
| 346 |
fn=get_similar_paper,
|
| 347 |
inputs=[
|
|
|
|
| 349 |
pdf_file_input,
|
| 350 |
author_id_input
|
| 351 |
],
|
| 352 |
+
outputs=[
|
| 353 |
+
selected_papers_radio,
|
| 354 |
+
source_sentences,
|
| 355 |
+
paper_title1, # paper info
|
| 356 |
+
affinity1,
|
| 357 |
+
sent_pair_score1_1,
|
| 358 |
+
sent_pair_source1_1,
|
| 359 |
+
sent_pair_source1_1_hl,
|
| 360 |
+
sent_pair_candidate1_1,
|
| 361 |
+
sent_pair_candidate1_1_hl,
|
| 362 |
+
sent_pair_score1_2,
|
| 363 |
+
sent_pair_source1_2,
|
| 364 |
+
sent_pair_source1_2_hl,
|
| 365 |
+
sent_pair_candidate1_2,
|
| 366 |
+
sent_pair_candidate1_2_hl,
|
| 367 |
+
paper_title2,
|
| 368 |
+
affinity2,
|
| 369 |
+
sent_pair_score2_1,
|
| 370 |
+
sent_pair_source2_1,
|
| 371 |
+
sent_pair_source2_1_hl,
|
| 372 |
+
sent_pair_candidate2_1,
|
| 373 |
+
sent_pair_candidate2_1_hl,
|
| 374 |
+
sent_pair_score2_2,
|
| 375 |
+
sent_pair_source2_2,
|
| 376 |
+
sent_pair_source2_2_hl,
|
| 377 |
+
sent_pair_candidate2_2,
|
| 378 |
+
sent_pair_candidate2_2_hl,
|
| 379 |
+
paper_title3,
|
| 380 |
+
affinity3,
|
| 381 |
+
sent_pair_score3_1,
|
| 382 |
+
sent_pair_source3_1,
|
| 383 |
+
sent_pair_source3_1_hl,
|
| 384 |
+
sent_pair_candidate3_1,
|
| 385 |
+
sent_pair_candidate3_1_hl,
|
| 386 |
+
sent_pair_score3_2,
|
| 387 |
+
sent_pair_source3_2,
|
| 388 |
+
sent_pair_source3_2_hl,
|
| 389 |
+
sent_pair_candidate3_2,
|
| 390 |
+
sent_pair_candidate3_2_hl,
|
| 391 |
+
see_more_rel_btn
|
| 392 |
+
]
|
| 393 |
+
)
|
| 394 |
+
|
| 395 |
+
# Get more info (move to more interactive portion)
|
| 396 |
+
see_more_rel_btn.click(
|
| 397 |
+
fn=show_more,
|
| 398 |
+
inputs=None,
|
| 399 |
outputs=[
|
| 400 |
selected_papers_radio,
|
| 401 |
source_sentences,
|
| 402 |
title_row,
|
| 403 |
paper_abstract
|
| 404 |
]
|
| 405 |
+
)
|
| 406 |
|
| 407 |
# change highlight based on selected sentences from submission
|
| 408 |
source_sentences.change(
|
score.py
CHANGED
|
@@ -6,7 +6,6 @@ import numpy as np
|
|
| 6 |
import tqdm
|
| 7 |
|
| 8 |
def compute_sentencewise_scores(model, query_sents, candidate_sents):
|
| 9 |
-
# TODO make this more general for different types of models
|
| 10 |
# list of sentences from query and candidate
|
| 11 |
q_v, c_v = get_embedding(model, query_sents, candidate_sents)
|
| 12 |
|
|
@@ -74,8 +73,10 @@ def get_match_phrase(w1, w2, method='pos'):
|
|
| 74 |
pos2 = pos_tag(w2)
|
| 75 |
for i, (w, p) in enumerate(pos2):
|
| 76 |
if w.lower() in w1 and p in include:
|
|
|
|
| 77 |
mask2[i] = 1
|
| 78 |
-
|
|
|
|
| 79 |
|
| 80 |
def mark_words(query_sents, words, all_words, sent_start_id, sent_ids, sent_scores):
|
| 81 |
"""
|
|
@@ -102,12 +103,12 @@ def mark_words(query_sents, words, all_words, sent_start_id, sent_ids, sent_scor
|
|
| 102 |
sent_range = (sent_start_id[sid], sent_start_id[sid+1])
|
| 103 |
is_selected_sent[sent_range[0]:sent_range[1]] = 1
|
| 104 |
word_scores[sent_range[0]:sent_range[1]] = sscore
|
| 105 |
-
is_selected_phrase[sent_range[0]:sent_range[1]] = \
|
| 106 |
get_match_phrase(query_words, all_words[sent_range[0]:sent_range[1]])
|
| 107 |
else:
|
| 108 |
is_selected_sent[sent_start_id[sid]:] = 1
|
| 109 |
word_scores[sent_start_id[sid]:] = sscore
|
| 110 |
-
is_selected_phrase[sent_start_id[sid]:] = \
|
| 111 |
get_match_phrase(query_words, all_words[sent_start_id[sid]:])
|
| 112 |
|
| 113 |
# update selected phrase scores (-1 meaning a different color in gradio)
|
|
@@ -135,7 +136,42 @@ def get_highlight_info(model, text1, text2, K=None):
|
|
| 135 |
words2, all_words2, sent_start_id2 = get_words(sent2)
|
| 136 |
info = mark_words(sent1, words2, all_words2, sent_start_id2, sent_ids, sent_scores)
|
| 137 |
|
| 138 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 139 |
|
| 140 |
### Document-level operations
|
| 141 |
|
|
@@ -194,4 +230,4 @@ def compute_document_score(doc_model, tokenizer, query, papers, batch=5):
|
|
| 194 |
abstracts_sorted = [abstracts[x] for x in idx_sorted]
|
| 195 |
scores_sorted = [scores[x] for x in idx_sorted]
|
| 196 |
|
| 197 |
-
return titles_sorted, abstracts_sorted, scores_sorted
|
|
|
|
| 6 |
import tqdm
|
| 7 |
|
| 8 |
def compute_sentencewise_scores(model, query_sents, candidate_sents):
|
|
|
|
| 9 |
# list of sentences from query and candidate
|
| 10 |
q_v, c_v = get_embedding(model, query_sents, candidate_sents)
|
| 11 |
|
|
|
|
| 73 |
pos2 = pos_tag(w2)
|
| 74 |
for i, (w, p) in enumerate(pos2):
|
| 75 |
if w.lower() in w1 and p in include:
|
| 76 |
+
j = w1.index(w.lower())
|
| 77 |
mask2[i] = 1
|
| 78 |
+
mask1[j] = 1
|
| 79 |
+
return mask1, mask2
|
| 80 |
|
| 81 |
def mark_words(query_sents, words, all_words, sent_start_id, sent_ids, sent_scores):
|
| 82 |
"""
|
|
|
|
| 103 |
sent_range = (sent_start_id[sid], sent_start_id[sid+1])
|
| 104 |
is_selected_sent[sent_range[0]:sent_range[1]] = 1
|
| 105 |
word_scores[sent_range[0]:sent_range[1]] = sscore
|
| 106 |
+
_, is_selected_phrase[sent_range[0]:sent_range[1]] = \
|
| 107 |
get_match_phrase(query_words, all_words[sent_range[0]:sent_range[1]])
|
| 108 |
else:
|
| 109 |
is_selected_sent[sent_start_id[sid]:] = 1
|
| 110 |
word_scores[sent_start_id[sid]:] = sscore
|
| 111 |
+
_, is_selected_phrase[sent_start_id[sid]:] = \
|
| 112 |
get_match_phrase(query_words, all_words[sent_start_id[sid]:])
|
| 113 |
|
| 114 |
# update selected phrase scores (-1 meaning a different color in gradio)
|
|
|
|
| 136 |
words2, all_words2, sent_start_id2 = get_words(sent2)
|
| 137 |
info = mark_words(sent1, words2, all_words2, sent_start_id2, sent_ids, sent_scores)
|
| 138 |
|
| 139 |
+
# get top sentence pairs from the query and candidate (score, index_pair)
|
| 140 |
+
top_pair_num = 5
|
| 141 |
+
top_pairs = []
|
| 142 |
+
ii = np.unravel_index(np.argsort(np.array(sent_scores).ravel())[-top_pair_num:], sent_scores.shape)
|
| 143 |
+
for i, j in zip(ii[0][::-1], ii[1][::-1]):
|
| 144 |
+
score = sent_scores[i,j]
|
| 145 |
+
index_pair = (i, sent_ids[i,j].item())
|
| 146 |
+
top_pairs.append((score, index_pair)) # list of (score, (sent_id_query, sent_id_candidate))
|
| 147 |
+
|
| 148 |
+
# convert top_pairs to the corresponding highlight format for the Gradio Interpretation component
|
| 149 |
+
top_pairs_info = dict()
|
| 150 |
+
count = 0
|
| 151 |
+
for s, (sidq, sidc) in top_pairs:
|
| 152 |
+
q_sent = sent1[sidq]
|
| 153 |
+
c_sent = sent2[sidc]
|
| 154 |
+
q_words = word_tokenize(q_sent)
|
| 155 |
+
c_words = word_tokenize(c_sent)
|
| 156 |
+
mask1, mask2 = get_match_phrase(q_words, c_words)
|
| 157 |
+
mask1 *= -1 # mark matching phrases as blue
|
| 158 |
+
mask2 *= -1
|
| 159 |
+
assert(len(mask1) == len(q_words) and len(mask2) == len(c_words))
|
| 160 |
+
top_pairs_info[count] = {
|
| 161 |
+
'query': {
|
| 162 |
+
'original': q_sent,
|
| 163 |
+
'interpretation': list(zip(q_words, mask1))
|
| 164 |
+
},
|
| 165 |
+
'candidate': {
|
| 166 |
+
'original': c_sent,
|
| 167 |
+
'interpretation': list(zip(c_words, mask2))
|
| 168 |
+
},
|
| 169 |
+
'score': s,
|
| 170 |
+
'sent_idx': (sidq, sidc)
|
| 171 |
+
}
|
| 172 |
+
count += 1
|
| 173 |
+
|
| 174 |
+
return sent_ids, sent_scores, info, top_pairs_info
|
| 175 |
|
| 176 |
### Document-level operations
|
| 177 |
|
|
|
|
| 230 |
abstracts_sorted = [abstracts[x] for x in idx_sorted]
|
| 231 |
scores_sorted = [scores[x] for x in idx_sorted]
|
| 232 |
|
| 233 |
+
return titles_sorted, abstracts_sorted, scores_sorted
|