Spaces:
Runtime error
Runtime error
better output formatting (removing spaces around punctuation)
Browse files
app.py
CHANGED
|
@@ -87,9 +87,11 @@ def get_similar_paper(
|
|
| 87 |
# get scores for each word in the format for Gradio Interpretation component
|
| 88 |
word_scores = dict()
|
| 89 |
for i in range(num_sents):
|
|
|
|
|
|
|
| 90 |
word_scores[str(i)] = {
|
| 91 |
"original": ab,
|
| 92 |
-
"interpretation": list(zip(
|
| 93 |
}
|
| 94 |
|
| 95 |
results[display_title[aa]] = {
|
|
|
|
| 87 |
# get scores for each word in the format for Gradio Interpretation component
|
| 88 |
word_scores = dict()
|
| 89 |
for i in range(num_sents):
|
| 90 |
+
|
| 91 |
+
ww, ss = remove_spaces(info['all_words'], info[i]['scores'])
|
| 92 |
word_scores[str(i)] = {
|
| 93 |
"original": ab,
|
| 94 |
+
"interpretation": list(zip(ww, ss))
|
| 95 |
}
|
| 96 |
|
| 97 |
results[display_title[aa]] = {
|
score.py
CHANGED
|
@@ -78,6 +78,77 @@ def get_match_phrase(w1, w2, method='pos'):
|
|
| 78 |
mask1[j] = 1
|
| 79 |
return mask1, mask2
|
| 80 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
def mark_words(query_sents, words, all_words, sent_start_id, sent_ids, sent_scores):
|
| 82 |
"""
|
| 83 |
Mark the words that are highlighted, both by in terms of sentence and phrase
|
|
@@ -158,6 +229,11 @@ def get_highlight_info(model, text1, text2, K=None):
|
|
| 158 |
mask1 *= -sc # mark matching phrases as blue (-1: darkest)
|
| 159 |
mask2 *= -sc # mark matching phrases as blue
|
| 160 |
assert(len(mask1) == len(q_words) and len(mask2) == len(c_words))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 161 |
top_pairs_info[count] = {
|
| 162 |
'query': {
|
| 163 |
'original': q_sent,
|
|
|
|
| 78 |
mask1[j] = 1
|
| 79 |
return mask1, mask2
|
| 80 |
|
| 81 |
+
def remove_spaces(words, attrs):
    """
    Merge tokenizer-separated punctuation back onto its neighbouring word.

    Makes the output more readable by removing the spacing that the
    tokenizer introduces, e.g.
      1. spacing for parentheses
      2. spacing for single/double quotations
      3. spacing for commas, periods and similar trailing punctuation
      4. spacing for possessive quotations

    Args:
        words: sequence of token strings.
        attrs: sequence of per-token attribute values, one per entry in
            `words` (callers visible in this file pass highlight masks /
            scores).

    Returns:
        A tuple `(word_out, attr_out)` of two equally long lists. Each merged
        word keeps the attribute of the *word* it was glued to, never the
        attribute of the punctuation token.

    Edge cases (these raised IndexError before):
        * a trailing-punctuation token or closing quote with no word before
          it is emitted unchanged with its own attribute;
        * an opening parenthesis/quote that is the very last token is emitted
          unchanged with its own attribute;
        * a single quote at index 0 is never treated as possessive (the
          previous-word lookup no longer wraps around to the last word).
    """
    assert len(words) == len(attrs)

    # tokens that stick to the word that appears right before
    attach_to_previous = (',', '.', '%', ')', ':', '?', ';', "'s")
    # tokens that stick to the word that appears right after
    attach_to_next = ('(',)

    word_out, attr_out = [], []
    num_words = len(words)

    def glue_to_previous(i):
        # Append token i to the last emitted word; that word's attr is kept.
        if word_out:
            word_out[-1] = word_out[-1] + words[i]
        else:
            # nothing to attach to: keep the token as its own entry
            word_out.append(words[i])
            attr_out.append(attrs[i])
        return i + 1

    def glue_to_next(i):
        # Prefix token i onto token i+1 and consume both; attr of i+1 is kept.
        if i + 1 < num_words:
            word_out.append(words[i] + words[i + 1])
            attr_out.append(attrs[i + 1])
            return i + 2
        # no following word: keep the token as its own entry
        word_out.append(words[i])
        attr_out.append(attrs[i])
        return i + 1

    # The counters track whether the next quote seen is opening (count
    # becomes 1) or closing (count becomes 2, then reset).
    idx, single_q, double_q = 0, 0, 0
    while idx < num_words:
        token = words[idx]
        if token in attach_to_previous:
            idx = glue_to_previous(idx)
        elif token in attach_to_next:
            idx = glue_to_next(idx)
        elif token == '"':
            double_q += 1
            if double_q == 2:
                # closing quote: stick to the word before
                idx = glue_to_previous(idx)
                double_q = 0
            else:
                # opening quote: stick to the word after
                idx = glue_to_next(idx)
        elif token == "'":
            single_q += 1
            # possessive quote: the previous token ends with 's'
            # (only meaningful when there IS a previous token)
            is_possessive = idx > 0 and words[idx - 1].endswith('s')
            if single_q == 2 or is_possessive:
                # closing or possessive quote: stick to the word before,
                # reset counter
                idx = glue_to_previous(idx)
                single_q = 0
            else:
                # opening quote: stick to the word after
                idx = glue_to_next(idx)
        else:
            # ordinary word: copy through unchanged
            word_out.append(token)
            attr_out.append(attrs[idx])
            idx += 1

    assert len(word_out) == len(attr_out)
    return word_out, attr_out
|
| 151 |
+
|
| 152 |
def mark_words(query_sents, words, all_words, sent_start_id, sent_ids, sent_scores):
|
| 153 |
"""
|
| 154 |
Mark the words that are highlighted, both by in terms of sentence and phrase
|
|
|
|
| 229 |
mask1 *= -sc # mark matching phrases as blue (-1: darkest)
|
| 230 |
mask2 *= -sc # mark matching phrases as blue
|
| 231 |
assert(len(mask1) == len(q_words) and len(mask2) == len(c_words))
|
| 232 |
+
|
| 233 |
+
# spacing
|
| 234 |
+
q_words, mask1 = remove_spaces(q_words, mask1)
|
| 235 |
+
c_words, mask2 = remove_spaces(c_words, mask2)
|
| 236 |
+
|
| 237 |
top_pairs_info[count] = {
|
| 238 |
'query': {
|
| 239 |
'original': q_sent,
|