Spaces:
Build error
Build error
Parsed references are displayed in an HTML table. Colors for some labels
Browse files
app.py
CHANGED
|
@@ -4,6 +4,7 @@ import spacy
|
|
| 4 |
from spacy import displacy
|
| 5 |
|
| 6 |
from bib_tokenizers import create_references_tokenizer
|
|
|
|
| 7 |
|
| 8 |
|
| 9 |
nlp = spacy.load("en_bib_references_trf")
|
|
@@ -121,22 +122,51 @@ def split_up_references(
|
|
| 121 |
def text_analysis(text, is_eol_mode):
|
| 122 |
|
| 123 |
if not text:
|
| 124 |
-
return "<div style='max-width:100%;
|
| 125 |
-
|
| 126 |
-
html = ""
|
| 127 |
|
| 128 |
doc_with_linebreaks = split_up_references(
|
| 129 |
text, is_eol_mode=is_eol_mode, nlp=nlp, nlp_blank=nlp_blank
|
| 130 |
)
|
| 131 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 132 |
for i, sent in enumerate(doc_with_linebreaks.sents):
|
| 133 |
bib_item_doc = sent.as_doc()
|
| 134 |
-
bib_item_doc
|
| 135 |
-
html +=
|
| 136 |
|
| 137 |
html = (
|
| 138 |
-
"<div style='max-width:100%; max-height:720px; overflow:auto'>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 139 |
+ html
|
|
|
|
| 140 |
+ "</div>"
|
| 141 |
)
|
| 142 |
|
|
@@ -154,7 +184,7 @@ with demo:
|
|
| 154 |
is_eol_mode = gr.components.Checkbox(
|
| 155 |
label="My Unparsed Bibliography does not contain more than one reference per line (Multiline references are supported regardless of this choice)"
|
| 156 |
)
|
| 157 |
-
html = gr.components.HTML(label="Parsed
|
| 158 |
textbox.change(fn=text_analysis, inputs=[textbox, is_eol_mode], outputs=[html])
|
| 159 |
is_eol_mode.change(fn=text_analysis, inputs=[textbox, is_eol_mode], outputs=[html])
|
| 160 |
|
|
@@ -189,8 +219,7 @@ CFR
|
|
| 189 |
[Knu] Donald Knuth. Knuth: Computers and typesetting."""
|
| 190 |
],
|
| 191 |
[
|
| 192 |
-
"""
|
| 193 |
-
Bartkiewicz, A., Szymczak, M., Cohen, R. J., & Richards, A. M. S. 2005, MN- RAS, 361, 623
|
| 194 |
Bartkiewicz, A., Szymczak, M., & van Langevelde, H. J. 2016, A&A, 587, A104
|
| 195 |
Benjamin, R. A., Churchwell, E., Babler, B. L., et al. 2003, PASP, 115, 953
|
| 196 |
Beuther, H., Mottram, J. C., Ahmadi, A., et al. 2018, A&A, 617, A100
|
|
|
|
| 4 |
from spacy import displacy
|
| 5 |
|
| 6 |
from bib_tokenizers import create_references_tokenizer
|
| 7 |
+
from schema import tags_ent
|
| 8 |
|
| 9 |
|
| 10 |
nlp = spacy.load("en_bib_references_trf")
|
|
|
|
| 122 |
def text_analysis(text, is_eol_mode):
|
| 123 |
|
| 124 |
if not text:
|
| 125 |
+
return "<div style='max-width:100%; overflow:auto; color:grey'><p>Unparsed Bibliography Section is empty</p></div>"
|
|
|
|
|
|
|
| 126 |
|
| 127 |
doc_with_linebreaks = split_up_references(
|
| 128 |
text, is_eol_mode=is_eol_mode, nlp=nlp, nlp_blank=nlp_blank
|
| 129 |
)
|
| 130 |
|
| 131 |
+
html = ""
|
| 132 |
+
options = {
|
| 133 |
+
"ents": tags_ent,
|
| 134 |
+
"colors": {
|
| 135 |
+
"citation-number": "yellow",
|
| 136 |
+
"citation-label": "yellow",
|
| 137 |
+
"family": "DeepSkyBlue",
|
| 138 |
+
"given": "LightSkyBlue",
|
| 139 |
+
"title": "PeachPuff",
|
| 140 |
+
"container-title": "Moccasin",
|
| 141 |
+
"publisher": "PaleTurquoise",
|
| 142 |
+
"issued": "Gold",
|
| 143 |
+
},
|
| 144 |
+
}
|
| 145 |
for i, sent in enumerate(doc_with_linebreaks.sents):
|
| 146 |
bib_item_doc = sent.as_doc()
|
| 147 |
+
ref = displacy.render(bib_item_doc, style="ent", options=options)
|
| 148 |
+
html += f"<tr><td>{i}</td><td>{ref}</td></tr>"
|
| 149 |
|
| 150 |
html = (
|
| 151 |
+
"""<div style='max-width:100%; max-height:720px; overflow:auto'>
|
| 152 |
+
<style>table {
|
| 153 |
+
font-family: arial, sans-serif;
|
| 154 |
+
border-collapse: collapse;
|
| 155 |
+
width: 100%;
|
| 156 |
+
}
|
| 157 |
+
|
| 158 |
+
td, th {
|
| 159 |
+
border: 1px solid #b0b0b0;
|
| 160 |
+
text-align: left;
|
| 161 |
+
padding: 8px;
|
| 162 |
+
}
|
| 163 |
+
|
| 164 |
+
tr:nth-child(even) {
|
| 165 |
+
background-color: #f2f2f2;
|
| 166 |
+
}</style>"""
|
| 167 |
+
+ "<table><tr><th>Index</th><th>Parsed Reference</th></tr>"
|
| 168 |
+ html
|
| 169 |
+
+ "</table>"
|
| 170 |
+ "</div>"
|
| 171 |
)
|
| 172 |
|
|
|
|
| 184 |
is_eol_mode = gr.components.Checkbox(
|
| 185 |
label="My Unparsed Bibliography does not contain more than one reference per line (Multiline references are supported regardless of this choice)"
|
| 186 |
)
|
| 187 |
+
html = gr.components.HTML(label="Parsed References")
|
| 188 |
textbox.change(fn=text_analysis, inputs=[textbox, is_eol_mode], outputs=[html])
|
| 189 |
is_eol_mode.change(fn=text_analysis, inputs=[textbox, is_eol_mode], outputs=[html])
|
| 190 |
|
|
|
|
| 219 |
[Knu] Donald Knuth. Knuth: Computers and typesetting."""
|
| 220 |
],
|
| 221 |
[
|
| 222 |
+
"""Bartkiewicz, A., Szymczak, M., Cohen, R. J., & Richards, A. M. S. 2005, MN- RAS, 361, 623
|
|
|
|
| 223 |
Bartkiewicz, A., Szymczak, M., & van Langevelde, H. J. 2016, A&A, 587, A104
|
| 224 |
Benjamin, R. A., Churchwell, E., Babler, B. L., et al. 2003, PASP, 115, 953
|
| 225 |
Beuther, H., Mottram, J. C., Ahmadi, A., et al. 2018, A&A, 617, A100
|
schema.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Non-overlapping spans generated by CSL. They can be treated as annotations for the NER task.
|
| 2 |
+
tags_ent = [
|
| 3 |
+
"citation-number",
|
| 4 |
+
"citation-label",
|
| 5 |
+
"family",
|
| 6 |
+
"given",
|
| 7 |
+
"title",
|
| 8 |
+
"container-title",
|
| 9 |
+
"issued",
|
| 10 |
+
"url",
|
| 11 |
+
"publisher",
|
| 12 |
+
"page",
|
| 13 |
+
"doi",
|
| 14 |
+
"publisher-place",
|
| 15 |
+
"number-of-pages",
|
| 16 |
+
"collection-title",
|
| 17 |
+
"collection-number",
|
| 18 |
+
"genre",
|
| 19 |
+
"authority",
|
| 20 |
+
"URL",
|
| 21 |
+
"DOI",
|
| 22 |
+
"volume",
|
| 23 |
+
# "title-short" is a valid tag, but only a single instance of it ended up in the dataset...
|
| 24 |
+
"number",
|
| 25 |
+
"note",
|
| 26 |
+
"archive",
|
| 27 |
+
"archive_location",
|
| 28 |
+
]
|
| 29 |
+
|
| 30 |
+
# Spans that may enclose other annotated spans. spaCy allows overlapping spans to be stored in doc.spans.
|
| 31 |
+
tags_span = [
|
| 32 |
+
"author",
|
| 33 |
+
"year",
|
| 34 |
+
"month",
|
| 35 |
+
"day",
|
| 36 |
+
"issued",
|
| 37 |
+
"url",
|
| 38 |
+
"bib",
|
| 39 |
+
] + tags_ent
|
| 40 |
+
|
| 41 |
+
# Span tag used for adding sentence-boundary annotations: an annotated CSL style encloses each bib item with <bib>..</bib>
|
| 42 |
+
tag_sentence_start = "bib"
|
| 43 |
+
|
| 44 |
+
spankey_sentence_start = "sc"
|