Update curated.py
Browse files- curated.py +60 -0
curated.py
CHANGED
|
@@ -436,6 +436,35 @@ s2o_filter = pd.DataFrame(
|
|
| 436 |
table_html_s2o = s2o_filter.to_html(index=False, border=0)
|
| 437 |
table_div_s2o = Div(NotStr(table_html_s2o))
|
| 438 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 439 |
med_filter = pd.DataFrame(
|
| 440 |
{
|
| 441 |
"Dataset": [
|
|
@@ -465,6 +494,35 @@ med_filter = pd.DataFrame(
|
|
| 465 |
table_html_med = med_filter.to_html(index=False, border=0)
|
| 466 |
table_div_med = Div(NotStr(table_html_med))
|
| 467 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 468 |
phil_filter = pd.DataFrame(
|
| 469 |
{
|
| 470 |
"Dataset": [
|
|
@@ -855,6 +913,7 @@ filtering_process = Div(
|
|
| 855 |
style="margin-bottom: -3px",
|
| 856 |
),
|
| 857 |
),
|
|
|
|
| 858 |
#Details(
|
| 859 |
# Summary("S2ORC Abstract Filtering Examples "),
|
| 860 |
# Div(
|
|
@@ -914,6 +973,7 @@ filtering_process = Div(
|
|
| 914 |
),
|
| 915 |
),
|
| 916 |
table_div_med,
|
|
|
|
| 917 |
Details(
|
| 918 |
Summary("PubMed Filtering Examples"),
|
| 919 |
Div(
|
|
|
|
| 436 |
table_html_s2o = s2o_filter.to_html(index=False, border=0)
|
| 437 |
table_div_s2o = Div(NotStr(table_html_s2o))
|
| 438 |
|
| 439 |
+
# Filtering summary for the "S2ORC Abstract" dataset: a single-row DataFrame
# with one column per filtering stage. Every value is stored as a string
# (including the line count and the percentages), so the table is display-only.
s2oa_filter = pd.DataFrame(
|
| 440 |
+
{
|
| 441 |
+
"Dataset": [
|
| 442 |
+
"S2ORC Abstract",
|
| 443 |
+
],
|
| 444 |
+
"Lines Downloaded": [
|
| 445 |
+
"102324176",
|
| 446 |
+
],
|
| 447 |
+
"Percent Removed After Language Filter": [
|
| 448 |
+
"18.04%",
|
| 449 |
+
],
|
| 450 |
+
"Percent Removed After Min Word Count Filter": [
|
| 451 |
+
"1.17%",
|
| 452 |
+
],
|
| 453 |
+
"Percent Removed After Unigram Probability Filter": [
|
| 454 |
+
"0.00%",
|
| 455 |
+
],
|
| 456 |
+
"Percent Removed After Local Dedup": [
|
| 457 |
+
"0.13%",
|
| 458 |
+
],
|
| 459 |
+
# Sanity check: 100% - (18.04 + 1.17 + 0.00 + 0.13)% = 80.66%, which matches
# the literal below, so the per-stage figures read as additive.
"Total Percentage Remaining": [
|
| 460 |
+
"80.66%",
|
| 461 |
+
],
|
| 462 |
+
}
|
| 463 |
+
)
|
| 464 |
+
|
| 465 |
+
# Render the one-row table to an HTML string; index=False leaves the row index
# out of the output.
table_html_s2oa = s2oa_filter.to_html(index=False, border=0)
|
| 466 |
+
# Wrap the rendered HTML in a Div for the page layout. NotStr is presumably the
# FastHTML raw-markup wrapper (so the HTML is not escaped) - confirm against
# this file's imports.
table_div_s2oa = Div(NotStr(table_html_s2oa))
|
| 467 |
+
|
| 468 |
med_filter = pd.DataFrame(
|
| 469 |
{
|
| 470 |
"Dataset": [
|
|
|
|
| 494 |
table_html_med = med_filter.to_html(index=False, border=0)
|
| 495 |
table_div_med = Div(NotStr(table_html_med))
|
| 496 |
|
| 497 |
+
# Filtering summary for the "PubMed - Abstract" dataset: a single-row DataFrame
# with one column per filtering stage. Every value is stored as a string
# (including the line count and the percentages), so the table is display-only.
pma_filter = pd.DataFrame(
|
| 498 |
+
{
|
| 499 |
+
"Dataset": [
|
| 500 |
+
"PubMed - Abstract",
|
| 501 |
+
],
|
| 502 |
+
"Lines Downloaded": [
|
| 503 |
+
"25787474",
|
| 504 |
+
],
|
| 505 |
+
"Percent Removed After Language Filter": [
|
| 506 |
+
"0.01%",
|
| 507 |
+
],
|
| 508 |
+
"Percent Removed After Min Word Count Filter": [
|
| 509 |
+
"0.14%",
|
| 510 |
+
],
|
| 511 |
+
"Percent Removed After Unigram Probability Filter": [
|
| 512 |
+
"0.00%",
|
| 513 |
+
],
|
| 514 |
+
"Percent Removed After Local Dedup": [
|
| 515 |
+
"0.00%",
|
| 516 |
+
],
|
| 517 |
+
# NOTE(review): the four removal figures above sum to 0.15%, which would leave
# 99.85% if the stages are additive, yet the literal below is "98.85%".
# Confirm against the pipeline statistics - either this total or one of the
# per-stage percentages may be a typo, or the stages may not be additive.
"Total Percentage Remaining": [
|
| 518 |
+
"98.85%",
|
| 519 |
+
],
|
| 520 |
+
}
|
| 521 |
+
)
|
| 522 |
+
|
| 523 |
+
# Render the one-row table to an HTML string; index=False leaves the row index
# out of the output.
table_html_pma = pma_filter.to_html(index=False, border=0)
|
| 524 |
+
# Wrap the rendered HTML in a Div for the page layout. NotStr is presumably the
# FastHTML raw-markup wrapper (so the HTML is not escaped) - confirm against
# this file's imports.
table_div_pma = Div(NotStr(table_html_pma))
|
| 525 |
+
|
| 526 |
phil_filter = pd.DataFrame(
|
| 527 |
{
|
| 528 |
"Dataset": [
|
|
|
|
| 913 |
style="margin-bottom: -3px",
|
| 914 |
),
|
| 915 |
),
|
| 916 |
+
table_div_s2oa,
|
| 917 |
#Details(
|
| 918 |
# Summary("S2ORC Abstract Filtering Examples "),
|
| 919 |
# Div(
|
|
|
|
| 973 |
),
|
| 974 |
),
|
| 975 |
table_div_med,
|
| 976 |
+
table_div_pma,
|
| 977 |
Details(
|
| 978 |
Summary("PubMed Filtering Examples"),
|
| 979 |
Div(
|