Update curated.py
Browse files- curated.py +48 -0
curated.py
CHANGED
|
@@ -694,6 +694,53 @@ def get_chart_28168342():
|
|
| 694 |
return fig
|
| 695 |
|
| 696 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 697 |
def update(target: str, request):
|
| 698 |
params = request.query_params
|
| 699 |
if data_source := params.get(f"data_source_{target}"):
|
|
@@ -836,6 +883,7 @@ def curated(request):
|
|
| 836 |
table_desc,
|
| 837 |
data_preprocessing_div,
|
| 838 |
plotly2fasthtml(get_chart_28168342()),
|
|
|
|
| 839 |
H2("Curated Sources Processing"),
|
| 840 |
filtering_process,
|
| 841 |
data_preparation_div,
|
|
|
|
| 694 |
return fig
|
| 695 |
|
| 696 |
|
| 697 |
+
def get_chart_new():
    """Build a funnel figure with one horizontal trace per curated source.

    Every trace shares the same ordered list of filter-stage labels on the
    y axis and carries that source's nine hard-coded values on the x axis.

    Returns:
        The populated plotly figure (height 500, transparent plot
        background).
    """
    # Stage labels, in the order the values below are listed.
    stage_labels = [
        "Download",
        "Language",
        "Min word count",
        "Title Abstract",
        "Majority language",
        "Paragraph count",
        "Frequency",
        "Unigram log probability",
        "Local dedup",
    ]

    # One entry per source; each list lines up index-for-index with
    # ``stage_labels``. Insertion order is the trace order in the figure.
    # NOTE(review): presumably these are document counts per filter stage,
    # and the trailing 20 is identical for every source — confirm whether it
    # is a placeholder.
    counts_by_source = {
        "Wikipedia": [61614907, 0, 1146416, 0, 0, 0, 0, 0, 20],
        "Freelaw": [75971288, 2280522, 5518932, 0, 0, 0, 0, 48660, 20],
        "DM Maths": [112559888, 0, 0, 0, 0, 0, 0, 0, 20],
        "USPTO": [6880276, 1312, 129042, 0, 0, 0, 0, 533, 20],
        "PG19": [28752, 69, 1, 0, 0, 0, 0, 50, 20],
        "Hackernews": [2064931, 54129, 314, 0, 0, 0, 0, 6852, 20],
        "Ubuntu IRC": [37966, 14465, 33, 0, 0, 0, 0, 263, 20],
        "Europarl": [69814, 0, 0, 0, 0, 0, 0, 0, 20],
        "StackExchange": [23246548, 0, 196, 0, 0, 0, 0, 0, 20],
        "Arxiv": [1911867, 42426, 105601, 0, 0, 0, 0, 1179, 20],
        "S2ORC": [12963563, 0, 0, 2232450, 1275493, 148804, 1251669, 0, 20],
        "S2ORC Abstract": [102324176, 18456575, 978308, 0, 0, 0, 0, 111381, 20],
        "PubMed Central": [5230932, 400446, 62176, 0, 0, 0, 0, 836, 20],
        "PubMed Central Abstract": [25787474, 3100, 36419, 0, 0, 0, 0, 1231, 20],
        "PhilPapers": [49389, 10214, 0, 0, 0, 0, 0, 47, 20],
    }

    chart = go.Figure()
    for source_name, stage_values in counts_by_source.items():
        funnel_trace = go.Funnel(
            name=source_name,
            orientation="h",
            y=stage_labels,
            x=stage_values,
            # Show the raw value plus its share of the trace total.
            textinfo="value+percent total",
            textposition="inside",
        )
        chart.add_trace(funnel_trace)

    chart.update_layout(height=500, plot_bgcolor="rgba(0,0,0,0)")
    return chart
|
| 743 |
+
|
| 744 |
def update(target: str, request):
|
| 745 |
params = request.query_params
|
| 746 |
if data_source := params.get(f"data_source_{target}"):
|
|
|
|
| 883 |
table_desc,
|
| 884 |
data_preprocessing_div,
|
| 885 |
plotly2fasthtml(get_chart_28168342()),
|
| 886 |
+
plotly2fasthtml(get_chart_new()),
|
| 887 |
H2("Curated Sources Processing"),
|
| 888 |
filtering_process,
|
| 889 |
data_preparation_div,
|