Spaces:
Runtime error
Runtime error
Commit
·
693f997
1
Parent(s):
611e98e
filter on repetition removal
Browse files
- app.py +61 -12
- en_examples_with_stats.json +2 -2
- explanation_filtering_pipeline.pdf +0 -0
- filtering_pipeline_oscar.pdf +0 -0
- zh_examples_with_stats.json +0 -3
app.py
CHANGED
|
@@ -7,6 +7,7 @@ import os
|
|
| 7 |
import base64
|
| 8 |
import json
|
| 9 |
import pandas as pd
|
|
|
|
| 10 |
|
| 11 |
import numpy as np
|
| 12 |
|
|
@@ -32,7 +33,7 @@ class Visualization:
|
|
| 32 |
|
| 33 |
def preamble(self):
|
| 34 |
st.markdown(
|
| 35 |
-
"Before diving into this demo, you might want to take a look at how the filtering pipeline
|
| 36 |
)
|
| 37 |
|
| 38 |
def get_binary_file_downloader_html(bin_file, file_label="File"):
|
|
@@ -45,7 +46,7 @@ class Visualization:
|
|
| 45 |
st.markdown(
|
| 46 |
get_binary_file_downloader_html(
|
| 47 |
self.path_instructions,
|
| 48 |
-
"Download the filtering pipeline
|
| 49 |
),
|
| 50 |
unsafe_allow_html=True,
|
| 51 |
)
|
|
@@ -73,16 +74,17 @@ class Visualization:
|
|
| 73 |
doc["text"][: self.max_len_text_display]
|
| 74 |
+ " [...] [THIS LONG TEXT HAS BEEN TRUNCATED FOR DISPLAY REASONS]"
|
| 75 |
)
|
| 76 |
-
self.
|
|
|
|
| 77 |
|
| 78 |
def set_title(self):
|
| 79 |
-
st.title(f"{self.num_docs} {self.lang} documents
|
| 80 |
|
| 81 |
def filtering_of_docs(self):
|
| 82 |
st.sidebar.subheader("Parameters of the filtering on documents")
|
| 83 |
|
| 84 |
-
def set_sliders(
|
| 85 |
-
columns = list(docs)
|
| 86 |
keys = []
|
| 87 |
conds = {}
|
| 88 |
|
|
@@ -99,7 +101,7 @@ class Visualization:
|
|
| 99 |
|
| 100 |
if "number_words" in columns:
|
| 101 |
cutoff_def = "If the number of words of a document is lower than this number, the document is removed."
|
| 102 |
-
max_nb_words = int(np.max(docs["number_words"])) + 1
|
| 103 |
cutoff_min_number_words = st.sidebar.slider(
|
| 104 |
cutoff_def, 0, min(max_nb_words, 500), 0
|
| 105 |
)
|
|
@@ -119,6 +121,46 @@ class Visualization:
|
|
| 119 |
|
| 120 |
conds["number_words"] = [cond_1, cond_2]
|
| 121 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 122 |
if "special_characters_ratio" in columns:
|
| 123 |
cutoff_def = "If the special characters ratio of a document is higher than this number, the document is removed."
|
| 124 |
cutoff_special_characters_ratio = st.sidebar.slider(
|
|
@@ -169,7 +211,7 @@ class Visualization:
|
|
| 169 |
|
| 170 |
if "perplexity_score" in columns:
|
| 171 |
cutoff_def = "If the perplexity score of a document is higher than this number, the document is removed."
|
| 172 |
-
max_pp = int(np.max(docs["perplexity_score"])) + 1
|
| 173 |
cutoff_perplexity_score = st.sidebar.slider(
|
| 174 |
cutoff_def, 0, max_pp, max_pp
|
| 175 |
)
|
|
@@ -181,7 +223,7 @@ class Visualization:
|
|
| 181 |
|
| 182 |
return keys, conds
|
| 183 |
|
| 184 |
-
self.keys, conds = set_sliders(
|
| 185 |
|
| 186 |
all_conds = [subcond for cond in list(conds.values()) for subcond in cond]
|
| 187 |
all_conds = np.all(all_conds, axis=0)
|
|
@@ -215,6 +257,13 @@ class Visualization:
|
|
| 215 |
"Discarded documents for the filter on the number of words",
|
| 216 |
)
|
| 217 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 218 |
if "special_characters_ratio" in columns:
|
| 219 |
cond_filter = np.invert(
|
| 220 |
np.all(conds["special_characters_ratio"], axis=0)
|
|
@@ -360,9 +409,9 @@ class Visualization:
|
|
| 360 |
self.download_data()
|
| 361 |
|
| 362 |
|
| 363 |
-
path_instructions = "./
|
| 364 |
-
path_data = "./
|
| 365 |
-
lang = "
|
| 366 |
num_docs = 5000
|
| 367 |
num_docs_for_words = 500
|
| 368 |
max_len_text_display = 10000
|
|
|
|
| 7 |
import base64
|
| 8 |
import json
|
| 9 |
import pandas as pd
|
| 10 |
+
pd.options.mode.chained_assignment = None
|
| 11 |
|
| 12 |
import numpy as np
|
| 13 |
|
|
|
|
| 33 |
|
| 34 |
def preamble(self):
|
| 35 |
st.markdown(
|
| 36 |
+
"Before diving into this demo, you might want to take a look at how the filtering pipeline looks like in more detail."
|
| 37 |
)
|
| 38 |
|
| 39 |
def get_binary_file_downloader_html(bin_file, file_label="File"):
|
|
|
|
| 46 |
st.markdown(
|
| 47 |
get_binary_file_downloader_html(
|
| 48 |
self.path_instructions,
|
| 49 |
+
"Download the explanation of the filtering pipeline as pdf",
|
| 50 |
),
|
| 51 |
unsafe_allow_html=True,
|
| 52 |
)
|
|
|
|
| 74 |
doc["text"][: self.max_len_text_display]
|
| 75 |
+ " [...] [THIS LONG TEXT HAS BEEN TRUNCATED FOR DISPLAY REASONS]"
|
| 76 |
)
|
| 77 |
+
self.docs_checkpoint = pd.DataFrame(docs)
|
| 78 |
+
self.docs = self.docs_checkpoint
|
| 79 |
|
| 80 |
def set_title(self):
|
| 81 |
+
st.title(f"{self.num_docs} {self.lang} documents with their stats.")
|
| 82 |
|
| 83 |
def filtering_of_docs(self):
|
| 84 |
st.sidebar.subheader("Parameters of the filtering on documents")
|
| 85 |
|
| 86 |
+
def set_sliders():
|
| 87 |
+
columns = list(self.docs)
|
| 88 |
keys = []
|
| 89 |
conds = {}
|
| 90 |
|
|
|
|
| 101 |
|
| 102 |
if "number_words" in columns:
|
| 103 |
cutoff_def = "If the number of words of a document is lower than this number, the document is removed."
|
| 104 |
+
max_nb_words = int(np.max(self.docs["number_words"])) + 1
|
| 105 |
cutoff_min_number_words = st.sidebar.slider(
|
| 106 |
cutoff_def, 0, min(max_nb_words, 500), 0
|
| 107 |
)
|
|
|
|
| 121 |
|
| 122 |
conds["number_words"] = [cond_1, cond_2]
|
| 123 |
|
| 124 |
+
if "repetitions_ratio" in columns:
|
| 125 |
+
val_repetitions_lengths = list(
|
| 126 |
+
self.docs["repetitions_ratio"].iloc[0].keys()
|
| 127 |
+
)
|
| 128 |
+
default_index = (
|
| 129 |
+
val_repetitions_lengths.index("10")
|
| 130 |
+
if "10" in val_repetitions_lengths
|
| 131 |
+
else 0
|
| 132 |
+
)
|
| 133 |
+
label_selectbox = (
|
| 134 |
+
"Length of the repetitions (that will determine the repetitions ratio). "
|
| 135 |
+
"Choosing a higher or lower number does not mean that the filtering "
|
| 136 |
+
"is stronger or weaker. Be careful, choosing a low number (below 5 for languages like English) "
|
| 137 |
+
"tends to associate a high repetitions ratio to very long documents (like book chapters), but with "
|
| 138 |
+
"few or no repetitions, simply because their length gives them more diversity, and we do "
|
| 139 |
+
"not want to discard such documents."
|
| 140 |
+
)
|
| 141 |
+
repetitions_length = st.sidebar.selectbox(
|
| 142 |
+
label=label_selectbox,
|
| 143 |
+
options=val_repetitions_lengths,
|
| 144 |
+
index=default_index,
|
| 145 |
+
)
|
| 146 |
+
self.docs = self.docs_checkpoint
|
| 147 |
+
for i in range(len(self.docs["repetitions_ratio"])):
|
| 148 |
+
self.docs["repetitions_ratio"].iloc[i] = self.docs["repetitions_ratio"].iloc[i][repetitions_length]
|
| 149 |
+
|
| 150 |
+
cutoff_def = "If the repetitions ratio of a document is higher than this number, the document is removed."
|
| 151 |
+
cutoff_repetitions_ratio = st.sidebar.slider(
|
| 152 |
+
cutoff_def, 0.0, 1.0, 1.0, step=0.01
|
| 153 |
+
)
|
| 154 |
+
new_key = (
|
| 155 |
+
"repetitions_ratio",
|
| 156 |
+
cutoff_repetitions_ratio,
|
| 157 |
+
True,
|
| 158 |
+
)
|
| 159 |
+
keys.append(new_key)
|
| 160 |
+
cond = get_cond(new_key[0], new_key[1], new_key[2])
|
| 161 |
+
print_discared_by_cond(cond)
|
| 162 |
+
conds["repetitions_ratio"] = [cond]
|
| 163 |
+
|
| 164 |
if "special_characters_ratio" in columns:
|
| 165 |
cutoff_def = "If the special characters ratio of a document is higher than this number, the document is removed."
|
| 166 |
cutoff_special_characters_ratio = st.sidebar.slider(
|
|
|
|
| 211 |
|
| 212 |
if "perplexity_score" in columns:
|
| 213 |
cutoff_def = "If the perplexity score of a document is higher than this number, the document is removed."
|
| 214 |
+
max_pp = int(np.max(self.docs["perplexity_score"])) + 1
|
| 215 |
cutoff_perplexity_score = st.sidebar.slider(
|
| 216 |
cutoff_def, 0, max_pp, max_pp
|
| 217 |
)
|
|
|
|
| 223 |
|
| 224 |
return keys, conds
|
| 225 |
|
| 226 |
+
self.keys, conds = set_sliders()
|
| 227 |
|
| 228 |
all_conds = [subcond for cond in list(conds.values()) for subcond in cond]
|
| 229 |
all_conds = np.all(all_conds, axis=0)
|
|
|
|
| 257 |
"Discarded documents for the filter on the number of words",
|
| 258 |
)
|
| 259 |
|
| 260 |
+
if "repetitions_ratio" in columns:
|
| 261 |
+
cond_filter = np.invert(np.all(conds["repetitions_ratio"], axis=0))
|
| 262 |
+
display_dataset(
|
| 263 |
+
cond_filter,
|
| 264 |
+
"Discarded documents for the filter on the repetitions ratio",
|
| 265 |
+
)
|
| 266 |
+
|
| 267 |
if "special_characters_ratio" in columns:
|
| 268 |
cond_filter = np.invert(
|
| 269 |
np.all(conds["special_characters_ratio"], axis=0)
|
|
|
|
| 409 |
self.download_data()
|
| 410 |
|
| 411 |
|
| 412 |
+
path_instructions = "./explanation_filtering_pipeline.pdf"
|
| 413 |
+
path_data = "./en_examples_with_stats.json"
|
| 414 |
+
lang = "English"
|
| 415 |
num_docs = 5000
|
| 416 |
num_docs_for_words = 500
|
| 417 |
max_len_text_display = 10000
|
en_examples_with_stats.json
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:00106fc2a9d51bbc78ce1ca2d05f2f402bf927a1f741f6c092b3f17cb9f16801
|
| 3 |
+
size 237353442
|
explanation_filtering_pipeline.pdf
ADDED
|
Binary file (216 kB). View file
|
|
|
filtering_pipeline_oscar.pdf
DELETED
|
Binary file (196 kB)
|
|
|
zh_examples_with_stats.json
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:438a5bb757c23581784946f345a99ab11b77c43f57a3cbf18148c197ec4ef741
|
| 3 |
-
size 193517532
|
|
|
|
|
|
|
|
|
|
|
|