Spaces:

huggingface
/

text-data-filtering

Runtime error

App Files Files Community

HugoLaurencon committed on Jan 8, 2022

Commit

693f997

1 Parent(s): 611e98e

filter on repetition removal

Browse files

Files changed (5) hide show

app.py +61 -12
en_examples_with_stats.json +2 -2
explanation_filtering_pipeline.pdf +0 -0
filtering_pipeline_oscar.pdf +0 -0
zh_examples_with_stats.json +0 -3

app.py CHANGED Viewed

@@ -7,6 +7,7 @@ import os
 import base64
 import json
 import pandas as pd
 import numpy as np
@@ -32,7 +33,7 @@ class Visualization:
     def preamble(self):
         st.markdown(
-            "Before diving into this demo, you might want to take a look at how the filtering pipeline of OSCAR looks like in more detail."
         )
         def get_binary_file_downloader_html(bin_file, file_label="File"):
@@ -45,7 +46,7 @@ class Visualization:
         st.markdown(
             get_binary_file_downloader_html(
                 self.path_instructions,
-                "Download the filtering pipeline of OSCAR as pdf",
             ),
             unsafe_allow_html=True,
         )
@@ -73,16 +74,17 @@ class Visualization:
                     doc["text"][: self.max_len_text_display]
                     + " [...] [THIS LONG TEXT HAS BEEN TRUNCATED FOR DISPLAY REASONS]"
                 )
-        self.docs = pd.DataFrame(docs)
     def set_title(self):
-        st.title(f"{self.num_docs} {self.lang} documents from OSCAR with their stats.")
     def filtering_of_docs(self):
         st.sidebar.subheader("Parameters of the filtering on documents")
-        def set_sliders(docs):
-            columns = list(docs)
             keys = []
             conds = {}
@@ -99,7 +101,7 @@ class Visualization:
             if "number_words" in columns:
                 cutoff_def = "If the number of words of a document is lower than this number, the document is removed."
-                max_nb_words = int(np.max(docs["number_words"])) + 1
                 cutoff_min_number_words = st.sidebar.slider(
                     cutoff_def, 0, min(max_nb_words, 500), 0
                 )
@@ -119,6 +121,46 @@ class Visualization:
                 conds["number_words"] = [cond_1, cond_2]
             if "special_characters_ratio" in columns:
                 cutoff_def = "If the special characters ratio of a document is higher than this number, the document is removed."
                 cutoff_special_characters_ratio = st.sidebar.slider(
@@ -169,7 +211,7 @@ class Visualization:
             if "perplexity_score" in columns:
                 cutoff_def = "If the perplexity score of a document is higher than this number, the document is removed."
-                max_pp = int(np.max(docs["perplexity_score"])) + 1
                 cutoff_perplexity_score = st.sidebar.slider(
                     cutoff_def, 0, max_pp, max_pp
                 )
@@ -181,7 +223,7 @@ class Visualization:
             return keys, conds
-        self.keys, conds = set_sliders(self.docs)
         all_conds = [subcond for cond in list(conds.values()) for subcond in cond]
         all_conds = np.all(all_conds, axis=0)
@@ -215,6 +257,13 @@ class Visualization:
                     "Discarded documents for the filter on the number of words",
                 )
             if "special_characters_ratio" in columns:
                 cond_filter = np.invert(
                     np.all(conds["special_characters_ratio"], axis=0)
@@ -360,9 +409,9 @@ class Visualization:
         self.download_data()
-path_instructions = "./filtering_pipeline_oscar.pdf"
-path_data = "./zh_examples_with_stats.json"
-lang = "Chinese"
 num_docs = 5000
 num_docs_for_words = 500
 max_len_text_display = 10000

 import base64
 import json
 import pandas as pd
+pd.options.mode.chained_assignment = None
 import numpy as np
     def preamble(self):
         st.markdown(
+            "Before diving into this demo, you might want to take a look at how the filtering pipeline looks like in more detail."
         )
         def get_binary_file_downloader_html(bin_file, file_label="File"):
         st.markdown(
             get_binary_file_downloader_html(
                 self.path_instructions,
+                "Download the explanation of the filtering pipeline as pdf",
             ),
             unsafe_allow_html=True,
         )
                     doc["text"][: self.max_len_text_display]
                     + " [...] [THIS LONG TEXT HAS BEEN TRUNCATED FOR DISPLAY REASONS]"
                 )
+        self.docs_checkpoint = pd.DataFrame(docs)
+        self.docs = self.docs_checkpoint
     def set_title(self):
+        st.title(f"{self.num_docs} {self.lang} documents with their stats.")
     def filtering_of_docs(self):
         st.sidebar.subheader("Parameters of the filtering on documents")
+        def set_sliders():
+            columns = list(self.docs)
             keys = []
             conds = {}
             if "number_words" in columns:
                 cutoff_def = "If the number of words of a document is lower than this number, the document is removed."
+                max_nb_words = int(np.max(self.docs["number_words"])) + 1
                 cutoff_min_number_words = st.sidebar.slider(
                     cutoff_def, 0, min(max_nb_words, 500), 0
                 )
                 conds["number_words"] = [cond_1, cond_2]
+            if "repetitions_ratio" in columns:
+                val_repetitions_lengths = list(
+                    self.docs["repetitions_ratio"].iloc[0].keys()
+                )
+                default_index = (
+                    val_repetitions_lengths.index("10")
+                    if "10" in val_repetitions_lengths
+                    else 0
+                )
+                label_selectbox = (
+                    "Length of the repetitions (that will determine the repetitions ratio). "
+                    "Choosing a higher or lower number does not mean that the filtering "
+                    "is stronger or weaker. Be careful, choosing a low number (below 5 for languages like English) "
+                    "tends to associate a high repetitions ratio to very long documents (like book chapters), but with "
+                    "few or no repetitions, simply because their length gives them more diversity, and we do "
+                    "not want to discard such documents."
+                )
+                repetitions_length = st.sidebar.selectbox(
+                    label=label_selectbox,
+                    options=val_repetitions_lengths,
+                    index=default_index,
+                )
+                self.docs = self.docs_checkpoint
+                for i in range(len(self.docs["repetitions_ratio"])):
+                    self.docs["repetitions_ratio"].iloc[i] = self.docs["repetitions_ratio"].iloc[i][repetitions_length]
+                cutoff_def = "If the repetitions ratio of a document is higher than this number, the document is removed."
+                cutoff_repetitions_ratio = st.sidebar.slider(
+                    cutoff_def, 0.0, 1.0, 1.0, step=0.01
+                )
+                new_key = (
+                    "repetitions_ratio",
+                    cutoff_repetitions_ratio,
+                    True,
+                )
+                keys.append(new_key)
+                cond = get_cond(new_key[0], new_key[1], new_key[2])
+                print_discared_by_cond(cond)
+                conds["repetitions_ratio"] = [cond]
             if "special_characters_ratio" in columns:
                 cutoff_def = "If the special characters ratio of a document is higher than this number, the document is removed."
                 cutoff_special_characters_ratio = st.sidebar.slider(
             if "perplexity_score" in columns:
                 cutoff_def = "If the perplexity score of a document is higher than this number, the document is removed."
+                max_pp = int(np.max(self.docs["perplexity_score"])) + 1
                 cutoff_perplexity_score = st.sidebar.slider(
                     cutoff_def, 0, max_pp, max_pp
                 )
             return keys, conds
+        self.keys, conds = set_sliders()
         all_conds = [subcond for cond in list(conds.values()) for subcond in cond]
         all_conds = np.all(all_conds, axis=0)
                     "Discarded documents for the filter on the number of words",
                 )
+            if "repetitions_ratio" in columns:
+                cond_filter = np.invert(np.all(conds["repetitions_ratio"], axis=0))
+                display_dataset(
+                    cond_filter,
+                    "Discarded documents for the filter on the repetitions ratio",
+                )
             if "special_characters_ratio" in columns:
                 cond_filter = np.invert(
                     np.all(conds["special_characters_ratio"], axis=0)
         self.download_data()
+path_instructions = "./explanation_filtering_pipeline.pdf"
+path_data = "./en_examples_with_stats.json"
+lang = "English"
 num_docs = 5000
 num_docs_for_words = 500
 max_len_text_display = 10000

en_examples_with_stats.json CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f2325873414309a7ea67d2753202207a2773319dc40f338c0a0fc7bb703463a6
-size 713107133

 version https://git-lfs.github.com/spec/v1
+oid sha256:00106fc2a9d51bbc78ce1ca2d05f2f402bf927a1f741f6c092b3f17cb9f16801
+size 237353442

explanation_filtering_pipeline.pdf ADDED Viewed

Binary file (216 kB). View file

filtering_pipeline_oscar.pdf DELETED Viewed

Binary file (196 kB)

zh_examples_with_stats.json DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:438a5bb757c23581784946f345a99ab11b77c43f57a3cbf18148c197ec4ef741
-size 193517532