Spaces:
Runtime error
Runtime error
Mariusz Kossakowski
committed on
Commit
·
494bd12
1
Parent(s):
1358c21
Add tokens count to aspectemo
Browse files
clarin_datasets/aspectemo_dataset.py
CHANGED
|
@@ -49,6 +49,7 @@ class AspectEmoDataset(DatasetToShow):
|
|
| 49 |
description = st.container()
|
| 50 |
dataframe_head = st.container()
|
| 51 |
class_distribution = st.container()
|
|
|
|
| 52 |
|
| 53 |
with header:
|
| 54 |
st.title(self.dataset_name)
|
|
@@ -58,6 +59,11 @@ class AspectEmoDataset(DatasetToShow):
|
|
| 58 |
st.write(self.description)
|
| 59 |
|
| 60 |
full_dataframe = pd.concat(self.data_dict.values(), axis="rows")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
with dataframe_head:
|
| 62 |
df_to_show = full_dataframe.head(10)
|
| 63 |
st.header("First 10 observations of the dataset")
|
|
@@ -66,14 +72,16 @@ class AspectEmoDataset(DatasetToShow):
|
|
| 66 |
|
| 67 |
class_distribution_dict = {}
|
| 68 |
for subset in self.subsets:
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
|
|
|
|
|
|
| 72 |
class_distribution_dict[subset] = (
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
)
|
| 78 |
|
| 79 |
class_distribution_df = pd.merge(
|
|
@@ -84,4 +92,33 @@ class AspectEmoDataset(DatasetToShow):
|
|
| 84 |
with class_distribution:
|
| 85 |
st.header("Class distribution in each subset (without '0')")
|
| 86 |
st.dataframe(class_distribution_df)
|
| 87 |
-
st.text_area(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
description = st.container()
|
| 50 |
dataframe_head = st.container()
|
| 51 |
class_distribution = st.container()
|
| 52 |
+
most_common_tokens = st.container()
|
| 53 |
|
| 54 |
with header:
|
| 55 |
st.title(self.dataset_name)
|
|
|
|
| 59 |
st.write(self.description)
|
| 60 |
|
| 61 |
full_dataframe = pd.concat(self.data_dict.values(), axis="rows")
|
| 62 |
+
tokens_all = full_dataframe["tokens"].tolist()
|
| 63 |
+
tokens_all = [x for subarray in tokens_all for x in subarray]
|
| 64 |
+
labels_all = full_dataframe["labels"].tolist()
|
| 65 |
+
labels_all = [x for subarray in labels_all for x in subarray]
|
| 66 |
+
|
| 67 |
with dataframe_head:
|
| 68 |
df_to_show = full_dataframe.head(10)
|
| 69 |
st.header("First 10 observations of the dataset")
|
|
|
|
| 72 |
|
| 73 |
class_distribution_dict = {}
|
| 74 |
for subset in self.subsets:
|
| 75 |
+
all_labels_from_subset = self.data_dict[subset]["labels"].tolist()
|
| 76 |
+
all_labels_from_subset = [
|
| 77 |
+
x for subarray in all_labels_from_subset for x in subarray if x != 0
|
| 78 |
+
]
|
| 79 |
+
all_labels_from_subset = pd.Series(all_labels_from_subset)
|
| 80 |
class_distribution_dict[subset] = (
|
| 81 |
+
all_labels_from_subset.value_counts(normalize=True)
|
| 82 |
+
.sort_index()
|
| 83 |
+
.reset_index()
|
| 84 |
+
.rename({"index": "class", 0: subset}, axis="columns")
|
| 85 |
)
|
| 86 |
|
| 87 |
class_distribution_df = pd.merge(
|
|
|
|
| 92 |
with class_distribution:
|
| 93 |
st.header("Class distribution in each subset (without '0')")
|
| 94 |
st.dataframe(class_distribution_df)
|
| 95 |
+
st.text_area(
|
| 96 |
+
label="LaTeX code", value=class_distribution_df.style.to_latex()
|
| 97 |
+
)
|
| 98 |
+
|
| 99 |
+
# Most common tokens from selected class (without 0)
|
| 100 |
+
full_df_unzipped = pd.DataFrame(
|
| 101 |
+
{
|
| 102 |
+
"token": tokens_all,
|
| 103 |
+
"label": labels_all,
|
| 104 |
+
}
|
| 105 |
+
)
|
| 106 |
+
full_df_unzipped = full_df_unzipped.loc[full_df_unzipped["label"] != 0]
|
| 107 |
+
possible_options = sorted(full_df_unzipped["label"].unique())
|
| 108 |
+
with most_common_tokens:
|
| 109 |
+
st.header("10 most common tokens from selected class (without '0')")
|
| 110 |
+
selected_class = st.selectbox(
|
| 111 |
+
label="Select class to show", options=possible_options
|
| 112 |
+
)
|
| 113 |
+
df_to_show = (
|
| 114 |
+
full_df_unzipped.loc[full_df_unzipped["label"] == selected_class]
|
| 115 |
+
.groupby(["token"])
|
| 116 |
+
.count()
|
| 117 |
+
.reset_index()
|
| 118 |
+
.rename({"label": "no_of_occurrences"}, axis=1)
|
| 119 |
+
.sort_values(by="no_of_occurrences", ascending=False)
|
| 120 |
+
.reset_index(drop=True)
|
| 121 |
+
.head(10)
|
| 122 |
+
)
|
| 123 |
+
st.dataframe(df_to_show)
|
| 124 |
+
st.text_area(label="LaTeX code", value=df_to_show.style.to_latex())
|