File size: 6,528 Bytes
24d6e19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from string import ascii_lowercase, ascii_uppercase
import tqdm
import pandas as pd

# Pool of symbols used to label CLD columns: 'a'..'z' first, then 'A'..'Z'.
CLD_ALPHABET = list(ascii_lowercase + ascii_uppercase)

def asserts_non_significance(col: list[bool], i: int, j: int) -> bool:
    """Tell whether a column claims that treatments i and j do not differ.

    A column makes that claim when both treatments are members of it,
    i.e. when the entries at positions i and j are both set.

    Parameters
    ----------
    col : list[bool]
        Column under inspection (one membership flag per treatment)
    i : int
        Position of the first treatment
    j : int
        Position of the second treatment

    Returns
    -------
    bool
        True when both entries are set, False otherwise
    """
    first_member = col[i]
    if not first_member:
        # Short-circuit: the second entry is irrelevant once the first is unset.
        return first_member
    return col[j]

def insert(column: list[bool], i: int, j: int):
    """Split a column into two copies, each dropping one of the two groups.

    The first copy has entry i cleared, the second copy has entry j cleared.
    The column passed in is left untouched.

    Parameters
    ----------
    column : list[bool]
        Column to duplicate
    i : int
       Position of the first group
    j : int
       Position of the second group

    Returns
    -------
    list[bool], list[bool]
       The copy without group i, followed by the copy without group j
    """
    without_i = column.copy()
    without_i[i] = False

    without_j = column.copy()
    without_j[j] = False

    return without_i, without_j

def can_be_absorbed(new_col: list[bool], ref_col: list[bool]) -> bool:
    """Check whether an existing column already covers a candidate column.

    The existing column covers the candidate when it is set in every row
    in which the candidate is set.

    Parameters
    ----------
    new_col : list[bool]
        Candidate column
    ref_col : list[bool]
        Existing column tested as a possible absorber of new_col

    Returns
    -------
    bool
       Whether ref_col can absorb new_col
    """
    for row, is_set in enumerate(new_col):
        # Only rows set in the candidate constrain the existing column.
        if is_set and not ref_col[row]:
            return False
    return True

def absorb(new_column: list[bool], columns: list[list[bool]]) -> list[list[bool]]:
    """Add a column to the solution unless an existing column already covers it.

    Parameters
    ----------
    new_column : list[bool]
        Candidate column
    columns : list[list[bool]]
        Columns currently in the solution

    Returns
    -------
    list[list[bool]]
        The unchanged ``columns`` object when the candidate is absorbed,
        otherwise a new list with the candidate appended at the end
    """
    for existing in columns:
        if can_be_absorbed(new_column, existing):
            # Covered by an existing column: nothing to add.
            return columns
    return columns + [new_column]

def cld(comparisons: pd.DataFrame) -> dict[str, list[str]]:
    """
    Compact Letter Display

    Compute the compact letter display using the insert-absorb algorithm.

    See the following papers for more information:
    (1) https://doi.org/10.1016/j.csda.2006.09.035
    (2) https://doi.org/10.1198/1061860043515

    Parameters
    ----------
        comparisons : pd.DataFrame
            A DataFrame containing the pairwise comparisons produced by:
            https://www.statsmodels.org/dev/generated/statsmodels.stats.multicomp.pairwise_tukeyhsd.html
            Only the columns "group1", "group2" and "reject" are read.

    Returns
    -------
        dict[str, list[str]]
            Maps every group found in "group1"/"group2" to the list of letters
            assigned to it. Each list is the result of ``sorted`` on the
            single-character letters, so it is ordered by code point.

    Raises
    ------
        IndexError
            If the solution needs more columns than CLD_ALPHABET has letters.
    """
    # Index the groups in order of first appearance. Unlike going through a
    # set, this keeps the index assignment and the key order of the result
    # reproducible between runs.
    unique_groups = list(
        dict.fromkeys([*comparisons["group1"], *comparisons["group2"]])
    )
    unique_groups_indices = {g: i for i, g in enumerate(unique_groups)}

    sig_diff = comparisons[comparisons["reject"]]
    print(f"Found {len(sig_diff)} significantly different pairs")

    # Initialize CLD matrix for all unique groups/models, with "columns" as rows.
    # The single starting column declares every group as not different.
    solution = [[True] * len(unique_groups)]

    significant_pairs = zip(sig_diff["group1"], sig_diff["group2"])
    for group1, group2 in tqdm.tqdm(significant_pairs, total=len(sig_diff)):
        i = unique_groups_indices[group1]
        j = unique_groups_indices[group2]

        # Rescan from the start after every modification until no column
        # contains both i and j any more.
        has_changed = True
        while has_changed:
            has_changed = False

            for idx, column in enumerate(solution):
                if not asserts_non_significance(column, i, j):
                    continue

                # Duplicate the offending column, dropping i in one copy
                # and j in the other
                col_i, col_j = insert(column, i, j)

                # Remove the old column
                solution.pop(idx)

                # Try to absorb each copy into a remaining column,
                # simply add it to the solution otherwise
                solution = absorb(col_i, solution)
                solution = absorb(col_j, solution)

                # The list was modified: stop iterating over it and rescan
                has_changed = True
                break

    if len(solution) > len(CLD_ALPHABET):
        # Same exception type as the plain index lookup would raise,
        # but with a message that explains the cause.
        raise IndexError(
            f"The compact letter display needs {len(solution)} letters, "
            f"but only {len(CLD_ALPHABET)} are available"
        )

    # Assign letters: column number k hands letter number k to its members
    letters = [[] for _ in unique_groups]
    for letter, col in zip(CLD_ALPHABET, solution):
        for idx, has_letter in enumerate(col):
            if has_letter:
                letters[idx].append(letter)

    return {group: sorted(assigned) for group, assigned in zip(unique_groups, letters)}

from statsmodels.stats.multicomp import pairwise_tukeyhsd
import tqdm

def add_cld_to_leaderboard(
    leaderboard: pd.DataFrame,
    scores: pd.DataFrame,
    metric: str,
) -> pd.DataFrame:
    """Add the compact letter display to the leaderboard.

    Runs a Tukey HSD test on the raw scores grouped by "user", converts the
    pairwise results into a compact letter display and writes it to a new
    "CLD" column. Letters are relabelled so that they are handed out in the
    row order of the leaderboard (the first row starts with 'a').

    Parameters
    ----------
    leaderboard : pd.DataFrame
        The full leaderboard DataFrame; must contain a "user" column.
        It is modified in place (a "CLD" column is added or overwritten).
    scores : pd.DataFrame
        The **raw** scores DataFrame, with all replicates from bootstrapping;
        must contain the columns "Sample", "user" and the metric column.
        It is not modified.
    metric : str
        The metric label to calculate CLD for.

    Returns
    -------
    pd.DataFrame
        The same ``leaderboard`` object, with the "CLD" column set. Methods
        that do not appear in the pairwise comparisons get the string "None".
    """
    ordered_methods = leaderboard["user"].values

    # Work on an explicit copy: assigning into a column-sliced frame is the
    # chained-assignment pattern pandas warns about, and the caller's frame
    # must stay untouched.
    scores = scores[["Sample", "user", metric]].copy()
    scores[metric] = scores[metric].astype(float)

    # We compared methods using bootstrapping and the Tukey HSD test, presenting results via Compact Letter Display (CLD).
    # While acknowledging that bootstrapping likely underestimates variance,
    # we are not aware of better sampling techniques that fit the challenge format.
    stats = pairwise_tukeyhsd(endog=scores[metric], groups=scores["user"])

    # The installed statsmodels version is not the latest, so
    # stats.summary_frame() cannot be used; rebuild the frame from the
    # summary table instead.
    # The data attribute is a list of lists with the column names as first element.
    table = stats.summary().data
    comparisons = pd.DataFrame(data=table[1:], columns=table[0])

    # Normalise the group keys to str so that they match the str(method)
    # lookups below regardless of the dtype of the "user" column.
    letter_code = {
        str(group): letters for group, letters in cld(comparisons).items()
    }

    # Maps each letter produced by cld() to the letter shown on the leaderboard
    letter_mapping: dict[str, str] = {}

    cld_column = []
    for method in ordered_methods:
        letters = letter_code.get(str(method))
        if letters is None:
            # No CLD for this method (e.g. openadmet-dummy)
            cld_column.append("None")
            continue

        relabelled = []
        for letter in letters:
            if letter not in letter_mapping:
                # First time this letter shows up: give it the next free label
                letter_mapping[letter] = CLD_ALPHABET[len(letter_mapping)]
            relabelled.append(letter_mapping[letter])
        cld_column.append("".join(relabelled))

    leaderboard["CLD"] = cld_column

    return leaderboard