# Spaces: openadmet / OpenADMET-ExpansionRx-Challenge (Running)
# File size: 6,528 Bytes
# Revision: 24d6e19

from statsmodels.stats.multicomp import pairwise_tukeyhsd
from string import ascii_lowercase, ascii_uppercase
import tqdm
import pandas as pd

# Pool of single-character labels for CLD columns: a-z first, then A-Z.
CLD_ALPHABET = [*ascii_lowercase, *ascii_uppercase]

def asserts_non_significance(col: list[bool], i: int, j: int) -> bool:
    """Tell whether a column marks treatments i and j as non-significant.

    A column does so when both treatments are members of it, i.e. when the
    entries at positions i and j are both truthy.

    Parameters
    ----------
    col : list[bool]
        Column to inspect
    i : int
        Index of the first treatment
    j : int
        Index of the second treatment

    Returns
    -------
    bool
        Truthy only if the entries at both i and j are truthy
    """
    first_entry = col[i]
    if not first_entry:
        # Mirrors short-circuit `and`: the second entry is never read.
        return first_entry
    return col[j]

def insert(column: list[bool], i: int, j: int):
    """Split a column into two copies, each missing one of the two groups.

    The input column is left untouched. The first copy has entry i cleared,
    the second copy has entry j cleared.

    Parameters
    ----------
    column : list[bool]
        Column to split
    i : int
        Index of the group cleared in the first copy
    j : int
        Index of the group cleared in the second copy

    Returns
    -------
    list[bool], list[bool]
        The copy without i, followed by the copy without j
    """
    without_i, without_j = column.copy(), column.copy()
    without_i[i] = False
    without_j[j] = False
    return without_i, without_j

def can_be_absorbed(new_col: list[bool], ref_col: list[bool]) -> bool:
    """Check whether an existing column covers every member of a new column.

    The new column is redundant when each row that is set in it is also set
    in the reference column.

    Parameters
    ----------
    new_col : list[bool]
        Candidate column
    ref_col : list[bool]
        Existing column tested as a possible superset of new_col

    Returns
    -------
    bool
        True if ref_col is set wherever new_col is set, False otherwise
    """
    for row, is_member in enumerate(new_col):
        # The first member of new_col that ref_col lacks settles the answer.
        if is_member and not ref_col[row]:
            return False
    return True

def absorb(new_column: list[bool], columns: list[list[bool]]) -> list[list[bool]]:
    """Add a column to the solution unless an existing column already covers it.

    Parameters
    ----------
    new_column : list[bool]
        Candidate column
    columns : list[list[bool]]
        Columns currently in the solution

    Returns
    -------
    list[list[bool]]
        The same `columns` object when the candidate is absorbed, otherwise
        a new list with the candidate appended at the end
    """
    for existing in columns:
        if can_be_absorbed(new_column, existing):
            # Redundant candidate: the solution is returned unchanged.
            return columns
    return columns + [new_column]

def cld(comparisons: pd.DataFrame) -> dict[str, list[str]]:
    """
    Compact Letter Display

    Compute the compact letter display using the insert-absorb algorithm.

    See the following papers for more information:
    (1) https://doi.org/10.1016/j.csda.2006.09.035
    (2) https://doi.org/10.1198/1061860043515

    Parameters
    ----------
        comparisons : pd.DataFrame
            A DataFrame containing the pairwise comparisons produced by:
            https://www.statsmodels.org/dev/generated/statsmodels.stats.multicomp.pairwise_tukeyhsd.html
            The columns "group1", "group2" and "reject" are read.

    Returns
    -------
        dict[str, list[str]]
            Maps every group found in "group1"/"group2" to the sorted list of
            letters assigned to it. Groups are listed in order of first
            appearance ("group1" values first, then "group2" values).

    Raises
    ------
        IndexError
            If the solution needs more letters than CLD_ALPHABET provides.
    """
    # Collect groups in first-seen order instead of through a set, so the
    # ordering of the result does not depend on hashing.
    unique_groups = list(
        dict.fromkeys(
            [*comparisons["group1"].unique(), *comparisons["group2"].unique()]
        )
    )
    unique_groups_indices = {g: i for i, g in enumerate(unique_groups)}

    sig_diff = comparisons[comparisons["reject"]]
    print(f"Found {len(sig_diff)} significantly different pairs")

    # Initialize CLD matrix for all unique groups/models, with "columns" as rows
    solution = [[True] * len(unique_groups)]

    for _, row in tqdm.tqdm(sig_diff.iterrows(), total=len(sig_diff)):
        i = unique_groups_indices[row["group1"]]
        j = unique_groups_indices[row["group2"]]

        # Keep splitting until no column claims that i and j are alike.
        while True:
            offending = next(
                (
                    idx
                    for idx, column in enumerate(solution)
                    if asserts_non_significance(column, i, j)
                ),
                None,
            )
            if offending is None:
                break

            # Remove the offending column and replace it by its two copies,
            # one without i and one without j.
            col_i, col_j = insert(solution.pop(offending), i, j)

            # Each copy is kept only if no remaining column already covers it.
            solution = absorb(col_i, solution)
            solution = absorb(col_j, solution)

    # Same exception type as the plain out-of-range lookup this replaces,
    # but with a message that explains what went wrong.
    if len(solution) > len(CLD_ALPHABET):
        raise IndexError(
            f"CLD needs {len(solution)} letters but only "
            f"{len(CLD_ALPHABET)} are available"
        )

    # Assign letters: one letter per column, given to every member of it
    letters = [""] * len(unique_groups)
    for letter, column in zip(CLD_ALPHABET, solution):
        for idx, has_letter in enumerate(column):
            if has_letter:
                letters[idx] += letter

    return {group: sorted(letter) for group, letter in zip(unique_groups, letters)}

from statsmodels.stats.multicomp import pairwise_tukeyhsd
import tqdm

def add_cld_to_leaderboard(
    leaderboard: pd.DataFrame,
    scores: pd.DataFrame,
    metric: str,
) -> pd.DataFrame:
    """Add the compact letter display to the leaderboard.

    Letters are relabelled in leaderboard order, so the first letter met
    while walking down the leaderboard becomes "a", the next new one "b",
    and so on.

    Parameters
    ----------
    leaderboard : pd.DataFrame
        The full leaderboard DataFrame. Its "user" column is read and a
        "CLD" column is written to it in place.
    scores : pd.DataFrame
        The **raw** scores DataFrame, with all replicates from bootstrapping.
        The "Sample", "user" and `metric` columns are used.
    metric : str
        The metric label to calculate CLD for.

    Returns
    -------
    pd.DataFrame
        The same `leaderboard` object, now carrying the "CLD" column. Users
        without an entry in the pairwise comparisons get the string "None".
    """
    ordered_methods = leaderboard["user"].values

    # Work on an explicit copy: assigning into a column-sliced frame is
    # chained assignment, which pandas reports as SettingWithCopyWarning.
    scores = scores[["Sample", "user", metric]].copy()
    scores[metric] = scores[metric].astype(float)

    # We compared methods using bootstrapping and the Tukey HSD test, presenting results via Compact Letter Display (CLD).
    # While acknowledging that bootstrapping likely underestimates variance,
    # we are not aware of better sampling techniques that fit the challenge format.
    stats = pairwise_tukeyhsd(endog=scores[metric], groups=scores["user"])
    # The installed statsmodels version is not the latest, so
    # stats.summary_frame() is rebuilt by hand from the summary table.
    summary_table = stats.summary()
    # data attribute is a list of lists with column names as first element
    columns, *data = summary_table.data
    comparisons = pd.DataFrame(data=data, columns=columns)

    letter_code = cld(comparisons)

    letter_mapping: dict[str, str] = {}
    cld_column = [""] * len(leaderboard)
    for idx, method in enumerate(ordered_methods):
        # Only this lookup can raise KeyError, so it is the whole try body.
        try:
            letters = letter_code[str(method)]
        except KeyError:  # Error with CLD for openadmet-dummy
            cld_column[idx] = "None"
            continue

        for letter in letters:
            if letter not in letter_mapping:
                # First time this letter is seen: give it the next free label.
                letter_mapping[letter] = CLD_ALPHABET[len(letter_mapping)]
            cld_column[idx] += letter_mapping[letter]

    leaderboard["CLD"] = cld_column

    return leaderboard