Spaces:

openadmet
/

OpenADMET-ExpansionRx-Challenge

Running

OpenADMET-ExpansionRx-Challenge / cld.py

Maria Castellanos

Add code for CLD

24d6e19 18 days ago

6.53 kB

	from statsmodels.stats.multicomp import pairwise_tukeyhsd
	from string import ascii_lowercase, ascii_uppercase
	import tqdm
	import pandas as pd

	CLD_ALPHABET = list(ascii_lowercase) + list(ascii_uppercase)

	def asserts_non_significance(col: list[bool], i: int, j: int) -> bool:
	    """Tell whether a column groups treatments i and j together.

	    A column groups two treatments together (i.e. displays them as
	    non-significantly different) when both of their entries are set.

	    Parameters
	    ----------
	    col : list[bool]
	        Column under inspection
	    i : int
	        Index of the first treatment
	    j : int
	        Index of the second treatment

	    Returns
	    -------
	    bool
	        True when both entries i and j of the column are set
	    """
	    first = col[i]
	    # Entry j is only consulted once entry i is known to be set.
	    if not first:
	        return first
	    return col[j]

	def insert(column: list[bool], i: int, j: int):
	    """Split a column into two copies that no longer group i and j together.

	    The first copy has entry i cleared, the second copy has entry j cleared.
	    The column passed in is left untouched.

	    Parameters
	    ----------
	    column : list[bool]
	        Column to split
	    i : int
	        Index of the first group
	    j : int
	        Index of the second group

	    Returns
	    -------
	    list[bool], list[bool]
	        The copy without i, followed by the copy without j
	    """

	    def _cleared_at(position: int) -> list[bool]:
	        # Copy first so the caller's column is never modified.
	        duplicate = column.copy()
	        duplicate[position] = False
	        return duplicate

	    return _cleared_at(i), _cleared_at(j)

	def can_be_absorbed(new_col: list[bool], ref_col: list[bool]) -> bool:
	    """Check whether an existing column already covers a candidate column.

	    The existing column covers the candidate when it is set in every row
	    in which the candidate is set.

	    Parameters
	    ----------
	    new_col : list[bool]
	        Candidate column
	    ref_col : list[bool]
	        Existing column that may cover the candidate

	    Returns
	    -------
	    bool
	        True when ref_col is set wherever new_col is set
	    """
	    for row, is_set in enumerate(new_col):
	        # A single row set in the candidate but not in the reference
	        # is enough to rule absorption out.
	        if is_set and not ref_col[row]:
	            return False
	    return True

	def absorb(new_column: list[bool], columns: list[list[bool]]) -> list[list[bool]]:
	    """Add a column to the existing ones unless one of them already covers it.

	    Parameters
	    ----------
	    new_column : list[bool]
	        Candidate column
	    columns : list[list[bool]]
	        Columns collected so far

	    Returns
	    -------
	    list[list[bool]]
	        ``columns`` itself when the candidate is absorbed, otherwise a new
	        list made of ``columns`` followed by the candidate
	    """
	    for existing in columns:
	        if can_be_absorbed(new_column, existing):
	            # Already covered: the candidate brings no new information.
	            return columns
	    return columns + [new_column]

	def cld(comparisons: pd.DataFrame) -> dict[str, list[str]]:
	    """
	    Compact Letter Display

	    Compute the compact letter display using the insert-absorb algorithm.
	    In the result, no letter is shared by two groups whose comparison has
	    ``reject`` set.

	    See the following papers for more information:
	    (1) https://doi.org/10.1016/j.csda.2006.09.035
	    (2) https://doi.org/10.1198/1061860043515

	    Parameters
	    ----------
	    comparisons : pd.DataFrame
	        A DataFrame containing the pairwise comparisons produced by:
	        https://www.statsmodels.org/dev/generated/statsmodels.stats.multicomp.pairwise_tukeyhsd.html
	        The columns ``group1``, ``group2`` and ``reject`` are read.

	    Returns
	    -------
	    dict[str, list[str]]
	        For every group found in ``group1``/``group2``, the sorted list of
	        letters assigned to it. Keys follow the order in which the groups
	        first appear (``group1`` values first, then ``group2`` values).

	    Raises
	    ------
	    IndexError
	        If the display needs more letters than ``CLD_ALPHABET`` provides.
	    """
	    # Collect groups in first-seen order. Going through a set would make the
	    # ordering (and so the key order of the result) depend on hashing, which
	    # is randomised between interpreter runs for strings.
	    unique_groups = list(
	        dict.fromkeys(
	            [
	                *comparisons["group1"].unique(),
	                *comparisons["group2"].unique(),
	            ]
	        )
	    )
	    unique_groups_indices = {g: k for k, g in enumerate(unique_groups)}

	    sig_diff = comparisons[comparisons["reject"]]
	    print(f"Found {len(sig_diff)} significantly different pairs")

	    # Initialize CLD matrix for all unique groups/models, with "columns" as rows.
	    # A single all-True column states that no pair differs.
	    solution = [[True] * len(unique_groups)]

	    for _, row in tqdm.tqdm(sig_diff.iterrows(), total=len(sig_diff)):
	        i = unique_groups_indices[row["group1"]]
	        j = unique_groups_indices[row["group2"]]

	        # Keep splitting until no column groups i and j together any more.
	        # The scan restarts after every split because the list of columns
	        # has been modified.
	        while True:
	            offending = next(
	                (
	                    k
	                    for k, col in enumerate(solution)
	                    if asserts_non_significance(col, i, j)
	                ),
	                None,
	            )
	            if offending is None:
	                break

	            # Remove the old column and duplicate it, once without i and
	            # once without j.
	            col_i, col_j = insert(solution.pop(offending), i, j)

	            # Try to absorb each copy in an old column,
	            # simply add it to the solution otherwise.
	            solution = absorb(col_i, solution)
	            solution = absorb(col_j, solution)

	    # One letter per column: fail with an explicit message instead of a bare
	    # "list index out of range" when the alphabet is exhausted.
	    if len(solution) > len(CLD_ALPHABET):
	        raise IndexError(
	            f"CLD needs {len(solution)} letters but only "
	            f"{len(CLD_ALPHABET)} are available"
	        )

	    # Assign letters
	    letters: list[list[str]] = [[] for _ in unique_groups]
	    for letter, col in zip(CLD_ALPHABET, solution):
	        for idx, has_letter in enumerate(col):
	            if has_letter:
	                letters[idx].append(letter)

	    return {group: sorted(assigned) for group, assigned in zip(unique_groups, letters)}

	from statsmodels.stats.multicomp import pairwise_tukeyhsd
	import tqdm

	def add_cld_to_leaderboard(
	    leaderboard: pd.DataFrame,
	    scores: pd.DataFrame,
	    metric: str,
	) -> pd.DataFrame:
	    """Add the compact letter display to the leaderboard.

	    Parameters
	    ----------
	    leaderboard : pd.DataFrame
	        The full leaderboard DataFrame. Its ``user`` column is read and a
	        ``CLD`` column is written to it in place.
	    scores : pd.DataFrame
	        The raw scores DataFrame, with all replicates from bootstrapping.
	        The columns ``Sample``, ``user`` and ``metric`` are selected from it.
	    metric : str
	        The metric label to calculate CLD for.

	    Returns
	    -------
	    pd.DataFrame
	        The ``leaderboard`` object that was passed in, with the ``CLD``
	        column set. Users without an entry in the CLD get the string
	        ``"None"``.
	    """
	    ordered_methods = leaderboard["user"].values

	    # Take an explicit copy before converting the metric column: assigning
	    # into a column selection of the caller's frame makes pandas emit a
	    # SettingWithCopyWarning.
	    scores = scores[["Sample", "user", metric]].copy()
	    scores[metric] = scores[metric].astype(float)

	    # We compared methods using bootstrapping and the Tukey HSD test, presenting results via Compact Letter Display (CLD).
	    # While acknowledging that bootstrapping likely underestimates variance,
	    # we are not aware of better sampling techniques that fit the challenge format.
	    stats = pairwise_tukeyhsd(endog=scores[metric], groups=scores["user"])
	    # The version of statsmodel is for some reason not the latest, so
	    # stats.summary_frame() cannot be used; rebuild the frame from summary().
	    summary_table = stats.summary()
	    # data attribute is a list of lists with column names as first element
	    data = summary_table.data[1:]
	    columns = summary_table.data[0]
	    comparisons = pd.DataFrame(data=data, columns=columns)

	    letter_code = cld(comparisons)

	    # Letters are renamed in leaderboard order, so the first listed user
	    # starts with the first letter of the alphabet.
	    letter_mapping = {}
	    cld_column = []
	    for method in ordered_methods:
	        letters = letter_code.get(str(method))
	        if letters is None:  # Error with CLD for openadmet-dummy
	            cld_column.append("None")
	            continue

	        renamed = []
	        for letter in letters:
	            if letter not in letter_mapping:
	                letter_mapping[letter] = CLD_ALPHABET[len(letter_mapping)]
	            renamed.append(letter_mapping[letter])
	        cld_column.append("".join(renamed))

	    leaderboard["CLD"] = cld_column

	    return leaderboard