import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'  # make CUDA kernel launches synchronous to ease debugging
import sys
sys.path.append("..")
import torch
import torch.nn.functional as F
import torch.nn as nn
from torch.nn import Module, Sequential, ModuleList, Linear, Conv1d
from torch_geometric.nn import radius_graph, knn_graph
from torch_geometric.utils import add_self_loops
from torch_scatter import scatter_sum, scatter_softmax, scatter_mean, scatter_std
import numpy as np
from .radial_basis import RadialBasis
import copy
from math import pi as PI
from ..common import GaussianSmearing, ShiftedSoftplus
from ..protein_features import ProteinFeatures
# Per-residue-type validity mask over the 14 heavy-atom slots (1 = slot occupied, 0 = padding), one row per residue type index.
residue_atom_mask = torch.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]]).float()
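# A commented-out lookup sketch (toy indices are assumptions, not values from the data pipeline):
#   res_S = torch.tensor([0, 3, 18])              # residue-type indices
#   atom_mask = residue_atom_mask[res_S]          # (3, 14), 1 where the heavy-atom slot is occupied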
class AttentionInteractionBlock(Module):
def __init__(self, hidden_channels, edge_channels, key_channels, num_heads=1):
super().__init__()
assert hidden_channels % num_heads == 0
assert key_channels % num_heads == 0
self.hidden_channels = hidden_channels
self.key_channels = key_channels
self.num_heads = num_heads
self.k_lin = Conv1d(hidden_channels, key_channels, 1, groups=num_heads, bias=False)
self.q_lin = Conv1d(hidden_channels, key_channels, 1, groups=num_heads, bias=False)
self.v_lin = Conv1d(hidden_channels, hidden_channels, 1, groups=num_heads, bias=False)
self.weight_k_net = Sequential(
Linear(edge_channels, key_channels // num_heads),
ShiftedSoftplus(),
Linear(key_channels // num_heads, key_channels // num_heads),
)
self.weight_k_lin = Linear(key_channels // num_heads, key_channels // num_heads)
self.weight_v_net = Sequential(
Linear(edge_channels, hidden_channels // num_heads),
ShiftedSoftplus(),
Linear(hidden_channels // num_heads, hidden_channels // num_heads),
)
self.weight_v_lin = Linear(hidden_channels // num_heads, hidden_channels // num_heads)
self.centroid_lin = Linear(hidden_channels, hidden_channels)
self.act = ShiftedSoftplus()
self.out_transform = Linear(hidden_channels, hidden_channels)
self.layernorm_attention = nn.LayerNorm(hidden_channels)
self.layernorm_ffn = nn.LayerNorm(hidden_channels)
def forward(self, x, edge_index, edge_attr):
"""
Args:
x: Node features, (N, H).
edge_index: (2, E).
            edge_attr: Edge features, (E, edge_channels).
"""
N = x.size(0)
row, col = edge_index # (E,) , (E,)
# self-attention layer_norm
y = self.layernorm_attention(x)
# Project to multiple key, query and value spaces
h_keys = self.k_lin(y.unsqueeze(-1)).view(N, self.num_heads, -1) # (N, heads, K_per_head)
h_queries = self.q_lin(y.unsqueeze(-1)).view(N, self.num_heads, -1) # (N, heads, K_per_head)
h_values = self.v_lin(y.unsqueeze(-1)).view(N, self.num_heads, -1) # (N, heads, H_per_head)
# Compute keys and queries
W_k = self.weight_k_net(edge_attr) # (E, K_per_head)
keys_j = self.weight_k_lin(W_k.unsqueeze(1) * h_keys[col]) # (E, heads, K_per_head)
queries_i = h_queries[row] # (E, heads, K_per_head)
# Compute attention weights (alphas)
qk_ij = (queries_i * keys_j).sum(-1) # (E, heads)
alpha = scatter_softmax(qk_ij, row, dim=0)
# Compose messages
W_v = self.weight_v_net(edge_attr) # (E, H_per_head)
msg_j = self.weight_v_lin(W_v.unsqueeze(1) * h_values[col]) # (E, heads, H_per_head)
msg_j = alpha.unsqueeze(-1) * msg_j # (E, heads, H_per_head)
# Aggregate messages
aggr_msg = scatter_sum(msg_j, row, dim=0, dim_size=N).view(N, -1) # (N, heads*H_per_head)
x = aggr_msg + x
y = self.layernorm_ffn(x)
out = self.out_transform(self.act(y)) + x
return out
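# A minimal, commented-out usage sketch for AttentionInteractionBlock; the toy shapes below are
# assumptions consistent with the constructor, not values from the training pipeline:
#   block = AttentionInteractionBlock(hidden_channels=128, edge_channels=64, key_channels=128, num_heads=4)
#   x = torch.randn(10, 128)                            # (N, hidden_channels)
#   edge_index = torch.tensor([[0, 1, 2], [1, 2, 0]])   # (2, E)
#   edge_attr = torch.randn(3, 64)                       # (E, edge_channels)
#   out = block(x, edge_index, edge_attr)                 # (N, hidden_channels)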
class CFTransformerEncoder(Module):
def __init__(self, hidden_channels=128, edge_channels=64, key_channels=128, num_heads=4, num_interactions=6, k=32,
cutoff=10.0):
super().__init__()
self.hidden_channels = hidden_channels
self.edge_channels = edge_channels
self.key_channels = key_channels
self.num_heads = num_heads
self.num_interactions = num_interactions
self.k = k
self.cutoff = cutoff
self.distance_expansion = GaussianSmearing(stop=cutoff, num_gaussians=edge_channels)
self.interactions = ModuleList()
for _ in range(num_interactions):
block = AttentionInteractionBlock(
hidden_channels=hidden_channels,
edge_channels=edge_channels,
key_channels=key_channels,
num_heads=num_heads,
)
self.interactions.append(block)
@property
def out_channels(self):
return self.hidden_channels
def forward(self, node_attr, pos, batch):
# edge_index = radius_graph(pos, self.cutoff, batch=batch, loop=False)
edge_index = knn_graph(pos, k=self.k, batch=batch, flow='target_to_source')
edge_length = torch.norm(pos[edge_index[0]] - pos[edge_index[1]], dim=1)
edge_attr = self.distance_expansion(edge_length)
h = node_attr
for interaction in self.interactions:
h = h + interaction(h, edge_index, edge_attr)
return h
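# A minimal, commented-out usage sketch for CFTransformerEncoder (toy shapes are assumptions):
#   enc = CFTransformerEncoder(hidden_channels=128, edge_channels=64, key_channels=128, num_heads=4,
#                              num_interactions=2, k=8)
#   node_attr = torch.randn(20, 128)                      # (N, hidden_channels)
#   pos = torch.randn(20, 3)                               # (N, 3) atom coordinates
#   batch = torch.zeros(20, dtype=torch.long)              # single graph
#   h = enc(node_attr, pos, batch)                          # (N, hidden_channels)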
# residue level graph transformer
class AAEmbedding(nn.Module):
def __init__(self, device):
super(AAEmbedding, self).__init__()
self.hydropathy = {'#': 0, "I": 4.5, "V": 4.2, "L": 3.8, "F": 2.8, "C": 2.5, "M": 1.9, "A": 1.8, "W": -0.9,
"G": -0.4, "T": -0.7, "S": -0.8, "Y": -1.3, "P": -1.6, "H": -3.2, "N": -3.5, "D": -3.5,
"Q": -3.5, "E": -3.5, "K": -3.9, "R": -4.5}
self.volume = {'#': 0, "G": 60.1, "A": 88.6, "S": 89.0, "C": 108.5, "D": 111.1, "P": 112.7, "N": 114.1,
"T": 116.1, "E": 138.4, "V": 140.0, "Q": 143.8, "H": 153.2, "M": 162.9, "I": 166.7, "L": 166.7,
"K": 168.6, "R": 173.4, "F": 189.9, "Y": 193.6, "W": 227.8}
self.charge = {**{'R': 1, 'K': 1, 'D': -1, 'E': -1, 'H': 0.1}, **{x: 0 for x in 'ABCFGIJLMNOPQSTUVWXYZ#'}}
self.polarity = {**{x: 1 for x in 'RNDQEHKSTY'}, **{x: 0 for x in "ACGILMFPWV#"}}
self.acceptor = {**{x: 1 for x in 'DENQHSTY'}, **{x: 0 for x in "RKWACGILMFPV#"}}
self.donor = {**{x: 1 for x in 'RKWNQHSTY'}, **{x: 0 for x in "DEACGILMFPV#"}}
ALPHABET = ['#', 'A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y',
'V']
self.embedding = torch.tensor([
[self.hydropathy[aa], self.volume[aa] / 100, self.charge[aa], self.polarity[aa], self.acceptor[aa],
self.donor[aa]]
for aa in ALPHABET]).to(device)
def to_rbf(self, D, D_min, D_max, stride):
D_count = int((D_max - D_min) / stride)
D_mu = torch.linspace(D_min, D_max, D_count).to(D.device)
D_mu = D_mu.view(1, -1) # [1, K]
D_expand = torch.unsqueeze(D, -1) # [N, 1]
return torch.exp(-((D_expand - D_mu) / stride) ** 2)
def transform(self, aa_vecs):
return torch.cat([
self.to_rbf(aa_vecs[:, 0], -4.5, 4.5, 0.1),
self.to_rbf(aa_vecs[:, 1], 0, 2.2, 0.1),
self.to_rbf(aa_vecs[:, 2], -1.0, 1.0, 0.25),
torch.sigmoid(aa_vecs[:, 3:] * 6 - 3),
], dim=-1)
def dim(self):
return 90 + 22 + 8 + 3
def forward(self, x, raw=False):
# B, N = x.size(0), x.size(1)
# aa_vecs = self.embedding[x.view(-1)].view(B, N, -1)
aa_vecs = self.embedding[x.view(-1)]
rbf_vecs = self.transform(aa_vecs)
return aa_vecs if raw else rbf_vecs
def soft_forward(self, x):
aa_vecs = torch.matmul(x, self.embedding)
rbf_vecs = self.transform(aa_vecs)
return rbf_vecs
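# A minimal, commented-out usage sketch for AAEmbedding (indices are toy assumptions):
#   emb = AAEmbedding(device='cpu')
#   aa_idx = torch.tensor([1, 5, 20])      # indices into ALPHABET, 0 = '#' (mask)
#   feats = emb(aa_idx)                    # (3, emb.dim()) == (3, 123) RBF-expanded physico-chemical features
#   raw = emb(aa_idx, raw=True)            # (3, 6) raw [hydropathy, volume/100, charge, polarity, acceptor, donor]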
class TransformerLayer(nn.Module):
def __init__(self, num_hidden, num_heads=4, dropout=0.1):
super(TransformerLayer, self).__init__()
self.num_heads = num_heads
self.num_hidden = num_hidden
self.dropout_attention = nn.Dropout(dropout)
self.dropout_ffn = nn.Dropout(dropout)
self.self_attention_norm = nn.LayerNorm(num_hidden)
self.ffn_norm = nn.LayerNorm(num_hidden)
self.attention = ResidueAttention(num_hidden, num_heads)
self.ffn = PositionWiseFeedForward(num_hidden, num_hidden)
def forward(self, h_V, h_E, E_idx):
""" Parallel computation of full transformer layer """
# Self-attention
y = self.self_attention_norm(h_V)
y = self.attention(y, h_E, E_idx)
h_V = h_V + self.dropout_attention(y)
# Position-wise feedforward
y = self.ffn_norm(h_V)
y = self.ffn(y)
h_V = h_V + self.dropout_ffn(y)
return h_V
class PositionWiseFeedForward(nn.Module):
def __init__(self, num_hidden, num_ff):
super(PositionWiseFeedForward, self).__init__()
self.W_in = nn.Linear(num_hidden, num_ff, bias=True)
self.W_out = nn.Linear(num_ff, num_hidden, bias=True)
def forward(self, h_V):
h = F.relu(self.W_in(h_V))
h = self.W_out(h)
return h
class ResidueAttention(nn.Module):
def __init__(self, num_hidden, num_heads=4):
super(ResidueAttention, self).__init__()
self.num_heads = num_heads
self.num_hidden = num_hidden
# Self-attention layers: {queries, keys, values, output}
self.W_Q = nn.Linear(num_hidden, num_hidden, bias=False)
self.W_K = nn.Linear(num_hidden * 2, num_hidden, bias=False)
self.W_V = nn.Linear(num_hidden * 2, num_hidden, bias=False)
self.W_O = nn.Linear(num_hidden, num_hidden, bias=False)
self.act = ShiftedSoftplus()
self.layernorm = nn.LayerNorm(num_hidden)
def forward(self, h_V, h_E, edge_index):
""" Self-attention, graph-structured O(Nk)
Args:
h_V: Node features [N_batch, N_nodes, N_hidden]
h_E: Neighbor features [N_batch, N_nodes, K, N_hidden]
mask_attend: Mask for attention [N_batch, N_nodes, K]
Returns:
h_V: Node update
"""
# Queries, Keys, Values
n_edges = h_E.shape[0]
n_nodes = h_V.shape[0]
n_heads = self.num_heads
row, col = edge_index # (E,) , (E,)
d = int(self.num_hidden / n_heads)
Q = self.W_Q(h_V).view([n_nodes, n_heads, 1, d])
K = self.W_K(torch.cat([h_E, h_V[col]], dim=-1)).view([n_edges, n_heads, d, 1])
V = self.W_V(torch.cat([h_E, h_V[col]], dim=-1)).view([n_edges, n_heads, d])
# Attention with scaled inner product
        attend_logits = torch.matmul(Q[row], K).view([n_edges, n_heads]) / np.sqrt(d)  # (E, heads), scaled dot product
        alpha = scatter_softmax(attend_logits, row, dim=0)
# Compose messages
msg_j = alpha.unsqueeze(-1) * V # (E, heads, H_per_head)
# Aggregate messages
aggr_msg = scatter_sum(msg_j, row, dim=0, dim_size=n_nodes).view(n_nodes, -1) # (N, heads*H_per_head)
h_V_update = self.W_O(self.act(aggr_msg))
return h_V_update
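# A minimal, commented-out usage sketch for the residue-level transformer layer
# (graph-structured inputs; toy shapes are assumptions):
#   layer = TransformerLayer(num_hidden=128, num_heads=4)
#   h_V = torch.randn(10, 128)                    # residue node features
#   h_E = torch.randn(30, 128)                    # per-edge features
#   edge_index = torch.randint(0, 10, (2, 30))    # (2, E): row = target residue, col = source residue
#   h_V = layer(h_V, h_E, edge_index)             # (10, 128)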
# hierarchical graph transformer encoder
class HierEncoder(Module):
def __init__(self, hidden_channels=128, edge_channels=64, key_channels=128, num_heads=4, num_interactions=6, k=32,
cutoff=10.0, device='cuda:0'):
super().__init__()
self.hidden_channels = hidden_channels
self.edge_channels = edge_channels
self.key_channels = key_channels
self.num_heads = num_heads
self.num_interactions = num_interactions
self.k = k
self.cutoff = cutoff
self.device = device
self.distance_expansion = GaussianSmearing(stop=cutoff, num_gaussians=edge_channels)
self.interactions = ModuleList()
for _ in range(num_interactions):
block = AttentionInteractionBlock(
hidden_channels=hidden_channels,
edge_channels=edge_channels,
key_channels=key_channels,
num_heads=num_heads,
)
self.interactions.append(block)
# Residue level settings
self.residue_feat = AAEmbedding(device) # for residue node feature
self.features = ProteinFeatures(top_k=8) # for residue edge feature
self.W_v = nn.Linear(hidden_channels + self.residue_feat.dim(), hidden_channels, bias=True)
self.W_e = nn.Linear(self.features.feature_dimensions, hidden_channels, bias=True)
self.residue_encoder_layers = nn.ModuleList([TransformerLayer(hidden_channels, dropout=0.1) for _ in range(2)])
self.T_a = nn.Sequential(nn.Linear(2 * hidden_channels + edge_channels, hidden_channels), nn.ReLU(),
nn.Linear(hidden_channels, 1))
self.T_x = nn.Sequential(nn.Linear(3 * hidden_channels, hidden_channels), nn.ReLU(),
nn.Linear(hidden_channels, 14))
@property
def out_channels(self):
return self.hidden_channels
def forward(self, node_attr, pos, batch_ctx, batch, pred_res_type, mask_protein, external_index, backbone=True,
mask=True):
S_id, R = batch['res_idx'], batch['amino_acid']
residue_batch, atom2residue = batch['amino_acid_batch'], batch['atom2residue']
edit_residue, edit_atom = batch['protein_edit_residue'], batch['protein_edit_atom']
if mask:
R[batch['random_mask_residue']] = 0
R = F.one_hot(R, num_classes=21).float()
if backbone:
atom2residue, edit_atom = batch['atom2residue_backbone'], batch['protein_edit_atom_backbone']
R_edit = R[edit_residue]
R_edit[:, 1:] = pred_res_type
R[edit_residue] = R_edit
R[:, 0] = 0
edge_index = knn_graph(pos, k=self.k, batch=batch_ctx, flow='target_to_source')
edge_length = torch.norm(pos[edge_index[0]] - pos[edge_index[1]], dim=1)
edge_attr = self.distance_expansion(edge_length)
h = node_attr
for interaction in self.interactions:
h = interaction(h, edge_index, edge_attr)
h_ligand_coarse = scatter_sum(h[~mask_protein], batch['ligand_atom_batch'], dim=0)
pos_ligand_coarse = scatter_sum(batch['ligand_pos'], batch['ligand_atom_batch'], dim=0)
E, residue_edge_index, residue_edge_length, edge_index_new, E_new = self.features(pos_ligand_coarse,
batch['protein_edit_residue'],
batch['residue_pos'], S_id,
residue_batch)
h_protein = h[mask_protein]
V = torch.cat([self.residue_feat.soft_forward(R), scatter_sum(h_protein, atom2residue, dim=0)], dim=-1)
h_res = self.W_v(V)
h_res = torch.cat([h_res, h_ligand_coarse])
edge_index_combined = torch.cat([residue_edge_index, edge_index_new], 1)
E = torch.cat([E, E_new], 0)
h_E = self.W_e(E)
for layer in self.residue_encoder_layers:
h_res = layer(h_res, h_E, edge_index_combined)
# update X:
h_res = h_res[:len(residue_batch)]
h_E = h_E[:residue_edge_index.size(1)]
# protein internal update
mij = torch.cat([h_res[residue_edge_index[0]], h_res[residue_edge_index[1]], h_E], dim=-1)
if backbone:
protein_pos = pos[mask_protein]
ligand_pos = pos[~mask_protein]
N = atom2residue.max() + 1
X_bb = torch.zeros(N, 4, 3).to(pos.device)
for j in range(N):
X_bb[j] = protein_pos[atom2residue == j][:4] # 4 backbone atoms [N,4,3]
            xij = X_bb[residue_edge_index[0]] - X_bb[residue_edge_index[1]]  # [E, 4, 3]
            dij = xij.norm(dim=-1) + 1e-6  # [E, 4]
            fij = torch.maximum(self.T_x(mij)[:, :4], 3.8 - dij)  # break term [E, 4]
xij = xij / dij.unsqueeze(-1) * fij.unsqueeze(-1)
f_res = scatter_mean(xij, residue_edge_index[0], dim=0) # [N,4,3]
X_bb[edit_residue] += f_res.clamp(min=-5.0, max=5.0)[edit_residue]
# Clash correction
for _ in range(2):
xij = X_bb[residue_edge_index[0]] - X_bb[residue_edge_index[1]] # [N,4,3]
dij = xij.norm(dim=-1) + 1e-6 # [N,4]
fij = F.relu(3.8 - dij) # repulsion term [N,4]
xij = xij / dij.unsqueeze(-1) * fij.unsqueeze(-1)
f_res = scatter_mean(xij, residue_edge_index[0], dim=0) # [N,4,3]
X_bb[edit_residue] += f_res.clamp(min=-5.0, max=5.0)[edit_residue]
# protein-ligand external update
protein_pos[edit_atom] = X_bb[edit_residue].view(-1, 3)
pos[mask_protein] = protein_pos
dij = torch.norm(pos[mask_protein][external_index[0]] - pos[~mask_protein][external_index[1]], dim=1) + 1e-6
mij = torch.cat(
[h[mask_protein][external_index[0]], h[~mask_protein][external_index[1]], self.distance_expansion(dij)],
dim=-1)
xij = pos[mask_protein][external_index[0]] - pos[~mask_protein][external_index[1]]
fij = torch.maximum(self.T_a(mij).squeeze(-1), 1.5 - dij)
xij = xij / dij.unsqueeze(-1) * fij.unsqueeze(-1)
f_atom = scatter_mean(xij, external_index[0], dim=0, dim_size=protein_pos.size(0))
protein_pos += f_atom
f_ligand_atom = scatter_mean(xij, external_index[1], dim=0, dim_size=ligand_pos.size(0))
ligand_pos -= f_ligand_atom * 0.05
else:
protein_pos = pos[mask_protein]
ligand_pos = pos[~mask_protein]
X_avg = scatter_mean(protein_pos, atom2residue, dim=0)
X = X_avg.unsqueeze(1).repeat(1, 14, 1)
N = atom2residue.max() + 1
mask = torch.zeros(N, 14, dtype=bool).to(protein_pos.device)
residue_natoms = atom2residue.bincount()
for j in range(N):
mask[j][:residue_natoms[j]] = 1 # all atoms mask [N,14]
X[j][:residue_natoms[j]] = protein_pos[atom2residue == j]
            xij = X[residue_edge_index[0]] - X_avg[residue_edge_index[1]].unsqueeze(1)  # [E, 14, 3]
            dij = xij.norm(dim=-1) + 1e-6  # [E, 14]
            fij = torch.maximum(self.T_x(mij), 3.8 - dij)  # break term [E, 14]
xij = xij / dij.unsqueeze(-1) * fij.unsqueeze(-1)
f_res = scatter_mean(xij, residue_edge_index[0], dim=0) # [N,14,3]
f_res[:, :4] *= 0.1
X[edit_residue] += f_res.clamp(min=-5.0, max=5.0)[edit_residue]
for _ in range(2):
protein_pos = X[mask]
X_avg = scatter_mean(protein_pos, atom2residue, dim=0)
xij = X[residue_edge_index[0]] - X_avg[residue_edge_index[1]].unsqueeze(1) # [N,14,3]
dij = xij.norm(dim=-1) + 1e-6 # [N,14]
fij = F.relu(3.8 - dij) # repulsion term [N,14]
xij = xij / dij.unsqueeze(-1) * fij.unsqueeze(-1)
f_res = scatter_mean(xij, residue_edge_index[0], dim=0) # [N,14,3]
X[edit_residue] += f_res.clamp(min=-5.0, max=5.0)[edit_residue]
# protein-ligand external update
protein_pos = X[mask]
pos[mask_protein] = protein_pos
dij = torch.norm(pos[mask_protein][external_index[0]] - pos[~mask_protein][external_index[1]], dim=1) + 1e-6
mij = torch.cat(
[h[mask_protein][external_index[0]], h[~mask_protein][external_index[1]], self.distance_expansion(dij)],
dim=-1)
xij = pos[mask_protein][external_index[0]] - pos[~mask_protein][external_index[1]]
fij = torch.maximum(self.T_a(mij).squeeze(-1), 1.5 - dij)
xij = xij / dij.unsqueeze(-1) * fij.unsqueeze(-1)
f_atom = scatter_mean(xij, external_index[0], dim=0, dim_size=protein_pos.size(0))
f_atom[batch['edit_backbone']] *= 0.1
protein_pos += f_atom
f_ligand_atom = scatter_mean(xij, external_index[1], dim=0, dim_size=ligand_pos.size(0))
ligand_pos -= f_ligand_atom * 0.05
return h, h_res, protein_pos, ligand_pos
# bilevel encoder
class BilevelEncoder(Module):
def __init__(self, hidden_channels=128, edge_channels=64, key_channels=128, num_heads=4, num_interactions=6, k=8,
cutoff=10.0, device='cuda:0'):
super().__init__()
self.hidden_channels = hidden_channels
self.edge_channels = edge_channels
self.key_channels = key_channels
self.num_heads = num_heads
self.num_interactions = num_interactions
self.k = k
self.cutoff = cutoff
self.device = device
self.esm_refine = True
self.distance_expansion = GaussianSmearing(stop=cutoff, num_gaussians=edge_channels)
# Residue level settings
self.atom_pos_embedding = nn.Embedding(14, self.hidden_channels)
self.residue_embedding = nn.Embedding(21, self.hidden_channels) # one embedding for mask
self.W_Q = nn.Linear(self.hidden_channels, self.hidden_channels, bias=False)
self.W_K = nn.Linear(self.hidden_channels, self.hidden_channels, bias=False)
self.W_V = nn.Linear(self.hidden_channels, self.hidden_channels, bias=False)
self.W_K_lig = nn.Linear(self.hidden_channels, self.hidden_channels, bias=False)
self.W_V_lig = nn.Linear(self.hidden_channels, self.hidden_channels, bias=False)
self.W_O = nn.Linear(self.hidden_channels, self.hidden_channels, bias=False)
self.W_O_lig = nn.Linear(self.hidden_channels, self.hidden_channels, bias=False)
self.W_O_lig1 = nn.Linear(self.hidden_channels, self.hidden_channels, bias=False)
self.act = ShiftedSoftplus()
self.layernorm = nn.LayerNorm(hidden_channels)
self.layernorm1 = nn.LayerNorm(hidden_channels)
self.dropout = nn.Dropout(0.1)
self.T_i = nn.Sequential(nn.Linear(2 * hidden_channels + edge_channels, hidden_channels), nn.ReLU(),
nn.Linear(hidden_channels, 1))
self.T_e1 = nn.Sequential(nn.Linear(2 * hidden_channels + edge_channels, hidden_channels), nn.ReLU(),
nn.Linear(hidden_channels, 1))
self.T_e2 = nn.Sequential(nn.Linear(2 * hidden_channels + edge_channels, hidden_channels), nn.ReLU(),
nn.Linear(hidden_channels, 1))
self.residue_mlp = Linear(hidden_channels, 20)
self.sigma_D = Sequential(
Linear(edge_channels, hidden_channels),
nn.ReLU(),
Linear(hidden_channels, num_heads),
)
self.sigma_D1 = Sequential(
Linear(edge_channels, hidden_channels),
nn.ReLU(),
Linear(hidden_channels, num_heads),
)
self.residue_atom_mask = residue_atom_mask.to(device)
@property
def out_channels(self):
return self.hidden_channels
def connect_edges(self, res_X, batch):
edge_index = knn_graph(res_X[:, 1], k=self.k, batch=batch, flow='target_to_source')
edge_index, _ = add_self_loops(edge_index, num_nodes=res_X.size(0)) # add self loops
return edge_index
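    # A commented-out sketch of connect_edges (toy inputs are assumptions): residues are linked to
    # their k nearest neighbours within each batch element, using atom slot 1 (presumably C-alpha)
    # as the residue centre, and a self loop is appended for every residue:
    #   enc = BilevelEncoder(k=2, device='cpu')
    #   res_X = torch.randn(6, 14, 3)
    #   batch = torch.tensor([0, 0, 0, 1, 1, 1])
    #   edge_index = enc.connect_edges(res_X, batch)   # (2, 6*2 + 6) including the self loops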
def _forward(self, res_H, res_X, res_S, batch, ligand_pos, ligand_feat, ligand_mask, edit_residue_num, residue_mask):
atom_mask = self.residue_atom_mask[res_S]
edge_index = self.connect_edges(res_X, batch)
row, col = edge_index
R_ij = torch.cdist(res_X[row], res_X[col], p=2)
dist_rep = self.distance_expansion(R_ij).view(row.shape[0], res_X.shape[1], res_X.shape[1], -1)
n_nodes = res_H.shape[0]
n_edges = edge_index.shape[1]
n_heads = self.num_heads
n_channels = res_X.shape[1]
d = int(self.hidden_channels / n_heads)
res_H = self.layernorm(res_H)
Q = self.W_Q(res_H).view([n_nodes, n_channels, n_heads, d])
K = self.W_K(res_H).view([n_nodes, n_channels, n_heads, d])
V = self.W_V(res_H).view([n_nodes, n_channels, n_heads, d])
# Attention with scaled inner product
attend_logits = torch.matmul(Q[row].transpose(1, 2), K[col].permute(0, 2, 3, 1)).view([n_edges, n_heads, n_channels, n_channels])
attend_logits /= np.sqrt(d)
attend_logits = attend_logits + self.sigma_D(dist_rep).permute(0, 3, 1, 2)
attend_mask = (atom_mask[row].unsqueeze(-1) @ atom_mask[col].unsqueeze(-2))
attend_mask = attend_mask.unsqueeze(1).repeat(1, n_heads, 1, 1)
atom_mask_head = atom_mask.unsqueeze(1).repeat(1, n_heads, 1)
r_ij = atom_mask_head[row].unsqueeze(-2) @ attend_logits @ atom_mask_head[col].unsqueeze(-1)
r_ij = r_ij.squeeze() / attend_mask.sum(-1).sum(-1)
beta = scatter_softmax(r_ij, row, dim=0)
attend_logits = torch.softmax(attend_logits, dim=-1)
attend_logits = attend_logits * attend_mask
attend_logits = attend_logits/(attend_logits.norm(1, dim=-1).unsqueeze(-1)+1e-7)
alpha_ij_Vj = attend_logits @ V[col].transpose(1, 2)
# res_H update
update_H = scatter_sum(beta.unsqueeze(-1).unsqueeze(-1) * alpha_ij_Vj, row, dim=0, dim_size=n_nodes)
update_H = update_H.transpose(1, 2).reshape(n_nodes, n_channels, -1) # (N, channels, heads*H_per_head)
res_H = res_H + self.dropout(self.W_O(self.act(update_H)))
res_H = res_H * atom_mask.unsqueeze(-1)
# res_X update
X_ij = res_X[row].unsqueeze(-2) - res_X[col].unsqueeze(1)
X_ij = X_ij/(X_ij.norm(2, dim=-1).unsqueeze(-1)+1e-5)
# Aggregate messages
Q = Q.view(n_nodes, n_channels, -1)
K = K.view(n_nodes, n_channels, -1)
p_idx, q_idx = torch.cartesian_prod(torch.arange(n_channels), torch.arange(n_channels)).chunk(2, dim=-1)
p_idx, q_idx = p_idx.squeeze(-1), q_idx.squeeze(-1)
input = torch.cat([Q[row][:, p_idx].view(-1, self.hidden_channels), K[col][:, q_idx].view(-1, self.hidden_channels), dist_rep.view(-1, self.edge_channels)], dim=-1)
f = self.dropout(self.T_i(input).view(n_edges, n_channels, n_channels))
#attend_mask = (atom_mask[row].unsqueeze(-1) @ atom_mask[col].unsqueeze(-2)).bool()
f = f * attend_logits.mean(1)
res_X[residue_mask] = res_X[residue_mask] + torch.clamp(scatter_sum(beta.mean(-1).unsqueeze(-1).unsqueeze(-1) * (f.unsqueeze(-1) * X_ij).sum(-2), row, dim=0, dim_size=n_nodes)[residue_mask], min=-3.0, max=3.0)
# consider ligand
batch_size = batch.max().item() + 1
lig_channel = ligand_feat.shape[1]
row1 = torch.arange(n_nodes).to(self.device)[residue_mask]
col1 = torch.repeat_interleave(torch.arange(batch_size).to(self.device), edit_residue_num)
n_edges = row1.shape[0]
Q = Q.view([n_nodes, n_channels, n_heads, d])
ligand_feat = self.layernorm1(ligand_feat)
K_lig = self.W_K_lig(ligand_feat).view([batch_size, lig_channel, n_heads, d])
V_lig = self.W_V_lig(ligand_feat).view([batch_size, lig_channel, n_heads, d])
R_ij = torch.cdist(res_X[row1], ligand_pos[col1], p=2)
dist_rep1 = self.distance_expansion(R_ij).view(row1.shape[0], res_X.shape[1], ligand_pos.shape[1], -1)
attend_logits = torch.matmul(Q[row1].transpose(1, 2), K_lig[col1].permute(0, 2, 3, 1)).view([n_edges, n_heads, n_channels, lig_channel])
attend_logits /= np.sqrt(d)
attend_logits = attend_logits + self.sigma_D1(dist_rep1).permute(0, 3, 1, 2)
attend_mask = (atom_mask[row1].unsqueeze(-1) @ ligand_mask[col1].unsqueeze(-2))
attend_mask = attend_mask.unsqueeze(1).repeat(1, n_heads, 1, 1)
atom_mask_head = atom_mask.unsqueeze(1).repeat(1, n_heads, 1)
ligand_mask_head = ligand_mask.unsqueeze(1).repeat(1, n_heads, 1)
r_ij = atom_mask_head[row1].unsqueeze(-2) @ attend_logits @ ligand_mask_head[col1].unsqueeze(-1)
r_ij = r_ij.squeeze() / attend_mask.sum(-1).sum(-1)
beta = scatter_softmax(r_ij, col1, dim=0)
attend_logits_res = torch.softmax(attend_logits, dim=-1)
attend_logits_res = attend_logits_res * attend_mask
attend_logits_res = attend_logits_res / (attend_logits_res.norm(1, dim=-1).unsqueeze(-1) + 1e-7)
alpha_ij_Vj = attend_logits_res @ V_lig[col1].transpose(1, 2)
attend_logits_lig = torch.softmax(attend_logits, dim=-2) * attend_mask
        attend_logits_lig = attend_logits_lig / (attend_logits_lig.norm(1, dim=-2).unsqueeze(-2) + 1e-7)
res_H[residue_mask] = res_H[residue_mask] + self.dropout(self.W_O_lig(self.act(alpha_ij_Vj.transpose(1, 2).reshape(residue_mask.sum(), n_channels, -1))))
res_H = res_H * atom_mask.unsqueeze(-1)
alpha_ij_Vj_lig = attend_logits_lig.transpose(-1, -2) @ V[row1].transpose(1, 2)
update_lig_feat = scatter_sum(beta.unsqueeze(-1).unsqueeze(-1) * alpha_ij_Vj_lig, col1, dim=0, dim_size=batch_size)
ligand_feat = ligand_feat + self.dropout(self.W_O_lig1(self.act(update_lig_feat.transpose(1, 2).reshape(batch_size, lig_channel, -1))))
X_ij = res_X[row1].unsqueeze(-2) - ligand_pos[col1].unsqueeze(1)
X_ij = X_ij / (X_ij.norm(2, dim=-1).unsqueeze(-1) + 1e-5)
Q = Q.view(n_nodes, n_channels, -1)
K_lig = K_lig.view(batch_size, lig_channel, -1)
p_idx, q_idx = torch.cartesian_prod(torch.arange(n_channels), torch.arange(lig_channel)).chunk(2, dim=-1)
p_idx, q_idx = p_idx.squeeze(-1), q_idx.squeeze(-1)
f_lig = self.dropout(self.T_e1(torch.cat([Q[row1][:, p_idx].view(-1, self.hidden_channels), K_lig[col1][:, q_idx].view(-1, self.hidden_channels), dist_rep1.view(-1, self.edge_channels)], dim=-1)).view(n_edges, n_channels, lig_channel))
f_res = self.dropout(self.T_e2(torch.cat([Q[row1][:, p_idx].view(-1, self.hidden_channels), K_lig[col1][:, q_idx].view(-1, self.hidden_channels), dist_rep1.view(-1, self.edge_channels)], dim=-1)).view(n_edges, n_channels, lig_channel))
attend_mask = (atom_mask[row1].unsqueeze(-1) @ ligand_mask[col1].unsqueeze(-2)).bool()
f_lig = f_lig * attend_logits_lig.mean(1)
f_res = f_res * attend_mask
ligand_pos = ligand_pos + scatter_sum(beta.mean(-1).unsqueeze(-1).unsqueeze(-1) * (f_lig.unsqueeze(-1) * X_ij).sum(1), col1, dim=0, dim_size=batch_size)
res_X[residue_mask] = res_X[residue_mask] + torch.clamp((f_res.unsqueeze(-1) * - X_ij).mean(-2), min=-3.0, max=3.0)
return res_H, res_X, ligand_pos, ligand_feat
def forward(self, res_H, res_X, res_S, batch, full_seq, ligand_pos, ligand_feat, ligand_mask, edit_residue_num, residue_mask, esmadapter, full_seq_mask, r10_mask):
for e in range(4):
res_H, res_X, ligand_pos, ligand_feat = self._forward(res_H, res_X, res_S, batch, ligand_pos, ligand_feat, ligand_mask, edit_residue_num, residue_mask)
# predict residue types
h_residue = res_H.sum(-2)
batch_size = batch.max().item() + 1
encoder_out = {'feats': torch.zeros(batch_size, full_seq.shape[1], self.hidden_channels).to(self.device)}
encoder_out['feats'][r10_mask] = h_residue.view(-1, self.hidden_channels)
init_pred = full_seq
decode_logits = esmadapter(init_pred, encoder_out)['logits']
pred_res_type = decode_logits[full_seq_mask][:, 4:24]
return res_H, res_X, pred_res_type, ligand_pos
# bilevel layer
class GETLayer(Module):
def __init__(self, hidden_channels=128, edge_channels=64, key_channels=128, num_heads=4, num_interactions=6, k=8,
cutoff=10.0, device='cuda:0', sparse_k=3):
super().__init__()
self.consider_ligand = False
self.sparse_k = sparse_k
self.hidden_channels = hidden_channels
self.edge_channels = edge_channels
self.key_channels = key_channels
self.num_heads = num_heads
self.num_interactions = num_interactions
self.d = int(self.hidden_channels / self.num_heads)
self.k = k
self.cutoff = cutoff
self.device = device
self.esm_refine = True
self.distance_expansion = GaussianSmearing(stop=cutoff, num_gaussians=edge_channels)
# Residue level settings
self.atom_pos_embedding = nn.Embedding(14, self.hidden_channels)
self.residue_embedding = nn.Embedding(21, self.hidden_channels) # one embedding for mask
self.W_Q = nn.Linear(self.hidden_channels, self.hidden_channels, bias=False)
self.W_K = nn.Linear(self.hidden_channels, self.hidden_channels, bias=False)
self.W_V = nn.Linear(self.hidden_channels, self.hidden_channels, bias=False)
self.W_Q_lig = nn.Linear(self.hidden_channels, self.hidden_channels, bias=False)
self.W_K_lig = nn.Linear(self.hidden_channels, self.hidden_channels, bias=False)
self.W_V_lig = nn.Linear(self.hidden_channels, self.hidden_channels, bias=False)
self.W_O = nn.Linear(self.hidden_channels, self.hidden_channels)
self.W_O_lig = nn.Linear(self.hidden_channels, self.hidden_channels)
self.W_O_lig1 = nn.Linear(self.hidden_channels, self.hidden_channels)
self.act = nn.SiLU()
self.layernorm = nn.LayerNorm(hidden_channels)
self.layernorm1 = nn.LayerNorm(hidden_channels)
self.dropout = nn.Dropout(0.1)
self.T_i = nn.Sequential(nn.Linear(3 * self.d, self.d), nn.SiLU(),
nn.Linear(self.d, 1))
self.T_e1 = nn.Sequential(nn.Linear(3 * self.d, self.d), nn.SiLU(),
nn.Linear(self.d, 1))
self.T_e2 = nn.Sequential(nn.Linear(3 * self.hidden_channels, self.hidden_channels), nn.SiLU(),
nn.Linear(self.hidden_channels, self.num_heads))
self.residue_mlp = Linear(hidden_channels, 20, bias=True)
self.sigma_D = Sequential(
Linear(edge_channels, hidden_channels),
nn.SiLU(),
Linear(hidden_channels, num_heads),
)
self.sigma_D1 = Sequential(
Linear(edge_channels, hidden_channels),
nn.SiLU(),
Linear(hidden_channels, num_heads),
)
self.sigma_v = Linear(edge_channels, hidden_channels)
self.block_mlp_invariant = nn.Sequential(nn.Linear(self.d, self.d), nn.SiLU(), nn.Linear(self.d, self.d))
self.block_mlp_equivariant = nn.Sequential(nn.Linear(self.d, self.d), nn.SiLU(), nn.Linear(self.d, self.d))
@property
def out_channels(self):
return self.hidden_channels
def connect_edges(self, res_X, batch):
edge_index = knn_graph(res_X[:, 1], k=self.k, batch=batch, flow='target_to_source')
edge_index, _ = add_self_loops(edge_index, num_nodes=res_X.size(0)) # add self loops
return edge_index
def attention(self, res_H, res_X, atom_mask, batch, residue_mask):
edge_index = self.connect_edges(res_X, batch)
row, col = edge_index
R_ij = torch.cdist(res_X[row], res_X[col], p=2)
dist_rep = self.distance_expansion(R_ij).view(row.shape[0], res_X.shape[1], res_X.shape[1], -1)
n_nodes = res_H.shape[0]
n_edges = edge_index.shape[1]
n_channels = res_X.shape[1]
Q = self.W_Q(res_H).view([n_nodes, n_channels, self.num_heads, self.d])
K = self.W_K(res_H).view([n_nodes, n_channels, self.num_heads, self.d])
V = self.W_V(res_H).view([n_nodes, n_channels, self.num_heads, self.d])
attend_logits = torch.matmul(Q[row].transpose(1, 2), K[col].permute(0, 2, 3, 1)).view(
[n_edges, self.num_heads, n_channels, n_channels])
attend_logits /= np.sqrt(self.d)
attend_logits = attend_logits + self.sigma_D(dist_rep).permute(0, 3, 1, 2) # distance bias
attend_mask = (atom_mask[row].unsqueeze(-1) @ atom_mask[col].unsqueeze(-2)).unsqueeze(1).repeat(1,
self.num_heads,
1, 1)
# sparse attention, only keep top k=3
        attend_logits[torch.logical_not(attend_mask)] = -1e5  # do not select entries that are masked out
_, top_indices = torch.topk(attend_logits, self.sparse_k, dim=-1, largest=True)
sparse_mask = torch.zeros_like(attend_logits, dtype=torch.bool)
rows = torch.arange(attend_logits.size(0)).view(-1, 1, 1, 1).expand(-1, attend_logits.size(1),
attend_logits.size(2), self.sparse_k)
depth = torch.arange(attend_logits.size(1)).view(1, -1, 1, 1).expand(attend_logits.size(0), -1,
attend_logits.size(2), self.sparse_k)
height = torch.arange(attend_logits.size(2)).view(1, 1, -1, 1).expand(attend_logits.size(0),
attend_logits.size(1), -1, self.sparse_k)
sparse_mask[rows, depth, height, top_indices] = True
attend_logits = attend_logits * sparse_mask
attend_logits = attend_logits * attend_mask
# calculate beta
atom_mask_head = atom_mask.unsqueeze(1).repeat(1, self.num_heads, 1)
r_ij = atom_mask_head[row].unsqueeze(-2) @ attend_logits @ atom_mask_head[col].unsqueeze(-1)
        r_ij = r_ij.squeeze() / (attend_mask * sparse_mask).sum(-1).sum(-1)  # average over non-zero entries
beta = scatter_softmax(r_ij, row, dim=0) # [nnodes, num_heads]
attend_logits = torch.softmax(attend_logits, dim=-1)
attend_logits = attend_logits * attend_mask * sparse_mask
        attend_logits = attend_logits / (attend_logits.norm(1, dim=-1).unsqueeze(-1) + 1e-7)  # normalize each row
alpha_ij_Vj = attend_logits @ V[col].transpose(1, 2) # [nedges, num_heads, 14, d]
alpha_ij_Vj = self.block_mlp_invariant(alpha_ij_Vj)
# invariant res_H update
update_H = scatter_sum(beta.unsqueeze(-1).unsqueeze(-1) * alpha_ij_Vj, row, dim=0, dim_size=n_nodes)
update_H = update_H.transpose(1, 2).reshape(n_nodes, n_channels, -1) # (nnodes, 14, heads*d)
res_H = res_H + self.W_O(update_H)
res_H = res_H * atom_mask.unsqueeze(-1) # set empty entry zeros
###################################################################################
# equivariant res_X update
X_ij = res_X[row].unsqueeze(-2) - res_X[col].unsqueeze(1) # [nedges, 14, 14, 3]
X_ij = X_ij / (X_ij.norm(2, dim=-1).unsqueeze(-1) + 1e-5)
X_ij = X_ij.unsqueeze(1).repeat(1, self.num_heads, 1, 1, 1)
X_ij = X_ij * attend_mask.unsqueeze(-1)
dist_rep = self.sigma_v(dist_rep).view([n_edges, n_channels, n_channels, self.num_heads, self.d]).permute(0, 3, 1, 2, 4)
input = torch.cat(
[(Q[row].transpose(1, 2))[rows, depth, height], (K[col].transpose(1, 2))[rows, depth, top_indices],
dist_rep[rows, depth, height, top_indices]], dim=-1)
f = self.T_i(input).view(n_edges, self.num_heads, n_channels, self.sparse_k)
f = f.unsqueeze(-1) * X_ij[rows, depth, height, top_indices].view(n_edges, self.num_heads, n_channels, self.sparse_k, 3)
f = (f * attend_logits[rows, depth, height, top_indices].unsqueeze(-1)).sum(-2) # [nedges, num_heads, 14, 3]
res_X[residue_mask] = res_X[residue_mask] + torch.clamp(
scatter_sum(beta.unsqueeze(-1).unsqueeze(-1) * f, row, dim=0, dim_size=n_nodes).sum(1)[residue_mask],
min=-3.0, max=3.0)
res_X = res_X * atom_mask.unsqueeze(-1) # set empty entries zeros
return res_H, res_X
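    # A commented-out sketch of the top-k gather used in the sparse attention above (toy shapes are
    # assumptions): topk indices are turned into a boolean mask via advanced indexing with broadcast
    # index grids over the edge, head and query dimensions.
    #   logits = torch.randn(2, 4, 14, 14)              # (n_edges, num_heads, n_channels, n_channels)
    #   _, top = torch.topk(logits, 3, dim=-1)          # 3 largest keys per query atom
    #   rows = torch.arange(2).view(-1, 1, 1, 1).expand(-1, 4, 14, 3)
    #   depth = torch.arange(4).view(1, -1, 1, 1).expand(2, -1, 14, 3)
    #   height = torch.arange(14).view(1, 1, -1, 1).expand(2, 4, -1, 3)
    #   sparse = torch.zeros_like(logits, dtype=torch.bool)
    #   sparse[rows, depth, height, top] = True         # keep only the top-3 entries per row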
def attention_ligand(self, res_H, res_X, atom_mask, batch, ligand_pos, ligand_feat, ligand_mask, edit_residue_num,
residue_mask):
batch_size = batch.max().item() + 1
n_nodes = res_H.shape[0]
n_channels = res_X.shape[1]
lig_channel = ligand_feat.shape[1]
row = torch.arange(n_nodes).to(self.device)[residue_mask]
col = torch.repeat_interleave(torch.arange(batch_size).to(self.device), edit_residue_num)
n_edges = row.shape[0]
Q = self.W_Q_lig(res_H).view([n_nodes, n_channels, self.num_heads, self.d])
K_lig = self.W_K_lig(ligand_feat).view([batch_size, lig_channel, self.num_heads, self.d])
V_lig = self.W_V_lig(ligand_feat).view([batch_size, lig_channel, self.num_heads, self.d])
V = self.W_V(res_H).view([n_nodes, n_channels, self.num_heads, self.d])
R_ij = torch.cdist(res_X[row], ligand_pos[col], p=2)
attend_mask = (atom_mask[row].unsqueeze(-1) @ ligand_mask[col].unsqueeze(-2))
R_ij = R_ij * attend_mask
dist_rep1 = self.distance_expansion(R_ij).view(row.shape[0], res_X.shape[1], ligand_pos.shape[1], -1)
attend_logits = torch.matmul(Q[row].transpose(1, 2), K_lig[col].permute(0, 2, 3, 1)).view(
[n_edges, self.num_heads, n_channels, lig_channel])
attend_logits /= np.sqrt(self.d)
attend_logits = attend_logits + self.sigma_D1(dist_rep1).permute(0, 3, 1, 2)
attend_mask = attend_mask.unsqueeze(1).repeat(1, self.num_heads, 1, 1)
# sparse attention, only keep top k=3
        attend_logits[torch.logical_not(attend_mask)] = -1e5  # do not select entries that are masked out
_, top_indices = torch.topk(attend_logits, self.sparse_k, dim=-1, largest=True)
sparse_mask = torch.zeros_like(attend_logits, dtype=torch.bool)
rows = torch.arange(attend_logits.size(0)).view(-1, 1, 1, 1).expand(-1, attend_logits.size(1),
attend_logits.size(2), self.sparse_k)
depth = torch.arange(attend_logits.size(1)).view(1, -1, 1, 1).expand(attend_logits.size(0), -1,
attend_logits.size(2), self.sparse_k)
height = torch.arange(attend_logits.size(2)).view(1, 1, -1, 1).expand(attend_logits.size(0),
attend_logits.size(1), -1, self.sparse_k)
sparse_mask[rows, depth, height, top_indices] = True
attend_logits = attend_logits * attend_mask
attend_logits = attend_logits * sparse_mask
# calculate beta
atom_mask_head = atom_mask.unsqueeze(1).repeat(1, self.num_heads, 1)
ligand_mask_head = ligand_mask.unsqueeze(1).repeat(1, self.num_heads, 1)
r_ij = atom_mask_head[row].unsqueeze(-2) @ attend_logits @ ligand_mask_head[col].unsqueeze(-1)
        r_ij = r_ij.squeeze() / (attend_mask * sparse_mask).sum(-1).sum(-1)  # average over non-zero entries
beta = scatter_softmax(r_ij, col, dim=0)
attend_logits_lig = torch.softmax(attend_logits, dim=-2) * attend_mask
attend_logits_lig = attend_logits_lig / (attend_logits_lig.norm(1, dim=-2).unsqueeze(-2) + 1e-7)
attend_logits = attend_logits * sparse_mask
attend_logits_res = torch.softmax(attend_logits, dim=-1)
attend_logits_res = attend_logits_res * attend_mask * sparse_mask
attend_logits_res = attend_logits_res / (attend_logits_res.norm(1, dim=-1).unsqueeze(-1) + 1e-7)
alpha_ij_Vj = attend_logits_res @ V_lig[col].transpose(1, 2)
# invariant feature update
res_H[residue_mask] = res_H[residue_mask] + self.W_O_lig(
alpha_ij_Vj.transpose(1, 2).reshape(residue_mask.sum(), n_channels, -1))
res_H = res_H * atom_mask.unsqueeze(-1) # set empty entries zeros
alpha_ij_Vj_lig = attend_logits_lig.transpose(-1, -2) @ V[row].transpose(1, 2)
update_lig_feat = scatter_sum(beta.unsqueeze(-1).unsqueeze(-1) * alpha_ij_Vj_lig, col, dim=0,
dim_size=batch_size)
ligand_feat = ligand_feat + self.W_O_lig1(update_lig_feat.transpose(1, 2).reshape(batch_size, lig_channel, -1))
ligand_feat = ligand_feat * ligand_mask.unsqueeze(-1) # set empty entries zeros
###################################################################################
# equivariant res_X update
X_ij = res_X[row].unsqueeze(-2) - ligand_pos[col].unsqueeze(1)
X_ij = X_ij / (X_ij.norm(2, dim=-1).unsqueeze(-1) + 1e-5)
X_ij = X_ij.unsqueeze(1).repeat(1, self.num_heads, 1, 1, 1)
X_ij = X_ij * attend_mask.unsqueeze(-1)
        dist_rep1 = self.sigma_v(dist_rep1).view([n_edges, n_channels, lig_channel, self.num_heads, self.d]).permute(0, 3, 1, 2, 4)
input = torch.cat(
[(Q[row].transpose(1, 2))[rows, depth, height], (K_lig[col].transpose(1, 2))[rows, depth, top_indices],
dist_rep1[rows, depth, height, top_indices]], dim=-1)
f_res = self.T_e1(input).view(n_edges, self.num_heads, n_channels, self.sparse_k)
f_res = f_res.unsqueeze(-1) * X_ij[rows, depth, height, top_indices].view(n_edges, self.num_heads, n_channels,
self.sparse_k, 3)
f_res = (f_res * attend_logits[rows, depth, height, top_indices].unsqueeze(-1)).sum(-2).sum(
1) # [nedges, 14, 3]
res_X[residue_mask] = res_X[residue_mask] + torch.clamp(f_res, min=-3.0, max=3.0)
res_X = res_X * atom_mask.unsqueeze(-1) # set empty entries zeros
p_idx, q_idx = torch.cartesian_prod(torch.arange(n_channels), torch.arange(lig_channel)).chunk(2, dim=-1)
p_idx, q_idx = p_idx.squeeze(-1), q_idx.squeeze(-1)
dist_rep1 = dist_rep1.permute(0, 2, 3, 1, 4)
f_lig = self.T_e2(torch.cat(
[Q[row][:, p_idx].view(-1, self.hidden_channels), K_lig[col][:, q_idx].view(-1, self.hidden_channels),
dist_rep1.view(-1, self.hidden_channels)], dim=-1)).view(n_edges, n_channels, lig_channel, self.num_heads)
# attend_mask = (atom_mask[row].unsqueeze(-1) @ ligand_mask[col].unsqueeze(-2)).bool()
f_lig = f_lig.permute(0, 3, 1, 2) * attend_logits_lig
f_lig = (f_lig.unsqueeze(-1) * X_ij).sum(2) # [n_edges, num_heads, lig_channel, 3]
ligand_pos = ligand_pos + scatter_sum((beta.unsqueeze(-1).unsqueeze(-1) * f_lig).sum(1), col, dim=0,
dim_size=batch_size)
ligand_pos = ligand_pos * ligand_mask.unsqueeze(-1) # set empty entries zeros
return res_H, res_X, ligand_pos, ligand_feat
def attention_res_ligand(self, res_H, res_X, atom_mask, batch, ligand_pos, ligand_feat, ligand_mask, edit_residue_num,
residue_mask):
edge_index = self.connect_edges(res_X, batch)
row, col = edge_index
R_ij = torch.cdist(res_X[row], res_X[col], p=2)
dist_rep = self.distance_expansion(R_ij).view(row.shape[0], res_X.shape[1], res_X.shape[1], -1)
n_nodes = res_H.shape[0]
n_edges = edge_index.shape[1]
n_channels = res_X.shape[1]
Q = self.W_Q(res_H).view([n_nodes, n_channels, self.num_heads, self.d])
K = self.W_K(res_H).view([n_nodes, n_channels, self.num_heads, self.d])
V = self.W_V(res_H).view([n_nodes, n_channels, self.num_heads, self.d])
Q_lig = self.W_Q_lig(res_H).view([n_nodes, n_channels, self.num_heads, self.d])
attend_logits = torch.matmul(Q[row].transpose(1, 2), K[col].permute(0, 2, 3, 1)).view(
[n_edges, self.num_heads, n_channels, n_channels])
attend_logits /= np.sqrt(self.d)
attend_logits = attend_logits + self.sigma_D(dist_rep).permute(0, 3, 1, 2) # distance bias
attend_mask = (atom_mask[row].unsqueeze(-1) @ atom_mask[col].unsqueeze(-2)).unsqueeze(1).repeat(1,
self.num_heads,
1, 1)
# sparse attention, only keep top k=3
        attend_logits[torch.logical_not(attend_mask)] = -1e5  # do not select entries that are masked out
_, top_indices = torch.topk(attend_logits, self.sparse_k, dim=-1, largest=True)
sparse_mask = torch.zeros_like(attend_logits, dtype=torch.bool)
rows = torch.arange(attend_logits.size(0)).view(-1, 1, 1, 1).expand(-1, attend_logits.size(1),
attend_logits.size(2), self.sparse_k)
depth = torch.arange(attend_logits.size(1)).view(1, -1, 1, 1).expand(attend_logits.size(0), -1,
attend_logits.size(2), self.sparse_k)
height = torch.arange(attend_logits.size(2)).view(1, 1, -1, 1).expand(attend_logits.size(0),
attend_logits.size(1), -1, self.sparse_k)
sparse_mask[rows, depth, height, top_indices] = True
attend_logits = attend_logits * sparse_mask
attend_logits = attend_logits * attend_mask
# calculate beta
atom_mask_head = atom_mask.unsqueeze(1).repeat(1, self.num_heads, 1)
r_ij = atom_mask_head[row].unsqueeze(-2) @ attend_logits @ atom_mask_head[col].unsqueeze(-1)
        r_ij = r_ij.squeeze() / (attend_mask * sparse_mask).sum(-1).sum(-1)  # average over non-zero entries
beta = scatter_softmax(r_ij, row, dim=0) # [nnodes, num_heads]
attend_logits = torch.softmax(attend_logits, dim=-1)
attend_logits = attend_logits * attend_mask * sparse_mask
        attend_logits = attend_logits / (attend_logits.norm(1, dim=-1).unsqueeze(-1) + 1e-7)  # normalize each row
alpha_ij_Vj = attend_logits @ V[col].transpose(1, 2) # [nedges, num_heads, 14, d]
alpha_ij_Vj = self.block_mlp_invariant(alpha_ij_Vj)
# invariant res_H update
update_H = scatter_sum(beta.unsqueeze(-1).unsqueeze(-1) * alpha_ij_Vj, row, dim=0, dim_size=n_nodes)
update_H = update_H.transpose(1, 2).reshape(n_nodes, n_channels, -1) # (nnodes, 14, heads*d)
res_H = res_H + self.W_O(update_H)
res_H = res_H * atom_mask.unsqueeze(-1) # set empty entry zeros
###################################################################################
# equivariant res_X update
X_ij = res_X[row].unsqueeze(-2) - res_X[col].unsqueeze(1) # [nedges, 14, 14, 3]
X_ij = X_ij / (X_ij.norm(2, dim=-1).unsqueeze(-1) + 1e-5)
X_ij = X_ij.unsqueeze(1).repeat(1, self.num_heads, 1, 1, 1)
X_ij = X_ij * attend_mask.unsqueeze(-1)
        dist_rep = self.sigma_v(dist_rep).view([n_edges, n_channels, n_channels, self.num_heads, self.d]).permute(0, 3, 1, 2, 4)
input = torch.cat(
[(Q[row].transpose(1, 2))[rows, depth, height], (K[col].transpose(1, 2))[rows, depth, top_indices],
dist_rep[rows, depth, height, top_indices]], dim=-1)
f = self.T_i(input).view(n_edges, self.num_heads, n_channels, self.sparse_k)
f = f.unsqueeze(-1) * X_ij[rows, depth, height, top_indices].view(n_edges, self.num_heads, n_channels,
self.sparse_k, 3)
f = (f * attend_logits[rows, depth, height, top_indices].unsqueeze(-1)).sum(-2) # [nedges, num_heads, 14, 3]
res_X[residue_mask] = res_X[residue_mask] + torch.clamp(
scatter_sum(beta.unsqueeze(-1).unsqueeze(-1) * f, row, dim=0, dim_size=n_nodes).sum(1)[residue_mask],
min=-3.0, max=3.0)
res_X = res_X * atom_mask.unsqueeze(-1) # set empty entries zeros
###############################################################################################################
batch_size = batch.max().item() + 1
n_nodes = res_H.shape[0]
n_channels = res_X.shape[1]
lig_channel = ligand_feat.shape[1]
row = torch.arange(n_nodes).to(self.device)[residue_mask]
col = torch.repeat_interleave(torch.arange(batch_size).to(self.device), edit_residue_num)
n_edges = row.shape[0]
#Q_lig = self.W_Q_lig(res_H).view([n_nodes, n_channels, self.num_heads, self.d])
#Q_lig = Q
K_lig = self.W_K_lig(ligand_feat).view([batch_size, lig_channel, self.num_heads, self.d])
V_lig = self.W_V_lig(ligand_feat).view([batch_size, lig_channel, self.num_heads, self.d])
R_ij = torch.cdist(res_X[row], ligand_pos[col], p=2)
attend_mask = (atom_mask[row].unsqueeze(-1) @ ligand_mask[col].unsqueeze(-2))
R_ij = R_ij * attend_mask
dist_rep1 = self.distance_expansion(R_ij).view(row.shape[0], res_X.shape[1], ligand_pos.shape[1], -1)
attend_logits = torch.matmul(Q_lig[row].transpose(1, 2), K_lig[col].permute(0, 2, 3, 1)).view(
[n_edges, self.num_heads, n_channels, lig_channel])
attend_logits /= np.sqrt(self.d)
attend_logits = attend_logits + self.sigma_D1(dist_rep1).permute(0, 3, 1, 2)
attend_mask = attend_mask.unsqueeze(1).repeat(1, self.num_heads, 1, 1)
# sparse attention, only keep top k=3
        attend_logits[torch.logical_not(attend_mask)] = -1e5  # do not select entries that are masked out
_, top_indices = torch.topk(attend_logits, self.sparse_k, dim=-1, largest=True)
sparse_mask = torch.zeros_like(attend_logits, dtype=torch.bool)
rows = torch.arange(attend_logits.size(0)).view(-1, 1, 1, 1).expand(-1, attend_logits.size(1),
attend_logits.size(2), self.sparse_k)
depth = torch.arange(attend_logits.size(1)).view(1, -1, 1, 1).expand(attend_logits.size(0), -1,
attend_logits.size(2), self.sparse_k)
height = torch.arange(attend_logits.size(2)).view(1, 1, -1, 1).expand(attend_logits.size(0),
attend_logits.size(1), -1, self.sparse_k)
sparse_mask[rows, depth, height, top_indices] = True
attend_logits = attend_logits * attend_mask
attend_logits = attend_logits * sparse_mask
        return_attend = attend_logits.detach().clone()  # snapshot of the residue-ligand attention for downstream use
# calculate beta
atom_mask_head = atom_mask.unsqueeze(1).repeat(1, self.num_heads, 1)
ligand_mask_head = ligand_mask.unsqueeze(1).repeat(1, self.num_heads, 1)
r_ij = atom_mask_head[row].unsqueeze(-2) @ attend_logits @ ligand_mask_head[col].unsqueeze(-1)
        r_ij = r_ij.squeeze() / (attend_mask * sparse_mask).sum(-1).sum(-1)  # average over non-zero entries
beta = scatter_softmax(r_ij, col, dim=0)
attend_logits_lig = torch.softmax(attend_logits, dim=-2) * attend_mask
attend_logits_lig = attend_logits_lig / (attend_logits_lig.norm(1, dim=-2).unsqueeze(-2) + 1e-7)
attend_logits = attend_logits * sparse_mask
attend_logits_res = torch.softmax(attend_logits, dim=-1)
attend_logits_res = attend_logits_res * attend_mask * sparse_mask
attend_logits_res = attend_logits_res / (attend_logits_res.norm(1, dim=-1).unsqueeze(-1) + 1e-7)
alpha_ij_Vj = attend_logits_res @ V_lig[col].transpose(1, 2)
# invariant feature update
res_H[residue_mask] = res_H[residue_mask] + self.W_O_lig(
alpha_ij_Vj.transpose(1, 2).reshape(residue_mask.sum(), n_channels, -1))
res_H = res_H * atom_mask.unsqueeze(-1) # set empty entries zeros
alpha_ij_Vj_lig = attend_logits_lig.transpose(-1, -2) @ V[row].transpose(1, 2)
update_lig_feat = scatter_sum(beta.unsqueeze(-1).unsqueeze(-1) * alpha_ij_Vj_lig, col, dim=0,
dim_size=batch_size)
ligand_feat = ligand_feat + self.W_O_lig1(update_lig_feat.transpose(1, 2).reshape(batch_size, lig_channel, -1))
ligand_feat = ligand_feat * ligand_mask.unsqueeze(-1) # set empty entries zeros
###################################################################################
# equivariant res_X update
X_ij = res_X[row].unsqueeze(-2) - ligand_pos[col].unsqueeze(1)
X_ij = X_ij / (X_ij.norm(2, dim=-1).unsqueeze(-1) + 1e-5)
X_ij = X_ij.unsqueeze(1).repeat(1, self.num_heads, 1, 1, 1)
X_ij = X_ij * attend_mask.unsqueeze(-1)
        dist_rep1 = self.sigma_v(dist_rep1).view([n_edges, n_channels, lig_channel, self.num_heads, self.d]).permute(0, 3, 1, 2, 4)
input = torch.cat(
[(Q_lig[row].transpose(1, 2))[rows, depth, height], (K_lig[col].transpose(1, 2))[rows, depth, top_indices],
dist_rep1[rows, depth, height, top_indices]], dim=-1)
f_res = self.T_e1(input).view(n_edges, self.num_heads, n_channels, self.sparse_k)
f_res = f_res.unsqueeze(-1) * X_ij[rows, depth, height, top_indices].view(n_edges, self.num_heads, n_channels,
self.sparse_k, 3)
f_res = (f_res * attend_logits[rows, depth, height, top_indices].unsqueeze(-1)).sum(-2).sum(
1) # [nedges, 14, 3]
res_X[residue_mask] = res_X[residue_mask] + torch.clamp(f_res, min=-3.0, max=3.0)
res_X = res_X * atom_mask.unsqueeze(-1) # set empty entries zeros
p_idx, q_idx = torch.cartesian_prod(torch.arange(n_channels), torch.arange(lig_channel)).chunk(2, dim=-1)
p_idx, q_idx = p_idx.squeeze(-1), q_idx.squeeze(-1)
dist_rep1 = dist_rep1.permute(0, 2, 3, 1, 4)
f_lig = self.T_e2(torch.cat(
[Q_lig[row][:, p_idx].view(-1, self.hidden_channels), K_lig[col][:, q_idx].view(-1, self.hidden_channels),
dist_rep1.view(-1, self.hidden_channels)], dim=-1)).view(n_edges, n_channels, lig_channel, self.num_heads)
# attend_mask = (atom_mask[row].unsqueeze(-1) @ ligand_mask[col].unsqueeze(-2)).bool()
f_lig = f_lig.permute(0, 3, 1, 2) * attend_logits_lig
f_lig = (f_lig.unsqueeze(-1) * X_ij).sum(2) # [n_edges, num_heads, lig_channel, 3]
ligand_pos = ligand_pos + scatter_sum((beta.unsqueeze(-1).unsqueeze(-1) * f_lig).sum(1), col, dim=0,
dim_size=batch_size)
ligand_pos = ligand_pos * ligand_mask.unsqueeze(-1) # set empty entries zeros
return res_H, res_X, ligand_pos, ligand_feat, return_attend
def forward(self, res_H, res_X, atom_mask, batch, ligand_pos, ligand_feat, ligand_mask, edit_residue_num, residue_mask):
#res_H, res_X = self.attention(res_H, res_X, atom_mask, batch, residue_mask)
res_H, res_X, ligand_pos, ligand_feat, return_attend = self.attention_res_ligand(res_H, res_X, atom_mask, batch, ligand_pos,
ligand_feat, ligand_mask, edit_residue_num,
residue_mask)
pred_res_type = self.residue_mlp(res_H[residue_mask].sum(-2))
return res_H, res_X, ligand_pos, ligand_feat, pred_res_type, return_attend
class EquivariantFFN(nn.Module):
def __init__(self, d_in, d_hidden, d_out, n_channel=1, n_rbf=16, act_fn=nn.SiLU(),
residual=True, dropout=0.1, constant=1, z_requires_grad=True) -> None:
super().__init__()
self.constant = constant
self.residual = residual
self.n_rbf = n_rbf
self.mlp_h = nn.Sequential(
nn.Linear(d_in * 2 + n_channel * n_rbf, d_hidden),
act_fn,
nn.Dropout(dropout),
nn.Linear(d_hidden, d_hidden),
act_fn,
nn.Dropout(dropout),
nn.Linear(d_hidden, d_out),
nn.Dropout(dropout)
)
self.mlp_z = nn.Sequential(
nn.Linear(d_in * 2 + n_channel * n_rbf, d_hidden),
act_fn,
nn.Dropout(dropout),
nn.Linear(d_hidden, d_hidden),
act_fn,
nn.Dropout(dropout),
nn.Linear(d_hidden, 1),
nn.Dropout(dropout)
)
if not z_requires_grad:
for param in self.mlp_z.parameters():
param.requires_grad = False
self.rbf = RadialBasis(n_rbf, 7.0)
def forward(self, H, X, atom_mask, residue_mask):
        '''
        :param H: atom features, [N, 14, d_in]
        :param X: atom coordinates, [N, 14, 3]
        :param atom_mask: valid-atom mask, [N, 14]
        :param residue_mask: residues whose coordinates get updated, [N]
        '''
        radial, (X_c, X_o) = self._radial(X, atom_mask)  # [N, 14, n_rbf], ([N, 3], [N, 14, 3])
# H_c = scatter_mean(H, block_id, dim=0)[block_id] # [N, d_in]
H_c = (H * atom_mask.unsqueeze(-1)).sum(-2) / atom_mask.sum(-1).unsqueeze(-1)
H_c = H_c.unsqueeze(-2).repeat(1, 14, 1)
inputs = torch.cat([H, H_c, radial], dim=-1) # [N, 14, d_in + d_in + n_rbf]
H_update = self.mlp_h(inputs)
H = H + H_update if self.residual else H_update
X_update = X_c.unsqueeze(-2) + self.mlp_z(inputs) * X_o
X[residue_mask] = X_update[residue_mask]
H, X = H * atom_mask.unsqueeze(-1), X * atom_mask.unsqueeze(-1)
return H, X
def _radial(self, X, atom_mask):
X_c = (X * atom_mask.unsqueeze(-1)).sum(-2) / atom_mask.sum(-1).unsqueeze(-1) # center
X_o = X - X_c.unsqueeze(-2) # [N, 14, 3], no translation
X_o = X_o * atom_mask.unsqueeze(-1)
D = stable_norm(X_o, dim=-1) # [N, 14]
radial = self.rbf(D.view(-1)).view(D.shape[0], D.shape[1], -1) # [N, 14, n_rbf]
return radial, (X_c, X_o)
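# A minimal, commented-out usage sketch for EquivariantFFN (toy shapes are assumptions;
# reuses the module-level residue_atom_mask defined above):
#   ffn = EquivariantFFN(d_in=128, d_hidden=256, d_out=128)
#   H = torch.randn(5, 14, 128)
#   X = torch.randn(5, 14, 3)
#   atom_mask = residue_atom_mask[torch.zeros(5, dtype=torch.long)]   # (5, 14)
#   residue_mask = torch.ones(5, dtype=torch.bool)
#   H, X = ffn(H, X, atom_mask, residue_mask)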
class InvariantLayerNorm(nn.Module):
def __init__(self, d_hidden) -> None:
super().__init__()
self.layernorm = nn.LayerNorm(d_hidden)
self.layernorm1 = nn.LayerNorm(d_hidden)
def forward(self, res_H, ligand_feat, atom_mask, ligand_mask):
res_H[atom_mask.bool()] = self.layernorm(res_H[atom_mask.bool()])
ligand_feat[ligand_mask.bool()] = self.layernorm1(ligand_feat[ligand_mask.bool()])
return res_H, ligand_feat
class GET(nn.Module):
'''Equivariant Adaptive Block Transformer'''
def __init__(self, hidden_channels=128, edge_channels=64, key_channels=128, num_heads=4, num_interactions=6, k=8,
cutoff=10.0, device='cuda:0', n_layers=4, pre_norm=False, sparse_k=3):
super().__init__()
        '''
        :param hidden_channels: number of hidden features
        :param edge_channels: number of features in the Gaussian distance expansion
        :param key_channels: number of key features (kept for interface compatibility)
        :param num_heads: number of attention heads
        :param k: number of nearest-neighbour residues used to build the residue graph
        :param cutoff: cutoff of the Gaussian distance expansion
        :param device: device the residue_atom_mask lookup table is placed on
        :param n_layers: number of GETLayer blocks
        :param pre_norm: apply invariant layer norm before the first block
        :param sparse_k: number of entries kept per query in the sparse attention
        '''
self.n_layers = n_layers
self.pre_norm = pre_norm
self.sparse_k = sparse_k
self.residue_atom_mask = residue_atom_mask.to(device)
if self.pre_norm:
self.pre_layernorm = InvariantLayerNorm(hidden_channels)
for i in range(0, n_layers):
self.add_module(f'layer_{i}',
GETLayer(hidden_channels, edge_channels, key_channels, num_heads, num_interactions, k,
cutoff, device))
self.add_module(f'layernorm0_{i}', InvariantLayerNorm(hidden_channels))
self.add_module(f'ffn_{i}', EquivariantFFN(
hidden_channels, 2 * hidden_channels, hidden_channels,
))
self.add_module(f'layernorm1_{i}', InvariantLayerNorm(hidden_channels))
def forward(self, res_H, res_X, res_S, batch, ligand_pos, ligand_feat, ligand_mask, edit_residue_num, residue_mask):
atom_mask = self.residue_atom_mask[res_S]
if self.pre_norm:
res_H, ligand_feat = self.pre_layernorm(res_H, ligand_feat, atom_mask, ligand_mask)
for i in range(self.n_layers):
res_H, res_X, ligand_pos, ligand_feat, pred_res_type, attend = self._modules[f'layer_{i}'](res_H, res_X, atom_mask,
batch, ligand_pos,
ligand_feat, ligand_mask,
edit_residue_num,
residue_mask)
res_H, ligand_feat = self._modules[f'layernorm0_{i}'](res_H, ligand_feat, atom_mask, ligand_mask)
res_H, res_X = self._modules[f'ffn_{i}'](res_H, res_X, atom_mask, residue_mask)
res_H, ligand_feat = self._modules[f'layernorm1_{i}'](res_H, ligand_feat, atom_mask, ligand_mask)
return res_H, res_X, ligand_pos, ligand_feat, pred_res_type, attend
def stable_norm(input, *args, **kwargs):
    # NOTE: the early return below bypasses the clamped variant, so plain torch.norm is used;
    # the remaining lines (which clamp magnitudes away from zero before the norm) are kept for reference.
    return torch.norm(input, *args, **kwargs)
    input = input.clone()
    with torch.no_grad():
        sign = torch.sign(input)
        input = torch.abs(input)
        input.clamp_(min=1e-10)
        input = sign * input
    return torch.norm(input, *args, **kwargs)
if __name__ == '__main__':
hidden_channels = 128
edge_channels = 64
key_channels = 128
num_heads = 4
device = torch.device('cuda:0')
model = GET()
model.to(device)
model.eval()
res_H = torch.rand(10, 14, hidden_channels).to(device)
res_X = torch.rand(10, 14, 3).to(device)
res_S = torch.ones(10, dtype=torch.long)
atom_mask = residue_atom_mask[res_S].to(device)
ligand_mask = torch.tensor([[1., 1, 1, 0, 0], [1, 1, 1, 1, 1]]).to(device)
batch = torch.tensor([0, 0, 0, 1, 1, 1, 1, 1, 1, 1], device=device)
residue_mask = torch.ones(10, device=device).bool()
edit_residue_num = torch.tensor([3, 7], device=device)
ligand_pos, ligand_feat = torch.rand(2, 5, 3).to(device), torch.rand(2, 5, hidden_channels).to(device)
ligand_pos = ligand_pos * ligand_mask.unsqueeze(-1)
res_X = res_X * atom_mask.unsqueeze(-1)
U, _, V = torch.linalg.svd(torch.randn(3, 3, device=device, dtype=torch.float))
if torch.linalg.det(U) * torch.linalg.det(V) < 0:
U[:, -1] = -U[:, -1]
Q1, t1 = U.mm(V), torch.randn(3, device=device)
U, _, V = torch.linalg.svd(torch.randn(3, 3, device=device, dtype=torch.float))
if torch.linalg.det(U) * torch.linalg.det(V) < 0:
U[:, -1] = -U[:, -1]
Q2, t2 = U.mm(V), torch.randn(3, device=device)
    res_H1, res_X1, ligand_pos1, ligand_feat1, pred_res_type1, attend1 = model(
        copy.deepcopy(res_H), res_X, res_S, batch, ligand_pos, copy.deepcopy(ligand_feat),
        ligand_mask, edit_residue_num, residue_mask)
res_X1 = copy.deepcopy(res_X1.detach())
res_X1[batch == 0] = torch.matmul(res_X1[batch == 0], Q1) + t1
res_X1[batch == 1] = torch.matmul(res_X1[batch == 1], Q2) + t2
res_X1 = res_X1 * atom_mask.unsqueeze(-1)
ligand_pos1[0] = torch.matmul(ligand_pos1[0], Q1) + t1
ligand_pos1[1] = torch.matmul(ligand_pos1[1], Q2) + t2
ligand_pos1 = ligand_pos1 * ligand_mask.unsqueeze(-1)
res_X[batch == 0] = torch.matmul(res_X[batch == 0], Q1) + t1
res_X[batch == 1] = torch.matmul(res_X[batch == 1], Q2) + t2
res_X = res_X * atom_mask.unsqueeze(-1)
ligand_pos[0] = torch.matmul(ligand_pos[0], Q1) + t1
ligand_pos[1] = torch.matmul(ligand_pos[1], Q2) + t2
ligand_pos = ligand_pos * ligand_mask.unsqueeze(-1)
    res_H2, res_X2, ligand_pos2, ligand_feat2, pred_res_type2, attend2 = model(
        copy.deepcopy(res_H), res_X, res_S, batch, ligand_pos, copy.deepcopy(ligand_feat),
        ligand_mask, edit_residue_num, residue_mask)
print((res_X1 - res_X2).norm())
print((res_H1 - res_H2).float().norm())
print((pred_res_type1 - pred_res_type2).norm())
print((ligand_pos1 - ligand_pos2).norm())
print((ligand_feat1 - ligand_feat2).norm())