|
|
import os |
|
|
import re |
|
|
import pandas as pd |
|
|
from io import StringIO |
|
|
import rdkit |
|
|
from rdkit import Chem |
|
|
from rdkit.Chem import AllChem, Draw |
|
|
import numpy as np |
|
|
from PIL import Image, ImageDraw, ImageFont |
|
|
import matplotlib.pyplot as plt |
|
|
import matplotlib.patches as patches |
|
|
from io import BytesIO |
|
|
import tempfile |
|
|
from rdkit import Chem |
|
|
|
|
|
class PeptideAnalyzer:
    """Heuristic, string-based analyzer that maps a peptide SMILES to residues.

    The SMILES string is cut at textual backbone-bond patterns
    (``split_on_bonds``) and every resulting fragment is classified by plain
    substring / regex tests (``identify_residue``).  Apart from ``is_peptide``
    no chemistry toolkit is involved, so results depend on how the SMILES
    happens to be written.

    NOTE(review): this class was reconstructed from a copy of the file whose
    indentation had been lost.  Inside each group of checks, tests following a
    guard ``if`` are assumed to be nested under that guard -- confirm against
    the canonical source.
    """

    # Ring-closure digits considered when looking for proline-like rings.
    _RING_DIGITS = '123456789'

    def __init__(self):
        """Set up the backbone-bond regexes and the 3-letter -> 1-letter map."""
        # Order matters: earlier patterns claim their characters first in
        # split_on_bonds, later patterns may only match unclaimed text.
        self.bond_patterns = [
            (r'OC\(=O\)', 'ester'),
            (r'N\(C\)C\(=O\)', 'n_methyl'),
            (r'N[0-9]C\(=O\)', 'proline'),
            (r'NC\(=O\)', 'peptide'),
            (r'C\(=O\)N\(C\)', 'n_methyl_reverse'),
            (r'C\(=O\)N[12]?', 'peptide_reverse'),
        ]

        self.three_to_one = {
            'Ala': 'A', 'Cys': 'C', 'Asp': 'D', 'Glu': 'E',
            'Phe': 'F', 'Gly': 'G', 'His': 'H', 'Ile': 'I',
            'Lys': 'K', 'Leu': 'L', 'Met': 'M', 'Asn': 'N',
            'Pro': 'P', 'Gln': 'Q', 'Arg': 'R', 'Ser': 'S',
            'Thr': 'T', 'Val': 'V', 'Trp': 'W', 'Tyr': 'Y',
        }

    def is_peptide(self, smiles):
        """Return True if *smiles* parses and contains an amide-like bond.

        Two SMARTS queries are tried: a secondary amide ``[NH][C](=O)`` and an
        N-substituted amide.  Unparseable SMILES yield False.
        """
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return False

        for smarts in ('[NH][C](=O)', '[N;H0;$(NC)](C)[C](=O)'):
            if mol.HasSubstructMatch(Chem.MolFromSmarts(smarts)):
                return True
        return False

    def is_cyclic(self, smiles):
        """Guess whether the peptide backbone is cyclic.

        Returns:
            ``(is_cyclic, peptide_cycles, aromatic_cycles)``.  A SMILES ending
            in ``C(=O)O`` is always reported as linear.

        NOTE(review): entries of ``peptide_cycles`` are whole regex matches
        (the ring digit plus the character before it, e.g. ``'N1'``), whereas
        ``aromatic_cycles`` holds bare digits, so the aromatic filter below
        rarely removes anything.  Kept as-is to preserve results.
        """
        if smiles.endswith('C(=O)O'):
            return False, [], []

        # Ring-closure digits that are not preceded by an aromatic carbon.
        ring_numbers = re.findall(r'(?:^|[^c])[0-9](?=[A-Z@\(\)])', smiles)

        # Digits that belong to benzene / indole-like aromatic rings.
        aromatic_cycles = []
        for match in re.findall(r'c[0-9](?:ccccc|c\[nH\]c)[0-9]', smiles):
            aromatic_cycles.extend(re.findall(r'[0-9]', match))

        peptide_cycles = [n for n in ring_numbers if n not in aromatic_cycles]
        return len(peptide_cycles) > 0, peptide_cycles, aromatic_cycles

    def split_on_bonds(self, smiles):
        """Split *smiles* into residue segments at backbone-bond patterns.

        Returns:
            A list of dicts with key ``'content'`` and, where applicable,
            ``'bond_before'`` / ``'bond_after'`` holding the matched bond text.
            An empty list is returned when no bond pattern matches.

        NOTE(review): only positions up to the second-to-last one are turned
        into middle segments, so a glycine pattern that is the *last* match is
        not emitted as its own ``'NCC(=O)'`` segment.  Kept as-is.
        """
        positions = []
        used = set()  # character indices already claimed by a match

        def claim(match, bond_type):
            # Accept a match only if none of its characters is taken yet.
            span = range(match.start(), match.end())
            if used.isdisjoint(span):
                positions.append({
                    'start': match.start(),
                    'end': match.end(),
                    'type': bond_type,
                    'pattern': match.group(),
                })
                used.update(span)

        # Glycine is matched first because its backbone contains no
        # side-chain text for the generic patterns to leave behind.
        for match in re.finditer(r'NCC\(=O\)', smiles):
            claim(match, 'gly')

        for pattern, bond_type in self.bond_patterns:
            for match in re.finditer(pattern, smiles):
                claim(match, bond_type)

        positions.sort(key=lambda pos: pos['start'])

        segments = []
        if not positions:
            return segments

        first, last = positions[0], positions[-1]

        # Text before the first bond.
        if first['start'] > 0:
            segments.append({
                'content': smiles[0:first['start']],
                'bond_after': first['pattern'],
            })

        # Text between consecutive bonds.
        for i in range(len(positions) - 1):
            current = positions[i]
            next_pos = positions[i + 1]

            if current['type'] == 'gly':
                segments.append({
                    'content': 'NCC(=O)',
                    'bond_before': positions[i - 1]['pattern'] if i > 0 else None,
                    'bond_after': next_pos['pattern'],
                })
            else:
                content = smiles[current['end']:next_pos['start']]
                if content:
                    segments.append({
                        'content': content,
                        'bond_before': current['pattern'],
                        'bond_after': next_pos['pattern'],
                    })

        # Text after the last bond.
        if last['end'] < len(smiles):
            segments.append({
                'content': smiles[last['end']:],
                'bond_before': last['pattern'],
            })

        return segments

    def clean_terminal_carboxyl(self, segment):
        """Strip a parenthesised C-terminal ``(C(=O)O)`` from the last segment.

        Only segments without a ``bond_after`` (i.e. the true terminus) are
        touched; every other segment's content is returned unchanged.
        """
        content = segment['content']

        if 'C(=O)O' in content and not segment.get('bond_after'):
            cleaned = re.sub(r'\(C\(=O\)O\)', '', content)
            # Drop an empty branch left behind by the removal.
            return re.sub(r'\(\)', '', cleaned)
        return content

    def _looks_like_proline(self, content, bond_before, bond_after):
        """Return True if the segment matches one of the proline ring shapes."""
        digits = self._RING_DIGITS

        # True when *any* ring digit closes on a stereo carbon; the digit is
        # deliberately not tied to the one tested in the outer conditions.
        ring_on_stereo_carbon = any(
            f'[C@@H]{d}' in content or f'[C@H]{d}' in content for d in digits
        )

        for n in digits:
            if (bond_after.startswith(f'N{n}C(=O)') and 'CCC' in content
                    and ring_on_stereo_carbon):
                return True
            if (f'CCCN{n}' in content and content.endswith('=O')
                    and ring_on_stereo_carbon):
                return True
            if content in (f'CCC[C@H]{n}', f'CCC[C@@H]{n}') and \
                    bond_before.startswith(f'C(=O)N{n}'):
                return True
            if f'N{n}CCC[C@H]{n}' in content or f'N{n}CCC[C@@H]{n}' in content:
                return True
        return False

    def identify_residue(self, segment):
        """Classify one segment produced by ``split_on_bonds``.

        Args:
            segment: dict with ``'content'`` and optional ``'bond_before'`` /
                ``'bond_after'`` entries.

        Returns:
            ``(residue, mods)`` where *residue* is a residue code string or
            ``None`` if nothing matched, and *mods* is the list returned by
            ``get_modifications``.

        The checks are ordered; the first match wins, so more specific
        patterns must stay ahead of more general ones.
        """
        content = self.clean_terminal_carboxyl(segment)
        mods = self.get_modifications(segment)

        # Treat a missing or None bond as the empty string.
        bond_before = segment.get('bond_before') or ''
        bond_after = segment.get('bond_after') or ''

        # Phenylglycine-like.
        if 'c1ccccc1' in content:
            if '[C@@H](c1ccccc1)' in content or '[C@H](c1ccccc1)' in content:
                return '4', mods

        # para-substituted phenylalanines.
        if 'Cc1ccc' in content:
            if 'OMe' in content or 'OCc1ccc' in content:
                return '0A1', mods
            elif 'Clc1ccc' in content:
                return '200', mods
            elif 'Brc1ccc' in content:
                return '4BF', mods
            elif 'C#Nc1ccc' in content:
                return '4CF', mods
            elif 'Ic1ccc' in content:
                return 'PHI', mods
            elif 'Fc1ccc' in content:
                return 'PFF', mods

        # Substituted tryptophans.
        if 'c[nH]c2' in content:
            if 'Oc2cccc2' in content:
                return '0AF', mods
            elif 'Fc2cccc2' in content:
                return '4FW', mods
            elif 'Clc2cccc2' in content:
                return '6CW', mods
            elif 'Brc2cccc2' in content:
                return 'BTR', mods
            elif 'COc2cccc2' in content:
                # NOTE(review): 'COc2cccc2' contains 'Oc2cccc2', so the
                # '0AF' branch above always wins first.
                return 'MOT5', mods
            elif 'Cc2cccc2' in content:
                return 'MTR5', mods

        if 'CC(C)(C)[C@@H]' in content or 'CC(C)(C)[C@H]' in content:
            return 'BUG', mods

        # Citrulline carries a urea group, C(=O); the guanidine form
        # 'CCCNC(=N)N' is arginine and is handled by the Arg checks below.
        if 'CCCNC(=O)N' in content:
            return 'CIR', mods

        if '[SeH]' in content:
            return 'CSE', mods

        if '[NH3]CC[C@@H]' in content or '[NH3]CC[C@H]' in content:
            return 'DAB', mods

        if 'C1CCCCC1' in content:
            if 'C1CCCCC1[C@@H]' in content or 'C1CCCCC1[C@H]' in content:
                return 'CHG', mods
            elif 'C1CCCCC1C[C@@H]' in content or 'C1CCCCC1C[C@H]' in content:
                return 'ALC', mods

        # Naphthyl.
        if 'c1cccc2c1cccc2' in content:
            if 'c1cccc2c1cccc2[C@@H]' in content or 'c1cccc2c1cccc2[C@H]' in content:
                return 'NAL', mods

        # Heteroaromatic side chains.
        if 'c1cncc' in content:
            return 'PYR4', mods
        if 'c1cscc' in content:
            return 'THA3', mods
        if 'c1nnc' in content:
            return 'TRZ4', mods

        # Phosphorylated Ser / Thr.
        if 'OP(O)(O)O' in content:
            if '[C@@H](COP' in content or '[C@H](COP' in content:
                return 'SEP', mods
            elif '[C@@H](OP' in content or '[C@H](OP' in content:
                return 'TPO', mods

        # Polycyclic / cage side chains.
        if 'c1c2ccccc2cc2c1cccc2' in content:
            return 'ANTH', mods
        if 'c1csc2c1cccc2' in content:
            return 'BTH3', mods
        if '[C@]12C[C@H]3C[C@@H](C2)C[C@@H](C1)C3' in content:
            return 'ADAM', mods

        # Trifluoromethyl variants.
        if 'FC(F)(F)' in content:
            if 'CC(F)(F)F' in content:
                return 'FLA', mods
            if 'C(F)(F)F)c1' in content:
                if 'c1ccccc1C(F)(F)F' in content:
                    return 'TFG2', mods
                if 'c1cccc(c1)C(F)(F)F' in content:
                    return 'TFG3', mods
                if 'c1ccc(cc1)C(F)(F)F' in content:
                    return 'TFG4', mods

        # Di-halogenated aromatics.
        if 'F' in content and 'c1' in content:
            if 'c1ccc(c(c1)F)F' in content:
                return 'F2F', mods
            if 'cc(F)cc(c1)F' in content:
                return 'WFP', mods
        if 'Cl' in content and 'c1' in content:
            if 'c1ccc(cc1Cl)Cl' in content:
                return 'CP24', mods
            if 'c1ccc(c(c1)Cl)Cl' in content:
                return 'CP34', mods

        # Di-hydroxy aromatics.
        if 'O' in content and 'c1' in content:
            if 'c1cc(O)cc(c1)O' in content:
                return '3FG', mods
            if 'c1ccc(c(c1)O)O' in content:
                return 'DAH', mods

        # Saturated carbocycles.
        if 'C1CCCC1' in content:
            return 'CPA3', mods
        if 'C1CCCCC1' in content:
            if 'CC1CCCCC1' in content:
                return 'ALC', mods
            else:
                return 'CHG', mods

        # Short linear alkyl chains.
        if 'CCC[C@@H]' in content or 'CCC[C@H]' in content:
            return 'NLE', mods
        if 'CC[C@@H]' in content or 'CC[C@H]' in content:
            if not any(x in content for x in ['CC(C)', 'COC', 'CN(']):
                return 'ABA', mods

        # Fluoro-histidine variants.
        if 'c1cnc' in content:
            if '[C@@H]1CN[C@@H](N1)F' in content:
                return '2HF', mods
            if 'c1cnc([nH]1)F' in content:
                return '2HF1', mods
            if 'c1c[nH]c(n1)F' in content:
                return '2HF2', mods

        # Sulfur-containing side chains ('[SeH]' was already handled above).
        if 'S' in content:
            if 'CSCc1ccccc1' in content:
                return 'BCS', mods
            if 'CCSC' in content:
                return 'ESC', mods
            if 'CCS' in content:
                return 'HCS', mods

        # Miscellaneous nitrogen-rich side chains.
        if 'CN=[N]=N' in content:
            return 'AZDA', mods
        if '[NH]=[C](=[NH2])=[NH2]' in content:
            if 'CCC[NH]=' in content:
                return 'AGM', mods
            if 'CC[NH]=' in content:
                return 'GDPR', mods

        if 'CCON' in content:
            return 'CAN', mods
        if '[C@@H]1C=C[C@@H](C=C1)' in content:
            return 'ACZ', mods
        if 'CCC(=O)[NH3]' in content:
            return 'ONL', mods
        if 'c1ccncc1' in content:
            return 'PYR4', mods
        if 'c1ccco1' in content:
            return 'FUA2', mods

        if 'c1ccc' in content:
            if 'c1ccc(cc1)c1ccccc1' in content:
                return 'BIF', mods
            if 'c1ccc(cc1)C(=O)c1ccccc1' in content:
                return 'PBF', mods
            if 'c1ccc(cc1)C(C)(C)C' in content:
                return 'TBP4', mods
            if 'c1ccc(cc1)[C](=[NH2])=[NH2]' in content:
                return '0BN', mods
            if 'c1cccc(c1)[C](=[NH2])=[NH2]' in content:
                return 'APM', mods

        # Oxygen-containing side chains.
        if 'O' in content:
            if '[C@H]([C@H](C)O)O' in content:
                return 'ILX', mods
            if '[C@H]([C@@H](C)O)O' in content:
                return 'ALO', mods
            if '[C@H](COP(O)(O)O)' in content:
                return 'SEP', mods
            if '[C@H]([C@@H](C)OP(O)(O)O)' in content:
                return 'TPO', mods
            if '[C@H](c1ccc(O)cc1)O' in content:
                return 'OMX', mods
            if '[C@H](c1ccc(c(Cl)c1)O)O' in content:
                return 'OMY', mods

        # N-heterocycles.
        if 'n1' in content:
            if 'n1cccn1' in content:
                return 'PYZ1', mods
            if 'n1nncn1' in content:
                return 'TEZA', mods
            if 'c2c(n1)cccc2' in content:
                return 'QU32', mods
            if 'c1cnc2c(c1)cccc2' in content:
                return 'QU33', mods
            if 'c1ccnc2c1cccc2' in content:
                return 'QU34', mods
            if 'c1ccc2c(c1)nccc2' in content:
                return 'QU35', mods
            if 'c1ccc2c(c1)cncc2' in content:
                return 'QU36', mods
            if 'c1cnc2c(n1)cccc2' in content:
                return 'QX32', mods

        # Amino / guanidino side chains ('[NH3]CC[C@@H]' -> 'DAB' was
        # already handled above).
        if 'N' in content:
            if '[NH3]C[C@@H]' in content:
                return 'DPP', mods
            if '[NH3]CCCCCC[C@@H]' in content:
                return 'HHK', mods
            if 'CCC[NH]=[C](=[NH2])=[NH2]' in content:
                return 'GBUT', mods
            if '[NH]=[C](=S)=[NH2]' in content:
                return 'THIC', mods

        # Branched / extended alkyl side chains.
        if 'CC' in content:
            if 'CCCC[C@@H]' in content:
                return 'AHP', mods
            if 'CCC([C@@H])(C)C' in content:
                return 'I2M', mods
            if 'CC[C@H]([C@@H])C' in content:
                return 'IIL', mods
            if '[C@H](CCC(C)C)' in content:
                return 'HLEU', mods
            if '[C@@H]([C@@H](C)O)C' in content:
                return 'HLU', mods

        # Side chains with a second stereocentre.
        if '[C@@H]' in content:
            if '[C@@H](C[C@@H](F))' in content:
                return 'FGA4', mods
            if '[C@@H](C[C@@H](O))' in content:
                return '3GL', mods
            if '[C@@H](C[C@H](C))' in content:
                return 'LME', mods
            if '[C@@H](CC[C@H](C))' in content:
                return 'MEG', mods

        if 'S' in content:
            if 'SCC[C@@H]' in content:
                return 'HSER', mods
            if 'SCCN' in content:
                return 'SLZ', mods
            if 'SC(=O)' in content:
                return 'CSA', mods
            if '[S@@](=O)' in content:
                return 'SME', mods
            if 'S(=O)(=O)' in content:
                return 'OMT', mods

        # Alkenyl side chains.
        if 'C=' in content:
            if 'C=C[C@@H]' in content:
                # NOTE(review): the original also mapped this identical
                # pattern to 'LVG' in a following, unreachable branch.
                return '2AG', mods
            if 'C=Cc1ccccc1' in content:
                return 'STYA', mods

        if '[C@@H]1Cc2c(C1)cccc2' in content:
            return 'IGL', mods
        if '[C](=[C](=O)=O)=O' in content:
            return '26P', mods
        if '[C](=[C](=O)=O)=C' in content:
            return '2NP', mods
        if 'c2cnc[nH]2' in content:
            return 'HIS', mods
        if 'c1cccc2c1cc(O)cc2' in content:
            return 'NAO1', mods
        if 'c1ccc2c(c1)cc(O)cc2' in content:
            return 'NAO2', mods

        # --- Standard residues -------------------------------------------
        if self._looks_like_proline(content, bond_before, bond_after):
            return 'Pro', mods

        if re.search(r'c[0-9]c\[nH\]c[0-9]ccccc[0-9][0-9]', content) and \
                'c[nH]c' in content.replace(' ', ''):
            return 'Trp', mods

        if '[C@@H](CCCCN)' in content or '[C@H](CCCCN)' in content:
            return 'Lys', mods

        if '[C@@H](CCCNC(=N)N)' in content or '[C@H](CCCNC(=N)N)' in content:
            return 'Arg', mods

        if ('C[C@H](CCCC)' in content or 'C[C@@H](CCCC)' in content) and 'CC(C)' not in content:
            return 'Nle', mods

        if ('C[C@H](CCCN)' in content or 'C[C@@H](CCCN)' in content) and 'CC(C)' not in content:
            return 'Orn', mods

        if ('Cc3cc2ccccc2c3' in content) and ('C[C@H]' in content or 'C[C@@H]' in content):
            return '2Nal', mods

        if 'N2CCCCC2' in content or 'CCCCC2' in content:
            return 'Cha', mods

        if ('C[C@H](CC)' in content or 'C[C@@H](CC)' in content) and \
                not any(p in content for p in ['CC(C)', 'CCCC', 'CCC(C)']):
            return 'Abu', mods

        if ('N3CCCCC3' in content or 'CCCCC3' in content) and \
                ('C[C@H]' in content or 'C[C@@H]' in content):
            return 'Pip', mods

        if 'C[C@H](C1CCCCC1)' in content or 'C[C@@H](C1CCCCC1)' in content:
            return 'Chg', mods

        if ('Cc2ccc(F)cc2' in content) and ('C[C@H]' in content or 'C[C@@H]' in content):
            return '4F-Phe', mods

        # Glycine: either the explicit backbone or a lone CH2 carbon.
        if ('NCC(=O)' in content) or (content == 'C'):
            if bond_before and bond_after:
                # Middle of the chain.
                if 'C(=O)N' in bond_before:
                    return 'Gly', mods
            elif bond_before.startswith('C(=O)N'):
                # C-terminal position.
                return 'Gly', mods

        if 'CC(C)C[C@H]' in content or 'CC(C)C[C@@H]' in content:
            return 'Leu', mods
        if '[C@@H](CC(C)C)' in content or '[C@H](CC(C)C)' in content:
            return 'Leu', mods

        if '[C@@H]([C@@H](C)O)' in content or '[C@H]([C@H](C)O)' in content:
            return 'Thr', mods

        if '[C@H](Cc2ccccc2)' in content or '[C@@H](Cc2ccccc2)' in content:
            return 'Phe', mods

        if ('[C@H](C(C)C)' in content or
                '[C@@H](C(C)C)' in content or
                '[C@H]C(C)C' in content or
                '[C@@H]C(C)C' in content):
            if not any(p in content for p in ['CC(C)C[C@H]', 'CC(C)C[C@@H]']):
                return 'Val', mods

        if '[C@H](COC(C)(C)C)' in content or '[C@@H](COC(C)(C)C)' in content:
            return 'O-tBu', mods

        if any([
            'CC[C@H](C)' in content,
            'CC[C@@H](C)' in content,
            'C(C)C[C@H]' in content and 'CC(C)C' not in content,
            'C(C)C[C@@H]' in content and 'CC(C)C' not in content,
        ]):
            return 'Ile', mods

        if '[C@H](C)' in content or '[C@@H](C)' in content:
            if not any(p in content for p in
                       ['C(C)C', 'COC', 'CN(', 'C(C)O', 'CC[C@H]', 'CC[C@@H]']):
                return 'Ala', mods

        if re.search(r'Cc[0-9]ccc\(O\)cc[0-9]', content):
            return 'Tyr', mods

        if '[C@H](CO)' in content or '[C@@H](CO)' in content:
            if not ('C(C)O' in content or 'COC' in content):
                return 'Ser', mods

        # Looser Thr fallback; the stricter two-stereocentre forms were
        # already tested above and both contain these substrings.
        if '[C@@H](C)O' in content or '[C@H](C)O' in content:
            return 'Thr', mods

        if '[C@H](CS)' in content or '[C@@H](CS)' in content:
            return 'Cys', mods

        if 'C[C@H](CCSC)' in content or 'C[C@@H](CCSC)' in content:
            return 'Met', mods

        has_alpha_carbon = 'C[C@H]' in content or 'C[C@@H]' in content

        if 'CC(=O)N' in content and has_alpha_carbon:
            return 'Asn', mods

        if 'CCC(=O)N' in content and has_alpha_carbon:
            return 'Gln', mods

        if 'CC(=O)O' in content and has_alpha_carbon:
            return 'Asp', mods

        if 'CCC(=O)O' in content and has_alpha_carbon:
            return 'Glu', mods

        if 'CCCNC(=N)N' in content and has_alpha_carbon:
            return 'Arg', mods

        if 'Cc2cnc[nH]2' in content and has_alpha_carbon:
            return 'His', mods

        return None, mods

    def get_modifications(self, segment):
        """Derive modification tags from the bond that follows *segment*.

        Returns a list that may contain ``'N-Me'`` (bond text contains
        ``N(C)``) and/or ``'O-linked'`` (bond text contains ``OC(=O)``).
        """
        mods = []
        bond_after = segment.get('bond_after')
        if bond_after:
            # A 'C(=O)N(C)' prefix also contains 'N(C)', so one test suffices.
            if 'N(C)' in bond_after:
                mods.append('N-Me')
            if 'OC(=O)' in bond_after:
                mods.append('O-linked')
        return mods

    def _identify_sequence(self, smiles):
        """Split *smiles*, classify each segment and print a debug trace.

        Returns ``(segments, sequence)`` where *sequence* lists the identified
        residues (with modifications in parentheses); unidentified segments
        are reported on stdout and skipped.
        """
        print("\nAnalyzing structure:", smiles)

        segments = self.split_on_bonds(smiles)

        print("\nSegment Analysis:")
        sequence = []
        for i, segment in enumerate(segments):
            print(f"\nSegment {i}:")
            print(f"Content: {segment['content']}")
            print(f"Bond before: {segment.get('bond_before', 'None')}")
            print(f"Bond after: {segment.get('bond_after', 'None')}")

            residue, mods = self.identify_residue(segment)
            if not residue:
                print(f"Warning: Could not identify residue in segment: {segment['content']}")
                continue

            sequence.append(f"{residue}({','.join(mods)})" if mods else residue)
            print(f"Identified as: {residue}")
            print(f"Modifications: {mods}")

        return segments, sequence

    def analyze_structure(self, smiles):
        """Analyze *smiles* and return ``(three_letter_sequence, n_segments)``.

        The sequence is ``'-'``-joined and wrapped in ``cyclo(...)`` when
        ``is_cyclic`` reports a cyclic backbone.  A debug trace, including the
        one-letter code, is printed to stdout.
        """
        segments, sequence = self._identify_sequence(smiles)

        is_cyclic, _peptide_cycles, _aromatic_cycles = self.is_cyclic(smiles)
        three_letter = '-'.join(sequence)
        # Strip any "(mods)" suffix before the lookup; unknown codes -> 'X'.
        one_letter = ''.join(
            self.three_to_one.get(aa.split('(')[0], 'X') for aa in sequence
        )

        if is_cyclic:
            three_letter = f"cyclo({three_letter})"
            one_letter = f"cyclo({one_letter})"

        print(f"\nFinal sequence: {three_letter}")
        print(f"One-letter code: {one_letter}")
        print(f"Is cyclic: {is_cyclic}")

        return three_letter, len(segments)

    def return_sequence(self, smiles):
        """Return the list of identified residues for *smiles*.

        Prints the same per-segment debug trace as ``analyze_structure`` but
        performs no cyclic detection or joining.
        """
        _segments, sequence = self._identify_sequence(smiles)
        return sequence
|
|
|
|
|
""" |
|
|
def annotate_cyclic_structure(mol, sequence): |
|
|
'''Create annotated 2D structure with clear, non-overlapping residue labels''' |
|
|
# Generate 2D coordinates |
|
|
# Generate 2D coordinates |
|
|
AllChem.Compute2DCoords(mol) |
|
|
|
|
|
# Create drawer with larger size for annotations |
|
|
drawer = Draw.rdMolDraw2D.MolDraw2DCairo(2000, 2000) # Even larger size |
|
|
|
|
|
# Get residue list and reverse it to match structural representation |
|
|
if sequence.startswith('cyclo('): |
|
|
residues = sequence[6:-1].split('-') |
|
|
else: |
|
|
residues = sequence.split('-') |
|
|
residues = list(reversed(residues)) # Reverse the sequence |
|
|
|
|
|
# Draw molecule first to get its bounds |
|
|
drawer.drawOptions().addAtomIndices = False |
|
|
drawer.DrawMolecule(mol) |
|
|
drawer.FinishDrawing() |
|
|
|
|
|
# Convert to PIL Image |
|
|
img = Image.open(BytesIO(drawer.GetDrawingText())) |
|
|
draw = ImageDraw.Draw(img) |
|
|
|
|
|
try: |
|
|
# Try to use DejaVuSans as it's commonly available on Linux systems |
|
|
font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 60) |
|
|
small_font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 60) |
|
|
except OSError: |
|
|
try: |
|
|
# Fallback to Arial if available (common on Windows) |
|
|
font = ImageFont.truetype("arial.ttf", 60) |
|
|
small_font = ImageFont.truetype("arial.ttf", 60) |
|
|
except OSError: |
|
|
# If no TrueType fonts are available, fall back to default |
|
|
print("Warning: TrueType fonts not available, using default font") |
|
|
font = ImageFont.load_default() |
|
|
small_font = ImageFont.load_default() |
|
|
# Get molecule bounds |
|
|
conf = mol.GetConformer() |
|
|
positions = [] |
|
|
for i in range(mol.GetNumAtoms()): |
|
|
pos = conf.GetAtomPosition(i) |
|
|
positions.append((pos.x, pos.y)) |
|
|
|
|
|
x_coords = [p[0] for p in positions] |
|
|
y_coords = [p[1] for p in positions] |
|
|
min_x, max_x = min(x_coords), max(x_coords) |
|
|
min_y, max_y = min(y_coords), max(y_coords) |
|
|
|
|
|
# Calculate scaling factors |
|
|
scale = 150 # Increased scale factor |
|
|
center_x = 1000 # Image center |
|
|
center_y = 1000 |
|
|
|
|
|
# Add residue labels in a circular arrangement around the structure |
|
|
n_residues = len(residues) |
|
|
radius = 700 # Distance of labels from center |
|
|
|
|
|
# Start from the rightmost point (3 o'clock position) and go counterclockwise |
|
|
# Offset by -3 positions to align with structure |
|
|
offset = 0 # Adjust this value to match the structure alignment |
|
|
for i, residue in enumerate(residues): |
|
|
# Calculate position in a circle around the structure |
|
|
# Start from 0 (3 o'clock) and go counterclockwise |
|
|
angle = -(2 * np.pi * ((i + offset) % n_residues) / n_residues) |
|
|
|
|
|
# Calculate label position |
|
|
label_x = center_x + radius * np.cos(angle) |
|
|
label_y = center_y + radius * np.sin(angle) |
|
|
|
|
|
# Draw residue label |
|
|
text = f"{i+1}. {residue}" |
|
|
bbox = draw.textbbox((label_x, label_y), text, font=font) |
|
|
padding = 10 |
|
|
draw.rectangle([bbox[0]-padding, bbox[1]-padding, |
|
|
bbox[2]+padding, bbox[3]+padding], |
|
|
fill='white', outline='white') |
|
|
draw.text((label_x, label_y), text, |
|
|
font=font, fill='black', anchor="mm") |
|
|
|
|
|
# Add sequence at the top with white background |
|
|
seq_text = f"Sequence: {sequence}" |
|
|
bbox = draw.textbbox((center_x, 100), seq_text, font=small_font) |
|
|
padding = 10 |
|
|
draw.rectangle([bbox[0]-padding, bbox[1]-padding, |
|
|
bbox[2]+padding, bbox[3]+padding], |
|
|
fill='white', outline='white') |
|
|
draw.text((center_x, 100), seq_text, |
|
|
font=small_font, fill='black', anchor="mm") |
|
|
|
|
|
return img |
|
|
|
|
|
""" |
|
|
def annotate_cyclic_structure(mol, sequence):
    """Render *mol* as a 2000x2000 image with a sequence header on top.

    Args:
        mol: RDKit molecule; 2D coordinates are (re)computed in place.
        sequence: text shown after ``"Sequence: "`` in the header.

    Returns:
        A PIL ``Image`` of the drawing with the header centred at (1000, 100)
        on a white background box.
    """
    AllChem.Compute2DCoords(mol)

    drawer = Draw.rdMolDraw2D.MolDraw2DCairo(2000, 2000)
    drawer.drawOptions().addAtomIndices = False
    drawer.DrawMolecule(mol)
    drawer.FinishDrawing()

    img = Image.open(BytesIO(drawer.GetDrawingText()))
    draw = ImageDraw.Draw(img)

    # Try a common Linux font, then a common Windows font, then the default.
    try:
        small_font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 60)
    except OSError:
        try:
            small_font = ImageFont.truetype("arial.ttf", 60)
        except OSError:
            print("Warning: TrueType fonts not available, using default font")
            small_font = ImageFont.load_default()

    seq_text = f"Sequence: {sequence}"
    header_xy = (1000, 100)
    header_anchor = "mm"

    # Measure with the same anchor the text is drawn with; otherwise the
    # background box is computed for left-aligned text and ends up offset
    # from the centred header.
    bbox = draw.textbbox(header_xy, seq_text, font=small_font, anchor=header_anchor)
    padding = 10
    draw.rectangle([bbox[0] - padding, bbox[1] - padding,
                    bbox[2] + padding, bbox[3] + padding],
                   fill='white', outline='white')
    draw.text(header_xy, seq_text,
              font=small_font, fill='black', anchor=header_anchor)

    return img
|
|
|
|
|
def create_enhanced_linear_viz(sequence, smiles):
    """Build a two-panel matplotlib figure describing a peptide.

    The top panel shows one box per residue of *sequence* joined by bond
    lines; the bottom panel lists every segment that ``PeptideAnalyzer``
    extracts from *smiles* together with its identification.

    Args:
        sequence: ``'-'``-joined residue string, optionally wrapped in
            ``cyclo(...)``.
        smiles: SMILES string the sequence was derived from.

    Returns:
        The matplotlib ``Figure``.
    """
    analyzer = PeptideAnalyzer()

    fig = plt.figure(figsize=(15, 10))
    gs = fig.add_gridspec(2, 1, height_ratios=[1, 2])
    ax_struct = fig.add_subplot(gs[0])
    ax_detail = fig.add_subplot(gs[1])

    is_cyclic_sequence = sequence.startswith('cyclo(')
    if is_cyclic_sequence:
        residues = sequence[6:-1].split('-')
    else:
        residues = sequence.split('-')

    segments = analyzer.split_on_bonds(smiles)

    print(f"Number of residues: {len(residues)}")
    print(f"Number of segments: {len(segments)}")

    # ---- Top panel: residue boxes and bonds -----------------------------
    ax_struct.set_xlim(0, 10)
    ax_struct.set_ylim(0, 2)

    num_residues = len(residues)
    spacing = 9.0 / (num_residues - 1) if num_residues > 1 else 9.0

    y_pos = 1.5
    for i in range(num_residues):
        x_pos = 0.5 + i * spacing

        rect = patches.Rectangle((x_pos - 0.3, y_pos - 0.2), 0.6, 0.4,
                                 facecolor='lightblue', edgecolor='black')
        ax_struct.add_patch(rect)

        if i < num_residues - 1:
            segment = segments[i] if i < len(segments) else None
            if segment:
                # 'bond_after' holds raw SMILES bond text, so the tags have
                # to come from get_modifications rather than from searching
                # that text for 'O-linked' / 'N-Me'.
                bond_mods = analyzer.get_modifications(segment)
                bond_type = 'ester' if 'O-linked' in bond_mods else 'peptide'
                is_n_methylated = 'N-Me' in bond_mods

                bond_color = 'red' if bond_type == 'ester' else 'black'
                linestyle = '--' if bond_type == 'ester' else '-'

                ax_struct.plot([x_pos + 0.3, x_pos + spacing - 0.3], [y_pos, y_pos],
                               color=bond_color, linestyle=linestyle, linewidth=2)

                mid_x = x_pos + spacing / 2
                bond_label = f"{bond_type}"
                if is_n_methylated:
                    bond_label += "\n(N-Me)"
                ax_struct.text(mid_x, y_pos + 0.1, bond_label,
                               ha='center', va='bottom', fontsize=10,
                               color=bond_color)

        ax_struct.text(x_pos, y_pos - 0.5, residues[i],
                       ha='center', va='top', fontsize=14)

    # ---- Bottom panel: per-segment breakdown ----------------------------
    ax_detail.set_ylim(0, len(segments) + 1)
    ax_detail.set_xlim(0, 1)

    segment_y = len(segments)
    for i, segment in enumerate(segments):
        y = segment_y - i

        residue, mods = analyzer.identify_residue(segment)
        if residue:
            text = f"Residue {i+1}: {residue}"
            if mods:
                text += f" ({', '.join(mods)})"
            color = 'blue'
        else:
            # Unidentified segment: describe the bond that follows it,
            # using the modification tags rather than the raw bond text.
            text = f"Bond {i}: "
            if 'O-linked' in mods:
                text += "ester"
            elif 'N-Me' in mods:
                text += "peptide (N-methylated)"
            else:
                text += "peptide"
            color = 'red'

        ax_detail.text(0.05, y, text, fontsize=12, color=color)
        ax_detail.text(0.5, y, f"SMILES: {segment.get('content', '')}", fontsize=10, color='gray')

    if is_cyclic_sequence:
        ax_struct.annotate('', xy=(9.5, y_pos), xytext=(0.5, y_pos),
                           arrowprops=dict(arrowstyle='<->', color='red', lw=2))
        ax_struct.text(5, y_pos + 0.3, 'Cyclic Connection',
                       ha='center', color='red', fontsize=14)

    ax_struct.set_title("Peptide Structure Overview", pad=20)
    ax_detail.set_title("Segment Analysis Breakdown", pad=20)

    for ax in [ax_struct, ax_detail]:
        ax.set_xticks([])
        ax.set_yticks([])
        ax.axis('off')

    plt.tight_layout()
    return fig
|
|
|
|
|
class PeptideStructureGenerator: |
|
|
"""A class to generate 3D structures of peptides using different embedding methods""" |
|
|
|
|
|
@staticmethod |
|
|
def prepare_molecule(smiles): |
|
|
"""Prepare molecule with proper hydrogen handling""" |
|
|
mol = Chem.MolFromSmiles(smiles, sanitize=False) |
|
|
if mol is None: |
|
|
raise ValueError("Failed to create molecule from SMILES") |
|
|
|
|
|
|
|
|
for atom in mol.GetAtoms(): |
|
|
atom.UpdatePropertyCache(strict=False) |
|
|
|
|
|
|
|
|
Chem.SanitizeMol(mol, |
|
|
sanitizeOps=Chem.SANITIZE_FINDRADICALS| |
|
|
Chem.SANITIZE_KEKULIZE| |
|
|
Chem.SANITIZE_SETAROMATICITY| |
|
|
Chem.SANITIZE_SETCONJUGATION| |
|
|
Chem.SANITIZE_SETHYBRIDIZATION| |
|
|
Chem.SANITIZE_CLEANUPCHIRALITY) |
|
|
|
|
|
mol = Chem.AddHs(mol) |
|
|
return mol |
|
|
|
|
|
@staticmethod |
|
|
def get_etkdg_params(attempt=0): |
|
|
"""Get ETKDG parameters with optional modifications based on attempt number""" |
|
|
params = AllChem.ETKDGv3() |
|
|
params.randomSeed = -1 |
|
|
params.maxIterations = 200 |
|
|
params.numThreads = 4 |
|
|
params.useBasicKnowledge = True |
|
|
params.enforceChirality = True |
|
|
params.useExpTorsionAnglePrefs = True |
|
|
params.useSmallRingTorsions = True |
|
|
params.useMacrocycleTorsions = True |
|
|
params.ETversion = 2 |
|
|
params.pruneRmsThresh = -1 |
|
|
params.embedRmsThresh = 0.5 |
|
|
|
|
|
if attempt > 10: |
|
|
params.bondLength = 1.5 + (attempt - 10) * 0.02 |
|
|
params.useExpTorsionAnglePrefs = False |
|
|
|
|
|
return params |
|
|
|
|
|
def generate_structure_etkdg(self, smiles, max_attempts=20): |
|
|
"""Generate 3D structure using ETKDG without UFF optimization""" |
|
|
success = False |
|
|
mol = None |
|
|
|
|
|
for attempt in range(max_attempts): |
|
|
try: |
|
|
mol = self.prepare_molecule(smiles) |
|
|
params = self.get_etkdg_params(attempt) |
|
|
|
|
|
if AllChem.EmbedMolecule(mol, params) == 0: |
|
|
success = True |
|
|
break |
|
|
except Exception as e: |
|
|
continue |
|
|
|
|
|
if not success: |
|
|
raise ValueError("Failed to generate structure with ETKDG") |
|
|
|
|
|
return mol |
|
|
|
|
|
def generate_structure_uff(self, smiles, max_attempts=20): |
|
|
"""Generate 3D structure using ETKDG followed by UFF optimization""" |
|
|
best_mol = None |
|
|
lowest_energy = float('inf') |
|
|
|
|
|
for attempt in range(max_attempts): |
|
|
try: |
|
|
test_mol = self.prepare_molecule(smiles) |
|
|
params = self.get_etkdg_params(attempt) |
|
|
|
|
|
if AllChem.EmbedMolecule(test_mol, params) == 0: |
|
|
res = AllChem.UFFOptimizeMolecule(test_mol, maxIters=2000, |
|
|
vdwThresh=10.0, confId=0, |
|
|
ignoreInterfragInteractions=True) |
|
|
|
|
|
if res == 0: |
|
|
ff = AllChem.UFFGetMoleculeForceField(test_mol) |
|
|
if ff: |
|
|
current_energy = ff.CalcEnergy() |
|
|
if current_energy < lowest_energy: |
|
|
lowest_energy = current_energy |
|
|
best_mol = Chem.Mol(test_mol) |
|
|
except Exception: |
|
|
continue |
|
|
|
|
|
if best_mol is None: |
|
|
raise ValueError("Failed to generate optimized structure") |
|
|
|
|
|
return best_mol |
|
|
|
|
|
@staticmethod
def mol_to_sdf_bytes(mol):
    """Serialize an RDKit molecule as SDF text and return it as UTF-8 bytes."""
    text_buffer = StringIO()

    sd_writer = Chem.SDWriter(text_buffer)
    sd_writer.write(mol)
    # The writer is closed before the buffer contents are read back.
    sd_writer.close()

    sdf_text = text_buffer.getvalue()
    return sdf_text.encode('utf-8')
|
|
|
|
|
def process_input(smiles_input=None, file_obj=None, show_linear=False, |
|
|
show_segment_details=False, generate_3d=False, use_uff=False): |
|
|
"""Process input and create visualizations using PeptideAnalyzer""" |
|
|
analyzer = PeptideAnalyzer() |
|
|
temp_dir = tempfile.mkdtemp() if generate_3d else None |
|
|
structure_files = [] |
|
|
|
|
|
|
|
|
if smiles_input: |
|
|
smiles = smiles_input.strip() |
|
|
|
|
|
|
|
|
if not analyzer.is_peptide(smiles): |
|
|
return "Error: Input SMILES does not appear to be a peptide structure.", None, None |
|
|
|
|
|
try: |
|
|
|
|
|
mol = Chem.MolFromSmiles(smiles) |
|
|
if mol is None: |
|
|
return "Error: Invalid SMILES notation.", None, None |
|
|
|
|
|
|
|
|
if generate_3d: |
|
|
generator = PeptideStructureGenerator() |
|
|
|
|
|
try: |
|
|
|
|
|
mol_etkdg = generator.generate_structure_etkdg(smiles) |
|
|
etkdg_path = os.path.join(temp_dir, "structure_etkdg.sdf") |
|
|
writer = Chem.SDWriter(etkdg_path) |
|
|
writer.write(mol_etkdg) |
|
|
writer.close() |
|
|
structure_files.append(etkdg_path) |
|
|
|
|
|
|
|
|
if use_uff: |
|
|
mol_uff = generator.generate_structure_uff(smiles) |
|
|
uff_path = os.path.join(temp_dir, "structure_uff.sdf") |
|
|
writer = Chem.SDWriter(uff_path) |
|
|
writer.write(mol_uff) |
|
|
writer.close() |
|
|
structure_files.append(uff_path) |
|
|
|
|
|
except Exception as e: |
|
|
return f"Error generating 3D structures: {str(e)}", None, None, None |
|
|
|
|
|
|
|
|
segments = analyzer.split_on_bonds(smiles) |
|
|
|
|
|
|
|
|
sequence_parts = [] |
|
|
output_text = "" |
|
|
|
|
|
|
|
|
if show_segment_details: |
|
|
output_text += "Segment Analysis:\n" |
|
|
for i, segment in enumerate(segments): |
|
|
output_text += f"\nSegment {i}:\n" |
|
|
output_text += f"Content: {segment['content']}\n" |
|
|
output_text += f"Bond before: {segment.get('bond_before', 'None')}\n" |
|
|
output_text += f"Bond after: {segment.get('bond_after', 'None')}\n" |
|
|
|
|
|
residue, mods = analyzer.identify_residue(segment) |
|
|
if residue: |
|
|
if mods: |
|
|
sequence_parts.append(f"{residue}({','.join(mods)})") |
|
|
else: |
|
|
sequence_parts.append(residue) |
|
|
output_text += f"Identified as: {residue}\n" |
|
|
output_text += f"Modifications: {mods}\n" |
|
|
else: |
|
|
output_text += f"Warning: Could not identify residue in segment: {segment['content']}\n" |
|
|
output_text += "\n" |
|
|
else: |
|
|
|
|
|
for segment in segments: |
|
|
residue, mods = analyzer.identify_residue(segment) |
|
|
if residue: |
|
|
if mods: |
|
|
sequence_parts.append(f"{residue}({','.join(mods)})") |
|
|
else: |
|
|
sequence_parts.append(residue) |
|
|
|
|
|
|
|
|
is_cyclic, peptide_cycles, aromatic_cycles = analyzer.is_cyclic(smiles) |
|
|
three_letter = '-'.join(sequence_parts) |
|
|
one_letter = ''.join(analyzer.three_to_one.get(aa.split('(')[0], 'X') for aa in sequence_parts) |
|
|
|
|
|
if is_cyclic: |
|
|
three_letter = f"cyclo({three_letter})" |
|
|
one_letter = f"cyclo({one_letter})" |
|
|
|
|
|
|
|
|
img_cyclic = annotate_cyclic_structure(mol, three_letter) |
|
|
|
|
|
|
|
|
img_linear = None |
|
|
if show_linear: |
|
|
fig_linear = create_enhanced_linear_viz(three_letter, smiles) |
|
|
buf = BytesIO() |
|
|
fig_linear.savefig(buf, format='png', bbox_inches='tight', dpi=300) |
|
|
buf.seek(0) |
|
|
img_linear = Image.open(buf) |
|
|
plt.close(fig_linear) |
|
|
|
|
|
|
|
|
summary = "Summary:\n" |
|
|
summary += f"Sequence: {three_letter}\n" |
|
|
summary += f"One-letter code: {one_letter}\n" |
|
|
summary += f"Is Cyclic: {'Yes' if is_cyclic else 'No'}\n" |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if structure_files: |
|
|
summary += "\n3D Structures Generated:\n" |
|
|
for filepath in structure_files: |
|
|
summary += f"- {os.path.basename(filepath)}\n" |
|
|
|
|
|
return summary + output_text, img_cyclic, img_linear, structure_files if structure_files else None |
|
|
|
|
|
except Exception as e: |
|
|
return f"Error processing SMILES: {str(e)}", None, None, None |
|
|
|
|
|
|
|
|
if file_obj is not None: |
|
|
try: |
|
|
|
|
|
if hasattr(file_obj, 'name'): |
|
|
with open(file_obj.name, 'r') as f: |
|
|
content = f.read() |
|
|
else: |
|
|
content = file_obj.decode('utf-8') if isinstance(file_obj, bytes) else str(file_obj) |
|
|
|
|
|
output_text = "" |
|
|
for line in content.splitlines(): |
|
|
smiles = line.strip() |
|
|
if smiles: |
|
|
|
|
|
if not analyzer.is_peptide(smiles): |
|
|
output_text += f"Skipping non-peptide SMILES: {smiles}\n" |
|
|
continue |
|
|
|
|
|
|
|
|
segments = analyzer.split_on_bonds(smiles) |
|
|
sequence_parts = [] |
|
|
|
|
|
|
|
|
if show_segment_details: |
|
|
output_text += f"\nSegment Analysis for SMILES: {smiles}\n" |
|
|
for i, segment in enumerate(segments): |
|
|
output_text += f"\nSegment {i}:\n" |
|
|
output_text += f"Content: {segment['content']}\n" |
|
|
output_text += f"Bond before: {segment.get('bond_before', 'None')}\n" |
|
|
output_text += f"Bond after: {segment.get('bond_after', 'None')}\n" |
|
|
residue, mods = analyzer.identify_residue(segment) |
|
|
if residue: |
|
|
if mods: |
|
|
sequence_parts.append(f"{residue}({','.join(mods)})") |
|
|
else: |
|
|
sequence_parts.append(residue) |
|
|
output_text += f"Identified as: {residue}\n" |
|
|
output_text += f"Modifications: {mods}\n" |
|
|
else: |
|
|
for segment in segments: |
|
|
residue, mods = analyzer.identify_residue(segment) |
|
|
if residue: |
|
|
if mods: |
|
|
sequence_parts.append(f"{residue}({','.join(mods)})") |
|
|
else: |
|
|
sequence_parts.append(residue) |
|
|
|
|
|
|
|
|
is_cyclic, peptide_cycles, aromatic_cycles = analyzer.is_cyclic(smiles) |
|
|
sequence = f"cyclo({'-'.join(sequence_parts)})" if is_cyclic else '-'.join(sequence_parts) |
|
|
|
|
|
output_text += f"\nSummary for SMILES: {smiles}\n" |
|
|
output_text += f"Sequence: {sequence}\n" |
|
|
output_text += f"Is Cyclic: {'Yes' if is_cyclic else 'No'}\n" |
|
|
if is_cyclic: |
|
|
output_text += f"Peptide Cycles: {', '.join(peptide_cycles)}\n" |
|
|
|
|
|
output_text += "-" * 50 + "\n" |
|
|
|
|
|
return output_text, None, None |
|
|
|
|
|
except Exception as e: |
|
|
return f"Error processing file: {str(e)}", None, None |
|
|
|
|
|
return "No input provided.", None, None |
|
|
|
|
|
|