Coverage for src / molecular_simulations / build / build_ligand.py: 2%
298 statements
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-12 10:07 -0600
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-12 10:07 -0600
1#!/usr/bin/env python
2from .build_amber import ImplicitSolvent, ExplicitSolvent
3import gc
4from glob import glob
5import json
6from MDAnalysis.lib.util import convert_aa_code
7from openbabel import pybel
8from openmm import *
9from openmm.app import *
10import os
11from pathlib import Path
12from pdbfixer import PDBFixer
13from pdbfixer.pdbfixer import Sequence
14from rdkit import Chem
15import shutil
16from typing import Dict, List, Union
18PathLike = Union[str, Path]
19OptPath = Union[str, Path, None]
20Sequences = List[Sequence]
22class LigandError(Exception):
23 """
24 Custom Ligand exception to catch parameterization errors.
25 """
26 def __init__(self, message='This system contains ligands which we cannot model!'):
27 self.message = message
28 super().__init__(self.message)
31class LigandBuilder:
32 """
33 Parameterizes a ligand molecule and generates all relevant force field files for
34 running tleap.
35 """
36 def __init__(self, path: PathLike, lig: str, lig_number: int=0, file_prefix: str=''):
37 self.path = path
38 self.lig = path / lig
39 self.ln = lig_number
40 self.out_lig = path / f'{file_prefix}{lig.stem}'
42 def parameterize_ligand(self) -> None:
43 """
44 Ensures consistent treatment of all ligand sdf files, generating
45 GAFF2 parameters in the form of .frcmod and .lib files. Produces
46 a mol2 file for coordinates and connectivity and ensures that
47 antechamber did not fail. Hydrogens are added in rdkit which
48 generally does a good job of this.
50 Returns:
51 None
52 """
53 ext = self.lig.suffix
54 self.lig = self.lig.stem
56 convert_to_gaff = f'antechamber -i {self.lig}_prep.mol2 -fi mol2 -o \
57 {self.out_lig}.mol2 -fo mol2 -at gaff2 -c bcc -s 0 -pf y -rn LG{self.ln}'
58 parmchk2 = f'parmchk2 -i {self.out_lig}.mol2 -f mol2 -o {self.out_lig}.frcmod'
60 tleap_ligand = f"""source leaprc.gaff2
61 LG{self.ln} = loadmol2 {self.out_lig}.mol2
62 loadamberparams {self.out_lig}.frcmod
63 saveoff LG{self.ln} {self.out_lig}.lib
64 quit
65 """
67 if ext == '.sdf':
68 self.process_sdf()
69 else:
70 self.process_pdb()
72 self.convert_to_mol2()
73 os.system(convert_to_gaff)
74 try:
75 self.move_antechamber_outputs()
76 self.check_sqm()
77 os.system(parmchk2)
78 leap_file, leap_log = self.write_leap(tleap_ligand)
79 os.system(f'tleap -f {leap_file} > {leap_log}')
80 except FileNotFoundError:
81 raise LigandError(f'Antechamber failed! {self.lig}')
83 def process_sdf(self) -> None:
84 """
85 Add hydrogens in rdkit. Atom hybridization is taken from the
86 input sdf file and if this is incorrect, hydrogens will be wrong
87 too.
89 Returns:
90 None
91 """
92 mol = Chem.SDMolSupplier(f'{self.lig}.sdf')[0]
93 molH = Chem.AddHs(mol, addCoords=True)
94 with Chem.SDWriter(f'{self.lig}_H.sdf') as w:
95 w.write(molH)
97 def process_pdb(self) -> None:
98 """
99 Ingests a PDB file of a small molecule, adds hydrogens and writes out to
100 an SDF file.
102 Returns:
103 None
104 """
105 mol = Chem.MolFromPDBFile(f'{self.lig}.pdb')
106 molH = Chem.AddHs(mol, addCoords=True)
107 with Chem.SDWriter(f'{self.lig}_H.sdf') as w:
108 w.write(molH)
110 def convert_to_mol2(self) -> None:
111 """
112 Converts an sdf file to mol2 format using obabel.
114 Returns:
115 None
116 """
117 mol = list(pybel.readfile('sdf', f'{self.lig}_H.sdf'))[0]
118 mol.write('mol2', f'{self.lig}_prep.mol2', True)
120 def move_antechamber_outputs(self) -> None:
121 """
122 Remove unneccessary outputs from antechamber. Keep the
123 sqm.out file as proof that antechamber did not fail.
125 Returns:
126 None
127 """
128 os.remove('sqm.in')
129 os.remove('sqm.pdb')
130 shutil.move('sqm.out', f'{self.lig}_sqm.out')
132 def check_sqm(self) -> None:
133 """
134 Checks for evidence that antechamber calculations exited
135 successfully. This is always on the second to last line,
136 and if not present, indicates that we failed to produce
137 sane parameters for this molecule. In that case, I wish
138 you good luck.
140 Returns:
141 None
142 """
143 line = open(f'{self.lig}_sqm.out').readlines()[-2]
145 if 'Calculation Completed' not in line:
146 # make sqm.in more tolerant
147 # manually run sqm
148 # if it still fails then we raise an error
149 raise LigandError(f'SQM failed for ligand {self.lig}!')
151 def write_leap(self,
152 inp: str) -> str:
153 """
154 Writes out a tleap input file and returns the path
155 to the file.
156 """
157 leap_file = f'{self.path}/tleap.in'
158 leap_log = f'{self.path}/leap.log'
159 with open(leap_file, 'w') as outfile:
160 outfile.write(inp)
162 return leap_file, leap_log
165class PLINDERBuilder(ImplicitSolvent):
166 """
167 Builds complexes consisting of a biomolecule pdb and small molecule ligand.
168 Runs antechamber workflow to generate gaff2 parameters.
169 """
170 def __init__(self,
171 path: PathLike,
172 system_id: str,
173 out: PathLike,
174 **kwargs):
175 super().__init__(path / system_id, 'receptor.pdb', out / system_id,
176 protein=True, rna=True, dna=True, phos_protein=True,
177 mod_protein=True, **kwargs)
178 self.system_id = system_id
179 self.ffs.append('leaprc.gaff2')
180 self.build_dir = self.out / 'build'
181 self.ions = None
183 def build(self) -> None:
184 ligs = self.migrate_files()
186 if not ligs:
187 print(f'No ligands!\n\n{self.pdb}')
188 raise LigandError
190 self.ligs = self.ligand_handler(ligs)
191 self.assemble_system()
193 def ligand_handler(self, ligs: List[PathLike]) -> List[PathLike]:
194 ligands = []
195 for i, lig in enumerate(ligs):
196 lig_builder = LigandBuilder(self.build_dir, lig, i)
197 lig_builder.parameterize_ligand()
198 ligands.append(os.path.basename(lig)[:-4])
200 return ligands
202 def migrate_files(self) -> List[str]:
203 os.makedirs(str(self.build_dir), exist_ok=True)
204 os.chdir(self.build_dir) # necessary for antechamber outputs
206 # grab the sequence file to complete protein modeling
207 shutil.copy(str(self.path / 'sequences.fasta'),
208 str(self.build_dir))
209 self.fasta = str(self.build_dir / 'sequences.fasta')
211 # fix and move pdb
212 self.prep_protein()
214 # move ligand(s)
215 ligands = []
216 lig_files = self.path / 'ligand_files'
217 ligs = [Path(lig) for lig in glob(str(lig_files) + '/*.sdf')]
218 for lig in ligs:
219 shutil.copy(str(lig),
220 str(self.build_dir))
222 if self.check_ligand(lig):
223 ligands.append(lig.name)
226 # handle any potential ions
227 if self.ions is not None:
228 self.ffs.append('leaprc.water.tip3p')
229 self.place_ions()
231 return ligands
233 def place_ions(self) -> None:
234 """
235 This is horrible and I apologize profusely if you find yourself
236 having to go through the following. Good luck.
237 """
238 pdb_lines = open(self.pdb).readlines()[:-1]
240 if 'END' in pdb_lines[-1]:
241 if 'TER' in pdb_lines[-2]:
242 ln = -3
243 else:
244 ln = -2
245 elif 'TER' in pdb_lines[-1]:
246 ln = -2
247 else:
248 ln = -1
250 try:
251 next_atom_num = int(pdb_lines[ln][6:12].strip()) + 1
252 next_resid = int(pdb_lines[ln][22:26].strip()) + 1
253 except ValueError:
254 print(f'ERROR: {self.pdb}')
255 raise LigandError
257 for ion in self.ions:
258 for atom in ion:
259 # HERE BE DRAGONS
260 ion_line = f'ATOM {next_atom_num:>5}'
262 if atom[0].lower() in ['na', 'k', 'cl']:
263 ionname = atom[0] + atom[1]
264 ion_line += f'{ionname:>4} {ionname:<3} '
265 else:
266 ionname = atom[0].upper()
267 ion_line += f'{ionname:>3} {ionname:<3}'
269 coords = ''.join([f'{x:>8.3f}' for x in atom[2:]])
270 ion_line += f'{next_resid:>5} {coords} 0.00 0.00\n'
272 pdb_lines.append(ion_line)
273 pdb_lines.append('TER\n')
275 next_atom_num += 1
276 next_resid += 1
278 pdb_lines.append('END')
280 with open(self.pdb, 'w') as f:
281 f.write(''.join(pdb_lines))
283 def assemble_system(self) -> None:
284 """
285 Slightly modified from the parent class, now we have to add
286 the ligand parameters and assemble a complex rather than just
287 placing a biomolecule in the water box.
288 """
289 tleap_complex = [f'source {ff}' for ff in self.ffs]
290 structs = [f'PROT = loadpdb {self.pdb}']
291 combine = 'COMPLEX = combine{PROT'
292 for i, lig in enumerate(self.ligs):
293 ligand = self.build_dir / lig
294 tleap_complex += [f'loadamberparams {ligand}.frcmod',
295 f'loadoff {ligand}.lib']
296 structs += [f'LG{i} = loadmol2 {ligand}.mol2']
297 combine += f' LG{i}'
299 combine += '}'
300 tleap_complex += structs
301 tleap_complex.append(combine)
302 tleap_complex += [
303 'set default PBRadii mbondi3',
304 f'savepdb COMPLEX {self.out}/system.pdb',
305 f'saveamberparm COMPLEX {self.out}/system.prmtop {self.out}/system.inpcrd',
306 'quit'
307 ]
309 tleap_complex = '\n'.join(tleap_complex)
310 leap_file = self.build_dir / 'tleap.in'
311 with open(str(leap_file), 'w') as outfile:
312 outfile.write(tleap_complex)
314 tleap = f'tleap -f {leap_file}'
315 os.system(tleap)
317 def prep_protein(self) -> None:
318 raw_pdb = self.path / self.pdb
319 prep_pdb = self.build_dir / 'prepped.pdb'
320 self.pdb = self.build_dir / 'protein.pdb'
322 # complex workflow for modeling missing residues
323 self.triage_pdb(raw_pdb, prep_pdb)
325 # remove hydrogens (-y) and waters (-d) from the input PDB
326 pdb4amber = f'pdb4amber -i {prep_pdb} -o {self.pdb} -y -d'
327 os.system(pdb4amber)
329 def triage_pdb(self, broken_pdb: PathLike,
330 repaired_pdb: PathLike) -> str:
331 """
332 Runs PDBFixer to repair missing loops and ensure structure is
333 in good shape. Runs a check against the sequence provided by
334 PLINDER and ensures that any non-canonical residues are represented
335 in the sequence properly.
336 """
337 fixer = PDBFixer(filename=str(broken_pdb))
338 chains = [chain for chain in fixer.topology.chains()]
339 chain_map = {chain.id: [res for res in chain.residues()]
340 for chain in chains}
342 # non-databank models do not have SEQRES and therefore no
343 # sequence data to model missing residues
344 fixer.sequences = self.inject_fasta(chain_map)
345 fixer.findMissingResidues()
346 fixer.findMissingAtoms()
347 fixer.addMissingAtoms()
349 with open(str(repaired_pdb), 'w') as f:
350 PDBFile.writeFile(fixer.topology, fixer.positions, f)
352 def inject_fasta(self,
353 chain_map: Dict[str, List[str]]) -> Sequences:
354 """
355 Checks fasta against actual sequence. Modifies sequence so that
356 it correctly matches in the case of non-canonical residues such
357 as phosphorylations (i.e. SER -> SEP).
358 """
359 fasta = open(self.fasta).readlines()
360 remapping = json.load(open(f'{self.path}/chain_mapping.json', 'rb'))
361 sequences = []
362 for i in range(len(fasta) // 2):
363 seq_chain = fasta[2*i].strip()[1:] # strip off > and \n
364 chain = remapping[seq_chain]
365 one_letter_seq = fasta[2*i+1].strip()
366 try:
367 three_letter_seq = [convert_aa_code(aa) for aa in one_letter_seq]
368 except ValueError:
369 print(f'\nUnknown residue in fasta!\n\n{self.pdb}')
370 raise LigandError
372 try:
373 three_letter_seq = self.check_ptms(three_letter_seq,
374 chain_map[chain])
375 sequences.append(
376 Sequence(chainId=chain,
377 residues=three_letter_seq)
378 )
379 except KeyError: # not sure what to do if this fails
380 print(f'\nUnknown ligand error!\n\n{self.pdb}')
381 raise LigandError
383 return sequences
385 def check_ptms(self,
386 sequence: List[str],
387 chain_residues: List[str]) -> List[str]:
388 """
389 Check the full sequence (from fasta) against the potentially partial
390 sequence from the structural model stored in `chain_residues`.
391 """
392 for residue in chain_residues:
393 resID = int(residue.id) - 1 # since 0-indexed in list
395 try:
396 if sequence[resID] != residue.name:
397 sequence[resID] = residue.name
398 except IndexError:
399 print(f'Sequence length is messed up!\n\n{self.pdb}')
400 raise LigandError
402 return sequence
404 def check_ligand(self, ligand: PathLike) -> bool:
405 """
406 Check ligand for ions and other weird stuff. We need to take care not
407 to assume all species containing formal charges are ions, nor that all
408 species containing atoms in the cation/anion lists are ions. Good example
409 is the multitude of small molecule drugs containing bonded halogens.
410 """
411 ion = False
412 mol = Chem.SDMolSupplier(str(ligand))[0]
414 ligand = []
415 for atom, position in zip(mol.GetAtoms(), mol.GetConformer().GetPositions()):
416 symbol = atom.GetSymbol()
417 if symbol.lower() in self.cation_list + self.anion_list:
418 charge = atom.GetFormalCharge()
419 if charge != 0:
420 ion = True
421 sign = '+' if charge > 0 else '-'
422 if abs(charge) > 1:
423 sign = f'{charge}{sign}'
425 ligand.append([symbol, sign] + [x for x in position])
427 if ion:
428 try:
429 self.ions.append(ligand)
430 except AttributeError:
431 self.ions = [ligand]
432 return False
434 return True
436 @property
437 def cation_list(self) -> List[str]:
438 return [
439 'na', 'k', 'ca', 'mn', 'mg', 'li', 'rb', 'cs', 'cu',
440 'ag', 'au', 'ti', 'be', 'sr', 'ba', 'ra', 'v', 'cr',
441 'fe', 'co', 'zn', 'ni', 'pd', 'cd', 'sn', 'pt', 'hg',
442 'pb', 'al'
443 ]
445 @property
446 def anion_list(self) -> List[str]:
447 return [
448 'cl', 'br', 'i', 'f'
449 ]
452class ComplexBuilder(ExplicitSolvent):
453 """
454 Builds complexes consisting of a biomolecule pdb and small molecule ligand.
455 Runs antechamber workflow to generate gaff2 parameters. Can optionally
456 supply precomputed frcmod/lib files by their path + suffix in the
457 lig_param_prefix argument (e.g. /path/to/lig.mol2 or /path/to/lig)
458 """
459 def __init__(self, path: str, pdb: str, lig: str | list[str], padding: float=10.,
460 lig_param_prefix: str | None = None, **kwargs):
461 super().__init__(path, pdb, padding)
462 self.lig = Path(lig) if isinstance(lig, str) else [Path(l) for l in lig]
463 self.ffs.append('leaprc.gaff2')
464 self.build_dir = self.out.parent / 'build'
466 if lig_param_prefix is None:
467 self.lig_param_prefix = lig_param_prefix
468 else:
469 prefix = Path(lig_param_prefix)
470 self.lig_param_prefix = prefix.parent / prefix.stem
472 for key, value in kwargs.items():
473 setattr(self, key, value)
475 def build(self) -> None:
476 self.build_dir.mkdir(exist_ok=True, parents=True)
477 os.chdir(self.build_dir) # necessary for antechamber outputs
479 if self.lig_param_prefix is None:
480 if isinstance(self.lig, list):
481 lig_paths = []
482 for i, lig in enumerate(self.lig):
483 lig_paths += self.process_ligand(lig, i)
485 self.lig = lig_paths
487 else:
488 self.lig = self.process_ligand(self.lig)
489 else:
490 self.lig = self.lig_param_prefix
492 if hasattr(self, ion):
493 self.add_ion_to_pdb()
495 self.prep_pdb()
496 self.assemble_system()
498 def process_ligand(self, lig: PathLike, prefix: int | None = None) -> PathLike:
499 if lig.parent != self.build_dir:
500 shutil.copy(lig, self.build_dir)
502 if prefix is None:
503 prefix = ''
505 lig_builder = LigandBuilder(self.build_dir, lig, file_prefix=prefix)
506 lig_builder.parameterize_ligand()
508 return lig_builder.lig
510 def add_ion_to_pdb(self) -> None:
511 ion = [line for line in open(self.ion).readlines()
512 if any(['ATOM' in line, 'HETATM' in line])]
513 pdb = [line for line in open(self.pdb).readlines()]
515 out_pdb = []
516 for line in pdb:
517 if 'END' in line:
518 out_pdb.append(ion)
519 out_pdb.append(line)
520 else:
521 out_pdb.append(line)
523 with open(self.pdb, 'w') as f:
524 f.write('\n'.join(out_pdb))
526 def assemble_system(self, dim, num_ions) -> None:
527 """
528 Slightly modified from the parent class, now we have to add
529 the ligand parameters and assemble a complex rather than just
530 placing a biomolecule in the water box.
531 """
532 tleap_ffs = '\n'.join([f'source {ff}' for ff in self.ffs])
533 tleap_complex = [
534 tleap_ffs,
535 'source leaprc.gaff2',
536 ]
538 if not isinstance(self.lig, list):
539 self.lig = [self.lig]
541 LABELS = []
542 for i, lig in enumerate(self.lig):
543 tleap_complex += [
544 f'loadamberparams {lig}.frcmod',
545 f'loadoff {lig}.lib',
546 f'LG{i} = loadmol2 {lig}.mol2',
547 ]
549 LABELS.append(f'LG{i}')
551 LABELS.append(f'PROT')
552 LABELS = ' '.join(LABELS)
554 tleap_complex = [
555 f'PROT = loadpdb {self.pdb}',
556 f'COMPLEX = combine {{LABELS}}',
557 'setbox COMPLEX centers',
558 f'set COMPLEX box {{{dim} {dim} {dim}}}',
559 f'solvatebox COMPLEX {self.water_box} {{0 0 0}}',
560 'addions COMPLEX Na+ 0',
561 'addions COMPLEX Cl- 0',
562 f'addIonsRand COMPLEX Na+ {num_ions} Cl- {num_ions}',
563 f'savepdb COMPLEX {self.out}.pdb',
564 f'saveamberparm COMPLEX {self.out}.prmtop {self.out}.inpcrd'
565 ]
567 leap_file = self.write_leap('\n'.join(tleap_complex))
568 tleap = f'tleap -f {leap_file}'
569 os.system(tleap)