Coverage for src / molecular_simulations / build / build_ligand.py: 2%

298 statements  

« prev     ^ index     » next       coverage.py v7.13.0, created at 2025-12-12 10:07 -0600

1#!/usr/bin/env python 

2from .build_amber import ImplicitSolvent, ExplicitSolvent 

3import gc 

4from glob import glob 

5import json 

6from MDAnalysis.lib.util import convert_aa_code 

7from openbabel import pybel 

8from openmm import * 

9from openmm.app import * 

10import os 

11from pathlib import Path 

12from pdbfixer import PDBFixer 

13from pdbfixer.pdbfixer import Sequence 

14from rdkit import Chem 

15import shutil 

16from typing import Dict, List, Union 

17 

18PathLike = Union[str, Path] 

19OptPath = Union[str, Path, None] 

20Sequences = List[Sequence] 

21 

22class LigandError(Exception): 

23 """ 

24 Custom Ligand exception to catch parameterization errors. 

25 """ 

26 def __init__(self, message='This system contains ligands which we cannot model!'): 

27 self.message = message 

28 super().__init__(self.message) 

29 

30 

31class LigandBuilder: 

32 """ 

33 Parameterizes a ligand molecule and generates all relevant force field files for 

34 running tleap. 

35 """ 

36 def __init__(self, path: PathLike, lig: str, lig_number: int=0, file_prefix: str=''): 

37 self.path = path 

38 self.lig = path / lig 

39 self.ln = lig_number 

40 self.out_lig = path / f'{file_prefix}{lig.stem}' 

41 

42 def parameterize_ligand(self) -> None: 

43 """ 

44 Ensures consistent treatment of all ligand sdf files, generating 

45 GAFF2 parameters in the form of .frcmod and .lib files. Produces 

46 a mol2 file for coordinates and connectivity and ensures that 

47 antechamber did not fail. Hydrogens are added in rdkit which  

48 generally does a good job of this. 

49 

50 Returns: 

51 None 

52 """ 

53 ext = self.lig.suffix 

54 self.lig = self.lig.stem 

55 

56 convert_to_gaff = f'antechamber -i {self.lig}_prep.mol2 -fi mol2 -o \ 

57 {self.out_lig}.mol2 -fo mol2 -at gaff2 -c bcc -s 0 -pf y -rn LG{self.ln}' 

58 parmchk2 = f'parmchk2 -i {self.out_lig}.mol2 -f mol2 -o {self.out_lig}.frcmod' 

59 

60 tleap_ligand = f"""source leaprc.gaff2 

61 LG{self.ln} = loadmol2 {self.out_lig}.mol2 

62 loadamberparams {self.out_lig}.frcmod 

63 saveoff LG{self.ln} {self.out_lig}.lib 

64 quit 

65 """ 

66 

67 if ext == '.sdf': 

68 self.process_sdf() 

69 else: 

70 self.process_pdb() 

71 

72 self.convert_to_mol2() 

73 os.system(convert_to_gaff) 

74 try: 

75 self.move_antechamber_outputs() 

76 self.check_sqm() 

77 os.system(parmchk2) 

78 leap_file, leap_log = self.write_leap(tleap_ligand) 

79 os.system(f'tleap -f {leap_file} > {leap_log}') 

80 except FileNotFoundError: 

81 raise LigandError(f'Antechamber failed! {self.lig}') 

82 

83 def process_sdf(self) -> None: 

84 """ 

85 Add hydrogens in rdkit. Atom hybridization is taken from the 

86 input sdf file and if this is incorrect, hydrogens will be wrong 

87 too. 

88 

89 Returns: 

90 None 

91 """ 

92 mol = Chem.SDMolSupplier(f'{self.lig}.sdf')[0] 

93 molH = Chem.AddHs(mol, addCoords=True) 

94 with Chem.SDWriter(f'{self.lig}_H.sdf') as w: 

95 w.write(molH) 

96 

97 def process_pdb(self) -> None: 

98 """ 

99 Ingests a PDB file of a small molecule, adds hydrogens and writes out to 

100 an SDF file. 

101 

102 Returns: 

103 None 

104 """ 

105 mol = Chem.MolFromPDBFile(f'{self.lig}.pdb') 

106 molH = Chem.AddHs(mol, addCoords=True) 

107 with Chem.SDWriter(f'{self.lig}_H.sdf') as w: 

108 w.write(molH) 

109 

110 def convert_to_mol2(self) -> None: 

111 """ 

112 Converts an sdf file to mol2 format using obabel. 

113 

114 Returns: 

115 None 

116 """ 

117 mol = list(pybel.readfile('sdf', f'{self.lig}_H.sdf'))[0] 

118 mol.write('mol2', f'{self.lig}_prep.mol2', True) 

119 

120 def move_antechamber_outputs(self) -> None: 

121 """ 

122 Remove unneccessary outputs from antechamber. Keep the 

123 sqm.out file as proof that antechamber did not fail. 

124 

125 Returns: 

126 None 

127 """ 

128 os.remove('sqm.in') 

129 os.remove('sqm.pdb') 

130 shutil.move('sqm.out', f'{self.lig}_sqm.out') 

131 

132 def check_sqm(self) -> None: 

133 """ 

134 Checks for evidence that antechamber calculations exited 

135 successfully. This is always on the second to last line, 

136 and if not present, indicates that we failed to produce 

137 sane parameters for this molecule. In that case, I wish 

138 you good luck. 

139 

140 Returns: 

141 None 

142 """ 

143 line = open(f'{self.lig}_sqm.out').readlines()[-2] 

144 

145 if 'Calculation Completed' not in line: 

146 # make sqm.in more tolerant 

147 # manually run sqm 

148 # if it still fails then we raise an error 

149 raise LigandError(f'SQM failed for ligand {self.lig}!') 

150 

151 def write_leap(self, 

152 inp: str) -> str: 

153 """ 

154 Writes out a tleap input file and returns the path 

155 to the file. 

156 """ 

157 leap_file = f'{self.path}/tleap.in' 

158 leap_log = f'{self.path}/leap.log' 

159 with open(leap_file, 'w') as outfile: 

160 outfile.write(inp) 

161 

162 return leap_file, leap_log 

163 

164 

165class PLINDERBuilder(ImplicitSolvent): 

166 """ 

167 Builds complexes consisting of a biomolecule pdb and small molecule ligand. 

168 Runs antechamber workflow to generate gaff2 parameters. 

169 """ 

170 def __init__(self, 

171 path: PathLike, 

172 system_id: str, 

173 out: PathLike, 

174 **kwargs): 

175 super().__init__(path / system_id, 'receptor.pdb', out / system_id, 

176 protein=True, rna=True, dna=True, phos_protein=True, 

177 mod_protein=True, **kwargs) 

178 self.system_id = system_id 

179 self.ffs.append('leaprc.gaff2') 

180 self.build_dir = self.out / 'build' 

181 self.ions = None 

182 

183 def build(self) -> None: 

184 ligs = self.migrate_files() 

185 

186 if not ligs: 

187 print(f'No ligands!\n\n{self.pdb}') 

188 raise LigandError 

189 

190 self.ligs = self.ligand_handler(ligs) 

191 self.assemble_system() 

192 

193 def ligand_handler(self, ligs: List[PathLike]) -> List[PathLike]: 

194 ligands = [] 

195 for i, lig in enumerate(ligs): 

196 lig_builder = LigandBuilder(self.build_dir, lig, i) 

197 lig_builder.parameterize_ligand() 

198 ligands.append(os.path.basename(lig)[:-4]) 

199 

200 return ligands 

201 

202 def migrate_files(self) -> List[str]: 

203 os.makedirs(str(self.build_dir), exist_ok=True) 

204 os.chdir(self.build_dir) # necessary for antechamber outputs 

205 

206 # grab the sequence file to complete protein modeling 

207 shutil.copy(str(self.path / 'sequences.fasta'), 

208 str(self.build_dir)) 

209 self.fasta = str(self.build_dir / 'sequences.fasta') 

210 

211 # fix and move pdb 

212 self.prep_protein() 

213 

214 # move ligand(s) 

215 ligands = [] 

216 lig_files = self.path / 'ligand_files' 

217 ligs = [Path(lig) for lig in glob(str(lig_files) + '/*.sdf')] 

218 for lig in ligs: 

219 shutil.copy(str(lig), 

220 str(self.build_dir)) 

221 

222 if self.check_ligand(lig): 

223 ligands.append(lig.name) 

224 

225 

226 # handle any potential ions 

227 if self.ions is not None: 

228 self.ffs.append('leaprc.water.tip3p') 

229 self.place_ions() 

230 

231 return ligands 

232 

233 def place_ions(self) -> None: 

234 """ 

235 This is horrible and I apologize profusely if you find yourself 

236 having to go through the following. Good luck. 

237 """ 

238 pdb_lines = open(self.pdb).readlines()[:-1] 

239 

240 if 'END' in pdb_lines[-1]: 

241 if 'TER' in pdb_lines[-2]: 

242 ln = -3 

243 else: 

244 ln = -2 

245 elif 'TER' in pdb_lines[-1]: 

246 ln = -2 

247 else: 

248 ln = -1 

249 

250 try: 

251 next_atom_num = int(pdb_lines[ln][6:12].strip()) + 1 

252 next_resid = int(pdb_lines[ln][22:26].strip()) + 1 

253 except ValueError: 

254 print(f'ERROR: {self.pdb}') 

255 raise LigandError 

256 

257 for ion in self.ions: 

258 for atom in ion: 

259 # HERE BE DRAGONS 

260 ion_line = f'ATOM {next_atom_num:>5}' 

261 

262 if atom[0].lower() in ['na', 'k', 'cl']: 

263 ionname = atom[0] + atom[1] 

264 ion_line += f'{ionname:>4} {ionname:<3} ' 

265 else: 

266 ionname = atom[0].upper() 

267 ion_line += f'{ionname:>3} {ionname:<3}' 

268 

269 coords = ''.join([f'{x:>8.3f}' for x in atom[2:]]) 

270 ion_line += f'{next_resid:>5} {coords} 0.00 0.00\n' 

271 

272 pdb_lines.append(ion_line) 

273 pdb_lines.append('TER\n') 

274 

275 next_atom_num += 1 

276 next_resid += 1 

277 

278 pdb_lines.append('END') 

279 

280 with open(self.pdb, 'w') as f: 

281 f.write(''.join(pdb_lines)) 

282 

283 def assemble_system(self) -> None: 

284 """ 

285 Slightly modified from the parent class, now we have to add 

286 the ligand parameters and assemble a complex rather than just 

287 placing a biomolecule in the water box. 

288 """ 

289 tleap_complex = [f'source {ff}' for ff in self.ffs] 

290 structs = [f'PROT = loadpdb {self.pdb}'] 

291 combine = 'COMPLEX = combine{PROT' 

292 for i, lig in enumerate(self.ligs): 

293 ligand = self.build_dir / lig 

294 tleap_complex += [f'loadamberparams {ligand}.frcmod', 

295 f'loadoff {ligand}.lib'] 

296 structs += [f'LG{i} = loadmol2 {ligand}.mol2'] 

297 combine += f' LG{i}' 

298 

299 combine += '}' 

300 tleap_complex += structs 

301 tleap_complex.append(combine) 

302 tleap_complex += [ 

303 'set default PBRadii mbondi3', 

304 f'savepdb COMPLEX {self.out}/system.pdb', 

305 f'saveamberparm COMPLEX {self.out}/system.prmtop {self.out}/system.inpcrd', 

306 'quit' 

307 ] 

308 

309 tleap_complex = '\n'.join(tleap_complex) 

310 leap_file = self.build_dir / 'tleap.in' 

311 with open(str(leap_file), 'w') as outfile: 

312 outfile.write(tleap_complex) 

313 

314 tleap = f'tleap -f {leap_file}' 

315 os.system(tleap) 

316 

317 def prep_protein(self) -> None: 

318 raw_pdb = self.path / self.pdb 

319 prep_pdb = self.build_dir / 'prepped.pdb' 

320 self.pdb = self.build_dir / 'protein.pdb' 

321 

322 # complex workflow for modeling missing residues 

323 self.triage_pdb(raw_pdb, prep_pdb) 

324 

325 # remove hydrogens (-y) and waters (-d) from the input PDB 

326 pdb4amber = f'pdb4amber -i {prep_pdb} -o {self.pdb} -y -d' 

327 os.system(pdb4amber) 

328 

329 def triage_pdb(self, broken_pdb: PathLike, 

330 repaired_pdb: PathLike) -> str: 

331 """ 

332 Runs PDBFixer to repair missing loops and ensure structure is 

333 in good shape. Runs a check against the sequence provided by 

334 PLINDER and ensures that any non-canonical residues are represented 

335 in the sequence properly. 

336 """ 

337 fixer = PDBFixer(filename=str(broken_pdb)) 

338 chains = [chain for chain in fixer.topology.chains()] 

339 chain_map = {chain.id: [res for res in chain.residues()] 

340 for chain in chains} 

341 

342 # non-databank models do not have SEQRES and therefore no 

343 # sequence data to model missing residues 

344 fixer.sequences = self.inject_fasta(chain_map) 

345 fixer.findMissingResidues() 

346 fixer.findMissingAtoms() 

347 fixer.addMissingAtoms() 

348 

349 with open(str(repaired_pdb), 'w') as f: 

350 PDBFile.writeFile(fixer.topology, fixer.positions, f) 

351 

352 def inject_fasta(self, 

353 chain_map: Dict[str, List[str]]) -> Sequences: 

354 """ 

355 Checks fasta against actual sequence. Modifies sequence so that  

356 it correctly matches in the case of non-canonical residues such 

357 as phosphorylations (i.e. SER -> SEP). 

358 """ 

359 fasta = open(self.fasta).readlines() 

360 remapping = json.load(open(f'{self.path}/chain_mapping.json', 'rb')) 

361 sequences = [] 

362 for i in range(len(fasta) // 2): 

363 seq_chain = fasta[2*i].strip()[1:] # strip off > and \n 

364 chain = remapping[seq_chain] 

365 one_letter_seq = fasta[2*i+1].strip() 

366 try: 

367 three_letter_seq = [convert_aa_code(aa) for aa in one_letter_seq] 

368 except ValueError: 

369 print(f'\nUnknown residue in fasta!\n\n{self.pdb}') 

370 raise LigandError 

371 

372 try: 

373 three_letter_seq = self.check_ptms(three_letter_seq, 

374 chain_map[chain]) 

375 sequences.append( 

376 Sequence(chainId=chain, 

377 residues=three_letter_seq) 

378 ) 

379 except KeyError: # not sure what to do if this fails 

380 print(f'\nUnknown ligand error!\n\n{self.pdb}') 

381 raise LigandError 

382 

383 return sequences 

384 

385 def check_ptms(self, 

386 sequence: List[str], 

387 chain_residues: List[str]) -> List[str]: 

388 """ 

389 Check the full sequence (from fasta) against the potentially partial 

390 sequence from the structural model stored in `chain_residues`. 

391 """ 

392 for residue in chain_residues: 

393 resID = int(residue.id) - 1 # since 0-indexed in list 

394 

395 try: 

396 if sequence[resID] != residue.name: 

397 sequence[resID] = residue.name 

398 except IndexError: 

399 print(f'Sequence length is messed up!\n\n{self.pdb}') 

400 raise LigandError 

401 

402 return sequence 

403 

404 def check_ligand(self, ligand: PathLike) -> bool: 

405 """ 

406 Check ligand for ions and other weird stuff. We need to take care not 

407 to assume all species containing formal charges are ions, nor that all 

408 species containing atoms in the cation/anion lists are ions. Good example 

409 is the multitude of small molecule drugs containing bonded halogens. 

410 """ 

411 ion = False 

412 mol = Chem.SDMolSupplier(str(ligand))[0] 

413 

414 ligand = [] 

415 for atom, position in zip(mol.GetAtoms(), mol.GetConformer().GetPositions()): 

416 symbol = atom.GetSymbol() 

417 if symbol.lower() in self.cation_list + self.anion_list: 

418 charge = atom.GetFormalCharge() 

419 if charge != 0: 

420 ion = True 

421 sign = '+' if charge > 0 else '-' 

422 if abs(charge) > 1: 

423 sign = f'{charge}{sign}' 

424 

425 ligand.append([symbol, sign] + [x for x in position]) 

426 

427 if ion: 

428 try: 

429 self.ions.append(ligand) 

430 except AttributeError: 

431 self.ions = [ligand] 

432 return False 

433 

434 return True 

435 

436 @property 

437 def cation_list(self) -> List[str]: 

438 return [ 

439 'na', 'k', 'ca', 'mn', 'mg', 'li', 'rb', 'cs', 'cu', 

440 'ag', 'au', 'ti', 'be', 'sr', 'ba', 'ra', 'v', 'cr', 

441 'fe', 'co', 'zn', 'ni', 'pd', 'cd', 'sn', 'pt', 'hg', 

442 'pb', 'al' 

443 ] 

444 

445 @property 

446 def anion_list(self) -> List[str]: 

447 return [ 

448 'cl', 'br', 'i', 'f' 

449 ] 

450 

451 

452class ComplexBuilder(ExplicitSolvent): 

453 """ 

454 Builds complexes consisting of a biomolecule pdb and small molecule ligand. 

455 Runs antechamber workflow to generate gaff2 parameters. Can optionally 

456 supply precomputed frcmod/lib files by their path + suffix in the  

457 lig_param_prefix argument (e.g. /path/to/lig.mol2 or /path/to/lig) 

458 """ 

459 def __init__(self, path: str, pdb: str, lig: str | list[str], padding: float=10., 

460 lig_param_prefix: str | None = None, **kwargs): 

461 super().__init__(path, pdb, padding) 

462 self.lig = Path(lig) if isinstance(lig, str) else [Path(l) for l in lig] 

463 self.ffs.append('leaprc.gaff2') 

464 self.build_dir = self.out.parent / 'build' 

465 

466 if lig_param_prefix is None: 

467 self.lig_param_prefix = lig_param_prefix 

468 else: 

469 prefix = Path(lig_param_prefix) 

470 self.lig_param_prefix = prefix.parent / prefix.stem 

471 

472 for key, value in kwargs.items(): 

473 setattr(self, key, value) 

474 

475 def build(self) -> None: 

476 self.build_dir.mkdir(exist_ok=True, parents=True) 

477 os.chdir(self.build_dir) # necessary for antechamber outputs 

478 

479 if self.lig_param_prefix is None: 

480 if isinstance(self.lig, list): 

481 lig_paths = [] 

482 for i, lig in enumerate(self.lig): 

483 lig_paths += self.process_ligand(lig, i) 

484 

485 self.lig = lig_paths 

486 

487 else: 

488 self.lig = self.process_ligand(self.lig) 

489 else: 

490 self.lig = self.lig_param_prefix 

491 

492 if hasattr(self, ion): 

493 self.add_ion_to_pdb() 

494 

495 self.prep_pdb() 

496 self.assemble_system() 

497 

498 def process_ligand(self, lig: PathLike, prefix: int | None = None) -> PathLike: 

499 if lig.parent != self.build_dir: 

500 shutil.copy(lig, self.build_dir) 

501 

502 if prefix is None: 

503 prefix = '' 

504 

505 lig_builder = LigandBuilder(self.build_dir, lig, file_prefix=prefix) 

506 lig_builder.parameterize_ligand() 

507 

508 return lig_builder.lig 

509 

510 def add_ion_to_pdb(self) -> None: 

511 ion = [line for line in open(self.ion).readlines() 

512 if any(['ATOM' in line, 'HETATM' in line])] 

513 pdb = [line for line in open(self.pdb).readlines()] 

514 

515 out_pdb = [] 

516 for line in pdb: 

517 if 'END' in line: 

518 out_pdb.append(ion) 

519 out_pdb.append(line) 

520 else: 

521 out_pdb.append(line) 

522 

523 with open(self.pdb, 'w') as f: 

524 f.write('\n'.join(out_pdb)) 

525 

526 def assemble_system(self, dim, num_ions) -> None: 

527 """ 

528 Slightly modified from the parent class, now we have to add 

529 the ligand parameters and assemble a complex rather than just 

530 placing a biomolecule in the water box. 

531 """ 

532 tleap_ffs = '\n'.join([f'source {ff}' for ff in self.ffs]) 

533 tleap_complex = [ 

534 tleap_ffs, 

535 'source leaprc.gaff2', 

536 ] 

537 

538 if not isinstance(self.lig, list): 

539 self.lig = [self.lig] 

540 

541 LABELS = [] 

542 for i, lig in enumerate(self.lig): 

543 tleap_complex += [ 

544 f'loadamberparams {lig}.frcmod', 

545 f'loadoff {lig}.lib', 

546 f'LG{i} = loadmol2 {lig}.mol2', 

547 ] 

548 

549 LABELS.append(f'LG{i}') 

550 

551 LABELS.append(f'PROT') 

552 LABELS = ' '.join(LABELS) 

553 

554 tleap_complex = [ 

555 f'PROT = loadpdb {self.pdb}', 

556 f'COMPLEX = combine {{LABELS}}', 

557 'setbox COMPLEX centers', 

558 f'set COMPLEX box {{{dim} {dim} {dim}}}', 

559 f'solvatebox COMPLEX {self.water_box} {{0 0 0}}', 

560 'addions COMPLEX Na+ 0', 

561 'addions COMPLEX Cl- 0', 

562 f'addIonsRand COMPLEX Na+ {num_ions} Cl- {num_ions}', 

563 f'savepdb COMPLEX {self.out}.pdb', 

564 f'saveamberparm COMPLEX {self.out}.prmtop {self.out}.inpcrd' 

565 ] 

566 

567 leap_file = self.write_leap('\n'.join(tleap_complex)) 

568 tleap = f'tleap -f {leap_file}' 

569 os.system(tleap) 

570