Analysis Workflows¶
This tutorial demonstrates comprehensive analysis pipelines for characterizing protein-protein or protein-ligand binding interfaces.
Complete Interface Analysis¶
Combine multiple analysis tools to fully characterize a binding interface:
from pathlib import Path
from molecular_simulations.analysis import (
Fingerprinter,
PPInteractions,
SASA,
RelativeSASA,
)
import MDAnalysis as mda
# Paths
top = Path("system.prmtop")
traj = Path("prod.dcd")
out = Path("analysis_results")
out.mkdir(exist_ok=True)
# Load universe for MDAnalysis-based analyses
u = mda.Universe(str(top), str(traj))
Step 1: Interaction Energy Fingerprinting¶
Identify residues with strong energetic contributions:
fp = Fingerprinter(
topology=str(top),
trajectory=str(traj),
target_selection="segid A",
binder_selection="segid B",
)
fp.run()
fp.save(out / "fingerprint.npz")
# Analyze results
import numpy as np
data = np.load(out / "fingerprint.npz")
# Find residues with strongest interactions
total_energy = data["electrostatic"] + data["vdw"]
mean_energy = total_energy.mean(axis=0) # Average over frames
hotspots = np.argsort(mean_energy)[:10] # Top 10 contributors
print("Top interaction hotspots:")
for i in hotspots:
print(f" Residue {data['residues'][i]}: {mean_energy[i]:.2f} kcal/mol")
Step 2: Contact Analysis¶
Characterize specific interaction types:
ppi = PPInteractions(
top=str(top),
traj=str(traj),
out=str(out),
sel1="segid A",
sel2="segid B",
hbond_cutoff=3.5,
sb_cutoff=6.0,
hydrophobic_cutoff=8.0,
)
ppi.run()
# Results saved as JSON with:
# - Hydrogen bond frequencies
# - Salt bridge occupancies
# - Hydrophobic contact probabilities
Step 3: Burial Analysis¶
Calculate interface burial upon complex formation:
# SASA of the complex
complex_sasa = SASA(u, selection="protein")
complex_sasa.run()
# For interface residues, compare to isolated chains
chain_a = u.select_atoms("segid A")
chain_b = u.select_atoms("segid B")
# Buried surface area = SASA(A) + SASA(B) - SASA(complex)
Step 4: Relative Solvent Accessibility¶
Identify buried vs exposed residues:
rsasa = RelativeSASA(u, selection="segid A")
rsasa.run()
# Residues with relative SASA < 0.25 are typically buried
buried = rsasa.results < 0.25
Batch Processing Multiple Systems¶
Process multiple trajectories efficiently:
from natsort import natsorted
from tqdm import tqdm
from concurrent.futures import ProcessPoolExecutor
def analyze_replica(replica_dir):
"""Analyze a single replica."""
top = replica_dir / "system.prmtop"
traj = replica_dir / "prod.dcd"
out = replica_dir / "analysis"
out.mkdir(exist_ok=True)
fp = Fingerprinter(
topology=str(top),
trajectory=str(traj),
target_selection="segid A",
)
fp.run()
fp.save(out / "fingerprint.npz")
return replica_dir
# Process replicas in parallel
replica_dirs = natsorted(Path("./").glob("replica_*"))
with ProcessPoolExecutor(max_workers=4) as executor:
results = list(tqdm(
executor.map(analyze_replica, replica_dirs),
total=len(replica_dirs),
desc="Analyzing replicas"
))
Aggregating Results¶
Combine results from multiple replicas:
import numpy as np
from pathlib import Path
# Collect fingerprints from all replicas
fingerprints = []
for rep in Path("./").glob("replica_*/analysis/fingerprint.npz"):
data = np.load(rep)
fingerprints.append(data["electrostatic"] + data["vdw"])
# Stack and compute statistics
all_fp = np.concatenate(fingerprints, axis=0)
mean_fp = all_fp.mean(axis=0)
std_fp = all_fp.std(axis=0)
print(f"Analyzed {all_fp.shape[0]} total frames")
print(f"Mean binding energy: {mean_fp.sum():.2f} ± {std_fp.sum():.2f} kcal/mol")
Visualization¶
Create publication-quality figures:
import matplotlib.pyplot as plt
import seaborn as sns
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
# Energy fingerprint heatmap
ax = axes[0]
sns.heatmap(
all_fp.mean(axis=0).reshape(-1, 1),
cmap="RdBu_r",
center=0,
ax=ax,
cbar_kws={"label": "Interaction Energy (kcal/mol)"}
)
ax.set_title("Per-Residue Interaction Energies")
# Contact frequency bar plot
ax = axes[1]
# ... load and plot PPI results
plt.tight_layout()
plt.savefig("interface_analysis.png", dpi=300)
Interface Scoring for AlphaFold Models¶
For predicted structures with AlphaFold confidence metrics:
from molecular_simulations.analysis import ipSAE
scorer = ipSAE(
structure_file="alphafold_complex.pdb",
pae_file="predicted_aligned_error.json",
plddt_file="plddt_scores.json",
)
scores = scorer.run()
print(f"ipTM: {scores['ipTM']:.3f}")
print(f"ipSAE: {scores['ipSAE']:.3f}")
print(f"pDockQ: {scores['pDockQ']:.3f}")
print(f"pDockQ2: {scores['pDockQ2']:.3f}")
# Interpretation:
# pDockQ > 0.5: Likely correct interface
# pDockQ 0.23-0.5: Possible interface
# pDockQ < 0.23: Unlikely correct