Coverage for src/molecular_simulations/analysis/utils.py: 100%

50 statements  

« prev     ^ index     » next       coverage.py v7.13.0, created at 2025-12-13 01:26 -0600

1import MDAnalysis as mda 

2import numpy as np 

3from pathlib import Path 

4import shutil 

5from typing import Callable, Union 

6 

# Anything accepted as an optional filesystem location: a pathlib.Path, a
# plain string, or None when no location was supplied.
OptPath = Union[None, str, Path]

8 

class EmbedData:
    """
    Embeds given data into the beta-factor column of a PDB. Writes out to the
    same path as the input PDB unless an output path is explicitly provided.
    Embedding data should be provided as a dictionary where the keys are
    MDAnalysis selection strings and the values are numpy arrays holding one
    entry per residue of the selection.

    Arguments:
        pdb (Path): Path to PDB file to load. Also will be the output if one is
            not provided. A non-Path value is converted with ``Path(pdb)``.
        embedding_dict (dict[str, np.ndarray]): A dictionary containing MDAnalysis
            selections as keys and data as the values.
        out (OptPath): Defaults to None. If not None this will be the path to the
            output PDB. Both ``str`` and ``Path`` are accepted.
    """
    def __init__(self,
                 pdb: Path,
                 embedding_dict: dict[str, np.ndarray],
                 out: OptPath=None):
        self.pdb = pdb if isinstance(pdb, Path) else Path(pdb)
        self.embeddings = embedding_dict
        # ``out`` may legitimately be a plain string (see OptPath); normalise
        # it so that write_new_pdb can rely on Path methods such as exists().
        self.out = Path(out) if out is not None else self.pdb

        self.u = mda.Universe(str(self.pdb))

    def embed(self) -> None:
        """
        Unpacks embedding dictionary, embeds data and writes out new PDB.

        Returns:
            None
        """
        for sel, data in self.embeddings.items():
            self.embed_selection(sel, data)

        self.write_new_pdb()

    def embed_selection(self,
                        selection: str,
                        data: np.ndarray) -> None:
        """
        Embeds data into given selection in the beta column for each residue.
        Every atom of a residue receives that residue's value.

        Arguments:
            selection (str): MDAnalysis selection string.
            data (np.ndarray): Array of data to place in beta column. Shape should
                be (n_residues_in_selection, 1).

        Returns:
            None
        """
        sel = self.u.select_atoms(selection)

        # zip stops at the shorter of the two sequences: surplus residues keep
        # their existing beta factors and surplus data entries are ignored.
        for residue, datum in zip(sel.residues, data):
            atoms = residue.atoms
            atoms.tempfactors = np.full(atoms.tempfactors.shape, datum)

    def write_new_pdb(self) -> None:
        """
        Writes out PDB file. If the output path already exists (which is always
        the case when no separate output was designated, because the input file
        is then overwritten), the input PDB is first copied to a backup with the
        extension .orig.pdb. If this backup already exists, do not back up the
        PDB again as that may occur if you run this twice and to do so would
        mean losing the actual original PDB.

        Returns:
            None
        """
        backup = self.pdb.with_suffix('.orig.pdb')
        if self.out.exists() and not backup.exists():
            shutil.copyfile(str(self.pdb), str(backup))

        with mda.Writer(str(self.out)) as W:
            W.write(self.u.atoms)

82 

83 

class EmbedEnergyData(EmbedData):
    """
    Special instance of EmbedData in which the data stored in embedding_dict is
    non-bonded energy data with both LJ and coulombic terms. In this case we need
    to obtain the total energy by summing these and rescale it as many softwares
    do not understand a negative beta factor.

    Arguments:
        pdb (Path): Path to PDB file to load. Also will be the output if one is
            not provided.
        embedding_dict (dict[str, np.ndarray]): A dictionary containing MDAnalysis
            selections as keys and data as the values.
        out (OptPath): Defaults to None. If not None this will be the path to the
            output PDB.
    """
    def __init__(self,
                 pdb: Path,
                 embedding_dict: dict[str, np.ndarray],
                 out: OptPath=None):
        super().__init__(pdb, embedding_dict, out)
        # Replace the raw energy arrays with their rescaled per-residue totals
        # so the inherited embed()/embed_selection() can consume them directly.
        self.embeddings = self.preprocess()

    def preprocess(self) -> dict[str, np.ndarray]:
        """
        Processes embeddings data so that it can be fed through parent methods.
        Each entry is reduced to per-residue totals with ``sanitize_data`` and
        then divided by the global minimum over all selections, so that the
        most negative (most favourable) energy maps to 1. Results are clamped
        to the range [0, 1] so that no negative beta factor is ever produced.

        Returns:
            (dict[str, np.ndarray]): Processed data arrays keyed by selection.
                An empty dictionary is returned for empty input.
        """
        # Reduce every entry exactly once; the result is reused both for the
        # global minimum and for the rescaling itself.
        sanitized = {sel: self.sanitize_data(data)
                     for sel, data in self.embeddings.items()}
        if not sanitized:
            return {}

        # ravel() lets single-column (n, 1) and flat (n,) entries be combined.
        flat = np.concatenate([values.ravel() for values in sanitized.values()])
        if flat.size == 0:
            # Nothing to normalise against; hand the (empty) arrays back.
            return sanitized

        rescaling_factor = np.min(flat)
        return {sel: self._rescale(values, rescaling_factor)
                for sel, values in sanitized.items()}

    @staticmethod
    def _rescale(values: np.ndarray, factor: float) -> np.ndarray:
        """Divide ``values`` by ``factor`` and clamp the result to [0, 1]."""
        if factor == 0:
            # A zero minimum means no value is negative. Dividing would give
            # NaN for the zeros and inf for the rest, so map them to 0 and 1
            # (the clamped value of inf) directly.
            return np.where(values > 0, 1., 0.)

        # With a negative factor, positive energies yield negative ratios;
        # the lower clamp keeps them out of the beta column.
        return np.clip(values / factor, 0., 1.)

    @staticmethod
    def sanitize_data(data: np.ndarray,) -> np.ndarray:
        """
        Takes in data of shape (n_frames, n_residues, n_terms) or
        (n_residues, n_terms) and reduces it to one value per residue by
        first averaging over the first dimension (only when more than two
        dimensions are present) and then summing over the term dimension.

        Arguments:
            data (np.ndarray): Unprocessed input data.

        Returns:
            (np.ndarray): Processed data of shape (n_residues,). Input that
                has only a single term column is returned with shape
                (n_residues, 1), and one-dimensional input is returned
                unchanged.
        """
        data = np.asarray(data)

        if data.ndim > 2:
            data = np.mean(data, axis=0)

        # Guard on ndim so already-flat input does not fail on shape[1].
        if data.ndim > 1 and data.shape[1] > 1:
            data = np.sum(data, axis=1)

        return data