Coverage for src/instawell/parser.py: 95%

42 statements  

« prev     ^ index     » next       coverage.py v7.11.0, created at 2025-10-28 21:17 -0500

"""
Robust parsing utilities for experimental condition strings.

This module provides functions to parse condition strings like:
    '500uM_Geranyl-Monophosphate_d104hFic-H363A_1mM-ATP-5mM-MgCl2'

into structured components (concentration, ligand, protein, buffer).
"""

9 

10import re 

11from typing import Dict, Optional, Tuple 

12from .data_models import UniqueCondition 

13 

14 

# Configuration for parsing.
# Separator placed between the components of a condition string.
CONDITION_DELIMITER = "_"
# Field names assigned, in order, to the trailing components of a condition string.
DEFAULT_FIELDS = ("concentration", "ligand", "protein", "buffer")

18 

19 

def parse_condition_string(
    condition_str: str,
    delimiter: str = CONDITION_DELIMITER,
    fields: Tuple[str, ...] = DEFAULT_FIELDS,
) -> Dict[str, str]:
    """
    Parse a condition string into component fields.

    The string is split on ``delimiter`` and the LAST ``len(fields)``
    components are assigned to ``fields`` in order. Any extra leading
    components are discarded, so a component that itself contains the
    delimiter is NOT reassembled; use a different character (such as '-')
    inside component names.

    Args:
        condition_str: The full condition string to parse
        delimiter: Character used to separate fields (default: '_')
        fields: Ordered tuple of field names to parse (default: ('concentration', 'ligand', 'protein', 'buffer'))
            The last len(fields) components will be extracted in this order.

    Returns:
        Dictionary mapping field names to their values

    Raises:
        ValueError: If the condition string is empty or whitespace-only, or
            if it has fewer components than there are fields. An empty
            ``delimiter`` also surfaces as the ValueError raised by
            ``str.split``.

    Examples:
        >>> # Default: parse concentration, ligand, protein, buffer from end
        >>> parse_condition_string("500uM_ATP_Protein1_Buffer1")
        {'concentration': '500uM', 'ligand': 'ATP', 'protein': 'Protein1', 'buffer': 'Buffer1'}

        >>> # Hyphens inside component names are left untouched
        >>> parse_condition_string("500uM_Geranyl-Monophosphate_d104hFic-H363A_1mM-ATP-5mM-MgCl2")
        {'concentration': '500uM', 'ligand': 'Geranyl-Monophosphate', 'protein': 'd104hFic-H363A', 'buffer': '1mM-ATP-5mM-MgCl2'}

        >>> # Custom field order
        >>> parse_condition_string("ATP_Protein1_500uM_Buffer1", fields=("ligand", "protein", "concentration", "buffer"))
        {'ligand': 'ATP', 'protein': 'Protein1', 'concentration': '500uM', 'buffer': 'Buffer1'}

        >>> # Parse fewer fields (last 2 components only)
        >>> parse_condition_string("500uM_ATP_Protein1_Buffer1", fields=("protein", "buffer"))
        {'protein': 'Protein1', 'buffer': 'Buffer1'}
    """
    # A falsy or whitespace-only string carries no components at all.
    if not condition_str or not condition_str.strip():
        raise ValueError("Condition string cannot be empty")

    parts = condition_str.split(delimiter)
    num_fields = len(fields)

    if len(parts) < num_fields:
        raise ValueError(
            f"Condition '{condition_str}' must have at least {num_fields} "
            f"'{delimiter}'-separated components for fields {fields}, "
            f"but only found {len(parts)} components: {parts}"
        )

    # Keep only the trailing components, one per requested field. An explicit
    # start index is used because parts[-0:] would return the whole list when
    # no fields are requested.
    field_values = parts[len(parts) - num_fields:]

    # Pair each field name with its positional value.
    return dict(zip(fields, field_values))

81 

82 

def condition_from_string(
    condition_str: str,
    delimiter: str = CONDITION_DELIMITER,
    fields: Tuple[str, ...] = DEFAULT_FIELDS,
    include_replicates: bool = False,
) -> UniqueCondition:
    """
    Parse a condition string and return a UniqueCondition object.

    Args:
        condition_str: The full condition string to parse
        delimiter: Character used to separate fields (default: '_')
        fields: Ordered tuple of field names to parse (default: ('concentration', 'ligand', 'protein', 'buffer'))
        include_replicates: Currently has no effect; an empty replicates list
            is always passed to UniqueCondition. Kept so existing callers
            that pass it keep working.

    Returns:
        UniqueCondition object with parsed fields; ``full_name`` is the
        unmodified ``condition_str``.

    Raises:
        ValueError: If the condition string is invalid or required fields are missing

    Examples:
        >>> condition = condition_from_string("500uM_ATP_Protein1_Buffer1")
        >>> condition.concentration
        '500uM'
        >>> condition.ligand_name
        'ATP'

        >>> # Custom field order
        >>> condition = condition_from_string("ATP_Protein1_500uM_Buffer1",
        ...                                   fields=("ligand", "protein", "concentration", "buffer"))
        >>> condition.ligand_name
        'ATP'
    """
    parsed = parse_condition_string(condition_str, delimiter, fields)

    # A custom ``fields`` tuple may omit names that UniqueCondition needs.
    required = {"concentration", "ligand", "protein", "buffer"}
    missing = required.difference(parsed)
    if missing:
        raise ValueError(
            f"Cannot create UniqueCondition: missing required fields {missing}. "
            f"Parsed fields: {list(parsed.keys())}"
        )

    return UniqueCondition(
        full_name=condition_str,
        concentration=parsed["concentration"],
        ligand_name=parsed["ligand"],
        protein_name=parsed["protein"],
        buffer_condition=parsed["buffer"],
        # NOTE(review): the original expression was
        # `[] if include_replicates else []`, i.e. both branches identical.
        # Behaviour is preserved; confirm what the False case should pass.
        replicates=[],
    )

136 

137 

def condition_to_string(condition: UniqueCondition, delimiter: str = CONDITION_DELIMITER) -> str:
    """
    Reconstruct a condition string from a UniqueCondition object.

    The components are joined in the fixed order concentration, ligand,
    protein, buffer.

    Args:
        condition: UniqueCondition object to serialize
        delimiter: Character to use for separating fields (default: '_')

    Returns:
        Reconstructed condition string

    Raises:
        ValueError: If any of the four components is empty or otherwise falsy

    Examples:
        >>> condition = UniqueCondition(
        ...     concentration="500uM",
        ...     ligand_name="ATP",
        ...     protein_name="Protein1",
        ...     buffer_condition="Buffer1"
        ... )
        >>> condition_to_string(condition)
        '500uM_ATP_Protein1_Buffer1'
    """
    parts = [
        condition.concentration,
        condition.ligand_name,
        condition.protein_name,
        condition.buffer_condition,
    ]

    # Refuse to emit a string with a blank slot: it could not be parsed back
    # into the same four components.
    if not all(parts):
        raise ValueError(
            f"Cannot convert condition to string - missing required fields: {condition}"
        )

    return delimiter.join(parts)

173 

174 

def validate_condition_string(condition_str: str) -> bool:
    """
    Check if a condition string is valid without raising exceptions.

    A string is valid when parse_condition_string accepts it with the default
    delimiter and default fields. Only the format is checked, not the meaning
    of the individual components.

    Args:
        condition_str: The condition string to validate

    Returns:
        True if valid, False otherwise (including for non-string input)

    Examples:
        >>> validate_condition_string("500uM_ATP_Protein1_Buffer1")
        True
        >>> validate_condition_string("invalid")
        False
        >>> # Valid format, even if semantically empty
        >>> validate_condition_string("0_0_0_0")
        True
    """
    # Non-string input would fail inside the parser with AttributeError or
    # TypeError rather than ValueError; report it as invalid instead.
    if not isinstance(condition_str, str):
        return False

    try:
        parse_condition_string(condition_str)
    except (ValueError, IndexError):
        return False
    return True

198 

199 

def parse_concentration_to_float(concentration: str) -> Optional[float]:
    """
    Convert a concentration string to a float value.

    Only the leading numeric part is returned; the unit suffix is ignored,
    so no unit conversion takes place. Surrounding whitespace is ignored.

    Handles common concentration formats like:
    - '500uM' -> 500.0
    - '1mM' -> 1.0
    - '0.5nM' -> 0.5
    - 'apo' -> 0.0
    - 'DMSO' -> 0.0

    Args:
        concentration: Concentration string to parse

    Returns:
        Float value, or None if parsing fails (no leading number, a malformed
        number such as '1.2.3', or a non-string input)

    Examples:
        >>> parse_concentration_to_float("500uM")
        500.0
        >>> parse_concentration_to_float("apo")
        0.0
        >>> parse_concentration_to_float("DMSO")
        0.0
        >>> parse_concentration_to_float(" 1.5mM ")
        1.5
    """
    # Anything that is not a string cannot be parsed; honour the
    # "None if parsing fails" contract instead of raising AttributeError.
    if not isinstance(concentration, str):
        return None

    text = concentration.strip()

    # Labels for ligand-free / vehicle-only conditions map to zero.
    if text.lower() in ("apo", "dmso", "control", "baseline"):
        return 0.0

    # Capture the leading run of digits and dots, e.g. 500uM, 1.5mM, 0.1nM.
    match = re.match(r"^([\d.]+)", text)
    if match is None:
        return None

    try:
        return float(match.group(1))
    except ValueError:
        # The captured run was not a valid number (e.g. '1.2.3' or '.').
        return None

237 

238 return None