Coverage for src/instawell/parser.py: 95%
42 statements
« prev ^ index » next coverage.py v7.11.0, created at 2025-10-28 21:17 -0500
« prev ^ index » next coverage.py v7.11.0, created at 2025-10-28 21:17 -0500
1"""
2Robust parsing utilities for experimental condition strings.
4This module provides functions to parse condition strings like:
5 '500uM_Geranyl-Monophosphate_d104hFic-H363A_1mM-ATP-5mM-MgCl2'
7Into structured components (concentration, ligand, protein, buffer).
8"""
10import re
11from typing import Dict, Optional, Tuple
12from .data_models import UniqueCondition
# Parsing configuration shared by the helpers in this module.
CONDITION_DELIMITER = "_"
DEFAULT_FIELDS = ("concentration", "ligand", "protein", "buffer")


def parse_condition_string(
    condition_str: str,
    delimiter: str = CONDITION_DELIMITER,
    fields: Tuple[str, ...] = DEFAULT_FIELDS,
) -> Dict[str, str]:
    """
    Split a condition string and label its trailing components.

    The string is split on ``delimiter`` and the *last* ``len(fields)``
    pieces are paired, in order, with the names in ``fields``. Any extra
    pieces at the front of the string are silently discarded, so a string
    with more components than fields is still accepted.

    Args:
        condition_str: The full condition string to parse.
        delimiter: Separator between components (default: '_').
        fields: Ordered field names to assign to the trailing components
            (default: ('concentration', 'ligand', 'protein', 'buffer')).

    Returns:
        Dictionary mapping each field name to its component value.

    Raises:
        ValueError: If the string is empty or whitespace-only, or if it
            contains fewer components than there are fields.

    Examples:
        >>> parse_condition_string("500uM_ATP_Protein1_Buffer1")
        {'concentration': '500uM', 'ligand': 'ATP', 'protein': 'Protein1', 'buffer': 'Buffer1'}

        >>> # Hyphenated component names are kept intact
        >>> parse_condition_string("500uM_Geranyl-Monophosphate_d104hFic-H363A_1mM-ATP-5mM-MgCl2")
        {'concentration': '500uM', 'ligand': 'Geranyl-Monophosphate', 'protein': 'd104hFic-H363A', 'buffer': '1mM-ATP-5mM-MgCl2'}

        >>> # Custom field order
        >>> parse_condition_string("ATP_Protein1_500uM_Buffer1", fields=("ligand", "protein", "concentration", "buffer"))
        {'ligand': 'ATP', 'protein': 'Protein1', 'concentration': '500uM', 'buffer': 'Buffer1'}

        >>> # Fewer fields: only the last 2 components are kept
        >>> parse_condition_string("500uM_ATP_Protein1_Buffer1", fields=("protein", "buffer"))
        {'protein': 'Protein1', 'buffer': 'Buffer1'}
    """
    is_blank = not condition_str or not condition_str.strip()
    if is_blank:
        raise ValueError("Condition string cannot be empty")

    tokens = condition_str.split(delimiter)
    wanted = len(fields)

    if wanted > len(tokens):
        raise ValueError(
            f"Condition '{condition_str}' must have at least {wanted} "
            f"'{delimiter}'-separated components for fields {fields}, "
            f"but only found {len(tokens)} components: {tokens}"
        )

    # Keep only the trailing `wanted` tokens; anything before them is dropped.
    tail = tokens[len(tokens) - wanted:]
    return {name: value for name, value in zip(fields, tail)}
def condition_from_string(
    condition_str: str,
    delimiter: str = CONDITION_DELIMITER,
    fields: Tuple[str, ...] = DEFAULT_FIELDS,
    include_replicates: bool = False,
) -> UniqueCondition:
    """
    Parse a condition string and return a UniqueCondition object.

    Args:
        condition_str: The full condition string to parse
        delimiter: Character used to separate fields (default: '_')
        fields: Ordered tuple of field names to parse (default: ('concentration', 'ligand', 'protein', 'buffer')).
            Must contain all four of those names, otherwise a ValueError is raised.
        include_replicates: Accepted for backward compatibility. It currently
            has no effect: the returned object always receives a new empty
            ``replicates`` list, whatever value is passed here.

    Returns:
        UniqueCondition object with parsed fields; ``full_name`` is set to the
        unmodified ``condition_str``.

    Raises:
        ValueError: If the condition string is invalid or required fields are missing

    Examples:
        >>> condition = condition_from_string("500uM_ATP_Protein1_Buffer1")
        >>> condition.concentration
        '500uM'
        >>> condition.ligand_name
        'ATP'

        >>> # Custom field order
        >>> condition = condition_from_string("ATP_Protein1_500uM_Buffer1",
        ...     fields=("ligand", "protein", "concentration", "buffer"))
        >>> condition.ligand_name
        'ATP'
    """
    parsed = parse_condition_string(condition_str, delimiter, fields)

    # A custom `fields` tuple may omit names that UniqueCondition needs,
    # so verify them before building the object.
    required = {"concentration", "ligand", "protein", "buffer"}
    missing = required.difference(parsed)
    if missing:
        raise ValueError(
            f"Cannot create UniqueCondition: missing required fields {missing}. "
            f"Parsed fields: {list(parsed.keys())}"
        )

    # NOTE(review): the previous expression `[] if include_replicates else []`
    # yielded an empty list on both branches, so the flag never changed the
    # result. The redundant conditional is removed and behaviour is unchanged;
    # confirm what the "False" case was meant to produce before giving the
    # flag real semantics.
    return UniqueCondition(
        full_name=condition_str,
        concentration=parsed["concentration"],
        ligand_name=parsed["ligand"],
        protein_name=parsed["protein"],
        buffer_condition=parsed["buffer"],
        replicates=[],
    )
def condition_to_string(condition: UniqueCondition, delimiter: str = CONDITION_DELIMITER) -> str:
    """
    Serialize a UniqueCondition back into a delimited condition string.

    The components are emitted in the fixed order concentration, ligand
    name, protein name, buffer condition.

    Args:
        condition: UniqueCondition object to serialize
        delimiter: Character placed between components (default: '_')

    Returns:
        The four components joined by ``delimiter``

    Raises:
        ValueError: If any of the four components is empty or otherwise falsy

    Examples:
        >>> condition = UniqueCondition(
        ...     concentration="500uM",
        ...     ligand_name="ATP",
        ...     protein_name="Protein1",
        ...     buffer_condition="Buffer1"
        ... )
        >>> condition_to_string(condition)
        '500uM_ATP_Protein1_Buffer1'
    """
    components = (
        condition.concentration,
        condition.ligand_name,
        condition.protein_name,
        condition.buffer_condition,
    )

    # Every component must be present; a single falsy one aborts serialization.
    if not all(components):
        raise ValueError(
            f"Cannot convert condition to string - missing required fields: {condition}"
        )

    return delimiter.join(components)
def validate_condition_string(
    condition_str: str,
    delimiter: str = CONDITION_DELIMITER,
    fields: Tuple[str, ...] = DEFAULT_FIELDS,
) -> bool:
    """
    Check if a condition string is valid without raising exceptions.

    A string is considered valid when ``parse_condition_string`` accepts it
    for the given ``delimiter`` and ``fields``. Only the structure is checked,
    not the meaning of the individual components.

    Args:
        condition_str: The condition string to validate. Any non-string value
            (including None) is reported as invalid.
        delimiter: Character used to separate fields (default: '_')
        fields: Ordered tuple of field names expected in the string
            (default: ('concentration', 'ligand', 'protein', 'buffer'))

    Returns:
        True if valid, False otherwise

    Examples:
        >>> validate_condition_string("500uM_ATP_Protein1_Buffer1")
        True
        >>> validate_condition_string("invalid")
        False
        >>> # Valid format, even if semantically empty
        >>> validate_condition_string("0_0_0_0")
        True
    """
    # A truthy non-string (e.g. an int) would raise AttributeError inside the
    # parser, which would break the "never raises" promise of this helper.
    if not isinstance(condition_str, str):
        return False

    try:
        parse_condition_string(condition_str, delimiter, fields)
    except (ValueError, IndexError):
        return False
    return True
# Labels (compared case-insensitively) that are treated as zero concentration.
_ZERO_CONCENTRATION_LABELS = frozenset({"apo", "dmso", "control", "baseline"})

# Leading run of digits and dots, e.g. the "1.5" in "1.5mM".
_LEADING_NUMBER_RE = re.compile(r"^([\d.]+)")


def parse_concentration_to_float(concentration: str) -> Optional[float]:
    """
    Convert a concentration string to a float value.

    Only the leading number is extracted; the unit suffix is ignored, so
    values are NOT normalised to a common unit:
    - '500uM' -> 500.0
    - '1mM' -> 1.0
    - '0.5nM' -> 0.5
    - 'apo' -> 0.0
    - 'DMSO' -> 0.0

    Leading and trailing whitespace is ignored. The labels 'apo', 'dmso',
    'control' and 'baseline' (any letter case) map to 0.0.

    Args:
        concentration: Concentration string to parse

    Returns:
        Float value, or None if parsing fails (no leading number, a
        malformed number such as '1.2.3', or a non-string input)

    Examples:
        >>> parse_concentration_to_float("500uM")
        500.0
        >>> parse_concentration_to_float("apo")
        0.0
        >>> parse_concentration_to_float("DMSO")
        0.0
        >>> parse_concentration_to_float(" 1.5mM ")
        1.5
        >>> parse_concentration_to_float("unknown") is None
        True
    """
    # Honour the Optional[float] contract: anything that is not a string
    # (e.g. None from a missing field) is a parse failure, not a crash.
    if not isinstance(concentration, str):
        return None

    text = concentration.strip()

    if text.lower() in _ZERO_CONCENTRATION_LABELS:
        return 0.0

    match = _LEADING_NUMBER_RE.match(text)
    if match is None:
        return None

    try:
        return float(match.group(1))
    except ValueError:
        # The pattern also admits non-numbers such as "." or "1.2.3".
        return None