Coverage for nlp_manager/regex_read_codes.py: 98%

131 statements  

« prev     ^ index     » next       coverage.py v7.8.0, created at 2025-08-27 10:34 -0500

1""" 

2crate_anon/nlp_manager/regex_read_codes.py 

3 

4=============================================================================== 

5 

6 Copyright (C) 2015, University of Cambridge, Department of Psychiatry. 

7 Created by Rudolf Cardinal (rnc1001@cam.ac.uk). 

8 

9 This file is part of CRATE. 

10 

11 CRATE is free software: you can redistribute it and/or modify 

12 it under the terms of the GNU General Public License as published by 

13 the Free Software Foundation, either version 3 of the License, or 

14 (at your option) any later version. 

15 

16 CRATE is distributed in the hope that it will be useful, 

17 but WITHOUT ANY WARRANTY; without even the implied warranty of 

18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

19 GNU General Public License for more details. 

20 

21 You should have received a copy of the GNU General Public License 

22 along with CRATE. If not, see <https://www.gnu.org/licenses/>. 

23 

24=============================================================================== 

25 

26**Regular expressions to detect some Read codes (CTV3).** 

27 

28See https://en.wikipedia.org/wiki/Read_code. 

29 

30""" 

31 

32import logging 

33from typing import List 

34 

35from crate_anon.common.regex_helpers import ( 

36 at_start_wb, 

37 escape_literal_string_for_regex, 

38 escape_literal_for_regex_allowing_flexible_whitespace, 

39 LEFT_BRACKET as LB, 

40 OPTIONAL_WHITESPACE, 

41 regex_or, 

42 RIGHT_BRACKET as RB, 

43) 

44 

45log = logging.getLogger(__name__) 

46 

47 

48# ============================================================================= 

49# Represent a Read code 

50# ============================================================================= 

51 

52 

class ReadCode:
    r"""
    Represents information about the way a quantity is represented as a Read
    code.

    An instance pairs one Read (CTV3) code with the free-text phrases that
    may accompany it, and can turn that pairing into regex strings.

    NOTE: Read codes are case-sensitive. (See
    https://www.gp-training.net/it/read-codes/.)

    It would be desirable to mark the Read code as case-sensitive, within a
    regex that is case-insensitive overall. Apparently Tcl supports this via
    the ``(?c)`` flag: https://www.regular-expressions.info/modifiers.html.

    However, others just support the "locally case-insensitive" flag, ``(?i)``.

    Python (via ``regex``) fails to parse the test regex ``(?i)te(?-i)st``,
    from https://www.regular-expressions.info/modifiers.html. It gives the
    error ``regex._regex_core.error: bad inline flags: cannot turn flags off at
    position 11``. No docs at https://pypi.org/project/regex/ or
    https://docs.python.org/3/library/re.html suggest otherwise.

    Since we absolutely want case-insensitive matching for the most part, I
    think we'll live with this limitation.
    """

    def __init__(self, read_code: str, phrases: List[str] = None) -> None:
        """
        Args:
            read_code:
                The Read (CTV3) code, a string of length 5.
            phrases:
                The associated possible phrases. ``None`` (the default) or
                an empty sequence means "no phrases". The phrases are copied
                into a new list, so later changes to the caller's sequence
                do not alter this object.

        Raises:
            AssertionError:
                If ``read_code`` is not a string of length 5 (only when
                assertions are enabled).
        """
        assert isinstance(read_code, str)
        assert len(read_code) == 5
        self.read_code = read_code
        # Defensive copy: never alias the caller's (mutable) list, and
        # normalize any other iterable to a real list.
        self.phrases = list(phrases) if phrases else []  # type: List[str]

    def component_regex_strings(self) -> List[str]:
        """
        A list of regular expression strings representing this quantity.

        One regex string is produced per phrase, in phrase order. Each
        consists of the phrase, then an optional "- observation" suffix, then
        an optional bracketed Read code, i.e. it provides regexes for:

        .. code-block:: none

            phrase (readcode)
            phrase

        Returns:
            A new list of regex strings; empty if there are no phrases.
        """
        esc_read = escape_literal_string_for_regex(self.read_code)
        # NOTE(review): the literal spaces in this pattern are only ignored
        # if the final regex is compiled in verbose mode -- presumably the
        # case for callers; confirm.
        optional_observation = r"(?:\s* - \s+ observation)?"
        optional_code = f"(?:{OPTIONAL_WHITESPACE}{LB}{esc_read}{RB})?"
        # The suffix is identical for every phrase, so build it only once.
        suffix = f"{optional_observation}{optional_code}"
        starts = [
            at_start_wb(
                escape_literal_for_regex_allowing_flexible_whitespace(p)
            )
            for p in self.phrases
        ]
        return [f"{phrase}{suffix}" for phrase in starts]

    def regex_str(self) -> str:
        """
        A single composite regex string representing this quantity.

        Returns:
            The result of passing every string from
            :meth:`component_regex_strings` to ``regex_or``, with each
            alternative and the overall result wrapped in non-capture groups.
        """
        return regex_or(
            *self.component_regex_strings(),
            wrap_each_in_noncapture_group=True,
            wrap_result_in_noncapture_group=True,
        )

124 

125 

126# ============================================================================= 

127# Some known values used by our NLP parsers 

128# ============================================================================= 

129 

130 

class ReadCodes:
    """
    Some known Read codes.

    From ``v3ReadCode_PBCL.xlsx``.

    Each class attribute is a :class:`ReadCode` pairing one five-character
    code with its descriptive phrase.
    """

    # NOTE: phrases are reproduced verbatim, including abbreviated wording
    # such as "lev", "levl", "rndm" and "cholest" -- presumably exactly as
    # in the source spreadsheet; do not "correct" them without checking.

    # -------------------------------------------------------------------------
    # Biochemistry
    # -------------------------------------------------------------------------

    # Albumin, alkaline phosphatase, ALT
    ALBUMIN_PLASMA = ReadCode("XaIRc", phrases=["Plasma albumin level"])
    ALBUMIN_SERUM = ReadCode("XE2eA", phrases=["Serum albumin level"])
    ALKPHOS = ReadCode("44F3.", phrases=["Alkaline phosphatase level"])
    ALKPHOS_PLASMA = ReadCode(
        "XaIRj", phrases=["Plasma alkaline phosphatase level"]
    )
    ALKPHOS_SERUM = ReadCode(
        "XE2px", phrases=["Serum alkaline phosphatase level"]
    )
    ALT = ReadCode("44G3.", phrases=["ALT/SGPT serum level"])

    # Bilirubin
    BILIRUBIN_PLASMA_TOTAL = ReadCode(
        "XaETf", phrases=["Plasma total bilirubin level"]
    )
    BILIRUBIN_SERUM = ReadCode("44E..", phrases=["Serum bilirubin level"])
    BILIRUBIN_SERUM_TOTAL = ReadCode(
        "XaERu", phrases=["Serum total bilirubin level"]
    )
    BILIRUBIN_TOTAL = ReadCode("XE2qu", phrases=["Total bilirubin level"])

    # Cholesterol, creatinine, C-reactive protein
    CHOLESTEROL_SERUM = ReadCode("XE2eD", phrases=["Serum cholesterol level"])
    CHOLESTEROL_TOTAL_PLASMA = ReadCode(
        "XaIRd", phrases=["Plasma total cholesterol level"]
    )
    CHOLESTEROL_TOTAL_SERUM = ReadCode(
        "XaJe9", phrases=["Serum total cholesterol level"]
    )
    CREATININE = ReadCode("X771Q", phrases=["Creatinine level"])
    CREATININE_PLASMA = ReadCode("XaETQ", phrases=["Plasma creatinine level"])
    CREATININE_PLASMA_CORRECTED = ReadCode(
        "XaERX", phrases=["Cor plasma creatinine level"]
    )
    CREATININE_SERUM = ReadCode("XE2q5", phrases=["Serum creatinine level"])
    CREATININE_SERUM_CORRECTED = ReadCode(
        "XaERc", phrases=["Cor serum creatinine level"]
    )
    CRP_PLASMA = ReadCode(
        "XE2dy", phrases=["Plasma C-reactive protein level"]
    )
    CRP_SERUM = ReadCode("XaINL", phrases=["Serum C reactive protein level"])

    # Gamma-glutamyl transferase, glucose
    GAMMA_GT = ReadCode("44G4.", phrases=["Gamma-glutamyl transferase lev"])
    GAMMA_GT_PLASMA = ReadCode(
        "XaES4", phrases=["Plasma gamma-glutamyl transferase level"]
    )
    GAMMA_GT_SERUM = ReadCode(
        "XaES3", phrases=["Serum gamma-glutamyl transferase level"]
    )
    GLUCOSE = ReadCode("X772y", phrases=["Glucose level"])
    GLUCOSE_BLOOD = ReadCode("X772z", phrases=["Blood glucose level"])
    GLUCOSE_BLOOD_2H_POSTPRANDIAL = ReadCode(
        "44U7.", phrases=["2 hour post-prand blood gluc"]
    )
    GLUCOSE_BLOOD_150_MIN = ReadCode(
        "XaEOS", phrases=["150 minute blood glucose level"]
    )
    GLUCOSE_PLASMA_RANDOM = ReadCode(
        "44g0.", phrases=["Plasma random glucose level"]
    )
    GLUCOSE_PLASMA_FASTING = ReadCode(
        "44g1.", phrases=["Plasma fasting glucose level"]
    )
    GLUCOSE_PLASMA_30_MIN = ReadCode(
        "XaEOT", phrases=["30 minute plasma glucose level"]
    )
    GLUCOSE_PLASMA_60_MIN = ReadCode(
        "XaEOU", phrases=["60 minute plasma glucose level"]
    )
    GLUCOSE_PLASMA_90_MIN = ReadCode(
        "XaEPc", phrases=["90 minute plasma glucose level"]
    )
    GLUCOSE_PLASMA_120_MIN = ReadCode(
        "XaEOV", phrases=["120 minute plasma glucose level"]
    )
    GLUCOSE_PLASMA_2H_POSTPRANDIAL = ReadCode(
        "44g2.", phrases=["Plasma 2-hr post-pran gluc lev"]
    )
    GLUCOSE_PLASMA_150_MIN = ReadCode(
        "XaEOW", phrases=["150 min plasma glucose level"]
    )
    GLUCOSE_SERUM = ReadCode("44f..", phrases=["Serum glucose level"])
    GLUCOSE_SERUM_RANDOM = ReadCode(
        "44f0.", phrases=["Serum random glucose level"]
    )
    GLUCOSE_SERUM_FASTING = ReadCode(
        "44f1.", phrases=["Serum fasting glucose level"]
    )
    GLUCOSE_SERUM_30_MIN = ReadCode(
        "XaEOX", phrases=["30 minute serum glucose level"]
    )
    GLUCOSE_SERUM_60_MIN = ReadCode(
        "XaEOY", phrases=["60 minute serum glucose level"]
    )
    GLUCOSE_SERUM_90_MIN = ReadCode(
        "XaEPd", phrases=["90 minute serum glucose level"]
    )
    GLUCOSE_SERUM_120_MIN = ReadCode(
        "XaEOZ", phrases=["120 minute serum glucose level"]
    )
    GLUCOSE_SERUM_2H_POSTPRANDIAL = ReadCode(
        "44f2.", phrases=["Serum 2-hr post-prand gluc lev"]
    )
    GLUCOSE_SERUM_150_MIN = ReadCode(
        "XaERQ", phrases=["150 minute serum glucose level"]
    )

    # HbA1c, HDL cholesterol
    HBA1C = ReadCode("X772q", phrases=["Haemoglobin A1c level"])
    HBA1C_DCCT = ReadCode("XaERp", phrases=["HbA1c level (DCCT aligned)"])
    HBA1C_IFCC = ReadCode("XaPbt", phrases=["HbA1c levl - IFCC standardised"])
    HDL_PLASMA = ReadCode("XaEVr", phrases=["Plasma HDL cholesterol level"])
    HDL_PLASMA_RANDOM = ReadCode(
        "44d2.", phrases=["Plasma rndm HDL cholest level"]
    )
    HDL_PLASMA_FASTING = ReadCode(
        "44d3.", phrases=["Plasma fast HDL cholest level"]
    )
    HDL_SERUM = ReadCode("44P5.", phrases=["Serum HDL cholesterol level"])
    HDL_SERUM_FASTING = ReadCode(
        "44PB.", phrases=["Serum fast HDL cholesterol lev"]
    )
    HDL_SERUM_RANDOM = ReadCode(
        "44PC.", phrases=["Ser random HDL cholesterol lev"]
    )

    # Lithium, LDL cholesterol
    LITHIUM_SERUM = ReadCode("XE25g", phrases=["Serum lithium level"])
    LDL_PLASMA = ReadCode("XaEVs", phrases=["Plasma LDL cholesterol level"])
    LDL_PLASMA_RANDOM = ReadCode(
        "44d4.", phrases=["Plasma rndm LDL cholest level"]
    )
    LDL_PLASMA_FASTING = ReadCode(
        "44d5.", phrases=["Plasma fast LDL cholest level"]
    )
    LDL_SERUM = ReadCode("44P6.", phrases=["Serum LDL cholesterol level"])
    LDL_SERUM_FASTING = ReadCode(
        "44PD.", phrases=["Serum fast LDL cholesterol lev"]
    )
    LDL_SERUM_RANDOM = ReadCode(
        "44PE.", phrases=["Ser random LDL cholesterol lev"]
    )

    # Potassium
    POTASSIUM = ReadCode("X771S", phrases=["Potassium level"])
    POTASSIUM_BLOOD = ReadCode("XaDvZ", phrases=["Blood potassium level"])
    POTASSIUM_PLASMA = ReadCode("XaIRl", phrases=["Plasma potassium level"])
    POTASSIUM_SERUM = ReadCode("XE2pz", phrases=["Serum potassium level"])

    # Triglycerides, TSH
    TG = ReadCode("X772O", phrases=["Triglyceride level"])
    TG_PLASMA = ReadCode("44e..", phrases=["Plasma triglyceride level"])
    TG_PLASMA_RANDOM = ReadCode(
        "44e0.", phrases=["Plasma rndm triglyceride level"]
    )
    TG_PLASMA_FASTING = ReadCode(
        "44e1.", phrases=["Plasma fast triglyceride level"]
    )
    TG_SERUM = ReadCode("XE2q9", phrases=["Serum triglyceride levels"])
    TG_SERUM_FASTING = ReadCode(
        "44Q4.", phrases=["Serum fasting triglyceride lev"]
    )
    TG_SERUM_RANDOM = ReadCode(
        "44Q5.", phrases=["Serum random triglyceride lev"]
    )
    TSH_PLASMA = ReadCode("XaELW", phrases=["Plasma TSH level"])
    TSH_PLASMA_30_MIN = ReadCode(
        "XaET7", phrases=["30 minute plasma TSH level"]
    )
    TSH_PLASMA_60_MIN = ReadCode(
        "XaESa", phrases=["60 minute plasma TSH level"]
    )
    TSH_PLASMA_90_MIN = ReadCode(
        "XaET2", phrases=["90 minute plasma TSH level"]
    )
    TSH_PLASMA_120_MIN = ReadCode(
        "XaESb", phrases=["120 minute plasma TSH level"]
    )
    TSH_PLASMA_150_MIN = ReadCode(
        "XaESc", phrases=["150 minute plasma TSH level"]
    )
    # The serum TSH series below has no 30-minute entry.
    TSH_SERUM = ReadCode("XaELV", phrases=["Serum TSH level"])
    TSH_SERUM_60_MIN = ReadCode(
        "XaESX", phrases=["60 minute serum TSH level"]
    )
    TSH_SERUM_90_MIN = ReadCode(
        "XaESY", phrases=["90 minute serum TSH level"]
    )
    TSH_SERUM_120_MIN = ReadCode(
        "XaET1", phrases=["120 minute serum TSH level"]
    )
    TSH_SERUM_150_MIN = ReadCode(
        "XaESZ", phrases=["150 minute serum TSH level"]
    )

    # Sodium
    SODIUM = ReadCode("X771T", phrases=["Sodium level"])
    SODIUM_BLOOD = ReadCode("XaDva", phrases=["Blood sodium level"])
    SODIUM_PLASMA = ReadCode("XaIRf", phrases=["Plasma sodium level"])
    SODIUM_SERUM = ReadCode("XE2q0", phrases=["Serum sodium level"])

    # Urea
    UREA_BLOOD = ReadCode("X771P", phrases=["Blood urea"])
    UREA_PLASMA = ReadCode("XaDvl", phrases=["Plasma urea level"])
    UREA_SERUM = ReadCode("XM0lt", phrases=["Serum urea level"])

    # -------------------------------------------------------------------------
    # Haematology
    # -------------------------------------------------------------------------

    BASOPHIL_COUNT = ReadCode("42L..", phrases=["Basophil count"])

    EOSINOPHIL_COUNT = ReadCode("42K..", phrases=["Eosinophil count"])
    ESR = ReadCode("XE2m7", phrases=["Erythrocyte sedimentation rate"])

    HAEMATOCRIT = ReadCode("X76tb", phrases=["Haematocrit"])
    HAEMOGLOBIN_CONCENTRATION = ReadCode(
        "Xa96v", phrases=["Haemoglobin concentration"]
    )

    LYMPHOCYTE_COUNT = ReadCode("42M..", phrases=["Lymphocyte count"])

    MONOCYTE_COUNT = ReadCode("42N..", phrases=["Monocyte count"])

    NEUTROPHIL_COUNT = ReadCode("42J..", phrases=["Neutrophil count"])

    PLATELET_COUNT = ReadCode("42P..", phrases=["Platelet count"])
    # Polymorphs = neutrophils
    POLYMORPH_COUNT = ReadCode("XaIao", phrases=["Polymorph count"])

    RBC_COUNT = ReadCode("426..", phrases=["Red blood cell count"])

    WBC_COUNT = ReadCode("XaIdY", phrases=["Total white blood count"])

429 

430 

431# ============================================================================= 

432# Combiner function 

433# ============================================================================= 

434 

435 

def regex_components_from_read_codes(*read_codes: ReadCode) -> List[str]:
    """
    Gathers the component regex strings of several Read codes into one list.

    Args:
        read_codes:
            Any number of :class:`ReadCode` objects.

    Returns:
        A new flat list containing, for each Read code in the order given,
        every string from its ``component_regex_strings()`` method. Empty if
        no Read codes are passed.
    """
    return [
        regex_string
        for read_code in read_codes
        for regex_string in read_code.component_regex_strings()
    ]

444 

445 

def any_read_code_of(*read_codes: ReadCode) -> str:
    """
    Returns a regex allowing any of the specified Read codes.

    Args:
        read_codes:
            Any number of :class:`ReadCode` objects.

    Returns:
        The result of handing every component regex string of the given Read
        codes to ``regex_or``, asking it to wrap each alternative, and the
        overall result, in a non-capture group.
    """
    alternatives = regex_components_from_read_codes(*read_codes)
    wrapping = dict(
        wrap_each_in_noncapture_group=True,
        wrap_result_in_noncapture_group=True,
    )
    return regex_or(*alternatives, **wrapping)