Coverage for nlp_manager/nlp_definition.py: 61%

233 statements  

« prev     ^ index     » next       coverage.py v7.8.0, created at 2026-01-06 10:22 -0600

1""" 

2crate_anon/nlp_manager/nlp_definition.py 

3 

4=============================================================================== 

5 

6 Copyright (C) 2015, University of Cambridge, Department of Psychiatry. 

7 Created by Rudolf Cardinal (rnc1001@cam.ac.uk). 

8 

9 This file is part of CRATE. 

10 

11 CRATE is free software: you can redistribute it and/or modify 

12 it under the terms of the GNU General Public License as published by 

13 the Free Software Foundation, either version 3 of the License, or 

14 (at your option) any later version. 

15 

16 CRATE is distributed in the hope that it will be useful, 

17 but WITHOUT ANY WARRANTY; without even the implied warranty of 

18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

19 GNU General Public License for more details. 

20 

21 You should have received a copy of the GNU General Public License 

22 along with CRATE. If not, see <https://www.gnu.org/licenses/>. 

23 

24=============================================================================== 

25 

26**NLP definition class.** 

27 

28""" 

29 

30# ============================================================================= 

31# Imports 

32# ============================================================================= 

33 

34import datetime 

35import json 

36import logging 

37import os 

38import sys 

39from typing import ( 

40 Any, 

41 Dict, 

42 Iterable, 

43 List, 

44 Optional, 

45 Tuple, 

46 Type, 

47 TYPE_CHECKING, 

48) 

49 

50from cardinal_pythonlib.datetimefunc import get_now_utc_notz_datetime 

51from cardinal_pythonlib.docker import running_under_docker 

52from cardinal_pythonlib.lists import chunks 

53from sqlalchemy.engine.base import Engine 

54from sqlalchemy.orm.session import Session 

55from sqlalchemy.schema import MetaData 

56 

57from crate_anon.anonymise.constants import AnonymiseConfigDefaults 

58from crate_anon.anonymise.dbholder import DatabaseHolder 

59from crate_anon.common.constants import EnvVar 

60from crate_anon.common.extendedconfigparser import ( 

61 ConfigSection, 

62 ExtendedConfigParser, 

63) 

64from crate_anon.common.sql import TransactionSizeLimiter 

65from crate_anon.nlp_manager.cloud_config import CloudConfig 

66from crate_anon.nlp_manager.constants import ( 

67 CloudNlpConfigKeys, 

68 DatabaseConfigKeys, 

69 DEFAULT_CLOUD_LIMIT_BEFORE_COMMIT, 

70 DEFAULT_CLOUD_MAX_CONTENT_LENGTH, 

71 DEFAULT_CLOUD_MAX_RECORDS_PER_REQUEST, 

72 DEFAULT_CLOUD_MAX_TRIES, 

73 DEFAULT_CLOUD_RATE_LIMIT_HZ, 

74 DEFAULT_CLOUD_WAIT_ON_CONN_ERR_S, 

75 DEFAULT_TEMPORARY_TABLENAME, 

76 full_sectionname, 

77 GATE_PIPELINE_CLASSNAME, 

78 NlpOutputConfigKeys, 

79 HashClass, 

80 InputFieldConfigKeys, 

81 MAX_SQL_FIELD_LEN, 

82 NLP_CONFIG_ENV_VAR, 

83 NlpConfigPrefixes, 

84 NlpDefConfigKeys, 

85 ProcessorConfigKeys, 

86 NlpDefValues, 

87) 

88from crate_anon.nlprp.constants import NlprpKeys 

89from crate_anon.version import CRATE_VERSION, CRATE_VERSION_DATE 

90 

91if TYPE_CHECKING: 

92 from crate_anon.nlp_manager.base_nlp_parser import ( 

93 BaseNlpParser, 

94 TableMaker, 

95 ) 

96 from crate_anon.nlp_manager.input_field_config import InputFieldConfig 

97 

98log = logging.getLogger(__name__) 

99 

100 

101# ============================================================================= 

102# Demo config 

103# ============================================================================= 

104 

105 

106def demo_nlp_config() -> str: 

107 """ 

108 Returns a demo NLP config file for CRATE. 

109 """ 

110 # ------------------------------------------------------------------------- 

111 # Imports 

112 # ------------------------------------------------------------------------- 

113 

114 from crate_anon.nlp_manager.parse_biochemistry import ( 

115 ALL_BIOCHEMISTRY_NLP_AND_VALIDATORS, 

116 ) # delayed import 

117 from crate_anon.nlp_manager.parse_clinical import ( 

118 ALL_CLINICAL_NLP_AND_VALIDATORS, 

119 ) # delayed import 

120 from crate_anon.nlp_manager.parse_cognitive import ( 

121 ALL_COGNITIVE_NLP_AND_VALIDATORS, 

122 ) # delayed import 

123 from crate_anon.nlp_manager.parse_haematology import ( 

124 ALL_HAEMATOLOGY_NLP_AND_VALIDATORS, 

125 ) # delayed import 

126 from crate_anon.nlp_manager.parse_substance_misuse import ( 

127 ALL_SUBSTANCE_MISUSE_NLP_AND_VALIDATORS, 

128 ) # delayed import 

129 

130 # ------------------------------------------------------------------------- 

131 # Helper functions 

132 # ------------------------------------------------------------------------- 

133 

def _make_procdef_pair(name: str) -> str:
    """
    Builds the config text for a pair of processor-definition sections.

    Args:
        name: suffix used in both section names and both destination
            table names

    Returns:
        the section ``procdef_<name>`` immediately followed by the section
        ``procdef_validate_<name>`` (no blank line in between); each section
        sets a destination database and a destination table
    """
    section_prefix = NlpConfigPrefixes.PROCESSOR
    db_key = ProcessorConfigKeys.DESTDB
    table_key = ProcessorConfigKeys.DESTTABLE
    # "destdb" is not a parameter: it is resolved from the surrounding scope
    # at the time this helper is called.
    config_lines = [
        f"[{section_prefix}:procdef_{name}]",
        f"{db_key} = {destdb}",
        f"{table_key} = {name}",
        f"[{section_prefix}:procdef_validate_{name}]",
        f"{db_key} = {destdb}",
        f"{table_key} = validate_{name}",
    ]
    return "\n".join(config_lines)

141 

def _make_module_procdef_block(
    nlp_and_validators: List[
        Tuple[Type["BaseNlpParser"], Type["BaseNlpParser"]]
    ]
) -> str:
    """
    Builds the processor-definition config text for a whole group of
    parsers.

    Args:
        nlp_and_validators: list of two-element tuples of classes; only the
            first class of each tuple is consulted here, via its
            ``classname()``

    Returns:
        one ``_make_procdef_pair()`` result per tuple, keyed on the
        lower-cased class name, separated by blank lines
    """
    # The second element is unpacked (so each item must still be a pair) but
    # deliberately unused: the pair helper derives the validator section name
    # from the same lower-cased name.
    return "\n\n".join(
        _make_procdef_pair(parser_cls.classname().lower())
        for parser_cls, _validator_cls in nlp_and_validators
    )

153 

154 def _make_proclist( 

155 nlp_and_validators: List[ 

156 Tuple[Type["BaseNlpParser"], Type["BaseNlpParser"]] 

157 ] 

158 ) -> str: 

159 _proclist = [] # type: List[str] 

160 for nlpclass, validatorclass in nlp_and_validators: 

161 _name = nlpclass.classname().lower() 

162 _proclist.append( 

163 f" {nlpclass.classname()} procdef_{_name}\n" 

164 f" {validatorclass.classname()} procdef_validate_{_name}" 

165 ) 

166 return "\n".join(_proclist) 

167 

168 # ------------------------------------------------------------------------- 

169 # Quasi-constants 

170 # ------------------------------------------------------------------------- 

171 

172 for_docker = running_under_docker() 

173 

174 destdb = "DESTINATION_DATABASE" 

175 hashphrase = "doesnotmatter" 

176 if_clin_docs = "INPUT_FIELD_CLINICAL_DOCUMENTS" 

177 if_prog_notes = "INPUT_FIELD_PROGRESS_NOTES" 

178 inputfields = f"{if_clin_docs}\n {if_prog_notes}" 

179 truncate_text_at = "32766" 

180 my_env = "MY_ENV_SECTION" 

181 my_src_db = "SOURCE_DATABASE" 

182 my_cloud = "my_uk_cloud_service" 

183 ridfield = "RID_FIELD" 

184 tridfield = "TRID_FIELD" 

185 nlp_input_terminator = "END_OF_TEXT_FOR_NLP" 

186 nlp_output_terminator = "END_OF_NLP_OUTPUT_RECORD" 

187 

188 procdefs_biochemistry = _make_module_procdef_block( 

189 ALL_BIOCHEMISTRY_NLP_AND_VALIDATORS 

190 ) 

191 procdefs_clinical = _make_module_procdef_block( 

192 ALL_CLINICAL_NLP_AND_VALIDATORS 

193 ) 

194 procdefs_cognitive = _make_module_procdef_block( 

195 ALL_COGNITIVE_NLP_AND_VALIDATORS 

196 ) 

197 procdefs_haematology = _make_module_procdef_block( 

198 ALL_HAEMATOLOGY_NLP_AND_VALIDATORS 

199 ) 

200 procdefs_substance_misuse = _make_module_procdef_block( 

201 ALL_SUBSTANCE_MISUSE_NLP_AND_VALIDATORS 

202 ) 

203 

204 proclist_biochemistry = _make_proclist(ALL_BIOCHEMISTRY_NLP_AND_VALIDATORS) 

205 proclist_clinical = _make_proclist(ALL_CLINICAL_NLP_AND_VALIDATORS) 

206 proclist_cognitive = _make_proclist(ALL_COGNITIVE_NLP_AND_VALIDATORS) 

207 proclist_haematology = _make_proclist(ALL_HAEMATOLOGY_NLP_AND_VALIDATORS) 

208 proclist_substance_misuse = _make_proclist( 

209 ALL_SUBSTANCE_MISUSE_NLP_AND_VALIDATORS 

210 ) 

211 

212 if EnvVar.GENERATING_CRATE_DOCS in os.environ: 

213 nlp_prog_dir = "/path/to/crate_anon/nlp_manager/compiled_nlp_classes" 

214 else: 

215 this_dir = os.path.abspath( 

216 os.path.dirname(__file__) 

217 ) # crate_anon/nlp_manager 

218 nlp_prog_dir = os.path.join(this_dir, "compiled_nlp_classes") 

219 

220 if for_docker: 

221 # See crate.Dockerfile 

222 gate_home = "/crate/gate" 

223 kcl_pharmacotherapy_dir = "/crate/brc-gate-pharmacotherapy" 

224 cloud_request_data_dir = "/crate/tmp/clouddata" 

225 gate_plugin_file = ( 

226 "/crate/src/crate_anon/nlp_manager/specimen_gate_plugin_file.ini" 

227 ) 

228 else: 

229 gate_home = "/path/to/GATE_Developer_9.0.1" 

230 kcl_pharmacotherapy_dir = "/path/to/brc-gate-pharmacotherapy" 

231 cloud_request_data_dir = "/srv/crate/clouddata" 

232 gate_plugin_file = "/path/to/specimen_gate_plugin_file.ini" 

233 

234 _DA = AnonymiseConfigDefaults 

235 

236 # ------------------------------------------------------------------------- 

237 # The demo config itself 

238 # ------------------------------------------------------------------------- 

239 

240 # noinspection HttpUrlsUsage 

241 return f"""# Configuration file for CRATE NLP manager (crate_nlp). 

242# Version {CRATE_VERSION} ({CRATE_VERSION_DATE}). 

243# 

244# PLEASE SEE THE HELP at https://crateanon.readthedocs.io/ 

245# Using defaults for Docker environment: {for_docker} 

246 

247# ============================================================================= 

248# A. Individual NLP definitions 

249# ============================================================================= 

250# - referred to by the NLP manager's command-line arguments 

251# - You are likely to need to alter these (particularly the bits in capital 

252# letters) to refer to your own database(s). 

253 

254# ----------------------------------------------------------------------------- 

255# GATE people-and-places demo 

256# ----------------------------------------------------------------------------- 

257 

258[{NlpConfigPrefixes.NLPDEF}:gate_name_location_demo] 

259 

260{NlpDefConfigKeys.INPUTFIELDDEFS} = 

261 {inputfields} 

262{NlpDefConfigKeys.PROCESSORS} = 

263 GATE procdef_gate_name_location 

264{NlpDefConfigKeys.PROGRESSDB} = {destdb} 

265{NlpDefConfigKeys.HASHPHRASE} = {hashphrase} 

266 

267 

268# ----------------------------------------------------------------------------- 

269# KConnect (Bio-YODIE) disease-finding GATE app 

270# ----------------------------------------------------------------------------- 

271 

272[{NlpConfigPrefixes.NLPDEF}:gate_kconnect_diseases] 

273 

274{NlpDefConfigKeys.INPUTFIELDDEFS} = 

275 {inputfields} 

276{NlpDefConfigKeys.PROCESSORS} = 

277 GATE procdef_gate_kconnect 

278{NlpDefConfigKeys.PROGRESSDB} = {destdb} 

279{NlpDefConfigKeys.HASHPHRASE} = {hashphrase} 

280 

281 

282# ----------------------------------------------------------------------------- 

283# KCL Lewy body dementia GATE app 

284# ----------------------------------------------------------------------------- 

285 

286[{NlpConfigPrefixes.NLPDEF}:gate_kcl_lbd] 

287 

288{NlpDefConfigKeys.INPUTFIELDDEFS} = 

289 {inputfields} 

290{NlpDefConfigKeys.PROCESSORS} = 

291 GATE procdef_gate_kcl_lbda 

292{NlpDefConfigKeys.PROGRESSDB} = {destdb} 

293{NlpDefConfigKeys.HASHPHRASE} = {hashphrase} 

294 

295 

296# ----------------------------------------------------------------------------- 

297# KCL pharmacotherapy GATE app 

298# ----------------------------------------------------------------------------- 

299 

300[{NlpConfigPrefixes.NLPDEF}:gate_kcl_pharmacotherapy] 

301 

302{NlpDefConfigKeys.INPUTFIELDDEFS} = 

303 {inputfields} 

304{NlpDefConfigKeys.PROCESSORS} = 

305 GATE procdef_gate_pharmacotherapy 

306{NlpDefConfigKeys.PROGRESSDB} = {destdb} 

307{NlpDefConfigKeys.HASHPHRASE} = {hashphrase} 

308 

309 

310# ----------------------------------------------------------------------------- 

311# Medex-UIMA medication-finding app 

312# ----------------------------------------------------------------------------- 

313 

314[{NlpConfigPrefixes.NLPDEF}:medex_medications] 

315 

316{NlpDefConfigKeys.INPUTFIELDDEFS} = 

317 {inputfields} 

318{NlpDefConfigKeys.PROCESSORS} = 

319 Medex procdef_medex_medications 

320{NlpDefConfigKeys.PROGRESSDB} = {destdb} 

321{NlpDefConfigKeys.HASHPHRASE} = {hashphrase} 

322 

323 

324# ----------------------------------------------------------------------------- 

325# CRATE number-finding Python regexes 

326# ----------------------------------------------------------------------------- 

327 

328[{NlpConfigPrefixes.NLPDEF}:crate_biomarkers] 

329 

330{NlpDefConfigKeys.INPUTFIELDDEFS} = 

331 {inputfields} 

332{NlpDefConfigKeys.PROCESSORS} = 

333 # ------------------------------------------------------------------------- 

334 # Biochemistry 

335 # ------------------------------------------------------------------------- 

336{proclist_biochemistry} 

337 # ------------------------------------------------------------------------- 

338 # Clinical 

339 # ------------------------------------------------------------------------- 

340{proclist_clinical} 

341 # ------------------------------------------------------------------------- 

342 # Cognitive 

343 # ------------------------------------------------------------------------- 

344{proclist_cognitive} 

345 # ------------------------------------------------------------------------- 

346 # Haematology 

347 # ------------------------------------------------------------------------- 

348{proclist_haematology} 

349 # ------------------------------------------------------------------------- 

350 # Substance misuse 

351 # ------------------------------------------------------------------------- 

352{proclist_substance_misuse} 

353 

354{NlpDefConfigKeys.PROGRESSDB} = {destdb} 

355{NlpDefConfigKeys.HASHPHRASE} = {hashphrase} 

356# {NlpDefConfigKeys.TRUNCATE_TEXT_AT} = {truncate_text_at} 

357# {NlpDefConfigKeys.RECORD_TRUNCATED_VALUES} = False 

358{NlpDefConfigKeys.MAX_ROWS_BEFORE_COMMIT} = {_DA.MAX_ROWS_BEFORE_COMMIT} 

359{NlpDefConfigKeys.MAX_BYTES_BEFORE_COMMIT} = {_DA.MAX_BYTES_BEFORE_COMMIT} 

360 

361# ----------------------------------------------------------------------------- 

362# Cloud NLP demo 

363# ----------------------------------------------------------------------------- 

364 

365[{NlpConfigPrefixes.NLPDEF}:cloud_nlp_demo] 

366 

367{NlpDefConfigKeys.INPUTFIELDDEFS} = 

368 {inputfields} 

369{NlpDefConfigKeys.PROCESSORS} = 

370 Cloud procdef_cloud_crp 

371{NlpDefConfigKeys.PROGRESSDB} = {destdb} 

372{NlpDefConfigKeys.HASHPHRASE} = {hashphrase} 

373{NlpDefConfigKeys.CLOUD_CONFIG} = {my_cloud} 

374{NlpDefConfigKeys.CLOUD_REQUEST_DATA_DIR} = {cloud_request_data_dir} 

375 

376 

377# ============================================================================= 

378# B. NLP processor definitions 

379# ============================================================================= 

380# - You're likely to have to modify the destination databases these point to, 

381# but otherwise you can probably leave them as they are. 

382 

383# ----------------------------------------------------------------------------- 

384# Specimen CRATE regular expression processor definitions 

385# ----------------------------------------------------------------------------- 

386 

387 # Most of these are very simple, and just require a destination database 

388 # (as a cross-reference to a database section within this file) and a 

389 # destination table. 

390 

391 # Biochemistry 

392 

393{procdefs_biochemistry} 

394 

395 # Clinical 

396 

397{procdefs_clinical} 

398 

399 # Cognitive 

400 

401{procdefs_cognitive} 

402 

403 # Haematology 

404 

405{procdefs_haematology} 

406 

407 # Substance misuse 

408 

409{procdefs_substance_misuse} 

410 

411# ----------------------------------------------------------------------------- 

412# Specimen GATE demo people/places processor definition 

413# ----------------------------------------------------------------------------- 

414 

415 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 

416 # Define the processor 

417 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 

418 

419[{NlpConfigPrefixes.PROCESSOR}:procdef_gate_name_location] 

420 

421{ProcessorConfigKeys.DESTDB} = {destdb} 

422{ProcessorConfigKeys.OUTPUTTYPEMAP} = 

423 Person output_person 

424 Location output_location 

425{ProcessorConfigKeys.PROGARGS} = 

426 java 

427 -classpath "{{NLPPROGDIR}}"{{OS_PATHSEP}}"{{GATE_HOME}}/lib/*" 

428 -Dgate.home="{{GATE_HOME}}" 

429 {GATE_PIPELINE_CLASSNAME} 

430 --gate_app "{{GATE_HOME}}/plugins/ANNIE/ANNIE_with_defaults.gapp" 

431 --pluginfile "{{GATE_PLUGIN_FILE}}" 

432 --annotation Person 

433 --annotation Location 

434 --input_terminator {nlp_input_terminator} 

435 --output_terminator {nlp_output_terminator} 

436 --log_tag {{NLPLOGTAG}} 

437 --verbose 

438{ProcessorConfigKeys.PROGENVSECTION} = {my_env} 

439{ProcessorConfigKeys.INPUT_TERMINATOR} = {nlp_input_terminator} 

440{ProcessorConfigKeys.OUTPUT_TERMINATOR} = {nlp_output_terminator} 

441# {ProcessorConfigKeys.MAX_EXTERNAL_PROG_USES} = 1000 

442 

443 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 

444 # Define the output tables used by this GATE processor 

445 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 

446 

447[{NlpConfigPrefixes.OUTPUT}:output_person] 

448 

449{NlpOutputConfigKeys.DESTTABLE} = person 

450{NlpOutputConfigKeys.RENAMES} = 

451 firstName firstname 

452{NlpOutputConfigKeys.DESTFIELDS} = 

453 rule VARCHAR(100) Rule used to find this person (e.g. TitleFirstName, PersonFull) 

454 firstname VARCHAR(100) First name 

455 surname VARCHAR(100) Surname 

456 gender VARCHAR(7) Gender (e.g. male, female, unknown) 

457 kind VARCHAR(100) Kind of name (e.g. personName, fullName) 

458 # ... longest gender: "unknown" (7) 

459{NlpOutputConfigKeys.INDEXDEFS} = 

460 firstname 64 

461 surname 64 

462 

463[{NlpConfigPrefixes.OUTPUT}:output_location] 

464 

465{NlpOutputConfigKeys.DESTTABLE} = location 

466{NlpOutputConfigKeys.RENAMES} = 

467 locType loctype 

468{NlpOutputConfigKeys.DESTFIELDS} = 

469 rule VARCHAR(100) Rule used (e.g. Location1) 

470 loctype VARCHAR(100) Location type (e.g. city) 

471{NlpOutputConfigKeys.INDEXDEFS} = 

472 rule 100 

473 loctype 100 

474 

475 

476# ----------------------------------------------------------------------------- 

477# Specimen Sheffield/KCL KConnect (Bio-YODIE) processor definition 

478# ----------------------------------------------------------------------------- 

479# https://gate.ac.uk/applications/bio-yodie.html 

480 

481 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 

482 # Define the processor 

483 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 

484 

485[{NlpConfigPrefixes.PROCESSOR}:procdef_gate_kconnect] 

486 

487{ProcessorConfigKeys.DESTDB} = {destdb} 

488{ProcessorConfigKeys.OUTPUTTYPEMAP} = 

489 Disease_or_Syndrome output_disease_or_syndrome 

490{ProcessorConfigKeys.PROGARGS} = 

491 java 

492 -classpath "{{NLPPROGDIR}}"{{OS_PATHSEP}}"{{GATE_HOME}}/lib/*" 

493 -Dgate.home="{{GATE_HOME}}" 

494 {GATE_PIPELINE_CLASSNAME} 

495 --gate_app "{{KCONNECTDIR}}/main-bio/main-bio.xgapp" 

496 --pluginfile "{{GATE_PLUGIN_FILE}}" 

497 --annotation Disease_or_Syndrome 

498 --input_terminator {nlp_input_terminator} 

499 --output_terminator {nlp_output_terminator} 

500 --log_tag {{NLPLOGTAG}} 

501 --suppress_gate_stdout 

502 --verbose 

503{ProcessorConfigKeys.PROGENVSECTION} = {my_env} 

504{ProcessorConfigKeys.INPUT_TERMINATOR} = {nlp_input_terminator} 

505{ProcessorConfigKeys.OUTPUT_TERMINATOR} = {nlp_output_terminator} 

506# {ProcessorConfigKeys.MAX_EXTERNAL_PROG_USES} = 1000 

507 

508 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 

509 # Define the output tables used by this GATE processor 

510 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 

511 

512[{NlpConfigPrefixes.OUTPUT}:output_disease_or_syndrome] 

513 

514{NlpOutputConfigKeys.DESTTABLE} = kconnect_diseases 

515{NlpOutputConfigKeys.RENAMES} = 

516 Experiencer experiencer 

517 Negation negation 

518 PREF pref 

519 STY sty 

520 TUI tui 

521 Temporality temporality 

522 VOCABS vocabs 

523{NlpOutputConfigKeys.DESTFIELDS} = 

524 # Found by manual inspection of KConnect/Bio-YODIE output from the GATE console: 

525 experiencer VARCHAR(100) Who experienced it; e.g. "Patient", "Other" 

526 negation VARCHAR(100) Was it negated or not; e.g. "Affirmed", "Negated" 

527 pref VARCHAR(100) PREFferred name; e.g. "Rheumatic gout" 

528 sty VARCHAR(100) Semantic Type (STY) [semantic type name]; e.g. "Disease or Syndrome" 

529 tui VARCHAR(4) Type Unique Identifier (TUI) [semantic type identifier]; 4 characters; https://www.ncbi.nlm.nih.gov/books/NBK9679/; e.g. "T047" 

530 temporality VARCHAR(100) Occurrence in time; e.g. "Recent", "historical", "hypothetical" 

531 vocabs VARCHAR(255) List of UMLS vocabularies; e.g. "AIR,MSH,NDFRT,MEDLINEPLUS,NCI,LNC,NCI_FDA,NCI,MTH,AIR,ICD9CM,LNC,SNOMEDCT_US,LCH_NW,HPO,SNOMEDCT_US,ICD9CM,SNOMEDCT_US,COSTAR,CST,DXP,QMR,OMIM,OMIM,AOD,CSP,NCI_NCI-GLOSS,CHV" 

532 inst VARCHAR(8) Looks like a Concept Unique Identifier (CUI); 1 letter then 7 digits; e.g. "C0003873" 

533 inst_full VARCHAR(255) Looks like a URL to a CUI; e.g. "http://linkedlifedata.com/resource/umls/id/C0003873" 

534 language VARCHAR(100) Language; e.g. ""; ?will look like "ENG" for English? See https://www.nlm.nih.gov/research/umls/implementation_resources/query_diagrams/er1.html 

535 tui_full VARCHAR(255) TUI (?); e.g. "http://linkedlifedata.com/resource/semanticnetwork/id/T047" 

536{NlpOutputConfigKeys.INDEXDEFS} = 

537 pref 100 

538 sty 100 

539 tui 4 

540 inst 8 

541 

542 

543# ----------------------------------------------------------------------------- 

544# Specimen KCL GATE pharmacotherapy processor definition 

545# ----------------------------------------------------------------------------- 

546# https://github.com/KHP-Informatics/brc-gate-pharmacotherapy 

547 

548 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 

549 # Define the processor 

550 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 

551 

552[{NlpConfigPrefixes.PROCESSOR}:procdef_gate_pharmacotherapy] 

553 

554{ProcessorConfigKeys.DESTDB} = {destdb} 

555{ProcessorConfigKeys.OUTPUTTYPEMAP} = 

556 Prescription output_prescription 

557{ProcessorConfigKeys.PROGARGS} = 

558 java 

559 -classpath "{{NLPPROGDIR}}"{{OS_PATHSEP}}"{{GATE_HOME}}/lib/*" 

560 -Dgate.home="{{GATE_HOME}}" 

561 {GATE_PIPELINE_CLASSNAME} 

562 --gate_app "{{GATE_PHARMACOTHERAPY_DIR}}/application.xgapp" 

563 --pluginfile "{{GATE_PLUGIN_FILE}}" 

564 --include_set Output 

565 --annotation Prescription 

566 --input_terminator {nlp_input_terminator} 

567 --output_terminator {nlp_output_terminator} 

568 --log_tag {{NLPLOGTAG}} 

569 --suppress_gate_stdout 

570 --show_contents_on_crash 

571{ProcessorConfigKeys.PROGENVSECTION} = {my_env} 

572{ProcessorConfigKeys.INPUT_TERMINATOR} = {nlp_input_terminator} 

573{ProcessorConfigKeys.OUTPUT_TERMINATOR} = {nlp_output_terminator} 

574# {ProcessorConfigKeys.MAX_EXTERNAL_PROG_USES} = 1000 

575 

576 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 

577 # Define the output tables used by this GATE processor 

578 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 

579 

580[{NlpConfigPrefixes.OUTPUT}:output_prescription] 

581 

582{NlpOutputConfigKeys.DESTTABLE} = medications_gate 

583{NlpOutputConfigKeys.RENAMES} = 

584 drug-type drug_type 

585 dose-value dose_value 

586 dose-unit dose_unit 

587 dose-multiple dose_multiple 

588 Directionality directionality 

589 Experiencer experiencer 

590 "Length of Time" length_of_time 

591 Temporality temporality 

592 "Unit of Time" unit_of_time 

593{NlpOutputConfigKeys.NULL_LITERALS} = 

594 null 

595 "" 

596{NlpOutputConfigKeys.DESTFIELDS} = 

597 # Found by (a) manual inspection of BRC GATE pharmacotherapy output from 

598 # the GATE console; (b) inspection of 

599 # application-resources/schemas/Prescription.xml 

600 # Note preference for DECIMAL over FLOAT/REAL; see 

601 # https://stackoverflow.com/questions/1056323 

602 # Note that not all annotations appear for all texts. Try e.g.: 

603 # Please start haloperidol 5mg tds. 

604 # I suggest you start haloperidol 5mg tds for one week. 

605 rule VARCHAR(100) Rule yielding this drug. Not in XML but is present in a subset: e.g. "weanOff"; max length unclear 

606 drug VARCHAR(200) Drug name. Required string; e.g. "haloperidol"; max length 47 from "wc -L BNF_generic.lst", 134 from BNF_trade.lst 

607 drug_type VARCHAR(100) Type of drug name. Required string; from "drug-type"; e.g. "BNF_generic"; ?length of longest drug ".lst" filename 

608 dose VARCHAR(100) Dose text. Required string; e.g. "5mg"; max length unclear 

609 dose_value DECIMAL Numerical dose value. Required numeric; from "dose-value"; "double" in the XML but DECIMAL probably better; e.g. 5.0 

610 dose_unit VARCHAR(100) Text of dose units. Required string; from "dose-unit"; e.g. "mg"; max length unclear 

611 dose_multiple INT Dose count (multiple). Required integer; from "dose-multiple"; e.g. 1 

612 route VARCHAR(7) Route of administration. Required string; one of: "oral", "im", "iv", "rectal", "sc", "dermal", "unknown" 

613 status VARCHAR(10) Change in drug status. Required; one of: "start", "continuing", "stop" 

614 tense VARCHAR(7) Tense in which drug is referred to. Required; one of: "past", "present" 

615 date VARCHAR(100) ?. Optional string; max length unclear 

616 directionality VARCHAR(100) ?. Optional string; max length unclear 

617 experiencer VARCHAR(100) Person experiencing the drug-related event. Optional string; e.g. "Patient" 

618 frequency DECIMAL Frequency (times per <time_unit>). Optional numeric; "double" in the XML but DECIMAL probably better 

619 interval DECIMAL The n in "every n <time_unit>s" (1 for "every <time_unit>"). Optional numeric; "double" in the XML but DECIMAL probably better 

620 length_of_time VARCHAR(100) ?. Optional string; from "Length of Time"; max length unclear 

621 temporality VARCHAR(100) ?. Optional string; e.g. "Recent", "Historical" 

622 time_unit VARCHAR(100) Unit of time (see frequency, interval). Optional string; from "time-unit"; e.g. "day"; max length unclear 

623 unit_of_time VARCHAR(100) ?. Optional string; from "Unit of Time"; max length unclear 

624 when VARCHAR(100) ?. Optional string; max length unclear 

625{NlpOutputConfigKeys.INDEXDEFS} = 

626 rule 100 

627 drug 200 

628 route 7 

629 status 10 

630 tense 7 

631 

632 

633# ----------------------------------------------------------------------------- 

634# Specimen KCL Lewy Body Diagnosis Application (LBDA) processor definition 

635# ----------------------------------------------------------------------------- 

636# https://github.com/KHP-Informatics/brc-gate-LBD 

637 

638 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 

639 # Define the processor 

640 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 

641 

642[{NlpConfigPrefixes.PROCESSOR}:procdef_gate_kcl_lbda] 

643 

644 # "cDiagnosis" is the "confirmed diagnosis" field, as d/w Jyoti Jyoti 

645 # 2018-03-20; see also README.md. This appears in the "Automatic" and the 

646 # unnamed set. There is also a near-miss one, "DiagnosisAlmost", which 

647 # appears in the unnamed set. 

648 # "Mr Jones has Lewy body dementia." 

649 # -> DiagnosisAlmost 

650 # "Mr Jones has a diagnosis of Lewy body dementia." 

651 # -> DiagnosisAlmost, cDiagnosis 

652 # Note that we must use lower case in the outputtypemap. 

653 

654{ProcessorConfigKeys.DESTDB} = {destdb} 

655{ProcessorConfigKeys.OUTPUTTYPEMAP} = 

656 cDiagnosis output_lbd_diagnosis 

657 DiagnosisAlmost output_lbd_diagnosis 

658{ProcessorConfigKeys.PROGARGS} = 

659 java 

660 -classpath "{{NLPPROGDIR}}"{{OS_PATHSEP}}"{{GATE_HOME}}/lib/*" 

661 -Dgate.home="{{GATE_HOME}}" 

662 {GATE_PIPELINE_CLASSNAME} 

663 --gate_app "{{KCL_LBDA_DIR}}/application.xgapp" 

664 --pluginfile "{{GATE_PLUGIN_FILE}}" 

665 --set_annotation "" DiagnosisAlmost 

666 --set_annotation Automatic cDiagnosis 

667 --input_terminator {nlp_input_terminator} 

668 --output_terminator {nlp_output_terminator} 

669 --log_tag {{NLPLOGTAG}} 

670 --suppress_gate_stdout 

671 --verbose 

672{ProcessorConfigKeys.PROGENVSECTION} = {my_env} 

673{ProcessorConfigKeys.INPUT_TERMINATOR} = {nlp_input_terminator} 

674{ProcessorConfigKeys.OUTPUT_TERMINATOR} = {nlp_output_terminator} 

675# {ProcessorConfigKeys.MAX_EXTERNAL_PROG_USES} = 1000 

676 

677 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 

678 # Define the output tables used by this GATE processor 

679 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 

680 

681[{NlpConfigPrefixes.OUTPUT}:output_lbd_diagnosis] 

682 

683{NlpOutputConfigKeys.DESTTABLE} = lewy_body_dementia_gate 

684{NlpOutputConfigKeys.NULL_LITERALS} = 

685 null 

686 "" 

687{NlpOutputConfigKeys.DESTFIELDS} = 

688 # Found by 

689 # (a) manual inspection of output from the GATE Developer console: 

690 # - e.g. {{rule=Includefin, text=Lewy body dementia}} 

691 # (b) inspection of contents: 

692 # - run a Cygwin shell 

693 # - find . -type f -exec grep cDiagnosis -l {{}} \\; 

694 # - 3 hits: 

695 # ./application-resources/jape/DiagnosisExclude2.jape 

696 # ... part of the "Lewy"-detection apparatus 

697 # ./application-resources/jape/text-feature.jape 

698 # ... adds "text" annotation to cDiagnosis Token 

699 # ./application.xgapp 

700 # ... in annotationTypes 

701 # On that basis: 

702 rule VARCHAR(100) Rule that generated the hit. 

703 text VARCHAR(200) Text that matched the rule. 

704{NlpOutputConfigKeys.INDEXDEFS} = 

705 rule 100 

706 text 200 

707 

708 

709# ----------------------------------------------------------------------------- 

710# Specimen MedEx processor definition 

711# ----------------------------------------------------------------------------- 

712# https://sbmi.uth.edu/ccb/resources/medex.htm 

713 

714[{NlpConfigPrefixes.PROCESSOR}:procdef_medex_medications] 

715 

716{ProcessorConfigKeys.DESTDB} = {destdb} 

717{ProcessorConfigKeys.DESTTABLE} = medications_medex 

718{ProcessorConfigKeys.PROGARGS} = 

719 java 

720 -classpath {{NLPPROGDIR}}:{{MEDEXDIR}}/bin:{{MEDEXDIR}}/lib/* 

721 -Dfile.encoding=UTF-8 

722 CrateMedexPipeline 

723 -lt {{NLPLOGTAG}} 

724 -v -v 

725# ... other arguments are added by the code 

726{ProcessorConfigKeys.PROGENVSECTION} = {my_env} 

727 

728 

729# ============================================================================= 

730# C. Environment variable definitions 

731# ============================================================================= 

732# - You'll need to modify this according to your local configuration. 

733 

734[{NlpConfigPrefixes.ENV}:{my_env}] 

735 

736GATE_HOME = {gate_home} 

737GATE_PHARMACOTHERAPY_DIR = {kcl_pharmacotherapy_dir} 

738GATE_PLUGIN_FILE = {gate_plugin_file} 

739KCL_LBDA_DIR = /path/to/brc-gate-LBD/Lewy_Body_Diagnosis 

740KCONNECTDIR = /path/to/yodie-pipeline-1-2-umls-only 

741MEDEXDIR = /path/to/Medex_UIMA_1.3.6 

742NLPPROGDIR = {nlp_prog_dir} 

743OS_PATHSEP = {os.pathsep} 

744 

745 

746# ============================================================================= 

747# D. Input field definitions 

748# ============================================================================= 

749 

750[{NlpConfigPrefixes.INPUT}:{if_clin_docs}] 

751 

752{InputFieldConfigKeys.SRCDB} = {my_src_db} 

753{InputFieldConfigKeys.SRCTABLE} = EXTRACTED_CLINICAL_DOCUMENTS 

754{InputFieldConfigKeys.SRCPKFIELD} = DOCUMENT_PK 

755{InputFieldConfigKeys.SRCFIELD} = DOCUMENT_TEXT 

756{InputFieldConfigKeys.SRCDATETIMEFIELD} = DOCUMENT_DATE 

757{InputFieldConfigKeys.COPYFIELDS} = 

758 {ridfield} 

759 {tridfield} 

760{InputFieldConfigKeys.INDEXED_COPYFIELDS} = 

761 {ridfield} 

762 {tridfield} 

763# {InputFieldConfigKeys.DEBUG_ROW_LIMIT} = 0 

764 

765[{NlpConfigPrefixes.INPUT}:{if_prog_notes}] 

766 

767{InputFieldConfigKeys.SRCDB} = {my_src_db} 

768{InputFieldConfigKeys.SRCTABLE} = PROGRESS_NOTES 

769{InputFieldConfigKeys.SRCPKFIELD} = PN_PK 

770{InputFieldConfigKeys.SRCFIELD} = PN_TEXT 

771{InputFieldConfigKeys.SRCDATETIMEFIELD} = PN_DATE 

772{InputFieldConfigKeys.COPYFIELDS} = 

773 {ridfield} 

774 {tridfield} 

775{InputFieldConfigKeys.INDEXED_COPYFIELDS} = 

776 {ridfield} 

777 {tridfield} 

778 

779 

780# ============================================================================= 

781# E. Database definitions, each in its own section 

782# ============================================================================= 

783 

784[{NlpConfigPrefixes.DATABASE}:{my_src_db}] 

785 

786{DatabaseConfigKeys.URL} = mysql+mysqldb://anontest:XXX@127.0.0.1:3306/anonymous_output?charset=utf8 

787 

788[{NlpConfigPrefixes.DATABASE}:{destdb}] 

789 

790{DatabaseConfigKeys.URL} = mysql+mysqldb://anontest:XXX@127.0.0.1:3306/anonymous_output?charset=utf8 

791 

792 

793# ============================================================================= 

794# F. Information for using cloud-based NLP 

795# ============================================================================= 

796 

797[{NlpConfigPrefixes.CLOUD}:{my_cloud}] 

798 

799{CloudNlpConfigKeys.CLOUD_URL} = https://your_url 

800{CloudNlpConfigKeys.USERNAME} = your_username 

801{CloudNlpConfigKeys.PASSWORD} = your_password 

802{CloudNlpConfigKeys.WAIT_ON_CONN_ERR} = {DEFAULT_CLOUD_WAIT_ON_CONN_ERR_S} 

803{CloudNlpConfigKeys.MAX_CONTENT_LENGTH} = {DEFAULT_CLOUD_MAX_CONTENT_LENGTH} 

804{CloudNlpConfigKeys.MAX_RECORDS_PER_REQUEST} = {DEFAULT_CLOUD_MAX_RECORDS_PER_REQUEST} 

805{CloudNlpConfigKeys.LIMIT_BEFORE_COMMIT} = {DEFAULT_CLOUD_LIMIT_BEFORE_COMMIT} 

806{CloudNlpConfigKeys.STOP_AT_FAILURE} = true 

807{CloudNlpConfigKeys.MAX_TRIES} = {DEFAULT_CLOUD_MAX_TRIES} 

808{CloudNlpConfigKeys.RATE_LIMIT_HZ} = {DEFAULT_CLOUD_RATE_LIMIT_HZ} 

809 

810[{NlpConfigPrefixes.PROCESSOR}:procdef_cloud_crp] 

811 

812{ProcessorConfigKeys.DESTDB} = {destdb} 

813{ProcessorConfigKeys.DESTTABLE} = crp_test 

814{ProcessorConfigKeys.PROCESSOR_NAME} = crate_anon.nlp_manager.parse_biochemistry.Crp 

815{ProcessorConfigKeys.PROCESSOR_FORMAT} = {NlpDefValues.FORMAT_STANDARD} 

816 

817""" # noqa: E501 

818 

819 

820# ============================================================================= 

821# Get config filename (from environment variable) 

822# ============================================================================= 

823 

824 

def get_nlp_config_filename_or_exit() -> str:
    """
    Returns the config filename, from our environment variable.

    If the variable is unset or empty, print an explanatory message and
    perform a hard exit (status 1).

    Returns:
        the (non-empty) config filename
    """
    # Explicit check rather than ``assert``: assertions are removed when
    # Python runs with -O, which would let an empty filename through.
    config_filename = os.environ.get(NLP_CONFIG_ENV_VAR)
    if not config_filename:
        print(
            f"You must set the {NLP_CONFIG_ENV_VAR} environment variable "
            f"to point to a CRATE NLP config file, or specify it on the "
            f"command line."
        )
        sys.exit(1)
    return config_filename

842 

843 

844# ============================================================================= 

845# Config class 

846# ============================================================================= 

847 

848 

849class NlpDefinition: 

850 """ 

851 Class representing an NLP master definition as read from config file. 

852 

853 An NLP definition represents the combination of 

854 

855 - one or more NLP processors (e.g. "CRATE's C-reactive protein finder") 

856 - one or more input fields in the source database 

857 

858 The NLP definition can therefore be used to say "run this set of NLP 

859 processors over this set of textual fields in my database". 

860 

861 See the documentation for the :ref:`NLP config file <nlp_config>`. 

862 """ 

863 

864 def __init__(self, nlpname: str, logtag: str = "") -> None: 

865 """ 

866 Read config from file. 

867 

868 Args: 

869 nlpname: config section name for this NLP definition 

870 logtag: text that may be passed to child processes to identify 

871 the NLP definition in their log output 

872 """ 

873 

874 # DELAYED IMPORTS (to make life simpler for classes deriving from 

875 # NlpParser and using NlpDefinition -- they can now do it directly, 

876 # not just via forward reference). 

877 from crate_anon.nlp_manager.all_processors import make_nlp_parser 

878 from crate_anon.nlp_manager.input_field_config import InputFieldConfig 

879 

880 self._nlpname = nlpname 

881 self._logtag = logtag 

882 

883 log.info(f"Loading config for section: {nlpname}") 

884 self._config_filename = get_nlp_config_filename_or_exit() 

885 

886 # Read config from file. 

887 self._cfg = ConfigSection( 

888 section=full_sectionname(NlpConfigPrefixes.NLPDEF, nlpname), 

889 filename=self._config_filename, 

890 case_sensitive=True, 

891 ) 

892 

893 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 

894 # Our own stuff 

895 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 

896 self._databases = {} # type: Dict[str, DatabaseHolder] 

897 self._progressdb_name = self._cfg.opt_str( 

898 NlpDefConfigKeys.PROGRESSDB, required=True 

899 ) 

900 self._progdb = self.get_database(self._progressdb_name) 

901 self._temporary_tablename = self._cfg.opt_str( 

902 NlpDefConfigKeys.TEMPORARY_TABLENAME, 

903 default=DEFAULT_TEMPORARY_TABLENAME, 

904 ) 

905 self._hashphrase = self._cfg.opt_str( 

906 NlpDefConfigKeys.HASHPHRASE, required=True 

907 ) 

908 self._hasher = HashClass(self._hashphrase) 

909 self._max_rows_before_commit = self._cfg.opt_int_positive( 

910 NlpDefConfigKeys.MAX_ROWS_BEFORE_COMMIT, 

911 AnonymiseConfigDefaults.MAX_ROWS_BEFORE_COMMIT, 

912 ) 

913 self._max_bytes_before_commit = self._cfg.opt_int_positive( 

914 NlpDefConfigKeys.MAX_BYTES_BEFORE_COMMIT, 

915 AnonymiseConfigDefaults.MAX_BYTES_BEFORE_COMMIT, 

916 ) 

917 self._now = get_now_utc_notz_datetime() 

918 self.truncate_text_at = self._cfg.opt_int_positive( 

919 NlpDefConfigKeys.TRUNCATE_TEXT_AT, default=0 

920 ) 

921 self.record_truncated_values = self._cfg.opt_bool( 

922 NlpDefConfigKeys.RECORD_TRUNCATED_VALUES, default=False 

923 ) 

924 self._cloud_config_name = self._cfg.opt_str( 

925 NlpDefConfigKeys.CLOUD_CONFIG 

926 ) 

927 self._cloud_request_data_dir = self._cfg.opt_str( 

928 NlpDefConfigKeys.CLOUD_REQUEST_DATA_DIR 

929 ) 

930 

931 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 

932 # Input field definitions 

933 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 

934 self._inputfielddefs = self._cfg.opt_strlist( 

935 NlpDefConfigKeys.INPUTFIELDDEFS, required=True, lower=False 

936 ) 

937 self._inputfieldmap = {} # type: Dict[str, InputFieldConfig] 

938 for cfg_input_name in self._inputfielddefs: 

939 if cfg_input_name in self._inputfieldmap: 

940 continue 

941 self._inputfieldmap[cfg_input_name] = InputFieldConfig( 

942 self, cfg_input_name 

943 ) 

944 

945 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 

946 # NLP processors 

947 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 

948 self._processors = [] # type: List[TableMaker] 

949 processorpairs = self._cfg.opt_strlist( 

950 NlpDefConfigKeys.PROCESSORS, required=True, lower=False 

951 ) 

952 # self._procstmp = {} 

953 try: 

954 for proctype, procname in chunks(processorpairs, 2): 

955 processor = make_nlp_parser( 

956 classname=proctype, 

957 nlpdef=self, 

958 cfg_processor_name=procname, 

959 ) 

960 self._processors.append(processor) 

961 except ValueError: 

962 log.critical(f"Bad {NlpDefConfigKeys.PROCESSORS} specification") 

963 raise 

964 

965 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 

966 # Transaction sizes, for early commit 

967 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 

968 self._transaction_limiters = ( 

969 {} 

970 ) # type: Dict[Session, TransactionSizeLimiter] 

971 # dictionary of session -> TransactionSizeLimiter 

972 

973 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 

974 # Cloud config (loaded on request, then cached) 

975 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 

976 self._cloudcfg = None # type: Optional[CloudConfig] 

977 

978 # ------------------------------------------------------------------------- 

979 # Basic info 

980 # ------------------------------------------------------------------------- 

981 

982 @property 

983 def name(self) -> str: 

984 """ 

985 Returns the name of the NLP definition. 

986 """ 

987 return self._nlpname 

988 

989 @property 

990 def logtag(self) -> str: 

991 """ 

992 Returns the log tag of the NLP definition (may be used by child 

993 processes to provide more information for logs). 

994 """ 

995 return self._logtag 

996 

997 @property 

998 def now(self) -> datetime.datetime: 

999 """ 

1000 Returns the time this NLP definition was created (in UTC). Used to 

1001 time-stamp NLP runs. 

1002 """ 

1003 return self._now 

1004 

1005 # ------------------------------------------------------------------------- 

1006 # Config file 

1007 # ------------------------------------------------------------------------- 

1008 

1009 @property 

1010 def parser(self) -> ExtendedConfigParser: 

1011 """ 

1012 Returns the 

1013 :class:`crate_anon.common.extendedconfigparser.ExtendedConfigParser` in 

1014 use. 

1015 """ 

1016 return self._cfg.parser 

1017 

1018 def get_config_section(self, section: str) -> ConfigSection: 

1019 """ 

1020 Returns a :class:`crate_anon.common.extendedconfigparser.ConfigSection` 

1021 referring to a (potentially different) section. 

1022 

1023 Args: 

1024 section: 

1025 New section name. 

1026 """ 

1027 return self._cfg.other_section(section) 

1028 

1029 def get_env_dict( 

1030 self, 

1031 env_section_name: str, 

1032 parent_env: Optional[Dict[str, str]] = None, 

1033 ) -> Dict[str, str]: 

1034 """ 

1035 Gets an operating system environment variable dictionary (``variable: 

1036 value`` mapping) from the config file. 

1037 

1038 Args: 

1039 env_section_name: config section name, without its "env:" prefix 

1040 parent_env: optional starting point (e.g. parent OS environment) 

1041 

1042 Returns: 

1043 a dictionary suitable for use as an OS environment 

1044 

1045 """ 

1046 return self._cfg.parser.get_env_dict( 

1047 full_sectionname(NlpConfigPrefixes.ENV, env_section_name), 

1048 parent_env=parent_env, 

1049 ) 

1050 

1051 def get_database( 

1052 self, 

1053 name_and_cfg_section: str, 

1054 with_session: bool = True, 

1055 with_conn: bool = False, 

1056 reflect: bool = False, 

1057 ) -> DatabaseHolder: 

1058 """ 

1059 Returns a :class:`crate_anon.anonymise.dbholder.DatabaseHolder` from 

1060 the config file, containing information abuot a database. 

1061 

1062 Args: 

1063 name_and_cfg_section: 

1064 string that is the name of the database, and also the config 

1065 file section name describing the database 

1066 with_session: create an SQLAlchemy Session? 

1067 with_conn: create an SQLAlchemy connection (via an Engine)? 

1068 reflect: read the database structure (when required)? 

1069 """ 

1070 if name_and_cfg_section in self._databases: 

1071 return self._databases[name_and_cfg_section] 

1072 dbsection = full_sectionname( 

1073 NlpConfigPrefixes.DATABASE, name_and_cfg_section 

1074 ) 

1075 assert len(name_and_cfg_section) <= MAX_SQL_FIELD_LEN 

1076 db = self.parser.get_database( 

1077 dbsection, 

1078 with_session=with_session, 

1079 with_conn=with_conn, 

1080 reflect=reflect, 

1081 ) 

1082 self._databases[name_and_cfg_section] = db 

1083 return db 

1084 

1085 # ------------------------------------------------------------------------- 

1086 # Hashing 

1087 # ------------------------------------------------------------------------- 

1088 

1089 def hash(self, text: str) -> str: 

1090 """ 

1091 Hash text via this NLP definition's hasher. The hash will be stored in 

1092 a secret progress database and to detect later changes in the source 

1093 records. 

1094 

1095 Args: 

1096 text: text (typically from the source database) to be hashed 

1097 

1098 Returns: 

1099 the hashed value 

1100 """ 

1101 return self._hasher.hash(text) 

1102 

1103 # ------------------------------------------------------------------------- 

1104 # Database 

1105 # ------------------------------------------------------------------------- 

1106 

1107 @property 

1108 def temporary_tablename(self) -> str: 

1109 """ 

1110 Temporary tablename to use. 

1111 

1112 See the documentation for the :ref:`NLP config file <nlp_config>`. 

1113 """ 

1114 return self._temporary_tablename 

1115 

1116 def set_echo(self, echo: bool) -> None: 

1117 """ 

1118 Set the SQLAlchemy ``echo`` parameter (to echo SQL) for all our 

1119 source databases. 

1120 """ 

1121 self._progdb.engine.echo = echo 

1122 for db in self._databases.values(): 

1123 db.engine.echo = echo 

1124 # Now, SQLAlchemy will mess things up by adding an additional handler. 

1125 # So, bye-bye: 

1126 for logname in ( 

1127 "sqlalchemy.engine.base.Engine", 

1128 "sqlalchemy.engine.base.OptionEngine", 

1129 ): 

1130 logger = logging.getLogger(logname) 

1131 logger.handlers = [] # ... of type: List[logging.Handler] 

1132 

1133 @property 

1134 def progressdb_session(self) -> Session: 

1135 """ 

1136 Returns an SQLAlchemy ORM :class:`Session` for the progress database. 

1137 """ 

1138 return self._progdb.session 

1139 

1140 @property 

1141 def progressdb_engine(self) -> Engine: 

1142 """ 

1143 Returns an SQLAlchemy Core :class:`Engine` for the progress database. 

1144 """ 

1145 return self._progdb.engine 

1146 

1147 @property 

1148 def progressdb_metadata(self) -> MetaData: 

1149 """ 

1150 Returns the SQLAlchemy :class:`MetaData` for the progress database. 

1151 """ 

1152 return self._progdb.metadata 

1153 

1154 @property 

1155 def progdb(self) -> DatabaseHolder: 

1156 """ 

1157 Returns the progress database. 

1158 """ 

1159 return self._progdb 

1160 

1161 def commit_all(self) -> None: 

1162 """ 

1163 Execute a COMMIT on all databases (all destination database and the 

1164 progress database). 

1165 """ 

1166 self.commit(self.progressdb_session) 

1167 for db in self._databases.values(): 

1168 self.commit(db.session) 

1169 

1170 def get_transation_limiter( 

1171 self, session: Session 

1172 ) -> TransactionSizeLimiter: 

1173 """ 

1174 Returns (or creates and returns) a transaction limiter for a given 

1175 SQLAlchemy session. 

1176 

1177 Args: 

1178 session: SQLAlchemy ORM :class:`Session` 

1179 

1180 Returns: 

1181 a :class:`crate_anon.common.sql.TransactionSizeLimiter` 

1182 

1183 """ 

1184 if session not in self._transaction_limiters: 

1185 self._transaction_limiters[session] = TransactionSizeLimiter( 

1186 session, 

1187 max_rows_before_commit=self._max_rows_before_commit, 

1188 max_bytes_before_commit=self._max_bytes_before_commit, 

1189 ) 

1190 return self._transaction_limiters[session] 

1191 

1192 def notify_transaction( 

1193 self, 

1194 session: Session, 

1195 n_rows: int, 

1196 n_bytes: int, 

1197 force_commit: bool = False, 

1198 ) -> None: 

1199 """ 

1200 Tell our transaction limiter about a transaction that's occurred on 

1201 one of our databases. This may trigger a COMMIT. 

1202 

1203 Args: 

1204 session: SQLAlchemy ORM :class:`Session` that was used 

1205 n_rows: number of rows inserted 

1206 n_bytes: number of bytes inserted 

1207 force_commit: force a COMMIT? 

1208 """ 

1209 tl = self.get_transation_limiter(session) 

1210 tl.notify(n_rows=n_rows, n_bytes=n_bytes, force_commit=force_commit) 

1211 

1212 def commit(self, session: Session) -> None: 

1213 """ 

1214 Executes a COMMIT on a specific session. 

1215 

1216 Args: 

1217 session: SQLAlchemy ORM :class:`Session` 

1218 """ 

1219 tl = self.get_transation_limiter(session) 

1220 tl.commit() 

1221 

1222 # ------------------------------------------------------------------------- 

1223 # Input fields 

1224 # ------------------------------------------------------------------------- 

1225 

1226 @property 

1227 def inputfieldconfigs(self) -> Iterable["InputFieldConfig"]: 

1228 """ 

1229 Returns all input field configurations used by this NLP definition. 

1230 

1231 Returns: 

1232 list of 

1233 `crate_anon.nlp_manager.input_field_config.InputFieldConfig` 

1234 objects 

1235 

1236 """ 

1237 return self._inputfieldmap.values() 

1238 

1239 # ------------------------------------------------------------------------- 

1240 # NLP processors 

1241 # ------------------------------------------------------------------------- 

1242 

1243 @property 

1244 def processors(self) -> List["TableMaker"]: 

1245 """ 

1246 Returns all NLP processors used by this NLP definition. 

1247 

1248 Returns: 

1249 list of objects derived from 

1250 :class:`crate_anon.nlp_manager.base_nlp_parser.BaseNlpParser` 

1251 

1252 """ 

1253 return self._processors 

1254 

1255 @property 

1256 def noncloud_processors(self) -> List["BaseNlpParser"]: 

1257 """ 

1258 Returns all local (non-cloud) NLP processors used by this NLP 

1259 definition. 

1260 

1261 Returns: 

1262 list of objects derived from 

1263 :class:`crate_anon.nlp_manager.base_nlp_parser.BaseNlpParser` 

1264 

1265 """ 

1266 return [ 

1267 x for x in self._processors if not x.is_cloud_processor() 

1268 ] # type: List["BaseNlpParser"] 

1269 

1270 @property 

1271 def uses_cloud_processors(self) -> bool: 

1272 """ 

1273 Are any of our processors cloud-based? 

1274 """ 

1275 return any(x.is_cloud_processor() for x in self._processors) 

1276 

1277 # ------------------------------------------------------------------------- 

1278 # NLPRP info 

1279 # ------------------------------------------------------------------------- 

1280 

1281 def nlprp_local_processors( 

1282 self, sql_dialect: str = None 

1283 ) -> Dict[str, Any]: 

1284 """ 

1285 Returns a draft list of processors as per the NLPRP 

1286 :ref:`list_processors <nlprp_list_processors>` command. 

1287 """ 

1288 processors = [] # type: List[Dict, str, Any] 

1289 for proc in self.noncloud_processors: 

1290 processors.append(proc.nlprp_processor_info(sql_dialect)) 

1291 return {NlprpKeys.PROCESSORS: processors} 

1292 

1293 def nlprp_local_processors_json( 

1294 self, indent: int = 4, sort_keys: bool = True, sql_dialect: str = None 

1295 ) -> str: 

1296 """ 

1297 Returns a formatted JSON string from :func:`nlprp_list_processors`. 

1298 This is primarily for debugging. 

1299 

1300 Args: 

1301 indent: number of spaces for indentation 

1302 sort_keys: sort keys? 

1303 sql_dialect: preferred SQL dialect for ``tabular_schema``, or 

1304 ``None`` for default 

1305 """ 

1306 json_structure = self.nlprp_local_processors(sql_dialect=sql_dialect) 

1307 return json.dumps(json_structure, indent=indent, sort_keys=sort_keys) 

1308 

1309 # ------------------------------------------------------------------------- 

1310 # Cloud NLP 

1311 # ------------------------------------------------------------------------- 

1312 

1313 def get_cloud_config(self) -> Optional[CloudConfig]: 

1314 """ 

1315 Returns the :class:`crate_anon.nlp_manager.cloud_config.CloudConfig` 

1316 object associated with this NLP definition, or ``None`` if there isn't 

1317 one. 

1318 """ 

1319 our_name = self.name 

1320 if self._cloudcfg is None: 

1321 if not self._cloud_config_name: 

1322 raise ValueError( 

1323 f"No {NlpDefConfigKeys.CLOUD_CONFIG!r} parameter " 

1324 f"specified for NLP definition {our_name!r}" 

1325 ) 

1326 if not self._cloud_request_data_dir: 

1327 raise ValueError( 

1328 f"No {NlpDefConfigKeys.CLOUD_REQUEST_DATA_DIR!r} " 

1329 f"parameter specified for NLP definition {our_name!r}" 

1330 ) 

1331 req_root_dir = os.path.abspath(self._cloud_request_data_dir) 

1332 if not os.path.isdir(req_root_dir): 

1333 raise ValueError( 

1334 f"Directory {req_root_dir!r}, specified by config " 

1335 f"parameter {NlpDefConfigKeys.CLOUD_REQUEST_DATA_DIR!r} " 

1336 f"for NLP definition {our_name!r}" 

1337 ) 

1338 req_data_dir = os.path.join(req_root_dir, our_name) 

1339 os.makedirs(req_data_dir, exist_ok=True) 

1340 self._cloudcfg = CloudConfig( 

1341 self, name=self._cloud_config_name, req_data_dir=req_data_dir 

1342 ) 

1343 return self._cloudcfg 

1344 

1345 def get_cloud_config_or_raise(self) -> CloudConfig: 

1346 """ 

1347 Returns the :class:`crate_anon.nlp_manager.cloud_config.CloudConfig` 

1348 object associated with this NLP definition, or raise :exc:`ValueError` 

1349 if there isn't one. 

1350 """ 

1351 cloudcfg = self.get_cloud_config() 

1352 if cloudcfg is None: 

1353 raise ValueError( 

1354 f"No cloud NLP configuration for NLP definition " 

1355 f"{self.name!r}" 

1356 ) 

1357 if not cloudcfg.remote_processors: 

1358 raise ValueError( 

1359 f"No remote (cloud) processors configured for " 

1360 f"NLP definition {self.name!r}" 

1361 ) 

1362 return cloudcfg