Coverage for nlp_manager/nlp_definition.py: 61%
233 statements
« prev ^ index » next coverage.py v7.8.0, created at 2026-01-06 10:22 -0600
« prev ^ index » next coverage.py v7.8.0, created at 2026-01-06 10:22 -0600
1"""
2crate_anon/nlp_manager/nlp_definition.py
4===============================================================================
6 Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
7 Created by Rudolf Cardinal (rnc1001@cam.ac.uk).
9 This file is part of CRATE.
11 CRATE is free software: you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation, either version 3 of the License, or
14 (at your option) any later version.
16 CRATE is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
21 You should have received a copy of the GNU General Public License
22 along with CRATE. If not, see <https://www.gnu.org/licenses/>.
24===============================================================================
26**NLP definition class.**
28"""
30# =============================================================================
31# Imports
32# =============================================================================
34import datetime
35import json
36import logging
37import os
38import sys
39from typing import (
40 Any,
41 Dict,
42 Iterable,
43 List,
44 Optional,
45 Tuple,
46 Type,
47 TYPE_CHECKING,
48)
50from cardinal_pythonlib.datetimefunc import get_now_utc_notz_datetime
51from cardinal_pythonlib.docker import running_under_docker
52from cardinal_pythonlib.lists import chunks
53from sqlalchemy.engine.base import Engine
54from sqlalchemy.orm.session import Session
55from sqlalchemy.schema import MetaData
57from crate_anon.anonymise.constants import AnonymiseConfigDefaults
58from crate_anon.anonymise.dbholder import DatabaseHolder
59from crate_anon.common.constants import EnvVar
60from crate_anon.common.extendedconfigparser import (
61 ConfigSection,
62 ExtendedConfigParser,
63)
64from crate_anon.common.sql import TransactionSizeLimiter
65from crate_anon.nlp_manager.cloud_config import CloudConfig
66from crate_anon.nlp_manager.constants import (
67 CloudNlpConfigKeys,
68 DatabaseConfigKeys,
69 DEFAULT_CLOUD_LIMIT_BEFORE_COMMIT,
70 DEFAULT_CLOUD_MAX_CONTENT_LENGTH,
71 DEFAULT_CLOUD_MAX_RECORDS_PER_REQUEST,
72 DEFAULT_CLOUD_MAX_TRIES,
73 DEFAULT_CLOUD_RATE_LIMIT_HZ,
74 DEFAULT_CLOUD_WAIT_ON_CONN_ERR_S,
75 DEFAULT_TEMPORARY_TABLENAME,
76 full_sectionname,
77 GATE_PIPELINE_CLASSNAME,
78 NlpOutputConfigKeys,
79 HashClass,
80 InputFieldConfigKeys,
81 MAX_SQL_FIELD_LEN,
82 NLP_CONFIG_ENV_VAR,
83 NlpConfigPrefixes,
84 NlpDefConfigKeys,
85 ProcessorConfigKeys,
86 NlpDefValues,
87)
88from crate_anon.nlprp.constants import NlprpKeys
89from crate_anon.version import CRATE_VERSION, CRATE_VERSION_DATE
91if TYPE_CHECKING:
92 from crate_anon.nlp_manager.base_nlp_parser import (
93 BaseNlpParser,
94 TableMaker,
95 )
96 from crate_anon.nlp_manager.input_field_config import InputFieldConfig
98log = logging.getLogger(__name__)
101# =============================================================================
102# Demo config
103# =============================================================================
106def demo_nlp_config() -> str:
107 """
108 Returns a demo NLP config file for CRATE.
109 """
    # Overview: gathers the built-in parser/validator lists, formats them into
    # config fragments, chooses filesystem paths according to the runtime
    # environment, and interpolates everything into one large f-string that
    # is returned as the demo config text (sections A-F below).
110 # -------------------------------------------------------------------------
111 # Imports
112 # -------------------------------------------------------------------------
114 from crate_anon.nlp_manager.parse_biochemistry import (
115 ALL_BIOCHEMISTRY_NLP_AND_VALIDATORS,
116 ) # delayed import
117 from crate_anon.nlp_manager.parse_clinical import (
118 ALL_CLINICAL_NLP_AND_VALIDATORS,
119 ) # delayed import
120 from crate_anon.nlp_manager.parse_cognitive import (
121 ALL_COGNITIVE_NLP_AND_VALIDATORS,
122 ) # delayed import
123 from crate_anon.nlp_manager.parse_haematology import (
124 ALL_HAEMATOLOGY_NLP_AND_VALIDATORS,
125 ) # delayed import
126 from crate_anon.nlp_manager.parse_substance_misuse import (
127 ALL_SUBSTANCE_MISUSE_NLP_AND_VALIDATORS,
128 ) # delayed import
130 # -------------------------------------------------------------------------
131 # Helper functions
132 # -------------------------------------------------------------------------
    # Returns two processor sections for one parser name: "procdef_<name>"
    # and "procdef_validate_<name>". Reads "destdb" from the enclosing scope;
    # that variable is assigned further down, before any helper is called.
134 def _make_procdef_pair(name: str) -> str:
135 return f"""[{NlpConfigPrefixes.PROCESSOR}:procdef_{name}]
136{ProcessorConfigKeys.DESTDB} = {destdb}
137{ProcessorConfigKeys.DESTTABLE} = {name}
138[{NlpConfigPrefixes.PROCESSOR}:procdef_validate_{name}]
139{ProcessorConfigKeys.DESTDB} = {destdb}
140{ProcessorConfigKeys.DESTTABLE} = validate_{name}"""
    # Joins (separated by a blank line) one procdef pair per
    # (parser, validator) tuple. Only the parser class's lower-cased
    # classname() is used here; the validator class itself is not consulted.
142 def _make_module_procdef_block(
143 nlp_and_validators: List[
144 Tuple[Type["BaseNlpParser"], Type["BaseNlpParser"]]
145 ]
146 ) -> str:
147 _procdeflist = [] # type: List[str]
148 for nlpclass, validatorclass in nlp_and_validators:
149 _procdeflist.append(
150 _make_procdef_pair(nlpclass.classname().lower())
151 )
152 return "\n\n".join(_procdeflist)
    # Builds the lines for a "processors" setting: for each tuple, a
    # "<ParserClass> procdef_<name>" line followed by a
    # "<ValidatorClass> procdef_validate_<name>" line, newline-joined.
154 def _make_proclist(
155 nlp_and_validators: List[
156 Tuple[Type["BaseNlpParser"], Type["BaseNlpParser"]]
157 ]
158 ) -> str:
159 _proclist = [] # type: List[str]
160 for nlpclass, validatorclass in nlp_and_validators:
161 _name = nlpclass.classname().lower()
162 _proclist.append(
163 f" {nlpclass.classname()} procdef_{_name}\n"
164 f" {validatorclass.classname()} procdef_validate_{_name}"
165 )
166 return "\n".join(_proclist)
168 # -------------------------------------------------------------------------
169 # Quasi-constants
170 # -------------------------------------------------------------------------
    # Selects between the Docker and non-Docker path defaults chosen below,
    # and is also reported in the header of the generated text.
172 for_docker = running_under_docker()
174 destdb = "DESTINATION_DATABASE"
175 hashphrase = "doesnotmatter"
176 if_clin_docs = "INPUT_FIELD_CLINICAL_DOCUMENTS"
177 if_prog_notes = "INPUT_FIELD_PROGRESS_NOTES"
178 inputfields = f"{if_clin_docs}\n {if_prog_notes}"
179 truncate_text_at = "32766"
180 my_env = "MY_ENV_SECTION"
181 my_src_db = "SOURCE_DATABASE"
182 my_cloud = "my_uk_cloud_service"
183 ridfield = "RID_FIELD"
184 tridfield = "TRID_FIELD"
185 nlp_input_terminator = "END_OF_TEXT_FOR_NLP"
186 nlp_output_terminator = "END_OF_NLP_OUTPUT_RECORD"
188 procdefs_biochemistry = _make_module_procdef_block(
189 ALL_BIOCHEMISTRY_NLP_AND_VALIDATORS
190 )
191 procdefs_clinical = _make_module_procdef_block(
192 ALL_CLINICAL_NLP_AND_VALIDATORS
193 )
194 procdefs_cognitive = _make_module_procdef_block(
195 ALL_COGNITIVE_NLP_AND_VALIDATORS
196 )
197 procdefs_haematology = _make_module_procdef_block(
198 ALL_HAEMATOLOGY_NLP_AND_VALIDATORS
199 )
200 procdefs_substance_misuse = _make_module_procdef_block(
201 ALL_SUBSTANCE_MISUSE_NLP_AND_VALIDATORS
202 )
204 proclist_biochemistry = _make_proclist(ALL_BIOCHEMISTRY_NLP_AND_VALIDATORS)
205 proclist_clinical = _make_proclist(ALL_CLINICAL_NLP_AND_VALIDATORS)
206 proclist_cognitive = _make_proclist(ALL_COGNITIVE_NLP_AND_VALIDATORS)
207 proclist_haematology = _make_proclist(ALL_HAEMATOLOGY_NLP_AND_VALIDATORS)
208 proclist_substance_misuse = _make_proclist(
209 ALL_SUBSTANCE_MISUSE_NLP_AND_VALIDATORS
210 )
    # When the docs-generation environment variable is present, use a fixed
    # placeholder path instead of one derived from this file's location
    # (presumably to keep generated documentation machine-independent).
212 if EnvVar.GENERATING_CRATE_DOCS in os.environ:
213 nlp_prog_dir = "/path/to/crate_anon/nlp_manager/compiled_nlp_classes"
214 else:
215 this_dir = os.path.abspath(
216 os.path.dirname(__file__)
217 ) # crate_anon/nlp_manager
218 nlp_prog_dir = os.path.join(this_dir, "compiled_nlp_classes")
220 if for_docker:
221 # See crate.Dockerfile
222 gate_home = "/crate/gate"
223 kcl_pharmacotherapy_dir = "/crate/brc-gate-pharmacotherapy"
224 cloud_request_data_dir = "/crate/tmp/clouddata"
225 gate_plugin_file = (
226 "/crate/src/crate_anon/nlp_manager/specimen_gate_plugin_file.ini"
227 )
228 else:
229 gate_home = "/path/to/GATE_Developer_9.0.1"
230 kcl_pharmacotherapy_dir = "/path/to/brc-gate-pharmacotherapy"
231 cloud_request_data_dir = "/srv/crate/clouddata"
232 gate_plugin_file = "/path/to/specimen_gate_plugin_file.ini"
    # Short alias, used for the commit-limit defaults in the text below.
234 _DA = AnonymiseConfigDefaults
236 # -------------------------------------------------------------------------
237 # The demo config itself
238 # -------------------------------------------------------------------------
    # In the f-string below, doubled braces ({{NAME}}) emit a literal {NAME}
    # into the config text rather than interpolating a Python value; most of
    # those names match keys defined in the environment section (section C).
240 # noinspection HttpUrlsUsage
241 return f"""# Configuration file for CRATE NLP manager (crate_nlp).
242# Version {CRATE_VERSION} ({CRATE_VERSION_DATE}).
243#
244# PLEASE SEE THE HELP at https://crateanon.readthedocs.io/
245# Using defaults for Docker environment: {for_docker}
247# =============================================================================
248# A. Individual NLP definitions
249# =============================================================================
250# - referred to by the NLP manager's command-line arguments
251# - You are likely to need to alter these (particularly the bits in capital
252# letters) to refer to your own database(s).
254# -----------------------------------------------------------------------------
255# GATE people-and-places demo
256# -----------------------------------------------------------------------------
258[{NlpConfigPrefixes.NLPDEF}:gate_name_location_demo]
260{NlpDefConfigKeys.INPUTFIELDDEFS} =
261 {inputfields}
262{NlpDefConfigKeys.PROCESSORS} =
263 GATE procdef_gate_name_location
264{NlpDefConfigKeys.PROGRESSDB} = {destdb}
265{NlpDefConfigKeys.HASHPHRASE} = {hashphrase}
268# -----------------------------------------------------------------------------
269# KConnect (Bio-YODIE) disease-finding GATE app
270# -----------------------------------------------------------------------------
272[{NlpConfigPrefixes.NLPDEF}:gate_kconnect_diseases]
274{NlpDefConfigKeys.INPUTFIELDDEFS} =
275 {inputfields}
276{NlpDefConfigKeys.PROCESSORS} =
277 GATE procdef_gate_kconnect
278{NlpDefConfigKeys.PROGRESSDB} = {destdb}
279{NlpDefConfigKeys.HASHPHRASE} = {hashphrase}
282# -----------------------------------------------------------------------------
283# KCL Lewy body dementia GATE app
284# -----------------------------------------------------------------------------
286[{NlpConfigPrefixes.NLPDEF}:gate_kcl_lbd]
288{NlpDefConfigKeys.INPUTFIELDDEFS} =
289 {inputfields}
290{NlpDefConfigKeys.PROCESSORS} =
291 GATE procdef_gate_kcl_lbda
292{NlpDefConfigKeys.PROGRESSDB} = {destdb}
293{NlpDefConfigKeys.HASHPHRASE} = {hashphrase}
296# -----------------------------------------------------------------------------
297# KCL pharmacotherapy GATE app
298# -----------------------------------------------------------------------------
300[{NlpConfigPrefixes.NLPDEF}:gate_kcl_pharmacotherapy]
302{NlpDefConfigKeys.INPUTFIELDDEFS} =
303 {inputfields}
304{NlpDefConfigKeys.PROCESSORS} =
305 GATE procdef_gate_pharmacotherapy
306{NlpDefConfigKeys.PROGRESSDB} = {destdb}
307{NlpDefConfigKeys.HASHPHRASE} = {hashphrase}
310# -----------------------------------------------------------------------------
311# Medex-UIMA medication-finding app
312# -----------------------------------------------------------------------------
314[{NlpConfigPrefixes.NLPDEF}:medex_medications]
316{NlpDefConfigKeys.INPUTFIELDDEFS} =
317 {inputfields}
318{NlpDefConfigKeys.PROCESSORS} =
319 Medex procdef_medex_medications
320{NlpDefConfigKeys.PROGRESSDB} = {destdb}
321{NlpDefConfigKeys.HASHPHRASE} = {hashphrase}
324# -----------------------------------------------------------------------------
325# CRATE number-finding Python regexes
326# -----------------------------------------------------------------------------
328[{NlpConfigPrefixes.NLPDEF}:crate_biomarkers]
330{NlpDefConfigKeys.INPUTFIELDDEFS} =
331 {inputfields}
332{NlpDefConfigKeys.PROCESSORS} =
333 # -------------------------------------------------------------------------
334 # Biochemistry
335 # -------------------------------------------------------------------------
336{proclist_biochemistry}
337 # -------------------------------------------------------------------------
338 # Clinical
339 # -------------------------------------------------------------------------
340{proclist_clinical}
341 # -------------------------------------------------------------------------
342 # Cognitive
343 # -------------------------------------------------------------------------
344{proclist_cognitive}
345 # -------------------------------------------------------------------------
346 # Haematology
347 # -------------------------------------------------------------------------
348{proclist_haematology}
349 # -------------------------------------------------------------------------
350 # Substance misuse
351 # -------------------------------------------------------------------------
352{proclist_substance_misuse}
354{NlpDefConfigKeys.PROGRESSDB} = {destdb}
355{NlpDefConfigKeys.HASHPHRASE} = {hashphrase}
356# {NlpDefConfigKeys.TRUNCATE_TEXT_AT} = {truncate_text_at}
357# {NlpDefConfigKeys.RECORD_TRUNCATED_VALUES} = False
358{NlpDefConfigKeys.MAX_ROWS_BEFORE_COMMIT} = {_DA.MAX_ROWS_BEFORE_COMMIT}
359{NlpDefConfigKeys.MAX_BYTES_BEFORE_COMMIT} = {_DA.MAX_BYTES_BEFORE_COMMIT}
361# -----------------------------------------------------------------------------
362# Cloud NLP demo
363# -----------------------------------------------------------------------------
365[{NlpConfigPrefixes.NLPDEF}:cloud_nlp_demo]
367{NlpDefConfigKeys.INPUTFIELDDEFS} =
368 {inputfields}
369{NlpDefConfigKeys.PROCESSORS} =
370 Cloud procdef_cloud_crp
371{NlpDefConfigKeys.PROGRESSDB} = {destdb}
372{NlpDefConfigKeys.HASHPHRASE} = {hashphrase}
373{NlpDefConfigKeys.CLOUD_CONFIG} = {my_cloud}
374{NlpDefConfigKeys.CLOUD_REQUEST_DATA_DIR} = {cloud_request_data_dir}
377# =============================================================================
378# B. NLP processor definitions
379# =============================================================================
380# - You're likely to have to modify the destination databases these point to,
381# but otherwise you can probably leave them as they are.
383# -----------------------------------------------------------------------------
384# Specimen CRATE regular expression processor definitions
385# -----------------------------------------------------------------------------
387 # Most of these are very simple, and just require a destination database
388 # (as a cross-reference to a database section within this file) and a
389 # destination table.
391 # Biochemistry
393{procdefs_biochemistry}
395 # Clinical
397{procdefs_clinical}
399 # Cognitive
401{procdefs_cognitive}
403 # Haematology
405{procdefs_haematology}
407 # Substance misuse
409{procdefs_substance_misuse}
411# -----------------------------------------------------------------------------
412# Specimen GATE demo people/places processor definition
413# -----------------------------------------------------------------------------
415 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
416 # Define the processor
417 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
419[{NlpConfigPrefixes.PROCESSOR}:procdef_gate_name_location]
421{ProcessorConfigKeys.DESTDB} = {destdb}
422{ProcessorConfigKeys.OUTPUTTYPEMAP} =
423 Person output_person
424 Location output_location
425{ProcessorConfigKeys.PROGARGS} =
426 java
427 -classpath "{{NLPPROGDIR}}"{{OS_PATHSEP}}"{{GATE_HOME}}/lib/*"
428 -Dgate.home="{{GATE_HOME}}"
429 {GATE_PIPELINE_CLASSNAME}
430 --gate_app "{{GATE_HOME}}/plugins/ANNIE/ANNIE_with_defaults.gapp"
431 --pluginfile "{{GATE_PLUGIN_FILE}}"
432 --annotation Person
433 --annotation Location
434 --input_terminator {nlp_input_terminator}
435 --output_terminator {nlp_output_terminator}
436 --log_tag {{NLPLOGTAG}}
437 --verbose
438{ProcessorConfigKeys.PROGENVSECTION} = {my_env}
439{ProcessorConfigKeys.INPUT_TERMINATOR} = {nlp_input_terminator}
440{ProcessorConfigKeys.OUTPUT_TERMINATOR} = {nlp_output_terminator}
441# {ProcessorConfigKeys.MAX_EXTERNAL_PROG_USES} = 1000
443 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
444 # Define the output tables used by this GATE processor
445 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
447[{NlpConfigPrefixes.OUTPUT}:output_person]
449{NlpOutputConfigKeys.DESTTABLE} = person
450{NlpOutputConfigKeys.RENAMES} =
451 firstName firstname
452{NlpOutputConfigKeys.DESTFIELDS} =
453 rule VARCHAR(100) Rule used to find this person (e.g. TitleFirstName, PersonFull)
454 firstname VARCHAR(100) First name
455 surname VARCHAR(100) Surname
456 gender VARCHAR(7) Gender (e.g. male, female, unknown)
457 kind VARCHAR(100) Kind of name (e.g. personName, fullName)
458 # ... longest gender: "unknown" (7)
459{NlpOutputConfigKeys.INDEXDEFS} =
460 firstname 64
461 surname 64
463[{NlpConfigPrefixes.OUTPUT}:output_location]
465{NlpOutputConfigKeys.DESTTABLE} = location
466{NlpOutputConfigKeys.RENAMES} =
467 locType loctype
468{NlpOutputConfigKeys.DESTFIELDS} =
469 rule VARCHAR(100) Rule used (e.g. Location1)
470 loctype VARCHAR(100) Location type (e.g. city)
471{NlpOutputConfigKeys.INDEXDEFS} =
472 rule 100
473 loctype 100
476# -----------------------------------------------------------------------------
477# Specimen Sheffield/KCL KConnect (Bio-YODIE) processor definition
478# -----------------------------------------------------------------------------
479# https://gate.ac.uk/applications/bio-yodie.html
481 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
482 # Define the processor
483 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
485[{NlpConfigPrefixes.PROCESSOR}:procdef_gate_kconnect]
487{ProcessorConfigKeys.DESTDB} = {destdb}
488{ProcessorConfigKeys.OUTPUTTYPEMAP} =
489 Disease_or_Syndrome output_disease_or_syndrome
490{ProcessorConfigKeys.PROGARGS} =
491 java
492 -classpath "{{NLPPROGDIR}}"{{OS_PATHSEP}}"{{GATE_HOME}}/lib/*"
493 -Dgate.home="{{GATE_HOME}}"
494 {GATE_PIPELINE_CLASSNAME}
495 --gate_app "{{KCONNECTDIR}}/main-bio/main-bio.xgapp"
496 --pluginfile "{{GATE_PLUGIN_FILE}}"
497 --annotation Disease_or_Syndrome
498 --input_terminator {nlp_input_terminator}
499 --output_terminator {nlp_output_terminator}
500 --log_tag {{NLPLOGTAG}}
501 --suppress_gate_stdout
502 --verbose
503{ProcessorConfigKeys.PROGENVSECTION} = {my_env}
504{ProcessorConfigKeys.INPUT_TERMINATOR} = {nlp_input_terminator}
505{ProcessorConfigKeys.OUTPUT_TERMINATOR} = {nlp_output_terminator}
506# {ProcessorConfigKeys.MAX_EXTERNAL_PROG_USES} = 1000
508 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
509 # Define the output tables used by this GATE processor
510 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
512[{NlpConfigPrefixes.OUTPUT}:output_disease_or_syndrome]
514{NlpOutputConfigKeys.DESTTABLE} = kconnect_diseases
515{NlpOutputConfigKeys.RENAMES} =
516 Experiencer experiencer
517 Negation negation
518 PREF pref
519 STY sty
520 TUI tui
521 Temporality temporality
522 VOCABS vocabs
523{NlpOutputConfigKeys.DESTFIELDS} =
524 # Found by manual inspection of KConnect/Bio-YODIE output from the GATE console:
525 experiencer VARCHAR(100) Who experienced it; e.g. "Patient", "Other"
526 negation VARCHAR(100) Was it negated or not; e.g. "Affirmed", "Negated"
527 pref VARCHAR(100) PREFferred name; e.g. "Rheumatic gout"
528 sty VARCHAR(100) Semantic Type (STY) [semantic type name]; e.g. "Disease or Syndrome"
529 tui VARCHAR(4) Type Unique Identifier (TUI) [semantic type identifier]; 4 characters; https://www.ncbi.nlm.nih.gov/books/NBK9679/; e.g. "T047"
530 temporality VARCHAR(100) Occurrence in time; e.g. "Recent", "historical", "hypothetical"
531 vocabs VARCHAR(255) List of UMLS vocabularies; e.g. "AIR,MSH,NDFRT,MEDLINEPLUS,NCI,LNC,NCI_FDA,NCI,MTH,AIR,ICD9CM,LNC,SNOMEDCT_US,LCH_NW,HPO,SNOMEDCT_US,ICD9CM,SNOMEDCT_US,COSTAR,CST,DXP,QMR,OMIM,OMIM,AOD,CSP,NCI_NCI-GLOSS,CHV"
532 inst VARCHAR(8) Looks like a Concept Unique Identifier (CUI); 1 letter then 7 digits; e.g. "C0003873"
533 inst_full VARCHAR(255) Looks like a URL to a CUI; e.g. "http://linkedlifedata.com/resource/umls/id/C0003873"
534 language VARCHAR(100) Language; e.g. ""; ?will look like "ENG" for English? See https://www.nlm.nih.gov/research/umls/implementation_resources/query_diagrams/er1.html
535 tui_full VARCHAR(255) TUI (?); e.g. "http://linkedlifedata.com/resource/semanticnetwork/id/T047"
536{NlpOutputConfigKeys.INDEXDEFS} =
537 pref 100
538 sty 100
539 tui 4
540 inst 8
543# -----------------------------------------------------------------------------
544# Specimen KCL GATE pharmacotherapy processor definition
545# -----------------------------------------------------------------------------
546# https://github.com/KHP-Informatics/brc-gate-pharmacotherapy
548 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
549 # Define the processor
550 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
552[{NlpConfigPrefixes.PROCESSOR}:procdef_gate_pharmacotherapy]
554{ProcessorConfigKeys.DESTDB} = {destdb}
555{ProcessorConfigKeys.OUTPUTTYPEMAP} =
556 Prescription output_prescription
557{ProcessorConfigKeys.PROGARGS} =
558 java
559 -classpath "{{NLPPROGDIR}}"{{OS_PATHSEP}}"{{GATE_HOME}}/lib/*"
560 -Dgate.home="{{GATE_HOME}}"
561 {GATE_PIPELINE_CLASSNAME}
562 --gate_app "{{GATE_PHARMACOTHERAPY_DIR}}/application.xgapp"
563 --pluginfile "{{GATE_PLUGIN_FILE}}"
564 --include_set Output
565 --annotation Prescription
566 --input_terminator {nlp_input_terminator}
567 --output_terminator {nlp_output_terminator}
568 --log_tag {{NLPLOGTAG}}
569 --suppress_gate_stdout
570 --show_contents_on_crash
571{ProcessorConfigKeys.PROGENVSECTION} = {my_env}
572{ProcessorConfigKeys.INPUT_TERMINATOR} = {nlp_input_terminator}
573{ProcessorConfigKeys.OUTPUT_TERMINATOR} = {nlp_output_terminator}
574# {ProcessorConfigKeys.MAX_EXTERNAL_PROG_USES} = 1000
576 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
577 # Define the output tables used by this GATE processor
578 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
580[{NlpConfigPrefixes.OUTPUT}:output_prescription]
582{NlpOutputConfigKeys.DESTTABLE} = medications_gate
583{NlpOutputConfigKeys.RENAMES} =
584 drug-type drug_type
585 dose-value dose_value
586 dose-unit dose_unit
587 dose-multiple dose_multiple
588 Directionality directionality
589 Experiencer experiencer
590 "Length of Time" length_of_time
591 Temporality temporality
592 "Unit of Time" unit_of_time
593{NlpOutputConfigKeys.NULL_LITERALS} =
594 null
595 ""
596{NlpOutputConfigKeys.DESTFIELDS} =
597 # Found by (a) manual inspection of BRC GATE pharmacotherapy output from
598 # the GATE console; (b) inspection of
599 # application-resources/schemas/Prescription.xml
600 # Note preference for DECIMAL over FLOAT/REAL; see
601 # https://stackoverflow.com/questions/1056323
602 # Note that not all annotations appear for all texts. Try e.g.:
603 # Please start haloperidol 5mg tds.
604 # I suggest you start haloperidol 5mg tds for one week.
605 rule VARCHAR(100) Rule yielding this drug. Not in XML but is present in a subset: e.g. "weanOff"; max length unclear
606 drug VARCHAR(200) Drug name. Required string; e.g. "haloperidol"; max length 47 from "wc -L BNF_generic.lst", 134 from BNF_trade.lst
607 drug_type VARCHAR(100) Type of drug name. Required string; from "drug-type"; e.g. "BNF_generic"; ?length of longest drug ".lst" filename
608 dose VARCHAR(100) Dose text. Required string; e.g. "5mg"; max length unclear
609 dose_value DECIMAL Numerical dose value. Required numeric; from "dose-value"; "double" in the XML but DECIMAL probably better; e.g. 5.0
610 dose_unit VARCHAR(100) Text of dose units. Required string; from "dose-unit"; e.g. "mg"; max length unclear
611 dose_multiple INT Dose count (multiple). Required integer; from "dose-multiple"; e.g. 1
612 route VARCHAR(7) Route of administration. Required string; one of: "oral", "im", "iv", "rectal", "sc", "dermal", "unknown"
613 status VARCHAR(10) Change in drug status. Required; one of: "start", "continuing", "stop"
614 tense VARCHAR(7) Tense in which drug is referred to. Required; one of: "past", "present"
615 date VARCHAR(100) ?. Optional string; max length unclear
616 directionality VARCHAR(100) ?. Optional string; max length unclear
617 experiencer VARCHAR(100) Person experiencing the drug-related event. Optional string; e.g. "Patient"
618 frequency DECIMAL Frequency (times per <time_unit>). Optional numeric; "double" in the XML but DECIMAL probably better
619 interval DECIMAL The n in "every n <time_unit>s" (1 for "every <time_unit>"). Optional numeric; "double" in the XML but DECIMAL probably better
620 length_of_time VARCHAR(100) ?. Optional string; from "Length of Time"; max length unclear
621 temporality VARCHAR(100) ?. Optional string; e.g. "Recent", "Historical"
622 time_unit VARCHAR(100) Unit of time (see frequency, interval). Optional string; from "time-unit"; e.g. "day"; max length unclear
623 unit_of_time VARCHAR(100) ?. Optional string; from "Unit of Time"; max length unclear
624 when VARCHAR(100) ?. Optional string; max length unclear
625{NlpOutputConfigKeys.INDEXDEFS} =
626 rule 100
627 drug 200
628 route 7
629 status 10
630 tense 7
633# -----------------------------------------------------------------------------
634# Specimen KCL Lewy Body Diagnosis Application (LBDA) processor definition
635# -----------------------------------------------------------------------------
636# https://github.com/KHP-Informatics/brc-gate-LBD
638 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
639 # Define the processor
640 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
642[{NlpConfigPrefixes.PROCESSOR}:procdef_gate_kcl_lbda]
644 # "cDiagnosis" is the "confirmed diagnosis" field, as d/w Jyoti Jyoti
645 # 2018-03-20; see also README.md. This appears in the "Automatic" and the
646 # unnamed set. There is also a near-miss one, "DiagnosisAlmost", which
647 # appears in the unnamed set.
648 # "Mr Jones has Lewy body dementia."
649 # -> DiagnosisAlmost
650 # "Mr Jones has a diagnosis of Lewy body dementia."
651 # -> DiagnosisAlmost, cDiagnosis
652 # Note that we must use lower case in the outputtypemap.
654{ProcessorConfigKeys.DESTDB} = {destdb}
655{ProcessorConfigKeys.OUTPUTTYPEMAP} =
656 cDiagnosis output_lbd_diagnosis
657 DiagnosisAlmost output_lbd_diagnosis
658{ProcessorConfigKeys.PROGARGS} =
659 java
660 -classpath "{{NLPPROGDIR}}"{{OS_PATHSEP}}"{{GATE_HOME}}/lib/*"
661 -Dgate.home="{{GATE_HOME}}"
662 {GATE_PIPELINE_CLASSNAME}
663 --gate_app "{{KCL_LBDA_DIR}}/application.xgapp"
664 --pluginfile "{{GATE_PLUGIN_FILE}}"
665 --set_annotation "" DiagnosisAlmost
666 --set_annotation Automatic cDiagnosis
667 --input_terminator {nlp_input_terminator}
668 --output_terminator {nlp_output_terminator}
669 --log_tag {{NLPLOGTAG}}
670 --suppress_gate_stdout
671 --verbose
672{ProcessorConfigKeys.PROGENVSECTION} = {my_env}
673{ProcessorConfigKeys.INPUT_TERMINATOR} = {nlp_input_terminator}
674{ProcessorConfigKeys.OUTPUT_TERMINATOR} = {nlp_output_terminator}
675# {ProcessorConfigKeys.MAX_EXTERNAL_PROG_USES} = 1000
677 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
678 # Define the output tables used by this GATE processor
679 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
681[{NlpConfigPrefixes.OUTPUT}:output_lbd_diagnosis]
683{NlpOutputConfigKeys.DESTTABLE} = lewy_body_dementia_gate
684{NlpOutputConfigKeys.NULL_LITERALS} =
685 null
686 ""
687{NlpOutputConfigKeys.DESTFIELDS} =
688 # Found by
689 # (a) manual inspection of output from the GATE Developer console:
690 # - e.g. {{rule=Includefin, text=Lewy body dementia}}
691 # (b) inspection of contents:
692 # - run a Cygwin shell
693 # - find . -type f -exec grep cDiagnosis -l {{}} \\;
694 # - 3 hits:
695 # ./application-resources/jape/DiagnosisExclude2.jape
696 # ... part of the "Lewy"-detection apparatus
697 # ./application-resources/jape/text-feature.jape
698 # ... adds "text" annotation to cDiagnosis Token
699 # ./application.xgapp
700 # ... in annotationTypes
701 # On that basis:
702 rule VARCHAR(100) Rule that generated the hit.
703 text VARCHAR(200) Text that matched the rule.
704{NlpOutputConfigKeys.INDEXDEFS} =
705 rule 100
706 text 200
709# -----------------------------------------------------------------------------
710# Specimen MedEx processor definition
711# -----------------------------------------------------------------------------
712# https://sbmi.uth.edu/ccb/resources/medex.htm
714[{NlpConfigPrefixes.PROCESSOR}:procdef_medex_medications]
716{ProcessorConfigKeys.DESTDB} = {destdb}
717{ProcessorConfigKeys.DESTTABLE} = medications_medex
718{ProcessorConfigKeys.PROGARGS} =
719 java
720 -classpath {{NLPPROGDIR}}:{{MEDEXDIR}}/bin:{{MEDEXDIR}}/lib/*
721 -Dfile.encoding=UTF-8
722 CrateMedexPipeline
723 -lt {{NLPLOGTAG}}
724 -v -v
725# ... other arguments are added by the code
726{ProcessorConfigKeys.PROGENVSECTION} = {my_env}
729# =============================================================================
730# C. Environment variable definitions
731# =============================================================================
732# - You'll need to modify this according to your local configuration.
734[{NlpConfigPrefixes.ENV}:{my_env}]
736GATE_HOME = {gate_home}
737GATE_PHARMACOTHERAPY_DIR = {kcl_pharmacotherapy_dir}
738GATE_PLUGIN_FILE = {gate_plugin_file}
739KCL_LBDA_DIR = /path/to/brc-gate-LBD/Lewy_Body_Diagnosis
740KCONNECTDIR = /path/to/yodie-pipeline-1-2-umls-only
741MEDEXDIR = /path/to/Medex_UIMA_1.3.6
742NLPPROGDIR = {nlp_prog_dir}
743OS_PATHSEP = {os.pathsep}
746# =============================================================================
747# D. Input field definitions
748# =============================================================================
750[{NlpConfigPrefixes.INPUT}:{if_clin_docs}]
752{InputFieldConfigKeys.SRCDB} = {my_src_db}
753{InputFieldConfigKeys.SRCTABLE} = EXTRACTED_CLINICAL_DOCUMENTS
754{InputFieldConfigKeys.SRCPKFIELD} = DOCUMENT_PK
755{InputFieldConfigKeys.SRCFIELD} = DOCUMENT_TEXT
756{InputFieldConfigKeys.SRCDATETIMEFIELD} = DOCUMENT_DATE
757{InputFieldConfigKeys.COPYFIELDS} =
758 {ridfield}
759 {tridfield}
760{InputFieldConfigKeys.INDEXED_COPYFIELDS} =
761 {ridfield}
762 {tridfield}
763# {InputFieldConfigKeys.DEBUG_ROW_LIMIT} = 0
765[{NlpConfigPrefixes.INPUT}:{if_prog_notes}]
767{InputFieldConfigKeys.SRCDB} = {my_src_db}
768{InputFieldConfigKeys.SRCTABLE} = PROGRESS_NOTES
769{InputFieldConfigKeys.SRCPKFIELD} = PN_PK
770{InputFieldConfigKeys.SRCFIELD} = PN_TEXT
771{InputFieldConfigKeys.SRCDATETIMEFIELD} = PN_DATE
772{InputFieldConfigKeys.COPYFIELDS} =
773 {ridfield}
774 {tridfield}
775{InputFieldConfigKeys.INDEXED_COPYFIELDS} =
776 {ridfield}
777 {tridfield}
780# =============================================================================
781# E. Database definitions, each in its own section
782# =============================================================================
784[{NlpConfigPrefixes.DATABASE}:{my_src_db}]
786{DatabaseConfigKeys.URL} = mysql+mysqldb://anontest:XXX@127.0.0.1:3306/anonymous_output?charset=utf8
788[{NlpConfigPrefixes.DATABASE}:{destdb}]
790{DatabaseConfigKeys.URL} = mysql+mysqldb://anontest:XXX@127.0.0.1:3306/anonymous_output?charset=utf8
793# =============================================================================
794# F. Information for using cloud-based NLP
795# =============================================================================
797[{NlpConfigPrefixes.CLOUD}:{my_cloud}]
799{CloudNlpConfigKeys.CLOUD_URL} = https://your_url
800{CloudNlpConfigKeys.USERNAME} = your_username
801{CloudNlpConfigKeys.PASSWORD} = your_password
802{CloudNlpConfigKeys.WAIT_ON_CONN_ERR} = {DEFAULT_CLOUD_WAIT_ON_CONN_ERR_S}
803{CloudNlpConfigKeys.MAX_CONTENT_LENGTH} = {DEFAULT_CLOUD_MAX_CONTENT_LENGTH}
804{CloudNlpConfigKeys.MAX_RECORDS_PER_REQUEST} = {DEFAULT_CLOUD_MAX_RECORDS_PER_REQUEST}
805{CloudNlpConfigKeys.LIMIT_BEFORE_COMMIT} = {DEFAULT_CLOUD_LIMIT_BEFORE_COMMIT}
806{CloudNlpConfigKeys.STOP_AT_FAILURE} = true
807{CloudNlpConfigKeys.MAX_TRIES} = {DEFAULT_CLOUD_MAX_TRIES}
808{CloudNlpConfigKeys.RATE_LIMIT_HZ} = {DEFAULT_CLOUD_RATE_LIMIT_HZ}
810[{NlpConfigPrefixes.PROCESSOR}:procdef_cloud_crp]
812{ProcessorConfigKeys.DESTDB} = {destdb}
813{ProcessorConfigKeys.DESTTABLE} = crp_test
814{ProcessorConfigKeys.PROCESSOR_NAME} = crate_anon.nlp_manager.parse_biochemistry.Crp
815{ProcessorConfigKeys.PROCESSOR_FORMAT} = {NlpDefValues.FORMAT_STANDARD}
817""" # noqa: E501
820# =============================================================================
821# Get config filename (from environment variable)
822# =============================================================================
def get_nlp_config_filename_or_exit() -> str:
    """
    Returns the config filename, from our environment variable.
    If we can't retrieve it, perform a hard exit.

    Returns:
        the (non-empty) value of the environment variable whose name is
        given by ``NLP_CONFIG_ENV_VAR``

    Raises:
        SystemExit: with exit code 1, after printing an explanatory
            message, if the variable is unset or empty
    """
    # Use an explicit truthiness test rather than "assert": assertions are
    # stripped when Python runs with -O, which would let an empty filename
    # slip through to the caller.
    config_filename = os.environ.get(NLP_CONFIG_ENV_VAR)
    if not config_filename:
        print(
            f"You must set the {NLP_CONFIG_ENV_VAR} environment variable "
            f"to point to a CRATE NLP config file, or specify it on the "
            f"command line."
        )
        sys.exit(1)
    return config_filename
844# =============================================================================
845# Config class
846# =============================================================================
849class NlpDefinition:
850 """
851 Class representing an NLP master definition as read from config file.
853 An NLP definition represents the combination of
855 - one or more NLP processors (e.g. "CRATE's C-reactive protein finder")
856 - one or more input fields in the source database
858 The NLP definition can therefore be used to say "run this set of NLP
859 processors over this set of textual fields in my database".
861 See the documentation for the :ref:`NLP config file <nlp_config>`.
862 """
def __init__(self, nlpname: str, logtag: str = "") -> None:
    """
    Build the NLP definition by reading its section of the config file.

    The config file is located via
    :func:`get_nlp_config_filename_or_exit`.

    Args:
        nlpname: config section name for this NLP definition
        logtag: text that may be passed to child processes to identify
            the NLP definition in their log output
    """
    # Imported here rather than at module level, so that classes deriving
    # from NlpParser can import NlpDefinition directly instead of relying
    # on a forward reference.
    from crate_anon.nlp_manager.all_processors import make_nlp_parser
    from crate_anon.nlp_manager.input_field_config import InputFieldConfig

    self._nlpname = nlpname
    self._logtag = logtag

    log.info(f"Loading config for section: {nlpname}")
    self._config_filename = get_nlp_config_filename_or_exit()

    # Open our own section of the config file. It must be stored on self
    # before get_database() is called below, since that goes via
    # self.parser.
    cfg = ConfigSection(
        section=full_sectionname(NlpConfigPrefixes.NLPDEF, nlpname),
        filename=self._config_filename,
        case_sensitive=True,
    )
    self._cfg = cfg

    # ---------------------------------------------------------------------
    # General settings
    # ---------------------------------------------------------------------
    # Cache of database name -> holder; filled by get_database().
    self._databases: Dict[str, DatabaseHolder] = {}
    self._progressdb_name = cfg.opt_str(
        NlpDefConfigKeys.PROGRESSDB, required=True
    )
    self._progdb = self.get_database(self._progressdb_name)
    self._temporary_tablename = cfg.opt_str(
        NlpDefConfigKeys.TEMPORARY_TABLENAME,
        default=DEFAULT_TEMPORARY_TABLENAME,
    )
    self._hashphrase = cfg.opt_str(
        NlpDefConfigKeys.HASHPHRASE, required=True
    )
    self._hasher = HashClass(self._hashphrase)
    self._max_rows_before_commit = cfg.opt_int_positive(
        NlpDefConfigKeys.MAX_ROWS_BEFORE_COMMIT,
        AnonymiseConfigDefaults.MAX_ROWS_BEFORE_COMMIT,
    )
    self._max_bytes_before_commit = cfg.opt_int_positive(
        NlpDefConfigKeys.MAX_BYTES_BEFORE_COMMIT,
        AnonymiseConfigDefaults.MAX_BYTES_BEFORE_COMMIT,
    )
    # Captured once, so that everything in this run shares one timestamp.
    self._now = get_now_utc_notz_datetime()
    self.truncate_text_at = cfg.opt_int_positive(
        NlpDefConfigKeys.TRUNCATE_TEXT_AT, default=0
    )
    self.record_truncated_values = cfg.opt_bool(
        NlpDefConfigKeys.RECORD_TRUNCATED_VALUES, default=False
    )
    self._cloud_config_name = cfg.opt_str(NlpDefConfigKeys.CLOUD_CONFIG)
    self._cloud_request_data_dir = cfg.opt_str(
        NlpDefConfigKeys.CLOUD_REQUEST_DATA_DIR
    )

    # ---------------------------------------------------------------------
    # Input field definitions
    # ---------------------------------------------------------------------
    self._inputfielddefs = cfg.opt_strlist(
        NlpDefConfigKeys.INPUTFIELDDEFS, required=True, lower=False
    )
    self._inputfieldmap: Dict[str, InputFieldConfig] = {}
    for input_name in self._inputfielddefs:
        # A name listed more than once is only configured the first time.
        if input_name not in self._inputfieldmap:
            self._inputfieldmap[input_name] = InputFieldConfig(
                self, input_name
            )

    # ---------------------------------------------------------------------
    # NLP processors
    # ---------------------------------------------------------------------
    self._processors: List[TableMaker] = []
    processorpairs = cfg.opt_strlist(
        NlpDefConfigKeys.PROCESSORS, required=True, lower=False
    )
    # The setting is consumed two items at a time: (processor type,
    # processor config name).
    try:
        for proctype, procname in chunks(processorpairs, 2):
            self._processors.append(
                make_nlp_parser(
                    classname=proctype,
                    nlpdef=self,
                    cfg_processor_name=procname,
                )
            )
    except ValueError:
        log.critical(f"Bad {NlpDefConfigKeys.PROCESSORS} specification")
        raise

    # ---------------------------------------------------------------------
    # Transaction sizes, for early commit
    # ---------------------------------------------------------------------
    # Maps session -> TransactionSizeLimiter; filled on demand.
    self._transaction_limiters: Dict[Session, TransactionSizeLimiter] = {}

    # ---------------------------------------------------------------------
    # Cloud config (loaded on request, then cached)
    # ---------------------------------------------------------------------
    self._cloudcfg: Optional[CloudConfig] = None
978 # -------------------------------------------------------------------------
979 # Basic info
980 # -------------------------------------------------------------------------
@property
def name(self) -> str:
    """
    Returns the name of the NLP definition (the ``nlpname`` value given to
    the constructor).
    """
    return self._nlpname
@property
def logtag(self) -> str:
    """
    Returns the log tag of the NLP definition (may be used by child
    processes to provide more information for logs). This is the ``logtag``
    value given to the constructor.
    """
    return self._logtag
@property
def now(self) -> datetime.datetime:
    """
    Returns the time this NLP definition was created (in UTC). Used to
    time-stamp NLP runs.

    The value is captured once, in the constructor, so repeated reads
    return the same timestamp.
    """
    return self._now
1005 # -------------------------------------------------------------------------
1006 # Config file
1007 # -------------------------------------------------------------------------
@property
def parser(self) -> ExtendedConfigParser:
    """
    Returns the
    :class:`crate_anon.common.extendedconfigparser.ExtendedConfigParser` in
    use (the parser belonging to this definition's config section).
    """
    return self._cfg.parser
def get_config_section(self, section: str) -> ConfigSection:
    """
    Returns a :class:`crate_anon.common.extendedconfigparser.ConfigSection`
    referring to a (potentially different) section.

    Args:
        section:
            New section name.

    Returns:
        the result of asking our own config section object for the other
        section
    """
    return self._cfg.other_section(section)
def get_env_dict(
    self,
    env_section_name: str,
    parent_env: Optional[Dict[str, str]] = None,
) -> Dict[str, str]:
    """
    Reads an operating system environment (a ``variable: value`` mapping)
    from the config file.

    Args:
        env_section_name: config section name, without its "env:" prefix
        parent_env: optional starting point (e.g. parent OS environment)

    Returns:
        a dictionary suitable for use as an OS environment
    """
    read_env = self._cfg.parser.get_env_dict
    # The caller supplies the bare name; the prefix is added here.
    section = full_sectionname(NlpConfigPrefixes.ENV, env_section_name)
    return read_env(section, parent_env=parent_env)
def get_database(
    self,
    name_and_cfg_section: str,
    with_session: bool = True,
    with_conn: bool = False,
    reflect: bool = False,
) -> DatabaseHolder:
    """
    Returns a :class:`crate_anon.anonymise.dbholder.DatabaseHolder` from
    the config file, containing information about a database.

    Holders are cached by name. A repeat request for the same name returns
    the holder created the first time, whatever flags are passed on the
    later call.

    Args:
        name_and_cfg_section:
            string that is the name of the database, and also the config
            file section name describing the database
        with_session: create an SQLAlchemy Session?
        with_conn: create an SQLAlchemy connection (via an Engine)?
        reflect: read the database structure (when required)?
    """
    cache = self._databases
    try:
        return cache[name_and_cfg_section]
    except KeyError:
        pass  # not opened yet; create it below

    dbsection = full_sectionname(
        NlpConfigPrefixes.DATABASE, name_and_cfg_section
    )
    assert len(name_and_cfg_section) <= MAX_SQL_FIELD_LEN
    holder = self.parser.get_database(
        dbsection,
        with_session=with_session,
        with_conn=with_conn,
        reflect=reflect,
    )
    cache[name_and_cfg_section] = holder
    return holder
1085 # -------------------------------------------------------------------------
1086 # Hashing
1087 # -------------------------------------------------------------------------
def hash(self, text: str) -> str:
    """
    Hash text via this NLP definition's hasher. The hash will be stored in
    a secret progress database and used to detect later changes in the
    source records.

    Args:
        text: text (typically from the source database) to be hashed

    Returns:
        the hashed value
    """
    return self._hasher.hash(text)
1103 # -------------------------------------------------------------------------
1104 # Database
1105 # -------------------------------------------------------------------------
@property
def temporary_tablename(self) -> str:
    """
    Temporary tablename to use, as read from the config file by the
    constructor (with a default if not specified there).

    See the documentation for the :ref:`NLP config file <nlp_config>`.
    """
    return self._temporary_tablename
1116 def set_echo(self, echo: bool) -> None:
1117 """
1118 Set the SQLAlchemy ``echo`` parameter (to echo SQL) for all our
1119 source databases.
1120 """
1121 self._progdb.engine.echo = echo
1122 for db in self._databases.values():
1123 db.engine.echo = echo
1124 # Now, SQLAlchemy will mess things up by adding an additional handler.
1125 # So, bye-bye:
1126 for logname in (
1127 "sqlalchemy.engine.base.Engine",
1128 "sqlalchemy.engine.base.OptionEngine",
1129 ):
1130 logger = logging.getLogger(logname)
1131 logger.handlers = [] # ... of type: List[logging.Handler]
@property
def progressdb_session(self) -> Session:
    """
    Returns an SQLAlchemy ORM :class:`Session` for the progress database
    (taken from the progress database's holder object).
    """
    return self._progdb.session
@property
def progressdb_engine(self) -> Engine:
    """
    Returns an SQLAlchemy Core :class:`Engine` for the progress database
    (taken from the progress database's holder object).
    """
    return self._progdb.engine
@property
def progressdb_metadata(self) -> MetaData:
    """
    Returns the SQLAlchemy :class:`MetaData` for the progress database
    (taken from the progress database's holder object).
    """
    return self._progdb.metadata
@property
def progdb(self) -> DatabaseHolder:
    """
    Returns the progress database, as the holder object obtained by the
    constructor.
    """
    return self._progdb
1161 def commit_all(self) -> None:
1162 """
1163 Execute a COMMIT on all databases (all destination database and the
1164 progress database).
1165 """
1166 self.commit(self.progressdb_session)
1167 for db in self._databases.values():
1168 self.commit(db.session)
def get_transation_limiter(
    self, session: Session
) -> TransactionSizeLimiter:
    """
    Looks up the transaction limiter for an SQLAlchemy session, creating
    (and remembering) one if that session has not been seen before.

    Args:
        session: SQLAlchemy ORM :class:`Session`

    Returns:
        a :class:`crate_anon.common.sql.TransactionSizeLimiter`
    """
    limiters = self._transaction_limiters
    limiter = limiters.get(session)
    if limiter is None:
        # First use of this session: build a limiter with our configured
        # row/byte thresholds.
        limiter = TransactionSizeLimiter(
            session,
            max_rows_before_commit=self._max_rows_before_commit,
            max_bytes_before_commit=self._max_bytes_before_commit,
        )
        limiters[session] = limiter
    return limiter
def notify_transaction(
    self,
    session: Session,
    n_rows: int,
    n_bytes: int,
    force_commit: bool = False,
) -> None:
    """
    Report to the session's transaction limiter that a transaction has
    taken place on one of our databases. This may trigger a COMMIT.

    Args:
        session: SQLAlchemy ORM :class:`Session` that was used
        n_rows: number of rows inserted
        n_bytes: number of bytes inserted
        force_commit: force a COMMIT?
    """
    self.get_transation_limiter(session).notify(
        n_rows=n_rows, n_bytes=n_bytes, force_commit=force_commit
    )
def commit(self, session: Session) -> None:
    """
    Executes a COMMIT on a specific session, going through that session's
    transaction limiter.

    Args:
        session: SQLAlchemy ORM :class:`Session`
    """
    self.get_transation_limiter(session).commit()
1222 # -------------------------------------------------------------------------
1223 # Input fields
1224 # -------------------------------------------------------------------------
@property
def inputfieldconfigs(self) -> Iterable["InputFieldConfig"]:
    """
    Returns all input field configurations used by this NLP definition.

    Returns:
        an iterable (the values view of our internal name-to-config
        dictionary) of
        `crate_anon.nlp_manager.input_field_config.InputFieldConfig`
        objects

    """
    return self._inputfieldmap.values()
1239 # -------------------------------------------------------------------------
1240 # NLP processors
1241 # -------------------------------------------------------------------------
@property
def processors(self) -> List["TableMaker"]:
    """
    Returns all NLP processors used by this NLP definition.

    Returns:
        list of objects derived from
        :class:`crate_anon.nlp_manager.base_nlp_parser.BaseNlpParser`
        (this is our internal list itself, not a copy)

    """
    return self._processors
1255 @property
1256 def noncloud_processors(self) -> List["BaseNlpParser"]:
1257 """
1258 Returns all local (non-cloud) NLP processors used by this NLP
1259 definition.
1261 Returns:
1262 list of objects derived from
1263 :class:`crate_anon.nlp_manager.base_nlp_parser.BaseNlpParser`
1265 """
1266 return [
1267 x for x in self._processors if not x.is_cloud_processor()
1268 ] # type: List["BaseNlpParser"]
1270 @property
1271 def uses_cloud_processors(self) -> bool:
1272 """
1273 Are any of our processors cloud-based?
1274 """
1275 return any(x.is_cloud_processor() for x in self._processors)
1277 # -------------------------------------------------------------------------
1278 # NLPRP info
1279 # -------------------------------------------------------------------------
def nlprp_local_processors(
    self, sql_dialect: Optional[str] = None
) -> Dict[str, Any]:
    """
    Returns a draft list of processors as per the NLPRP
    :ref:`list_processors <nlprp_list_processors>` command.

    Only local (non-cloud) processors are included.

    Args:
        sql_dialect: preferred SQL dialect, passed on to each processor
            when asking for its information; ``None`` for default

    Returns:
        a dictionary with a single key (``NlprpKeys.PROCESSORS``) whose
        value is the list of per-processor information
    """
    processors = [
        proc.nlprp_processor_info(sql_dialect)
        for proc in self.noncloud_processors
    ]  # type: List[Dict[str, Any]]
    return {NlprpKeys.PROCESSORS: processors}
1293 def nlprp_local_processors_json(
1294 self, indent: int = 4, sort_keys: bool = True, sql_dialect: str = None
1295 ) -> str:
1296 """
1297 Returns a formatted JSON string from :func:`nlprp_list_processors`.
1298 This is primarily for debugging.
1300 Args:
1301 indent: number of spaces for indentation
1302 sort_keys: sort keys?
1303 sql_dialect: preferred SQL dialect for ``tabular_schema``, or
1304 ``None`` for default
1305 """
1306 json_structure = self.nlprp_local_processors(sql_dialect=sql_dialect)
1307 return json.dumps(json_structure, indent=indent, sort_keys=sort_keys)
1309 # -------------------------------------------------------------------------
1310 # Cloud NLP
1311 # -------------------------------------------------------------------------
def get_cloud_config(self) -> Optional[CloudConfig]:
    """
    Returns the :class:`crate_anon.nlp_manager.cloud_config.CloudConfig`
    object associated with this NLP definition, creating it on first use
    and caching it for later calls.

    On first use, this also creates (if necessary) a subdirectory named
    after this NLP definition, inside the configured cloud request data
    directory.

    Note that although the return type permits ``None``, this method as
    written either returns a ``CloudConfig`` or raises.

    Raises:
        ValueError: if no cloud config name or no cloud request data
            directory was specified for this NLP definition, or if the
            request data directory does not exist
    """
    if self._cloudcfg is not None:
        return self._cloudcfg

    our_name = self.name
    if not self._cloud_config_name:
        raise ValueError(
            f"No {NlpDefConfigKeys.CLOUD_CONFIG!r} parameter "
            f"specified for NLP definition {our_name!r}"
        )
    if not self._cloud_request_data_dir:
        raise ValueError(
            f"No {NlpDefConfigKeys.CLOUD_REQUEST_DATA_DIR!r} "
            f"parameter specified for NLP definition {our_name!r}"
        )
    req_root_dir = os.path.abspath(self._cloud_request_data_dir)
    if not os.path.isdir(req_root_dir):
        # The root directory must already exist; only the per-definition
        # subdirectory is created automatically.
        raise ValueError(
            f"Directory {req_root_dir!r}, specified by config "
            f"parameter {NlpDefConfigKeys.CLOUD_REQUEST_DATA_DIR!r} "
            f"for NLP definition {our_name!r}, does not exist or is "
            f"not a directory"
        )
    req_data_dir = os.path.join(req_root_dir, our_name)
    os.makedirs(req_data_dir, exist_ok=True)
    self._cloudcfg = CloudConfig(
        self, name=self._cloud_config_name, req_data_dir=req_data_dir
    )
    return self._cloudcfg
def get_cloud_config_or_raise(self) -> CloudConfig:
    """
    Fetches the :class:`crate_anon.nlp_manager.cloud_config.CloudConfig`
    for this NLP definition via :meth:`get_cloud_config`, insisting that
    it is usable.

    Raises:
        ValueError: if there is no cloud config, or if it has no remote
            processors
    """
    cloudcfg = self.get_cloud_config()
    problem = None
    if cloudcfg is None:
        problem = (
            f"No cloud NLP configuration for NLP definition "
            f"{self.name!r}"
        )
    elif not cloudcfg.remote_processors:
        problem = (
            f"No remote (cloud) processors configured for "
            f"NLP definition {self.name!r}"
        )
    if problem is not None:
        raise ValueError(problem)
    return cloudcfg