Coverage for nlp_manager/constants.py: 98%
123 statements
« prev ^ index » next coverage.py v7.8.0, created at 2026-02-05 06:46 -0600
« prev ^ index » next coverage.py v7.8.0, created at 2026-02-05 06:46 -0600
1"""
crate_anon/nlp_manager/constants.py

===============================================================================

    Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
    Created by Rudolf Cardinal (rnc1001@cam.ac.uk).

    This file is part of CRATE.

    CRATE is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    CRATE is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with CRATE. If not, see <https://www.gnu.org/licenses/>.

===============================================================================

**Constants for CRATE NLP, including a demo config file.**

28"""
30from cardinal_pythonlib.hash import HmacMD5Hasher
31from sqlalchemy.types import String
# =============================================================================
# Constants
# =============================================================================
# Default values (DEFAULT_*): cloud NLP settings, progress reporting, and the
# temporary table name.
DEFAULT_CLOUD_LIMIT_BEFORE_COMMIT = 1000
DEFAULT_CLOUD_MAX_CONTENT_LENGTH = 0  # no limit
DEFAULT_CLOUD_MAX_RECORDS_PER_REQUEST = 1000
DEFAULT_CLOUD_MAX_TRIES = 5
DEFAULT_CLOUD_RATE_LIMIT_HZ = 2
DEFAULT_CLOUD_WAIT_ON_CONN_ERR_S = 180  # in seconds
DEFAULT_REPORT_EVERY_NLP = 500  # low values slow down processing
DEFAULT_TEMPORARY_TABLENAME = "_crate_nlp_temptable"

# FN_*: presumably field (column) names -- compare the explicit disclaimer on
# TRUNCATED_FLAG below. All begin with an underscore.
FN_CRATE_VERSION_FIELD = "_crate_version"  # new in v0.18.53
FN_NLPDEF = "_nlpdef"
FN_PK = "_pk"
FN_SRCDATETIMEFIELD = "_srcdatetimefield"  # new in v0.18.52
FN_SRCDATETIMEVAL = "_srcdatetimeval"  # new in v0.18.52
FN_SRCDB = "_srcdb"
FN_SRCFIELD = "_srcfield"
FN_SRCPKFIELD = "_srcpkfield"
FN_SRCPKSTR = "_srcpkstr"
FN_SRCPKVAL = "_srcpkval"
FN_SRCTABLE = "_srctable"
FN_WHEN_FETCHED = "_when_fetched_utc"  # new in v0.18.53

TRUNCATED_FLAG = "_truncated"  # NOT A FIELD/COLUMN NAME. INTERNAL USE ONLY.

GATE_PIPELINE_CLASSNAME = "CrateGatePipeline"
# Alias for the imported HmacMD5Hasher, giving the hashing class a single
# module-level name (presumably so it can be swapped in one place).
HashClass = HmacMD5Hasher
# Length limits.
MAX_STRING_PK_LENGTH = 64  # trade-off; space versus capability
MAX_SQL_FIELD_LEN = 64
# ... http://dev.mysql.com/doc/refman/5.0/en/identifiers.html
MAX_SEMANTIC_VERSION_STRING_LENGTH = (
    147  # https://github.com/mojombo/semver/issues/79
)

# MedEx pipeline class name and signal strings.
MEDEX_PIPELINE_CLASSNAME = "CrateMedexPipeline"
MEDEX_DATA_READY_SIGNAL = "data_ready"
MEDEX_RESULTS_READY_SIGNAL = "results_ready"

# Name of an environment variable (presumably pointing at the NLP config
# file -- confirm against the code that reads it).
NLP_CONFIG_ENV_VAR = "CRATE_NLP_CONFIG"
# SQLAlchemy string column type, MAX_SQL_FIELD_LEN characters long.
SqlTypeDbIdentifier = String(MAX_SQL_FIELD_LEN)
# ... text field used for database names, table names, and field names
# =============================================================================
# Simple classes for string constant collections
# =============================================================================
class NlpConfigPrefixes:
    """
    Section name prefixes for the NLP config file.

    Every public attribute here is collected (by value) into the module's
    list of recognised section prefixes, so only add string constants whose
    names do not start with an underscore.
    """

    NLPDEF = "nlpdef"
    PROCESSOR = "processor"
    ENV = "env"
    OUTPUT = "output"
    INPUT = "input"
    DATABASE = "database"
    CLOUD = "cloud"
class NlpDefConfigKeys:
    """
    Config file keys for NLP definitions.

    Each attribute's value is the literal key string.
    """

    INPUTFIELDDEFS = "inputfielddefs"
    PROCESSORS = "processors"
    PROGRESSDB = "progressdb"
    HASHPHRASE = "hashphrase"
    TEMPORARY_TABLENAME = "temporary_tablename"
    MAX_ROWS_BEFORE_COMMIT = "max_rows_before_commit"
    MAX_BYTES_BEFORE_COMMIT = "max_bytes_before_commit"
    TRUNCATE_TEXT_AT = "truncate_text_at"
    RECORD_TRUNCATED_VALUES = "record_truncated_values"
    CLOUD_CONFIG = "cloud_config"
    CLOUD_REQUEST_DATA_DIR = "cloud_request_data_dir"
class NlpDefValues:
    """
    Config file values for NLP definitions.
    """

    # Since any server with the same output format as CRATE's is compatible,
    # we call this format standard
    FORMAT_STANDARD = "Standard"
    FORMAT_GATE = "GATE"
class InputFieldConfigKeys:
    """
    Config file keys for input database fields (columns).

    Each attribute's value is the literal key string.
    """

    SRCDB = "srcdb"
    SRCTABLE = "srctable"
    SRCPKFIELD = "srcpkfield"
    SRCFIELD = "srcfield"
    SRCDATETIMEFIELD = "srcdatetimefield"
    COPYFIELDS = "copyfields"
    INDEXED_COPYFIELDS = "indexed_copyfields"
    DEBUG_ROW_LIMIT = "debug_row_limit"
class ProcessorConfigKeys:
    """
    Config file keys for NLP processors.

    Each attribute's value is the literal key string.
    """

    ASSUME_PREFERRED_UNIT = "assume_preferred_unit"
    DESTDB = "destdb"
    DESTTABLE = "desttable"
    OUTPUTTYPEMAP = "outputtypemap"
    PROGARGS = "progargs"
    PROGENVSECTION = "progenvsection"
    INPUT_TERMINATOR = "input_terminator"
    OUTPUT_TERMINATOR = "output_terminator"
    MAX_EXTERNAL_PROG_USES = "max_external_prog_uses"
    PROCESSOR_NAME = "processor_name"
    PROCESSOR_VERSION = "processor_version"
    PROCESSOR_FORMAT = "processor_format"
class NlpOutputConfigKeys:
    """
    Config file keys for output tables from GATE or Cloud NLP processors.

    Each attribute's value is the literal key string.
    """

    DESTTABLE = "desttable"
    RENAMES = "renames"
    NULL_LITERALS = "null_literals"
    DESTFIELDS = "destfields"
    INDEXDEFS = "indexdefs"
class DatabaseConfigKeys:
    """
    Config file keys for database definitions.

    Each attribute's value is the literal key string.
    """

    URL = "url"
    ECHO = "echo"
class CloudNlpConfigKeys:
    """
    Config file keys for cloud NLP.

    Each attribute's value is the literal key string.
    """

    CLOUD_URL = "cloud_url"
    VERIFY_SSL = "verify_ssl"
    COMPRESS = "compress"
    USERNAME = "username"
    PASSWORD = "password"
    WAIT_ON_CONN_ERR = "wait_on_conn_err"
    MAX_CONTENT_LENGTH = "max_content_length"
    LIMIT_BEFORE_COMMIT = "limit_before_commit"
    MAX_RECORDS_PER_REQUEST = "max_records_per_request"
    STOP_AT_FAILURE = "stop_at_failure"
    MAX_TRIES = "max_tries"
    RATE_LIMIT_HZ = "rate_limit_hz"
    TEST_LENGTH_FUNCTION_SPEED = "test_length_function_speed"
class GateApiKeys:
    """
    Dictionary keys for the direct API to GATE.

    See https://cloud.gate.ac.uk/info/help/online-api.html for format of
    response from processor. The GATE JSON format is:

    .. code-block:: json

        {
            "text":"The text of the document",
            "entities":{
                "SampleAnnotationType1":[
                    {
                        "indices":[0,3],
                        "feature1":"value1",
                        "feature2":"value2"
                    }
                ],
                "SampleAnnotationType2":[
                    {
                        "indices":[12,15],
                        "feature3":"value3"
                    }
                ]
            }
        }
    """

    ENTITIES = "entities"
    INDICES = "indices"
    TEXT = "text"
class GateResultKeys:
    """
    Dictionary keys to represent GATE results in our NLPRP server.
    """

    TYPE = "type"
    START = "start"
    END = "end"
    SET = "set"
    FEATURES = "features"
class GateFieldNames:
    """
    Field (column) names for results from GATE.

    These match KEY_* strings in ``CrateGatePipeline.java``, so the two must
    be kept in step.
    """

    SET = "_set"
    TYPE = "_type"
    ID = "_id"
    STARTPOS = "_start"
    ENDPOS = "_end"
    CONTENT = "_content"
# =============================================================================
# Config helpers
# =============================================================================
# Values of every public (non-underscore) class attribute of
# NlpConfigPrefixes, i.e. the section-type prefixes that full_sectionname()
# accepts.
_ALL_NLPRP_SECTION_PREFIXES = [
    v for k, v in NlpConfigPrefixes.__dict__.items() if not k.startswith("_")
]


def full_sectionname(section_type: str, section: str) -> str:
    """
    Build a full config section name of the form ``section_type:section``.

    Args:
        section_type:
            section prefix; must be one of the values defined in
            :class:`NlpConfigPrefixes`
        section:
            the section's own name, appended after the colon

    Returns:
        ``section_type`` and ``section`` joined by a colon

    Raises:
        ValueError: if ``section_type`` is not a recognised prefix
    """
    # Guard clause: reject unknown prefixes before building anything.
    if section_type not in _ALL_NLPRP_SECTION_PREFIXES:
        raise ValueError(f"Unrecognised section type: {section_type}")
    return section_type + ":" + section