Coverage for nlp_manager/output_user_config.py: 54%
102 statements
« prev ^ index » next coverage.py v7.8.0, created at 2025-08-27 10:34 -0500
1"""
2crate_anon/nlp_manager/output_user_config.py
4===============================================================================
6 Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
7 Created by Rudolf Cardinal (rnc1001@cam.ac.uk).
9 This file is part of CRATE.
11 CRATE is free software: you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation, either version 3 of the License, or
14 (at your option) any later version.
16 CRATE is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
21 You should have received a copy of the GNU General Public License
22 along with CRATE. If not, see <https://www.gnu.org/licenses/>.
24===============================================================================
26**Define output configuration for GATE NLP applications.**
28"""
30import ast
31import logging
32import shlex
33from typing import Dict, List
35from cardinal_pythonlib.sql.validation import (
36 ensure_valid_field_name,
37 ensure_valid_table_name,
38 is_sqltype_valid,
39)
40from cardinal_pythonlib.lists import chunks
41from cardinal_pythonlib.sqlalchemy.schema import (
42 get_sqla_coltype_from_dialect_str,
43)
44from sqlalchemy.engine.base import Engine
45from sqlalchemy.schema import Column, Index
47from crate_anon.common.extendedconfigparser import (
48 ConfigSection,
49 ExtendedConfigParser,
50)
51from crate_anon.nlp_manager.constants import (
52 full_sectionname,
53 NlpOutputConfigKeys,
54 NlpConfigPrefixes,
55)
56from crate_anon.nlp_manager.input_field_config import InputFieldConfig
58log = logging.getLogger(__name__)
61# =============================================================================
62# OutputUserConfig
63# =============================================================================
class OutputUserConfig:
    """
    Class defining configuration for the output of a given GATE app, or remote
    cloud app.

    See the documentation for the :ref:`NLP config file <nlp_config>`.
    """

    def __init__(
        self,
        config_parser: ExtendedConfigParser,
        cfg_output_name: str,
        schema_required: bool = True,
    ) -> None:
        """
        Read config from a configparser section.

        Args:
            config_parser:
                :class:`crate_anon.common.extendedconfigparser.ExtendedConfigParser`
            cfg_output_name:
                config file section name suffix -- this is the second of the
                pair of strings in the ``outputtypemap`` part of the GATE NLP
                app config section. See

                - :ref:`NLP config file <nlp_config>`
                - :class:`crate_anon.nlp_manager.parse_gate.Gate`
            schema_required:
                is it required that the user has specified a schema? Should
                be true for Gate, False for Cloud as the remote processors
                may have their own schema definition. In this constructor it
                only controls whether ``destfields`` is mandatory;
                ``desttable`` is always required.

        Raises:
            ValueError:
                if a ``renames`` line does not contain exactly two items, if
                a ``destfields`` line is malformed or has an invalid SQL
                datatype, if a destination field is auto-supplied or
                duplicated, or if ``indexdefs`` is not a sequence of valid
                (field, length) pairs. The imported name validators may raise
                their own errors for invalid table/field names.
        """
        sectionname = full_sectionname(
            NlpConfigPrefixes.OUTPUT, cfg_output_name
        )
        cfg = ConfigSection(section=sectionname, parser=config_parser)

        # ---------------------------------------------------------------------
        # desttable
        # ---------------------------------------------------------------------

        self._desttable = cfg.opt_str(
            NlpOutputConfigKeys.DESTTABLE, required=True
        )
        ensure_valid_table_name(self._desttable)

        # ---------------------------------------------------------------------
        # renames
        # ---------------------------------------------------------------------

        self._renames = {}  # type: Dict[str, str]
        rename_lines = cfg.opt_strlist(
            NlpOutputConfigKeys.RENAMES, required=False, as_words=False
        )
        for line in rename_lines:
            if not line.strip():
                continue
            # shlex allows quoted names containing spaces.
            words = shlex.split(line)
            if len(words) != 2:
                raise ValueError(
                    f"Bad {NlpOutputConfigKeys.RENAMES!r} option in config "
                    f"section {sectionname!r}; line was {line!r} but should "
                    f"have contained two things"
                )
            annotation_or_remote_column_name, to_column_name = words
            ensure_valid_field_name(to_column_name)
            self._renames[annotation_or_remote_column_name] = to_column_name

        # ---------------------------------------------------------------------
        # null_literals
        # ---------------------------------------------------------------------

        null_literal_lines = cfg.opt_strlist(
            NlpOutputConfigKeys.NULL_LITERALS, required=False, as_words=False
        )
        self._null_literals = []  # type: List[str]
        for line in null_literal_lines:
            self._null_literals += shlex.split(line)

        # ---------------------------------------------------------------------
        # destfields
        # ---------------------------------------------------------------------

        # Three parallel lists, one entry per destination field.
        self._destfields = []  # type: List[str]
        self._dest_datatypes = []  # type: List[str]
        # NOTE: entries are None for fields defined without a comment.
        self._dest_comments = []  # type: List[str]
        dest_field_lines = cfg.opt_strlist(
            NlpOutputConfigKeys.DESTFIELDS,
            required=schema_required,
            as_words=False,
        )
        # ... comments will be removed during that process.
        # If dest_field_lines is empty (as it may be for a Cloud processor)
        # the following block doesn't execute, so the 'dest' attributes remain
        # empty
        for dfl in dest_field_lines:
            field, datatype, comment = self._split_destfield_line(dfl)
            ensure_valid_field_name(field)
            if not is_sqltype_valid(datatype):
                raise ValueError(f"Invalid datatype for {field}: {datatype}")
            self._destfields.append(field)
            self._dest_datatypes.append(datatype)
            self._dest_comments.append(comment)

        # Core columns are added automatically; the user must not redefine
        # them.
        src_fields = [
            c.name for c in InputFieldConfig.get_core_columns_for_dest()
        ]
        for sf in src_fields:
            if sf in self._destfields:
                raise ValueError(
                    f"For section {sectionname}, destination field {sf} is "
                    f"auto-supplied; do not add it manually"
                )

        if len(set(self._destfields)) != len(self._destfields):
            raise ValueError(
                f"Duplicate fields exist in destination fields: "
                f"{self._destfields}"
            )

        # ---------------------------------------------------------------------
        # indexdefs
        # ---------------------------------------------------------------------

        # Two parallel lists, one entry per index.
        self._indexfields = []  # type: List[str]
        self._indexlengths = []  # type: List[int]
        indexdefs = cfg.opt_strlist(NlpOutputConfigKeys.INDEXDEFS)
        if indexdefs:
            for pair in chunks(indexdefs, 2):  # pairs: field, length
                if len(pair) != 2:
                    # An odd number of items leaves a field with no length.
                    raise ValueError(
                        f"Bad {NlpOutputConfigKeys.INDEXDEFS!r} option in "
                        f"config section {sectionname!r}: expected (field, "
                        f"length) pairs, but {pair[0]!r} has no length"
                    )
                indexfieldname, lengthstr = pair
                if indexfieldname not in self._destfields:
                    raise ValueError(
                        f"Index field {indexfieldname} not in "
                        f"destination fields {self._destfields}"
                    )
                self._indexfields.append(indexfieldname)
                self._indexlengths.append(
                    self._parse_index_length(lengthstr)
                )

    @staticmethod
    def _split_destfield_line(dfl: str):
        """
        Split one ``destfields`` line into its components.

        Args:
            dfl: a line of the form ``fieldname datatype [comment...]``

        Returns:
            tuple: ``field, datatype, comment``, where ``datatype`` has been
            converted to upper case and ``comment`` is ``None`` if the line
            has no third component

        Raises:
            ValueError: if the line has fewer than two whitespace-separated
                parts. (Raised explicitly, not via ``assert``, so that the
                check is not stripped when Python runs with ``-O``.)
        """
        parts = dfl.split(maxsplit=2)
        if len(parts) < 2:
            raise ValueError(f"Bad field definition line: {dfl!r}")
        comment = parts[2] if len(parts) > 2 else None
        return parts[0], parts[1].upper(), comment

    @staticmethod
    def _parse_index_length(lengthstr: str):
        """
        Convert the length part of an ``indexdefs`` pair.

        Args:
            lengthstr: a Python literal, e.g. ``"50"`` or ``"None"``

        Returns:
            the length as an ``int``, or ``None`` if the literal was ``None``

        Raises:
            ValueError: if the string is not a valid literal or cannot be
                converted to an integer
        """
        try:
            length = ast.literal_eval(lengthstr)
            if length is not None:
                length = int(length)
        except (ValueError, SyntaxError, TypeError) as err:
            # literal_eval() raises SyntaxError for unparseable text (not
            # just ValueError), and int() raises TypeError for non-numeric
            # literals such as lists; report them all uniformly.
            raise ValueError(f"Bad index length: {lengthstr}") from err
        return length

    @property
    def dest_tablename(self) -> str:
        """
        Returns the name of the destination table.
        """
        return self._desttable

    @property
    def destfields(self) -> List[str]:
        """
        Returns the list of destination fields.
        """
        return self._destfields

    def get_columns(self, engine: Engine) -> List[Column]:
        """
        Return all SQLAlchemy :class:`Column` definitions for the destination
        table.

        Args:
            engine: SQLAlchemy database :class:`Engine`

        Returns:
            list of SQLAlchemy :class:`Column` objects

        """
        return [
            Column(
                field,
                get_sqla_coltype_from_dialect_str(datatype, engine.dialect),
                comment=comment,
            )
            for field, datatype, comment in zip(
                self._destfields, self._dest_datatypes, self._dest_comments
            )
        ]

    @property
    def indexes(self) -> List[Index]:
        """
        Return all SQLAlchemy :class:`Index` definitions for the destination
        table.

        Returns:
            list of SQLAlchemy :class:`Index` objects

        """
        indexes = []  # type: List[Index]
        for field, length in zip(self._indexfields, self._indexlengths):
            # Only pass mysql_length when a length was actually specified.
            kwargs = {"mysql_length": length} if length is not None else {}
            indexes.append(Index(f"_idx_{field}", field, **kwargs))
        return indexes

    @property
    def renames(self) -> Dict[str, str]:
        """
        Return the "rename dictionary": a dictionary mapping GATE annotation
        names (or cloud remote column names) to local field (column) names in
        the NLP destination table.

        See

        - ``renames`` in the :ref:`NLP config file <nlp_config>`.
        - :meth:`crate_anon.nlp_manager.parse_gate.Gate.parse`
        """
        return self._renames

    @property
    def null_literals(self) -> List[str]:
        """
        Returns string values from the GATE output that will be interpreted as
        NULL values.

        See

        - ``null_literals`` in the :ref:`NLP config file <nlp_config>`.
        - :meth:`crate_anon.nlp_manager.parse_gate.Gate.parse`.
        """
        return self._null_literals