Coverage for nlp_manager/base_nlp_parser.py: 77%
297 statements
« prev ^ index » next coverage.py v7.8.0, created at 2025-08-27 10:34 -0500
« prev ^ index » next coverage.py v7.8.0, created at 2025-08-27 10:34 -0500
1"""
2crate_anon/nlp_manager/base_nlp_parser.py
4===============================================================================
6 Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
7 Created by Rudolf Cardinal (rnc1001@cam.ac.uk).
9 This file is part of CRATE.
11 CRATE is free software: you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation, either version 3 of the License, or
14 (at your option) any later version.
16 CRATE is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
21 You should have received a copy of the GNU General Public License
22 along with CRATE. If not, see <https://www.gnu.org/licenses/>.
24===============================================================================
26**Simple base class for all our NLP parsers (GATE, regex, ...)**
28"""
30from abc import ABC, abstractmethod
31from functools import lru_cache
32import json
33import logging
34import sys
35from typing import (
36 Any,
37 Dict,
38 Generator,
39 Iterable,
40 List,
41 Optional,
42 Tuple,
43 TYPE_CHECKING,
44)
46from cardinal_pythonlib.reprfunc import auto_repr
47from cardinal_pythonlib.timing import MultiTimerContext, timer
48from cardinal_pythonlib.sqlalchemy.schema import (
49 column_lists_equal,
50 index_lists_equal,
51)
53# OK to import "registry"; see
54# https://github.com/zzzeek/sqlalchemy/blob/master/README.dialects.rst
55# noinspection PyProtectedMember
56from sqlalchemy.dialects import registry
57from sqlalchemy.engine.base import Engine
58from sqlalchemy.exc import DatabaseError
59from sqlalchemy.orm.session import Session
60from sqlalchemy.schema import Column, Index, Table
61from sqlalchemy.sql import and_, exists, or_
62from sqlalchemy.sql.schema import MetaData
63from sqlalchemy.types import Integer, Text
65from crate_anon.anonymise.dbholder import DatabaseHolder
66from crate_anon.anonymise.constants import (
67 COMMENT,
68 TABLE_KWARGS,
69)
70from crate_anon.common.sql import decorate_index_name
71from crate_anon.common.stringfunc import (
72 compress_docstring,
73 does_text_contain_word_chars,
74 get_docstring,
75)
76from crate_anon.nlp_manager.constants import (
77 FN_NLPDEF,
78 FN_SRCPKVAL,
79 FN_SRCPKSTR,
80 full_sectionname,
81 NlpConfigPrefixes,
82 ProcessorConfigKeys,
83 GateFieldNames,
84 SqlTypeDbIdentifier,
85 MAX_SQL_FIELD_LEN,
86)
87from crate_anon.nlp_manager.input_field_config import InputFieldConfig
88from crate_anon.nlp_manager.nlp_definition import NlpDefinition
89from crate_anon.nlprp.api import NlprpServerProcessor
90from crate_anon.nlprp.constants import (
91 ALL_SQL_DIALECTS,
92 NlprpKeys,
93 NlprpValues,
94 SqlDialects,
95)
96from crate_anon.version import CRATE_VERSION
98if TYPE_CHECKING:
99 from sqlalchemy.engine.interfaces import Dialect
100 from crate_anon.common.extendedconfigparser import ConfigSection
# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# SQL dialect used for NLPRP schema descriptions when the caller does not
# ask for a specific one.
DEFAULT_NLPRP_SQL_DIALECT = SqlDialects.MYSQL

# Labels passed to MultiTimerContext for the timed phases in this module.
TIMING_DELETE_DEST_RECORD = "BaseNlpParser_delete_dest_record"
TIMING_INSERT = "BaseNlpParser_sql_insert"
TIMING_PARSE = "parse"
TIMING_HANDLE_PARSED = "handled_parsed"
111# =============================================================================
112# Exception meaning "could not parse this piece of text"
113# =============================================================================
class TextProcessingFailed(Exception):
    """
    Exception meaning "could not parse this piece of text".
    """
120# =============================================================================
121# Base class for all parser types
122# =============================================================================
class TableMaker(ABC):
    """
    Common ancestor of every CRATE NLP processor, whether local or cloud,
    including processors that talk to third-party software. It manages the
    interface to the databases used for storing results, etc.
    """

    # Class-level flag reported by is_cloud_processor(); cloud-based classes
    # override it.
    _is_cloud_processor = False
def __init__(
    self,
    nlpdef: Optional[NlpDefinition],
    cfg_processor_name: Optional[str],
    commit: bool = False,
    friendly_name: str = "?",
) -> None:
    r"""
    ``__init__`` function for :class:`TableMaker`.

    Stores the arguments and, when an NLP definition is supplied, looks up
    this processor's config section and its destination database.

    Args:
        nlpdef:
            An instance of
            :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition`,
            or ``None``. With ``None``, the section name and destination
            database name are set to empty strings and no config section
            or database is looked up.

        cfg_processor_name:
            The name of a CRATE NLP config file section, TO WHICH we will
            add a ``processor:`` prefix (from which section we may choose
            to get extra config information).

        commit:
            Force a COMMIT whenever we insert data? You should specify this
            in multiprocess mode, or you may get database deadlocks.

        friendly_name:
            Friendly name for the parser.
    """
    # NB This docstring was associated with Sphinx errors!
    self._nlpdef = nlpdef
    self._cfg_processor_name = cfg_processor_name
    self._commit = commit
    self._friendly_name = friendly_name

    if nlpdef is not None:
        # Resolve "processor:<name>" and read the destination database
        # (a mandatory option) from that config section.
        sectionname = full_sectionname(
            NlpConfigPrefixes.PROCESSOR, cfg_processor_name
        )
        cfgsection = nlpdef.get_config_section(sectionname)
        destdb_name = cfgsection.opt_str(
            ProcessorConfigKeys.DESTDB, required=True
        )
        destdb = nlpdef.get_database(destdb_name)
    else:
        # No NLP definition: nothing to look up.
        sectionname = ""
        cfgsection = None
        destdb_name = ""
        destdb = None

    self._sectionname = sectionname
    self._cfgsection = cfgsection  # type: Optional[ConfigSection]
    self._destdb_name = destdb_name  # type: Optional[str]
    self._destdb = destdb  # type: Optional[DatabaseHolder]
def __str__(self) -> str:
    """
    Informal string form: the short class name, as given by
    :meth:`classname`.
    """
    short_name = self.classname()
    return short_name

def __repr__(self) -> str:
    """
    Debugging representation, delegated to ``auto_repr``.
    """
    description = auto_repr(self)
    return description
@classmethod
def classname(cls) -> str:
    """
    Returns the short (unqualified) Python name of this class, i.e. its
    ``__name__``.
    """
    short_name = cls.__name__
    return short_name
@classmethod
def fully_qualified_classname(cls) -> str:
    """
    Returns the class's fully qualified name: its module name and its
    qualified class name, joined by a dot.
    """
    # This may be imperfect; see
    # https://stackoverflow.com/questions/2020014/get-fully-qualified-class-name-of-an-object-in-python  # noqa: E501
    # https://www.python.org/dev/peps/pep-3155/
    module_name = cls.__module__
    qualified_name = cls.__qualname__
    return f"{module_name}.{qualified_name}"
@classmethod
def is_cloud_processor(cls) -> bool:
    """
    Is this class a cloud-based (remote) NLP processor?

    Reports the class attribute ``_is_cloud_processor``.
    """
    is_cloud = cls._is_cloud_processor
    return is_cloud
@abstractmethod
def dest_tables_columns(self) -> Dict[str, List[Column]]:
    """
    Describes the destination table(s) that this NLP processor wants to
    write to. Abstract: concrete processors must override it.

    Returns:
        dict: a dictionary of ``{tablename: destination_columns}``, where
        ``destination_columns`` is a list of SQLAlchemy :class:`Column`
        objects.

    Raises:
        NotImplementedError: always, in this base implementation.
    """
    raise NotImplementedError()
def dest_tables_indexes(self) -> Dict[str, List[Index]]:
    """
    Describes indexes that this NLP processor suggests for its destination
    table(s).

    It is perfectly legitimate for the result not to include some tables,
    or indeed to be empty. This default implementation suggests none.

    Returns:
        dict: a dictionary of ``{tablename: indexes}``, where ``indexes``
        is a list of SQLAlchemy :class:`Index` objects.
    """
    no_indexes = {}  # type: Dict[str, List[Index]]
    return no_indexes
@property
def dest_metadata(self) -> MetaData:
    """
    Returns the SQLAlchemy metadata for the destination database (which
    this NLP processor was told about at construction), taken from the
    stored database holder.
    """
    database_holder = self._destdb
    return database_holder.metadata
@property
def dest_session(self) -> Session:
    """
    Returns the SQLAlchemy ORM Session for the destination database (which
    this NLP processor was told about at construction), taken from the
    stored database holder.
    """
    database_holder = self._destdb
    return database_holder.session
@property
def dest_engine(self) -> Engine:
    """
    Returns the SQLAlchemy database Engine for the destination database
    (which this NLP processor was told about at construction), taken from
    the stored database holder.
    """
    database_holder = self._destdb
    return database_holder.engine
@property
def nlpdef_name(self) -> Optional[str]:
    """
    Returns the name of our
    :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition`, or
    ``None`` if this processor was constructed without one.
    """
    nlpdef = self._nlpdef
    return None if nlpdef is None else nlpdef.name
@property
def friendly_name(self) -> str:
    """
    Returns the NLP parser's friendly name, as supplied at construction.
    """
    name = self._friendly_name
    return name
@property
def friendly_name_with_section(self) -> str:
    """
    Returns the NLP parser's friendly name followed by its config section
    name in square brackets.
    """
    name = self.friendly_name
    section = self._sectionname
    return f"{name} [{section}]"
@property
def dest_dbname(self) -> str:
    """
    Returns the friendly (config file) name for the destination database
    (which this NLP processor was told about at construction).
    """
    config_name = self._destdb_name
    return config_name
@staticmethod
def _assert_no_overlap(
    description1: str,
    cols1: List[Column],
    description2: str,
    cols2: List[Column],
) -> None:
    """
    Asserts that two lists of columns have no column name in common.

    Used for ensuring non-overlapping column names when we add NLP-specific
    columns to generic columns (e.g. about the source data).

    Args:
        description1: description of group 1, used for error messages
        cols1: list 1 of SQLAlchemy :class:`Column` objects
        description2: description of group 2, used for error messages
        cols2: list 2 of SQLAlchemy :class:`Column` objects

    Raises:
        AssertionError: if any column name appears in both lists.
    """
    names1 = {column.name for column in cols1}
    names2 = {column.name for column in cols2}
    assert names1.isdisjoint(names2), (
        f"Overlap between {description1} column names ({names1}) and "
        f"{description2} column names ({names2})"
    )
@staticmethod
def _assert_column_lists_identical(
    list_of_column_lists: List[List[Column]],
) -> None:
    """
    Ensure that every column list (in a list of column lists) is identical.

    Each list is compared with its successor using ``column_lists_equal``;
    with zero or one list there is nothing to compare.

    Raises:
        ValueError: at the first adjacent pair that differs (the message
            is also logged at CRITICAL level).
    """
    adjacent_pairs = zip(list_of_column_lists, list_of_column_lists[1:])
    for i, (a_list, b_list) in enumerate(adjacent_pairs):
        if column_lists_equal(a_list, b_list):
            continue
        all_colnames = [
            [c.name for c in columns] for columns in list_of_column_lists
        ]
        msg = (
            "Mismatch between column lists. (Are you trying to"
            " blend source tables with different column names into a "
            f"single NLP results table?) Mismatch is between list {i} "
            f"and list {i + 1}.\n"
            f"-- LIST A: {a_list}.\n"
            f"-- LIST B: {b_list}.\n"
            f"-- ALL LISTS: {list_of_column_lists}.\n"
            f"-- ALL COLUMN NAMES: {all_colnames}."
        )
        log.critical(msg)
        raise ValueError(msg)
@staticmethod
def _assert_index_lists_identical(
    list_of_index_lists: List[List[Index]],
) -> None:
    """
    Ensure that every index list (in a list of index lists) is identical.

    Each list is compared with its successor using ``index_lists_equal``;
    with zero or one list there is nothing to compare.

    Args:
        list_of_index_lists:
            a list in which each element is a list of SQLAlchemy
            :class:`Index` objects

    Raises:
        ValueError: at the first adjacent pair that differs (the message
            is also logged at CRITICAL level).
    """
    n = len(list_of_index_lists)
    if n <= 1:
        return
    for i in range(n - 1):
        a_list = list_of_index_lists[i]
        b_list = list_of_index_lists[i + 1]
        if index_lists_equal(a_list, b_list):
            continue
        # These are the names of the Index objects themselves, so they are
        # reported as index names (not column names).
        all_index_names = [
            [index.name for index in indexes]
            for indexes in list_of_index_lists
        ]
        msg = (
            "Mismatch between index lists. (Are you trying to"
            " blend source tables with different column names into a "
            f"single NLP results table?) Mismatch is between list {i} "
            f"and list {i + 1}.\n"
            f"-- LIST A: {a_list}.\n"
            f"-- LIST B: {b_list}.\n"
            f"-- ALL LISTS: {list_of_index_lists}.\n"
            f"-- ALL INDEX NAMES: {all_index_names}."
        )
        log.critical(msg)
        raise ValueError(msg)
398 # Put these GATE methods here because it's also useful for Cloud processors
@staticmethod
def _standard_gate_columns() -> List[Column]:
    """
    Returns standard columns for GATE output: freshly created
    :class:`Column` objects for the set, type, ID, start position, end
    position and content fields, in that order.
    """
    # (column name, SQL type, column comment)
    column_specs = (
        (GateFieldNames.SET, SqlTypeDbIdentifier, "GATE output set name"),
        (
            GateFieldNames.TYPE,
            SqlTypeDbIdentifier,
            "GATE annotation type name",
        ),
        (
            GateFieldNames.ID,
            Integer,
            "GATE annotation ID (not clear this is very useful)",
        ),
        (
            GateFieldNames.STARTPOS,
            Integer,
            "Start position in the content",
        ),
        (GateFieldNames.ENDPOS, Integer, "End position in the content"),
        (GateFieldNames.CONTENT, Text, "Full content marked as relevant."),
    )
    return [
        Column(colname, coltype, comment=comment)
        for colname, coltype, comment in column_specs
    ]
def _standard_gate_indexes(self, dest_tablename: str) -> List[Index]:
    """
    Returns standard indexes for GATE output: a single index on the GATE
    "set" column.

    Args:
        dest_tablename:
            name of the destination table, passed to
            ``decorate_index_name`` (with the destination engine) to form
            the index name
    """
    index_name = decorate_index_name(
        "_idx__set", dest_tablename, self.dest_engine
    )
    set_index = Index(
        index_name,
        GateFieldNames.SET,
        mysql_length=MAX_SQL_FIELD_LEN,
    )
    return [set_index]
@lru_cache(maxsize=None)
def tables(self) -> Dict[str, Table]:
    """
    Returns a dictionary of ``{tablename: Table}``, mapping table names
    to SQLAlchemy Table objects, for all destination tables of this NLP
    processor.

    Each table is assembled from the core (source-reference) columns, the
    processor's own columns, and the "copy" columns requested by the input
    field configurations, plus the corresponding indexes.

    Raises:
        AssertionError: if there are no input field configurations, no
            destination database, or overlapping column names.
        ValueError: if the input field configurations disagree about their
            copy columns or copy indexes.
    """
    # NOTE(review): lru_cache on a method includes ``self`` in the cache
    # key, so the cache keeps a reference to every instance it has seen.
    input_field_configs = self._nlpdef.inputfieldconfigs
    assert input_field_configs, "Must specify a list of InputFieldConfigs"
    assert self._destdb, "Cannot use tables() call without a database"

    # Obtain a single set of copy columns (all configs must agree).
    copy_columns_per_config = [
        ifc.get_copy_columns() for ifc in input_field_configs
    ]
    self._assert_column_lists_identical(copy_columns_per_config)
    copy_columns = copy_columns_per_config[0]

    core_columns = InputFieldConfig.get_core_columns_for_dest()
    self._assert_no_overlap("copy", copy_columns, "source", core_columns)

    # Create one or more tables
    metadata = self.dest_metadata
    result = {}  # type: Dict[str, Table]
    for tablename, extra_dest_cols in self.dest_tables_columns().items():
        self._assert_no_overlap(
            "copy", copy_columns, "destination", extra_dest_cols
        )
        # And to check we haven't introduced any bugs internally:
        self._assert_no_overlap(
            "source", core_columns, "destination", extra_dest_cols
        )

        columns = core_columns + extra_dest_cols + copy_columns

        # Indexes are obtained afresh for every table.
        suggested_indexes = self.dest_tables_indexes()
        extra_dest_indexes = []  # type: List[Index]
        if tablename in suggested_indexes:
            extra_dest_indexes = suggested_indexes[tablename]
        copy_indexes_per_config = [
            ifc.get_copy_indexes() for ifc in input_field_configs
        ]
        self._assert_index_lists_identical(copy_indexes_per_config)
        copy_indexes = copy_indexes_per_config[0]
        core_indexes = InputFieldConfig.get_core_indexes_for_dest(
            tablename=tablename, engine=self._destdb.engine
        )

        # You can put indexes in the column list:
        # http://docs.sqlalchemy.org/en/latest/core/constraints.html
        column_like_things = (
            columns + core_indexes + extra_dest_indexes + copy_indexes
        )
        # log.debug(repr(column_like_things))

        # TABLE_KWARGS is applied last, so its entries take precedence.
        table_kwargs = {
            COMMENT: f"CRATE NLP results for {self.friendly_name}"
        }
        table_kwargs.update(TABLE_KWARGS)
        result[tablename] = Table(
            tablename, metadata, *column_like_things, **table_kwargs
        )

        # NOTE that after creating the Table, all the column objects get
        # "contaminated" by the link to it, so you have to start afresh
        # with new column objects, or take a further copy, as above.

        # You can copy a Column, but not an Index.
    return result
def get_tablenames(self) -> Iterable[str]:
    """
    Returns all destination table names for this NLP processor, i.e. the
    keys of :meth:`dest_tables_columns`.
    """
    columns_by_table = self.dest_tables_columns()
    return columns_by_table.keys()
def get_table(self, tablename: str) -> Table:
    """
    Returns an SQLAlchemy :class:`Table` for a given destination table of
    this NLP processor whose name is ``tablename``.

    Raises:
        KeyError: if :meth:`tables` has no entry for ``tablename``; the
            message lists the table names that are known.
    """
    known_tables = self.tables()
    try:
        table = known_tables[tablename]
    except KeyError:
        known_names = list(known_tables.keys())
        raise KeyError(
            f"For this NLP processor ({self._cfg_processor_name!r}), the "
            f"destination table named {tablename!r} does not have an "
            f"associated Table object. Known Table objects are "
            f"named {known_names}"
        )
    return table
def make_tables(self, drop_first: bool = False) -> List[str]:
    """
    Creates all destination tables for this NLP processor in the
    destination database. Tables that already exist are left alone
    (``checkfirst=True``) unless ``drop_first`` is set.

    Args:
        drop_first: drop the tables first?

    Returns:
        a list of the ``database.table`` names processed

    Raises:
        AssertionError: if there is no destination database.
    """
    assert self._destdb, "No database specified!"
    engine = self.dest_engine
    table_objects = self.tables().values()
    processed = []  # type: List[str]
    for table in table_objects:
        pretty_name = f"{self._destdb.name}.{table.name}"
        if drop_first:
            log.info(f"Dropping table {pretty_name}")
            table.drop(engine, checkfirst=True)
        log.info(f"Creating table {pretty_name} (with indexes)")
        table.create(engine, checkfirst=True)
        processed.append(pretty_name)
    return processed
def delete_dest_record(
    self,
    ifconfig: InputFieldConfig,
    srcpkval: int,
    srcpkstr: Optional[str],
    commit: bool = False,
) -> None:
    """
    Deletes all destination records for a given source record.

    - Used during incremental updates.
    - For when a record (specified by ``srcpkval``) has been updated in the
      source; wipe older entries for it in the destination database(s).

    Args:
        ifconfig:
            :class:`crate_anon.nlp_manager.input_field_config.InputFieldConfig`
            that defines the source database, table, and field (column)
        srcpkval:
            integer primary key (PK) value
        srcpkstr:
            for tables with string PKs: the string PK value. If ``None``,
            the string PK is not used to restrict the deletion.
        commit:
            execute a COMMIT after we have deleted the records?
            If you don't do this, we will get deadlocks in incremental mode.
            See e.g.
            https://dev.mysql.com/doc/refman/5.5/en/innodb-deadlocks.html
    """  # noqa: E501
    session = self.dest_session
    srcdb = ifconfig.srcdb
    srctable = ifconfig.srctable
    srcfield = ifconfig.srcfield
    destdb_name = self._destdb.name
    nlpdef_name = self._nlpdef.name
    for tablename, desttable in self.tables().items():
        log.debug(
            f"delete_from_dest_dbs... {srcdb}.{srctable} -> "
            f"{destdb_name}.{tablename}"
        )
        cols = desttable.c
        # Match rows from this exact source field and record, written
        # under this NLP definition.
        # noinspection PyProtectedMember,PyPropertyAccess
        criteria = [
            cols._srcdb == srcdb,
            cols._srctable == srctable,
            cols._srcfield == srcfield,
            cols._srcpkval == srcpkval,
            cols._nlpdef == nlpdef_name,
        ]
        if srcpkstr is not None:
            # noinspection PyProtectedMember,PyPropertyAccess
            criteria.append(cols._srcpkstr == srcpkstr)
        delquery = desttable.delete()
        for criterion in criteria:
            delquery = delquery.where(criterion)
        with MultiTimerContext(timer, TIMING_DELETE_DEST_RECORD):
            session.execute(delquery)
        if commit:
            self._nlpdef.commit(session)
def delete_where_srcpk_not(
    self, ifconfig: InputFieldConfig, temptable: Optional[Table]
) -> None:
    """
    Function to help with deleting NLP destination records whose source
    records have been deleted. Each destination table's deletion is
    followed by a commit.

    See :func:`crate_anon.nlp_manager.nlp_manager.delete_where_no_source`.

    Args:
        ifconfig:
            :class:`crate_anon.nlp_manager.input_field_config.InputFieldConfig`
            that defines the source database, table, and field (column).
        temptable:
            If this is specified (as an SQLAlchemy) table, we delete NLP
            destination records whose source PK has not been inserted into
            this table. Otherwise, we delete *all* NLP destination records
            from the source column.
    """  # noqa: E501
    destsession = self.dest_session
    srcdb = ifconfig.srcdb
    srctable = ifconfig.srctable
    srcfield = ifconfig.srcfield
    for desttable_name, desttable in self.tables().items():
        log.debug(
            f"delete_where_srcpk_not... {srcdb}.{srctable} -> "
            f"{self._destdb_name}.{desttable_name}"
        )
        cols = desttable.c
        # see get_core_indexes_for_dest
        # noinspection PyProtectedMember,PyPropertyAccess
        dest_deletion_query = (
            desttable.delete()
            .where(cols._srcdb == srcdb)
            .where(cols._srctable == srctable)
            .where(cols._srcfield == srcfield)
            .where(cols._nlpdef == self._nlpdef.name)
        )
        if temptable is None:
            log.debug("... deleting all")
        else:
            log.debug("... deleting selectively")
            # DELETE FROM a WHERE NOT EXISTS (
            #     SELECT 1 FROM b
            #     WHERE a.a1 = b.b1
            #     AND (
            #         a.a2 = b.b2
            #         OR (a.a2 IS NULL AND b.b2 IS NULL)
            #     )
            # )
            temptable_pkvalcol = temptable.columns[FN_SRCPKVAL]
            temptable_pkstrcol = temptable.columns[FN_SRCPKSTR]
            # noinspection PyProtectedMember,PyPropertyAccess
            same_pkval = cols._srcpkval == temptable_pkvalcol
            # String PKs match if equal, or if both are NULL.
            # noinspection PyProtectedMember,PyPropertyAccess
            same_pkstr = or_(
                cols._srcpkstr == temptable_pkstrcol,
                and_(
                    cols._srcpkstr.is_(None),
                    temptable_pkstrcol.is_(None),
                ),
            )
            source_still_present = exists().where(
                and_(same_pkval, same_pkstr)
            )
            dest_deletion_query = dest_deletion_query.where(
                ~source_still_present
            )
        destsession.execute(dest_deletion_query)
        self._nlpdef.commit(destsession)
@property
def destdb(self) -> DatabaseHolder:
    """
    Returns the destination database (the stored database holder).
    """
    database_holder = self._destdb
    return database_holder
696# =============================================================================
697# Base class for all local parser types
698# =============================================================================
class BaseNlpParser(TableMaker):
    """
    Common ancestor of all CRATE NLP parsers that run locally.
    """

    # Class-level flags; subclasses may override them.
    uses_external_tool = False
    is_test_nlp_parser = False  # may be overridden by tests!
def __init__(
    self,
    nlpdef: Optional[NlpDefinition],
    cfg_processor_name: Optional[str],
    commit: bool = False,
    friendly_name: str = "?",
) -> None:
    """
    Passes all arguments straight through to the superclass initialiser.

    Args:
        nlpdef:
            the NLP definition, or ``None``
        cfg_processor_name:
            the processor's config file section name (without prefix)
        commit:
            force a COMMIT whenever we insert data?
        friendly_name:
            friendly name for the parser
    """
    super().__init__(
        nlpdef,
        cfg_processor_name,
        commit,
        friendly_name=friendly_name,
    )
720 # -------------------------------------------------------------------------
721 # NLP processing
722 # -------------------------------------------------------------------------
@abstractmethod
def parse(
    self, text: str
) -> Generator[Tuple[str, Dict[str, Any]], None, None]:
    """
    Main parsing function. Abstract: concrete parsers must override it.

    Args:
        text:
            the raw text to parse

    Yields:
        tuple: ``tablename, valuedict``, where ``valuedict`` is
        a dictionary of ``{columnname: value}``. The values returned are
        ONLY those generated by NLP, and do not include either (a) the
        source reference values (``_srcdb``, ``_srctable``, etc.) or the
        "copy" fields.

    Raises:
        :exc:`crate_anon.nlp_manager.base_nlp_parser.TextProcessingFailed`
            if we could not process this text.
        NotImplementedError: always, in this base implementation.
    """
    raise NotImplementedError()
def process(
    self, text: str, starting_fields_values: Dict[str, Any]
) -> None:
    """
    The core function that takes a single piece of text and feeds it
    through a single NLP processor. This may produce zero, one, or many
    output records. Those records are then merged with information about
    their source (etc)., and inserted into the destination database.

    Text containing no word characters is skipped entirely. A database
    error on insert is logged and processing continues.

    Args:
        text:
            the raw text to parse
        starting_fields_values:
            a dictionary of the format ``{columnname: value}`` that should
            be added to whatever the NLP processor comes up with. This
            will, in practice, include source metadata (which table,
            row [PK], and column did the text come from), processing
            metadata (when did the NLP processing take place?), and other
            values that the user has told us to copy across from the source
            database. NB this dictionary is modified: the NLP definition
            name is written into it.

    Raises:
        :exc:`crate_anon.nlp_manager.base_nlp_parser.TextProcessingFailed`
            if this parser could not process the text
    """
    if not does_text_contain_word_chars(text):
        # log.warning(f"No word characters found in {text}")
        # ... the warning occurs frequently so slows down processing
        return

    starting_fields_values[FN_NLPDEF] = self._nlpdef.name
    dest_session = self.dest_session
    n_found = 0
    with MultiTimerContext(timer, TIMING_PARSE):
        for tablename, nlp_values in self.parse(text):
            with MultiTimerContext(timer, TIMING_HANDLE_PARSED):
                # Merge dictionaries so EXISTING FIELDS/VALUES
                # (starting_fields_values) HAVE PRIORITY.
                nlp_values.update(starting_fields_values)
                dest_table = self.get_table(tablename)
                # If we have superfluous keys in our dictionary, SQLAlchemy
                # will choke ("Unconsumed column names", reporting the
                # thing that's in our dictionary that it doesn't know
                # about). HOWEVER, note that SQLA column names may be mixed
                # case (e.g. 'Text') while our copy-column names are lower
                # case (e.g. 'text'), so we must have pre-converted
                # the SQLA column names to lower case. That happens in
                # InputFieldConfig.get_copy_columns and
                # InputFieldConfig.get_copy_indexes
                known_column_names = [c.name for c in dest_table.columns]
                row_values = {
                    key: value
                    for key, value in nlp_values.items()
                    if key in known_column_names
                }
                # log.debug(repr(dest_table))
                insert_statement = dest_table.insert().values(row_values)
                try:
                    with MultiTimerContext(timer, TIMING_INSERT):
                        dest_session.execute(insert_statement)
                except DatabaseError as e:
                    # We can get an error on insert if for example the
                    # output returned by the NLP is invalid for the column
                    # type
                    log.error(e)

                self._nlpdef.notify_transaction(
                    dest_session,
                    n_rows=1,
                    n_bytes=sys.getsizeof(row_values),
                    force_commit=self._commit,
                )
                n_found += 1
    log.debug(
        f"NLP processor {self.nlpdef_name}/{self.friendly_name}:"
        f" found {n_found} values"
    )
@abstractmethod
def test(self, verbose: bool = False) -> None:
    r"""
    Performs a self-test on the NLP processor.

    Args:
        verbose:
            Be verbose?

    This is an abstract method that is subclassed.

    Raises:
        NotImplementedError: always, in this base implementation; the
            message names the class.
    """
    # NB This docstring was associated with Sphinx errors!
    class_name = self.classname()
    raise NotImplementedError(
        f"No test function for regex class: {class_name}"
    )
def test_parser(self, test_strings: List[str]) -> None:
    """
    Tests the NLP processor's parser with a set of test strings: each
    string is parsed and the full list of results is logged at INFO level.

    Args:
        test_strings:
            the strings to feed to :meth:`parse`
    """
    log.info(f"Testing parser: {self.classname()}")
    for text in test_strings:
        parsed = list(self.parse(text))
        log.info(f" {text} -> {parsed}")
    log.info("... OK")
850 # -------------------------------------------------------------------------
851 # NLPRP info
852 # -------------------------------------------------------------------------
@staticmethod
def describe_sqla_col(
    column: Column, sql_dialect: str = None
) -> Dict[str, Any]:
    """
    Describes a single SQLAlchemy :class:`Column` in the :ref:`NLPRP
    <nlprp>` format, which follows ``INFORMATION_SCHEMA.COLUMNS`` closely.

    Args:
        column:
            the :class:`Column`
        sql_dialect:
            preferred SQL dialect for response, or ``None`` for a default

    Returns:
        dict: the column's name, its type as compiled for the dialect, the
        data type (the part of that compiled type before any ``(``),
        whether it is nullable, and its comment.

    Raises:
        AssertionError: if ``sql_dialect`` is not a known dialect.
    """
    if not sql_dialect:
        sql_dialect = DEFAULT_NLPRP_SQL_DIALECT
    assert sql_dialect in ALL_SQL_DIALECTS, (
        f"Unknown SQL dialect {sql_dialect!r}; must be one of "
        f"{ALL_SQL_DIALECTS}"
    )
    dialect_class = registry.load(sql_dialect)
    dialect = dialect_class()  # type: Dialect
    # log.debug(f"dialect: {dialect}")
    # dialect = MSDialect()
    column_type = column.type.compile(dialect)
    # Keep everything before the first "(", e.g. drop a length suffix.
    # ... https://stackoverflow.com/questions/27387415/how-would-i-get-everything-before-a-in-a-string-python  # noqa: E501
    data_type, _, _ = column_type.partition("(")
    description = {
        NlprpKeys.COLUMN_NAME: column.name,
        NlprpKeys.COLUMN_TYPE: column_type,
        NlprpKeys.DATA_TYPE: data_type,
        NlprpKeys.IS_NULLABLE: column.nullable,
        NlprpKeys.COLUMN_COMMENT: column.comment,
    }
    return description
def nlprp_schema_info(self, sql_dialect: str = None) -> Dict[str, Any]:
    """
    Returns a dictionary for the ``schema_type`` parameter, and associated
    parameters describing the schema (e.g. ``tabular_schema``), of the
    NLPRP :ref:`list_processors <nlprp_list_processors>` command.

    This is not a classmethod, because it may be specialized as we load
    external schema information (e.g. GATE processors).

    Args:
        sql_dialect:
            preferred SQL dialect for ``tabular_schema``; a default is
            used if this is ``None``
    """
    if not sql_dialect:
        sql_dialect = DEFAULT_NLPRP_SQL_DIALECT
    # One list of column descriptions per destination table.
    tabular_schema = {
        tablename: [
            self.describe_sqla_col(column, sql_dialect) for column in columns
        ]
        for tablename, columns in self.dest_tables_columns().items()
    }  # type: Dict[str, List[Dict[str, Any]]]
    return {
        NlprpKeys.SCHEMA_TYPE: NlprpValues.TABULAR,
        NlprpKeys.SQL_DIALECT: sql_dialect,
        NlprpKeys.TABULAR_SCHEMA: tabular_schema,
    }
@classmethod
def nlprp_name(cls) -> str:
    """
    Returns the processor's name for use in response to the NLPRP
    :ref:`list_processors <nlprp_list_processors>` command.

    The default is the fully qualified module/class name -- because this is
    highly unlikely to clash with any other NLP processors on a given
    server.
    """
    qualified_name = cls.fully_qualified_classname()
    return qualified_name
@classmethod
def nlprp_title(cls) -> str:
    """
    Returns the processor's title for use in response to the NLPRP
    :ref:`list_processors <nlprp_list_processors>` command.

    The default is the short Python class name.
    """
    title = cls.__name__
    return title
@classmethod
def nlprp_version(cls) -> str:
    """
    Returns the processor's version for use in response to the NLPRP
    :ref:`list_processors <nlprp_list_processors>` command.

    The default is the current CRATE version.
    """
    version = CRATE_VERSION
    return version
@classmethod
def nlprp_is_default_version(cls) -> bool:
    """
    Returns whether this processor is the default version of its name, for
    use in response to the NLPRP :ref:`list_processors
    <nlprp_list_processors>` command.

    The default is ``True``.
    """
    is_default = True
    return is_default
@classmethod
def nlprp_description(cls) -> str:
    """
    Returns the processor's description for use in response to the NLPRP
    :ref:`list_processors <nlprp_list_processors>` command.

    Uses each processor's docstring, and reformats it slightly.
    """
    docstring = get_docstring(cls)
    return compress_docstring(docstring)
def nlprp_server_processor(
    self, sql_dialect: str = None
) -> NlprpServerProcessor:
    """
    Builds an :class:`NlprpServerProcessor` describing this processor from
    its NLPRP name, title, version, default-version flag, description and
    schema information.

    Args:
        sql_dialect:
            preferred SQL dialect for ``tabular_schema``
    """
    schema_info = self.nlprp_schema_info(sql_dialect)
    name = self.nlprp_name()
    title = self.nlprp_title()
    version = self.nlprp_version()
    is_default_version = self.nlprp_is_default_version()
    description = self.nlprp_description()
    # The schema type must be present; the other two entries are optional.
    schema_type = schema_info[NlprpKeys.SCHEMA_TYPE]
    dialect = schema_info.get(NlprpKeys.SQL_DIALECT)
    tabular_schema = schema_info.get(NlprpKeys.TABULAR_SCHEMA)
    return NlprpServerProcessor(
        name=name,
        title=title,
        version=version,
        is_default_version=is_default_version,
        description=description,
        schema_type=schema_type,
        sql_dialect=dialect,
        tabular_schema=tabular_schema,
    )
def nlprp_processor_info(self, sql_dialect: str = None) -> Dict[str, Any]:
    """
    Returns a dictionary suitable for use as this processor's response to
    the NLPRP :ref:`list_processors <nlprp_list_processors>` command: the
    ``infodict`` of :meth:`nlprp_server_processor`.

    This is not a classmethod, because it may be specialized as we load
    external schema information (e.g. GATE processors).

    Args:
        sql_dialect:
            preferred SQL dialect for ``tabular_schema``
    """
    server_processor = self.nlprp_server_processor(sql_dialect)
    return server_processor.infodict
def nlprp_processor_info_json(
    self, indent: int = 4, sort_keys: bool = True, sql_dialect: str = None
) -> str:
    """
    Returns a formatted JSON string from :func:`nlprp_processor_info`.
    This is primarily for debugging.

    Args:
        indent:
            number of spaces for indentation
        sort_keys:
            sort keys?
        sql_dialect:
            preferred SQL dialect for ``tabular_schema``, or ``None`` for
            default
    """
    info = self.nlprp_processor_info(sql_dialect=sql_dialect)
    as_json = json.dumps(info, indent=indent, sort_keys=sort_keys)
    return as_json