Coverage for nlp_manager/base_nlp_parser.py: 77%

297 statements  

« prev     ^ index     » next       coverage.py v7.8.0, created at 2025-08-27 10:34 -0500

1""" 

2crate_anon/nlp_manager/base_nlp_parser.py 

3 

4=============================================================================== 

5 

6 Copyright (C) 2015, University of Cambridge, Department of Psychiatry. 

7 Created by Rudolf Cardinal (rnc1001@cam.ac.uk). 

8 

9 This file is part of CRATE. 

10 

11 CRATE is free software: you can redistribute it and/or modify 

12 it under the terms of the GNU General Public License as published by 

13 the Free Software Foundation, either version 3 of the License, or 

14 (at your option) any later version. 

15 

16 CRATE is distributed in the hope that it will be useful, 

17 but WITHOUT ANY WARRANTY; without even the implied warranty of 

18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

19 GNU General Public License for more details. 

20 

21 You should have received a copy of the GNU General Public License 

22 along with CRATE. If not, see <https://www.gnu.org/licenses/>. 

23 

24=============================================================================== 

25 

26**Simple base class for all our NLP parsers (GATE, regex, ...)** 

27 

28""" 

29 

30from abc import ABC, abstractmethod 

31from functools import lru_cache 

32import json 

33import logging 

34import sys 

35from typing import ( 

36 Any, 

37 Dict, 

38 Generator, 

39 Iterable, 

40 List, 

41 Optional, 

42 Tuple, 

43 TYPE_CHECKING, 

44) 

45 

46from cardinal_pythonlib.reprfunc import auto_repr 

47from cardinal_pythonlib.timing import MultiTimerContext, timer 

48from cardinal_pythonlib.sqlalchemy.schema import ( 

49 column_lists_equal, 

50 index_lists_equal, 

51) 

52 

53# OK to import "registry"; see 

54# https://github.com/zzzeek/sqlalchemy/blob/master/README.dialects.rst 

55# noinspection PyProtectedMember 

56from sqlalchemy.dialects import registry 

57from sqlalchemy.engine.base import Engine 

58from sqlalchemy.exc import DatabaseError 

59from sqlalchemy.orm.session import Session 

60from sqlalchemy.schema import Column, Index, Table 

61from sqlalchemy.sql import and_, exists, or_ 

62from sqlalchemy.sql.schema import MetaData 

63from sqlalchemy.types import Integer, Text 

64 

65from crate_anon.anonymise.dbholder import DatabaseHolder 

66from crate_anon.anonymise.constants import ( 

67 COMMENT, 

68 TABLE_KWARGS, 

69) 

70from crate_anon.common.sql import decorate_index_name 

71from crate_anon.common.stringfunc import ( 

72 compress_docstring, 

73 does_text_contain_word_chars, 

74 get_docstring, 

75) 

76from crate_anon.nlp_manager.constants import ( 

77 FN_NLPDEF, 

78 FN_SRCPKVAL, 

79 FN_SRCPKSTR, 

80 full_sectionname, 

81 NlpConfigPrefixes, 

82 ProcessorConfigKeys, 

83 GateFieldNames, 

84 SqlTypeDbIdentifier, 

85 MAX_SQL_FIELD_LEN, 

86) 

87from crate_anon.nlp_manager.input_field_config import InputFieldConfig 

88from crate_anon.nlp_manager.nlp_definition import NlpDefinition 

89from crate_anon.nlprp.api import NlprpServerProcessor 

90from crate_anon.nlprp.constants import ( 

91 ALL_SQL_DIALECTS, 

92 NlprpKeys, 

93 NlprpValues, 

94 SqlDialects, 

95) 

96from crate_anon.version import CRATE_VERSION 

97 

98if TYPE_CHECKING: 

99 from sqlalchemy.engine.interfaces import Dialect 

100 from crate_anon.common.extendedconfigparser import ConfigSection 

101 

# Module-level logger.
log = logging.getLogger(__name__)

# SQL dialect used to describe NLPRP tabular schemas when the caller does not
# specify one.
DEFAULT_NLPRP_SQL_DIALECT = SqlDialects.MYSQL
# Labels passed to MultiTimerContext for the timed sections in this module:
TIMING_DELETE_DEST_RECORD = "BaseNlpParser_delete_dest_record"
TIMING_INSERT = "BaseNlpParser_sql_insert"
TIMING_PARSE = "parse"
TIMING_HANDLE_PARSED = "handled_parsed"

109 

110 

111# ============================================================================= 

112# Exception meaning "could not parse this piece of text" 

113# ============================================================================= 

114 

115 

class TextProcessingFailed(Exception):
    """
    Exception meaning "could not parse this piece of text".
    """

118 

119 

120# ============================================================================= 

121# Base class for all parser types 

122# ============================================================================= 

123 

124 

125class TableMaker(ABC): 

126 """ 

127 Base class for all CRATE NLP processors, local and cloud, including those 

128 that talk to third-party software. Manages the interface to databases for 

129 results storage, etc. 

130 """ 

131 

132 _is_cloud_processor = False # overridden by cloud-based classes 

133 

134 def __init__( 

135 self, 

136 nlpdef: Optional[NlpDefinition], 

137 cfg_processor_name: Optional[str], 

138 commit: bool = False, 

139 friendly_name: str = "?", 

140 ) -> None: 

141 r""" 

142 ``__init__`` function for :class:`TableMaker`. 

143 

144 Args: 

145 nlpdef: 

146 An instance of 

147 :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition`. 

148 

149 cfg_processor_name: 

150 The name of a CRATE NLP config file section, TO WHICH we will 

151 add a ``processor:`` prefix (from which section we may choose 

152 to get extra config information). 

153 

154 commit: 

155 Force a COMMIT whenever we insert data? You should specify this 

156 in multiprocess mode, or you may get database deadlocks. 

157 

158 friendly_name: 

159 Friendly name for the parser. 

160 """ 

161 # NB This docstring was associated with Sphinx errors! 

162 self._nlpdef = nlpdef 

163 self._cfg_processor_name = cfg_processor_name 

164 self._commit = commit 

165 self._friendly_name = friendly_name 

166 self._destdb_name = None # type: Optional[str] 

167 self._destdb = None # type: Optional[DatabaseHolder] 

168 if nlpdef is None: 

169 self._sectionname = "" 

170 self._cfgsection = None # type: Optional[ConfigSection] 

171 self._destdb_name = "" 

172 self._destdb = None # type: Optional[DatabaseHolder] 

173 else: 

174 self._sectionname = full_sectionname( 

175 NlpConfigPrefixes.PROCESSOR, cfg_processor_name 

176 ) 

177 self._cfgsection = nlpdef.get_config_section(self._sectionname) 

178 self._destdb_name = self._cfgsection.opt_str( 

179 ProcessorConfigKeys.DESTDB, required=True 

180 ) 

181 self._destdb = nlpdef.get_database(self._destdb_name) 

182 

183 def __str__(self) -> str: 

184 return self.classname() 

185 

186 def __repr__(self) -> str: 

187 return auto_repr(self) 

188 

189 @classmethod 

190 def classname(cls) -> str: 

191 """ 

192 Returns the short Python name of this class. 

193 """ 

194 return cls.__name__ 

195 

196 @classmethod 

197 def fully_qualified_classname(cls) -> str: 

198 """ 

199 Returns the class's fully qualified name. 

200 """ 

201 # This may be imperfect; see 

202 # https://stackoverflow.com/questions/2020014/get-fully-qualified-class-name-of-an-object-in-python # noqa: E501 

203 # https://www.python.org/dev/peps/pep-3155/ 

204 return ".".join([cls.__module__, cls.__qualname__]) 

205 

206 @classmethod 

207 def is_cloud_processor(cls) -> bool: 

208 """ 

209 Is this class a cloud-based (remote) NLP processor? 

210 """ 

211 return cls._is_cloud_processor 

212 

213 @abstractmethod 

214 def dest_tables_columns(self) -> Dict[str, List[Column]]: 

215 """ 

216 Describes the destination table(s) that this NLP processor wants to 

217 write to. 

218 

219 Returns: 

220 dict: a dictionary of ``{tablename: destination_columns}``, where 

221 ``destination_columns`` is a list of SQLAlchemy :class:`Column` 

222 objects. 

223 """ 

224 raise NotImplementedError 

225 

226 def dest_tables_indexes(self) -> Dict[str, List[Index]]: 

227 """ 

228 Describes indexes that this NLP processor suggests for its destination 

229 table(s). 

230 

231 It is perfectly legitimate for the list not to include some tables, or 

232 indeed to be empty. 

233 

234 Returns: 

235 dict: a dictionary of ``{tablename: indexes}``, where ``indexes`` 

236 is a list of SQLAlchemy :class:`Index` objects. 

237 """ 

238 return {} 

239 

240 @property 

241 def dest_metadata(self) -> MetaData: 

242 """ 

243 Returns the SQLAlchemy metadata for the destination database (which 

244 this NLP processor was told about at construction). 

245 """ 

246 return self._destdb.metadata 

247 

248 @property 

249 def dest_session(self) -> Session: 

250 """ 

251 Returns the SQLAlchemy ORM Session for the destination database (which 

252 this NLP processor was told about at construction). 

253 """ 

254 return self._destdb.session 

255 

256 @property 

257 def dest_engine(self) -> Engine: 

258 """ 

259 Returns the SQLAlchemy database Engine for the destination database 

260 (which this NLP processor was told about at construction). 

261 """ 

262 return self._destdb.engine 

263 

264 @property 

265 def nlpdef_name(self) -> Optional[str]: 

266 """ 

267 Returns the name of our 

268 :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition`, if we 

269 have one, or ``None``. 

270 """ 

271 if self._nlpdef is None: 

272 return None 

273 return self._nlpdef.name 

274 

275 @property 

276 def friendly_name(self) -> str: 

277 """ 

278 Returns the NLP parser's friendly name 

279 """ 

280 return self._friendly_name 

281 

282 @property 

283 def friendly_name_with_section(self) -> str: 

284 """ 

285 Returns the NLP parser's friendly name and config section. 

286 """ 

287 return f"{self.friendly_name} [{self._sectionname}]" 

288 

289 @property 

290 def dest_dbname(self) -> str: 

291 """ 

292 Returns the friendly (config file) name for the destination database 

293 (which this NLP processor was told about at construction). 

294 """ 

295 return self._destdb_name 

296 

297 @staticmethod 

298 def _assert_no_overlap( 

299 description1: str, 

300 cols1: List[Column], 

301 description2: str, 

302 cols2: List[Column], 

303 ) -> None: 

304 """ 

305 Asserts that the two column lists do not include overlapping column 

306 names. 

307 

308 Used for ensuring non-overlapping column names when we add NLP-specific 

309 columns to generic columns (e.g. about the source data). 

310 

311 Args: 

312 description1: description of group 1, used for error messages 

313 cols1: list 1 of SQLAlchemy :class:`Column` objects 

314 description2: description of group 2, used for error messages 

315 cols2: list 2 of SQLAlchemy :class:`Column` objects 

316 """ 

317 set1 = set(c.name for c in cols1) 

318 set2 = set(c.name for c in cols2) 

319 assert not (set1 & set2), ( 

320 f"Overlap between {description1} column names ({set1}) and " 

321 f"{description2} column names ({set2})" 

322 ) 

323 

324 @staticmethod 

325 def _assert_column_lists_identical( 

326 list_of_column_lists: List[List[Column]], 

327 ) -> None: 

328 """ 

329 Ensure that every column list (in a list of column lists) is identical. 

330 """ 

331 n = len(list_of_column_lists) 

332 if n <= 1: 

333 return 

334 for i in range(n - 1): 

335 a_list = list_of_column_lists[i] 

336 b_list = list_of_column_lists[i + 1] 

337 if not column_lists_equal(a_list, b_list): 

338 msg = ( 

339 "Mismatch between column lists. (Are you trying to" 

340 " blend source tables with different column names into a " 

341 "single NLP results table?) Mismatch is between list {a} " 

342 "and list {b}.\n" 

343 "-- LIST A: {a_list}.\n" 

344 "-- LIST B: {b_list}.\n" 

345 "-- ALL LISTS: {all_lists}.\n" 

346 "-- ALL COLUMN NAMES: {all_colnames}.".format( 

347 a=i, 

348 b=i + 1, 

349 a_list=a_list, 

350 b_list=b_list, 

351 all_lists=list_of_column_lists, 

352 all_colnames=[ 

353 [c.name for c in columns] 

354 for columns in list_of_column_lists 

355 ], 

356 ) 

357 ) 

358 log.critical(msg) 

359 raise ValueError(msg) 

360 

361 @staticmethod 

362 def _assert_index_lists_identical( 

363 list_of_index_lists: List[List[Index]], 

364 ) -> None: 

365 """ 

366 Ensure that every index list (in a list of index lists) is identical. 

367 """ 

368 n = len(list_of_index_lists) 

369 if n <= 1: 

370 return 

371 for i in range(n - 1): 

372 a_list = list_of_index_lists[i] 

373 b_list = list_of_index_lists[i + 1] 

374 if not index_lists_equal(a_list, b_list): 

375 msg = ( 

376 "Mismatch between index lists. (Are you trying to" 

377 " blend source tables with different column names into a " 

378 "single NLP results table?) Mismatch is between list {a} " 

379 "and list {b}.\n" 

380 "-- LIST A: {a_list}.\n" 

381 "-- LIST B: {b_list}.\n" 

382 "-- ALL LISTS: {all_lists}.\n" 

383 "-- ALL COLUMN NAMES: {all_colnames}.".format( 

384 a=i, 

385 b=i + 1, 

386 a_list=a_list, 

387 b_list=b_list, 

388 all_lists=list_of_index_lists, 

389 all_colnames=[ 

390 [c.name for c in columns] 

391 for columns in list_of_index_lists 

392 ], 

393 ) 

394 ) 

395 log.critical(msg) 

396 raise ValueError(msg) 

397 

398 # Put these GATE methods here because it's also useful for Cloud processors 

399 

400 @staticmethod 

401 def _standard_gate_columns() -> List[Column]: 

402 """ 

403 Returns standard columns for GATE output. 

404 """ 

405 return [ 

406 Column( 

407 GateFieldNames.SET, 

408 SqlTypeDbIdentifier, 

409 comment="GATE output set name", 

410 ), 

411 Column( 

412 GateFieldNames.TYPE, 

413 SqlTypeDbIdentifier, 

414 comment="GATE annotation type name", 

415 ), 

416 Column( 

417 GateFieldNames.ID, 

418 Integer, 

419 comment="GATE annotation ID (not clear this is very useful)", 

420 ), 

421 Column( 

422 GateFieldNames.STARTPOS, 

423 Integer, 

424 comment="Start position in the content", 

425 ), 

426 Column( 

427 GateFieldNames.ENDPOS, 

428 Integer, 

429 comment="End position in the content", 

430 ), 

431 Column( 

432 GateFieldNames.CONTENT, 

433 Text, 

434 comment="Full content marked as relevant.", 

435 ), 

436 ] 

437 

438 def _standard_gate_indexes(self, dest_tablename: str) -> List[Index]: 

439 """ 

440 Returns standard indexes for GATE output. 

441 """ 

442 return [ 

443 Index( 

444 decorate_index_name( 

445 "_idx__set", dest_tablename, self.dest_engine 

446 ), 

447 GateFieldNames.SET, 

448 mysql_length=MAX_SQL_FIELD_LEN, 

449 ) 

450 ] 

451 

452 @lru_cache(maxsize=None) 

453 def tables(self) -> Dict[str, Table]: 

454 """ 

455 Returns a dictionary of ``{tablename: Table}``, mapping table names 

456 to SQLAlchemy Table objects, for all destination tables of this NLP 

457 processor. 

458 """ 

459 # Obtain a single set of copy columns 

460 ifconfigs = self._nlpdef.inputfieldconfigs 

461 assert ifconfigs, "Must specify a list of InputFieldConfigs" 

462 assert self._destdb, "Cannot use tables() call without a database" 

463 

464 copycolumns_list = [i.get_copy_columns() for i in ifconfigs] 

465 self._assert_column_lists_identical(copycolumns_list) 

466 copy_columns = copycolumns_list[0] 

467 

468 core_columns = InputFieldConfig.get_core_columns_for_dest() 

469 self._assert_no_overlap("copy", copy_columns, "source", core_columns) 

470 

471 # Create one or more tables 

472 meta = self.dest_metadata 

473 tables = {} # type: Dict[str, Table] 

474 t_columns = self.dest_tables_columns() 

475 for tablename, extra_dest_cols in t_columns.items(): 

476 self._assert_no_overlap( 

477 "copy", copy_columns, "destination", extra_dest_cols 

478 ) 

479 # And to check we haven't introduced any bugs internally: 

480 self._assert_no_overlap( 

481 "source", core_columns, "destination", extra_dest_cols 

482 ) 

483 

484 columns = core_columns + extra_dest_cols + copy_columns 

485 

486 t_indexes = self.dest_tables_indexes() 

487 extra_dest_indexes = [] # type: List[Index] 

488 if tablename in t_indexes: 

489 extra_dest_indexes = t_indexes[tablename] 

490 copyindexes_list = [i.get_copy_indexes() for i in ifconfigs] 

491 self._assert_index_lists_identical(copyindexes_list) 

492 copy_indexes = copyindexes_list[0] 

493 core_indexes = InputFieldConfig.get_core_indexes_for_dest( 

494 tablename=tablename, engine=self._destdb.engine 

495 ) 

496 

497 column_like_things = ( 

498 columns + core_indexes + extra_dest_indexes + copy_indexes 

499 ) 

500 # log.debug(repr(column_like_things)) 

501 table_kwargs = { 

502 COMMENT: f"CRATE NLP results for {self.friendly_name}", 

503 **TABLE_KWARGS, 

504 } 

505 tables[tablename] = Table( 

506 tablename, meta, *column_like_things, **table_kwargs 

507 ) 

508 # You can put indexes in the column list: 

509 # http://docs.sqlalchemy.org/en/latest/core/constraints.html 

510 

511 # NOTE that after creating the Table, all the column objects get 

512 # "contaminated" by the link to it, so you have to start afresh 

513 # with new column objects, or take a further copy, as above. 

514 

515 # You can copy a Column, but not an Index. 

516 return tables 

517 

518 def get_tablenames(self) -> Iterable[str]: 

519 """ 

520 Returns all destination table names for this NLP processor. 

521 """ 

522 return self.dest_tables_columns().keys() 

523 

524 def get_table(self, tablename: str) -> Table: 

525 """ 

526 Returns an SQLAlchemy :class:`Table` for a given destination table of 

527 this NLP processor whose name is ``tablename``. 

528 """ 

529 tables = self.tables() 

530 try: 

531 return tables[tablename] 

532 except KeyError: 

533 all_tablenames = list(tables.keys()) 

534 raise KeyError( 

535 f"For this NLP processor ({self._cfg_processor_name!r}), the " 

536 f"destination table named {tablename!r} does not have an " 

537 f"associated Table object. Known Table objects are " 

538 f"named {all_tablenames}" 

539 ) 

540 

541 def make_tables(self, drop_first: bool = False) -> List[str]: 

542 """ 

543 Creates all destination tables for this NLP processor in the 

544 destination database. 

545 

546 Args: 

547 drop_first: drop the tables first? 

548 """ 

549 assert self._destdb, "No database specified!" 

550 engine = self.dest_engine 

551 tables = self.tables() 

552 pretty_names = [] # type: List[str] 

553 for t in tables.values(): 

554 pretty_name = f"{self._destdb.name}.{t.name}" 

555 if drop_first: 

556 log.info(f"Dropping table {pretty_name}") 

557 t.drop(engine, checkfirst=True) 

558 log.info(f"Creating table {pretty_name} (with indexes)") 

559 t.create(engine, checkfirst=True) 

560 pretty_names.append(pretty_name) 

561 return pretty_names 

562 

563 def delete_dest_record( 

564 self, 

565 ifconfig: InputFieldConfig, 

566 srcpkval: int, 

567 srcpkstr: Optional[str], 

568 commit: bool = False, 

569 ) -> None: 

570 """ 

571 Deletes all destination records for a given source record. 

572 

573 - Used during incremental updates. 

574 - For when a record (specified by ``srcpkval``) has been updated in the 

575 source; wipe older entries for it in the destination database(s). 

576 

577 Args: 

578 ifconfig: 

579 :class:`crate_anon.nlp_manager.input_field_config.InputFieldConfig` 

580 that defines the source database, table, and field (column) 

581 srcpkval: 

582 integer primary key (PK) value 

583 srcpkstr: 

584 for tables with string PKs: the string PK value 

585 commit: 

586 execute a COMMIT after we have deleted the records? 

587 If you don't do this, we will get deadlocks in incremental mode. 

588 See e.g. 

589 https://dev.mysql.com/doc/refman/5.5/en/innodb-deadlocks.html 

590 """ # noqa: E501 

591 session = self.dest_session 

592 srcdb = ifconfig.srcdb 

593 srctable = ifconfig.srctable 

594 srcfield = ifconfig.srcfield 

595 destdb_name = self._destdb.name 

596 nlpdef_name = self._nlpdef.name 

597 for tablename, desttable in self.tables().items(): 

598 log.debug( 

599 f"delete_from_dest_dbs... {srcdb}.{srctable} -> " 

600 f"{destdb_name}.{tablename}" 

601 ) 

602 # noinspection PyProtectedMember,PyPropertyAccess 

603 delquery = ( 

604 desttable.delete() 

605 .where(desttable.c._srcdb == srcdb) 

606 .where(desttable.c._srctable == srctable) 

607 .where(desttable.c._srcfield == srcfield) 

608 .where(desttable.c._srcpkval == srcpkval) 

609 .where(desttable.c._nlpdef == nlpdef_name) 

610 ) 

611 if srcpkstr is not None: 

612 # noinspection PyProtectedMember,PyPropertyAccess 

613 delquery = delquery.where(desttable.c._srcpkstr == srcpkstr) 

614 with MultiTimerContext(timer, TIMING_DELETE_DEST_RECORD): 

615 session.execute(delquery) 

616 if commit: 

617 self._nlpdef.commit(session) 

618 

619 def delete_where_srcpk_not( 

620 self, ifconfig: InputFieldConfig, temptable: Optional[Table] 

621 ) -> None: 

622 """ 

623 Function to help with deleting NLP destination records whose source 

624 records have been deleted. 

625 

626 See :func:`crate_anon.nlp_manager.nlp_manager.delete_where_no_source`. 

627 

628 Args: 

629 ifconfig: 

630 :class:`crate_anon.nlp_manager.input_field_config.InputFieldConfig` 

631 that defines the source database, table, and field (column). 

632 temptable: 

633 If this is specified (as an SQLAlchemy) table, we delete NLP 

634 destination records whose source PK has not been inserted into 

635 this table. Otherwise, we delete *all* NLP destination records 

636 from the source column. 

637 """ 

638 destsession = self.dest_session 

639 srcdb = ifconfig.srcdb 

640 srctable = ifconfig.srctable 

641 srcfield = ifconfig.srcfield 

642 for desttable_name, desttable in self.tables().items(): 

643 log.debug( 

644 f"delete_where_srcpk_not... {srcdb}.{srctable} -> " 

645 f"{self._destdb_name}.{desttable_name}" 

646 ) 

647 # noinspection PyProtectedMember,PyPropertyAccess 

648 dest_deletion_query = ( 

649 # see get_core_indexes_for_dest 

650 desttable.delete() 

651 .where(desttable.c._srcdb == srcdb) 

652 .where(desttable.c._srctable == srctable) 

653 .where(desttable.c._srcfield == srcfield) 

654 .where(desttable.c._nlpdef == self._nlpdef.name) 

655 ) 

656 if temptable is not None: 

657 log.debug("... deleting selectively") 

658 # DELETE FROM a WHERE NOT EXISTS ( 

659 # SELECT 1 FROM b 

660 # WHERE a.a1 = b.b1 

661 # AND ( 

662 # a.a2 = b.b2 

663 # OR (a.a2 IS NULL AND b.b2 IS NULL) 

664 # ) 

665 # ) 

666 temptable_pkvalcol = temptable.columns[FN_SRCPKVAL] 

667 temptable_pkstrcol = temptable.columns[FN_SRCPKSTR] 

668 # noinspection PyProtectedMember,PyPropertyAccess 

669 dest_deletion_query = dest_deletion_query.where( 

670 ~exists().where( 

671 and_( 

672 desttable.c._srcpkval == temptable_pkvalcol, 

673 or_( 

674 desttable.c._srcpkstr == temptable_pkstrcol, 

675 and_( 

676 desttable.c._srcpkstr.is_(None), 

677 temptable_pkstrcol.is_(None), 

678 ), 

679 ), 

680 ) 

681 ) 

682 ) 

683 else: 

684 log.debug("... deleting all") 

685 destsession.execute(dest_deletion_query) 

686 self._nlpdef.commit(destsession) 

687 

688 @property 

689 def destdb(self) -> DatabaseHolder: 

690 """ 

691 Returns the destination database. 

692 """ 

693 return self._destdb 

694 

695 

696# ============================================================================= 

697# Base class for all local parser types 

698# ============================================================================= 

699 

700 

701class BaseNlpParser(TableMaker): 

702 """ 

703 Base class for all local CRATE NLP parsers. 

704 """ 

705 

706 uses_external_tool = False # may be overridden 

707 is_test_nlp_parser = False # may be overridden by tests! 

708 

709 def __init__( 

710 self, 

711 nlpdef: Optional[NlpDefinition], 

712 cfg_processor_name: Optional[str], 

713 commit: bool = False, 

714 friendly_name: str = "?", 

715 ) -> None: 

716 super().__init__( 

717 nlpdef, cfg_processor_name, commit, friendly_name=friendly_name 

718 ) 

719 

720 # ------------------------------------------------------------------------- 

721 # NLP processing 

722 # ------------------------------------------------------------------------- 

723 

724 @abstractmethod 

725 def parse( 

726 self, text: str 

727 ) -> Generator[Tuple[str, Dict[str, Any]], None, None]: 

728 """ 

729 Main parsing function. 

730 

731 Args: 

732 text: 

733 the raw text to parse 

734 

735 Yields: 

736 tuple: ``tablename, valuedict``, where ``valuedict`` is 

737 a dictionary of ``{columnname: value}``. The values returned are 

738 ONLY those generated by NLP, and do not include either (a) the 

739 source reference values (``_srcdb``, ``_srctable``, etc.) or the 

740 "copy" fields. 

741 

742 Raises: 

743 :exc:`crate_anon.nlp_manager.base_nlp_parser.TextProcessingFailed` 

744 if we could not process this text. 

745 """ 

746 raise NotImplementedError 

747 

748 def process( 

749 self, text: str, starting_fields_values: Dict[str, Any] 

750 ) -> None: 

751 """ 

752 The core function that takes a single piece of text and feeds it 

753 through a single NLP processor. This may produce zero, one, or many 

754 output records. Those records are then merged with information about 

755 their source (etc)., and inserted into the destination database. 

756 

757 Args: 

758 text: 

759 the raw text to parse 

760 starting_fields_values: 

761 a dictionary of the format ``{columnname: value}`` that should 

762 be added to whatever the NLP processor comes up with. This 

763 will, in practice, include source metadata (which table, 

764 row [PK], and column did the text come from), processing 

765 metadata (when did the NLP processing take place?), and other 

766 values that the user has told us to copy across from the source 

767 database. 

768 

769 Raises: 

770 :exc:`crate_anon.nlp_manager.base_nlp_parser.TextProcessingFailed` 

771 if this parser could not process the text 

772 """ 

773 if not does_text_contain_word_chars(text): 

774 # log.warning(f"No word characters found in {text}") 

775 # ... the warning occurs frequently so slows down processing 

776 return 

777 starting_fields_values[FN_NLPDEF] = self._nlpdef.name 

778 session = self.dest_session 

779 n_values = 0 

780 with MultiTimerContext(timer, TIMING_PARSE): 

781 for tablename, nlp_values in self.parse(text): 

782 with MultiTimerContext(timer, TIMING_HANDLE_PARSED): 

783 # Merge dictionaries so EXISTING FIELDS/VALUES 

784 # (starting_fields_values) HAVE PRIORITY. 

785 nlp_values.update(starting_fields_values) 

786 sqla_table = self.get_table(tablename) 

787 # If we have superfluous keys in our dictionary, SQLAlchemy 

788 # will choke ("Unconsumed column names", reporting the 

789 # thing that's in our dictionary that it doesn't know 

790 # about). HOWEVER, note that SQLA column names may be mixed 

791 # case (e.g. 'Text') while our copy-column names are lower 

792 # case (e.g. 'text'), so we must have pre-converted 

793 # the SQLA column names to lower case. That happens in 

794 # InputFieldConfig.get_copy_columns and 

795 # InputFieldConfig.get_copy_indexes 

796 column_names = [c.name for c in sqla_table.columns] 

797 final_values = { 

798 k: v 

799 for k, v in nlp_values.items() 

800 if k in column_names 

801 } 

802 # log.debug(repr(sqla_table)) 

803 insertquery = sqla_table.insert().values(final_values) 

804 try: 

805 with MultiTimerContext(timer, TIMING_INSERT): 

806 session.execute(insertquery) 

807 except DatabaseError as e: 

808 # We can get an error on insert if for example the 

809 # output returned by the NLP is invalid for the column 

810 # type 

811 log.error(e) 

812 

813 self._nlpdef.notify_transaction( 

814 session, 

815 n_rows=1, 

816 n_bytes=sys.getsizeof(final_values), 

817 force_commit=self._commit, 

818 ) 

819 n_values += 1 

820 log.debug( 

821 f"NLP processor {self.nlpdef_name}/{self.friendly_name}:" 

822 f" found {n_values} values" 

823 ) 

824 

825 @abstractmethod 

826 def test(self, verbose: bool = False) -> None: 

827 r""" 

828 Performs a self-test on the NLP processor. 

829 

830 Args: 

831 verbose: 

832 Be verbose? 

833 

834 This is an abstract method that is subclassed. 

835 """ 

836 # NB This docstring was associated with Sphinx errors! 

837 raise NotImplementedError( 

838 f"No test function for regex class: " f"{self.classname()}" 

839 ) 

840 

841 def test_parser(self, test_strings: List[str]) -> None: 

842 """ 

843 Tests the NLP processor's parser with a set of test strings. 

844 """ 

845 log.info(f"Testing parser: {self.classname()}") 

846 for text in test_strings: 

847 log.info(f" {text} -> {list(self.parse(text))}") 

848 log.info("... OK") 

849 

850 # ------------------------------------------------------------------------- 

851 # NLPRP info 

852 # ------------------------------------------------------------------------- 

853 

854 @staticmethod 

855 def describe_sqla_col( 

856 column: Column, sql_dialect: str = None 

857 ) -> Dict[str, Any]: 

858 """ 

859 Describes a single SQLAlchemy :class:`Column` in the :ref:`NLPRP 

860 <nlprp>` format, which follows ``INFORMATION_SCHEMA.COLUMNS`` closely. 

861 

862 Args: 

863 column: 

864 the :class:`Column` 

865 sql_dialect: 

866 preferred SQL dialect for response, or ``None`` for a default 

867 """ 

868 sql_dialect = sql_dialect or DEFAULT_NLPRP_SQL_DIALECT 

869 assert sql_dialect in ALL_SQL_DIALECTS, ( 

870 f"Unknown SQL dialect {sql_dialect!r}; must be one of " 

871 f"{ALL_SQL_DIALECTS}" 

872 ) 

873 dialect = registry.load(sql_dialect)() # type: Dialect 

874 # log.debug(f"dialect: {dialect}") 

875 # dialect = MSDialect() 

876 column_type = column.type.compile(dialect) 

877 data_type = column_type.partition("(")[0] 

878 # ... https://stackoverflow.com/questions/27387415/how-would-i-get-everything-before-a-in-a-string-python # noqa: E501 

879 return { 

880 NlprpKeys.COLUMN_NAME: column.name, 

881 NlprpKeys.COLUMN_TYPE: column_type, 

882 NlprpKeys.DATA_TYPE: data_type, 

883 NlprpKeys.IS_NULLABLE: column.nullable, 

884 NlprpKeys.COLUMN_COMMENT: column.comment, 

885 } 

886 

887 def nlprp_schema_info(self, sql_dialect: str = None) -> Dict[str, Any]: 

888 """ 

889 Returns a dictionary for the ``schema_type`` parameter, and associated 

890 parameters describing the schema (e.g. ``tabular_schema``), of the 

891 NLPRP :ref:`list_processors <nlprp_list_processors>` command. 

892 

893 This is not a classmethod, because it may be specialized as we load 

894 external schema information (e.g. GATE processors). 

895 

896 Args: 

897 sql_dialect: 

898 preferred SQL dialect for ``tabular_schema`` 

899 """ 

900 sql_dialect = sql_dialect or DEFAULT_NLPRP_SQL_DIALECT 

901 tabular_schema = {} # type: Dict[str, List[Dict[str, Any]]] 

902 for tablename, columns in self.dest_tables_columns().items(): 

903 colinfo = [] # type: List[Dict[str, Any]] 

904 for column in columns: 

905 colinfo.append(self.describe_sqla_col(column, sql_dialect)) 

906 tabular_schema[tablename] = colinfo 

907 schema_info = { 

908 NlprpKeys.SCHEMA_TYPE: NlprpValues.TABULAR, 

909 NlprpKeys.SQL_DIALECT: sql_dialect, 

910 NlprpKeys.TABULAR_SCHEMA: tabular_schema, 

911 } 

912 return schema_info 

913 

914 @classmethod 

915 def nlprp_name(cls) -> str: 

916 """ 

917 Returns the processor's name for use in response to the NLPRP 

918 :ref:`list_processors <nlprp_list_processors>` command. 

919 

920 The default is the fully qualified module/class name -- because this is 

921 highly unlikely to clash with any other NLP processors on a given 

922 server. 

923 """ 

924 return cls.fully_qualified_classname() 

925 

926 @classmethod 

927 def nlprp_title(cls) -> str: 

928 """ 

929 Returns the processor's title for use in response to the NLPRP 

930 :ref:`list_processors <nlprp_list_processors>` command. 

931 

932 The default is the short Python class name. 

933 """ 

934 return cls.__name__ 

935 

936 @classmethod 

937 def nlprp_version(cls) -> str: 

938 """ 

939 Returns the processor's version for use in response to the NLPRP 

940 :ref:`list_processors <nlprp_list_processors>` command. 

941 

942 The default is the current CRATE version. 

943 """ 

944 return CRATE_VERSION 

945 

946 @classmethod 

947 def nlprp_is_default_version(cls) -> bool: 

948 """ 

949 Returns whether this processor is the default version of its name, for 

950 use in response to the NLPRP :ref:`list_processors 

951 <nlprp_list_processors>` command. 

952 

953 The default is ``True``. 

954 """ 

955 return True 

956 

957 @classmethod 

958 def nlprp_description(cls) -> str: 

959 """ 

960 Returns the processor's description for use in response to the NLPRP 

961 :ref:`list_processors <nlprp_list_processors>` command. 

962 

963 Uses each processor's docstring, and reformats it slightly. 

964 """ 

965 return compress_docstring(get_docstring(cls)) 

966 

967 def nlprp_server_processor( 

968 self, sql_dialect: str = None 

969 ) -> NlprpServerProcessor: 

970 schema_info = self.nlprp_schema_info(sql_dialect) 

971 return NlprpServerProcessor( 

972 name=self.nlprp_name(), 

973 title=self.nlprp_title(), 

974 version=self.nlprp_version(), 

975 is_default_version=self.nlprp_is_default_version(), 

976 description=self.nlprp_description(), 

977 schema_type=schema_info[NlprpKeys.SCHEMA_TYPE], 

978 sql_dialect=schema_info.get(NlprpKeys.SQL_DIALECT), 

979 tabular_schema=schema_info.get(NlprpKeys.TABULAR_SCHEMA), 

980 ) 

981 

982 def nlprp_processor_info(self, sql_dialect: str = None) -> Dict[str, Any]: 

983 """ 

984 Returns a dictionary suitable for use as this processor's response to 

985 the NLPRP :ref:`list_processors <nlprp_list_processors>` command. 

986 

987 This is not a classmethod, because it may be specialized as we load 

988 external schema information (e.g. GATE processors). 

989 

990 Args: 

991 sql_dialect: 

992 preferred SQL dialect for ``tabular_schema`` 

993 """ 

994 return self.nlprp_server_processor(sql_dialect).infodict 

995 

996 def nlprp_processor_info_json( 

997 self, indent: int = 4, sort_keys: bool = True, sql_dialect: str = None 

998 ) -> str: 

999 """ 

1000 Returns a formatted JSON string from :func:`nlprp_schema_info`. 

1001 This is primarily for debugging. 

1002 

1003 Args: 

1004 indent: 

1005 number of spaces for indentation 

1006 sort_keys: 

1007 sort keys? 

1008 sql_dialect: 

1009 preferred SQL dialect for ``tabular_schema``, or ``None`` for 

1010 default 

1011 """ 

1012 json_structure = self.nlprp_processor_info(sql_dialect=sql_dialect) 

1013 return json.dumps(json_structure, indent=indent, sort_keys=sort_keys)