Coverage for src / dataknobs_xization / authorities.py: 37%

177 statements  

« prev     ^ index     » next       coverage.py v7.13.0, created at 2025-12-26 15:46 -0700

1"""Authority-based annotation processing and field grouping. 

2 

3Provides classes for managing authority-based annotations, field groups, 

4and derived annotation columns for structured text extraction. 

5""" 

6 

7import re 

8from abc import ABC, abstractmethod 

9from collections.abc import Callable 

10from typing import Any, Dict, List, Set, Union 

11 

12import pandas as pd 

13 

14import dataknobs_xization.annotations as dk_annots 

15 

# Key annotation column name constants
# "auth_id" is the default name of the authority value ID column and the key
# under which AuthorityAnnotationsMetaData looks that column name up.
KEY_AUTH_ID_COL = "auth_id"

18 

19 

class DerivedFieldGroups(dk_annots.DerivedAnnotationColumns):
    """Defines derived column types:
    * "field_type" -- The column holding the type of field of an annotation row
    * "field_group" -- The column holding the group number(s) of the field
    * "field_record" -- The column holding record number(s) of the field
    """

    def __init__(
        self,
        field_type_suffix: str = "_field",
        field_group_suffix: str = "_num",
        field_record_suffix: str = "_recsnum",
    ):
        """Add derived column types/names: Given an annotation row,
          * field_type(row) == f'{row[ann_type_col]}_field'
          * field_group(row) == f'{row[ann_type_col]}_num'
          * field_record(row) == f'{row[ann_type_col]}_recsnum'

        Where:
          * A field_type column holds annotation "sub"- type values, or fields
          * A field_group column identifies groups of annotation fields
          * A field_record column identifies groups of annotation field groups

        Args:
            field_type_suffix: The field_type col name suffix (if not _field).
            field_group_suffix: The field_group col name suffix (if not _num).
            field_record_suffix: field_record colname sfx (if not _recsnum).
        """
        self.field_type_suffix = field_type_suffix
        self.field_group_suffix = field_group_suffix
        self.field_record_suffix = field_record_suffix

    def get_col_value(
        self,
        metadata: dk_annots.AnnotationsMetaData,
        col_type: str,
        row: pd.Series,
        missing: str = None,
    ) -> str:
        """Get the value of the column in the given row derived from col_type,
        where col_type is one of:
          * "field_type" == f"{field}_field"
          * "field_group" == f"{field}_num"
          * "field_record" == f"{field}_recsnum"

        And "field" is the row_accessor's metadata's "ann_type" col's value.

        Args:
            metadata: The AnnotationsMetaData.
            col_type: The type of column value to derive.
            row: A row from which to get the value.
            missing: The value to return for unknown or missing column.

        Returns:
            The row value, or the missing value when the row has no ann_type
            column, the ann_type value is None, col_type is not one of the
            derived types, or the derived column is absent from the row.
        """
        if metadata.ann_type_col not in row.index:
            return missing
        field = row[metadata.ann_type_col]
        if field is None:
            return missing

        col_name_getters = {
            "field_type": self.get_field_type_col,
            "field_group": self.get_field_group_col,
            "field_record": self.get_field_record_col,
        }
        getter = col_name_getters.get(col_type)
        if getter is None:
            # Unknown col_type: report "missing" rather than failing on an
            # unbound column name.
            return missing

        col_name = getter(field)
        if col_name is not None and col_name in row.index:
            return row[col_name]
        return missing

    def unpack_field(self, field_value: str) -> str:
        """Given a field in any of its derivatives (like field type, field group
        or field record,) unpack and return the basic field value itself.

        Only a single trailing suffix is removed; the suffixes are checked in
        the order record, group, type and the first that matches wins. Any
        other occurrence of the suffix text inside the value is preserved.
        """
        suffixes = (
            self.field_record_suffix,
            self.field_group_suffix,
            self.field_type_suffix,
        )
        for suffix in suffixes:
            if field_value.endswith(suffix):
                # Slice by length so that an empty suffix leaves the value
                # intact and only the trailing occurrence is dropped.
                return field_value[: len(field_value) - len(suffix)]
        return field_value

    def get_field_name(self, field_value: str) -> str:
        """Given a field name or field col name, e.g., an annotation type col's
        value (the field name); or a field type, group, or record column name,
        get the field name.
        """
        return self.unpack_field(field_value)

    def get_field_type_col(self, field_value: str) -> str:
        """Given a field name or field col name, e.g., an annotation type col's
        value; or a field type, group, or record, get the name of the derived
        field type column.
        """
        field = self.unpack_field(field_value)
        return f"{field}{self.field_type_suffix}"

    def get_field_group_col(self, field_value: str) -> str:
        """Given a field name or field col name, e.g., an annotation type col's
        value; or a field type, group, or record, get the name of the derived
        field group column.
        """
        field = self.unpack_field(field_value)
        return f"{field}{self.field_group_suffix}"

    def get_field_record_col(self, field_value: str) -> str:
        """Given a field name or field col name, e.g., an annotation type col's
        value; or a field type, group, or record, get the name of the derived
        field record column.
        """
        field = self.unpack_field(field_value)
        return f"{field}{self.field_record_suffix}"

133 

134 

class AuthorityAnnotationsMetaData(dk_annots.AnnotationsMetaData):
    """AnnotationsMetaData extended with an 'auth_id_col' alongside the
    standard (key) annotation columns (attributes).
    """

    def __init__(
        self,
        start_pos_col: str = dk_annots.KEY_START_POS_COL,
        end_pos_col: str = dk_annots.KEY_END_POS_COL,
        text_col: str = dk_annots.KEY_TEXT_COL,
        ann_type_col: str = dk_annots.KEY_ANN_TYPE_COL,
        auth_id_col: str = KEY_AUTH_ID_COL,
        sort_fields: List[str] = (dk_annots.KEY_START_POS_COL, dk_annots.KEY_END_POS_COL),
        sort_fields_ascending: List[bool] = (True, False),
        **kwargs: Any,
    ):
        """Set up the key (and any additional) column names.

        The key column types are start_pos, end_pos, text, ann_type and
        auth_id.

        Note:
            Actual table columns can be named arbitrarily, BUT interactions
            through annotations classes and interfaces relating to the "key"
            columns must use the key column constants.

        Args:
            start_pos_col: Col name for the token starting position.
            end_pos_col: Col name for the token ending position.
            text_col: Col name for the token text.
            ann_type_col: Col name for the annotation types.
            auth_id_col: Col name for the authority value ID; forwarded to
                the base class as the extra "auth_id" column type.
            sort_fields: The col types relevant for sorting annotation rows.
            sort_fields_ascending: To specify sort order of sort_fields.
            **kwargs: More column types mapped to column names.
        """
        super().__init__(
            start_pos_col=start_pos_col,
            end_pos_col=end_pos_col,
            text_col=text_col,
            ann_type_col=ann_type_col,
            sort_fields=sort_fields,
            sort_fields_ascending=sort_fields_ascending,
            auth_id=auth_id_col,
            **kwargs,
        )

    @property
    def auth_id_col(self) -> str:
        """The column name registered for the auth_id column type."""
        col_name = self.data[KEY_AUTH_ID_COL]
        return col_name

190 

191 

class AuthorityAnnotationsBuilder(dk_annots.AnnotationsBuilder):
    """An AnnotationsBuilder whose rows also carry the 'auth_id' column."""

    def __init__(
        self,
        metadata: AuthorityAnnotationsMetaData = None,
        data_defaults: Dict[str, Any] = None,
    ):
        """Create the builder.

        Args:
            metadata: The authority annotations metadata; a default
                AuthorityAnnotationsMetaData is created when None.
            data_defaults: Dict[ann_colname, default_value] with default
                values for annotation columns.
        """
        if metadata is None:
            metadata = AuthorityAnnotationsMetaData()
        super().__init__(metadata, data_defaults)

    def build_annotation_row(
        self, start_pos: int, end_pos: int, text: str, ann_type: str, auth_id: str, **kwargs: Any
    ) -> Dict[str, Any]:
        """Build one annotation row from the mandatory key values plus any
        extra keyword arguments.

        The key values are keyed by the metadata's column names and handed,
        together with the kwargs, to do_build_row.

        Args:
            start_pos: The token start position.
            end_pos: The token end position.
            text: The token text.
            ann_type: The annotation type.
            auth_id: The authority ID for the row.
            **kwargs: Additional keyword arguments.

        Returns:
            The result row dictionary.
        """
        meta = self.metadata
        key_values = {
            meta.start_pos_col: start_pos,
            meta.end_pos_col: end_pos,
            meta.text_col: text,
            meta.ann_type_col: ann_type,
            meta.auth_id_col: auth_id,
        }
        return self.do_build_row(key_values, **kwargs)

241 

242 

class AuthorityData:
    """Thin wrapper pairing an authority's dataframe with its name."""

    def __init__(self, df: pd.DataFrame, name: str):
        self.name = name
        self._df = df

    @property
    def df(self) -> pd.DataFrame:
        """The authority data as a dataframe."""
        return self._df

    def lookup_values(self, value: Any, is_id: bool = False) -> pd.DataFrame:
        """Select the authority rows matching a value or a value id.

        Args:
            value: A value or value_id for this authority.
            is_id: True to compare against the dataframe index; otherwise
                the comparison is against the column named after this
                authority.

        Returns:
            The applicable authority dataframe rows.
        """
        if is_id:
            candidates = self.df.index
        else:
            candidates = self.df[self.name]
        matches = candidates == value
        return self.df[matches]

267 

268 

class Authority(dk_annots.Annotator):
    """A class for managing and defining tabular authoritative data for e.g.,
    taxonomies, etc., and using them to annotate instances within text.
    """

    def __init__(
        self,
        name: str,
        auth_anns_builder: AuthorityAnnotationsBuilder = None,
        authdata: AuthorityData = None,
        field_groups: DerivedFieldGroups = None,
        anns_validator: Callable[["Authority", List[Dict[str, Any]]], bool] = None,
        parent_auth: "Authority" = None,
    ):
        """Initialize with this authority's metadata.

        Args:
            name: This authority's entity name.
            auth_anns_builder: The authority annotations row builder to use
                for building annotation rows; a default
                AuthorityAnnotationsBuilder is created when None.
            authdata: The authority data.
            field_groups: The derived field groups to use; a default
                DerivedFieldGroups is created when None.
            anns_validator: fn(auth, anns_dict_list) that returns True if
                the list of annotation row dicts are valid to be added as
                annotations for a single match or "entity".
            parent_auth: This authority's parent authority (if any).
        """
        super().__init__(name)
        self.anns_builder = (
            auth_anns_builder if auth_anns_builder is not None else AuthorityAnnotationsBuilder()
        )
        self.authdata = authdata
        self.field_groups = field_groups if field_groups is not None else DerivedFieldGroups()
        self.anns_validator = anns_validator
        self._parent = parent_auth

    @property
    def metadata(self) -> AuthorityAnnotationsMetaData:
        """Get the meta-data (taken from the annotations builder)."""
        return self.anns_builder.metadata

    @property
    def parent(self) -> "Authority":
        """Get this authority's parent, or None."""
        return self._parent

    @abstractmethod
    def has_value(self, value: Any) -> bool:
        """Determine whether the given value is in this authority.

        Args:
            value: A possible authority value.

        Returns:
            True if the value is a valid entity value.
        """
        raise NotImplementedError

    def annotate_input(
        self,
        text_obj: Union[dk_annots.AnnotatedText, str],
        **kwargs: Any,
    ) -> Union[dk_annots.Annotations, None]:
        """Find and annotate this authority's entities in the document text
        as dictionaries like:
        [
            {
                'input_id': <id>,
                'start_pos': <start_char_pos>,
                'end_pos': <end_char_pos>,
                'entity_text': <entity_text>,
                'ann_type': <authority_name>,
                '<auth_id>': <auth_value_id_or_canonical_form>,
                'confidence': <confidence_if_available>,
            },
        ]

        Args:
            text_obj: The text object or string to process. A non-blank
                string is wrapped in an AnnotatedText using this authority's
                metadata.
            **kwargs: Additional keyword arguments (unused here).

        Returns:
            The Annotations returned by add_annotations, or None when there
            is nothing to annotate (text_obj is None or a blank string).
        """
        if text_obj is None:
            return None
        if isinstance(text_obj, str):
            if not text_obj.strip():
                # A blank string has nothing to annotate and must not reach
                # add_annotations, which expects an AnnotatedText.
                return None
            text_obj = dk_annots.AnnotatedText(
                text_obj,
                annots_metadata=self.metadata,
            )
        return self.add_annotations(text_obj)

    @abstractmethod
    def add_annotations(
        self,
        text_obj: dk_annots.AnnotatedText,
    ) -> dk_annots.Annotations:
        """Method to do the work of finding, validating, and adding annotations.

        Args:
            text_obj: The annotated text object to process and add annotations.

        Returns:
            The added Annotations.
        """
        raise NotImplementedError

    def validate_ann_dicts(self, ann_dicts: List[Dict[str, Any]]) -> bool:
        """The annotation row dictionaries are valid if:
          * They are non-empty
          * and
             * either there is no annotations validator
             * or they are valid according to the validator

        Args:
            ann_dicts: Annotation dictionaries.

        Returns:
            True if valid.
        """
        return len(ann_dicts) > 0 and (
            self.anns_validator is None or self.anns_validator(self, ann_dicts)
        )

    def compose(
        self,
        annotations: dk_annots.Annotations,
    ) -> dk_annots.Annotations:
        """Compose annotations into groups.

        This base implementation returns the annotations unchanged.

        Args:
            annotations: The annotations.

        Returns:
            Composed annotations.
        """
        return annotations

    def build_annotation(
        self,
        start_pos: int = None,
        end_pos: int = None,
        entity_text: str = None,
        auth_value_id: Any = None,
        conf: float = 1.0,
        **kwargs,
    ) -> Dict[str, Any]:
        """Build an annotation row dict with the given components.

        The row's annotation type is this authority's name and the
        confidence is passed to the builder as the "auth_valconf" keyword.

        Args:
            start_pos: The entity start position.
            end_pos: The entity end position.
            entity_text: The entity text.
            auth_value_id: The authority value ID (or canonical form).
            conf: The confidence for the annotation.
            **kwargs: Additional values forwarded to the row builder.

        Returns:
            The annotation row dictionary.
        """
        return self.anns_builder.build_annotation_row(
            start_pos, end_pos, entity_text, self.name, auth_value_id, auth_valconf=conf, **kwargs
        )

422 

423 

class AnnotationsValidator(ABC):
    """Base class offering helpers for validating proposed annotation rows."""

    def __call__(
        self,
        auth: Authority,
        ann_row_dicts: List[Dict[str, Any]],
    ) -> bool:
        """Make instances usable as an Authority's anns_validator function.

        Args:
            auth: The authority proposing annotations.
            ann_row_dicts: The proposed annotations.

        Returns:
            True if the annotations are valid; otherwise, False.
        """
        proposed = AnnotationsValidator.AuthAnnotations(auth, ann_row_dicts)
        return self.validate_annotation_rows(proposed)

    @abstractmethod
    def validate_annotation_rows(
        self,
        auth_annotations: "AnnotationsValidator.AuthAnnotations",
    ) -> bool:
        """Decide whether the proposed authority annotation rows are valid.

        Args:
            auth_annotations: The AuthAnnotations instance with the
                proposed data.

        Returns:
            True if valid; False if not.
        """
        raise NotImplementedError

    class AuthAnnotations:
        """Convenience wrapper around an authority and its proposed rows.

        The row accessor, Annotations object and attributes dict are each
        built on first access and then reused.
        """

        def __init__(self, auth: Authority, ann_row_dicts: List[Dict[str, Any]]):
            self.auth = auth
            self.ann_row_dicts = ann_row_dicts
            # Lazily built caches.
            self._row_accessor = None  # AnnotationsRowAccessor
            self._anns = None  # Annotations
            self._atts = None  # Dict[str, str]

        @property
        def row_accessor(self) -> dk_annots.AnnotationsRowAccessor:
            """The row accessor for this instance's annotations."""
            if self._row_accessor is not None:
                return self._row_accessor
            self._row_accessor = dk_annots.AnnotationsRowAccessor(
                self.auth.metadata, derived_cols=self.auth.field_groups
            )
            return self._row_accessor

        @property
        def anns(self) -> dk_annots.Annotations:
            """This instance's annotation rows as an Annotations object."""
            if self._anns is not None:
                return self._anns
            self._anns = dk_annots.Annotations(self.auth.metadata)
            for ann_row in self.ann_row_dicts:
                self._anns.add_dict(ann_row)
            return self._anns

        @property
        def df(self) -> pd.DataFrame:
            """The dataframe of the annotations."""
            return self.anns.df

        def get_field_type(self, row: pd.Series) -> str:
            """The entity field type value for the row."""
            accessor = self.row_accessor
            return accessor.get_col_value("field_type", row, None)

        def get_text(self, row: pd.Series) -> str:
            """The entity text for the row."""
            text_col = self.auth.metadata.text_col
            return self.row_accessor.get_col_value(text_col, row, None)

        @property
        def attributes(self) -> Dict[str, str]:
            """Mapping of each row's field type to that row's text."""
            if self._atts is None:
                collected = {}
                for _, row in self.df.iterrows():
                    field_type = self.get_field_type(row)
                    collected[field_type] = self.get_text(row)
                self._atts = collected
            return self._atts

        def colval(self, col_name, row) -> Any:
            """The named column's value from the given row."""
            return self.row_accessor.get_col_value(col_name, row)

517 

518 

class AuthorityFactory(ABC):
    """Abstract factory whose implementations construct Authority objects."""

    @abstractmethod
    def build_authority(
        self,
        name: str,
        auth_anns_builder: AuthorityAnnotationsBuilder,
        authdata: AuthorityData,
        parent_auth: Authority = None,
    ) -> Authority:
        """Create the authority for the given name and data.

        Implementations must override this method; the base version only
        raises NotImplementedError.

        Args:
            name: The authority name.
            auth_anns_builder: The authority annotations row builder to use
                for building annotation rows.
            authdata: The authority data.
            parent_auth: The parent authority.

        Returns:
            The authority.
        """
        raise NotImplementedError

543 

544 

class LexicalAuthority(Authority):
    """An authority over named entities identified by ID, each with
    associated values and variations.
    """

    def __init__(
        self,
        name: str,
        auth_anns_builder: AuthorityAnnotationsBuilder = None,
        authdata: AuthorityData = None,
        field_groups: DerivedFieldGroups = None,
        anns_validator: Callable[["Authority", Dict[str, Any]], bool] = None,
        parent_auth: "Authority" = None,
    ):
        """Set up the authority; every argument is handed to Authority.

        Args:
            name: This authority's entity name.
            auth_anns_builder: The authority annotations row builder to use
                for building annotation rows.
            authdata: The authority data.
            field_groups: The derived field groups to use.
            anns_validator: fn(auth, anns_dict_list) that returns True if
                the list of annotation row dicts are valid to be added as
                annotations for a single match or "entity".
            parent_auth: This authority's parent authority (if any).
        """
        super().__init__(
            name,
            auth_anns_builder=auth_anns_builder,
            authdata=authdata,
            field_groups=field_groups,
            anns_validator=anns_validator,
            parent_auth=parent_auth,
        )

    @abstractmethod
    def get_value_ids(self, value: Any) -> Set[Any]:
        """Collect every ID associated with the given value.

        A value typically has a single ID; returning a set allows for
        inherent ambiguities in the authority.

        Args:
            value: An authority value.

        Returns:
            The associated IDs or an empty set if the value is not valid.
        """
        raise NotImplementedError

    @abstractmethod
    def get_values_by_id(self, value_id: Any) -> Set[Any]:
        """Collect every value associated with the given value ID.

        An ID typically has a single value; returning a set allows for
        inherent ambiguities in the authority.

        Args:
            value_id: An authority value ID.

        Returns:
            The associated values or an empty set if the value is not valid.
        """
        raise NotImplementedError

    @abstractmethod
    def get_id_by_variation(self, variation: str) -> Set[str]:
        """Find the IDs of the value(s) associated with the given variation.

        Args:
            variation: Variation text.

        Returns:
            The possibly empty set of associated value IDs.
        """
        raise NotImplementedError

    @abstractmethod
    def find_variations(
        self,
        variation: str,
        starts_with: bool = False,
        ends_with: bool = False,
        scope: str = "fullmatch",
    ) -> pd.Series:
        """Find all matches to the given variation.

        Note:
            Only the first true of starts_with, ends_with, and scope will
            be applied. If none of these are true, a full match on the pattern
            is performed.

        Args:
            variation: The text to find; treated as a regular expression
                unless either starts_with or ends_with is True.
            starts_with: When True, find all terms that start with the
                variation text.
            ends_with: When True, find all terms that end with the variation
                text.
            scope: 'fullmatch' (default), 'match', or 'contains' for
                strict, less strict, and least strict matching.

        Returns:
            The matching variations as a pd.Series.
        """
        raise NotImplementedError

650 

651 

class RegexAuthority(Authority):
    """An authority whose entities are found by applying a regular
    expression to the text.
    """

    def __init__(
        self,
        name: str,
        regex: re.Pattern,
        canonical_fn: Callable[[str, str], Any] = None,
        auth_anns_builder: AuthorityAnnotationsBuilder = None,
        authdata: AuthorityData = None,
        field_groups: DerivedFieldGroups = None,
        anns_validator: Callable[[Authority, List[Dict[str, Any]]], bool] = None,
        parent_auth: "Authority" = None,
    ):
        """Initialize with this authority's entity name.

        Note:
            If the regular expression has capturing groups, each group
            that takes part in a match will result in a separate entity,
            with the group name if provided in the regular expression as
            ...(?P<group_name>group_regex)...
            When the expression has any named groups, only the named groups
            produce entities.

        Args:
            name: The authority name.
            regex: The (compiled) regular expression to apply.
            canonical_fn: A function, fn(match_text, group_name), to
                transform input matches to a canonical form as a value_id.
                The second argument is the group name for named groups, the
                group number for unnamed groups, or this authority's name
                (with the full match text) when no group matched. Note that
                the canonical form is computed before the anns_validator is
                applied and its value will be found as the value to the
                <auth_id> key.
            auth_anns_builder: The authority annotations row builder to use
                for building annotation rows.
            authdata: The authority data.
            field_groups: The derived field groups to use.
            anns_validator: A validation function for each regex match
                formed as a list of annotation row dictionaries, one row
                dictionary for each matching regex group. If the validator
                returns False, then the annotation rows will be rejected.
                For matches with groups, the <auth_name>_field key of each
                row holds the group name or number. Note that the validator
                function takes the regex authority instance as its first
                parameter to provide access to the field_groups, etc.
                The validation_fn signature is:
                fn(regexAuthority, ann_row_dicts) and returns a boolean.
            parent_auth: This authority's parent authority (if any).
        """
        super().__init__(
            name,
            auth_anns_builder=auth_anns_builder,
            authdata=authdata,
            field_groups=field_groups,
            anns_validator=anns_validator,
            parent_auth=parent_auth,
        )
        self.regex = regex
        self.canonical_fn = canonical_fn

    def has_value(self, value: Any) -> re.Match:
        """Determine whether the given value is in this authority.

        The value is converted with str() and matched from its start using
        the pattern's match() method.

        Args:
            value: A possible authority value.

        Returns:
            None if the value is not a valid entity value; otherwise,
            return the re.Match object.
        """
        return self.regex.match(str(value))

    def add_annotations(
        self,
        text_obj: dk_annots.AnnotatedText,
    ) -> dk_annots.Annotations:
        """Method to do the work of finding, validating, and adding annotations.

        Each regex match is turned into a list of annotation row dicts; the
        list is added to the text object's annotations only if it passes
        validate_ann_dicts.

        Args:
            text_obj: The annotated text object to process and add annotations.

        Returns:
            The text object's Annotations.
        """
        for match in re.finditer(self.regex, text_obj.text):
            ann_dicts = self._build_match_ann_dicts(match)
            if self.validate_ann_dicts(ann_dicts):
                # Add non-empty, valid annotation dicts to the result
                text_obj.annotations.add_dicts(ann_dicts)
        return text_obj.annotations

    def _build_match_ann_dicts(self, match: re.Match) -> List[Dict[str, Any]]:
        """Build the annotation row dicts for a single regex match."""
        if match.lastindex is None:
            # No group took part in the match: annotate the full match.
            full_text = match.group()
            return [
                self.build_annotation(
                    start_pos=match.start(),
                    end_pos=match.end(),
                    entity_text=full_text,
                    auth_value_id=self.get_canonical_form(full_text, self.name),
                )
            ]

        if len(self.regex.groupindex) > 0:  # we have named groups
            labeled_groups = list(self.regex.groupindex.items())
        else:  # we have only numbers for groups
            labeled_groups = [(num, num) for num in range(1, len(match.groups()) + 1)]

        field_type_col = self.field_groups.get_field_type_col(self.name)
        ann_dicts = []
        for group_label, group_num in labeled_groups:
            group_text = match.group(group_num)
            if group_text is None:
                # The group did not take part in this match (e.g. an
                # optional group or an unused alternative); its text is
                # None and its span is (-1, -1), so there is no entity.
                continue
            ann_dicts.append(
                self.build_annotation(
                    start_pos=match.start(group_num),
                    end_pos=match.end(group_num),
                    entity_text=group_text,
                    auth_value_id=self.get_canonical_form(group_text, group_label),
                    **{field_type_col: group_label},
                )
            )
        return ann_dicts

    def get_canonical_form(self, entity_text: str, entity_type: Union[str, int]) -> Any:
        """Get the canonical form (value ID) for the matched text.

        Args:
            entity_text: The matched text.
            entity_type: The group name, group number, or authority name
                identifying what was matched.

        Returns:
            canonical_fn(entity_text, entity_type) when a canonical_fn was
            supplied; otherwise the entity_text unchanged.
        """
        if self.canonical_fn is not None:
            entity_text = self.canonical_fn(entity_text, entity_type)
        return entity_text

783 

784 

class AuthoritiesBundle(Authority):
    """An authority that expresses its values through several bundled
    "authorities", like dictionary-based and/or multiple regular expression
    patterns.
    """

    def __init__(
        self,
        name: str,
        auth_anns_builder: AuthorityAnnotationsBuilder = None,
        authdata: AuthorityData = None,
        field_groups: DerivedFieldGroups = None,
        parent_auth: "Authority" = None,
        anns_validator: Callable[["Authority", Dict[str, Any]], bool] = None,
        auths: List[Authority] = None,
    ):
        """Set up the bundle.

        Args:
            name: This authority's entity name.
            auth_anns_builder: The authority annotations row builder to use
                for building annotation rows.
            authdata: The authority data.
            field_groups: The derived field groups to use.
            parent_auth: This authority's parent authority (if any).
            anns_validator: fn(auth, anns_dict_list) that returns True if
                the list of annotation row dicts are valid to be added as
                annotations for a single match or "entity".
            auths: The authorities to bundle together; the list is copied,
                and an empty bundle is created when None.
        """
        super().__init__(
            name,
            auth_anns_builder=auth_anns_builder,
            authdata=authdata,
            field_groups=field_groups,
            anns_validator=anns_validator,
            parent_auth=parent_auth,
        )
        if auths is None:
            self.auths = []
        else:
            self.auths = auths.copy()

    def add(self, auth: Authority):
        """Append an authority to this bundle.

        Args:
            auth: The authority to add.
        """
        self.auths.append(auth)

    def has_value(self, value: Any) -> bool:
        """Report whether any bundled authority has the given value.

        Args:
            value: A possible authority value.

        Returns:
            True if the value is a valid entity value.
        """
        return any(member.has_value(value) for member in self.auths)

    def add_annotations(
        self,
        text_obj: dk_annots.AnnotatedText,
    ) -> dk_annots.Annotations:
        """Run every bundled authority's annotate_input over the text object.

        Args:
            text_obj: The annotated text object to process and add annotations.

        Returns:
            The text object's Annotations.
        """
        for member in self.auths:
            member.annotate_input(text_obj)
        return text_obj.annotations