Coverage for nlp_manager/models.py: 100%
22 statements
« prev ^ index » next coverage.py v7.8.0, created at 2025-08-27 10:34 -0500
« prev ^ index » next coverage.py v7.8.0, created at 2025-08-27 10:34 -0500
1"""
2crate_anon/nlp_manager/models.py
4===============================================================================
6 Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
7 Created by Rudolf Cardinal (rnc1001@cam.ac.uk).
9 This file is part of CRATE.
11 CRATE is free software: you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation, either version 3 of the License, or
14 (at your option) any later version.
16 CRATE is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
21 You should have received a copy of the GNU General Public License
22 along with CRATE. If not, see <https://www.gnu.org/licenses/>.
24===============================================================================
26**SQLAlchemy ORM models for the NLP progress database.**
28"""
30from sqlalchemy.orm import declarative_base
31from sqlalchemy.schema import Column, Index, MetaData
32from sqlalchemy.types import BigInteger, DateTime, Integer, String
34from crate_anon.anonymise.constants import COMMENT, TABLE_KWARGS
35from crate_anon.nlp_manager.constants import (
36 HashClass,
37 MAX_STRING_PK_LENGTH,
38 SqlTypeDbIdentifier,
39)
# Metadata object for the NLP progress database, and the declarative base
# class bound to it. ORM classes deriving from ProgressBase register their
# tables in progress_meta.
progress_meta = MetaData()
ProgressBase = declarative_base(metadata=progress_meta)


# =============================================================================
# Global constants
# =============================================================================

# Database column name used for the source-content hash.
FN_SRCHASH = "srchash"

# SQLAlchemy column type used to store hashes. It is obtained from a
# HashClass instance built with a placeholder salt; presumably the salt has
# no bearing on the column type returned (confirm against HashClass).
SqlTypeHash = HashClass("dummysalt").sqla_column_type()
54# =============================================================================
55# Record of progress
56# =============================================================================
class NlpRecord(ProgressBase):
    """
    Progress-tracking row: notes that one source record has been processed
    for one particular kind of NLP, and stores a hash of the source contents
    so that altered source contents can be identified later.
    """

    __tablename__ = "crate_nlp_progress"

    # Table-level configuration: one composite index, followed by a dict of
    # table keyword arguments. Background reading:
    # https://stackoverflow.com/questions/6626810/multiple-columns-index-when-using-the-declarative-orm-extension-of-sqlalchemy  # noqa: E501
    # http://docs.sqlalchemy.org/en/latest/orm/extensions/declarative/table_config.html  # noqa: E501
    #
    # Notes on the index:
    # - Performance is critical here, so the fields are listed in descending
    #   order of specificity; see
    #   https://stackoverflow.com/questions/2292662/how-important-is-the-order-of-columns-in-indexes  # noqa: E501
    # - srcpkval leads because it is both specific and an integer.
    # - srcpkfield is not indexed: the source table can only have one PK.
    # - srcpkstr must be included, because srcpkval can be non-unique (hash
    #   collisions) when the source PK is a string; it is (probably best)
    #   placed last, since queries for tables with an integer PK may not
    #   use it.
    # - The index is UNIQUE despite containing a NULLable field; this is OK
    #   for SQL Server 2008+ (https://stackoverflow.com/questions/767657)
    #   and MySQL also seems happy.
    __table_args__ = (
        Index(
            "_idx1",  # name of the index
            "srcpkval",  # integer; the most specific field
            "nlpdef",  # usually >1 NLP def to 1 db/table/field combo
            "srcfield",  # } roughly, more to less specific?
            "srctable",  # }
            "srcdb",  # }
            "srcpkstr",  # last, as it may go unused
            unique=True,
        ),
        {COMMENT: "CRATE NLP progress table", **TABLE_KWARGS},
    )

    # Surrogate primary key. BigInteger in general, but plain Integer on
    # SQLite; see https://docs.sqlalchemy.org/en/20/dialects/sqlite.html
    pk = Column(
        name="pk",
        type_=BigInteger().with_variant(Integer, "sqlite"),
        primary_key=True,
        autoincrement=True,
        comment="PK of NLP record (no specific use)",
    )

    # NOTE: srcdb, srctable, srcpkval, srcpkstr, srcfield and nlpdef each
    # used to carry a commented-out "primary_key=True" (with "default=''"
    # for srcpkstr, because a composite PK cannot contain a NULL). None of
    # them is part of the primary key as the class stands.
    srcdb = Column(
        name="srcdb",
        type_=SqlTypeDbIdentifier,
        comment="Source database",
    )
    srctable = Column(
        name="srctable",
        type_=SqlTypeDbIdentifier,
        comment="Source table name",
    )
    srcpkfield = Column(
        name="srcpkfield",
        type_=SqlTypeDbIdentifier,
        comment="Primary key column name in source table (for info only)",
    )
    srcpkval = Column(
        name="srcpkval",
        type_=BigInteger,
        comment=(
            "Primary key value in source table "
            "(or hash if PK is a string)"
        ),
    )
    srcpkstr = Column(
        name="srcpkstr",
        type_=String(MAX_STRING_PK_LENGTH),
        comment=(
            "Original string PK, used when the table has a string PK, to "
            "deal with hash collisions. Max length: "
            f"{MAX_STRING_PK_LENGTH}"
        ),
    )
    srcfield = Column(
        name="srcfield",
        type_=SqlTypeDbIdentifier,
        comment="Name of column in source field containing actual data",
    )
    nlpdef = Column(
        name="nlpdef",
        type_=SqlTypeDbIdentifier,
        comment=(
            "Name of natural language processing definition that "
            "source was processed for"
        ),
    )
    whenprocessedutc = Column(
        name="whenprocessedutc",
        type_=DateTime,
        comment=(
            "Time that NLP record was processed (batch time that the "
            "run was commenced for that NLP definition; UTC)"
        ),
    )
    srchash = Column(
        name=FN_SRCHASH,
        type_=SqlTypeHash,
        comment=(
            "Secure hash of source field contents at the time of processing"
        ),
    )