Coverage for nlp_manager/models.py: 100%

22 statements  

« prev     ^ index     » next       coverage.py v7.8.0, created at 2025-08-27 10:34 -0500

1""" 

2crate_anon/nlp_manager/models.py 

3 

4=============================================================================== 

5 

6 Copyright (C) 2015, University of Cambridge, Department of Psychiatry. 

7 Created by Rudolf Cardinal (rnc1001@cam.ac.uk). 

8 

9 This file is part of CRATE. 

10 

11 CRATE is free software: you can redistribute it and/or modify 

12 it under the terms of the GNU General Public License as published by 

13 the Free Software Foundation, either version 3 of the License, or 

14 (at your option) any later version. 

15 

16 CRATE is distributed in the hope that it will be useful, 

17 but WITHOUT ANY WARRANTY; without even the implied warranty of 

18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

19 GNU General Public License for more details. 

20 

21 You should have received a copy of the GNU General Public License 

22 along with CRATE. If not, see <https://www.gnu.org/licenses/>. 

23 

24=============================================================================== 

25 

26**SQLAlchemy ORM models for the NLP progress database.** 

27 

28""" 

29 

30from sqlalchemy.orm import declarative_base 

31from sqlalchemy.schema import Column, Index, MetaData 

32from sqlalchemy.types import BigInteger, DateTime, Integer, String 

33 

34from crate_anon.anonymise.constants import COMMENT, TABLE_KWARGS 

35from crate_anon.nlp_manager.constants import ( 

36 HashClass, 

37 MAX_STRING_PK_LENGTH, 

38 SqlTypeDbIdentifier, 

39) 

40 

# The NLP progress tables are registered on their own MetaData object, and
# all ORM classes for the progress database derive from ProgressBase.
progress_meta = MetaData()
ProgressBase = declarative_base(metadata=progress_meta)


# =============================================================================
# Global constants
# =============================================================================

# Database column name used for the source-hash column of the progress table.
FN_SRCHASH = "srchash"

# SQLAlchemy column type used to store hash values. It is obtained from a
# hasher instance built with a placeholder salt -- presumably only the column
# type is of interest here, not the hasher itself (TODO confirm).
SqlTypeHash = HashClass("dummysalt").sqla_column_type()

52 

53 

54# ============================================================================= 

55# Record of progress 

56# ============================================================================= 

57 

58 

class NlpRecord(ProgressBase):
    """
    ORM model for one row of the NLP progress table.

    A row records that a given source record (identified by source database,
    table, field and primary key) has been processed for a given NLP
    definition. It also keeps a hash of the source contents, so that source
    records whose contents have since been altered can be identified later.
    """

    __tablename__ = "crate_nlp_progress"

    # Table-level configuration: one composite unique index, then the table
    # keyword arguments.
    # https://stackoverflow.com/questions/6626810/multiple-columns-index-when-using-the-declarative-orm-extension-of-sqlalchemy  # noqa: E501
    # http://docs.sqlalchemy.org/en/latest/orm/extensions/declarative/table_config.html  # noqa: E501
    __table_args__ = (
        Index(
            # Index name:
            "_idx1",
            # Indexed columns. Performance is critical here, so they are
            # listed in descending order of specificity; see
            # https://stackoverflow.com/questions/2292662/how-important-is-the-order-of-columns-in-indexes  # noqa: E501
            #
            # - srcpkval comes first: it is both specific and an integer.
            # - nlpdef: there is usually more than one NLP definition per
            #   db/table/field combination.
            # - srcfield, srctable, srcdb: roughly, more to less specific.
            # - srcpkfield is not indexed, because the source table can only
            #   have one PK.
            # - srcpkstr must be included, since srcpkval can be non-unique
            #   (hash collisions) when the source PK is a string; it goes
            #   last because queries for tables with an integer PK may not
            #   use it.
            "srcpkval",
            "nlpdef",
            "srcfield",
            "srctable",
            "srcdb",
            "srcpkstr",
            # A UNIQUE index containing a NULL-able field is OK for
            # SQL Server 2008+ (https://stackoverflow.com/questions/767657),
            # and MySQL also seems happy.
            unique=True,
        ),
        {COMMENT: "CRATE NLP progress table", **TABLE_KWARGS},
    )

    # NOTE: the source-identifying columns below (srcdb, srctable, srcpkval,
    # srcpkstr, srcfield, nlpdef) are deliberately NOT declared as a composite
    # primary key: a composite PK can't contain a NULL, which srcpkstr would
    # need to allow. A surrogate "pk" column plus the unique index above is
    # used instead.

    # -------------------------------------------------------------------------
    # Surrogate key
    # -------------------------------------------------------------------------

    pk = Column(
        "pk",
        # BigInteger in general, with a plain Integer variant for SQLite; see
        # https://docs.sqlalchemy.org/en/20/dialects/sqlite.html
        BigInteger().with_variant(Integer, "sqlite"),
        primary_key=True,
        autoincrement=True,
        comment="PK of NLP record (no specific use)",
    )

    # -------------------------------------------------------------------------
    # Identification of the source record
    # -------------------------------------------------------------------------

    srcdb = Column(
        "srcdb",
        SqlTypeDbIdentifier,
        comment="Source database",
    )
    srctable = Column(
        "srctable",
        SqlTypeDbIdentifier,
        comment="Source table name",
    )
    srcpkfield = Column(
        "srcpkfield",
        SqlTypeDbIdentifier,
        comment="Primary key column name in source table (for info only)",
    )
    srcpkval = Column(
        "srcpkval",
        BigInteger,
        comment=(
            "Primary key value in source table (or hash if PK is a string)"
        ),
    )
    srcpkstr = Column(
        "srcpkstr",
        String(MAX_STRING_PK_LENGTH),
        comment=(
            f"Original string PK, used when the table has a string PK, to "
            f"deal with hash collisions. Max length: "
            f"{MAX_STRING_PK_LENGTH}"
        ),
    )
    srcfield = Column(
        "srcfield",
        SqlTypeDbIdentifier,
        comment="Name of column in source field containing actual data",
    )

    # -------------------------------------------------------------------------
    # What was done to it, when, and what the source looked like at the time
    # -------------------------------------------------------------------------

    nlpdef = Column(
        "nlpdef",
        SqlTypeDbIdentifier,
        comment=(
            "Name of natural language processing definition that source "
            "was processed for"
        ),
    )
    whenprocessedutc = Column(
        "whenprocessedutc",
        DateTime,
        comment=(
            "Time that NLP record was processed (batch time that the run "
            "was commenced for that NLP definition; UTC)"
        ),
    )
    srchash = Column(
        FN_SRCHASH,
        SqlTypeHash,
        comment=(
            "Secure hash of source field contents at the time of "
            "processing"
        ),
    )

160 comment="Secure hash of source field contents at the time of " 

161 "processing", 

162 )