Coverage for nlp_manager/constants.py: 98%

123 statements  

« prev     ^ index     » next       coverage.py v7.8.0, created at 2026-02-05 06:46 -0600

"""
crate_anon/nlp_manager/constants.py

===============================================================================

    Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
    Created by Rudolf Cardinal (rnc1001@cam.ac.uk).

    This file is part of CRATE.

    CRATE is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    CRATE is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with CRATE. If not, see <https://www.gnu.org/licenses/>.

===============================================================================

**Constants for CRATE NLP, including a demo config file.**

"""

29 

30from cardinal_pythonlib.hash import HmacMD5Hasher 

31from sqlalchemy.types import String 

32 

33 

# =============================================================================
# Constants
# =============================================================================

37 

# Default values. NOTE(review): the DEFAULT_CLOUD_* names suggest these are
# fallbacks for the cloud NLP config settings -- confirm against callers.
DEFAULT_CLOUD_LIMIT_BEFORE_COMMIT = 1000
DEFAULT_CLOUD_MAX_CONTENT_LENGTH = 0  # no limit
DEFAULT_CLOUD_MAX_RECORDS_PER_REQUEST = 1000
DEFAULT_CLOUD_MAX_TRIES = 5
DEFAULT_CLOUD_RATE_LIMIT_HZ = 2
DEFAULT_CLOUD_WAIT_ON_CONN_ERR_S = 180  # in seconds
DEFAULT_REPORT_EVERY_NLP = 500  # low values slow down processing
DEFAULT_TEMPORARY_TABLENAME = "_crate_nlp_temptable"

# FN_*: presumably field (column) names, all underscore-prefixed -- confirm
# against the code that creates the NLP output tables.
FN_CRATE_VERSION_FIELD = "_crate_version"  # new in v0.18.53
FN_NLPDEF = "_nlpdef"
FN_PK = "_pk"
FN_SRCDATETIMEFIELD = "_srcdatetimefield"  # new in v0.18.52
FN_SRCDATETIMEVAL = "_srcdatetimeval"  # new in v0.18.52
FN_SRCDB = "_srcdb"
FN_SRCFIELD = "_srcfield"
FN_SRCPKFIELD = "_srcpkfield"
FN_SRCPKSTR = "_srcpkstr"
FN_SRCPKVAL = "_srcpkval"
FN_SRCTABLE = "_srctable"
FN_WHEN_FETCHED = "_when_fetched_utc"  # new in v0.18.53

TRUNCATED_FLAG = "_truncated"  # NOT A FIELD/COLUMN NAME. INTERNAL USE ONLY.

GATE_PIPELINE_CLASSNAME = "CrateGatePipeline"

63 

# Alias for the hasher class imported above from cardinal_pythonlib.hash.
HashClass = HmacMD5Hasher

65 

MAX_STRING_PK_LENGTH = 64  # trade-off; space versus capability
MAX_SQL_FIELD_LEN = 64
# ... http://dev.mysql.com/doc/refman/5.0/en/identifiers.html
MAX_SEMANTIC_VERSION_STRING_LENGTH = (
    147  # https://github.com/mojombo/semver/issues/79
)
MEDEX_PIPELINE_CLASSNAME = "CrateMedexPipeline"
MEDEX_DATA_READY_SIGNAL = "data_ready"
MEDEX_RESULTS_READY_SIGNAL = "results_ready"

# NOTE(review): presumably the name of the environment variable that points
# at the NLP config file -- confirm against the code that reads it.
NLP_CONFIG_ENV_VAR = "CRATE_NLP_CONFIG"

77 

SqlTypeDbIdentifier = String(MAX_SQL_FIELD_LEN)
# ... text field used for database names, table names, and field names

80 

81 

# =============================================================================
# Simple classes for string constant collections
# =============================================================================

85 

86 

class NlpConfigPrefixes:
    """
    Section name prefixes for the NLP config file.

    Every public attribute value of this class is collected into the list
    that ``full_sectionname()`` uses to validate its ``section_type``
    argument.
    """

    NLPDEF = "nlpdef"
    PROCESSOR = "processor"
    ENV = "env"
    OUTPUT = "output"
    INPUT = "input"
    DATABASE = "database"
    CLOUD = "cloud"

99 

100 

class NlpDefConfigKeys:
    """
    Config file keys for NLP definitions.

    Each attribute holds the literal key string used in the config file.
    """

    INPUTFIELDDEFS = "inputfielddefs"
    PROCESSORS = "processors"
    PROGRESSDB = "progressdb"
    HASHPHRASE = "hashphrase"
    TEMPORARY_TABLENAME = "temporary_tablename"
    MAX_ROWS_BEFORE_COMMIT = "max_rows_before_commit"
    MAX_BYTES_BEFORE_COMMIT = "max_bytes_before_commit"
    TRUNCATE_TEXT_AT = "truncate_text_at"
    RECORD_TRUNCATED_VALUES = "record_truncated_values"
    CLOUD_CONFIG = "cloud_config"
    CLOUD_REQUEST_DATA_DIR = "cloud_request_data_dir"

117 

118 

class NlpDefValues:
    """
    Config file values for NLP definitions.
    """

    # Since any server with the same output format as CRATE's is compatible,
    # we call this format standard
    FORMAT_STANDARD = "Standard"
    FORMAT_GATE = "GATE"

128 

129 

class InputFieldConfigKeys:
    """
    Config file keys for input database fields (columns).

    Each attribute holds the literal key string used in the config file.
    """

    SRCDB = "srcdb"
    SRCTABLE = "srctable"
    SRCPKFIELD = "srcpkfield"
    SRCFIELD = "srcfield"
    SRCDATETIMEFIELD = "srcdatetimefield"
    COPYFIELDS = "copyfields"
    INDEXED_COPYFIELDS = "indexed_copyfields"
    DEBUG_ROW_LIMIT = "debug_row_limit"

143 

144 

class ProcessorConfigKeys:
    """
    Config file keys for NLP processors.

    Each attribute holds the literal key string used in the config file.
    """

    ASSUME_PREFERRED_UNIT = "assume_preferred_unit"
    DESTDB = "destdb"
    DESTTABLE = "desttable"
    OUTPUTTYPEMAP = "outputtypemap"
    PROGARGS = "progargs"
    PROGENVSECTION = "progenvsection"
    INPUT_TERMINATOR = "input_terminator"
    OUTPUT_TERMINATOR = "output_terminator"
    MAX_EXTERNAL_PROG_USES = "max_external_prog_uses"
    PROCESSOR_NAME = "processor_name"
    PROCESSOR_VERSION = "processor_version"
    PROCESSOR_FORMAT = "processor_format"

162 

163 

class NlpOutputConfigKeys:
    """
    Config file keys for output tables from GATE or Cloud NLP processors.

    Each attribute holds the literal key string used in the config file.
    """

    DESTTABLE = "desttable"
    RENAMES = "renames"
    NULL_LITERALS = "null_literals"
    DESTFIELDS = "destfields"
    INDEXDEFS = "indexdefs"

174 

175 

class DatabaseConfigKeys:
    """
    Config file keys for database definitions.

    Each attribute holds the literal key string used in the config file.
    """

    URL = "url"
    ECHO = "echo"

183 

184 

class CloudNlpConfigKeys:
    """
    Config file keys for cloud NLP.

    Each attribute holds the literal key string used in the config file.
    """

    CLOUD_URL = "cloud_url"
    VERIFY_SSL = "verify_ssl"
    COMPRESS = "compress"
    USERNAME = "username"
    PASSWORD = "password"
    WAIT_ON_CONN_ERR = "wait_on_conn_err"
    MAX_CONTENT_LENGTH = "max_content_length"
    LIMIT_BEFORE_COMMIT = "limit_before_commit"
    MAX_RECORDS_PER_REQUEST = "max_records_per_request"
    STOP_AT_FAILURE = "stop_at_failure"
    MAX_TRIES = "max_tries"
    RATE_LIMIT_HZ = "rate_limit_hz"
    TEST_LENGTH_FUNCTION_SPEED = "test_length_function_speed"

203 

204 

class GateApiKeys:
    """
    Dictionary keys for the direct API to GATE.

    See https://cloud.gate.ac.uk/info/help/online-api.html for format of
    response from processor. The GATE JSON format is:

    .. code-block:: json

        {
            "text":"The text of the document",
            "entities":{
                "SampleAnnotationType1":[
                    {
                        "indices":[0,3],
                        "feature1":"value1",
                        "feature2":"value2"
                    }
                ],
                "SampleAnnotationType2":[
                    {
                        "indices":[12,15],
                        "feature3":"value3"
                    }
                ]
            }
        }
    """

    ENTITIES = "entities"
    INDICES = "indices"
    TEXT = "text"

237 

238 

class GateResultKeys:
    """
    Dictionary keys to represent GATE results in our NLPRP server.
    """

    TYPE = "type"
    START = "start"
    END = "end"
    SET = "set"
    FEATURES = "features"

249 

250 

class GateFieldNames:
    """
    Field (column) names for results from GATE.
    These match KEY_* strings in ``CrateGatePipeline.java``.
    """

    SET = "_set"
    TYPE = "_type"
    ID = "_id"
    STARTPOS = "_start"
    ENDPOS = "_end"
    CONTENT = "_content"

263 

264 

# =============================================================================
# Config helpers
# =============================================================================

268 

# Every recognised section-type prefix: the values of all attributes of
# NlpConfigPrefixes whose names do not start with an underscore (which
# excludes dunder entries such as __module__ and __doc__).
_ALL_NLPRP_SECTION_PREFIXES = [
    value
    for name, value in vars(NlpConfigPrefixes).items()
    if not name.startswith("_")
]

272 

273 

def full_sectionname(section_type: str, section: str) -> str:
    """
    Build the full config section name for a section of a given type.

    Args:
        section_type: a recognised section prefix, i.e. a member of
            ``_ALL_NLPRP_SECTION_PREFIXES``.
        section: the section's own name.

    Returns:
        ``section_type`` and ``section`` joined by a colon.

    Raises:
        ValueError: if ``section_type`` is not a recognised prefix.
    """
    if section_type not in _ALL_NLPRP_SECTION_PREFIXES:
        raise ValueError(f"Unrecognised section type: {section_type}")
    return section_type + ":" + section