Coverage for nlp_manager/output_user_config.py: 54%

102 statements  

« prev     ^ index     » next       coverage.py v7.8.0, created at 2025-08-27 10:34 -0500

1""" 

2crate_anon/nlp_manager/output_user_config.py 

3 

4=============================================================================== 

5 

6 Copyright (C) 2015, University of Cambridge, Department of Psychiatry. 

7 Created by Rudolf Cardinal (rnc1001@cam.ac.uk). 

8 

9 This file is part of CRATE. 

10 

11 CRATE is free software: you can redistribute it and/or modify 

12 it under the terms of the GNU General Public License as published by 

13 the Free Software Foundation, either version 3 of the License, or 

14 (at your option) any later version. 

15 

16 CRATE is distributed in the hope that it will be useful, 

17 but WITHOUT ANY WARRANTY; without even the implied warranty of 

18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

19 GNU General Public License for more details. 

20 

21 You should have received a copy of the GNU General Public License 

22 along with CRATE. If not, see <https://www.gnu.org/licenses/>. 

23 

24=============================================================================== 

25 

26**Define output configuration for GATE NLP applications.** 

27 

28""" 

29 

30import ast 

31import logging 

32import shlex 

33from typing import Dict, List 

34 

35from cardinal_pythonlib.sql.validation import ( 

36 ensure_valid_field_name, 

37 ensure_valid_table_name, 

38 is_sqltype_valid, 

39) 

40from cardinal_pythonlib.lists import chunks 

41from cardinal_pythonlib.sqlalchemy.schema import ( 

42 get_sqla_coltype_from_dialect_str, 

43) 

44from sqlalchemy.engine.base import Engine 

45from sqlalchemy.schema import Column, Index 

46 

47from crate_anon.common.extendedconfigparser import ( 

48 ConfigSection, 

49 ExtendedConfigParser, 

50) 

51from crate_anon.nlp_manager.constants import ( 

52 full_sectionname, 

53 NlpOutputConfigKeys, 

54 NlpConfigPrefixes, 

55) 

56from crate_anon.nlp_manager.input_field_config import InputFieldConfig 

57 

58log = logging.getLogger(__name__) 

59 

60 

61# ============================================================================= 

62# OutputUserConfig 

63# ============================================================================= 

64 

65 

class OutputUserConfig:
    """
    Class defining configuration for the output of a given GATE app, or remote
    cloud app.

    The configuration is read once, in the constructor, from a single config
    file section. It describes the destination table, the destination
    fields (name, SQL datatype, optional comment), any indexes on those
    fields, a mapping used to rename annotation/remote column names to local
    column names, and the list of strings to be treated as NULL.

    See the documentation for the :ref:`NLP config file <nlp_config>`.
    """

    def __init__(
        self,
        config_parser: ExtendedConfigParser,
        cfg_output_name: str,
        schema_required: bool = True,
    ) -> None:
        """
        Read config from a configparser section.

        Args:
            config_parser:
                :class:`crate_anon.common.extendedconfigparser.ExtendedConfigParser`
            cfg_output_name:
                config file section name suffix -- this is the second of the
                pair of strings in the ``outputtypemap`` part of the GATE NLP
                app config section. See

                - :ref:`NLP config file <nlp_config>`
                - :class:`crate_anon.nlp_manager.parse_gate.Gate`
            schema_required:
                is it required that the user has specified destfields?
                Should be true for Gate, False for Cloud as the remote
                processors may have their own schema definition. Note that
                the destination table name is always required by this
                constructor, regardless of this flag.

        Raises:
            ValueError:
                if a ``renames`` line does not contain exactly two items; if
                a destination field line lacks a name or datatype, or has an
                invalid datatype; if a destination field duplicates another
                or clashes with an auto-supplied core column; if the index
                definitions are not (field, length) pairs, refer to an
                unknown field, or have an unparseable length.
        """

        sectionname = full_sectionname(
            NlpConfigPrefixes.OUTPUT, cfg_output_name
        )
        cfg = ConfigSection(section=sectionname, parser=config_parser)

        # ---------------------------------------------------------------------
        # desttable
        # ---------------------------------------------------------------------

        self._desttable = cfg.opt_str(
            NlpOutputConfigKeys.DESTTABLE, required=True
        )
        ensure_valid_table_name(self._desttable)

        # ---------------------------------------------------------------------
        # renames
        # ---------------------------------------------------------------------

        self._renames = {}  # type: Dict[str, str]
        rename_lines = cfg.opt_strlist(
            NlpOutputConfigKeys.RENAMES, required=False, as_words=False
        )
        for line in rename_lines:
            if not line.strip():
                continue
            # shlex allows quoted names containing spaces.
            words = shlex.split(line)
            if len(words) != 2:
                raise ValueError(
                    f"Bad {NlpOutputConfigKeys.RENAMES!r} option in config "
                    f"section {sectionname!r}; line was {line!r} but should "
                    f"have contained two things"
                )
            annotation_or_remote_column_name, to_column_name = words
            # Only the local (destination) name must be a valid SQL field.
            ensure_valid_field_name(to_column_name)
            self._renames[annotation_or_remote_column_name] = to_column_name

        # ---------------------------------------------------------------------
        # null_literals
        # ---------------------------------------------------------------------

        null_literal_lines = cfg.opt_strlist(
            NlpOutputConfigKeys.NULL_LITERALS, required=False, as_words=False
        )
        self._null_literals = []  # type: List[str]
        for line in null_literal_lines:
            self._null_literals += shlex.split(line)

        # ---------------------------------------------------------------------
        # destfields
        # ---------------------------------------------------------------------

        # Three parallel lists, always appended to together.
        self._destfields = []  # type: List[str]
        self._dest_datatypes = []  # type: List[str]
        # Entries here are None for fields defined without a comment.
        self._dest_comments = []  # type: List[str]
        dest_field_lines = cfg.opt_strlist(
            NlpOutputConfigKeys.DESTFIELDS,
            required=schema_required,
            as_words=False,
        )
        # ... comments will be removed during that process.
        # If dest_field_lines is empty (as it may be for a Cloud processor)
        # the following block doesn't execute, so the 'dest' attributes remain
        # empty
        for dfl in dest_field_lines:
            field, datatype, comment = self._split_destfield_line(dfl)
            ensure_valid_field_name(field)
            if not is_sqltype_valid(datatype):
                raise ValueError(f"Invalid datatype for {field}: {datatype}")
            self._destfields.append(field)
            self._dest_datatypes.append(datatype)
            self._dest_comments.append(comment)

        # The core columns are added automatically elsewhere, so the user
        # must not define them again.
        src_fields = [
            c.name for c in InputFieldConfig.get_core_columns_for_dest()
        ]
        for sf in src_fields:
            if sf in self._destfields:
                raise ValueError(
                    f"For section {sectionname}, destination field {sf} is "
                    f"auto-supplied; do not add it manually"
                )

        if len(set(self._destfields)) != len(self._destfields):
            raise ValueError(
                f"Duplicate fields exist in destination fields: "
                f"{self._destfields}"
            )

        # ---------------------------------------------------------------------
        # indexdefs
        # ---------------------------------------------------------------------

        # Two parallel lists; a length of None means "no length specified".
        self._indexfields = []  # type: List[str]
        self._indexlengths = []  # type: List[int]
        indexdefs = cfg.opt_strlist(NlpOutputConfigKeys.INDEXDEFS)
        if indexdefs:
            # The words must come in (field, length) pairs; a dangling
            # field name would otherwise surface as an opaque IndexError.
            if len(indexdefs) % 2 != 0:
                raise ValueError(
                    f"Bad {NlpOutputConfigKeys.INDEXDEFS!r} option in config "
                    f"section {sectionname!r}: expected (field, length) "
                    f"pairs but got {indexdefs}"
                )
            for indexfieldname, lengthstr in chunks(indexdefs, 2):
                if indexfieldname not in self._destfields:
                    raise ValueError(
                        f"Index field {indexfieldname} not in "
                        f"destination fields {self._destfields}"
                    )
                length = self._parse_index_length(lengthstr)
                self._indexfields.append(indexfieldname)
                self._indexlengths.append(length)

    @staticmethod
    def _split_destfield_line(dfl: str):
        """
        Split one destination field definition line into its parts.

        Args:
            dfl: line of the form ``name datatype [comment ...]``

        Returns:
            tuple: ``field, datatype, comment``, where the datatype has been
            converted to upper case and the comment is ``None`` if absent

        Raises:
            ValueError: if the line has fewer than two whitespace-separated
                parts
        """
        # maxsplit=2 keeps the whole remainder of the line as the comment.
        parts = dfl.split(maxsplit=2)
        if len(parts) < 2:
            # Raise rather than assert: asserts vanish under "python -O".
            raise ValueError(f"Bad field definition line: {dfl!r}")
        field = parts[0]
        datatype = parts[1].upper()
        comment = parts[2] if len(parts) > 2 else None
        return field, datatype, comment

    @staticmethod
    def _parse_index_length(lengthstr: str):
        """
        Convert an index length from the config file to an integer or None.

        Args:
            lengthstr: a Python literal as text, e.g. ``"50"`` or ``"None"``

        Returns:
            the length as an ``int``, or ``None`` if the literal was ``None``

        Raises:
            ValueError: if the text is not a Python literal, or is a literal
                that cannot be converted to an integer
        """
        try:
            length = ast.literal_eval(lengthstr)
            if length is not None:
                length = int(length)
        except (ValueError, SyntaxError, TypeError) as exc:
            # literal_eval raises SyntaxError for text such as "10x"; int()
            # raises TypeError for literals such as "[1]".
            raise ValueError(f"Bad index length: {lengthstr}") from exc
        return length

    @property
    def dest_tablename(self) -> str:
        """
        Returns the name of the destination table.
        """
        return self._desttable

    @property
    def destfields(self) -> List[str]:
        """
        Returns the list of destination fields.
        """
        return self._destfields

    def get_columns(self, engine: Engine) -> List[Column]:
        """
        Return all SQLAlchemy :class:`Column` definitions for the destination
        table.

        Args:
            engine: SQLAlchemy database :class:`Engine`; its dialect is used
                to convert each datatype string to an SQLAlchemy column type

        Returns:
            list of SQLAlchemy :class:`Column` objects, one per destination
            field, in configuration order

        """
        return [
            Column(
                field,
                get_sqla_coltype_from_dialect_str(datatype, engine.dialect),
                comment=comment,
            )
            for field, datatype, comment in zip(
                self._destfields, self._dest_datatypes, self._dest_comments
            )
        ]

    @property
    def indexes(self) -> List[Index]:
        """
        Return all SQLAlchemy :class:`Index` definitions for the destination
        table.

        Each index is named ``_idx_<field>``. Where a length was configured,
        it is passed to the index as ``mysql_length``.

        Returns:
            list of SQLAlchemy :class:`Index` objects

        """
        indexes = []  # type: List[Index]
        for field, length in zip(self._indexfields, self._indexlengths):
            kwargs = {"mysql_length": length} if length is not None else {}
            indexes.append(Index(f"_idx_{field}", field, **kwargs))
        return indexes

    @property
    def renames(self) -> Dict[str, str]:
        """
        Return the "rename dictionary": a dictionary mapping GATE annotation
        names (or cloud remote column names) to local field (column) names in
        the NLP destination table.

        See

        - ``renames`` in the :ref:`NLP config file <nlp_config>`.
        - :meth:`crate_anon.nlp_manager.parse_gate.Gate.parse`
        """
        return self._renames

    @property
    def null_literals(self) -> List[str]:
        """
        Returns string values from the GATE output that will be interpreted as
        NULL values.

        See

        - ``null_literals`` in the :ref:`NLP config file <nlp_config>`.
        - :meth:`crate_anon.nlp_manager.parse_gate.Gate.parse`.
        """
        return self._null_literals