Coverage for cc_modules/cc_xml.py: 39%

155 statements  

« prev     ^ index     » next       coverage.py v7.9.2, created at 2025-07-15 14:23 +0100

1""" 

2camcops_server/cc_modules/cc_xml.py 

3 

4=============================================================================== 

5 

6 Copyright (C) 2012, University of Cambridge, Department of Psychiatry. 

7 Created by Rudolf Cardinal (rnc1001@cam.ac.uk). 

8 

9 This file is part of CamCOPS. 

10 

11 CamCOPS is free software: you can redistribute it and/or modify 

12 it under the terms of the GNU General Public License as published by 

13 the Free Software Foundation, either version 3 of the License, or 

14 (at your option) any later version. 

15 

16 CamCOPS is distributed in the hope that it will be useful, 

17 but WITHOUT ANY WARRANTY; without even the implied warranty of 

18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

19 GNU General Public License for more details. 

20 

21 You should have received a copy of the GNU General Public License 

22 along with CamCOPS. If not, see <https://www.gnu.org/licenses/>. 

23 

24=============================================================================== 

25 

26**XML helper functions/classes.** 

27 

28""" 

29 

30import base64 

31import datetime 

32import logging 

33from typing import Any, List, Optional, TYPE_CHECKING, Union 

34import xml.sax.saxutils 

35 

36from cardinal_pythonlib.logs import BraceStyleAdapter 

37from cardinal_pythonlib.reprfunc import auto_repr 

38from cardinal_pythonlib.sqlalchemy.orm_inspect import gen_columns 

39import pendulum # avoid name confusion with Date 

40from pendulum import DateTime as Pendulum 

41from semantic_version.base import Version 

42from sqlalchemy.sql.schema import Column 

43from sqlalchemy.sql.type_api import TypeEngine 

44 

45from camcops_server.cc_modules.cc_simpleobjects import XmlSimpleValue 

46from camcops_server.cc_modules.cc_sqla_coltypes import ( 

47 COLATTR_BLOB_RELATIONSHIP_ATTR_NAME, 

48 gen_camcops_blob_columns, 

49) 

50 

51if TYPE_CHECKING: 

52 from camcops_server.cc_modules.cc_request import ( 

53 CamcopsRequest, 

54 ) 

55 from camcops_server.cc_modules.cc_summaryelement import ( 

56 SummaryElement, 

57 ) 

58 

59log = BraceStyleAdapter(logging.getLogger(__name__)) 

60 

61 

62# ============================================================================= 

63# Constants 

64# ============================================================================= 

65 

# XML name for the SNOMED CT codes section (presumably consumed by export
# code elsewhere; it is not referenced in the visible part of this module).
XML_NAME_SNOMED_CODES = "snomed_ct_codes"

# Attribute text that get_xml_tree() attaches to the root element. The
# leading space on the first entry separates it from the element name; the
# entries are then joined with single spaces.
#
# Namespace names are identifiers that are compared as literal strings (they
# are never fetched), so the standard "http://" forms must be used verbatim:
# an "https://" variant denotes a *different*, unknown namespace and would
# stop XML tools from recognizing xsi:nil / xsi:type / mc:Ignorable.
# noinspection HttpUrlsUsage
XML_NAMESPACES = [
    ' xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"'
    # ' xmlns:dt="http://www.w3.org/2001/XMLSchema-datatypes"'
]
# Extra root attributes, added only when comments are included: they declare
# the "ignore" prefix used for ignore:comment attributes and mark that prefix
# as ignorable via the Markup Compatibility namespace.
# noinspection HttpUrlsUsage
XML_IGNORE_NAMESPACES = [
    'xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006"',
    'xmlns:ignore="https://camcops.readthedocs.org/ignore"',
    # ... actual URL unimportant
    'mc:Ignorable="ignore"',
]

78# http://www.w3.org/TR/xmlschema-1/ 

79# http://www.w3.org/TR/2004/REC-xmlschema-2-20041028/datatypes.html 

80 

81 

class XmlDataTypes:
    """
    Names of the XML Schema data types that this module writes into
    ``xsi:type`` attributes.
    """

    # Binary / logical
    BASE64BINARY = "base64Binary"
    BOOLEAN = "boolean"

    # Numeric
    DOUBLE = "double"
    INTEGER = "integer"

    # Temporal
    DATE = "date"
    DATETIME = "dateTime"
    TIME = "time"

    # Textual
    STRING = "string"

95 

96 

97# ============================================================================= 

98# XML element 

99# ============================================================================= 

100 

101 

class XmlElement(object):
    """
    One node of an XML tree: a named element carrying a value (possibly
    other elements), with optional datatype, comment and literal override.
    """

    def __init__(
        self,
        name: str,
        value: Any = None,
        datatype: str = None,
        comment: str = None,
        literal: str = None,
    ) -> None:
        """
        Args:
            name: the element's XML name
            value: the element's content: either a raw value or a list of
                :class:`camcops_server.cc_modules.cc_xml.XmlElement`
                objects (default: ``None``)
            datatype: the element's data type (default: ``None``)
            comment: a description of the element (default: ``None``)
            literal: literal XML; when set, it takes precedence over
                everything else
        """
        # XML booleans must be lower-case "true"/"false" (or 0/1), so any
        # non-null boolean value is stored as its lower-cased string form.
        is_boolean = datatype == XmlDataTypes.BOOLEAN
        self.name = name
        self.value = (
            str(value).lower() if is_boolean and value is not None else value
        )
        self.datatype = datatype
        self.comment = comment
        self.literal = literal

    def __repr__(self) -> str:
        """
        Describes this element only.
        """
        description = auto_repr(self, with_addr=True)
        return description

139 

140 

class XmlLiteral(XmlElement):
    """
    An element that consists purely of literal XML text.
    """

    def __init__(self, literal: str) -> None:
        # The element itself is nameless; only the literal text matters.
        super(XmlLiteral, self).__init__("", literal=literal)

148 

149 

150# ============================================================================= 

151# Some literals 

152# ============================================================================= 

153 

# Ready-made XML comments. Each is an XmlLiteral, so get_xml_tree() writes
# its text out as-is (indented to the current level) rather than building a
# named element from it.
XML_COMMENT_ANCILLARY = XmlLiteral("<!-- Ancillary records -->")
XML_COMMENT_ANONYMOUS = XmlLiteral("<!-- Anonymous task; no patient info -->")
XML_COMMENT_BLOBS = XmlLiteral("<!-- Associated BLOBs -->")
XML_COMMENT_CALCULATED = XmlLiteral("<!-- Calculated fields -->")
XML_COMMENT_PATIENT = XmlLiteral("<!-- Associated patient details -->")
XML_COMMENT_SNOMED_CT = XmlLiteral("<!-- SNOMED-CT codes -->")
XML_COMMENT_SPECIAL_NOTES = XmlLiteral("<!-- Any special notes added -->")
XML_COMMENT_STORED = XmlLiteral("<!-- Stored fields -->")

162 

163 

164# ============================================================================= 

165# XML processing 

166# ============================================================================= 

167# The xml.etree.ElementTree and lxml libraries can both do this sort of thing. 

168# However, they do look quite fiddly and we only want to create something 

169# simple. Therefore, let's roll our own: 

170 

171 

def make_xml_branches_from_columns(  # type: ignore[no-untyped-def]
    obj, skip_fields: List[str] = None
) -> List[XmlElement]:
    """
    Builds one :class:`camcops_server.cc_modules.cc_xml.XmlElement` per
    database column of an SQLAlchemy ORM object, taking the element name,
    datatype and comment from the column and the value from the object.

    Args:
        obj: the SQLAlchemy ORM object
        skip_fields: database column names to leave out

    Returns:
        the list of XML branches, in the order the columns are generated
    """
    excluded = skip_fields or []  # type: List[str]
    return [
        XmlElement(
            name=column.name,
            value=getattr(obj, attrname),
            datatype=get_xml_datatype_from_sqla_column(column),
            comment=column.comment,
        )
        for attrname, column in gen_columns(obj)
        if column.name not in excluded
    ]

201 

202 

def make_xml_branches_from_summaries(
    summaries: List["SummaryElement"],
    skip_fields: List[str] = None,
    sort_by_name: bool = True,
) -> List[XmlElement]:
    """
    Builds one :class:`camcops_server.cc_modules.cc_xml.XmlElement` per
    summary element supplied by a task.

    Args:
        summaries: list of :class:`SummaryElement` objects
        skip_fields: names of summary elements to leave out
        sort_by_name: order the resulting branches by element name?

    Returns:
        the list of XML branches
    """
    excluded = skip_fields or []
    branches = [
        XmlElement(
            name=summary.name,
            value=summary.value,
            datatype=get_xml_datatype_from_sqla_column_type(summary.coltype),
            comment=summary.comment,
        )
        for summary in summaries
        if summary.name not in excluded
    ]
    if not sort_by_name:
        return branches
    return sorted(branches, key=lambda branch: branch.name)

235 

236 

def make_xml_branches_from_blobs(  # type: ignore[no-untyped-def]
    req: "CamcopsRequest", obj, skip_fields: List[str] = None
) -> List[XmlElement]:
    """
    Builds XML branches for the BLOB-bearing attributes of an SQLAlchemy ORM
    object (e.g. a task).

    Each branch is named after the relationship attribute through which the
    BLOB object is reached; its value is that BLOB's own XML element, or
    ``None`` if no BLOB is attached.

    Args:
        req: the :class:`camcops_server.cc_modules.cc_request.CamcopsRequest`
        obj: the SQLAlchemy ORM object
        skip_fields: database column names to leave out

    Returns:
        a list of :class:`camcops_server.cc_modules.cc_xml.XmlElement` objects

    """
    excluded = skip_fields or []  # type: List[str]
    elements = []  # type: List[XmlElement]
    for _id_attrname, blob_column in gen_camcops_blob_columns(obj):
        if blob_column.name in excluded:
            continue
        # The column's "info" dictionary names the relationship attribute
        # that leads to the BLOB object itself.
        relationship_name = blob_column.info.get(
            COLATTR_BLOB_RELATIONSHIP_ATTR_NAME, ""
        )
        blob = getattr(obj, relationship_name)
        blob_xml = blob.get_xml_element(req) if blob is not None else None
        elements.append(
            XmlElement(
                name=relationship_name,
                value=blob_xml,
                comment=blob_column.comment,
            )
        )
    return elements

271 

272 

def xml_header(eol: str = "\n") -> str:
    """
    Returns the XML declaration line, terminated by ``eol``.
    """
    declaration = '<?xml version="1.0" encoding="UTF-8"?>{}'
    return declaration.format(eol)

278 

279 

def get_xml_datatype_from_sqla_column_type(coltype: TypeEngine) -> str:
    """
    Maps an SQLAlchemy column type (such as ``Integer``) to the name of the
    corresponding XML schema datatype. Compare
    :func:`get_xml_datatype_from_sqla_column`.

    Raises:
        NotImplementedError: if the column's Python type matches none of the
            known categories
    """
    # http://www.xml.dvint.com/docs/SchemaDataTypesQR-2.pdf
    # http://www.w3.org/TR/2004/REC-xmlschema-2-20041028/datatypes.html
    python_type = coltype.python_type
    # python_type is a class, not an instance, hence issubclass(). The table
    # is ordered from more specific to less specific, because the first match
    # wins: datetime is a subclass of date, and bool is a subclass of int.
    ordered_checks = (
        ((datetime.datetime, Pendulum), XmlDataTypes.DATETIME),
        ((datetime.date, pendulum.Date), XmlDataTypes.DATE),
        ((datetime.time, pendulum.Time), XmlDataTypes.TIME),
        ((bool,), XmlDataTypes.BOOLEAN),
        ((int,), XmlDataTypes.INTEGER),
        ((float,), XmlDataTypes.DOUBLE),
        ((str, Version), XmlDataTypes.STRING),
    )
    for candidate_types, xml_type in ordered_checks:
        if issubclass(python_type, candidate_types):
            return xml_type
    # BLOBs are handled separately.
    raise NotImplementedError(
        f"Don't know XML type for SQLAlchemy type {coltype!r} with Python "
        f"type {python_type!r}"
    )

310 

311 

def get_xml_datatype_from_sqla_column(column: Column) -> Optional[str]:
    """
    Returns the XML schema datatype for an SQLAlchemy Column, by looking up
    the column's type (such as ``Integer()``). Compare
    :func:`get_xml_datatype_from_sqla_column_type`.
    """
    return get_xml_datatype_from_sqla_column_type(column.type)

319 

320 

def get_xml_blob_element(
    name: str, blobdata: Optional[bytes], comment: str = None
) -> XmlElement:
    """
    Wraps binary data in an XmlElement of type ``base64Binary``.

    Args:
        name: XML element name
        blobdata: the raw binary, or ``None``
        comment: XML comment

    Returns:
        the element; its value is the base-64 text of ``blobdata``, or
        ``None`` when ``blobdata`` is ``None`` or empty
    """
    # http://www.w3.org/TR/2001/REC-xmlschema-2-20010502/#base64Binary
    # Raw bytes -> base-64 bytes -> ASCII text.
    encoded = base64.b64encode(blobdata).decode("ascii") if blobdata else None
    return XmlElement(
        name=name,
        value=encoded,
        datatype=XmlDataTypes.BASE64BINARY,
        comment=comment,
    )

346 

347 

def xml_escape_value(value: str) -> str:
    """
    Returns ``value`` with XML-special characters escaped, for use as
    element content.
    """
    # http://stackoverflow.com/questions/1091945/
    # https://wiki.python.org/moin/EscapingXml
    escape = xml.sax.saxutils.escape
    return escape(value)

355 

356 

def xml_quote_attribute(attr: str) -> str:
    """
    Returns ``attr`` escaped and wrapped in quotes, ready for use as an XML
    attribute value.

    This is stricter than escaping element content.
    """
    quote = xml.sax.saxutils.quoteattr
    return quote(attr)

364 

365 

def get_xml_tree(
    element: Union[
        XmlElement, XmlSimpleValue, List[Union[XmlElement, XmlSimpleValue]]
    ],
    level: int = 0,
    indent_spaces: int = 4,
    eol: str = "\n",
    include_comments: bool = False,
) -> str:
    # noinspection HttpUrlsUsage
    """
    Renders an :class:`camcops_server.cc_modules.cc_xml.XmlElement` (or a
    list of them, or a simple value) as XML text.

    Args:
        element: root :class:`camcops_server.cc_modules.cc_xml.XmlElement`
        level: starting level/depth (used for recursion)
        indent_spaces: number of spaces per indentation level
        eol: end-of-line string
        include_comments: add each field's description as an
            ``ignore:comment`` attribute?

    Raises:
        ValueError: if ``element`` is none of the supported kinds

    NULL values are written with ``xsi:nil``, which needs a namespace
    declaration (added to the element at level 0):

    - https://stackoverflow.com/questions/774192
    - http://books.xmlschemata.org/relaxng/relax-CHP-11-SECT-1.html

    Comments:

    - http://blog.galasoft.ch/posts/2010/02/quick-tip-commenting-out-properties-in-xaml/
    - https://stackoverflow.com/questions/2073140/

    Regarding newlines:

    - Nothing special is done: newlines are emitted in raw format.
    - Some browsers (e.g. Firefox, Chrome) may not display them correctly, so
      that they look as if they are missing; see
      https://stackoverflow.com/questions/2004386. Save the output and look
      at it in a text editor, or use the browser's "View Source" function
      (which, for Chrome, shows both newlines and line numbers too).

    """  # noqa
    indent = " " * level * indent_spaces

    if isinstance(element, XmlElement):
        if element.literal:
            # User-supplied XML: emitted verbatim, apart from indentation.
            return indent + element.literal + eol

        # --- Attributes ---
        namespace_parts = []
        if level == 0:
            # Declared on the root element only; descendants inherit.
            namespace_parts += XML_NAMESPACES
            if include_comments:
                namespace_parts += XML_IGNORE_NAMESPACES
        attributes = " ".join(namespace_parts)
        if element.datatype:
            attributes += f' xsi:type="{element.datatype}"'
        if include_comments and element.comment:
            quoted_comment = xml_quote_attribute(element.comment)
            attributes += f" ignore:comment={quoted_comment}"

        # --- Assembly ---
        tag = element.name
        if element.value is None:
            # NULL: self-closing element flagged as nil.
            return f'{indent}<{tag}{attributes} xsi:nil="true"/>{eol}'

        content = element.value
        has_children = isinstance(content, (XmlElement, list))
        if not has_children:
            # XmlSimpleValue marks a value that came from an XmlElement, as
            # distinct from user-inserted raw XML.
            content = XmlSimpleValue(content)
        rendered = get_xml_tree(
            content,
            level=level + 1,
            indent_spaces=indent_spaces,
            eol=eol,
            include_comments=include_comments,
        )
        if has_children:
            # Children go on their own lines; the closing tag is indented.
            return (
                f"{indent}<{tag}{attributes}>{eol}"
                f"{rendered}{indent}</{tag}>{eol}"
            )
        # A simple value sits inline between the opening and closing tags.
        return f"{indent}<{tag}{attributes}>{rendered}</{tag}>{eol}"

    if isinstance(element, list):
        # Siblings: each is rendered at the same level, one after another.
        return "".join(
            get_xml_tree(
                child,
                level,
                indent_spaces=indent_spaces,
                eol=eol,
                include_comments=include_comments,
            )
            for child in element
        )

    if isinstance(element, XmlSimpleValue):
        # The lowest-level thing: a value. Escaped, with no extra indent.
        return xml_escape_value(str(element.value))

    raise ValueError(f"Bad value to get_xml_tree: {element!r}")

489 

490 

def get_xml_document(
    root: XmlElement,
    indent_spaces: int = 4,
    eol: str = "\n",
    include_comments: bool = False,
) -> str:
    """
    Renders a complete XML document (declaration header plus tree) as text,
    starting from its root
    :class:`camcops_server.cc_modules.cc_xml.XmlElement`.

    Args:
        root: root :class:`camcops_server.cc_modules.cc_xml.XmlElement`
        indent_spaces: number of spaces to indent formatted XML
        eol: end-of-line string
        include_comments: include comments describing each field?

    Raises:
        AssertionError: if ``root`` is not an ``XmlElement``
    """
    if isinstance(root, XmlElement):
        header = xml_header(eol)
        body = get_xml_tree(
            root,
            indent_spaces=indent_spaces,
            eol=eol,
            include_comments=include_comments,
        )
        return header + body
    raise AssertionError(
        "get_xml_document: root not an XmlElement; "
        "XML requires a single root"
    )