Coverage for src / eo_history / history.py: 67%

454 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-05-12 02:10 +0000

1""" 

2Module for reading and writing ProcessingHistory objects from images 

3 

4This contains code for managing the processing history which 

5we attach to every image file (and to some XML files). 

6 

7 

8""" 

9 

10import base64 

11import json 

12import logging 

13import os 

14import pickle 

15import sys 

16import time 

17import zlib 

18from dataclasses import dataclass 

19from importlib import metadata 

20from pathlib import Path, PosixPath 

21 

22from .backends import get_backend 

23 

24try: 

25 import laspy 

26 

27 has_laspy = True 

28except ModuleNotFoundError: 

29 has_laspy = False 

30 

31 

32try: 

33 import numpy as np 

34 

35 has_numpy = True 

36except ModuleNotFoundError: 

37 has_numpy = False 

38 

39from .container import ( 

40 get_build_singularity_labels, 

41 get_container_build_info, 

42 is_container, 

43) 

44 

45backend = get_backend() 

46if backend == "gdal": 46 ↛ 47line 46 didn't jump to line 47 because the condition on line 46 was never true

47 from osgeo import gdal 

48 

49 gdal.UseExceptions() 

50 

51 

@dataclass
class Config:
    """Runtime configuration for processing-history serialisation."""

    # When True, ProcessingHistory.toString() emits the legacy pickle
    # representation instead of JSON.
    HISTORY_USEPICKLE: bool

57 

58 

# Module-wide configuration instance. For historical reasons the
# configuration is kept as an object.
config = Config(HISTORY_USEPICKLE=False)

logger = logging.getLogger(__name__)

# Metadata item names used on raster files: the plain history string and
# the zlib-compressed, base64-encoded history string.
metadataName = "SLATS_Metadata2"
zippedMetadataName = "SLATS_Metadata2_zipped"
# user_id and record_id of the LAS VLR that carries the compressed history.
las_metadata_name = "las_history"
history_record_id = 7

# These GDAL drivers need to use the metadata on the dataset object instead of the
# metadata on the band1 object. Neither seems to be supported by all drivers. Sigh.....
driversUsingDSmetadata = set(["JPEG2000", "JP2OpenJPEG"])

# These GDAL drivers have limits on the size of metadata which can be stored, and
# so we need to keep below these, or we lose everything. The values are given in bytes.
# The GTiff limit is actually mysteriously complicated, but this value seems to
# cover it.
metadataSizeLimitsByDriver = {"GTiff": 28000, "LAS_VLR": 65535}
truncatedParentItemName = "TRUNCATED_PARENT_FILES"
truncatedGrandparentItemName = "TRUNCATED_GRANDPARENT_FILES"

80 

81 

def read_metadata_from_file(path: str) -> "ProcessingHistory":
    """
    Read the processing history stored in the raster file at ``path``.

    Dispatches on the configured backend: the rasterio reader is used when
    ``get_backend()`` returns ``"rasterio"``, otherwise the GDAL reader.

    Args:
        path: name of the file to read.

    Returns:
        The ProcessingHistory produced by the selected reader.
    """
    if get_backend() == "rasterio":
        return _read_metadata_from_rasterio_filename(path)
    return _read_metadata_from_gdal_filename(path)

87 

88 

89# same functions but for open objects 

def read_metadata_from_object(dataset):
    """
    Read the processing history from an already-open dataset object.

    The rasterio reader is chosen when the configured backend is
    ``"rasterio"``; every other backend value falls through to the GDAL
    reader.
    """
    reader = (
        _read_metadata_from_rasterio
        if get_backend() == "rasterio"
        else _read_metadata_from_gdal
    )
    return reader(dataset)

95 

96 

def set_processing_history_object(dataset, obj):
    """
    Write the ProcessingHistory ``obj`` onto the open ``dataset``.

    Uses the rasterio writer when the configured backend is ``"rasterio"``
    and the GDAL writer otherwise; the writer's return value is passed back.
    """
    writer = (
        _set_processing_history_rasterio
        if get_backend() == "rasterio"
        else _set_processing_history_gdal
    )
    return writer(dataset, obj)

102 

103 

class ProcessingHistory:
    """
    Class that stores the 'processing history' information for a file.

    This is the 'new' implementation that stores the information for the
    files and the file relationships separately.

    All state is held in plain dictionaries so that the object can be
    serialised with toString() and rebuilt with fromString().
    """

    def __init__(self):
        # Key is created by createKey(). Stored against each key is the
        # 'meta' dictionary with all the info for that (non-direct) node.
        self.files = {}

        # The 'meta' dictionary for 'this' file.
        self.thismeta = {}

        # A dictionary used as a set: each key is a (child key, parent key)
        # tuple and the value is always None.
        # NOTE: this excludes the direct parents, which are stored
        # separately below.
        self.relationships = {}

        # Key is created by createKey(). Stored against each key is the
        # 'meta' dictionary with all the info for the node OF THE DIRECT
        # PARENTS.
        self.directparents = {}

    def dump(self):
        """
        Prints the contents of this object out.
        """
        separator = "-------------------------------------"
        print("thismeta:", self.thismeta)
        print(separator)
        print("direct parents:")
        for key, meta in self.directparents.items():
            print(key, meta)
        print(separator)
        print("Files:")
        for key, meta in self.files.items():
            print(key, meta)
        print(separator)
        print("Relationships:")
        for child, parent in self.relationships:
            print(child, parent)
        print(separator)

    @staticmethod
    def fromXMLTree(tree):
        """
        Turns an old style XML tree into a ProcessingHistory
        object and returns it.

        The head node's meta becomes ``thismeta``, each parent of the head
        becomes a direct parent, and ``tree.traverseTree`` is called for
        each of those parents to collect the remaining ancestors.
        """
        obj = ProcessingHistory()

        # meta for the head
        obj.thismeta = tree.head.meta

        # first add the direct parents
        for parent in tree.head.parents:
            timestamp = None
            if "timestamp" in parent.meta:
                timestamp = parent.meta["timestamp"]
            key = obj.createKey(parent.name, timestamp)
            obj.addDirectParent(key, parent.meta)
            # and the metadata for all their parents to the other lists
            tree.traverseTree(obj, parent)

        return obj

    def addFile(self, key, meta):
        """
        Add a file and its metadata if not already recorded.
        Returns True if new file.
        NOTE: should not be a direct parent - use addDirectParent instead
        """
        if key in self.files:
            return False
        self.files[key] = meta
        return True

    def addRelationship(self, newchildkey, newparentkey):
        """
        Add a relationship between a child file and parent file if not
        already recorded.
        Returns True if a new relationship.
        NOTE: should not be a direct parent - use addDirectParent instead
        """
        relationship = (newchildkey, newparentkey)
        if relationship in self.relationships:
            return False
        self.relationships[relationship] = None
        return True

    def addDirectParent(self, key, meta):
        """
        Adds metadata to the dictionary of direct parents, replacing any
        entry already stored under ``key``.
        """
        self.directparents[key] = meta

    @staticmethod
    def createKey(name, timestamp=None):
        """
        Creates a key given a filename and a timestamp.
        If timestamp not specified the current local time is used.

        Carriage returns and newlines are stripped from both ends of the
        name and of the timestamp before they are joined with a space.
        """
        if timestamp is None:
            timestamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        return "%s %s" % (str(name).strip("\r\n"), timestamp.strip("\r\n"))

    def mergeHistory(self, key, history):
        """
        Takes another processing history obj and merges it in.
        It is assumed that the current object is the current file
        that has self.thismeta set and the object to be merged in
        is one of the direct parents.

        Args:
            key: key (see createKey) under which the parent is recorded.
            history: the ProcessingHistory of that parent.
        """
        # everything the parent knows about becomes a non-direct file here,
        # including the parent's own direct parents
        for keyh, meta in history.files.items():
            self.addFile(keyh, meta)
        for keyh, meta in history.directparents.items():
            self.addFile(keyh, meta)
        for child, parent in history.relationships:
            self.addRelationship(child, parent)

        self.addDirectParent(key, history.thismeta)
        # add its direct relationships as parents
        for parent in history.directparents:
            self.addRelationship(key, parent)

    def processNode(self, node):
        """
        Process a node from an old xml style history file
        (using xmlhistory.Tree.traverseTree).

        Records the node as a file and adds one relationship per parent.
        """
        timestamp = None
        if "timestamp" in node.meta:
            timestamp = node.meta["timestamp"]
        key = self.createKey(node.name, timestamp)
        self.addFile(key, node.meta)
        for parent in node.parents:
            timestamp = None
            if "timestamp" in parent.meta:
                timestamp = parent.meta["timestamp"]
            parentkey = self.createKey(parent.name, timestamp)
            self.addRelationship(key, parentkey)

    def convert_to_builtin_types(self, obj):
        """
        Recursively convert values that json.dumps cannot handle into
        native Python types.

        Dictionaries (keys and values), lists and tuples are walked
        recursively. Any os.PathLike object (all pathlib paths, not only
        PosixPath) is converted with os.fspath. When numpy is available,
        numpy booleans, integers and floats become bool/int/float and
        arrays become nested lists. Anything else is returned unchanged.
        """
        if isinstance(obj, dict):
            return {
                self.convert_to_builtin_types(k): self.convert_to_builtin_types(v)
                for k, v in obj.items()
            }
        if isinstance(obj, list):
            return [self.convert_to_builtin_types(elem) for elem in obj]
        if isinstance(obj, tuple):
            return tuple(self.convert_to_builtin_types(elem) for elem in obj)
        if isinstance(obj, os.PathLike):
            return os.fspath(obj)
        if has_numpy:
            # np.bool_ is not a subclass of np.integer, so test it first
            if isinstance(obj, np.bool_):
                return bool(obj)
            if isinstance(obj, np.integer):
                return int(obj)
            if isinstance(obj, np.floating):
                return float(obj)
            if isinstance(obj, np.ndarray):
                return obj.tolist()
        return obj

    def toString(self, config_ob=None):
        """
        Returns this instance in a serialised form that can
        be stored and recreated as an object using fromString().

        Uses JSON for the representation of the relevant objects, unless
        ``config_ob.HISTORY_USEPICKLE`` is true, in which case the old
        pickle format (protocol 0) is used for backward compatibility.

        Args:
            config_ob: object with a HISTORY_USEPICKLE attribute. When
                None (the default) the module-level ``config`` is looked
                up at call time.

        Returns:
            A JSON str, or the pickle bytes when pickling is requested.
        """
        if config_ob is None:
            config_ob = config

        if config_ob.HISTORY_USEPICKLE:
            # NOTE(review): this returns bytes, while the writers in this
            # module call bytes(..., "utf-8") on the result, which only
            # accepts str - confirm before enabling the pickle format.
            return pickle.dumps(self, 0)

        # relationships is a dictionary, but for backwards compatibility
        # it is stored as a list, as it used to be.
        relationships = self.relationships
        if isinstance(relationships, dict):
            relationships = list(relationships.keys())
        representationDict = {
            "files": self.files,
            "thismeta": self.thismeta,
            "relationships": relationships,
            "directparents": self.directparents,
        }
        return json.dumps(self.convert_to_builtin_types(representationDict))

    @staticmethod
    def fromString(s):
        """
        Creates and returns a new ProcessingHistory object from
        a string previously returned by toString()

        Will attempt to process as JSON first, and if that fails will attempt
        with pickle, for backward compatibility. (We used to use pickle format,
        but now use JSON).
        """
        try:
            representationDict = json.loads(s)
        except ValueError:
            # Not JSON, so assume the legacy pickle format.
            # SECURITY: unpickling can execute arbitrary code; only read
            # history from files you trust.
            if hasattr(s, "encode"):
                s = s.encode("utf-8")
            obj = pickle.loads(s)
        else:
            obj = ProcessingHistory()
            obj.files = representationDict["files"]
            obj.thismeta = representationDict["thismeta"]
            obj.relationships = representationDict["relationships"]
            obj.directparents = representationDict["directparents"]

        # If relationships is a list, convert it to a dictionary for efficiency
        if isinstance(obj.relationships, list):
            obj.relationships = {tuple(t): None for t in obj.relationships}
        return obj

367 

368 

def readTreeFromDataset(dataset):
    """
    Reads the processing history out of an open dataset and returns a
    ProcessingHistory object.

    If laspy is installed and ``dataset`` is a laspy ``LasData`` or
    ``LasReader`` (the check is on the object's type, not on a file
    extension):
    - The VLRs (Variable Length Records) of the LAS header are scanned.
    - The first VLR whose user_id equals ``las_metadata_name`` is
      zlib-decompressed and parsed with ProcessingHistory.fromString().
    - If no such VLR exists a warning is logged and an empty
      ProcessingHistory is returned.

    Any other object is passed to read_metadata_from_object().
    """
    if has_laspy and isinstance(dataset, (laspy.LasData, laspy.LasReader)):
        for vlr in dataset.header.vlrs:
            if vlr.user_id == las_metadata_name:
                # the VLR payload is raw zlib data (no base64 layer)
                s = zlib.decompress(vlr.record_data)
                if isinstance(s, bytes):
                    s = s.decode("utf-8")
                return ProcessingHistory.fromString(s)
        # loop finished without finding a matching VLR
        logger.warning(
            "No '%s' VLR found in the LAS header; "
            "returning an empty processing history",
            las_metadata_name,
        )
        return ProcessingHistory()
    return read_metadata_from_object(dataset)

403 

404 

def readTreeFromFilename(imgfile):
    """
    Same as readTreeFromDataset() but takes a filename.

    Args:
        imgfile: either a Path object or a string.

    Returns:
        The ProcessingHistory read from the file.

    When laspy is installed, files whose suffix is ``.las`` or ``.laz``
    (compared case-insensitively) are opened with laspy. Everything else
    is passed, as the original string, to read_metadata_from_file().
    """
    filename = str(imgfile)

    # GDAL virtual filesystem paths (e.g. /vsicurl/, /vsis3/) cannot be
    # represented as Path objects - Path collapses the double-slash in URLs.
    # Skip the Path-based LAS check for these.
    if has_laspy and not filename.startswith("/vsi"):
        las_path = Path(filename)
        if las_path.suffix.lower() in (".las", ".laz"):
            with laspy.open(las_path) as lasdata:
                return readTreeFromDataset(lasdata)

    # Hand the backend the untouched string so that URL-style names keep
    # their double slashes.
    return read_metadata_from_file(filename)

429 

430 

431## rasterio versions 

def _read_metadata_from_rasterio(dataset):
    """
    Read the processing history from the band-1 tags of an open rasterio
    dataset.

    Returns an empty ProcessingHistory when the zipped-metadata tag is
    not present.
    """
    band_tags = dataset.tags(1)
    if zippedMetadataName not in band_tags:
        # no metadata in this file - manufacture an empty object
        return ProcessingHistory()

    encoded = band_tags[zippedMetadataName]
    # Drop a b'...' wrapper if present - presumably left behind when a
    # bytes value was stored through str(); verify against the writer.
    if encoded.startswith(r"b'"):
        encoded = encoded[2:-1]
    decoded = zlib.decompress(base64.b64decode(encoded))
    return ProcessingHistory.fromString(decoded)

446 

447 

def _read_metadata_from_rasterio_filename(imgfile):
    """
    Open ``imgfile`` with rasterio and return its processing history.

    rasterio is imported here rather than at module level, so it is only
    needed when this reader is actually used.
    """
    import rasterio

    with rasterio.open(imgfile) as src:
        return _read_metadata_from_rasterio(src)

454 

455 

456## gdal versiona 

def _read_metadata_from_gdal(dataset):
    """
    Read the processing history from an open GDAL dataset.

    The metadata is taken from the dataset itself for the drivers listed
    in ``driversUsingDSmetadata`` and from band 1 otherwise. The plain
    ``metadataName`` item is preferred when it is present and non-empty;
    otherwise the compressed ``zippedMetadataName`` item is used. If
    neither exists an empty ProcessingHistory is returned.

    The dataset is not closed here; that is left to the caller.
    """
    drvrName = dataset.GetDriver().ShortName
    if drvrName in driversUsingDSmetadata:
        meta = dataset.GetMetadata()
    else:
        meta = dataset.GetRasterBand(1).GetMetadata()

    if metadataName in meta and len(meta[metadataName]) > 0:
        # file has the processing history stored uncompressed
        s = bytes(meta[metadataName], "utf-8")
        return ProcessingHistory.fromString(s)

    if zippedMetadataName in meta:
        sz = meta[zippedMetadataName]
        # Some files carry the value wrapped as b'...' - presumably the
        # str() form of a bytes object written by another backend.
        if sz.startswith(r"b'"):
            sz = sz[2:-1]
        s = zlib.decompress(base64.b64decode(sz))
        return ProcessingHistory.fromString(s)

    # no metadata in this file - manufacture an empty object
    return ProcessingHistory()

483 

484 

def _read_metadata_from_gdal_filename(imgfile):
    """
    Open ``imgfile`` read-only with GDAL and return its processing history.

    The opened dataset is handed to readTreeFromDataset().
    """
    from osgeo import gdal, gdalconst

    opened = gdal.Open(str(imgfile), gdalconst.GA_ReadOnly)
    return readTreeFromDataset(opened)

491 

492 

493## writing to datasets 

494 

495 

def _set_processing_history_gdal(dataset, obj):
    """
    Store the processing history ``obj`` on an open GDAL dataset.

    The history is serialised, zlib-compressed and base64-encoded. For
    drivers listed in ``metadataSizeLimitsByDriver`` an over-long result
    is replaced by a copy trimmed with trimParents(), and warnings are
    logged. The value is written under ``zippedMetadataName`` on the
    dataset (drivers in ``driversUsingDSmetadata``) or on band 1.
    """
    driver_name = dataset.GetDriver().ShortName
    raw = bytes(obj.toString(), "utf-8")
    payload = base64.b64encode(zlib.compress(raw, 9))

    size_limit = metadataSizeLimitsByDriver.get(driver_name)
    if size_limit is not None:
        full_size = len(payload)
        if full_size > size_limit:
            # Trim all the parents off a copy, leaving only a minimal
            # amount of history; the caller's object is left untouched.
            trimmed = ProcessingHistory.fromString(raw)
            trimParents(trimmed)
            payload = base64.b64encode(
                zlib.compress(trimmed.toString().encode(), 9)
            )
            logger.warning(
                f"file {dataset.GetFileList()[0]} compressed history is "
                f"{full_size} bytes"
            )
            logger.warning(
                f"Limit for {driver_name} driver is {size_limit} bytes"
            )
            logger.warning(
                f"Truncating to minimal history ({len(payload)} bytes)."
            )

    # Originally this was stored on band1 (because that's all HFA supported),
    # but for some drivers the dataset has to be used instead.
    target = (
        dataset if driver_name in driversUsingDSmetadata else dataset.GetRasterBand(1)
    )

    # Always write the compressed form, as this is robust for file format
    # conversions with gdal_translate, e.g. to convert to GTiff.
    tags = target.GetMetadata()
    tags[zippedMetadataName] = payload
    target.SetMetadata(tags)
    target.FlushCache()  # Save changes to the file

543 

544 

def _set_processing_history_rasterio(dataset, obj):
    """
    Store the processing history ``obj`` in the band-1 tags of an open
    rasterio dataset.

    The history is serialised, zlib-compressed and base64-encoded. If the
    result exceeds the GTiff limit in ``metadataSizeLimitsByDriver`` a copy
    trimmed with trimParents() is stored instead and a warning is logged.

    The tag is written under ``zippedMetadataName`` as text, so the stored
    value is the bare base64 string, not the ``b'...'`` form of a
    bytes object.
    """
    raw = bytes(obj.toString(), "utf-8")
    payload = base64.b64encode(zlib.compress(raw, 9))
    if len(payload) > metadataSizeLimitsByDriver["GTiff"]:
        # Trim all the parents off a copy, leaving only a minimal amount
        # of history; the caller's object is left untouched.
        trimmed = ProcessingHistory.fromString(raw)
        trimParents(trimmed)
        payload = base64.b64encode(zlib.compress(trimmed.toString().encode(), 9))
        logger.warning(f"Truncating to minimal history ({len(payload)} bytes).")
    # base64 output is pure ASCII, so this decode cannot fail
    dataset.update_tags(1, **{zippedMetadataName: payload.decode("ascii")})

565 

566 

def getMandatoryFields(script, argv, strict=True):
    """
    Get the mandatory fields and return as a dictionary.

    Images with history require a minimum set of mandatory
    fields:
      * timestamp
      * login
      * uname_os
      * uname_host
      * uname_release
      * uname_version
      * uname_machine
      * cwd: the current working directory from where the script was
        called
      * commandline: the script arguments
      * script and script_dir

    Outside a container ``python_version`` is added as well. When any
    module versions can be determined they are stored, JSON-encoded,
    under ``package_version_dict``.

    Additionally, if the image was created in a container, the
    following are also mandatory:
      * one of org.opencontainers.image.revision or
        org.label-schema.vcs-ref: the git hash of the container build
      * one of org.opencontainers.image.url, org.label-schema.vcs-url,
        vcs-url: the url of the git repository

    If strict is False, then container labels are considered optional.

    Args:
        script (str): The script which produces the image.
            If script is None, then assumes it is sys.argv[0]
        argv (list[str]): The arguments accompanying the
            script. If argv is None, assumes it is sys.argv[1:]
            joined with spaces.
        strict (bool): Container labels considered mandatory.
            This argument will be ignored if not running in a
            container.

    Returns: (dict): A dictionary with the mandatory field
        as keys, and the associated values.

    Raises:
        MissingMandatoryLabels: when running in a container with strict
            set and a mandatory label group has no non-empty label.
    """
    dictn = {}

    dictn["timestamp"] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    # LOGNAME is preferred, then USER, then the literal "unknown"
    login = os.getenv("LOGNAME", "unknown")
    if login == "unknown":
        login = os.getenv("USER", "unknown")
    dictn["login"] = login

    uname = os.uname()
    dictn["uname_os"] = uname[0]
    dictn["uname_host"] = uname[1]
    dictn["uname_release"] = uname[2]
    dictn["uname_version"] = uname[3]
    dictn["uname_machine"] = uname[4]
    dictn["cwd"] = os.getcwd()

    # now check for container info (evaluated once and reused below)
    in_container = is_container()
    if in_container:
        logger.debug("running in a container")
        # mapping of history item to the labels that can supply it
        mandatory_container_labels = {
            "repo_url": [
                "org.opencontainers.image.url",
                "org.label-schema.vcs-url",
                "vcs-url",
            ],
            "container_version": [
                "org.opencontainers.image.revision",
                "org.label-schema.vcs-ref",
            ],
        }
        try:
            container_metadata = get_container_build_info()
        except FileNotFoundError:
            logger.warning(
                "could not find build metadata file /etc/rsc_build_info.json",
                exc_info=1,
            )
            logger.warning("Trying for a singularity recipe")
            try:
                container_metadata = get_build_singularity_labels()
            except FileNotFoundError:
                logger.warning("could not find singularity metadata", exc_info=1)
                container_metadata = {}
                logger.warning("no container metadata was found")

        # now check for missing labels
        # NOTE(review): when there is no "buildinfo" entry at all nothing
        # is checked, so strict mode does not raise - confirm whether
        # that is intended.
        missing_messages = []
        if "buildinfo" in container_metadata:
            buildinfo = container_metadata["buildinfo"]
            for mandatory_item, container_labels in mandatory_container_labels.items():
                # at least one of the labels must be present,
                # and shouldn't be an empty string
                item_found = any(
                    label in buildinfo and not buildinfo[label] == ""
                    for label in container_labels
                )
                if not item_found:
                    message = (
                        "The container is missing the mandatory metadata for {}".format(
                            mandatory_item
                        )
                    )
                    logger.warning(message)
                    missing_messages.append(
                        "{}. A container requires one of the following {}".format(
                            message, ", ".join(container_labels)
                        )
                    )

        if missing_messages:
            if strict:
                # report every missing item, not just the last one
                raise MissingMandatoryLabels("; ".join(missing_messages))
            else:
                logger.warning("Mandatory container label restriction over-ridden")
        # we add what we have
        dictn.update(container_metadata)

    else:
        dictn["python_version"] = "{}.{}.{}".format(*sys.version_info)

    if argv is None:
        argv = " ".join(sys.argv[1:])
    dictn["commandline"] = argv

    if script is None:
        script = sys.argv[0]
    dictn["script"] = os.path.basename(script)
    dictn["script_dir"] = os.path.dirname(script)

    # Find version numbers of any external imported modules (if possible)
    # slightly different if we are in a container, since in that case its
    # actually not very informative
    if in_container:
        # every installed distribution, by name and version
        moduleVersionDict = dict(
            (dist.metadata["Name"], dist.metadata["Version"])
            for dist in metadata.distributions()
        )
    else:
        moduleVersionDict = {}
        # snapshot the keys, as sys.modules can change while we look at it
        for modname in list(sys.modules.keys()):
            modfile = getattr(sys.modules[modname], "__file__", None)
            # only modules that live outside the interpreter's prefix
            if modfile and not modfile.startswith(sys.prefix):
                toplevelModname = modname.split(".")[0]
                if toplevelModname in sys.modules:
                    moduleVersionDict[toplevelModname] = "Unknown"
        for modname in moduleVersionDict:
            if hasattr(sys.modules[modname], "__version__"):
                moduleVersionDict[modname] = str(sys.modules[modname].__version__)

    if moduleVersionDict:
        dictn["package_version_dict"] = json.dumps(moduleVersionDict)

    return dictn

733 

734 

def insertHistory(
    name,
    parent_list,
    optional_dict,
    script=None,
    argv=None,
    strict=True,
):
    """
    Build and return a ProcessingHistory for a new file.

    The new node carries the mandatory metadata from getMandatoryFields()
    (``script``, ``argv`` and ``strict`` are passed straight through)
    plus everything in ``optional_dict``. The history of every filename in
    ``parent_list`` is read and merged in as a direct parent.

    Leave ``script`` as None when calling from a Python script and it
    will be read from the current environment.

    ``name`` is not used; it is kept for historical reasons.
    """
    history = ProcessingHistory()
    history.thismeta = getMandatoryFields(script, argv, strict=strict)
    # optional fields are layered over the mandatory ones
    history.thismeta.update(optional_dict)

    for parent in parent_list:
        parent_history = readTreeFromFilename(parent)
        if "timestamp" in parent_history.thismeta:
            parent_stamp = parent_history.thismeta["timestamp"]
        else:
            parent_stamp = None
        parent_key = history.createKey(parent, parent_stamp)
        history.mergeHistory(parent_key, parent_history)

    return history

769 

770 

def insertMetadataDataset(
    dataset,
    parent_list,
    optional_dict,
    script=None,
    argv=None,
    strict=True,
):
    """
    Create the processing history for an open, writable dataset and
    store it on that dataset.

    The history is built by insertHistory() from the mandatory metadata,
    ``optional_dict`` and the histories of the files in ``parent_list``,
    then written with setProcessingHistoryDataset().

    Leave ``script`` as None when calling from a Python script and it
    will be read from the current environment. ``strict`` is passed
    through to getMandatoryFields().
    """
    name = None
    if has_laspy and isinstance(dataset, laspy.LasData):
        if hasattr(dataset, "filename"):
            name = Path(dataset.filename).name
        else:
            name = "Unknown"

    # No name yet means this was not handled as a LAS object above, so
    # try the GDAL accessor first and the rasterio attribute second.
    if not name:
        try:
            name = os.path.basename(dataset.GetDescription())
        except AttributeError:
            # if it is a rasterio object
            name = dataset.name

    history = insertHistory(
        name, parent_list, optional_dict, script, argv, strict=strict
    )
    setProcessingHistoryDataset(dataset, history)

810 

811 

def setProcessingHistoryDataset(dataset, obj):
    """
    Write the processing history into a GDAL/rasterio dataset or a LAS file.

    LAS handling (only when laspy is importable and ``dataset`` is a
    ``laspy.LasData``):

    * The history is serialised with ``obj.toString()``, encoded as UTF-8
      and zlib-compressed at level 9.
    * If the compressed payload is larger than
      ``metadataSizeLimitsByDriver["LAS_VLR"]``, a second history object is
      rebuilt from the serialised bytes, its parents are trimmed with
      trimParents(), and that trimmed copy is compressed and stored instead.
      trimParents() is applied to the copy, not to ``obj``. No second size
      check is made after trimming.
    * The payload is stored in a VLR identified by ``las_metadata_name``
      (user id) and ``history_record_id`` (record id). The first existing
      VLR with the same user id is replaced in place; otherwise the new VLR
      is appended.
    * The LAS data is written back to ``dataset.filename``.

    Every other dataset is passed to set_processing_history_object().

    Returns True when a LAS dataset was written, otherwise None.
    """
    # Short-circuit keeps us from touching the name ``laspy`` when the
    # optional import failed.
    if not (has_laspy and isinstance(dataset, laspy.LasData)):
        set_processing_history_object(dataset, obj)
        return None

    history_bytes = bytes(obj.toString(), "utf-8")
    payload = zlib.compress(history_bytes, 9)

    if len(payload) > metadataSizeLimitsByDriver["LAS_VLR"]:
        # Too big for a VLR: rebuild a separate history object from the
        # serialised bytes (these are exactly what decompressing ``payload``
        # would return) and trim the parents on that copy.
        trimmed = ProcessingHistory.fromString(history_bytes)
        trimParents(trimmed)
        payload = zlib.compress(bytes(trimmed.toString(), "utf-8"), 9)

    # Index of the first VLR already carrying our history, if any.
    existing_vlr_index = next(
        (
            idx
            for idx, vlr in enumerate(dataset.vlrs)
            if vlr.user_id == las_metadata_name
        ),
        None,
    )

    # NOTE(review): the LAS specification reserves 32 bytes for a VLR
    # description and this text is longer - confirm how laspy handles it
    # when the file is written.
    history_vlr = laspy.VLR(
        user_id=las_metadata_name,
        record_id=history_record_id,
        record_data=payload,
        description="VLR Record contains compressed processing history",
    )

    if existing_vlr_index is None:
        logger.debug("appending history to the vlrs")
        dataset.vlrs.append(history_vlr)
    else:
        logger.debug(f"existing index at {existing_vlr_index}")
        dataset.vlrs[existing_vlr_index] = history_vlr

    # Save the updated LAS file over the path recorded on the dataset.
    logger.debug(f"writing updated file {dataset.filename}")
    dataset.write(dataset.filename)
    dataset.update_header()
    return True

881 

882 

def trimParents(obj):
    """
    Trim off the parent nodes, and instead list them as an entry in the
    fields of this node. There are separate entries for parent and
    grandparent files.

    ``obj.thismeta[truncatedParentItemName]`` receives the repr of the list
    of keys of ``obj.directparents``.
    ``obj.thismeta[truncatedGrandparentItemName]`` receives the repr of the
    list of distinct second elements of those entries in
    ``obj.relationships`` whose first element equals one of the parent keys.
    Both entries are stored as plain list reprs so they share one format.

    Afterwards ``obj.directparents``, ``obj.files`` and ``obj.relationships``
    are each replaced by an empty dict. The object is modified in place and
    nothing is returned.
    """
    # Materialise the keys: repr() of a dict view is "dict_keys([...])",
    # which would not match the list format of the grandparent entry.
    parent_keys = list(obj.directparents)
    obj.thismeta[truncatedParentItemName] = repr(parent_keys)

    # Equality (not hashing) is used to match the first element, so
    # relationship entries only need a hashable second element.
    grandparent_keys = {
        relation[1]
        for parent in parent_keys
        for relation in obj.relationships
        if relation[0] == parent
    }
    obj.thismeta[truncatedGrandparentItemName] = repr(list(grandparent_keys))

    obj.directparents = {}
    obj.files = {}
    obj.relationships = {}

901 

902 

def insertMetadataFilename(
    imgfile,
    parent_list,
    optional_dict,
    script=None,
    argv=None,
    strict=True,
):
    """
    Same as insertMetadataDataset but takes a filename rather than a dataset.

    The file is opened according to its type and the resulting dataset is
    handed to insertMetadataDataset together with the remaining arguments:

    * ``.las`` / ``.laz`` (any letter case), when laspy is importable: the
      file is read with laspy, the reader is closed, ``filename`` is set on
      the LasData object and the history is inserted. Returns True.
    * otherwise, with the "rasterio" backend: opened with
      ``rasterio.open(imgfile, "r+")``.
    * otherwise: opened with ``gdal.Open(..., GA_Update)``.

    The non-LAS paths return None.

    The argument strict is passed through to the getMandatoryFields function.

    imgfile can be string or Path

    NOTE(review): earlier documentation said XML files are detected here and
    routed to a separate routine; this function contains no such check.
    """
    # co-erce to Path
    imgfile = Path(imgfile)

    if has_laspy and imgfile.suffix.lower() in (".las", ".laz"):
        # Use laspy to handle LAS/LAZ files
        logger.debug(f"adding metadata to las file {imgfile.name}")
        with laspy.open(imgfile) as las_file:
            # do we really have to read all the data in?
            lasdata = las_file.read()
        # The reader is closed before the history is inserted, because
        # inserting it rewrites the file at ``lasdata.filename``.
        lasdata.filename = imgfile
        insertMetadataDataset(
            lasdata,
            parent_list,
            optional_dict,
            script,
            argv,
            strict=strict,
        )
        return True

    # opening is different depending on whether we are using
    # rasterio or not
    active_backend = get_backend()
    if active_backend == "rasterio":
        import rasterio

        with rasterio.open(imgfile, "r+") as dst:
            insertMetadataDataset(
                dst,
                parent_list,
                optional_dict,
                script,
                argv,
                strict=strict,
            )
    else:
        from osgeo import gdal, gdalconst

        ds = gdal.Open(str(imgfile), gdalconst.GA_Update)
        try:
            insertMetadataDataset(
                ds, parent_list, optional_dict, script, argv, strict=strict
            )
        finally:
            # Drop our reference even when insertion fails so GDAL can
            # flush and close the dataset.
            del ds

963 

964 

def fileIsXML(filename):
    """
    Check the beginning of a file and determine whether it appears to be an XML file or not.

    Returns True when the first five bytes of the file are ``<?xml``.
    Returns False for any other content (including files shorter than five
    bytes) and when the file does not exist. Other errors raised while
    opening or reading the file are not caught.
    """
    try:
        # Context manager guarantees the handle is closed straight away.
        with open(filename, "rb") as stream:
            magic_number = stream.read(5)
    except FileNotFoundError:
        return False
    # The file is read in binary mode, so only a bytes comparison can match.
    return magic_number == b"<?xml"

975 

976 

class MissingMandatoryLabels(Exception):
    """Signal that one or more mandatory fields are missing.

    Raise it with a single message string explaining why; the message
    should name the labels that are missing.

    The class adds nothing to ``Exception``: the message is available
    through ``str(exc)`` and ``exc.args``, as for any built-in exception.
    """