Coverage for src / eo_history / history.py: 67%
454 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-05-12 02:10 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-05-12 02:10 +0000
1"""
2Module for reading and writing ProcessingHistory objects from images
4This contains code for managing the processing history which
5we attach to every image file (and to some XML files).
8"""
10import base64
11import json
12import logging
13import os
14import pickle
15import sys
16import time
17import zlib
18from dataclasses import dataclass
19from importlib import metadata
20from pathlib import Path, PosixPath
22from .backends import get_backend
24try:
25 import laspy
27 has_laspy = True
28except ModuleNotFoundError:
29 has_laspy = False
32try:
33 import numpy as np
35 has_numpy = True
36except ModuleNotFoundError:
37 has_numpy = False
39from .container import (
40 get_build_singularity_labels,
41 get_container_build_info,
42 is_container,
43)
# Select the raster backend when the module is imported. GDAL is only
# imported, and switched to raising Python exceptions, when it is the
# backend reported by get_backend().
backend = get_backend()
if backend == "gdal":
    from osgeo import gdal

    gdal.UseExceptions()
@dataclass
class Config:
    """Settings that control how a processing history is serialised."""

    # When True, ProcessingHistory.toString() emits the legacy pickle
    # representation; when False it emits JSON.
    HISTORY_USEPICKLE: bool


# Module-wide default settings: serialise histories as JSON.
config = Config(HISTORY_USEPICKLE=False)
logger = logging.getLogger(__name__)

# NOTE: the module-level ``config`` object exists for historical reasons.

# Metadata item that holds the uncompressed history string.
metadataName = "SLATS_Metadata2"
# Metadata item that holds the zlib-compressed, base64-encoded history string.
zippedMetadataName = "SLATS_Metadata2_zipped"
# user_id and record_id of the LAS VLR that carries the compressed history.
las_metadata_name = "las_history"
history_record_id = 7

# These GDAL drivers need to use the metadata on the dataset object instead of the
# metadata on the band1 object. Neither seems to be supported by all drivers.
driversUsingDSmetadata = {"JPEG2000", "JP2OpenJPEG"}

# These GDAL drivers have limits on the size of metadata which can be stored, and
# so we need to keep below these, or we lose everything. The values are given in bytes.
# The GTiff limit is actually mysteriously complicated, but this value seems to
# cover it.
metadataSizeLimitsByDriver = {"GTiff": 28000, "LAS_VLR": 65535}

# Keys added to a node's own metadata by trimParents() to list the parent and
# grandparent files that were removed from an over-sized history.
truncatedParentItemName = "TRUNCATED_PARENT_FILES"
truncatedGrandparentItemName = "TRUNCATED_GRANDPARENT_FILES"
def read_metadata_from_file(path: "str | Path") -> "ProcessingHistory":
    """
    Read the processing history stored in the file at *path*.

    Dispatches on the value returned by get_backend(): the rasterio reader
    is used for ``"rasterio"``, the GDAL reader for anything else.

    Args:
        path: Location of the file. readTreeFromFilename() passes either a
            string or a Path object.

    Returns: the object produced by the selected reader (a
        ProcessingHistory).
    """
    if get_backend() == "rasterio":
        return _read_metadata_from_rasterio_filename(path)
    return _read_metadata_from_gdal_filename(path)
# Same dispatch as read_metadata_from_file(), but for an already-open dataset.
def read_metadata_from_object(dataset):
    """
    Read the processing history from an open dataset object.

    The rasterio reader is used when get_backend() returns ``"rasterio"``;
    otherwise the GDAL reader is used.
    """
    reader = (
        _read_metadata_from_rasterio
        if get_backend() == "rasterio"
        else _read_metadata_from_gdal
    )
    return reader(dataset)
def set_processing_history_object(dataset, obj):
    """
    Write the history *obj* into the open *dataset*.

    The rasterio writer is used when get_backend() returns ``"rasterio"``;
    otherwise the GDAL writer is used. The writer's return value is passed
    back unchanged.
    """
    writer = (
        _set_processing_history_rasterio
        if get_backend() == "rasterio"
        else _set_processing_history_gdal
    )
    return writer(dataset, obj)
class ProcessingHistory:
    """
    Class that stores the 'processing history' information for a file.
    This is the 'new' implementation that stores the information for the
    files and the file relationships separately.
    """

    def __init__(self):
        # Ancestor files that are not direct parents.
        # A dictionary. Key is created by createKey(); stored against each
        # key is the 'meta' dictionary with all the info for the node.
        self.files = {}

        # The 'meta' dictionary for 'this' file
        self.thismeta = {}

        # Relationships between files, held as a dictionary whose keys are
        # (childkey, parentkey) tuples and whose values are always None, so
        # membership tests are cheap. It is written out as a list by
        # toString() and converted back by fromString().
        # NOTE: this excludes the direct parents which are stored
        # separately below.
        self.relationships = {}

        # A dictionary. Key is created by createKey(); stored against each
        # key is the 'meta' dictionary OF THE DIRECT PARENTS.
        self.directparents = {}

    def dump(self):
        """
        Prints the contents of this object out.
        """
        rule = "-------------------------------------"
        print("thismeta:", self.thismeta)
        print(rule)
        print("direct parents:")
        for key, meta in self.directparents.items():
            print(key, meta)
        print(rule)
        print("Files:")
        for key, meta in self.files.items():
            print(key, meta)
        print(rule)
        print("Relationships:")
        for child, parent in self.relationships:
            print(child, parent)
        print(rule)

    @staticmethod
    def _keyForNode(node):
        """
        Build the key for an old-style XML node from its name and, when
        present in its meta, its timestamp.
        """
        timestamp = node.meta["timestamp"] if "timestamp" in node.meta else None
        return ProcessingHistory.createKey(node.name, timestamp)

    @staticmethod
    def fromXMLTree(tree):
        """
        Turns an old style XML tree into a ProcessingHistory
        object and returns it
        """
        obj = ProcessingHistory()

        # meta for the head
        obj.thismeta = tree.head.meta

        for parent in tree.head.parents:
            # first add the direct parent
            obj.addDirectParent(ProcessingHistory._keyForNode(parent), parent.meta)
            # and the metadata for all their parents to the other lists;
            # traverseTree is handed this object and the parent node
            tree.traverseTree(obj, parent)

        return obj

    def addFile(self, key, meta):
        """
        Add a file and its metadata if not already recorded.
        Returns True if new file
        NOTE: should not be a direct parent - use addDirectParent instead
        """
        if key in self.files:
            return False
        self.files[key] = meta
        return True

    def addRelationship(self, newchildkey, newparentkey):
        """
        Add a relationship between a child file and parent file if not already recorded.
        Returns True if a new relationship.
        NOTE: should not be a direct parent - use addDirectParent instead
        """
        relationship = (newchildkey, newparentkey)
        if relationship in self.relationships:
            return False
        self.relationships[relationship] = None
        return True

    def addDirectParent(self, key, meta):
        """
        Stores meta against key in the dictionary of direct parents,
        replacing any entry already stored under that key.
        """
        self.directparents[key] = meta

    @staticmethod
    def createKey(name, timestamp=None):
        """
        Creates a key given a filename and a timestamp.
        If timestamp not specified the current local time is used.
        Carriage returns and newlines are stripped from both ends of
        each part; the parts are joined with a single space.
        """
        if timestamp is None:
            timestamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        # str() on both parts so a non-string timestamp does not raise
        return "%s %s" % (str(name).strip("\r\n"), str(timestamp).strip("\r\n"))

    def mergeHistory(self, key, history):
        """
        Takes another processing history obj and merges it in.
        It is assumed that the current object is the current file
        that has self.thismeta set and the object to be merged in
        is one of the direct parents. key is the key under which that
        parent is recorded.
        """
        # everything the parent knew about becomes a non-direct file here,
        # including the parent's own direct parents
        for keyh, meta in history.files.items():
            self.addFile(keyh, meta)
        for keyh, meta in history.directparents.items():
            self.addFile(keyh, meta)
        for child, parent in history.relationships:
            self.addRelationship(child, parent)

        self.addDirectParent(key, history.thismeta)
        # add its direct relationships as parents
        for parent in history.directparents:
            self.addRelationship(key, parent)

    def processNode(self, node):
        """
        Process a node from an old xml style history file
        (using xmlhistory.Tree.traverseTree)
        """
        key = self._keyForNode(node)
        self.addFile(key, node.meta)
        for parent in node.parents:
            self.addRelationship(key, self._keyForNode(parent))

    def convert_to_builtin_types(self, obj):
        """
        Recursively convert values that json.dumps cannot handle into
        native Python types. Dictionaries (keys and values), lists and
        tuples are walked recursively; Path objects become strings; when
        numpy is available, numpy booleans, integers, floats and arrays
        become bool, int, float and (nested) lists. Anything else is
        returned unchanged.

        KEEP IN MIND: THIS USES A RECURSIVE FUNCTION
        """
        convert = self.convert_to_builtin_types
        if isinstance(obj, dict):
            return {convert(k): convert(v) for k, v in obj.items()}
        if isinstance(obj, list):
            return [convert(elem) for elem in obj]
        if isinstance(obj, tuple):
            return tuple(convert(elem) for elem in obj)
        # Path covers PosixPath as well as WindowsPath
        if isinstance(obj, Path):
            return str(obj)
        if has_numpy:
            # np.bool_ is not a subclass of np.integer, so test it explicitly
            if isinstance(obj, np.bool_):
                return bool(obj)
            if isinstance(obj, np.integer):
                return int(obj)
            if isinstance(obj, np.floating):
                return float(obj)
            if isinstance(obj, np.ndarray):
                return obj.tolist()
        return obj

    def toString(self, config_ob=None):
        """
        Returns this instance in a form that can be stored and recreated
        as an object using fromString()

        Uses JSON for the representation of the relevant objects, and
        returns it as a str.

        If total backward compatibility is required, and the pickle format
        is desired, then pass (or set on the module-level config object)
        a configuration whose HISTORY_USEPICKLE is True. In that case the
        result is the bytes object returned by pickle.dumps(self, 0).

        config_ob defaults to the module-level config, looked up at call
        time.
        """
        if config_ob is None:
            config_ob = config

        if config_ob.HISTORY_USEPICKLE:
            return pickle.dumps(self, 0)

        # relationships is now a dictionary, but for backwards compatibility
        # store it as a list, as it used to be.
        relationships = self.relationships
        if isinstance(relationships, dict):
            relationships = list(relationships.keys())
        representationDict = {
            "files": self.files,
            "thismeta": self.thismeta,
            "relationships": relationships,
            "directparents": self.directparents,
        }
        return json.dumps(self.convert_to_builtin_types(representationDict))

    @staticmethod
    def fromString(s):
        """
        Creates and returns a new ProcessingHistory object from
        a string previously returned by toString()

        Will attempt to process as JSON first, and if that fails will attempt
        with pickle, for backward compatibility. (We used to use pickle format,
        but now use JSON).
        """
        try:
            representationDict = json.loads(s)
            isJSON = True
        except ValueError:
            isJSON = False

        if isJSON:
            obj = ProcessingHistory()
            obj.files = representationDict["files"]
            obj.thismeta = representationDict["thismeta"]
            obj.relationships = representationDict["relationships"]
            obj.directparents = representationDict["directparents"]
        else:
            # Assume pickle
            if hasattr(s, "encode"):
                s = s.encode("utf-8")
            # SECURITY NOTE: unpickling executes code chosen by whoever
            # produced the data. Only read histories from trusted files.
            obj = pickle.loads(s)

        # If relationships is a list, convert it to a dictionary for efficiency
        if isinstance(obj.relationships, list):
            obj.relationships = {tuple(t): None for t in obj.relationships}
        return obj
def readTreeFromDataset(dataset):
    """
    Reads the processing history out of an open dataset and returns a
    ProcessingHistory object.

    When laspy is available and dataset is a laspy.LasData or
    laspy.LasReader object:
    - The VLRs (Variable Length Records) of the LAS header are searched.
    - The first VLR whose user_id equals las_metadata_name is
      zlib-decompressed and turned into a ProcessingHistory object.
    - If there is no such VLR, a warning is logged and an empty
      ProcessingHistory is returned.

    Any other dataset is passed to read_metadata_from_object().
    """
    if has_laspy and isinstance(dataset, (laspy.LasData, laspy.LasReader)):
        for vlr in dataset.header.vlrs:
            if vlr.user_id != las_metadata_name:
                continue
            # the record holds the raw zlib stream (no base64 layer)
            s = zlib.decompress(vlr.record_data)
            if isinstance(s, bytes):
                s = s.decode("utf-8")
            return ProcessingHistory.fromString(s)

        # loop finished without finding a matching VLR
        logger.warning(
            "No '%s' VLR found in the LAS header; returning an empty "
            "processing history",
            las_metadata_name,
        )
        return ProcessingHistory()

    return read_metadata_from_object(dataset)
def readTreeFromFilename(imgfile):
    """
    Same as readTreeFromDataset() but takes a filename.
    imgfile is either a Path object or a string.

    Files whose suffix is .las or .laz (in any letter case) are opened with
    laspy when it is available; everything else is read through
    read_metadata_from_file().
    """
    imgfile = str(imgfile)

    # GDAL virtual filesystem paths (e.g. /vsicurl/, /vsis3/) cannot be
    # represented as Path objects — Path collapses the double-slash in URLs.
    # Skip the Path-based LAS check for these and keep them as strings.
    is_virtual = imgfile.startswith("/vsi")

    if not is_virtual:
        imgfile = Path(imgfile)
        # LAS/LAZ files can only be handled if laspy is installed
        if has_laspy and imgfile.suffix.lower() in (".las", ".laz"):
            with laspy.open(imgfile) as lasdata:
                return readTreeFromDataset(lasdata)

    return read_metadata_from_file(imgfile)
## rasterio versions
def _read_metadata_from_rasterio(dataset):
    """
    Build a ProcessingHistory from the tags of band 1 of an open rasterio
    dataset.

    If the zippedMetadataName tag is missing, an empty ProcessingHistory is
    returned. Otherwise the tag is base64-decoded, zlib-decompressed and
    parsed with ProcessingHistory.fromString().
    """
    tags = dataset.tags(1)
    if zippedMetadataName not in tags:
        # no metadata in this file - manufacture an empty object
        return ProcessingHistory()

    encoded = tags[zippedMetadataName]
    # The value may be wrapped as b'...' (presumably the text form of a
    # bytes object); drop the two leading characters and the trailing quote.
    if encoded.startswith("b'"):
        encoded = encoded[2:-1]
    decoded = zlib.decompress(base64.b64decode(encoded))
    return ProcessingHistory.fromString(decoded)
def _read_metadata_from_rasterio_filename(imgfile):
    """
    Open imgfile with rasterio and return the ProcessingHistory read from
    it by _read_metadata_from_rasterio(). The file is closed on return.
    """
    # imported here so rasterio is only required when this reader is used
    import rasterio

    with rasterio.open(imgfile) as src:
        return _read_metadata_from_rasterio(src)
## gdal versions
def _read_metadata_from_gdal(dataset):
    """
    Build a ProcessingHistory from the metadata of an open GDAL dataset.

    The metadata is taken from the dataset itself for drivers listed in
    driversUsingDSmetadata, and from band 1 for all other drivers. Items
    are tried in this order:
    - a non-empty metadataName item, parsed directly;
    - a zippedMetadataName item, base64-decoded and zlib-decompressed
      before parsing;
    - otherwise an empty ProcessingHistory is returned.

    The dataset is left open; closing it is up to the caller.
    """
    drvrName = dataset.GetDriver().ShortName
    if drvrName in driversUsingDSmetadata:
        meta = dataset.GetMetadata()
    else:
        band = dataset.GetRasterBand(1)
        meta = band.GetMetadata()

    if metadataName in meta and len(meta[metadataName]) > 0:
        # file has the processing history stored uncompressed
        return ProcessingHistory.fromString(bytes(meta[metadataName], "utf-8"))

    if zippedMetadataName in meta:
        sz = meta[zippedMetadataName]
        # The value may be wrapped as b'...' (presumably the text form of a
        # bytes object); drop the wrapper before decoding.
        if sz.startswith("b'"):
            sz = sz[2:-1]
        return ProcessingHistory.fromString(zlib.decompress(base64.b64decode(sz)))

    # no metadata in this file - manufacture an empty object
    return ProcessingHistory()
def _read_metadata_from_gdal_filename(imgfile):
    """
    Open imgfile read-only with GDAL and return the ProcessingHistory
    obtained from readTreeFromDataset().

    Raises:
        RuntimeError: if gdal.Open() returns None, which is how GDAL
            reports a failed open when its exceptions are not enabled.
    """
    # imported here so GDAL is only required when this reader is used
    from osgeo import gdal, gdalconst

    ds = gdal.Open(str(imgfile), gdalconst.GA_ReadOnly)
    if ds is None:
        raise RuntimeError(f"GDAL could not open {imgfile}")
    return readTreeFromDataset(ds)
## writing to datasets
def _set_processing_history_gdal(dataset, obj):
    """
    Write the processing history obj into an open GDAL dataset.

    The history is serialised with obj.toString(), zlib-compressed and
    base64-encoded, then stored under zippedMetadataName. If the driver has
    an entry in metadataSizeLimitsByDriver and the encoded history is
    longer than that limit, a trimmed copy (see trimParents()) is stored
    instead and warnings are logged; obj itself is not modified.

    The item is written to the dataset for drivers listed in
    driversUsingDSmetadata and to band 1 otherwise.
    """

    def _as_bytes(text):
        # toString() gives str for JSON but bytes in pickle mode
        return text if isinstance(text, bytes) else text.encode("utf-8")

    drvrName = dataset.GetDriver().ShortName
    objectString = _as_bytes(obj.toString())
    # Now compress it
    compressedString = base64.b64encode(zlib.compress(objectString, 9))

    sizeLimit = metadataSizeLimitsByDriver.get(drvrName)
    if sizeLimit is not None and len(compressedString) > sizeLimit:
        compressedStrLen = len(compressedString)
        # We need to trim all the parents off, so we have only a minimal
        # amount of history. Work on a copy so the caller's object is intact.
        objCopy = ProcessingHistory.fromString(objectString)
        trimParents(objCopy)
        compressedString = base64.b64encode(
            zlib.compress(_as_bytes(objCopy.toString()), 9)
        )
        # GetFileList() can be empty or None, so fall back to the description
        fileList = dataset.GetFileList()
        fileName = fileList[0] if fileList else dataset.GetDescription()
        logger.warning(
            f"file {fileName} compressed history is "
            f"{compressedStrLen} bytes"
        )
        logger.warning(f"Limit for {drvrName} driver is {sizeLimit} bytes")
        logger.warning(
            f"Truncating to minimal history ({len(compressedString)} bytes)."
        )

    # Originally we stored this on band1 (because that's all HFA supported),
    # but for some drivers we have to use the dataset instead.
    if drvrName in driversUsingDSmetadata:
        gdalObj = dataset
    else:
        gdalObj = dataset.GetRasterBand(1)

    # Now store the compressed string. Note that we always write the string in its compressed
    # form, as this will be robust for file format conversions with gdal_translate, e.g.
    # to convert to GTiff.
    meta = gdalObj.GetMetadata()
    meta[zippedMetadataName] = compressedString
    gdalObj.SetMetadata(meta)
    gdalObj.FlushCache()  # Save changes to the file
def _set_processing_history_rasterio(dataset, obj):
    """
    Write the processing history obj into the tags of band 1 of an open
    rasterio dataset.

    The history is serialised with obj.toString(), zlib-compressed and
    base64-encoded, then stored under zippedMetadataName as plain text. If
    the encoded history is longer than the GTiff entry of
    metadataSizeLimitsByDriver, a trimmed copy (see trimParents()) is stored
    instead and a warning is logged; obj itself is not modified.

    NOTE(review): the GTiff limit is applied whatever the dataset's driver
    is - confirm that this is intended.
    """

    def _as_bytes(text):
        # toString() gives str for JSON but bytes in pickle mode
        return text if isinstance(text, bytes) else text.encode("utf-8")

    objectString = _as_bytes(obj.toString())
    # Now compress it
    compressedString = base64.b64encode(zlib.compress(objectString, 9))

    if len(compressedString) > metadataSizeLimitsByDriver["GTiff"]:
        # We need to trim all the parents off, so we have only a minimal
        # amount of history. Work on a copy so the caller's object is intact.
        objCopy = ProcessingHistory.fromString(objectString)
        trimParents(objCopy)
        compressedString = base64.b64encode(
            zlib.compress(_as_bytes(objCopy.toString()), 9)
        )
        logger.warning(
            f"Truncating to minimal history ({len(compressedString)} bytes)."
        )

    # base64 output is pure ASCII; store it as text so the tag does not end
    # up wrapped as b'...'. The readers accept both forms.
    dataset.update_tags(1, **{zippedMetadataName: compressedString.decode("ascii")})
def _missing_container_items(buildinfo, mandatory_container_labels):
    """
    Return {item: labels} for every mandatory item for which none of its
    labels is present in buildinfo with a value other than the empty string.
    """
    missing = {}
    for item, labels in mandatory_container_labels.items():
        found = any(
            label in buildinfo and not buildinfo[label] == "" for label in labels
        )
        if not found:
            missing[item] = labels
    return missing


def getMandatoryFields(script, argv, strict=True):
    """
    Get the mandatory fields and return as a dictionary.

    Images with history require a minimum set of mandatory
    fields:
        * timestamp
        * login
        * uname_os
        * uname_host
        * uname_release
        * uname_version
        * uname_machine
        * cwd: the current working directory from where the script was
          called
        * script and script_dir
        * commandline: the script arguments

    A package_version_dict entry (a JSON string of package versions) is
    added when any versions were found, and python_version is added when
    not running in a container.

    Additionally, if the image was created in a container, the
    following are also mandatory:
        * one of org.label-schema.vcs-ref or org.opencontainers.image.revision:
          the git hash of the container build
        * one of org.label-schema.vcs-url, org.opencontainers.image.url,
          vcs-url: the url of the git repository

    If strict is False, then container labels are considered optional.

    Args:
        script (str): The script which produces the image.
            If script is None, then assumes it is sys.argv[0]
        argv: The arguments accompanying the script, stored as given.
            If argv is None, sys.argv[1:] joined with spaces is stored.
        strict (bool): Container labels considered mandatory.
            This argument will be ignored if not running in a
            container.

    Returns: (dict): A dictionary with the mandatory field
        as keys, and the associated values.

    Raises:
        MissingMandatoryLabels: when running in a container with strict
            set, the container metadata has a "buildinfo" entry, and a
            mandatory item has no non-empty label in it.
    """
    dictn = {}

    dictn["timestamp"] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    # see http://docs.python.org/lib/os-procinfo.html#l2h-2578
    login = os.getenv("LOGNAME", "unknown")
    if login == "unknown":
        login = os.getenv("USER", "unknown")
    dictn["login"] = login

    uname = os.uname()
    dictn["uname_os"] = uname[0]
    dictn["uname_host"] = uname[1]
    dictn["uname_release"] = uname[2]
    dictn["uname_version"] = uname[3]
    dictn["uname_machine"] = uname[4]
    dictn["cwd"] = os.getcwd()

    # now check for container info
    if is_container():
        logger.debug("running in a container")
        # mapping of history item to the labels that can supply it
        mandatory_container_labels = {
            "repo_url": [
                "org.opencontainers.image.url",
                "org.label-schema.vcs-url",
                "vcs-url",
            ],
            "container_version": [
                "org.opencontainers.image.revision",
                "org.label-schema.vcs-ref",
            ],
        }
        try:
            container_metadata = get_container_build_info()
        except FileNotFoundError:
            logger.warning(
                "could not find build metadata file /etc/rsc_build_info.json",
                exc_info=1,
            )
            logger.warning("Trying for a singularity recipe")
            try:
                container_metadata = get_build_singularity_labels()
            except FileNotFoundError:
                logger.warning("could not find singularity metadata", exc_info=1)
                container_metadata = {}
                logger.warning("no container metadata was found")

        # now check for missing labels
        # NOTE(review): when there is no "buildinfo" entry (for example when
        # no metadata was found at all) nothing is checked and nothing is
        # raised, even with strict set - confirm that this is intended.
        missing = {}
        if "buildinfo" in container_metadata:
            missing = _missing_container_items(
                container_metadata["buildinfo"], mandatory_container_labels
            )
            for mandatory_item in missing:
                logger.warning(
                    "The container is missing the mandatory metadata for {}".format(
                        mandatory_item
                    )
                )

        if missing:
            if strict:
                # name every missing item together with the labels that
                # would satisfy it
                message = "; ".join(
                    "The container is missing the mandatory metadata for {}. "
                    "A container requires one of the following {}".format(
                        mandatory_item, ", ".join(container_labels)
                    )
                    for mandatory_item, container_labels in missing.items()
                )
                raise MissingMandatoryLabels(message)
            logger.warning("Mandatory container label restriction over-ridden")
        # we add what we have
        dictn.update(container_metadata)
    else:
        dictn["python_version"] = "{}.{}.{}".format(*sys.version_info)

    if argv is None:
        argv = " ".join(sys.argv[1:])
    dictn["commandline"] = argv

    if script is None:
        script = sys.argv[0]
    dictn["script"] = os.path.basename(script)
    dictn["script_dir"] = os.path.dirname(script)

    # Find version numbers of any external imported modules (if possible)
    # slightly different if we are in a container, since in that case its
    # actually not very informative
    if is_container():
        # every installed distribution, by name
        moduleVersionDict = {
            dist.metadata["Name"]: dist.metadata["Version"]
            for dist in metadata.distributions()
        }
    else:
        # top-level packages of the imported modules whose file does not
        # live under sys.prefix
        moduleVersionDict = {}
        for modname in list(sys.modules.keys()):
            modfile = getattr(sys.modules[modname], "__file__", None)
            if not modfile or modfile.startswith(sys.prefix):
                continue
            toplevelModname = modname.split(".")[0]
            if toplevelModname in sys.modules:
                moduleVersionDict[toplevelModname] = "Unknown"
        for modname in moduleVersionDict:
            if hasattr(sys.modules[modname], "__version__"):
                moduleVersionDict[modname] = str(sys.modules[modname].__version__)

    if len(moduleVersionDict) > 0:
        dictn["package_version_dict"] = json.dumps(moduleVersionDict)

    return dictn
def insertHistory(
    name,
    parent_list,
    optional_dict,
    script=None,
    argv=None,
    strict=True,
):
    """
    Build and return a ProcessingHistory for a new node.

    The node's own metadata is the result of getMandatoryFields(), to which
    every entry of optional_dict is added (optional entries win on a key
    clash). The history of each file named in parent_list is then read and
    merged in as a direct parent.

    When calling from a Python script, leave script as None and it is
    taken from the current environment. strict is passed through to
    getMandatoryFields(). name is not used; it is kept for historical
    reasons.
    """
    history = ProcessingHistory()
    history.thismeta = getMandatoryFields(script, argv, strict=strict)

    # optional fields
    for field, value in optional_dict.items():
        history.thismeta[field] = value

    for parent in parent_list:
        parent_history = readTreeFromFilename(parent)
        parent_meta = parent_history.thismeta
        # a parent without a timestamp gets a key based on the current time
        parent_stamp = parent_meta["timestamp"] if "timestamp" in parent_meta else None
        history.mergeHistory(history.createKey(parent, parent_stamp), parent_history)

    return history
def insertMetadataDataset(
    dataset,
    parent_list,
    optional_dict,
    script=None,
    argv=None,
    strict=True,
):
    """
    Create the history for an open dataset and write it into that dataset.

    dataset must be writable (for GDAL: opened with GA_Update, or made with
    Create()). The history is built by insertHistory() from the mandatory
    metadata, optional_dict and the files named in parent_list, and is
    stored with setProcessingHistoryDataset().

    When calling from a Python script, leave script as None and it is
    taken from the current environment. strict is passed through to
    getMandatoryFields().
    """
    name = None
    if has_laspy and isinstance(dataset, laspy.LasData):
        if hasattr(dataset, "filename"):
            name = Path(dataset.filename).name
        else:
            name = "Unknown"

    # A LAS dataset has a name by now. Anything else is an image dataset
    # (or a LAS object without laspy, which will fail later on).
    if not name:
        try:
            name = os.path.basename(dataset.GetDescription())
        except AttributeError:
            # no GetDescription(): treat it as a rasterio object
            name = dataset.name

    history = insertHistory(
        name, parent_list, optional_dict, script, argv, strict=strict
    )
    setProcessingHistoryDataset(dataset, history)
def setProcessingHistoryDataset(dataset, obj):
    """
    Write the processing history into a GDAL/rasterio dataset or a LAS file.

    LAS (dataset is a laspy.LasData and laspy is available):
        The history is serialised with obj.toString() and zlib-compressed
        (no base64 layer). If the result is longer than the LAS_VLR entry
        of metadataSizeLimitsByDriver, a trimmed copy (see trimParents())
        is compressed instead; obj itself is not modified. The bytes are
        stored in a VLR whose user_id is las_metadata_name, replacing the
        first existing VLR with that user_id or else appended. The dataset
        is then written to dataset.filename, so that attribute must have
        been set by the caller. Returns True.

    Anything else is passed to set_processing_history_object(), and None
    is returned.

    Note that the two storage forms differ: LAS keeps the raw compressed
    bytes in a VLR, while the raster writers keep base64 text in the
    dataset or band metadata.
    """

    def _as_bytes(text):
        # toString() gives str for JSON but bytes in pickle mode
        return text if isinstance(text, bytes) else text.encode("utf-8")

    if has_laspy and isinstance(dataset, laspy.LasData):
        objectBytes = _as_bytes(obj.toString())

        # Compress the metadata
        compressedString = zlib.compress(objectBytes, 9)

        # Check if the compressed metadata exceeds the LAS VLR size limit
        if len(compressedString) > metadataSizeLimitsByDriver["LAS_VLR"]:
            # trim a copy so the caller's object is left intact
            objCopy = ProcessingHistory.fromString(objectBytes)
            trimParents(objCopy)
            compressedString = zlib.compress(_as_bytes(objCopy.toString()), 9)

        # Look for an existing history VLR to replace
        existing_vlr_index = next(
            (
                idx
                for idx, vlr in enumerate(dataset.vlrs)
                if vlr.user_id == las_metadata_name
            ),
            None,
        )

        history_vlr = laspy.VLR(
            user_id=las_metadata_name,
            record_id=history_record_id,
            record_data=compressedString,
            description="VLR Record contains compressed processing history",
        )

        if existing_vlr_index is not None:
            logger.debug(f"existing index at {existing_vlr_index}")
            dataset.vlrs[existing_vlr_index] = history_vlr
        else:
            logger.debug("appending history to the vlrs")
            dataset.vlrs.append(history_vlr)

        # Save the updated LAS file
        logger.debug(f"writing updated file {dataset.filename}")
        dataset.write(dataset.filename)
        dataset.update_header()
        return True

    set_processing_history_object(dataset, obj)
def trimParents(obj):
    """
    Trim off the parent nodes, and instead list them as an entry in the
    fields of this node. There are separate entries for parent and grandparent files.
    Each entry is the repr() of a list of keys (file name plus timestamp,
    as made by createKey()).

    Grandparents are the parents recorded in obj.relationships for any of
    the direct parents; each is listed once, in the order first met.
    obj is modified in place: its directparents, files and relationships
    are emptied.
    """
    # list() so the stored text is "[...]" and not "dict_keys([...])"
    parentkeys = list(obj.directparents)
    obj.thismeta[truncatedParentItemName] = repr(parentkeys)

    parentset = set(parentkeys)
    # dict.fromkeys de-duplicates while keeping a stable order
    grandparentkeys = dict.fromkeys(
        rel[1] for rel in obj.relationships if rel[0] in parentset
    )
    obj.thismeta[truncatedGrandparentItemName] = repr(list(grandparentkeys))

    obj.directparents = {}
    obj.files = {}
    obj.relationships = {}
def insertMetadataFilename(
    imgfile,
    parent_list,
    optional_dict,
    script=None,
    argv=None,
    strict=True,
):
    """
    Same as insertMetadataDataset but takes a filename rather than a dataset.
    The argument strict is passed through to the getMandatoryFields function.

    imgfile can be string or Path.

    Files whose suffix is .las or .laz (in any letter case) are handled
    with laspy when it is available, and True is returned for them.
    Everything else is opened for update with rasterio or GDAL, depending
    on get_backend(), and None is returned.

    NOTE(review): imgfile is always converted to a Path, which collapses
    the double slash of URLs inside GDAL /vsi paths - confirm that such
    paths are never passed here.
    """
    # co-erce to Path
    imgfile = Path(imgfile)

    if has_laspy and imgfile.suffix.lower() in (".las", ".laz"):
        # Use laspy to handle LAS/LAZ files
        logger.debug(f"adding metadata to las file {imgfile.name}")
        with laspy.open(imgfile) as las_file:
            # the whole file is read in, because it is written back out
            lasdata = las_file.read()
        # the reader is closed before the file is rewritten
        lasdata.filename = imgfile
        insertMetadataDataset(
            lasdata,
            parent_list,
            optional_dict,
            script,
            argv,
            strict=strict,
        )
        return True

    # opening is different depending on whether we are using
    # rasterio or not
    if get_backend() == "rasterio":
        import rasterio

        with rasterio.open(imgfile, "r+") as dst:
            insertMetadataDataset(
                dst,
                parent_list,
                optional_dict,
                script,
                argv,
                strict=strict,
            )
    else:
        from osgeo import gdal, gdalconst

        ds = gdal.Open(str(imgfile), gdalconst.GA_Update)
        insertMetadataDataset(
            ds, parent_list, optional_dict, script, argv, strict=strict
        )
        # drop our reference to the dataset
        del ds
def fileIsXML(filename):
    """
    Check the beginning of a file and determine whether it appears to be an XML file or not.

    Returns True when the first five bytes of the file are ``<?xml``, and
    False otherwise or when the file does not exist.
    """
    try:
        # the with-block closes the file even if the read fails
        with open(filename, "rb") as f:
            return f.read(5) == b"<?xml"
    except FileNotFoundError:
        return False
class MissingMandatoryLabels(Exception):
    """Exception raised if any mandatory fields are missing.

    Args:
        msg (str): A string describing the reason the exception
            was raised. Should include the names of the missing
            labels.

    Attributes:
        msg (str): A string describing the reason the exception
            was raised. Empty when no argument was given.
    """

    def __init__(self, *args):
        # args are handed to Exception untouched, so str() and .args
        # behave exactly as for a plain Exception subclass
        super().__init__(*args)
        self.msg = args[0] if args else ""