docs for zanj v0.6.0
View Source on GitHub

zanj.loading


  1from __future__ import annotations
  2
  3import json
  4import threading
  5import typing
  6import zipfile
  7from dataclasses import dataclass
  8from pathlib import Path
  9from typing import Any, Callable
 10
 11import numpy as np
 12
 13try:
 14    import pandas as pd  # type: ignore[import]
 15
 16    pandas_DataFrame = pd.DataFrame  # type: ignore[no-redef]
 17except ImportError:
 18
 19    class pandas_DataFrame:  # type: ignore[no-redef]
 20        def __init__(self, *args, **kwargs):
 21            raise ImportError("cannot load pandas DataFrame, pandas is not installed")
 22
 23
 24try:
 25    import polars as pl  # type: ignore[import]
 26
 27    polars_DataFrame = pl.DataFrame  # type: ignore[no-redef]
 28except ImportError:
 29
 30    class polars_DataFrame:  # type: ignore[no-redef]
 31        def __init__(self, *args, **kwargs):
 32            raise ImportError("cannot load polars DataFrame, polars is not installed")
 33
 34
 35from muutils.errormode import ErrorMode
 36from muutils.json_serialize.array import load_array
 37from muutils.json_serialize.json_serialize import ObjectPath
 38
 39from zanj.consts import (
 40    JSONdict,
 41    JSONitem,
 42    _FORMAT_KEY,
 43    _REF_KEY,
 44    safe_getsource,
 45    string_as_lines,
 46)
 47
 48from zanj.externals import (
 49    GET_EXTERNAL_LOAD_FUNC,
 50    ZANJ_MAIN,
 51    ZANJ_META,
 52    ExternalItem,
 53    _ZANJ_pre,
 54)
 55
 56# pylint: disable=protected-access, dangerous-default-value
 57
 58
 59def _populate_externals_error_checking(key, item) -> bool:
 60    """checks that the key is valid for the item. returns "True" we need to augment the path by accessing the "data" element"""
 61
 62    # special case for not fully loaded external item which we still need to populate
 63    if isinstance(item, typing.Mapping):
 64        if (_FORMAT_KEY in item) and item[_FORMAT_KEY].endswith(":external"):
 65            if "data" in item:
 66                return True
 67            else:
 68                raise KeyError(
 69                    f"expected an external item, but could not find data: {list(item.keys())}",
 70                    f"{item[_FORMAT_KEY]}, {len(item) = }, {item.get('data', '<EMPTY>') = }",
 71                )
 72
 73    # if it's a list, make sure the key is an int and that it's in range
 74    if isinstance(item, typing.Sequence):
 75        if not isinstance(key, int):
 76            raise TypeError(f"improper type: '{type(key) = }', expected int")
 77        if key >= len(item):
 78            raise IndexError(f"index out of range: '{key = }', expected < {len(item)}")
 79
 80    # if it's a dict, make sure that the key is a str and that it's in the dict
 81    elif isinstance(item, typing.Mapping):
 82        if not isinstance(key, str):
 83            raise TypeError(f"improper type: '{type(key) = }', expected str")
 84        if key not in item:
 85            raise KeyError(f"key not in dict: '{key = }', expected in {item.keys()}")
 86
 87    # otherwise, raise an error
 88    else:
 89        raise TypeError(f"improper type: '{type(item) = }', expected dict or list")
 90
 91    return False
 92
 93
 94@dataclass
 95class LoaderHandler:
 96    """handler for loading an object from a json file or a ZANJ archive"""
 97
 98    # TODO: add a separate "asserts" function?
 99    # right now, any asserts must happen in `check` or `load` which is annoying with lambdas
100
101    # (json_data, path) -> whether to use this handler
102    check: Callable[[JSONitem, ObjectPath, _ZANJ_pre], bool]
103    # function to load the object (json_data, path) -> loaded_obj
104    load: Callable[[JSONitem, ObjectPath, _ZANJ_pre], Any]
105    # unique identifier for the handler, saved in __muutils_format__ field
106    uid: str
107    # source package of the handler -- note that this might be overridden by ZANJ
108    source_pckg: str
109    # priority of the handler, defaults are all 0
110    priority: int = 0
111    # description of the handler
112    desc: str = "(no description)"
113
114    def serialize(self) -> JSONdict:
115        """serialize the handler info"""
116        return {
117            # get the code and doc of the check function
118            "check": {
119                "code": safe_getsource(self.check),
120                "doc": string_as_lines(self.check.__doc__),
121            },
122            # get the code and doc of the load function
123            "load": {
124                "code": safe_getsource(self.load),
125                "doc": string_as_lines(self.load.__doc__),
126            },
127            # get the uid, source_pckg, priority, and desc
128            "uid": str(self.uid),
129            "source_pckg": str(self.source_pckg),
130            "priority": int(self.priority),
131            "desc": str(self.desc),
132        }
133
134    @classmethod
135    def from_formattedclass(cls, fc: type, priority: int = 0):
136        """create a loader from a class with `serialize`, `load` methods and `__muutils_format__` attribute"""
137        assert hasattr(fc, "serialize")
138        assert callable(fc.serialize)  # type: ignore
139        assert hasattr(fc, "load")
140        assert callable(fc.load)  # type: ignore
141        assert hasattr(fc, _FORMAT_KEY)
142        assert isinstance(fc.__muutils_format__, str)  # type: ignore
143
144        return cls(
145            check=lambda json_item, path=None, z=None: (  # type: ignore[misc]
146                json_item[_FORMAT_KEY] == fc.__muutils_format__  # type: ignore[attr-defined]
147            ),
148            load=lambda json_item, path=None, z=None: fc.load(json_item, path, z),  # type: ignore[misc]
149            uid=fc.__muutils_format__,  # type: ignore[attr-defined]
150            source_pckg=str(fc.__module__),
151            priority=priority,
152            desc=f"formatted class loader for {fc.__name__}",
153        )
154
155
156# TODO: how can we type hint this without actually importing torch?
def _torch_loaderhandler_load(
    json_item: JSONitem,
    path: ObjectPath,
    z: _ZANJ_pre | None = None,
):
    """load a torch tensor from a json item

    torch and the muutils dtype map are imported inside the function, so a failed import
    only surfaces when a tensor actually has to be loaded; it is re-raised as an
    `ImportError` that names the `path` and the offending `json_item`.
    """
    try:
        import torch  # type: ignore[import-not-found]
        from muutils.tensor_utils import TORCH_DTYPE_MAP
    except ImportError as import_exc:
        raise ImportError(
            f"could not import torch, which we need to load the object at {path = }: {json_item = }"
        ) from import_exc

    # json_item is JSONitem but load_array expects narrower types; runtime check is in LoaderHandler.check
    array_data = load_array(json_item)  # type: ignore[no-matching-overload, call-overload]
    # the serialized "dtype" entry selects the torch dtype through muutils' map
    tensor_dtype = TORCH_DTYPE_MAP[json_item["dtype"]]  # type: ignore[index, call-overload]
    return torch.tensor(array_data, dtype=tensor_dtype)
175
176
177# NOTE: there are type ignores on the loaders, since the type checking should be the responsibility of the check function
178
# lock guarding writes to `LOADER_MAP`; acquired by `register_loader_handler`
LOADER_MAP_LOCK = threading.Lock()

# registry of the built-in loader handlers, keyed by each handler's `uid`.
# every `check` below requires a mapping that contains `_FORMAT_KEY` and compares the format
# string with `startswith` rather than equality, so a format string that merely begins
# with the uid is accepted as well.
LOADER_MAP: dict[str, LoaderHandler] = {
    lh.uid: lh
    for lh in [
        # array external
        # rebuilds the array with `load_array` and casts it to the dtype named in the item
        LoaderHandler(
            check=lambda json_item, path=None, z=None: (  # type: ignore[misc]
                isinstance(json_item, typing.Mapping)
                and _FORMAT_KEY in json_item
                and json_item[_FORMAT_KEY].startswith("numpy.ndarray")
                # and json_item["data"].dtype.name == json_item["dtype"]
                # and tuple(json_item["data"].shape) == tuple(json_item["shape"])
            ),
            load=lambda json_item, path=None, z=None: np.array(  # type: ignore[misc]
                load_array(json_item), dtype=np.dtype(json_item["dtype"])
            ),
            uid="numpy.ndarray",
            source_pckg="zanj",
            desc="numpy.ndarray loader",
        ),
        # torch tensor: loading is delegated to `_torch_loaderhandler_load`, which imports torch itself
        LoaderHandler(
            check=lambda json_item, path=None, z=None: (  # type: ignore[misc]
                isinstance(json_item, typing.Mapping)
                and _FORMAT_KEY in json_item
                and json_item[_FORMAT_KEY].startswith("torch.Tensor")
                # and json_item["data"].dtype.name == json_item["dtype"]
                # and tuple(json_item["data"].shape) == tuple(json_item["shape"])
            ),
            load=_torch_loaderhandler_load,
            uid="torch.Tensor",
            source_pckg="zanj",
            desc="torch.Tensor loader",
        ),
        # pandas
        # additionally requires a sequence under "data"
        LoaderHandler(
            check=lambda json_item, path=None, z=None: (  # type: ignore[misc]
                isinstance(json_item, typing.Mapping)
                and _FORMAT_KEY in json_item
                and json_item[_FORMAT_KEY].startswith("pandas.DataFrame")
                and "data" in json_item
                and isinstance(json_item["data"], typing.Sequence)
            ),
            load=lambda json_item, path=None, z=None: (  # type: ignore[misc]
                pandas_DataFrame(json_item["data"])
                # if there is no data, load just the columns (this is for empty dataframes)
                if json_item["data"]
                else pandas_DataFrame(columns=json_item.get("columns"))
            ),
            uid="pandas.DataFrame",
            source_pckg="zanj",
            desc="pandas.DataFrame loader",
        ),
        # polars
        # additionally requires a sequence under "data"; when "data" is empty, the frame is built
        # from a schema that gives every name in "columns" the `str` type
        LoaderHandler(
            check=lambda json_item, path=None, z=None: (  # type: ignore[misc]
                isinstance(json_item, typing.Mapping)
                and _FORMAT_KEY in json_item
                and json_item[_FORMAT_KEY].startswith("polars.DataFrame")
                and "data" in json_item
                and isinstance(json_item["data"], typing.Sequence)
            ),
            load=lambda json_item, path=None, z=None: (  # type: ignore[misc]
                polars_DataFrame(json_item["data"])
                if json_item["data"]
                else polars_DataFrame(
                    schema={col: str for col in json_item.get("columns", [])}
                )
            ),
            uid="polars.DataFrame",
            source_pckg="zanj",
            desc="polars.DataFrame loader",
        ),
        # list/tuple external
        # `load_item_recursive` is defined further down in this module; the lambdas only look it up
        # when they are called. each element is loaded with the container's own `path`
        # (no element index is appended).
        LoaderHandler(
            check=lambda json_item, path=None, z=None: (  # type: ignore[misc]
                isinstance(json_item, typing.Mapping)
                and _FORMAT_KEY in json_item
                and json_item[_FORMAT_KEY].startswith("list")
                and "data" in json_item
                and isinstance(json_item["data"], typing.Sequence)
            ),
            load=lambda json_item, path=None, z=None: [  # type: ignore[misc, arg-type]
                load_item_recursive(x, path, z)  # type: ignore[arg-type]
                for x in json_item["data"]
            ],
            uid="list",
            source_pckg="zanj",
            desc="list loader, for externals",
        ),
        # same as the list loader, but the loaded elements are returned as a tuple
        LoaderHandler(
            check=lambda json_item, path=None, z=None: (  # type: ignore[misc]
                isinstance(json_item, typing.Mapping)
                and _FORMAT_KEY in json_item
                and json_item[_FORMAT_KEY].startswith("tuple")
                and "data" in json_item
                and isinstance(json_item["data"], typing.Sequence)
            ),
            load=lambda json_item, path=None, z=None: tuple(  # type: ignore[misc, arg-type]
                [load_item_recursive(x, path, z) for x in json_item["data"]]  # type: ignore[arg-type]
            ),
            uid="tuple",
            source_pckg="zanj",
            desc="tuple loader, for externals",
        ),
    ]
}
286
287
def register_loader_handler(handler: LoaderHandler):
    """register a custom loader handler

    the handler is stored in `LOADER_MAP` under its `uid` while holding `LOADER_MAP_LOCK`;
    a handler previously stored under the same `uid` is replaced
    """
    # both names are only read (the dict is mutated in place), so no `global` statement is needed
    with LOADER_MAP_LOCK:
        LOADER_MAP.update({handler.uid: handler})
293
294
def get_item_loader(
    json_item: JSONitem,
    path: ObjectPath,
    zanj: _ZANJ_pre | None = None,
    error_mode: ErrorMode = ErrorMode.WARN,
    # lh_map: dict[str, LoaderHandler] = LOADER_MAP,
) -> LoaderHandler | None:
    """get the loader for a json item

    lookup happens in two steps:

    1. if `json_item` is a mapping with a format key that exactly matches a registered `uid`,
       that handler is returned
    2. otherwise the registered handlers are tried in registration order and the first one
       whose `check` accepts the item is returned

    returns `None` if no handler applies. `error_mode` is accepted but not used here.

    # Raises:
     - `TypeError` : the item has a format key whose value is not a `str`
    """
    # check if we recognize the format
    if isinstance(json_item, typing.Mapping) and _FORMAT_KEY in json_item:
        if not isinstance(json_item[_FORMAT_KEY], str):  # type: ignore[index]
            raise TypeError(
                f"invalid __muutils_format__ type '{type(json_item[_FORMAT_KEY])}' in '{path=}': '{json_item[_FORMAT_KEY] = }'"  # type: ignore[index]
            )
        with LOADER_MAP_LOCK:
            exact_match: LoaderHandler | None = LOADER_MAP.get(json_item[_FORMAT_KEY])  # type: ignore[index]
        if exact_match is not None:
            return exact_match

    # if we dont recognize the format, try to find a loader that can handle it.
    # iterate over a snapshot taken under the lock: handlers can be registered while we are
    # looping, and iterating the live dict would then fail with
    # "dictionary changed size during iteration". the lock is released before any `check` runs.
    with LOADER_MAP_LOCK:
        candidates: list[LoaderHandler] = list(LOADER_MAP.values())

    for lh in candidates:
        if lh.check(json_item, path, zanj):
            return lh

    # if we still dont have a loader, return None
    return None
321
322
def _is_serializable_dataclass_item(obj: Any) -> bool:
    """whether `obj` is a mapping whose format key mentions `SerializableDataclass`"""
    return (
        isinstance(obj, typing.Mapping)
        and (_FORMAT_KEY in obj)
        and ("SerializableDataclass" in obj[_FORMAT_KEY])  # type: ignore[operator, index]
    )


def load_item_recursive(
    json_item: JSONitem,
    path: ObjectPath,
    zanj: _ZANJ_pre | None = None,
    error_mode: ErrorMode = ErrorMode.WARN,
    allow_not_loading: bool = True,
) -> Any:
    """recursively load `json_item`, using a registered `LoaderHandler` wherever one applies

    - if `get_item_loader` finds a handler, its `load` produces the result. for items whose
      format key mentions `SerializableDataclass`, the fields are loaded recursively first
      (fields which are themselves such items are passed through untouched)
    - otherwise dicts and lists are loaded element by element, with the key or index
      appended to `path`
    - `str`, `int`, `float`, `bool` and `None` are returned as they are
    - anything else is returned as-is when `allow_not_loading` is `True`

    `allow_not_loading` is forwarded to every recursive call made from this function, so
    `allow_not_loading=False` also rejects unknown objects nested inside containers.

    # Raises:
     - `ValueError` : an object of unknown type is found and `allow_not_loading` is `False`
    """
    lh: LoaderHandler | None = get_item_loader(
        json_item=json_item,
        path=path,
        zanj=zanj,
        error_mode=error_mode,
        # lh_map=lh_map,
    )

    if lh is not None:
        # special case for serializable dataclasses
        if _is_serializable_dataclass_item(json_item):
            # why this horribleness?
            # SerializableDataclass, if it has a field `x` which is also a SerializableDataclass, will automatically call `x.__class__.load()`
            # However, we need to load things in containers, as well as arrays
            processed_json_item: dict = {
                key: (
                    val
                    if _is_serializable_dataclass_item(val)
                    else load_item_recursive(
                        json_item=val,  # type: ignore[arg-type]
                        path=tuple(path) + (key,),  # type: ignore[arg-type]
                        zanj=zanj,
                        error_mode=error_mode,
                        allow_not_loading=allow_not_loading,
                    )
                )
                for key, val in json_item.items()  # type: ignore[union-attr]
            }

            return lh.load(processed_json_item, path, zanj)

        return lh.load(json_item, path, zanj)

    # no handler: walk plain containers, pass primitives through
    if isinstance(json_item, dict):
        return {
            key: load_item_recursive(
                json_item=val,  # type: ignore[arg-type]
                path=tuple(path) + (key,),
                zanj=zanj,
                error_mode=error_mode,
                allow_not_loading=allow_not_loading,
                # lh_map=lh_map,
            )
            for key, val in json_item.items()
        }

    if isinstance(json_item, list):
        return [
            load_item_recursive(
                json_item=x,
                path=tuple(path) + (i,),
                zanj=zanj,
                error_mode=error_mode,
                allow_not_loading=allow_not_loading,
                # lh_map=lh_map,
            )
            for i, x in enumerate(json_item)
        ]

    if isinstance(json_item, (str, int, float, bool, type(None))):
        return json_item

    # unknown object: either hand it back untouched or complain
    if allow_not_loading:
        return json_item

    raise ValueError(f"unknown type {type(json_item)} at {path}\n{json_item}")
403
404
def _each_item_in_externals(
    externals: dict[str, ExternalItem],
    json_data: JSONitem,
) -> typing.Iterable[tuple[str, ExternalItem, Any, ObjectPath]]:
    """yield `(ext_path, ext_item, item, path)` for each external, shortest object path first

    `item` is the element of `json_data` found by following `ext_item.path`.

    note that you MUST use the raw iterator, dont try to turn into a list or something.
    each path is only resolved when the generator reaches it, and resolving may step through
    the `"data"` element of an enclosing external item -- presumably the consumer is expected
    to have filled that in while handling the earlier (shorter-path) items.

    # Raises:
     - `KeyError` : an element of the path cannot be resolved (chained from the underlying `KeyError`, `IndexError` or `TypeError`)
    """

    sorted_externals: list[tuple[str, ExternalItem]] = sorted(
        externals.items(), key=lambda x: len(x[1].path)
    )

    for ext_path, ext_item in sorted_externals:
        # get the path to the item
        path: ObjectPath = tuple(ext_item.path)
        assert len(path) > 0
        assert all(isinstance(key, (str, int)) for key in path), (
            f"improper types in path {path=}"
        )
        # get the item
        item = json_data
        for i, key in enumerate(path):
            try:
                # ignores in this block are because we cannot know the type is indexable in static analysis
                # but, we check the types in the line below
                external_unloaded: bool = _populate_externals_error_checking(key, item)
                if external_unloaded:
                    item = item["data"]  # type: ignore
                item = item[key]  # type: ignore[index]

            except (KeyError, IndexError, TypeError) as e:
                # gather the diagnostics defensively: `item` may be something without a length
                # (e.g. a number reached through a bad path), and calling `len` on it here would
                # raise a new `TypeError` that hides the error we are trying to report
                item_len = len(item) if isinstance(item, typing.Sized) else None
                item_keys = item.keys() if isinstance(item, dict) else None
                raise KeyError(
                    f"could not find '{key = }' at path '{ext_path = }', specifically at index '{i = }'",
                    f"'{type(item) =}', 'len(item) = {item_len!r}', 'item.keys() if isinstance(item, dict) else None = {item_keys!r}'",
                    f"From error: {e = }",
                    f"\n\n{item=}\n\n{ext_item=}",
                ) from e

        yield (ext_path, ext_item, item, path)
442
443
class LoadedZANJ:
    """for loading a zanj file

    on construction, reads the metadata, the main json data and all external items out of
    the archive at `path`. `populate_externals` then places the external data into the json data.
    """

    def __init__(
        self,
        path: str | Path,
        zanj: _ZANJ_pre,
    ) -> None:
        """read the archive at `path`; `zanj` is stored as `self._zanj`

        the zip file and every member stream are opened with context managers, so they are
        closed even if reading or parsing fails part-way through
        """
        # path and zanj object
        self._path: str = str(path)
        self._zanj: _ZANJ_pre = zanj

        # load zip file
        with zipfile.ZipFile(file=self._path, mode="r") as zipf:
            # load data
            with zipf.open(ZANJ_META, "r") as fp_meta:
                self._meta: JSONdict = json.load(fp_meta)
            with zipf.open(ZANJ_MAIN, "r") as fp_main:
                self._json_data: JSONitem = json.load(fp_main)

            # read externals
            self._externals: dict[str, ExternalItem] = dict()
            for fname, ext_item in self._meta["externals_info"].items():  # type: ignore
                item_type: str = ext_item["item_type"]  # type: ignore
                with zipf.open(fname, "r") as fp:
                    self._externals[fname] = ExternalItem(
                        item_type=item_type,  # type: ignore[arg-type]
                        # the load function for this item type receives this (partially
                        # constructed) object and the open member stream
                        data=GET_EXTERNAL_LOAD_FUNC(item_type)(self, fp),
                        path=ext_item["path"],  # type: ignore
                    )

    def populate_externals(self) -> None:
        """put all external items into the main json data

        for each external, the element at its path must hold a reference (`_REF_KEY`) equal to
        the external's file name; the external's data is stored under that element's `"data"` key
        """

        # loop over once, populating the externals only
        for ext_path, ext_item, item, _path in _each_item_in_externals(
            self._externals, self._json_data
        ):
            # replace the item with the external item
            assert _REF_KEY in item  # type: ignore
            assert item[_REF_KEY] == ext_path  # type: ignore
            item["data"] = ext_item.data  # type: ignore

@dataclass
class LoaderHandler:
 95@dataclass
 96class LoaderHandler:
 97    """handler for loading an object from a json file or a ZANJ archive"""
 98
 99    # TODO: add a separate "asserts" function?
100    # right now, any asserts must happen in `check` or `load` which is annoying with lambdas
101
102    # (json_data, path) -> whether to use this handler
103    check: Callable[[JSONitem, ObjectPath, _ZANJ_pre], bool]
104    # function to load the object (json_data, path) -> loaded_obj
105    load: Callable[[JSONitem, ObjectPath, _ZANJ_pre], Any]
106    # unique identifier for the handler, saved in __muutils_format__ field
107    uid: str
108    # source package of the handler -- note that this might be overridden by ZANJ
109    source_pckg: str
110    # priority of the handler, defaults are all 0
111    priority: int = 0
112    # description of the handler
113    desc: str = "(no description)"
114
115    def serialize(self) -> JSONdict:
116        """serialize the handler info"""
117        return {
118            # get the code and doc of the check function
119            "check": {
120                "code": safe_getsource(self.check),
121                "doc": string_as_lines(self.check.__doc__),
122            },
123            # get the code and doc of the load function
124            "load": {
125                "code": safe_getsource(self.load),
126                "doc": string_as_lines(self.load.__doc__),
127            },
128            # get the uid, source_pckg, priority, and desc
129            "uid": str(self.uid),
130            "source_pckg": str(self.source_pckg),
131            "priority": int(self.priority),
132            "desc": str(self.desc),
133        }
134
135    @classmethod
136    def from_formattedclass(cls, fc: type, priority: int = 0):
137        """create a loader from a class with `serialize`, `load` methods and `__muutils_format__` attribute"""
138        assert hasattr(fc, "serialize")
139        assert callable(fc.serialize)  # type: ignore
140        assert hasattr(fc, "load")
141        assert callable(fc.load)  # type: ignore
142        assert hasattr(fc, _FORMAT_KEY)
143        assert isinstance(fc.__muutils_format__, str)  # type: ignore
144
145        return cls(
146            check=lambda json_item, path=None, z=None: (  # type: ignore[misc]
147                json_item[_FORMAT_KEY] == fc.__muutils_format__  # type: ignore[attr-defined]
148            ),
149            load=lambda json_item, path=None, z=None: fc.load(json_item, path, z),  # type: ignore[misc]
150            uid=fc.__muutils_format__,  # type: ignore[attr-defined]
151            source_pckg=str(fc.__module__),
152            priority=priority,
153            desc=f"formatted class loader for {fc.__name__}",
154        )

handler for loading an object from a json file or a ZANJ archive

LoaderHandler( check: Callable[[Union[bool, int, float, str, NoneType, Sequence[Union[bool, int, float, str, NoneType, Sequence[ForwardRef('JSONitem')], Dict[str, ForwardRef('JSONitem')]]], Dict[str, Union[bool, int, float, str, NoneType, Sequence[ForwardRef('JSONitem')], Dict[str, ForwardRef('JSONitem')]]]], tuple[Union[str, int], ...], Any], bool], load: Callable[[Union[bool, int, float, str, NoneType, Sequence[Union[bool, int, float, str, NoneType, Sequence[ForwardRef('JSONitem')], Dict[str, ForwardRef('JSONitem')]]], Dict[str, Union[bool, int, float, str, NoneType, Sequence[ForwardRef('JSONitem')], Dict[str, ForwardRef('JSONitem')]]]], tuple[Union[str, int], ...], Any], Any], uid: str, source_pckg: str, priority: int = 0, desc: str = '(no description)')
check: Callable[[Union[bool, int, float, str, NoneType, Sequence[Union[bool, int, float, str, NoneType, Sequence[ForwardRef('JSONitem')], Dict[str, ForwardRef('JSONitem')]]], Dict[str, Union[bool, int, float, str, NoneType, Sequence[ForwardRef('JSONitem')], Dict[str, ForwardRef('JSONitem')]]]], tuple[Union[str, int], ...], Any], bool]
load: Callable[[Union[bool, int, float, str, NoneType, Sequence[Union[bool, int, float, str, NoneType, Sequence[ForwardRef('JSONitem')], Dict[str, ForwardRef('JSONitem')]]], Dict[str, Union[bool, int, float, str, NoneType, Sequence[ForwardRef('JSONitem')], Dict[str, ForwardRef('JSONitem')]]]], tuple[Union[str, int], ...], Any], Any]
uid: str
source_pckg: str
priority: int = 0
desc: str = '(no description)'
def serialize( self) -> Dict[str, Union[bool, int, float, str, NoneType, Sequence[Union[bool, int, float, str, NoneType, Sequence[ForwardRef('JSONitem')], Dict[str, ForwardRef('JSONitem')]]], Dict[str, Union[bool, int, float, str, NoneType, Sequence[ForwardRef('JSONitem')], Dict[str, ForwardRef('JSONitem')]]]]]:
115    def serialize(self) -> JSONdict:
116        """serialize the handler info"""
117        return {
118            # get the code and doc of the check function
119            "check": {
120                "code": safe_getsource(self.check),
121                "doc": string_as_lines(self.check.__doc__),
122            },
123            # get the code and doc of the load function
124            "load": {
125                "code": safe_getsource(self.load),
126                "doc": string_as_lines(self.load.__doc__),
127            },
128            # get the uid, source_pckg, priority, and desc
129            "uid": str(self.uid),
130            "source_pckg": str(self.source_pckg),
131            "priority": int(self.priority),
132            "desc": str(self.desc),
133        }

serialize the handler info

@classmethod
def from_formattedclass(cls, fc: type, priority: int = 0):
135    @classmethod
136    def from_formattedclass(cls, fc: type, priority: int = 0):
137        """create a loader from a class with `serialize`, `load` methods and `__muutils_format__` attribute"""
138        assert hasattr(fc, "serialize")
139        assert callable(fc.serialize)  # type: ignore
140        assert hasattr(fc, "load")
141        assert callable(fc.load)  # type: ignore
142        assert hasattr(fc, _FORMAT_KEY)
143        assert isinstance(fc.__muutils_format__, str)  # type: ignore
144
145        return cls(
146            check=lambda json_item, path=None, z=None: (  # type: ignore[misc]
147                json_item[_FORMAT_KEY] == fc.__muutils_format__  # type: ignore[attr-defined]
148            ),
149            load=lambda json_item, path=None, z=None: fc.load(json_item, path, z),  # type: ignore[misc]
150            uid=fc.__muutils_format__,  # type: ignore[attr-defined]
151            source_pckg=str(fc.__module__),
152            priority=priority,
153            desc=f"formatted class loader for {fc.__name__}",
154        )

create a loader from a class with serialize, load methods and __muutils_format__ attribute

LOADER_MAP_LOCK = <unlocked _thread.lock object>
LOADER_MAP: dict[str, LoaderHandler] = {'numpy.ndarray': LoaderHandler(check=<function <lambda>>, load=<function <lambda>>, uid='numpy.ndarray', source_pckg='zanj', priority=0, desc='numpy.ndarray loader'), 'torch.Tensor': LoaderHandler(check=<function <lambda>>, load=<function _torch_loaderhandler_load>, uid='torch.Tensor', source_pckg='zanj', priority=0, desc='torch.Tensor loader'), 'pandas.DataFrame': LoaderHandler(check=<function <lambda>>, load=<function <lambda>>, uid='pandas.DataFrame', source_pckg='zanj', priority=0, desc='pandas.DataFrame loader'), 'polars.DataFrame': LoaderHandler(check=<function <lambda>>, load=<function <lambda>>, uid='polars.DataFrame', source_pckg='zanj', priority=0, desc='polars.DataFrame loader'), 'list': LoaderHandler(check=<function <lambda>>, load=<function <lambda>>, uid='list', source_pckg='zanj', priority=0, desc='list loader, for externals'), 'tuple': LoaderHandler(check=<function <lambda>>, load=<function <lambda>>, uid='tuple', source_pckg='zanj', priority=0, desc='tuple loader, for externals')}
def register_loader_handler(handler: LoaderHandler):
289def register_loader_handler(handler: LoaderHandler):
290    """register a custom loader handler"""
291    global LOADER_MAP, LOADER_MAP_LOCK
292    with LOADER_MAP_LOCK:
293        LOADER_MAP[handler.uid] = handler

register a custom loader handler

def get_item_loader( json_item: Union[bool, int, float, str, NoneType, Sequence[ForwardRef('JSONitem')], Dict[str, ForwardRef('JSONitem')]], path: tuple[typing.Union[str, int], ...], zanj: typing.Any | None = None, error_mode: muutils.errormode.ErrorMode = ErrorMode.Warn) -> LoaderHandler | None:
296def get_item_loader(
297    json_item: JSONitem,
298    path: ObjectPath,
299    zanj: _ZANJ_pre | None = None,
300    error_mode: ErrorMode = ErrorMode.WARN,
301    # lh_map: dict[str, LoaderHandler] = LOADER_MAP,
302) -> LoaderHandler | None:
303    """get the loader for a json item"""
304    global LOADER_MAP
305
306    # check if we recognize the format
307    if isinstance(json_item, typing.Mapping) and _FORMAT_KEY in json_item:
308        if not isinstance(json_item[_FORMAT_KEY], str):  # type: ignore[index]
309            raise TypeError(
310                f"invalid __muutils_format__ type '{type(json_item[_FORMAT_KEY])}' in '{path=}': '{json_item[_FORMAT_KEY] = }'"  # type: ignore[index]
311            )
312        if json_item[_FORMAT_KEY] in LOADER_MAP:  # type: ignore[index]
313            return LOADER_MAP[json_item[_FORMAT_KEY]]  # type: ignore[index]
314
315    # if we dont recognize the format, try to find a loader that can handle it
316    for key, lh in LOADER_MAP.items():
317        if lh.check(json_item, path, zanj):
318            return lh
319
320    # if we still dont have a loader, return None
321    return None

get the loader for a json item

def load_item_recursive( json_item: Union[bool, int, float, str, NoneType, Sequence[ForwardRef('JSONitem')], Dict[str, ForwardRef('JSONitem')]], path: tuple[typing.Union[str, int], ...], zanj: typing.Any | None = None, error_mode: muutils.errormode.ErrorMode = ErrorMode.Warn, allow_not_loading: bool = True) -> Any:
324def load_item_recursive(
325    json_item: JSONitem,
326    path: ObjectPath,
327    zanj: _ZANJ_pre | None = None,
328    error_mode: ErrorMode = ErrorMode.WARN,
329    allow_not_loading: bool = True,
330) -> Any:
331    lh: LoaderHandler | None = get_item_loader(
332        json_item=json_item,
333        path=path,
334        zanj=zanj,
335        error_mode=error_mode,
336        # lh_map=lh_map,
337    )
338
339    if lh is not None:
340        # special case for serializable dataclasses
341        if (
342            isinstance(json_item, typing.Mapping)
343            and (_FORMAT_KEY in json_item)
344            and ("SerializableDataclass" in json_item[_FORMAT_KEY])  # type: ignore[operator]
345        ):
346            # why this horribleness?
347            # SerializableDataclass, if it has a field `x` which is also a SerializableDataclass, will automatically call `x.__class__.load()`
348            # However, we need to load things in containers, as well as arrays
349            processed_json_item: dict = {
350                key: (
351                    val
352                    if (
353                        isinstance(val, typing.Mapping)
354                        and (_FORMAT_KEY in val)
355                        and ("SerializableDataclass" in val[_FORMAT_KEY])  # type: ignore[operator, index]
356                    )
357                    else load_item_recursive(
358                        json_item=val,  # type: ignore[arg-type]
359                        path=tuple(path) + (key,),  # type: ignore[arg-type]
360                        zanj=zanj,
361                        error_mode=error_mode,
362                    )
363                )
364                for key, val in json_item.items()
365            }
366
367            return lh.load(processed_json_item, path, zanj)
368
369        else:
370            return lh.load(json_item, path, zanj)
371    else:
372        if isinstance(json_item, dict):
373            return {
374                key: load_item_recursive(
375                    # ty doesn't narrow JSONitem to dict after isinstance check; string key indexing is safe here
376                    json_item=json_item[key],  # type: ignore[invalid-argument-type, call-overload]
377                    path=tuple(path) + (key,),
378                    zanj=zanj,
379                    error_mode=error_mode,
380                    # lh_map=lh_map,
381                )
382                for key in json_item
383            }
384        elif isinstance(json_item, list):
385            return [
386                load_item_recursive(
387                    json_item=x,
388                    path=tuple(path) + (i,),
389                    zanj=zanj,
390                    error_mode=error_mode,
391                    # lh_map=lh_map,
392                )
393                for i, x in enumerate(json_item)
394            ]
395        elif isinstance(json_item, (str, int, float, bool, type(None))):
396            return json_item
397        else:
398            if allow_not_loading:
399                return json_item
400            else:
401                raise ValueError(
402                    f"unknown type {type(json_item)} at {path}\n{json_item}"
403                )
class LoadedZANJ:
class LoadedZANJ:
    """Reads a ZANJ archive from disk: metadata, main JSON, and all external items.

    After construction the instance holds:

    - `_path`: the archive path as a string
    - `_zanj`: the ZANJ object passed in by the caller
    - `_meta`: the parsed contents of the `ZANJ_META` archive member
    - `_json_data`: the parsed contents of the `ZANJ_MAIN` archive member
    - `_externals`: one `ExternalItem` per entry of `_meta["externals_info"]`,
      keyed by the archive member name

    The archive is only open during `__init__`; nothing keeps a handle to it afterwards.
    """

    def __init__(
        self,
        path: str | Path,
        zanj: _ZANJ_pre,
    ) -> None:
        """Open the archive at `path` and eagerly load everything it contains.

        # Parameters:
        - `path : str | Path`
            location of the zanj (zip) file to read
        - `zanj : _ZANJ_pre`
            stored as `self._zanj`; not otherwise used in this method

        # Raises:
        Errors from `zipfile`, `json.load` or the external load functions propagate
        unchanged; the archive and every opened member are closed before they do.
        """
        # path and zanj object
        self._path: str = str(path)
        self._zanj: _ZANJ_pre = zanj

        # context managers guarantee the archive and each member handle are closed,
        # even when parsing or an external loader raises part-way through
        with zipfile.ZipFile(file=self._path, mode="r") as zipf:
            # load data
            with zipf.open(ZANJ_META, "r") as fp_meta:
                self._meta: JSONdict = json.load(fp_meta)
            with zipf.open(ZANJ_MAIN, "r") as fp_main:
                self._json_data: JSONitem = json.load(fp_main)

            # read externals
            # (done after `_meta` and `_json_data` are set, since the load function receives `self`)
            self._externals: dict[str, ExternalItem] = dict()
            for fname, ext_item in self._meta["externals_info"].items():  # type: ignore
                item_type: str = ext_item["item_type"]  # type: ignore
                with zipf.open(fname, "r") as fp:
                    self._externals[fname] = ExternalItem(
                        item_type=item_type,  # type: ignore[arg-type]
                        # the load function is looked up by item type and reads from the open member
                        data=GET_EXTERNAL_LOAD_FUNC(item_type)(self, fp),
                        path=ext_item["path"],  # type: ignore
                    )

    def populate_externals(self) -> None:
        """put all external items into the main json data

        For every tuple yielded by `_each_item_in_externals(self._externals, self._json_data)`,
        the yielded item gets its `"data"` key set to the external's loaded data.

        # Raises:
        - `AssertionError` : if a yielded item has no `_REF_KEY` entry, or that entry
          does not match the external's path
        """

        # loop over once, populating the externals only
        # (the fourth tuple element, the item's path, is not needed here)
        for ext_path, ext_item, item, _ in _each_item_in_externals(
            self._externals, self._json_data
        ):
            # sanity check: the item must be the reference to exactly this external
            assert _REF_KEY in item  # type: ignore
            assert item[_REF_KEY] == ext_path  # type: ignore
            # replace the item with the external item
            item["data"] = ext_item.data  # type: ignore

for loading a zanj file

LoadedZANJ(path: str | pathlib._local.Path, zanj: Any)
448    def __init__(
449        self,
450        path: str | Path,
451        zanj: _ZANJ_pre,
452    ) -> None:
453        # path and zanj object
454        self._path: str = str(path)
455        self._zanj: _ZANJ_pre = zanj
456
457        # load zip file
458        _zipf: zipfile.ZipFile = zipfile.ZipFile(file=self._path, mode="r")
459
460        # load data
461        self._meta: JSONdict = json.load(_zipf.open(ZANJ_META, "r"))
462        self._json_data: JSONitem = json.load(_zipf.open(ZANJ_MAIN, "r"))
463
464        # read externals
465        self._externals: dict[str, ExternalItem] = dict()
466        for fname, ext_item in self._meta["externals_info"].items():  # type: ignore
467            item_type: str = ext_item["item_type"]  # type: ignore
468            with _zipf.open(fname, "r") as fp:
469                self._externals[fname] = ExternalItem(
470                    item_type=item_type,  # type: ignore[arg-type]
471                    data=GET_EXTERNAL_LOAD_FUNC(item_type)(self, fp),
472                    path=ext_item["path"],  # type: ignore
473                )
474
475        # close zip file
476        _zipf.close()
477        del _zipf
def populate_externals(self) -> None:
479    def populate_externals(self) -> None:
480        """put all external items into the main json data"""
481
482        # loop over once, populating the externals only
483        for ext_path, ext_item, item, path in _each_item_in_externals(
484            self._externals, self._json_data
485        ):
486            # replace the item with the external item
487            assert _REF_KEY in item  # type: ignore
488            assert item[_REF_KEY] == ext_path  # type: ignore
489            item["data"] = ext_item.data  # type: ignore

put all external items into the main json data