Source code for projspec.content.data

"""Contents specifying datasets"""

from dataclasses import dataclass, field

from projspec.content import BaseContent


[docs] @dataclass class TabularData(BaseContent): """A tabular dataset, columns and rows This lists loadable tabular files with defined schema, typically from formats such as JSON, CSV, and parquet. """ icon = "📊" name: str metadata: dict = field(default_factory=dict) # allowed schema formats: # - dtype-like {fieldname: string-type} # - dtype-complex {fieldname: {...}} # - list like [{name:, ...}] # We may choose to normalise to just one of these eventually schema: dict | list = field(default_factory=dict)
[docs] @dataclass class IntakeSource(BaseContent): """A catalog of data assets, including basic properties (location) and how to load/process them. See https://intake.readthedocs.io/en/latest/ """ icon = "📖" # TODO: add better fields: args, driver/reader, metadata, description name: str
[docs] @dataclass class DataResource(BaseContent): """A data resource found inside a data-only directory. Describes one logical dataset — which may be a flat collection of files, a Hive-partitioned tree, an Iceberg/Delta table, a Zarr store, or any other recognised on-disk layout. The `path` field is a human-readable basename that identifies the resource: - Single file: `"data.csv"` - Multi-file series: `"part*.parquet"` (glob-style, common prefix + `*` + ext) - Directory-as-dataset (Hive partition, Zarr store, …): `"year=2024/"` The `modality` field classifies the broad nature of the data using the vocabulary established by intake's `structure` tags and napari's layer type system: - `"tabular"` — row/column data (CSV, Parquet, ORC, Excel, …) - `"array"` — N-dimensional arrays (NumPy, HDF5, NetCDF, Zarr, …) - `"image"` — 2-D/3-D images (PNG, JPEG, TIFF, DICOM, NIfTI, …) - `"timeseries"` — time-indexed signals (WAV, GRIB, …) - `"geospatial"` — vector/raster geodata (Shapefile, GeoJSON, GeoTIFF, …) - `"model"` — ML model weights (GGUF, SafeTensors, PyTorch, …) - `"nested"` — hierarchical / JSON-like (Avro, YAML, XML, …) - `"document"` — human-readable documents (PDF, DOCX, …) - `"video"` — video streams (MP4, AVI, …) - `"archive"` — compressed bundles (ZIP, tar.gz, …) - `""` — unknown / mixed The `schema` field is format-specific: - Tabular (Parquet, Arrow, CSV, …): `{column_name: dtype_str, …}` - Image / array: `{"width": int, "height": int, "channels": int, "mode": str}` - Audio: `{"sample_rate": int, "channels": int, "frames": int}` - HDF5 / Zarr / NetCDF: `{"variables": [...], "dims": {...}, "attrs": {...}}` - Unknown / library not available: `{}` """ icon = "📥" path: str # basename (or glob pattern / dir/ ) identifying this resource format: str # canonical format string, e.g. "parquet", "csv", "png", "hdf5" modality: str = "" # broad data nature; see docstring for vocabulary layout: str = "" # "flat"|"hive"|"iceberg"|"delta"|"zarr_store"|"tiledarray"|"" file_count: int = 0 total_size: int = 0 # bytes; 0 when unknown (e.g. remote FS without size info) schema: dict | list = field(default_factory=dict) # full path to one representative file, for use by preview loaders sample_path: str = "" metadata: dict = field(default_factory=dict) # catch-all extras _html = None def __repr__(self) -> str: from projspec.content.data_html import repr_text return repr_text(self) def _repr_html_(self) -> str: """Jupyter rich display — returns cached HTML, rendering on first call.""" # TODO: this is probably not what we want jupyter to dysplay, but it's # convenient for now. if self._html is None: from projspec.content.data_html import repr_html self._html = repr_html(self) return self._html def to_dict(self, compact=False): d = super().to_dict(compact=compact) if not compact: d["_html"] = self._repr_html_() return d