goal: caching function results in python, including side effects

assume a function in a python file which does some data processing.
i want to cache the results including the side effects: stdout, stderr, and files written to the filesystem.
the same input (including files which the function may read, and optionally env vars (a selectable set)) should result in the same output.
it's ok to restrict input datatypes, but polars and pandas dataframes must be supported.
all dependencies the function uses, and the function itself, must be considered: changing the function or any of its dependencies must invalidate the cached result. dependency tracking can be on module level (basically everything that is imported) or even on file level.

it should be:
- easy to use
- no manual version counter or anything like that, instead proper dependency tracking
- some restrictions might be ok
- assuming each transformation function lives in its own file is ok, but on the other hand an arbitrary number of complex dependencies must be supported, across python module boundaries.

consider the code sketched out below, which does this on process level, independent of python. maybe a "trick" can be used to utilize this:

let's say we have a special decorator
@cached
def transform(a, b, c):
    # Placeholder: the data-processing logic whose results should be cached.
    ...

each in a single file.

then we may have additionally:

# Sketch of the per-file entry point: expose `transform` as a command-line
# program so it can be executed (and cached) as a separate process.
if __name__ == "__main__":
    import cached.auto_cli
    cached.auto_cli.run(transform) # creates a cli version of function, which also handles caching and dependency tracking

functions with the decorator @cached will basically start a separate process with the same command line arguments

think along these lines. make a proper plan with verify in the style of other vessel_frame projects. understand structure etc. this should be a separate python module.
if we go for this process level tracking, an arbitrary cache for any cli program, like sketched out below, should be part of the package, too. but still the main goal is to have caching in python.

the above is just to convey the intent; you can come up with a different solution as long as it satisfies the basic requirements.

existing python packages do not solve it well enough for me.
but you still may recommend one if it really fits the requirements.

the cacher package should live at: ./casher

---

#!/usr/bin/env python3
import os
import sys
import json
import time
import hashlib
import subprocess
import shutil
import tempfile
from pathlib import Path

def sha256_file(path):
    """Return the hex SHA-256 digest of the file at *path*.

    The file is read in binary mode in 8192-byte chunks, so it never has to
    fit in memory as a whole.
    """
    digest = hashlib.sha256()
    with open(path, "rb") as stream:
        # iter() with a sentinel stops at the first empty read, i.e. at EOF.
        for block in iter(lambda: stream.read(8192), b""):
            digest.update(block)
    return digest.hexdigest()

def start_bpftrace(pid):
    """Launch a ``bpftrace`` child that reports ``openat`` calls made by *pid*.

    The tracer program prints one ``<filename> <flags>`` line per traced call.
    Its stdout is exposed as a text pipe; its stderr is discarded.

    Args:
        pid: Process id used in the tracepoint filter.

    Returns:
        The ``subprocess.Popen`` handle of the started ``bpftrace`` process.
    """
    program = f"""
tracepoint:syscalls:sys_enter_openat
/ pid == {pid} /
{{
    printf("%s %d\\n", str(args->filename), args->flags);
}}
"""
    argv = ["bpftrace", "-e", program]
    return subprocess.Popen(
        argv,
        text=True,
        stderr=subprocess.DEVNULL,
        stdout=subprocess.PIPE,
    )

def parse_events(lines):
    """Split traced ``openat`` events into files opened for reading or writing.

    Each line is expected to look like ``<path> <flags>``, where ``<flags>``
    is the decimal ``open(2)`` flag word printed by the tracer. Lines that do
    not have this shape (banner text, blank lines, truncated output) are
    skipped.

    Args:
        lines: Iterable of raw text lines produced by the tracer.

    Returns:
        A ``(reads, writes)`` tuple of sorted lists of unique paths. A path
        opened both ways appears in both lists.
    """
    reads = set()
    writes = set()
    # Only the access-mode bits decide read vs. write. Other flag bits (for
    # example O_CLOEXEC) may be OR-ed into a read-only open, so comparing the
    # whole flag word with 0 would misclassify such opens as writes.
    write_bits = os.O_WRONLY | os.O_RDWR

    for line in lines:
        # Split on the LAST space so paths containing spaces stay intact.
        path, sep, raw_flags = line.strip().rpartition(" ")
        if not sep:
            continue
        try:
            flags = int(raw_flags)
        except ValueError:
            continue

        if flags & write_bits:
            writes.add(path)
        else:
            reads.add(path)

    # Sorted so the result (and anything derived from it) is deterministic.
    return sorted(reads), sorted(writes)

def compute_key(cmd, inputs):
    """Derive the cache key for running *cmd* against the given input files.

    The key is a SHA-256 over the argument vector and, for every input path
    that currently is a regular file, the path plus the file's content
    digest. Paths that do not exist or are not regular files are ignored.

    Args:
        cmd: Command as a list of argument strings.
        inputs: Iterable of file paths the command read.

    Returns:
        The key as a hex string.
    """
    h = hashlib.sha256()

    # NUL-terminate each field: neither argv entries nor paths can contain a
    # NUL byte, so distinct argument vectors can never hash the same way
    # (a plain " ".join made ["a b"] and ["a", "b"] collide).
    for arg in cmd:
        h.update(os.fsencode(arg))
        h.update(b"\0")
    h.update(b"\0")  # marks the end of the argument vector

    for path in sorted(set(inputs)):
        if os.path.isfile(path):
            h.update(os.fsencode(path))
            h.update(b"\0")
            h.update(sha256_file(path).encode())

    return h.hexdigest()

def save_outputs(outputs, cache_dir):
    """Copy the written files into the ``files`` subtree of a cache entry.

    Every path in *outputs* that is currently a regular file is copied (with
    metadata) to ``<cache_dir>/files/<path without leading slashes>``; all
    other paths are skipped.

    Args:
        outputs: Iterable of paths the command opened for writing.
        cache_dir: Directory of the cache entry.

    Returns:
        The list of paths that were actually copied, in input order.
    """
    files_root = os.path.join(cache_dir, "files")
    os.makedirs(files_root, exist_ok=True)

    stored = []
    for out_path in outputs:
        if not os.path.isfile(out_path):
            continue
        # Mirror the original location below files_root.
        target = os.path.join(files_root, out_path.lstrip("/"))
        os.makedirs(os.path.dirname(target), exist_ok=True)
        shutil.copy2(out_path, target)
        stored.append(out_path)
    return stored

def restore_outputs(meta, cache_dir):
    """Copy the files stored in a cache entry back to their original paths.

    Args:
        meta: Entry metadata; ``meta["outputs"]`` lists the original paths.
        cache_dir: Directory of the cache entry; the stored copies live under
            ``<cache_dir>/files/<path without leading slashes>``.

    Raises:
        OSError: If a stored copy is missing or a destination is not writable.
    """
    for path in meta["outputs"]:
        src = os.path.join(cache_dir, "files", path.lstrip("/"))
        parent = os.path.dirname(path)
        # A bare relative filename has an empty dirname, and makedirs("")
        # raises; the current directory already exists in that case.
        if parent:
            os.makedirs(parent, exist_ok=True)
        shutil.copy2(src, path)

def _find_cached_entry(cache_dir, cmd):
    """Return ``(entry_dir, meta)`` of a stored run of *cmd* that is still valid.

    An entry is valid when its recorded command equals *cmd* and the key
    recomputed from its recorded input files still equals the entry's
    directory name, i.e. none of those inputs changed. Returns ``None`` when
    no such entry exists. Unreadable or malformed entries are skipped.
    """
    try:
        names = sorted(os.listdir(cache_dir))
    except OSError:
        return None

    for name in names:
        entry_dir = os.path.join(cache_dir, name)
        try:
            with open(os.path.join(entry_dir, "meta.json")) as f:
                meta = json.load(f)
        except (OSError, ValueError):
            continue
        if not isinstance(meta, dict) or meta.get("cmd") != cmd:
            continue
        inputs = meta.get("inputs")
        if not isinstance(inputs, list):
            continue
        if compute_key(cmd, inputs) == name:
            return entry_dir, meta
    return None


def main():
    """Command-line entry point: ``casher --cache-dir DIR command...``.

    If a stored run of the same command exists whose recorded input files are
    unchanged, its output files, stdout, stderr and exit code are replayed
    WITHOUT executing the command. Otherwise the command is run under the
    file-access tracer and the result is stored.

    Usage and ``[casher]`` status messages go to stderr so that the replayed
    stdout is identical to the command's own stdout.

    NOTE: the tracer is started after the command, so file opens that happen
    before the tracer is attached are not recorded.
    """
    if len(sys.argv) < 4 or sys.argv[1] != "--cache-dir":
        print("usage: casher --cache-dir DIR command...", file=sys.stderr)
        sys.exit(1)

    cache_dir = sys.argv[2]
    cmd = sys.argv[3:]
    os.makedirs(cache_dir, exist_ok=True)

    # Consult the cache BEFORE running anything; running first would make a
    # "hit" just as expensive as a miss.
    cached = _find_cached_entry(cache_dir, cmd)
    if cached is not None:
        entry_dir, meta = cached
        print("[casher] cache hit", file=sys.stderr)

        restore_outputs(meta, entry_dir)

        print(meta["stdout"], end="")
        print(meta["stderr"], end="", file=sys.stderr)
        sys.exit(meta["returncode"])

    # start target process
    proc = subprocess.Popen(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True
    )

    # attach eBPF tracer; do not leave the target orphaned if that fails
    try:
        tracer = start_bpftrace(proc.pid)
    except OSError:
        proc.kill()
        proc.wait()
        raise

    try:
        out, err = proc.communicate()
    finally:
        tracer.terminate()
        # communicate() drains the pipe and reaps the tracer process.
        trace_output, _ = tracer.communicate()

    reads, writes = parse_events(trace_output.splitlines())

    key = compute_key(cmd, reads)
    entry_dir = os.path.join(cache_dir, key)
    meta_file = os.path.join(entry_dir, "meta.json")

    print("[casher] cache miss → storing", file=sys.stderr)

    os.makedirs(entry_dir, exist_ok=True)

    saved_outputs = save_outputs(writes, entry_dir)

    meta = {
        "cmd": cmd,
        "inputs": reads,
        "outputs": saved_outputs,
        "stdout": out,
        "stderr": err,
        "returncode": proc.returncode,
    }

    # Write to a temporary name and rename, so a reader never sees a
    # half-written meta.json.
    tmp_file = meta_file + ".tmp"
    with open(tmp_file, "w") as f:
        json.dump(meta, f, indent=2)
    os.replace(tmp_file, meta_file)

    print(out, end="")
    print(err, end="", file=sys.stderr)

    sys.exit(proc.returncode)

# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
