#!python
# -*-python-*-

if __name__ == "__main__":
    import argparse
    import datetime
    import sys
    import time

    from importlib.metadata import version
    from math import prod

    import pyfive
    import numpy as np

    from packaging.version import Version

    # Start the clock straight away; verbose mode reports the elapsed
    # time relative to this instant.
    start0 = time.time()

    # Program identity, used in messages and in the manual page
    iam = "check_cmip7_packing"
    __version__ = "0.5"
    __date__ = "2025-11-13"

    # Refuse to run with a pyfive library that pre-dates version 1.0
    pyfive_version = Version(version("pyfive"))
    if not pyfive_version >= Version("1.0"):
        too_old = (
            f"{iam} requires Python library 'pyfive' version 1.0 or newer. "
            f"Got: {pyfive_version}"
        )
        raise RuntimeError(too_old)

    def print_help():
        import subprocess

        manpage = rf"""
.TH {iam} 1 {__date__} {__version__}
.
.SH NAME

{iam} - check that datasets meet the CMIP7 internal packing requirements.

.SH SYNOPSIS

\fB{iam}\fR [-h] [-v] [-V] \fIFILE\fR [\fIFILE\fR ...]

.SH DESCRIPTION

For each input \fIFILE\fR, \fB{iam}\fR will
.
.IP "\\(em" 2
Check that the time coordinate variable (assumed to be the variable
called "time" in the root group), if it exists, has a chunk.

.IP "\\(em" 2
Check that the time bounds variable (identified by the time coordinate
variable's "bounds" attribute), if it exists, has a single chunk.

.IP "\\(em" 2
Check that data variable (identified by the global "variable_id"
attribute), if it exists, has a single chunk or has an
uncompressed chunk size of at least 41943044 bytes (i.e. 4
MiB). However, the check will still pass for smaller chunks if
increasing the chunk's shape by one element along the leading
(i.e. slowest moving) dimension of the data would result in a chunk
size of at least 4 MiB.

.IP "\\(em" 2
Check that all of the internal file metadata is collated to a
contiguous block near the start of the file, before all of the
variables' data chunks.

.IP "" 0

Any input \fIFILE\fR that has been output by \fBcmip7repack\fR
<https://github.com/NCAS-CMS/cmip7repack> is guaranteed to pass these
checks.
        
.SH DEPENDENCIES

Requires Python 3.10 or later, and that the Python libraries
\fBpyfive\fR <https://pyfive.readthedocs.io>, \fBnumpy\fR
<https://numpy.org>, and \fBpackaging\fR <https://packaging.pypa.io>
are available from a location given by the PYTHONPATH environment
variable.

.SH METHOD

Each input \fIFILE\fR is analysed using the Python \fBpyfive\fR
package.
  
.SH OPTIONS

.TP
.B \-h
Display this help and exit.

.TP
.B \-v
Verbose mode. Print extra information.

.TP
.B \-V
Print version number and exit.

. SH EXIT STATUS

.IP \fB0\fR
All input files meet the CMIP7 internal file packing requirements.

.IP \fB1\fR
At least one input file does not meet the CMIP7 internal file packing
requirements. All files were checked.

.IP \fB2\fR
An incorrect command-line option. No input files are checked.

.IP \fB3\fR
An input file does not exist. No input files are checked.

.IP \fB4\fR
An input file can not be opened. No input files are checked.

.IP \fB5\fR
An input file can be opened, but not parsed as an HDF5 file. No input
files are checked.

.SH EXAMPLES

\fB1.\fR Testing two files that both pass the checks. The exit code is
0 because all files passed.

.nf
    \fB$ check_cmip7_packing file1.nc file2.nc\fR
    PASS: File 'file1.nc'
    PASS: File 'file2.nc'
    \fB$ echo $?\fR
    0
.fi

\fB2.\fR Repeating the test of example \fB1.\fR with verbose mode
enabled.

.nf
    \fB$ check_cmip7_packing -v file1.nc file2.nc\fR
    check_cmip7_packing: Version 0.5 at /usr/bin/check_cmip7_packing
    check_cmip7_packing: pyfive: Version 1.0.0 at /usr/bin/pyfive/__init__.py
    check_cmip7_packing: date-time: 2025-11-13 09:31:57.232149

    PASS: File 'file1.nc'
    PASS: File 'file2.nc'
    
    check_cmip7_packing: time taken: 0.0622 seconds
    check_cmip7_packing: 2/2 files passed, 0/2 files failed
.fi

\fB3.\fR Testing five files, one of which (file5.nc) passes the
checks, and the other four fail at least one check each. The exit code
is 1 because not all files passed.

.nf
    \fB$ check_cmip7_packing file[3-7].nc\fR
    PASS: File 'file5.nc'
    FAIL: File 'file3.nc' does not have consolidated internal metadata
    FAIL: File 'file4.nc' time coordinates variable 'time' has 6000 chunks (expected 1 chunk or contiguous)
    FAIL: File 'file6.nc' time bounds variable 'time_bnds' has 1800 chunks (expected 1 chunk or contiguous)
    FAIL: File 'file7.nc' data variable 'ps' has uncompressed chunk size 411840 bytes (expected at least 4111936 bytes or 1 chunk or contiguous)
    \fB$ echo $?\fR
    1
.fi

.SH AUTHORS
Written by David Hassell and Ezequiel Cimadevilla.

.SH REPORTING BUGS
Report any bugs to https://github.com/NCAS-CMS/cmip7repack/issues

.SH COPYRIGHT
Copyright 2025 License BSD 3-Clause
<https://opensource.org/license/bsd-3-clause>. This is free software:
you are free to change and redistribute it. There is NO WARRANTY, to
the extent permitted by law.

.SH SEE ALSO
\fBcmip7repack\fR(1)
"""
        p = subprocess.Popen(
            [
                "man",
                "-r",
                " Manual page check_cmip7_packing(1) ?ltline %lt?L/%L.:",
                "-l",
                "-",
            ],
            stdin=subprocess.PIPE,
        )
        p.communicate(bytes(manpage, "utf8"))

    # Parse command line options
    parser = argparse.ArgumentParser(
        prog=iam,
        description="Check that datasets meet the CMIP7 internal packing requirements",
    )
    parser.add_argument(
        "FILE", nargs="+", help="One or more CMIP7 files to check."
    )
    parser.add_argument(
        "-v",
        action="store_true",
        help="Verbose mode. Print extra information.",
    )
    parser.add_argument(
        "-V", action="store_true", help="Print version number and exit."
    )

    # The -h and -V options are acted on before argparse is invoked, so
    # that they work even when no FILE has been given. Only the
    # arguments that follow the program name are inspected.
    cli_args = sys.argv[1:]

    if "-h" in cli_args:
        # Show the full manual page rather than the argparse summary
        print_help()
        sys.exit(0)

    if "-V" in cli_args or "-v" in cli_args:
        print(f"{iam}: Version {__version__} at {sys.argv[0]}")
        if "-V" in cli_args:
            sys.exit(0)

        print(f"{iam}: pyfive: Version {pyfive_version} at {pyfive.__file__}")

    if not cli_args:
        # No arguments at all: print a short usage reminder
        print(f"usage: {iam} [-h] [-V] [-v] FILE [FILE ...]")
        print("Full man page with -h")
        sys.exit(2)

    # argparse itself exits with status 2 on an incorrect option
    args = parser.parse_args()

    if args.v:
        print(f"{iam}: date-time: {datetime.datetime.now()}\n")

    # Minimum uncompressed chunk size, in bytes, for a data variable
    # that is stored in more than one chunk
    four_MiB = 4 * (2**20)

    # "PASS: ..." and "FAIL: ..." messages, one per checked file
    ok = []
    error = []

    def _num_chunks_if_split(variable):
        """Return the number of chunks of a variable split over many.

        Returns `None` if the variable is contiguous (its ``chunks``
        attribute is `None`) or is stored in exactly one chunk,
        otherwise returns the number of chunks (at least 2).

        """
        if variable.chunks is None:
            return None

        n_chunks = variable.id.get_num_chunks()
        if n_chunks > 1:
            return n_chunks

        return None

    def _first_chunking_failure(f, filename):
        """Return the message for the first failed chunking check.

        The time coordinates variable, the time bounds variable and
        the data variable of the open file *f* are checked in that
        order. Returns a "FAIL: ..." string for the first check that
        fails, or `None` if all of the checks pass. *filename* is only
        used in the message.

        """
        if "time" in f:
            # Check for the time coordinates variable having one chunk
            t = f["time"]
            n_chunks = _num_chunks_if_split(t)
            if n_chunks is not None:
                return (
                    f"FAIL: File {filename!r} time coordinates variable "
                    f"'time' has {n_chunks} chunks "
                    "(expected 1 chunk or contiguous)"
                )

            # Check for the time bounds variable having one chunk
            if "bounds" in t.attrs:
                bounds = str(np.array(t.attrs["bounds"]).astype("U"))
                if bounds in f:
                    n_chunks = _num_chunks_if_split(f[bounds])
                    if n_chunks is not None:
                        return (
                            f"FAIL: File {filename!r} time bounds variable "
                            f"{bounds!r} has {n_chunks} chunks "
                            "(expected 1 chunk or contiguous)"
                        )

        # Check for the data variable having one chunk, or chunks of
        # at least ~4MiB
        if "variable_id" in f.attrs:
            variable_id = str(np.array(f.attrs["variable_id"]).astype("U"))
            if variable_id in f:
                d = f[variable_id]
                # The decision must be based on the data variable's
                # own chunking, not on that of the time variables.
                if _num_chunks_if_split(d) is not None:
                    chunks = d.chunks
                    wordsize = d.dtype.itemsize
                    chunksize = prod(chunks) * wordsize

                    # Allow chunks that would reach 4 MiB if they were
                    # one element longer along the leading dimension
                    lee_way = 0
                    if len(chunks) > 1:
                        lee_way = prod(chunks[1:]) * wordsize

                    if chunksize + lee_way < four_MiB:
                        return (
                            f"FAIL: File {filename!r} data variable "
                            f"{variable_id!r} has uncompressed chunk size "
                            f"{chunksize} bytes (expected at least "
                            f"{four_MiB - lee_way} bytes or 1 chunk "
                            "or contiguous)"
                        )

        return None

    # Loop round input files
    for filename in args.FILE:
        # Open the file with pyfive. Any failure to do so aborts the
        # whole run with exit status 3, 4 or 5.
        try:
            f = pyfive.File(filename)
        except FileNotFoundError:
            print(f"NON-EXISTENT: File {filename!r}\n\nNO INPUT FILES CHECKED")
            sys.exit(3)
        except PermissionError:
            print(f"CAN'T OPEN: File {filename!r}\n\nNO INPUT FILES CHECKED")
            sys.exit(4)
        except Exception:
            print(f"CAN'T PARSE: File {filename!r}\n\nNO INPUT FILES CHECKED")
            sys.exit(5)

        try:
            # Check for consolidated internal metadata
            try:
                unconsolidated = not f.consolidated_metadata
            except Exception:
                print(
                    f"CAN'T PARSE: File {filename!r}\n\nNO INPUT FILES CHECKED"
                )
                sys.exit(5)

            if unconsolidated:
                failure = (
                    f"FAIL: File {filename!r} does not have consolidated "
                    "internal metadata"
                )
            else:
                failure = _first_chunking_failure(f, filename)
        finally:
            # Always release the file, however the checks ended
            f.close()

        if failure is None:
            # The file has passed all of the checks
            ok.append(f"PASS: File {filename!r}")
        else:
            error.append(failure)

    # Report the files that passed first, then the files that failed
    for messages in (ok, error):
        if messages:
            print("\n".join(messages))

    # Verbose mode: append the elapsed time and the pass/fail counts
    if args.v:
        elapsed = time.time() - start0
        n_files = len(args.FILE)
        print(f"\n{iam}: time taken: {elapsed:.3} seconds")
        print(
            f"{iam}: {len(ok)}/{n_files} files passed, "
            f"{len(error)}/{n_files} files failed"
        )

    # Exit status 1 if any file failed its checks, 0 if all passed
    sys.exit(1 if error else 0)
