#!/bin/bash

# ====================================================================
#
# cmip7repack
#
# Written by David Hassell and Ezequiel Cimadevilla.
# 
# Copyright 2025 License BSD 3-Clause
# <https://opensource.org/license/bsd-3-clause>. This is free
# software: you are free to change and redistribute it. There is NO
# WARRANTY, to the extent permitted by law.
#
# Report any bugs to
# https://github.com/NCAS-CMS/cmip7repack/issues
# 
# ====================================================================
# Version number and release date, as reported by -V and in the man
# page header.
vn=0.5
release_date="2025-11-12"

# Path and name of this script, used as the prefix of every log
# message. "command -v" is a shell builtin (unlike the external
# "which"), and fails if "$0" is neither on PATH nor an executable
# path (e.g. when run as "bash cmip7repack"); in that case fall back
# to "$0" itself so that the name is never empty.
iam_path=$(command -v -- "$0" 2>/dev/null)
if [ -z "$iam_path" ]; then
    iam_path=$0
fi
iam=${iam_path##*/}

# Short usage message. It contains a literal "\n", so it must be
# printed with "echo -e".
usage="USAGE: $iam [-d size] [-h] [-o] [-V] [-x] [-z n] FILE [FILE ...]\nFull man page with -h"

# --------------------------------------------------------------------
# helpFunction
#
# Display the full manual page and exit with status 0.
#
# The page is written in roff (man macros) in the here-document below,
# with $iam, $release_date and $vn substituted by the shell, and is
# piped to "man -l -", which formats the page read from standard
# input. The -r option sets the prompt shown by the pager.
#
# Note that the here-document is unquoted, so "\\" produces a single
# backslash in the roff source (as needed for "\(em").
# --------------------------------------------------------------------
helpFunction()
{
    (
	cat << EOF
.TH "$iam" 1 "$release_date" "$vn"

.SH NAME

$iam - repack CMIP7 datasets

.SH SYNOPSIS

\fB$iam\fR [-d \fIsize\fR] [-h] [-o] [-V] [-x] [-z \fIn\fR] \fIFILE\fR [\fIFILE\fR ...]

.SH DESCRIPTION

For each CMIP7-compliant netCDF-4 \fIFILE\fR, \fB$iam\fR will
.
.IP "\\(em" 2
Rechunk the time coordinate variable (assumed to be the variable
called "time" in the root group), if it exists, to have a single
compressed chunk.

.IP "\\(em" 2
Rechunk the time bounds variable (defined by the time coordinate
variable's "bounds" attribute), if it exists, to have a single
compressed chunk.

.IP "\\(em" 2
Rechunk the data variable (defined by the global attribute
"variable_id"), if it exists, to have a given chunk size (of at least
4 MiB).

.IP "\\(em" 2
Collate all of the internal file metadata to a contiguous block near
the start of the file, before all of the variables' data chunks.

.IP "" 0

All rechunked variables are de-interlaced with the HDF5 shuffle filter
(which significantly improves compression) before being compressed
with zlib (see the \fB-z\fR option), and also have the Fletcher32 HDF5
checksum algorithm activated.

Files repacked with \fB$iam\fR will pass the CMIP7 ESGF file-layout
checks.

.SH METHOD

Each input \fIFILE\fR is analysed using \fBh5stat\fR and \fBh5dump\fR,
and then repacked using \fBh5repack\fR, which changes the layout for
objects in the new output file. All file attributes and data values
are unchanged.

.SH OPTIONS

.TP
.B \-d \fIsize\fR
Rechunk the data variable (the variable named by the "variable_id"
global attribute) to have the given uncompressed chunk \fIsize\fR in
bytes. If \fB-d\fR is unset, then the \fIsize\fR defaults to 4194304
(i.e. 4 MiB). The \fIsize\fR must be at least 4194304. The chunk shape
will only ever be changed along the leading (i.e. slowest moving)
dimension of the data, such that the resulting chunk size in the new
file is as large as possible without exceeding the \fIsize\fR.

However, if the original uncompressed chunk size in the input file is
already larger than \fIsize\fR, then the data variable will \fBnot\fR
be rechunked.

.TP
.B \-h
Display this help and exit.

.TP
.B \-o
Overwrite each input file with its repacked version, if the repacking
was successful. By default, a new file is created for each input file,
which has the same name with the addition of the suffix "_cmip7repack".

.TP
.B \-V
Print version number and exit.

.TP
.B \-x
Do a dry run. Show the \fBh5repack\fR commands for repacking each
input file, but do not run them. This allows the commands to be edited
before being run manually.

.TP
.B \-z \fIn\fR
Specify the zlib compression level (between 1 and 9, default 4) for
all rechunked variables.

.SH EXIT STATUS

.IP \fB0\fR
All input files successfully repacked.

.IP \fB1\fR
A failure occurred during the repacking of one or more input files. The
exit happens only after it has been attempted to repack all input
files, some of which may have been repacked successfully. The files
which could not be repacked may be found by looking for FAILED in the
text output log.

.IP \fB2\fR
An incorrect command-line option.

.IP \fB3\fR
A missing HDF5 dependency.

.SH EXAMPLES

\fB1.\fR Repack a file with the default settings (which guarantees
that the repacked files will pass the ESGF file-layout checks), and
replacing the original file with its repacked version. Note that the
data variable is rechunked to chunks of shape 37 x 144 x 192 elements.

.nf
    \fB$ cmip7repack -o file.nc\fR
    cmip7repack: Version 0.3 at /usr/bin/cmip7repack
    cmip7repack: h5repack: Version 1.14.6 at /usr/bin/h5repack

    cmip7repack: date-time: Wed  5 Nov 12:06:25 GMT 2025
    cmip7repack: file: 'file.nc'
    cmip7repack: repack command: h5repack --metadata_block_size=236570  -f /time:SHUF -f /time:GZIP=4 -f /time:FLET -l /time:CHUNK=1800 -f /time_bnds:SHUF -f /time_bnds:GZIP=4 -f /time_bnds:FLET -l /time_bnds:CHUNK=1800x2 -f /pr:SHUF -f /pr:GZIP=4 -f /pr:FLET -l /pr:CHUNK=37x144x192 file.nc file.nc_cmip7repack
    cmip7repack: running repack command (may take some time ...)
    cmip7repack: successfully created 'file.nc_cmip7repack'
    cmip7repack: renamed 'file.nc_cmip7repack' -> 'file.nc'
    cmip7repack: time taken: 5 seconds
    
    cmip7repack: 1/1 files (134892546 bytes) repacked in 5 seconds (26978509 B/s) to total size 94942759 bytes (29% smaller than input files)
.fi

\fB2.\fR Repack a file using the non-default data variable chunk size
of 8388608, replacing the original file with its repacked
version. Note that the data variable is rechunked to chunks of shape
75 x 144 x 192 elements (compare that with the rechunked data variable
chunk shape from example 1).

.nf
    \fB$ cmip7repack -d 8388608 -o file.nc\fR
    cmip7repack: Version 0.3 at /usr/bin/cmip7repack
    cmip7repack: h5repack: Version 1.14.6 at /usr/bin/h5repack
    
    cmip7repack: date-time: Wed  5 Nov 12:07:15 GMT 2025
    cmip7repack: file: 'file.nc'
    cmip7repack: repack command: h5repack --metadata_block_size=236570  -f /time:SHUF -f /time:GZIP=4 -f /time:FLET -l /time:CHUNK=1800 -f /time_bnds:SHUF -f /time_bnds:GZIP=4 -f /time_bnds:FLET -l /time_bnds:CHUNK=1800x2 -f /pr:SHUF -f /pr:GZIP=4 -f /pr:FLET -l /pr:CHUNK=75x144x192 file.nc file.nc_cmip7repack
    cmip7repack: running repack command (may take some time ...)
    cmip7repack: successfully created 'file.nc_cmip7repack'
    cmip7repack: renamed 'file.nc_cmip7repack' -> 'file.nc'
    cmip7repack: time taken: 5 seconds
    
    cmip7repack: 1/1 files (134892546 bytes) repacked in 5 seconds (26978509 B/s) to total size 94856788 bytes (29% smaller than input files)
.fi

\fB3.\fR Get the \fBh5repack\fR commands that would be used for
repacking each input file, but do not run them.

.nf
    \fB$ cmip7repack -x file.nc\fR
    cmip7repack: Version 0.3 at /usr/bin/cmip7repack
    cmip7repack: h5repack: Version 1.14.6 at /usr/bin/h5repack
    
    cmip7repack: date-time: Wed  5 Nov 12:08:02 GMT 2025
    cmip7repack: file: 'file.nc'
    cmip7repack: repack command: h5repack --metadata_block_size=236570  -f /time:SHUF -f /time:GZIP=4 -f /time:FLET -l /time:CHUNK=1800 -f /time_bnds:SHUF -f /time_bnds:GZIP=4 -f /time_bnds:FLET -l /time_bnds:CHUNK=1800x2 -f /pr:SHUF -f /pr:GZIP=4 -f /pr:FLET -l /pr:CHUNK=37x144x192 file.nc file.nc_cmip7repack
    cmip7repack: dry-run: not repacking
.fi

\fB4.\fR Repack multiple files with one command. This takes the same
time as repacking the files with separate commands, but may be more
convenient.

.nf
    \fB$ cmip7repack -o file[12].nc\fR
    cmip7repack: Version 0.3 at /usr/bin/cmip7repack
    cmip7repack: h5repack: Version 1.14.6 at /usr/bin/h5repack
    
    cmip7repack: date-time: Wed  5 Nov 12:09:13 GMT 2025
    cmip7repack: file: 'file1.nc'
    cmip7repack: repack command: h5repack --metadata_block_size=236570  -f /time:SHUF -f /time:GZIP=4 -f /time:FLET -l /time:CHUNK=1800 -f /time_bnds:SHUF -f /time_bnds:GZIP=4 -f /time_bnds:FLET -l /time_bnds:CHUNK=1800x2 -f /pr:SHUF -f /pr:GZIP=4 -f /pr:FLET -l /pr:CHUNK=37x144x192 file1.nc file1.nc_cmip7repack
    cmip7repack: running repack command (may take some time ...)
    cmip7repack: successfully created 'file1.nc_cmip7repack'
    cmip7repack: renamed 'file1.nc_cmip7repack' -> 'file1.nc'
    cmip7repack: time taken: 5 seconds
    
    cmip7repack: date-time: Wed  5 Nov 12:09:18 GMT 2025
    cmip7repack: file: 'file2.nc'
    cmip7repack: repack command: h5repack --metadata_block_size=149185  -f /time:SHUF -f /time:GZIP=4 -f /time:FLET -l /time:CHUNK=708 -f /time_bnds:SHUF -f /time_bnds:GZIP=4 -f /time_bnds:FLET -l /time_bnds:CHUNK=708x2 -f /toz:SHUF -f /toz:GZIP=4 -f /toz:FLET -l /toz:CHUNK=37x144x192 file2.nc file2.nc_cmip7repack
    cmip7repack: running repack command (may take some time ...)
    cmip7repack: successfully created 'file2.nc_cmip7repack'
    cmip7repack: renamed 'file2.nc_cmip7repack' -> 'file2.nc'
    cmip7repack: time taken: 1 seconds
    
    cmip7repack: 2/2 files (182714276 bytes) repacked in 6 seconds (30452379 B/s) to total size 140606512 bytes (23% smaller than input files)
.fi

.SH AUTHORS
Written by David Hassell and Ezequiel Cimadevilla.

.SH REPORTING BUGS
Report any bugs to https://github.com/NCAS-CMS/cmip7repack/issues

.SH COPYRIGHT
Copyright 2025 License BSD 3-Clause
<https://opensource.org/license/bsd-3-clause>. This is free software:
you are free to change and redistribute it. There is NO WARRANTY, to
the extent permitted by law.

.SH SEE ALSO
\fBh5repack\fR(1), \fBh5stat\fR(1), \fBh5dump\fR(1), \fBncdump\fR(1)

EOF
    ) | man -r "Manual page $iam(1) ?ltline %lt?L/%L.:" -l -
    exit 0
}

# --------------------------------------------------------------------
# Parse command line options
#
# Exits with status 2 (after printing the usage message) for any
# invalid option or option argument, and when no input FILE is given.
# --------------------------------------------------------------------
if [ $# -eq 0 ]; then
    echo -e "$usage"
    exit 2
fi

# Defaults
gzip=4 # -z
size=4194304 # -d
version=false # -V
overwrite=false # -o
dry_run=false # -x

# Smallest allowed value for -d, in bytes (4 MiB)
min_size=4194304

# The leading ":" in the option string selects silent error
# reporting, so that the "\?" and ":" cases below can print their own
# messages.
while getopts ":d:hoVxz:" opt
do
    case $opt in
      d) size=$OPTARG
         case $size in
             '' | *[!0-9]*)
                 echo "Option -d requires a numerical argument" >&2
                 echo -e "$usage" >&2
                 exit 2 ;;
         esac
         # Force base 10, so that a value with leading zeros is not
         # interpreted as octal by later shell arithmetic
         size=$((10#$size))
         if [ "$size" -lt "$min_size" ]; then
             echo "Option -d requires an argument of at least 4194304 (i.e. 4 MiB)" >&2
             echo -e "$usage" >&2
             exit 2
         fi ;;
      h) helpFunction ;;
      o) overwrite=true ;;
      V) version=true ;;
      x) dry_run=true ;;
      z) gzip=$OPTARG
         case $gzip in
             [1-9]) ;;
             *) echo "Option -z requires a numerical argument between 1 and 9" >&2
                echo -e "$usage" >&2
                exit 2 ;;
         esac ;;
      \?) echo -e "Invalid option: -$OPTARG\n$usage" >&2
          exit 2 ;;
      :) echo "Option -$OPTARG requires an argument" >&2
         echo -e "$usage" >&2
         exit 2 ;;
    esac
done
shift $((OPTIND -1))

echo "$iam: Version $vn at $iam_path"
if [ "$version" = true ]; then
    exit 0
fi

# At least one input file is required (unless only the version was
# requested)
if [ $# -eq 0 ]; then
    echo "No input FILE specified" >&2
    echo -e "$usage" >&2
    exit 2
fi

# --------------------------------------------------------------------
# Check that the HDF5 commands are available
#
# Exits with status 3 if any of h5repack, h5stat or h5dump can not be
# found. The "command -v" shell builtin is used for the lookup, rather
# than "which", which is an external program that is not guaranteed
# to be installed.
# --------------------------------------------------------------------
h5repack_path=$(command -v h5repack)
if [ -z "$h5repack_path" ]; then
    echo "$iam: ERROR: Must install h5repack to use $iam"
    exit 3
fi
echo "$iam: $(h5repack -V) at $h5repack_path"

for tool in h5stat h5dump
do
    if ! command -v "$tool" > /dev/null; then
        echo "$iam: ERROR: Must install $tool to use $iam"
        exit 3
    fi
done

# --------------------------------------------------------------------
# Loop over input files
# --------------------------------------------------------------------

# --------------------------------------------------------------------
# _new_chunk_shape
#
# Print the h5repack CHUNK shape (e.g. "37x144x192") to use for a
# variable, or print nothing if the variable should not be rechunked.
#
# Arguments:
#   $1      - requested uncompressed chunk size, in bytes
#   $2      - size of one data value, in bytes
#   $3 ...  - existing chunk shape, one argument per dimension
#
# Only the leading dimension is changed, and it is made as large as
# possible without the chunk size exceeding the requested size.
# Nothing is printed if there is no existing chunk shape, if the
# existing chunk is already at least as large as the requested size,
# or if the new chunk would not be strictly larger than the existing
# one.
# --------------------------------------------------------------------
_new_chunk_shape()
{
    local target=$1
    local wordsize=$2
    shift 2
    local -a dims=("$@")
    local ndim=${#dims[@]}
    local n old_bytes new_bytes shape

    if [ "$ndim" -eq 0 ]; then
        return 0
    fi

    # Existing chunk size in bytes
    old_bytes=$wordsize
    for ((n = 0; n < ndim; n++))
    do
        old_bytes=$((old_bytes * dims[n]))
    done
    if [ "$old_bytes" -ge "$target" ]; then
        return 0
    fi

    # Largest leading dimension for which the chunk size is less than
    # or equal to the requested size
    dims[0]=$((target / wordsize))
    for ((n = 1; n < ndim; n++))
    do
        dims[0]=$((dims[0] / dims[n]))
    done

    # New chunk size in bytes
    new_bytes=$wordsize
    for ((n = 0; n < ndim; n++))
    do
        new_bytes=$((new_bytes * dims[n]))
    done
    if [ "$new_bytes" -le "$old_bytes" ]; then
        return 0
    fi

    shape=${dims[0]}
    for ((n = 1; n < ndim; n++))
    do
        shape=${shape}x${dims[n]}
    done
    printf '%s\n' "$shape"
}

start0=$(date +%s)
totalsizein=0  # Total size of input files that are successfully repacked
totalsizeout=0  # Total size of output files
Nin=0  # Number of input files
Nout=0  # Number of repacked output files

for file in "$@"
do
    start_time=$(date +%s)

    echo ""
    echo "$iam: date-time: $(date)"
    echo "$iam: file: '$file'"

    # Count every requested file, including any that do not exist, so
    # that a missing file is reported as a failure in the final
    # summary and exit status.
    Nin=$(( Nin + 1 ))

    if [ ! -e "$file" ]; then
        echo "$iam: can't repack $file: file does not exist"
        echo "$iam: FAILED to repack $file"
        continue
    fi

    # ----------------------------------------------------------------
    # Get the h5repack --metadata_block_size option
    #
    # For instance:
    #
    #  From h5stat -S output:
    #    '  File metadata: 40988486 bytes'
    #  get:
    #    --metadata_block_size=40988486
    # ----------------------------------------------------------------
    metadata=$(h5stat -S "$file" \
                   | sed -n -E 's/^[[:space:]]*File metadata:[[:space:]]*([0-9]+).*/\1/p')
    if [ "$metadata" != "" ]; then
        metadata="--metadata_block_size=$metadata"
    fi

    # ----------------------------------------------------------------
    # Get the time bounds variable using CF bounds attribute.
    #
    # For instance:
    #
    #   From h5dump output:
    #       ATTRIBUTE "bounds" {
    #          DATATYPE  H5T_STRING {
    #             STRSIZE 10;
    #             STRPAD H5T_STR_NULLTERM;
    #             CSET H5T_CSET_ASCII;
    #             CTYPE H5T_C_S1;
    #          }
    #          DATASPACE  SCALAR
    #          DATA {
    #          (0): "time_bnds"
    #          }
    #       }
    #
    #   get:
    #     time_bnds
    #
    # h5dump errors are discarded, as for the other h5dump calls
    # below, because a file without a time variable is not an error.
    # ----------------------------------------------------------------
    bounds=$(h5dump -A -d time "$file" 2>/dev/null \
                 | awk '$1=="ATTRIBUTE" && $2=="\"bounds\""{bnds=1}bnds && $1=="(0):"{v=$NF; gsub("\"", "", v); bnds=0}END{print v}')

    # Only look for a time bounds variable if one was named
    time_variables=("/time")
    if [ -n "$bounds" ]; then
        time_variables+=("/${bounds}")
    fi

    # ----------------------------------------------------------------
    # Get the h5repack -l and -f options for the time variables
    #
    # For instance:
    #
    #   From h5dump output:
    #     '   DATASPACE  SIMPLE { ( 6000 ) / ( H5S_UNLIMITED ) }'
    #   get:
    #     -l /time:CHUNK=6000 -f /time:GZIP=4
    #
    #   From h5dump output:
    #     '   DATASPACE  SIMPLE { ( 6000, 2 ) / ( H5S_UNLIMITED, 2 ) }'
    #   get:
    #     -l /time_bounds:CHUNK=6000x2 -f /time_bounds:GZIP=4
    # ----------------------------------------------------------------
    time=""
    for variable in "${time_variables[@]}"
    do
        CHUNK=$(h5dump --dataset "$variable" --header "$file" 2>/dev/null \
                    | grep "DATASPACE  SIMPLE" -m 1 \
                    | sed -E 's/.*\(\s*([0-9, ]+)\s*\).*/\1/; s/[[:space:]]+//g; s/,/x/g')
        if [ "$CHUNK" != "" ]; then
            time="$time -f ${variable}:SHUF -f ${variable}:GZIP=$gzip"
            time="$time -f ${variable}:FLET"
            time="$time -l ${variable}:CHUNK=$CHUNK"
        fi
    done

    # ----------------------------------------------------------------
    # Get the h5repack -l and -f options for the data variable.
    #
    # For instance:
    #
    #   -l /uas:CHUNK=50x143x144 -f /uas:SHUF -f /uas:GZIP=4 -f /uas:FLET
    # ----------------------------------------------------------------
    data=""

    # Find the data variable name
    variable=$(h5dump --attribute variable_id "$file" 2>/dev/null \
                   | grep "(0)" -m 1 \
                   | sed -n 's/.*"\(\w*\)"/\1/p')

    if [ "$variable" != "" ]; then
        variable="/$variable"

        # Find the existing chunk shape, as an array with one element
        # per dimension
        chunks=$(h5dump --dataset "$variable" --properties --header "$file" 2>/dev/null \
                     | grep "CHUNKED (" -m 1 \
                     | sed -n 's/.*(\(.*\)).*/\1/p' \
                     | sed 's/,\s*/ /g')
        # shellcheck disable=SC2206
        chunks=($chunks)

        # Find the word size, in bytes, of the data values.
        #
        # NOTE: Only a datatype description containing "32" is
        # recognised as having 4-byte values; any other datatype is
        # assumed to have 8-byte values.
        datatype=$(h5dump --dataset "$variable" --header "$file" 2>/dev/null \
                       | grep "DATATYPE" -m 1)
        case $datatype in
            *32*) wordsize=4 ;;
            *) wordsize=8 ;;
        esac

        # Find the new chunk shape, which is empty if the data
        # variable is not to be rechunked. The existing chunk shape in
        # "chunks" is left unchanged.
        CHUNK=$(_new_chunk_shape "$size" "$wordsize" "${chunks[@]}")

        if [ "$CHUNK" = "" ]; then
            echo "$iam: not rechunking data variable $variable with existing chunk shape (${chunks[*]})"
        else
            data="-f ${variable}:SHUF -f ${variable}:GZIP=$gzip"
            data="$data -f ${variable}:FLET"
            data="$data -l ${variable}:CHUNK=$CHUNK"
        fi
    fi

    if [ "$metadata$time$data" = "" ]; then
        echo "$iam: can't repack $file: Couldn't find time, time bounds or data variables, nor the metadata block size"
        echo "$iam: FAILED to repack $file"
        echo "$iam: file $file is unchanged"
        continue
    fi

    # Still here? Define the repacking command
    repacked_file=${file}_cmip7repack
    command="h5repack $metadata $time $data $file $repacked_file"
    echo "$iam: repack command: $command"
    if [ "$dry_run" = true ]; then
        echo "$iam: dry-run: not repacking"
        continue
    fi

    # Still here? Run the repacking command. The option strings are
    # deliberately unquoted so that they are split into separate
    # h5repack arguments.
    echo "$iam: running repack command (may take some time ...)"
    # shellcheck disable=SC2086
    if h5repack $metadata $time $data "$file" "$repacked_file"; then
        # Successfully repacked
        end_time=$(date +%s)
        Nout=$(( Nout + 1 ))

        # Get the input file size (before it is possibly overwritten)
        filesize=$(wc -c < "$file")
        totalsizein=$(( totalsizein + filesize ))

        echo "$iam: successfully created '$repacked_file'"

        # Get the output file size
        filesize=$(wc -c < "$repacked_file")
        totalsizeout=$((totalsizeout + filesize))

        if [ "$overwrite" = true ]; then
            # Overwrite original file with repacked file
            mv_output=$(mv -v "$repacked_file" "$file")
            echo "$iam: $mv_output"
        fi

        echo "$iam: time taken: $((end_time - start_time)) seconds"
    else
        echo "$iam: FAILED to repack $file"
        echo "$iam: file $file is unchanged"
    fi
done
end0=$(date +%s)

# --------------------------------------------------------------------
# _size_change_label
#
# Print a description of the size change between the input and output
# files, e.g. "(29% smaller than input files)" or "(5% larger than
# input files)". The percentage is truncated to an integer.
#
# Arguments:
#   $1 - total size of the input files, in bytes
#   $2 - total size of the output files, in bytes
#
# Nothing is printed if the total input size is zero, because the
# percentage is then undefined.
# --------------------------------------------------------------------
_size_change_label()
{
    local bytes_in=$1
    local bytes_out=$2
    local percent

    if [ "$bytes_in" -le 0 ]; then
        return 0
    fi

    percent=$(( (100 * (bytes_in - bytes_out)) / bytes_in ))
    if [ "$percent" -ge 0 ]; then
        echo "(${percent}% smaller than input files)"
    else
        echo "($(( -percent ))% larger than input files)"
    fi
}

# Print a summary for all files and set the exit status. Nothing is
# done for a dry run, since no files were repacked.
if [ "$dry_run" = false ]; then
    # Get the repack rate
    totaltime=$((end0 - start0))
    if [ "$totaltime" -gt 0 ]; then
        rate="($((totalsizein / totaltime)) B/s)"
    else
        rate="(0 B/s)"
    fi

    # Get the size change, which is empty if no files were repacked
    pcdiff=$(_size_change_label "$totalsizein" "$totalsizeout")

    summary="$iam: $Nout/$Nin files ($totalsizein bytes) repacked in $totaltime seconds $rate to total size $totalsizeout bytes"
    if [ -n "$pcdiff" ]; then
        summary="$summary $pcdiff"
    fi

    echo ""
    echo "$summary"

    # Fail if any input file could not be repacked
    if [ "$Nout" -ne "$Nin" ]; then
        exit 1
    fi
fi
