#!/usr/bin/env Rscript

# Copyright (C) 2025 Université de Reims Champagne-Ardenne.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
#     (1) Redistributions of source code must retain the above copyright
#     notice, this list of conditions and the following disclaimer.
#
#     (2) Redistributions in binary form must reproduce the above copyright
#     notice, this list of conditions and the following disclaimer in
#     the documentation and/or other materials provided with the
#     distribution.
#
#     (3)The name of the author may not be used to
#     endorse or promote products derived from this software without
#     specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
# INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
# IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

## Bring the magrittr pipe into scope without attaching the package.
`%>%` <- magrittr::`%>%`

## Report warnings as soon as they are raised instead of deferring them.
options(warn = 1)

## Arguments that follow the script name on the command line.
args <- commandArgs(trailingOnly = TRUE)

## Print the usage message and exit when -h or --help is given.
##
## Exiting is required: otherwise the script would go on and try to
## process "-h" / "--help" as if it were a recording file.
##
## The text states that window CSV files are written *without* a header,
## which is what the writer does (it passes col_names = FALSE).
if (any(c("-h", "--help") %in% args)) {
    writeLines(c(
        "Usage: hash-neonatal FILE ...",
        "",
        "Read the neonatal file and its associated annotations, take non-overlapping windows of the signal with associated segmentation labels, and add them to a dataset.",
        "",
        "A file named “files.csv” is expected to be present in the current working directory, but will be considered empty if not present. It will contain the list of all processed windows.",
        "",
        "A file named “subjects.csv” is expected to be present, and it will be populated with an association from file names to extracted windows. This is a risk to anonymization, you should maybe remove it.",
        "",
        "Signal sections are saved to CSV in the current working directory, then their contents are hashed, and the files are renamed as the hex encoding of their hashes, plus a “.csv” extension. The file modification date is reset to january 1, 2000. “files.csv” is expected to be a 1-column (of strings) CSV files without a header (the column meaning the filename). The added CSV files are 40-column float matrices without a header. The first 10 columns are the signal values, in the weakest filter (band-pass, [0.53 Hz, 70 Hz]), in the derivations: “Fp2-T4”, “Fp2-C4”, “Fz-Cz”, “Fp1-C3”, “Fp1-T3”, “T4-O2”, “C4-O2”, “Cz-Pz”, “C3-O1”, and “T3-O1”. The 30 remaining columns are binary labels. The labels come in groups of 3, each for a specific derivation. In each group, the first label marks the presence of Front Sharp Transients, the second label Anterior Slow Dysrythmia, and the third label Spike. The first group is for Fp2-T4 (columns 10-12), the second group is for Fp2-C4 (columns 13-15) and so on."
    ))
    quit(save = "no", status = 0)
}

## Index of the window files produced so far (one file name per row).
## If "files.csv" cannot be read, start from an empty index.
file_list <- tryCatch(
    readr::read_csv(
        "files.csv",
        col_names = "filename",
        col_types = list(readr::col_character())
    ),
    error = function(cond) {
        tibble::tibble(filename = character(0))
    }
)

## Association between an input file ("patient") and the window files
## extracted from it. If "subjects.csv" cannot be read, start empty.
subjects <- tryCatch(
    readr::read_csv(
        "subjects.csv",
        col_names = c("patient", "filename"),
        col_types = list(readr::col_character(), readr::col_character())
    ),
    error = function(cond) {
        tibble::tibble(patient = character(0), filename = character(0))
    }
)

## ---------------------------------------------------------------------
## Constants shared by every recording.
## ---------------------------------------------------------------------

## Factor applied to annotation onsets/durations to obtain sample
## indices (presumably seconds at a 256 Hz sampling rate -- confirm).
sampling_rate <- 256
## Each exported window holds 2560 consecutive samples and a new window
## starts every 3072 samples, so consecutive windows never overlap.
window_length <- 10 * sampling_rate
window_step <- 12 * sampling_rate
## Channels stored in the file, in storage order.
electrodes <- c("Fp2", "T4", "C4", "O2", "Fz", "Cz", "Pz", "Fp1", "T3", "C3", "O1")
n_electrodes <- length(electrodes)
stopifnot(n_electrodes == 11)
## Derivations exported to the dataset, in output column order.
derivations <- c("Fp2-T4", "Fp2-C4", "Fz-Cz", "Fp1-C3", "Fp1-T3",
                 "T4-O2", "C4-O2", "Cz-Pz", "C3-O1", "T3-O1")
## Identifiers used by the annotation files for locations and classes.
ontology_prefix <- "https://neonatool.github.io/adftool-v2#"
derivation_uris <- paste0(ontology_prefix, derivations)
figure_class_uris <- paste0(ontology_prefix, c("FSP", "ASD", "Spike"))
## Label column matching each entry of figure_class_uris.
label_columns <- c("fsp", "asd", "spike")
## Modification time given to every produced file.
date_reset <- as.POSIXct("2000-01-01", tz = "UTC")

## ---------------------------------------------------------------------
## Process every file named on the command line.
##
## For each file: decode the first copy of the signal, compute the
## derivations, mark annotated samples, then write every complete window
## to a CSV file named after the SHA-256 of its contents and record it in
## `file_list` and `subjects`.
## ---------------------------------------------------------------------
for (filename in args) {
    ## Work out the number of time points from the file size.
    file_size <- file.size(filename)
    if (is.na(file_size)) {
        stop(sprintf("Cannot read file %s", filename), call. = FALSE)
    }
    ## Each value is encoded on 2 bytes.
    stopifnot(file_size %% 2 == 0)
    n_values <- file_size / 2
    ## There are 2 copies of the signal (one per filter variant).
    stopifnot(n_values %% 2 == 0)
    signal_length <- n_values / 2
    ## Each copy has one equally long run of samples per electrode.
    stopifnot(signal_length %% n_electrodes == 0)
    n_times <- signal_length / n_electrodes
    if (n_times == 0) {
        ## Nothing to extract; skip rather than fail further down and
        ## lose the index entries of the files already processed.
        warning(sprintf("File %s is empty, no window extracted", filename))
        next
    }

    ## Only the first copy is read: n_electrodes * n_times big-endian
    ## unsigned 16-bit integers.
    encoded_signal <- readBin(filename, "integer",
                              n = n_electrodes * n_times,
                              size = 2, signed = FALSE, endian = "big")
    stopifnot(length(encoded_signal) == n_electrodes * n_times)

    ## Annotations live next to the recording; a file that cannot be
    ## read is reported and treated as "no annotation".
    annotations <- tryCatch({
        readr::read_csv(
            sprintf("%s.annotations.csv", filename),
            col_names = c("onset", "duration", "figure_class", "electrode_list"),
            col_types = list(readr::col_double(), readr::col_double(),
                             readr::col_character(), readr::col_character())
        )
    }, error = function(e) {
        warning(sprintf("No annotation found for file %s", filename))
        tibble::tibble(onset = numeric(0), duration = numeric(0),
                       figure_class = character(0), electrode_list = character(0))
    })

    ## Long table with one row per (time, derivation): the derived signal
    ## value plus three label columns, all initially FALSE.
    ## Raw values are mapped linearly from [0, 65536) to [-1024, 1024).
    ## The samples are stored electrode after electrode.
    signal <- tibble::tibble(
        value = (encoded_signal / 65536.0) * 2048.0 - 1024.0,
        electrode = rep(electrodes, each = n_times),
        time = as.integer(rep(seq_len(n_times), n_electrodes))
    ) %>%
        tidyr::pivot_wider(names_from = electrode, values_from = value,
                           id_cols = c(time)) %>%
        dplyr::mutate(`Fp2-T4` = Fp2 - T4,
                      `Fp2-C4` = Fp2 - C4,
                      `Fz-Cz` = Fz - Cz,
                      `Fp1-C3` = Fp1 - C3,
                      `Fp1-T3` = Fp1 - T3,
                      `T4-O2` = T4 - O2,
                      `C4-O2` = C4 - O2,
                      `Cz-Pz` = Cz - Pz,
                      `C3-O1` = C3 - O1,
                      `T3-O1` = T3 - O1) %>%
        dplyr::select(time, dplyr::all_of(derivations)) %>%
        tidyr::pivot_longer(!time, names_to = "electrode", values_to = "value") %>%
        dplyr::mutate(fsp = FALSE, asd = FALSE, spike = FALSE)

    ## Turn every annotation into TRUE labels on the samples it covers.
    for (i in seq_len(nrow(annotations))) {
        annotation_start <- annotations$onset[i]
        annotation_end <- annotations$duration[i] + annotation_start
        ## NOTE(review): `time` is 1-based whereas these indices are
        ## 0-based for an onset of 0, so the labelled span may be shifted
        ## by one sample -- confirm the intended convention.
        start_index <- annotation_start * sampling_rate
        stop_index <- annotation_end * sampling_rate

        ## Locations are ";"-separated identifiers. match() rather than
        ## `==` so that a missing value is reported as unusable instead
        ## of aborting the script.
        locations <- strsplit(annotations$electrode_list[i], ";")[[1]]
        location_hits <- match(locations, derivation_uris)
        electrode_names <- derivations[location_hits[!is.na(location_hits)]]
        for (loc in locations[is.na(location_hits)]) {
            warning(sprintf("File %s, annotation %s at %f: unusable location %s",
                            filename, annotations$figure_class[i],
                            annotations$onset[i], loc))
        }

        relevant_rows <- which(signal$time >= start_index &
                               signal$time < stop_index &
                               signal$electrode %in% electrode_names)
        if (length(relevant_rows) == 0) {
            warning(sprintf("File %s, annotation %s at %f does not have any observable effect",
                            filename, annotations$figure_class[i],
                            annotations$onset[i]))
        }

        ## NA when the class is unknown or missing.
        relevant_column <- label_columns[match(annotations$figure_class[i],
                                               figure_class_uris)]
        if (is.na(relevant_column)) {
            warning(sprintf("File %s, annotation %s at %f does not have a usable figure class",
                            filename, annotations$figure_class[i],
                            annotations$onset[i]))
        } else {
            signal[[relevant_column]][relevant_rows] <- TRUE
        }
    }

    ## Cut the recording into windows and export the complete ones.
    for (time_start in seq(1, n_times, by = window_step)) {
        time_stop <- time_start + window_length - 1
        ## One row per (time, derivation, kind), kind varying fastest so
        ## that the labels of a derivation stay grouped as fsp, asd, spike.
        relevant_part <- signal %>%
            dplyr::filter(time >= time_start & time <= time_stop) %>%
            tidyr::pivot_longer(c(value, fsp, asd, spike), names_to = "kind",
                                values_to = "value", cols_vary = "fastest")
        relevant_signal_part <- relevant_part %>%
            dplyr::filter(kind == "value") %>%
            dplyr::select(time, feature = electrode, value)
        relevant_label_part <- relevant_part %>%
            dplyr::filter(kind != "value") %>%
            tidyr::unite(feature, c(electrode, kind))
        ## Wide matrix: the 10 signal columns first, then the 30 label
        ## columns, one row per time point in chronological order.
        data_matrix <- dplyr::bind_rows(relevant_signal_part, relevant_label_part) %>%
            tidyr::pivot_wider(names_from = feature, values_from = value,
                               id_cols = c(time)) %>%
            dplyr::arrange(time) %>%
            dplyr::select(!time)
        stopifnot(nrow(data_matrix) <= window_length)
        stopifnot(ncol(data_matrix) == 40)
        ## Far enough from the end of the recording, a window must be
        ## complete.
        if (time_start + sampling_rate * 15 - 1 <= n_times) {
            stopifnot(nrow(data_matrix) == window_length)
        }

        ## Incomplete trailing windows are dropped.
        if (nrow(data_matrix) == window_length) {
            readr::write_csv(data_matrix, "next.csv", col_names = FALSE)
            digest <- openssl::sha256(file("next.csv"))
            new_name <- sprintf("%s.csv", digest)
            ## Do not index a window whose file could not be put in place.
            if (!file.rename("next.csv", new_name)) {
                stop(sprintf("Could not rename next.csv to %s", new_name),
                     call. = FALSE)
            }
            Sys.setFileTime(new_name, date_reset)
            file_list <- file_list %>% dplyr::add_row(filename = new_name)
            subjects <- subjects %>%
                dplyr::add_row(patient = filename, filename = new_name)
        }
    }
}

## Write both indexes back, sorted and without a header line.
##
## Rows are de-duplicated first: window files are named after the hash of
## their contents, so processing a recording that was already processed
## would otherwise append the very same rows a second time.
readr::write_csv(
    file_list %>% dplyr::distinct() %>% dplyr::arrange(filename),
    "files.csv",
    col_names = FALSE
)
readr::write_csv(
    subjects %>% dplyr::distinct() %>% dplyr::arrange(patient),
    "subjects.csv",
    col_names = FALSE
)
