TSENAT coverage - 93.82%

Files
Source

# Helpers for Tsallis entropy calculations

.tsenat_calc_S <- function(p, q, tol, n, log_base, norm) {
    vapply(q, function(qi) {
        if (abs(qi - 1) < tol) {
            sh <- -sum(ifelse(p > 0, p * log(p, base = log_base), 0))
            if (norm) {
                if (n <= 1) {
                  # Single isoform: normalization is undefined (0/0) Return NaN
                  # to indicate undefined normalization
                  sh <- NaN
                } else {
                  sh <- sh/log(n, base = log_base)
                }
            }
            return(sh)
        } else {
            ts <- (1 - sum(p^qi))/(qi - 1)
            if (norm) {
                max_ts <- (1 - n^(1 - qi))/(qi - 1)
                # Handle single-isoform case: ts is 0, max_ts is also 0, so 0/0
                # is undefined Return NaN to indicate undefined normalization
                if (abs(max_ts) < tol || n <= 1) {
                  ts <- NaN
                } else {
                  ts <- ts/max_ts
                }
            }
            return(ts)
        }
    }, numeric(1))
}

.tsenat_calc_D <- function(p, q, tol, log_base) {
    vapply(q, function(qi) {
        if (abs(qi - 1) < tol) {
            sh <- -sum(ifelse(p > 0, p * log(p, base = log_base), 0))
            D1 <- (log_base)^(sh)
            return(D1)
        } else {
            spq <- sum(p^qi)
            Dq <- spq^(1/(1 - qi))
            return(Dq)
        }
    }, numeric(1))
}
# Input preparation
.tsenat_prepare_diversity_input <- function(x, genes = NULL, tpm = FALSE, assayno = 1,
    verbose = FALSE) {
    if (!(is.matrix(x) || is.data.frame(x) || is.list(x) || is(x, "DGEList") || is(x,
        "RangedSummarizedExperiment") || is(x, "SummarizedExperiment"))) {
        stop("Input data type is not supported! Please use ?calculate_diversity to see the possible arguments and details.",
            call. = FALSE)
    }

    if (is(x, "data.frame")) {
        x <- as.matrix(x)
    }

    if (tpm == TRUE && !is.list(x) && verbose == TRUE) {
        message("Note: tpm as a logical argument is only interpreted in case of",
            " tximport lists.")
    }

    se_assay_mat <- NULL
    if (is.list(x)) {
        if (length(x) == 4 && "counts" %in% names(x)) {
            if (tpm == FALSE) {
                x <- as.matrix(x$counts)
            }
            if (tpm == TRUE) {
                x <- as.matrix(x$abundance)
            }
        } else if (is(x, "DGEList")) {
            x <- as.matrix(x$counts)
            if (verbose == TRUE) {
                message("Note: calculate_diversity methods are only applicable",
                  " if your DGEList contains transcript-level expression", " data.")
            }
            if (tpm == TRUE && verbose == TRUE) {
                message("Note: tpm as a logical argument is only interpreted", " in case of tximport lists.")
            }
        } else {
            stop("The package cannot find any expression data in your input.", call. = FALSE)
        }
    }

    if (is(x, "RangedSummarizedExperiment") || is(x, "SummarizedExperiment")) {
        md <- NULL
        try(md <- S4Vectors::metadata(x), silent = TRUE)
        if (!is.null(md) && !is.null(md$readcounts)) {
            se_assay_mat <- as.matrix(md$readcounts)
            x <- se_assay_mat
        } else {
            assays_len <- length(SummarizedExperiment::assays(x))
            if (!is.numeric(assayno) || assays_len < assayno) {
                stop("Please provide a valid assay number.", call. = FALSE)
            }
            se_assay_mat <- as.matrix(SummarizedExperiment::assays(x)[[assayno]])
            x <- se_assay_mat
        }
        if (is.null(genes)) {
            if (exists("se_assay_mat") && !is.null(md) && !is.null(md$tx2gene) &&
                is.data.frame(md$tx2gene)) {
                txmap <- md$tx2gene
                tx_col <- if ("Transcript" %in% colnames(txmap)) {
                  "Transcript"
                } else {
                  colnames(txmap)[1]
                }
                gene_col <- if ("Gen" %in% colnames(txmap)) {
                  "Gen"
                } else {
                  colnames(txmap)[2]
                }
                genes <- as.character(txmap[[gene_col]][match(rownames(se_assay_mat),
                  txmap[[tx_col]])])
                rownames(x) <- NULL
            } else {
                genes <- rownames(x)
                # keep transcript rownames available for downstream metadata
                # (we will store original transcript-level counts separately)
                rownames(x) <- NULL
            }

            if (is.null(genes)) {
                stop("Please construct a valid gene set for your ", "SummarizedExperiment.",
                  call. = FALSE)
            }
        }
    }

    list(x = x, genes = genes, se_assay_mat = se_assay_mat)
}

#' Map external coldata into a SummarizedExperiment
#'
#' Map an external `coldata` table (sample IDs and a condition/label column)
#' into a `SummarizedExperiment` produced by `calculate_diversity()`. The
#' helper sets `colData(ts_se)$sample_type` when possible and records a
#' `sample_base` identifier for each column. Unmatched entries remain NA and
#' no automatic inference is attempted.
#' @param ts_se A `SummarizedExperiment` whose assay columns are named with
#'   sample identifiers. Names may include per-sample annotations; mapping is
#'   exact and case-sensitive unless you normalize identifiers beforehand.
#' @param coldata A data.frame with sample metadata (or `NULL`).
#' @param coldata_sample_col Name of the column in `coldata` containing sample
#'   identifiers (default: 'Sample').
#' @param coldata_condition_col Name of the column in `coldata` with
#'   condition/labels (default: 'Condition').
#' @param paired Logical; if `TRUE`, validate pairing and reorder columns so
#'   that matched samples for each base are adjacent (default: `FALSE`). Use
#'   `paired = TRUE` for paired analyses so downstream paired tests see
#'   aligned samples regardless of the original `coldata` order.
#' @return The input `ts_se` with `colData(ts_se)$sample_type` populated when
#' possible. `colData(ts_se)$sample_base` is also added containing the base
#' sample identifier; rownames of `colData(ts_se)` are aligned to the assay
#' column names.
#' @export
#' @examples
#' data('tcga_brca_luma', package = 'TSENAT')
#' rc <- as.matrix(tcga_brca_luma[1:20, -1, drop = FALSE])
#' gs <- tcga_brca_luma[1:20, 1]
#' se <- calculate_diversity(rc, gs, q = 0.1, norm = TRUE)
#' sample_names <- sub('_q=.*', '', colnames(SummarizedExperiment::assay(se)))
#' coldata_df <- data.frame(Sample = sample_names, Condition = rep(c('A', 'B'),
#'     length.out = ncol(se)
#' ))
#' map_metadata(se, coldata_df)
#' # Optionally validate pairs when appropriate
#' map_metadata(se, coldata_df, paired = TRUE)
map_metadata <- function(ts_se, coldata, coldata_sample_col = "Sample", coldata_condition_col = "Condition",
    paired = FALSE) {
    if (is.null(coldata)) {
        return(ts_se)
    }
    if (!is.data.frame(coldata) || !all(c(coldata_sample_col, coldata_condition_col) %in%
        colnames(coldata))) {
        return(ts_se)
    }
    # Prepare canonical condition order and base extraction.  `conds` is sorted
    # for deterministic ordering when constructing paired column order.
    # `coldata_base` removes a trailing role suffix (e.g.  `_N`/`_T`) from the
    # `coldata` sample IDs; note that assay column names may contain per-sample
    # annotations (e.g. per-q tags) which are handled separately below when
    # building `sample_base_names`.
    conds <- sort(unique(as.character(coldata[[coldata_condition_col]])))
    coldata_base <- sub("_[^_]+$", "", as.character(coldata[[coldata_sample_col]]))
    bases <- unique(coldata_base)

    # Optionally validate that coldata defines pairing between conditions
    if (isTRUE(paired) && length(conds) >= 2) {
        unpaired <- vapply(bases, function(b) {
            length(unique(coldata[[coldata_condition_col]][coldata_base == b]))
        }, integer(1))
        bad <- bases[unpaired != length(conds)]
        if (length(bad) > 0) {
            bad_list <- paste(bad, collapse = ", ")
            cond_list <- paste(conds, collapse = ", ")
            msg <- paste0("Unpaired samples found in coldata for bases: ", bad_list,
                ". Ensure each base has all conditions: ", cond_list)
            stop(msg, call. = FALSE)
        }
    }
    sample_base_names <- sub("_q=.*", "", colnames(SummarizedExperiment::assay(ts_se)))

    # Reorder the SummarizedExperiment columns.
    base_names <- sample_base_names
    idx_list <- integer(0)
    if (isTRUE(paired)) {
        # Build paired-aware order: for each base, list samples in the
        # deterministic `conds` order so matched samples become adjacent.
        for (b in bases) {
            for (c in conds) {
                # find sample names in coldata for this base+condition
                samples_for_pair <- as.character(coldata[[coldata_sample_col]])[coldata_base ==
                  b & as.character(coldata[[coldata_condition_col]]) == c]
                if (length(samples_for_pair) == 0) {
                  next
                }
                # for each sample name, find matching assay columns (after
                # q-stripping)
                for (s in samples_for_pair) {
                  matches <- which(base_names == s)
                  if (length(matches) > 0) {
                    idx_list <- c(idx_list, matches)
                  }
                }
            }
        }
        # Preserve original order for any unmatched columns
        remaining <- setdiff(seq_along(base_names), idx_list)
        new_order <- c(idx_list, remaining)
    } else {
        # Non-paired: follow the order present in `coldata`
        ordered_samples <- as.character(coldata[[coldata_sample_col]])
        for (s in ordered_samples) {
            matches <- which(base_names == s)
            if (length(matches) > 0) {
                idx_list <- c(idx_list, matches)
            }
        }
        remaining <- setdiff(seq_along(base_names), idx_list)
        new_order <- c(idx_list, remaining)
    }
    if (length(new_order) > 0 && !all(new_order == seq_along(base_names))) {
        ts_se <- ts_se[, new_order, drop = FALSE]
        sample_base_names <- sub("_q=.*", "", colnames(SummarizedExperiment::assay(ts_se)))
    }
    st_map <- setNames(as.character(coldata[[coldata_condition_col]]), as.character(coldata[[coldata_sample_col]]))
    sample_types <- unname(st_map[sample_base_names])
    missing_idx <- which(is.na(sample_types))
    if (length(missing_idx) > 0) {
        missing_samples <- sample_base_names[missing_idx]
        msg <- paste0("map_metadata: unmatched samples in 'coldata': ", paste(missing_samples,
            collapse = ", "), ". Provide matching entries in 'coldata' or populate ",
            "colData(ts_se)$sample_type beforehand.")
        stop(msg, call. = FALSE)
    }
    SummarizedExperiment::colData(ts_se)$sample_type <- sample_types
    # Also record the base sample names (without q suffix) per column so
    # downstream functions can rely on explicit pairing information.
    SummarizedExperiment::colData(ts_se)$sample_base <- sample_base_names
    # Ensure rownames of colData match assay column names
    assay_cols <- colnames(SummarizedExperiment::assay(ts_se))
    rownames(SummarizedExperiment::colData(ts_se)) <- assay_cols
    # Attach transcript-level readcounts and tx->gene mapping to metadata if
    # they are available in the calling environment or globalenv and not
    # already present in the SummarizedExperiment metadata. This simplifies
    # downstream plotting helpers that expect these objects.
    if (requireNamespace("S4Vectors", quietly = TRUE)) {
        md <- S4Vectors::metadata(ts_se)
        # prefer existing metadata values; otherwise try common names
        if (is.null(md$readcounts)) {
            if (exists("readcounts", envir = parent.frame())) {
                md$readcounts <- get("readcounts", envir = parent.frame())
            } else if (exists("readcounts", envir = globalenv())) {
                md$readcounts <- get("readcounts", envir = globalenv())
            }
        }
        if (is.null(md$tx2gene)) {
            # vignette uses 'txmap' variable name; also accept 'tx2gene'
            if (exists("txmap", envir = parent.frame())) {
                md$tx2gene <- get("txmap", envir = parent.frame())
            } else if (exists("tx2gene", envir = parent.frame())) {
                md$tx2gene <- get("tx2gene", envir = parent.frame())
            } else if (exists("txmap", envir = globalenv())) {
                md$tx2gene <- get("txmap", envir = globalenv())
            } else if (exists("tx2gene", envir = globalenv())) {
                md$tx2gene <- get("tx2gene", envir = globalenv())
            }
        }
        S4Vectors::metadata(ts_se) <- md
    }
    # If a diversity assay is present, prepare a simple diversity data.frame
    # (genes + per-sample diversity values) and store it in metadata so the
    # vignette and plotting helpers can use a ready-made table.
    if ("diversity" %in% SummarizedExperiment::assayNames(ts_se)) {
        div_mat <- as.matrix(SummarizedExperiment::assay(ts_se, "diversity"))
        # sample base names without per-q suffixes
        sample_base_names <- sub("_q=.*", "", colnames(div_mat))
        # prefer explicit sample_type in colData when present
        samples_vec <- NULL
        if ("sample_type" %in% colnames(SummarizedExperiment::colData(ts_se))) {
            samples_vec <- as.character(SummarizedExperiment::colData(ts_se)$sample_type)
        }
        div_df <- as.data.frame(div_mat)
        genes_col <- if (!is.null(SummarizedExperiment::rowData(ts_se)$genes)) {
            SummarizedExperiment::rowData(ts_se)$genes
        } else {
            rownames(div_df)
        }
        div_df <- cbind(genes = genes_col, div_df)
        md2 <- S4Vectors::metadata(ts_se)
        md2$diversity_df <- div_df
        md2$sample_base_names <- sample_base_names
        if (!is.null(samples_vec)) {
            md2$samples <- samples_vec
        }
        S4Vectors::metadata(ts_se) <- md2
    }
    return(ts_se)
}

# Map sample names (without '_q=...') to group labels using `colData(se)`.
# Mapping must be provided via `colData(se)`; no inference fallback is used.
map_samples_to_group <- function(sample_names, se = NULL, sample_type_col = NULL,
    mat = NULL) {
    # Prefer explicit mapping from colData(se)[, sample_type_col] when
    # provided. If `sample_type_col` is not provided, allow a single- condition
    # dataset by assigning a single default group 'Group' to all samples (this
    # permits plotting single-condition q-curves).
    base_names <- sub("_q=.*", "", colnames(if (!is.null(mat)) {
        mat
    } else {
        SummarizedExperiment::assay(se)
    }))

    if (!is.null(se) && !is.null(sample_type_col) && (sample_type_col %in% colnames(SummarizedExperiment::colData(se)))) {
        st_vec <- as.character(SummarizedExperiment::colData(se)[, sample_type_col])
        names(st_vec) <- base_names
        st_map <- st_vec[!duplicated(names(st_vec))]
    } else {
        # No explicit mapping: assume single-group dataset
        st_map <- setNames(rep("Group", length(base_names)), base_names)
    }

    mapped <- unname(st_map[sample_names])
    missing_idx <- which(is.na(mapped))
    if (length(missing_idx) > 0) {
        stop(sprintf("Missing sample_type mapping for samples: %s", paste(unique(sample_names[missing_idx]),
            collapse = ", ")))
    }
    mapped
}

# Prepare a long-format data.frame for a simple assay (one value per sample)
get_assay_long <- function(se, assay_name = "diversity", value_name = "diversity",
    sample_type_col = NULL) {
    if (!requireNamespace("tidyr", quietly = TRUE)) {
        stop("tidyr required")
    }
    if (!requireNamespace("dplyr", quietly = TRUE)) {
        stop("dplyr required")
    }
    if (!requireNamespace("SummarizedExperiment", quietly = TRUE)) {
        stop("SummarizedExperiment required")
    }

    mat <- SummarizedExperiment::assay(se, assay_name)
    if (is.null(mat)) {
        stop("Assay not found: ", assay_name)
    }
    df <- as.data.frame(mat)
    genes_col <- if (!is.null(SummarizedExperiment::rowData(se)$genes)) {
        SummarizedExperiment::rowData(se)$genes
    } else {
        rownames(df)
    }
    df <- cbind(df, Gene = genes_col)
    long <- tidyr::pivot_longer(df, -Gene, names_to = "sample", values_to = value_name)

    # sample_type: prefer explicit colData mapping when available. If not
    # provided, assume a single-group dataset and set `sample_type` to 'Group'
    # for all samples.
    if (!is.null(sample_type_col) && (sample_type_col %in% colnames(SummarizedExperiment::colData(se)))) {
        st <- as.character(SummarizedExperiment::colData(se)[, sample_type_col])
        names(st) <- colnames(mat)
        st_map <- st[!duplicated(names(st))]
        sample_base <- sub("_q=.*", "", long$sample)
        long$sample_type <- unname(st_map[sample_base])
        missing_idx <- which(is.na(long$sample_type))
        if (length(missing_idx) > 0) {
            stop(sprintf("Missing sample_type mapping for samples: %s", paste(unique(sample_base[missing_idx]),
                collapse = ", ")))
        }
    } else {
        long$sample_type <- rep("Group", nrow(long))
    }

    # Filter out NA values but retain sample_type information
    long_filtered <- long[!is.na(long[[value_name]]), , drop = FALSE]

    # Check if any data remains after filtering
    if (nrow(long_filtered) == 0) {
        stop(sprintf("No non-NA values found in assay '%s'. All values are NA.",
            assay_name), call. = FALSE)
    }

    long_filtered
}

# Internal small helper: prepare long-format tsallis data from a
# SummarizedExperiment
prepare_tsallis_long <- function(se, assay_name = "diversity", sample_type_col = "sample_type") {
    if (!requireNamespace("tidyr", quietly = TRUE)) {
        stop("tidyr required")
    }
    if (!requireNamespace("dplyr", quietly = TRUE)) {
        stop("dplyr required")
    }
    if (!requireNamespace("SummarizedExperiment", quietly = TRUE)) {
        stop("SummarizedExperiment required")
    }

    mat <- SummarizedExperiment::assay(se, assay_name)
    if (is.null(mat)) {
        stop("Assay not found: ", assay_name)
    }
    df <- as.data.frame(mat)
    genes_col <- if (!is.null(SummarizedExperiment::rowData(se)$genes)) {
        SummarizedExperiment::rowData(se)$genes
    } else {
        rownames(df)
    }
    df <- cbind(df, Gene = genes_col)

    long <- tidyr::pivot_longer(df, -Gene, names_to = "sample_q", values_to = "tsallis")
    if (any(grepl("_q=", long$sample_q))) {
        long <- tidyr::separate(long, sample_q, into = c("sample", "q"), sep = "_q=")
        long$q <- as.factor(as.numeric(long$q))
    } else {
        long$sample <- long$sample_q
        long$q <- NA
    }

    if (!is.null(sample_type_col) && (sample_type_col %in% colnames(SummarizedExperiment::colData(se)))) {
        st_vec <- as.character(SummarizedExperiment::colData(se)[, sample_type_col])
        names(st_vec) <- sub("_q=.*", "", colnames(mat))
        st_map <- st_vec[!duplicated(names(st_vec))]
        long$group <- unname(st_map[as.character(long$sample)])
        missing_idx <- which(is.na(long$group))
        if (length(missing_idx) > 0) {
            stop(sprintf("Missing sample_type mapping for samples: %s", paste(unique(as.character(long$sample)[missing_idx]),
                collapse = ", ")))
        }
    } else {
        long$group <- rep("Group", nrow(long))
    }

    as.data.frame(long[!is.na(long$tsallis), , drop = FALSE])
}


#' Map transcript IDs from a tx2gene table to a readcounts matrix
#'
#' Assign transcript identifiers as row names of a transcript-level read
#' counts matrix so downstream plotting functions can identify transcripts.
#' The tx2gene mapping can be provided as a file path (TSV) or a
#' data.frame with a `Transcript` column.
#'
#' @param readcounts A numeric matrix or data.frame of read counts (rows = transcripts).
#' @param tx2gene Either a path to a tab-delimited file or a data.frame with at least a `Transcript` column.
#' @param tx_col Name of the transcript ID column in `tx2gene` (default: 'Transcript').
#' @param verbose Logical; print informative messages (default: FALSE).
#' @return The input `readcounts` with rownames set to the transcript IDs.
#' @examples
#' readcounts <- matrix(1:4, nrow = 2, dimnames = list(NULL, c('s1', 's2')))
#' tx2gene <- data.frame(Transcript = c('tx1', 'tx2'), Gene = c('g1', 'g2'), stringsAsFactors = FALSE)
#' rc <- map_tx_to_readcounts(readcounts, tx2gene)
#' rownames(rc)
#' @export
map_tx_to_readcounts <- function(readcounts, tx2gene, tx_col = "Transcript", verbose = FALSE) {
    if (is.character(tx2gene) && length(tx2gene) == 1) {
        if (!file.exists(tx2gene)) {
            stop("tx2gene file not found: ", tx2gene, call. = FALSE)
        }
        txmap <- utils::read.delim(tx2gene, header = TRUE, stringsAsFactors = FALSE)
    } else if (is.data.frame(tx2gene)) {
        txmap <- tx2gene
    } else {
        stop("'tx2gene' must be a file path or a data.frame.", call. = FALSE)
    }

    if (!(tx_col %in% colnames(txmap))) {
        stop(sprintf("tx2gene mapping must contain column '%s'", tx_col), call. = FALSE)
    }

    # Ensure readcounts is a matrix-like object
    if (is.data.frame(readcounts)) {
        readcounts <- as.matrix(readcounts)
    }
    if (!is.matrix(readcounts)) {
        stop("'readcounts' must be a matrix or data.frame", call. = FALSE)
    }

    n_rc <- nrow(readcounts)
    n_tx <- nrow(txmap)

    if (n_tx == n_rc) {
        rownames(readcounts) <- as.character(txmap[[tx_col]])
        if (verbose) {
            message(sprintf("Assigned %d transcript rownames from tx2gene.", n_rc))
        }
        return(readcounts)
    }

    # If counts differ, attempt to match by transcript identifiers if present
    tx_ids <- as.character(txmap[[tx_col]])
    if (!is.null(rownames(readcounts)) && all(rownames(readcounts) %in% tx_ids)) {
        # reorder txmap to match readcounts row order and set rownames
        matched_idx <- match(rownames(readcounts), tx_ids)
        rownames(readcounts) <- tx_ids[matched_idx]
        if (verbose) {
            message("Matched and assigned transcript IDs by existing readcounts rownames.")
        }
        return(readcounts)
    }

    stop(sprintf("Number of transcripts in tx2gene (%d) does not match readcounts rows (%d), and automatic matching failed.",
        n_tx, n_rc), call. = FALSE)
}

# Summary reporting helper
.tsenat_report_fit_summary <- function(res, verbose = TRUE) {
    if (verbose && "fit_method" %in% colnames(res)) {
        total_genes <- nrow(res)
        fallback_mask <- !is.na(res$fit_method) & res$fit_method != "lmer"
        n_fallback <- sum(fallback_mask)
        if (n_fallback > 0) {
            tab <- table(res$fit_method[fallback_mask])
            tab_str <- paste(sprintf("%s=%d", names(tab), as.integer(tab)), collapse = ", ")
            message(sprintf("[calculate_lm_interaction] fallback fits used: %d/%d genes (%s)",
                n_fallback, total_genes, tab_str))
        }
        if ("singular" %in% colnames(res)) {
            n_sing <- sum(as.logical(res$singular), na.rm = TRUE)
            if (n_sing > 0) {
                message(sprintf("[calculate_lm_interaction] singular fits detected: %d/%d genes",
                  n_sing, total_genes))
            }
        }
    }
}
# GAM interaction helper
.tsenat_gam_interaction <- function(df, q_vals, g, min_obs = 10) {
    if (!requireNamespace("mgcv", quietly = TRUE)) {
        stop("Package 'mgcv' is required for method = 'gam'")
    }
    uq_len <- length(unique(na.omit(q_vals)))
    k_q <- max(2, min(10, uq_len - 1))
    fit_null <- try(mgcv::gam(entropy ~ group + s(q, k = k_q), data = df), silent = TRUE)
    fit_alt <- try(mgcv::gam(entropy ~ group + s(q, by = group, k = k_q), data = df),
        silent = TRUE)
    if (inherits(fit_null, "try-error") || inherits(fit_alt, "try-error")) {
        return(NULL)
    }
    an <- try(mgcv::anova.gam(fit_null, fit_alt, test = "F"), silent = TRUE)
    if (inherits(an, "try-error")) {
        return(NULL)
    }
    p_interaction <- NA_real_
    if (nrow(an) >= 2) {
        if ("Pr(F)" %in% colnames(an)) {
            p_interaction <- an[2, "Pr(F)"]
        } else if ("Pr(>F)" %in% colnames(an)) {
            p_interaction <- an[2, "Pr(>F)"]
        } else if ("p-value" %in% colnames(an)) {
            p_interaction <- an[2, "p-value"]
        }
    }
    return(data.frame(gene = g, p_interaction = p_interaction, stringsAsFactors = FALSE))
}

# FPCA interaction helper
.tsenat_fpca_interaction <- function(mat, q_vals, sample_names, group_vec, g, min_obs = 10) {
    uq <- sort(unique(q_vals))
    samples_u <- unique(sample_names)
    curve_mat <- matrix(NA_real_, nrow = length(samples_u), ncol = length(uq))
    if (length(samples_u) > 0) {
        rownames(curve_mat) <- samples_u
    }
    for (i in seq_along(sample_names)) {
        s <- sample_names[i]
        qv <- q_vals[i]
        qi <- match(qv, uq)
        if (is.na(qi)) {
            next
        }
        if (s %in% rownames(curve_mat)) {
            curve_mat[s, qi] <- as.numeric(mat[g, i])
        }
    }
    good_rows <- which(rowSums(!is.na(curve_mat)) >= max(2, ceiling(ncol(curve_mat)/2)))
    if (length(good_rows) < min_obs) {
        return(NULL)
    }
    mat_sub <- curve_mat[good_rows, , drop = FALSE]
    col_means <- apply(mat_sub, 2, function(col) mean(col, na.rm = TRUE))
    for (r in seq_len(nrow(mat_sub))) mat_sub[r, is.na(mat_sub[r, ])] <- col_means[is.na(mat_sub[r,
        ])]
    pca <- try(stats::prcomp(mat_sub, center = TRUE, scale. = FALSE), silent = TRUE)
    if (inherits(pca, "try-error")) {
        return(NULL)
    }
    if (ncol(pca$x) < 1) {
        return(NULL)
    }
    pc1 <- pca$x[, 1]
    used_samples <- rownames(mat_sub)
    grp_vals <- group_vec[match(used_samples, sample_names)]
    if (length(unique(na.omit(grp_vals))) < 2) {
        return(NULL)
    }
    g1 <- unique(na.omit(grp_vals))[1]
    g2 <- unique(na.omit(grp_vals))[2]
    x1 <- pc1[grp_vals == g1]
    x2 <- pc1[grp_vals == g2]
    if (length(x1) < 2 || length(x2) < 2) {
        return(NULL)
    }
    t_res <- try(stats::t.test(x1, x2), silent = TRUE)
    if (inherits(t_res, "try-error")) {
        return(NULL)
    }
    pval <- as.numeric(t_res$p.value)
    return(data.frame(gene = g, p_interaction = pval, stringsAsFactors = FALSE))
}

# Fit function extracted from calculate_lm_interaction
.tsenat_fit_one_interaction <- function(g, se, mat, q_vals, sample_names, group_vec,
    method, pvalue, subject_col, paired, min_obs, verbose, suppress_lme4_warnings,
    progress) {
    vals <- as.numeric(mat[g, ])
    df <- data.frame(entropy = vals, q = q_vals, group = factor(group_vec))
    if (sum(!is.na(df$entropy)) < min_obs) {
        return(NULL)
    }
    if (length(unique(na.omit(df$group))) < 2) {
        return(NULL)
    }

    if (method == "linear") {
        fit <- try(stats::lm(entropy ~ q * group, data = df), silent = TRUE)
        if (inherits(fit, "try-error")) {
            return(NULL)
        }
        coefs <- summary(fit)$coefficients
        ia_idx <- grep("^q:group", rownames(coefs))
        if (length(ia_idx) == 0) {
            return(NULL)
        }
        p_interaction <- coefs[ia_idx[1], "Pr(>|t|)"]
        return(data.frame(gene = g, p_interaction = p_interaction, stringsAsFactors = FALSE))
    }

    if (method == "lmm") {
        if (!requireNamespace("lme4", quietly = TRUE)) {
            stop("Package 'lme4' is required for method = 'lmm'")
        }
        # determine subject IDs for random effect
        subject <- NULL
        if (!is.null(subject_col)) {
            if (!(subject_col %in% colnames(SummarizedExperiment::colData(se)))) {
                stop(sprintf("subject_col '%s' not found in colData(se)", subject_col))
            }
            subj_full <- as.character(SummarizedExperiment::colData(se)[, subject_col])
            names(subj_full) <- SummarizedExperiment::colData(se)$samples %||% colnames(mat)
            subject <- unname(subj_full[sample_names])
        } else if (paired) {
            # require a paired identifier (sample_base) in colData
            if ("sample_base" %in% colnames(SummarizedExperiment::colData(se))) {
                sb <- as.character(SummarizedExperiment::colData(se)[, "sample_base"])
                names(sb) <- SummarizedExperiment::colData(se)$samples %||% colnames(mat)
                subject <- unname(sb[sample_names])
            } else {
                stop("paired = TRUE but no 'sample_base' column found in colData(se); call map_metadata(..., paired = TRUE) or supply subject_col")
            }
        } else {
            # fallback to sample base derived from column names
            subject <- sample_names
        }
        df$subject <- factor(subject)
        # require at least two subjects and at least two groups represented
        if (length(unique(na.omit(df$subject))) < 2) {
            return(NULL)
        }
        if (length(unique(na.omit(df$group))) < 2) {
            return(NULL)
        }
        # fit null (no interaction) and alternative (with q:group interaction)
        mm_suppress_pattern <- "boundary \\(singular\\) fit|Computed variance-covariance matrix problem|not a positive definite matrix"

        # attempt to fit mixed models (null and alternative) using helper
        fit0 <- .tsenat_try_lmer(entropy ~ q + group + (1 | subject), df, suppress_lme4_warnings = suppress_lme4_warnings,
            verbose = verbose, mm_suppress_pattern = mm_suppress_pattern)
        fit1 <- .tsenat_try_lmer(entropy ~ q * group + (1 | subject), df, suppress_lme4_warnings = suppress_lme4_warnings,
            verbose = verbose, mm_suppress_pattern = mm_suppress_pattern)

        # If either fit failed, try falling back to simpler approaches
        fallback_lm <- NULL
        used_fit_method <- "lmer"
        used_singular <- FALSE
        if (inherits(fit0, "try-error") || inherits(fit1, "try-error") || (inherits(fit0,
            "lmerMod") && attr(fit0, "singular") == TRUE) || (inherits(fit1, "lmerMod") &&
            attr(fit1, "singular") == TRUE)) {
            used_fit_method <- "fallback"
            used_singular <- (inherits(fit0, "lmerMod") && attr(fit0, "singular") ==
                TRUE) || (inherits(fit1, "lmerMod") && attr(fit1, "singular") ==
                TRUE)
            if ((verbose && progress) || (!verbose && progress)) {
                message("[calculate_lm_interaction] mixed model singular or failed; trying simpler fixed-effects fallback")
            }
            fb <- .tsenat_try_lm_fallbacks(df, verbose = verbose)
            if (!is.null(fb)) {
                fallback_lm <- fb
                used_fit_method <- fb$method
            }
        } else {
            used_singular <- (inherits(fit0, "lmerMod") && attr(fit0, "singular") ==
                TRUE) || (inherits(fit1, "lmerMod") && attr(fit1, "singular") ==
                TRUE)
            if (used_singular) {
                used_fit_method <- "lmer_singular"
            } else {
                used_fit_method <- "lmer"
            }
        }

        lrt_p <- NA_real_
        if (!is.null(fallback_lm)) {
            lrt_p <- .tsenat_extract_lrt_p(fallback_lm$fit0, fallback_lm$fit1)
        } else {
            lrt_p <- .tsenat_extract_lrt_p(fit0, fit1)
        }

        satter_p <- NA_real_
        if (pvalue %in% c("satterthwaite", "both")) {
            mm_suppress_pattern <- "boundary \\(singular\\) fit|Computed variance-covariance matrix problem|not a positive definite matrix"
            muffle_cond <- suppress_lme4_warnings || (!verbose)
            satter_p <- withCallingHandlers(.tsenat_extract_satterthwaite_p(fit1,
                fallback_lm), warning = function(w) {
                if (muffle_cond && grepl(mm_suppress_pattern, conditionMessage(w),
                  ignore.case = TRUE)) {
                  invokeRestart("muffleWarning")
                }
            }, message = function(m) {
                if (muffle_cond && grepl(mm_suppress_pattern, conditionMessage(m),
                  ignore.case = TRUE)) {
                  invokeRestart("muffleMessage")
                }
            })
        }

        # choose primary p_interaction according to user preference
        p_interaction <- NA_real_
        if (pvalue == "satterthwaite") {
            p_interaction <- satter_p
        } else if (pvalue == "lrt") {
            p_interaction <- lrt_p
        } else if (pvalue == "both") {
            if (!is.na(satter_p)) {
                p_interaction <- satter_p
            } else {
                p_interaction <- lrt_p
            }
        }

        return(data.frame(gene = g, p_interaction = p_interaction, p_lrt = lrt_p,
            p_satterthwaite = satter_p, fit_method = used_fit_method, singular = used_singular,
            stringsAsFactors = FALSE))
    }

    if (method == "gam") {
        return(.tsenat_gam_interaction(df, q_vals, g, min_obs = min_obs))
    }

    if (method == "fpca") {
        return(.tsenat_fpca_interaction(mat, q_vals, sample_names, group_vec, g,
            min_obs = min_obs))
    }
    return(NULL)
}
## All helpers for calculate_lm_interaction

# Try lme4::lmer with multiple optimizers and controlled warnings.
.tsenat_try_lmer <- function(formula, data, suppress_lme4_warnings = TRUE, verbose = FALSE,
    mm_suppress_pattern = "boundary \\(singular\\) fit|Computed variance-covariance matrix problem|not a positive definite matrix") {
    if (!requireNamespace("lme4", quietly = TRUE)) {
        stop("Package 'lme4' is required for mixed-model fitting")
    }
    opts <- list(list(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05)), list(optimizer = "nloptwrap",
        optCtrl = list(maxfun = 5e+05)))
    for (o in opts) {
        ctrl <- lme4::lmerControl(optimizer = o$optimizer, optCtrl = o$optCtrl)
        muffle_cond <- suppress_lme4_warnings || (!verbose)
        fit_try <- withCallingHandlers(try(lme4::lmer(formula, data = data, REML = FALSE,
            control = ctrl), silent = TRUE), warning = function(w) {
            if (muffle_cond && grepl(mm_suppress_pattern, conditionMessage(w), ignore.case = TRUE)) {
                invokeRestart("muffleWarning")
            }
        }, message = function(m) {
            if (muffle_cond && grepl(mm_suppress_pattern, conditionMessage(m), ignore.case = TRUE)) {
                invokeRestart("muffleMessage")
            }
        })
        if (!inherits(fit_try, "try-error")) {
            # check singularity if function available
            is_sing <- FALSE
            if (exists("isSingular", where = asNamespace("lme4"), inherits = FALSE)) {
                is_sing <- tryCatch(lme4::isSingular(fit_try, tol = 1e-04), error = function(e) FALSE)
            }
            attr(fit_try, "singular") <- is_sing
            return(fit_try)
        }
    }
    # all attempts failed
    return(structure("error", class = "try-error"))
}

# LM fallback helpers
.tsenat_try_lm_fallbacks <- function(df, verbose = FALSE) {
    fit0_lm <- try(stats::lm(entropy ~ q + group + subject, data = df), silent = TRUE)
    fit1_lm <- try(stats::lm(entropy ~ q * group + subject, data = df), silent = TRUE)
    if (!inherits(fit0_lm, "try-error") && !inherits(fit1_lm, "try-error")) {
        return(list(fit0 = fit0_lm, fit1 = fit1_lm, method = "lm_subject"))
    }
    fit0_lm2 <- try(stats::lm(entropy ~ q + group, data = df), silent = TRUE)
    fit1_lm2 <- try(stats::lm(entropy ~ q * group, data = df), silent = TRUE)
    if (!inherits(fit0_lm2, "try-error") && !inherits(fit1_lm2, "try-error")) {
        return(list(fit0 = fit0_lm2, fit1 = fit1_lm2, method = "lm_nosubject"))
    }
    return(NULL)
}

# LRT p-value extraction
.tsenat_extract_lrt_p <- function(fit0, fit1) {
    an <- try(stats::anova(fit0, fit1), silent = TRUE)
    if (!inherits(an, "try-error") && nrow(an) >= 2) {
        pcol <- grep("Pr\\(>F\\)|Pr\\(>Chisq\\)|Pr\\(>Chi\\)", colnames(an), value = TRUE)
        if (length(pcol) == 0) {
            return(as.numeric(an[2, ncol(an)]))
        } else {
            return(as.numeric(an[2, pcol[1]]))
        }
    }
    return(NA_real_)
}

# Satterthwaite p-value extraction
.tsenat_extract_satterthwaite_p <- function(fit1, fallback_lm = NULL) {
    if (!is.null(fallback_lm)) {
        coefs <- try(summary(fallback_lm$fit1)$coefficients, silent = TRUE)
        if (!is.null(coefs) && !inherits(coefs, "try-error")) {
            ia_idx <- grep("^q:group", rownames(coefs))
            if (length(ia_idx) > 0) {
                return(coefs[ia_idx[1], "Pr(>|t|)"])
            }
        }
        return(NA_real_)
    }
    if (requireNamespace("lmerTest", quietly = TRUE) && inherits(fit1, "lmerMod") &&
        !isTRUE(attr(fit1, "singular"))) {
        fit_lt <- try(lmerTest::lmer(stats::formula(fit1), data = stats::model.frame(fit1),
            REML = FALSE), silent = TRUE)
        if (!inherits(fit_lt, "try-error")) {
            coefs <- summary(fit_lt)$coefficients
            ia_idx <- grep("^q:group", rownames(coefs))
            if (length(ia_idx) > 0) {
                return(coefs[ia_idx[1], "Pr(>|t|)"])
            }
        }
    }
    return(NA_real_)
}
.tsenat_extract_satterthwaite_p <- function(fit1, fallback_lm = NULL, suppress_lme4_warnings = TRUE,
    verbose = FALSE, mm_suppress_pattern = "boundary \\(singular\\) fit|Computed variance-covariance matrix problem|not a positive definite matrix") {
    # If we have a fallback lm, extract from its coefficients
    if (!is.null(fallback_lm)) {
        coefs <- try(summary(fallback_lm$fit1)$coefficients, silent = TRUE)
        if (!inherits(coefs, "try-error")) {
            ia_idx <- grep("^q:group", rownames(coefs))
            if (length(ia_idx) > 0) {
                return(coefs[ia_idx[1], "Pr(>|t|)"])
            }
        }
        return(NA_real_)
    }
    # Prefer lmerTest when available; suppress known lme4/lmerTest warnings
    if (requireNamespace("lmerTest", quietly = TRUE) && inherits(fit1, "lmerMod")) {
        muffle_cond <- suppress_lme4_warnings || (!verbose)
        fit_lt <- withCallingHandlers(try(lmerTest::lmer(stats::formula(fit1), data = stats::model.frame(fit1),
            REML = FALSE), silent = TRUE), warning = function(w) {
            if (muffle_cond && grepl(mm_suppress_pattern, conditionMessage(w), ignore.case = TRUE)) {
                invokeRestart("muffleWarning")
            }
        }, message = function(m) {
            if (muffle_cond && grepl(mm_suppress_pattern, conditionMessage(m), ignore.case = TRUE)) {
                invokeRestart("muffleMessage")
            }
        })
        if (!inherits(fit_lt, "try-error")) {
            coefs <- summary(fit_lt)$coefficients
            ia_idx <- grep("^q:group", rownames(coefs))
            if (length(ia_idx) > 0) {
                return(coefs[ia_idx[1], "Pr(>|t|)"])
            }
        }
    }
    return(NA_real_)
}

# FPCA matrix preparation
.tsenat_prepare_fpca_matrix <- function(mat, min_frac = 0.01) {
    if (!is.matrix(mat)) {
        mat <- as.matrix(mat)
    }
    row_vars <- apply(mat, 1, stats::var, na.rm = TRUE)
    keep <- row_vars > (min_frac * max(row_vars, na.rm = TRUE))
    if (sum(keep) == 0) {
        keep <- rep(TRUE, nrow(mat))
    }
    m2 <- mat[keep, , drop = FALSE]
    m2 <- t(scale(t(m2)))
    return(list(mat = m2, keep = keep))
}

## Helpers for calculate_lm_interaction fallbacks, LRT and Satterthwaite
## p-values
.tsenat_try_lm_fallbacks <- function(df, verbose = FALSE) {
    # try lm with subject as fixed effect
    fit0_lm <- try(stats::lm(entropy ~ q + group + subject, data = df), silent = TRUE)
    fit1_lm <- try(stats::lm(entropy ~ q * group + subject, data = df), silent = TRUE)
    if (!inherits(fit0_lm, "try-error") && !inherits(fit1_lm, "try-error")) {
        return(list(fit0 = fit0_lm, fit1 = fit1_lm, method = "lm_subject"))
    }
    # last resort: drop subject, plain lm
    fit0_lm2 <- try(stats::lm(entropy ~ q + group, data = df), silent = TRUE)
    fit1_lm2 <- try(stats::lm(entropy ~ q * group, data = df), silent = TRUE)
    if (!inherits(fit0_lm2, "try-error") && !inherits(fit1_lm2, "try-error")) {
        return(list(fit0 = fit0_lm2, fit1 = fit1_lm2, method = "lm_nosubject"))
    }
    return(NULL)
}

.tsenat_extract_lrt_p <- function(fit0, fit1) {
    an <- try(stats::anova(fit0, fit1), silent = TRUE)
    if (!inherits(an, "try-error") && nrow(an) >= 2) {
        pcol <- grep("Pr\\(>F\\)|Pr\\(>Chisq\\)|Pr\\(>Chi\\)", colnames(an), value = TRUE)
        if (length(pcol) == 0) {
            return(as.numeric(an[2, ncol(an)]))
        } else {
            return(as.numeric(an[2, pcol[1]]))
        }
    }
    return(NA_real_)
}

.tsenat_extract_satterthwaite_p <- function(fit1, fallback_lm = NULL) {
    # if lmerTest available and fit1 is not singular use that
    if (!is.null(fallback_lm)) {
        coefs <- try(summary(fallback_lm$fit1)$coefficients, silent = TRUE)
        if (!inherits(coefs, "try-error")) {
            ia_idx <- grep("^q:group", rownames(coefs))
            if (length(ia_idx) > 0) {
                return(coefs[ia_idx[1], "Pr(>|t|)"])
            }
        }
        return(NA_real_)
    }
    if (requireNamespace("lmerTest", quietly = TRUE) && inherits(fit1, "lmerMod") &&
        !isTRUE(attr(fit1, "singular"))) {
        fit_lt <- try(lmerTest::lmer(stats::formula(fit1), data = stats::model.frame(fit1),
            REML = FALSE), silent = TRUE)
        if (!inherits(fit_lt, "try-error")) {
            coefs <- summary(fit_lt)$coefficients
            ia_idx <- grep("^q:group", rownames(coefs))
            if (length(ia_idx) > 0) {
                return(coefs[ia_idx[1], "Pr(>|t|)"])
            }
        }
    }
    return(NA_real_)
}

.tsenat_prepare_fpca_matrix <- function(mat, min_frac = 0.01) {
    # prepare matrix for FPCA: center, scale, and drop near-constant rows
    if (!is.matrix(mat)) {
        mat <- as.matrix(mat)
    }
    row_vars <- apply(mat, 1, stats::var, na.rm = TRUE)
    keep <- row_vars > (min_frac * max(row_vars, na.rm = TRUE))
    if (sum(keep) == 0) {
        keep <- rep(TRUE, nrow(mat))
    }
    m2 <- mat[keep, , drop = FALSE]
    m2 <- t(scale(t(m2)))
    return(list(mat = m2, keep = keep))
}
## Additional helpers for calculate_lm_interaction

# Try simple lm fallbacks when lmer fails. Returns list(fit0, fit1, method)
.tsenat_try_lm_fallbacks <- function(df, verbose = FALSE) {
    # try lm with subject as fixed effect
    fit0_lm <- try(stats::lm(entropy ~ q + group + subject, data = df), silent = TRUE)
    fit1_lm <- try(stats::lm(entropy ~ q * group + subject, data = df), silent = TRUE)
    if (!inherits(fit0_lm, "try-error") && !inherits(fit1_lm, "try-error")) {
        return(list(fit0 = fit0_lm, fit1 = fit1_lm, method = "lm_subject"))
    }
    # last resort: drop subject
    fit0_lm2 <- try(stats::lm(entropy ~ q + group, data = df), silent = TRUE)
    fit1_lm2 <- try(stats::lm(entropy ~ q * group, data = df), silent = TRUE)
    if (!inherits(fit0_lm2, "try-error") && !inherits(fit1_lm2, "try-error")) {
        return(list(fit0 = fit0_lm2, fit1 = fit1_lm2, method = "lm_nosubject"))
    }
    # indicate failure
    return(NULL)
}

# Extract LRT p-value from two fitted models (lm or lmer). Returns NA_real_ on
# failure.
.tsenat_extract_lrt_p <- function(fit0, fit1) {
    an <- try(stats::anova(fit0, fit1), silent = TRUE)
    if (inherits(an, "try-error") || nrow(an) < 2) {
        return(NA_real_)
    }
    pcol <- grep("Pr\\(>F\\)|Pr\\(>Chisq\\)|Pr\\(>Chi\\)", colnames(an), value = TRUE)
    if (length(pcol) == 0) {
        return(as.numeric(an[2, ncol(an)]))
    }
    return(as.numeric(an[2, pcol[1]]))
}

# Extract Satterthwaite p-value for q:group interaction from lmer/lmerTest or
# fallback lm.
.tsenat_extract_satterthwaite_p <- function(fit1, fallback_lm = NULL) {
    # prefer lmerTest when available
    if (!is.null(fallback_lm)) {
        coefs <- summary(fallback_lm$fit1)$coefficients
        ia_idx <- grep("^q:group", rownames(coefs))
        if (length(ia_idx) > 0) {
            return(coefs[ia_idx[1], "Pr(>|t|)"])
        }
        return(NA_real_)
    }
    if (requireNamespace("lmerTest", quietly = TRUE) && inherits(fit1, "lmerMod")) {
        fit_lt <- try(lmerTest::lmer(as.formula(formula(fit1)), data = model.frame(fit1),
            REML = FALSE), silent = TRUE)
        if (!inherits(fit_lt, "try-error")) {
            coefs <- summary(fit_lt)$coefficients
            ia_idx <- grep("^q:group", rownames(coefs))
            if (length(ia_idx) > 0) {
                return(coefs[ia_idx[1], "Pr(>|t|)"])
            }
        }
    }
    return(NA_real_)
}

# Helper for FPCA-style preprocessing used in calculate_lm_interaction fpca
# method.  Builds curve_mat, filters good rows, imputes column means, and
# returns list(mat_sub, used_samples)
.tsenat_prepare_fpca_matrix <- function(mat, sample_names, q_vals, min_obs = 10) {
    uq <- sort(unique(q_vals))
    samples_u <- unique(sample_names)
    curve_mat <- matrix(NA_real_, nrow = length(samples_u), ncol = length(uq))
    if (length(samples_u) > 0) {
        rownames(curve_mat) <- samples_u
    }
    for (i in seq_along(sample_names)) {
        s <- sample_names[i]
        qv <- q_vals[i]
        qi <- match(qv, uq)
        if (is.na(qi)) {
            next
        }
        if (s %in% rownames(curve_mat)) {
            curve_mat[s, qi] <- as.numeric(mat[, i])
        }
    }
    # keep samples with at least half of q points present
    good_rows <- which(rowSums(!is.na(curve_mat)) >= max(2, ceiling(ncol(curve_mat)/2)))
    if (length(good_rows) < min_obs) {
        return(NULL)
    }
    mat_sub <- curve_mat[good_rows, , drop = FALSE]
    col_means <- apply(mat_sub, 2, function(col) mean(col, na.rm = TRUE))
    for (r in seq_len(nrow(mat_sub))) mat_sub[r, is.na(mat_sub[r, ])] <- col_means[is.na(mat_sub[r,
        ])]
    list(mat_sub = mat_sub, used_samples = rownames(mat_sub))
}

# Helper utilities for calculate_difference

.tsenat_calculate_difference_partition <- function(df, samples, control, method,
    test, pcorr, randomizations, verbose) {
    if (ncol(df) - 1 != length(samples)) {
        stop("Column count doesn't match length(samples).", call. = FALSE)
    }
    uniq_groups <- unique(as.character(samples))
    if (length(uniq_groups) > 2) {
        stop("More than two conditions; provide exactly two.", call. = FALSE)
    }
    if (length(uniq_groups) < 2) {
        stop("Fewer than two conditions; provide exactly two.", call. = FALSE)
    }
    if (!(control %in% uniq_groups)) {
        stop("Control sample type not found in samples.", call. = FALSE)
    }

    case_label <- setdiff(uniq_groups, control)
    groups <- c(case_label, control)

    if (!(method %in% c("mean", "median"))) {
        stop("Invalid method; see ?calculate_difference.", call. = FALSE)
    }
    if (!(test %in% c("wilcoxon", "shuffle"))) {
        stop("Invalid test method; see ?calculate_difference.", call. = FALSE)
    }
    valid_pcorr <- c("holm", "hochberg", "hommel", "bonferroni", "BH", "BY", "fdr",
        "none")
    if (!(pcorr %in% valid_pcorr)) {
        stop("Invalid p-value correction; see ?calculate_difference.", call. = FALSE)
    }

    tab <- table(samples)
    if (test == "wilcoxon") {
        if (randomizations != 100 && verbose) {
            message("'randomizations' ignored for wilcoxon.")
        }
        if (any(tab < 3) || sum(tab) < 8) {
            warning("Low sample size for wilcoxon.", call. = FALSE)
        }
    }
    if (test == "shuffle") {
        if (sum(tab) <= 5) {
            warning("Low sample size for label shuffling.", call. = FALSE)
        }
        if (sum(tab) > 5 && sum(tab) < 10) {
            warning("Label shuffling may be unreliable.", call. = FALSE)
        }
    }

    idx_case <- which(samples == groups[1])
    idx_control <- which(samples == groups[2])
    idx1 <- idx_case
    idx2 <- idx_control

    df$cond_1 <- rowSums(!is.na(df[, idx1 + 1, drop = FALSE]))
    df$cond_2 <- rowSums(!is.na(df[, idx2 + 1, drop = FALSE]))

    if (test == "wilcoxon") {
        keep_mask <- (df$cond_1 >= 3 & df$cond_2 >= 3 & (df$cond_1 + df$cond_2) >=
            8)
    } else {
        keep_mask <- (df$cond_1 + df$cond_2) >= 5
    }

    df_keep <- df[keep_mask, , drop = FALSE]
    df_small <- df[!keep_mask, , drop = FALSE]

    list(df = df, samples = samples, groups = groups, idx1 = idx1, idx2 = idx2, df_keep = df_keep,
        df_small = df_small)
}

# Helpers for calculate_fc
.tsenat_aggregate_fc_values <- function(x, samples, method, control) {
    if (method == "mean") {
        value <- aggregate(t(x), by = list(samples), mean, na.rm = TRUE)
    } else if (method == "median") {
        value <- aggregate(t(x), by = list(samples), median, na.rm = TRUE)
    } else {
        stop("Invalid method; must be 'mean' or 'median'")
    }

    sorted <- value[value$Group.1 != control, ]
    sorted[2, ] <- value[value$Group.1 == control, ]
    value <- t(sorted[, -1])
    value[is.na(value[, 1]), c(1)] <- NA
    value[is.na(value[, 2]), c(2)] <- NA
    return(list(value = value, sorted = sorted))
}

.tsenat_apply_pseudocount <- function(value, pseudocount) {
    if (!is.numeric(pseudocount) || length(pseudocount) != 1) {
        pseudocount <- 0
    }
    if (pseudocount <= 0) {
        pos_vals <- value[!is.na(value) & value > 0]
        if (length(pos_vals) > 0) {
            pc <- min(pos_vals, na.rm = TRUE)/2
        } else {
            pc <- 1e-06
        }
    } else {
        pc <- pseudocount
    }
    replace_idx <- !is.na(value) & value <= 0
    value[replace_idx] <- pc
    return(value)
}

# Paired permutation helpers
.tsenat_permute_paired <- function(x, samples, control, method, randomizations, paired_method) {
    ncols <- ncol(x)
    if (ncols%%2 != 0) {
        stop("Paired permutation requires an even number of samples and paired column ordering",
            call. = FALSE)
    }
    npairs <- ncols/2
    if (paired_method == "swap") {
        perm_mat <- matrix(NA_real_, nrow = nrow(x), ncol = randomizations)
        for (r in seq_len(randomizations)) {
            swap <- sample(c(TRUE, FALSE), size = npairs, replace = TRUE)
            perm_samples <- samples
            for (p in seq_len(npairs)) {
                if (swap[p]) {
                  i1 <- (p - 1) * 2 + 1
                  i2 <- i1 + 1
                  perm_samples[c(i1, i2)] <- perm_samples[c(i2, i1)]
                }
            }
            df_perm <- calculate_fc(x, perm_samples, control, method)
            perm_mat[, r] <- as.numeric(df_perm[, 4])
        }
        return(perm_mat)
    } else if (paired_method == "signflip") {
        total_comb <- 2^npairs
        if (randomizations <= 0 || randomizations >= total_comb) {
            combos <- expand.grid(rep(list(c(0, 1)), npairs))
            nrep <- nrow(combos)
            perm_mat <- matrix(NA_real_, nrow = nrow(x), ncol = nrep)
            for (r in seq_len(nrep)) {
                swap <- as.logical(as.integer(combos[r, ]))
                perm_samples <- samples
                for (p in seq_len(npairs)) {
                  if (swap[p]) {
                    i1 <- (p - 1) * 2 + 1
                    i2 <- i1 + 1
                    perm_samples[c(i1, i2)] <- perm_samples[c(i2, i1)]
                  }
                }
                df_perm <- calculate_fc(x, perm_samples, control, method)
                perm_mat[, r] <- as.numeric(df_perm[, 4])
            }
            return(perm_mat)
        } else {
            perm_mat <- matrix(NA_real_, nrow = nrow(x), ncol = randomizations)
            for (r in seq_len(randomizations)) {
                swap <- sample(c(TRUE, FALSE), size = npairs, replace = TRUE)
                perm_samples <- samples
                for (p in seq_len(npairs)) {
                  if (swap[p]) {
                    i1 <- (p - 1) * 2 + 1
                    i2 <- i1 + 1
                    perm_samples[c(i1, i2)] <- perm_samples[c(i2, i1)]
                  }
                }
                df_perm <- calculate_fc(x, perm_samples, control, method)
                perm_mat[, r] <- as.numeric(df_perm[, 4])
            }
            return(perm_mat)
        }
    }
}

# Helpers for plot_top_transcripts internals

.ptt_select_genes_from_res <- function(res, top_n) {
    if (is.null(res)) {
        stop("Either 'gene' or 'res' must be provided")
    }
    if (!("genes" %in% colnames(res))) {
        stop("Provided 'res' must contain a 'genes' column")
    }
    if ("adjusted_p_values" %in% colnames(res)) {
        ord <- order(res$adjusted_p_values, na.last = NA)
    } else if ("raw_p_values" %in% colnames(res)) {
        ord <- order(res$raw_p_values, na.last = NA)
    } else {
        ord <- seq_len(nrow(res))
    }
    genes_sel <- as.character(res$genes[ord])
    genes_sel <- unique(genes_sel)
    head(genes_sel, top_n)
}

.ptt_infer_samples_from_coldata <- function(coldata, counts, sample_type_col) {
    if (is.character(coldata) && length(coldata) == 1) {
        if (!file.exists(coldata)) {
            stop("coldata file not found: ", coldata)
        }
        cdf <- utils::read.delim(coldata, header = TRUE, stringsAsFactors = FALSE)
    } else if (is.data.frame(coldata)) {
        cdf <- coldata
    } else {
        stop("`coldata` must be a data.frame or path to a tab-delimited file")
    }

    if (!is.null(rownames(cdf)) && all(colnames(counts) %in% rownames(cdf))) {
        as.character(cdf[colnames(counts), sample_type_col])
    } else {
        sample_id_cols <- c("sample", "Sample", "sample_id", "id")
        sid <- intersect(sample_id_cols, colnames(cdf))
        if (length(sid) > 0) {
            sid <- sid[1]
            if (!all(colnames(counts) %in% as.character(cdf[[sid]]))) {
                stop("coldata sample id column does not match column names of counts")
            }
            row_ix <- match(colnames(counts), as.character(cdf[[sid]]))
            as.character(cdf[[sample_type_col]][row_ix])
        } else {
            stop("Could not match `coldata` rows to `counts` columns. Provide `samples` or a row-named `coldata`.")
        }
    }
}

.ptt_read_tx2gene <- function(tx2gene) {
    if (is.null(tx2gene)) {
        stop("`tx2gene` must be provided as a file path or data.frame (or include mapping in metadata of provided SummarizedExperiment)")
    }
    if (is.character(tx2gene) && length(tx2gene) == 1) {
        if (!file.exists(tx2gene)) {
            stop("tx2gene file not found: ", tx2gene)
        }
        mapping <- utils::read.delim(tx2gene, stringsAsFactors = FALSE, header = TRUE)
    } else if (is.data.frame(tx2gene)) {
        mapping <- tx2gene
    } else {
        stop("`tx2gene` must be provided as a file path or data.frame (or include mapping in metadata of provided SummarizedExperiment)")
    }
    if (!all(c("Transcript", "Gen") %in% colnames(mapping))) {
        stop("tx2gene must have columns 'Transcript' and 'Gen'")
    }
    mapping
}

.ptt_make_agg <- function(metric = c("median", "mean", "variance", "iqr")) {
    metric_choice <- match.arg(metric)
    agg_fun <- switch(metric_choice, median = function(x) stats::median(x, na.rm = TRUE),
        mean = function(x) base::mean(x, na.rm = TRUE), variance = function(x) {
            stats::var(x, na.rm = TRUE)
        }, iqr = function(x) stats::IQR(x, na.rm = TRUE))
    agg_label_metric <- if (metric_choice == "iqr") {
        "IQR"
    } else {
        metric_choice
    }
    agg_label <- sprintf("Transcript-level expression with metric %s", agg_label_metric)
    .cnt <- as.integer(getOption("TSENAT.plot_top_counter", 0)) + 1L
    options(TSENAT.plot_top_counter = .cnt)
    agg_label_unique <- agg_label
    list(metric_choice = metric_choice, agg_fun = agg_fun, agg_label_unique = agg_label_unique)
}

.ptt_build_tx_long <- function(gene_single, mapping, counts, samples, top_n) {
    txs <- mapping$Transcript[mapping$Gen == gene_single]
    txs <- intersect(txs, rownames(counts))
    if (length(txs) == 0) {
        stop("No transcripts found for gene: ", gene_single)
    }
    if (!is.null(top_n)) {
        txs <- head(txs, top_n)
    }
    mat <- counts[txs, , drop = FALSE]
    df_all <- as.data.frame(mat)
    df_all$tx <- rownames(mat)
    df_long <- tidyr::pivot_longer(df_all, -tx, names_to = "sample", values_to = "expr")
    df_long$group <- rep(samples, times = length(txs))
    list(df_long = df_long, txs = txs)
}

.ptt_aggregate_df_long <- function(df_long, agg_fun, pseudocount) {
    df_summary <- stats::aggregate(expr ~ tx + group, data = df_long, FUN = agg_fun)
    df_summary$log2expr <- log2(df_summary$expr + pseudocount)
    df_summary$tx <- factor(df_summary$tx, levels = unique(df_summary$tx))
    df_summary
}

.ptt_build_plot_from_summary <- function(df_summary, agg_label_unique, fill_limits = NULL) {
    p <- ggplot2::ggplot(df_summary, ggplot2::aes(x = group, y = tx, fill = log2expr)) +
        ggplot2::geom_tile(color = "white", width = 0.95, height = 0.95) + ggplot2::scale_fill_viridis_c(option = "viridis",
        direction = -1, na.value = "grey80", limits = fill_limits) + ggplot2::theme_minimal(base_size = 14) +
        ggplot2::labs(title = agg_label_unique, x = NULL, y = NULL, fill = "log2(expr)") +
        ggplot2::theme(axis.text.y = ggplot2::element_text(size = 12), axis.text.x = ggplot2::element_text(size = 12),
            plot.title = ggplot2::element_text(size = 16, hjust = 0.6), legend.position = "bottom",
            legend.key.width = ggplot2::unit(1.2, "cm"), plot.margin = ggplot2::margin(4,
                4, 4, 4)) + ggplot2::guides(fill = ggplot2::guide_colorbar(title.position = "top",
        barwidth = 6, barheight = 0.35))
    p
}

.ptt_combine_patchwork <- function(plots, agg_label_unique) {
    combined <- Reduce(`+`, plots) + patchwork::plot_layout(nrow = 1, guides = "collect") &
        ggplot2::theme(legend.position = "bottom")
    combined <- combined + patchwork::plot_annotation(title = agg_label_unique, theme = ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.6,
        size = 16, margin = ggplot2::margin(b = 10))))
    combined
}

.ptt_combine_cowplot <- function(plots, output_file = NULL, agg_label_unique) {
    p_for_legend <- plots[[1]] + ggplot2::theme(legend.position = "bottom")
    legend <- cowplot::get_legend(p_for_legend)
    plots_nolegend <- lapply(plots, function(pp) pp + ggplot2::theme(legend.position = "none"))
    grid <- cowplot::plot_grid(plotlist = plots_nolegend, nrow = 1, align = "hv")
    title_grob <- cowplot::ggdraw() + cowplot::draw_label(agg_label_unique, fontface = "plain",
        x = 0.6, hjust = 0.5, size = 16)
    result_plot <- cowplot::plot_grid(title_grob, grid, legend, ncol = 1, rel_heights = c(0.08,
        1, 0.08))
    if (!is.null(output_file)) {
        ggplot2::ggsave(output_file, result_plot)
        invisible(NULL)
    }
    result_plot
}

.ptt_combine_grid <- function(plots, output_file = NULL, agg_label_unique) {
    plots_nolegend <- lapply(plots, function(pp) pp + ggplot2::theme(legend.position = "none"))
    grobs <- lapply(plots_nolegend, ggplot2::ggplotGrob)
    g_full <- ggplot2::ggplotGrob(plots[[1]])
    legend_idx <- which(vapply(g_full$grobs, function(x) x$name, character(1)) ==
        "guide-box")
    if (length(legend_idx)) {
        legend_grob <- g_full$grobs[[legend_idx[1]]]
    } else {
        legend_grob <- NULL
    }
    n <- length(grobs)
    heights <- grid::unit.c(grid::unit(0.6, "cm"), grid::unit(1, "null"), grid::unit(0.7,
        "cm"))
    if (!is.null(output_file)) {
        png(filename = output_file, width = 800 * n, height = 480, res = 150)
        .draw_transcript_grid(grobs, agg_label_unique, legend_grob, n, heights, to_file = output_file)
        invisible(NULL)
    } else {
        .draw_transcript_grid(grobs, agg_label_unique, legend_grob, n, heights)
        invisible(NULL)
    }
}

# # Minimalized diversity functions: only Tsallis entropy retained

#' Calculate Tsallis entropy for a vector of transcript-level
#' expression values of one gene.
#'
#' @param x Vector of (non-negative) expression values.
#' @param q Tsallis entropy parameter (q > 0). Scalar or numeric vector
#' (default: 2).
#' @param norm Logical; if TRUE, normalize entropy by its theoretical maximum
#' (values in [0,1]).
#' @param what Which quantity to return: 'S' (Tsallis entropy), 'D' (Hill
#' numbers), or 'both'.
#' @param log_base Base of the logarithm used for Shannon limits and
#' normalization
#' (default: \code{exp(1)}).
#' @export
#' @return For `what = 'S'` or `what = 'D'`: a numeric vector
#' (named when length(q) > 1). For `what = 'both'`: a list with
#' components `$S` and `$D`.
#' @details
#' Implements S_q = (1 - sum p^q)/(q - 1) and D_q = (sum p^q)^(1/(1-q)).
#' Uses the q->1 limits (Shannon entropy and its exponential). Natural
#' logarithms are used for the q->1 limit and for normalization.
#' @examples
#' x <- c(10, 5, 0)
#' calculate_tsallis_entropy(x, q = c(0.5, 1, 2), norm = TRUE)
calculate_tsallis_entropy <- function(x, q = 2, norm = TRUE, what = c("S", "D", "both"),
    log_base = exp(1)) {
    what <- match.arg(what)
    if (!is.numeric(q)) {
        stop("q must be numeric.")
    }
    if (any(q <= 0)) {
        stop("q must be greater than 0.")
    }
    if (!is.numeric(x)) {
        stop("x must be numeric")
    }

    n <- length(x)
    # If all counts sum to zero, return NA; allow single-element vectors to
    # proceed
    if (sum(x, na.rm = TRUE) <= 0) {
        if (what == "both") {
            return(list(S = rep(NA_real_, length(q)), D = rep(NA_real_, length(q))))
        }
        return(rep(NA_real_, length(q)))
    }

    p <- x/sum(x)

    tol <- sqrt(.Machine$double.eps)
    S_vec <- .tsenat_calc_S(p = p, q = q, tol = tol, n = n, log_base = log_base,
        norm = norm)
    D_vec <- .tsenat_calc_D(p = p, q = q, tol = tol, log_base = log_base)

    if (what == "S") {
        out <- S_vec
        if (length(q) > 1) {
            names(out) <- paste0("q=", q)
        }
        if (length(q) == 1) {
            return(unname(out))
        }
        return(out)
    }
    if (what == "D") {
        out <- D_vec
        if (length(q) > 1) {
            names(out) <- paste0("q=", q)
        }
        if (length(q) == 1) {
            return(unname(out))
        }
        return(out)
    }
    # both
    names(S_vec) <- paste0("q=", q)
    names(D_vec) <- paste0("q=", q)
    return(list(S = S_vec, D = D_vec))
}

## Helper: Extract tx2gene mapping from GFF3 file
#' @title Extract Transcript-to-Gene Mapping from GFF3 File
#' @description Internal function that parses GFF3 or GFF3.gz files to extract
#' transcript-to-gene mappings. This function is called internally by \code{build_se()}
#' when a GFF3 file path is provided. Users should not call this function directly.
#' @param gff3_file Path to a GFF3 or GFF3.gz file containing transcript/mRNA
#'   features with ID and Parent attributes.
#' @return A data.frame with two columns (Transcript, Gene) containing the
#'   transcript-to-gene mapping extracted from the GFF3 file.
#' @details
#' The function expects GFF3 format with the following structure:
#' \itemize{
#'   \item 9 tab-separated columns: seqname, source, feature, start, end,
#'         score, strand, phase, attributes
#'   \item Feature type column (3rd column) should contain 'transcript' or 'mRNA'
#'   \item Attributes column (9th column) should contain ID and Parent fields
#'   \item ID field: unique identifier for the transcript
#'   \item Parent field: references the gene ID that this transcript belongs to
#' }
#' Example GFF3 line:
#' \preformatted{chr1\tgencode\ttranscript\t1000\t3000\t.\t+\t.\t
#' ID=ENST00000001;Parent=ENSG00000101;Name=BRCA1-001}
#'
#' \strong{Performance:} This function is optimized for large GFF3 files:
#' \itemize{
#'   \item Reads files in 10,000-line chunks (not line-by-line)
#'   \item Uses fast pre-filtering (feature type check before regex)
#'   \item Employs efficient string operations instead of heavy regex on every line
#'   \item Handles both compressed (.gz) and uncompressed files seamlessly
#' }
#' @examples
#' \dontrun{
#' # Create a temporary GFF3 file with transcript features
#' gff3_lines <- c(
#'   'chr1\tgencode\ttranscript\t1000\t3000\t.\t+\t.\tID=ENST001;Parent=ENSG001',
#'   'chr1\tgencode\ttranscript\t1500\t3500\t.\t+\t.\tID=ENST002;Parent=ENSG001',
#'   'chr2\tgencode\ttranscript\t5000\t8000\t.\t-\t.\tID=ENST003;Parent=ENSG002'
#' )
#' tf <- tempfile(fileext = '.gff3')
#' writeLines(gff3_lines, tf)
#' 
#' # Extract transcript-to-gene mapping
#' tx2gene <- extract_tx2gene_from_gff3(tf)
#' head(tx2gene)
#' }
extract_tx2gene_from_gff3 <- function(gff3_file) {
    # Handle both .gff3 and .gff3.gz files
    if (grepl("\\.gff3\\.gz$", gff3_file)) {
        con <- gzfile(gff3_file, "rt")
    } else {
        con <- file(gff3_file, "r")
    }

    on.exit(close(con))

    # Pre-allocate vectors for efficiency (start with capacity for 10k
    # transcripts) This avoids repeatedly growing lists/data.frames which is
    # slow
    tx2gene_transcripts <- character(10000)
    tx2gene_genes <- character(10000)
    idx <- 0
    chunk_size <- 10000  # Read in chunks for better performance

    while (TRUE) {
        # Read lines in chunks instead of one-by-one for better performance
        lines <- readLines(con, n = chunk_size)
        if (length(lines) == 0)
            break

        # Process each line in the chunk
        for (line in lines) {
            # Skip comments and empty lines (fast pre-filter)
            if (startsWith(line, "#") || line == "")
                next

            # Parse GFF3 line format: seqname source feature start end score
            # strand phase attributes Use strsplit only once per line
            fields <- strsplit(line, "\t", fixed = TRUE)[[1]]
            if (length(fields) < 9)
                next

            # Extract feature type (3rd column) - do this check first to skip
            # early
            feature_type <- fields[3]
            if (!feature_type %in% c("transcript", "mRNA"))
                next

            # Only now extract attributes from the (potentially long) 9th
            # column
            attributes <- fields[9]

            # Extract ID and Parent using efficient substring operations Find
            # positions of ID= and Parent= patterns
            id_start <- regexpr("ID=", attributes, fixed = TRUE) + 3
            if (id_start > 3) {
                # ID found, extract until semicolon or end of string
                id_end <- regexpr(";", substr(attributes, id_start, nchar(attributes)),
                  fixed = TRUE)
                if (id_end > 0) {
                  transcript_id <- substr(attributes, id_start, id_start + id_end -
                    2)
                } else {
                  transcript_id <- substr(attributes, id_start, nchar(attributes))
                }
            } else {
                transcript_id <- NA_character_
            }

            parent_start <- regexpr("Parent=", attributes, fixed = TRUE) + 7
            if (parent_start > 7) {
                # Parent found, extract until semicolon or end of string
                parent_end <- regexpr(";", substr(attributes, parent_start, nchar(attributes)),
                  fixed = TRUE)
                if (parent_end > 0) {
                  gene_id <- substr(attributes, parent_start, parent_start + parent_end -
                    2)
                } else {
                  gene_id <- substr(attributes, parent_start, nchar(attributes))
                }
            } else {
                gene_id <- NA_character_
            }

            # Store mapping if both IDs are present
            if (!is.na(transcript_id) && !is.na(gene_id)) {
                idx <- idx + 1
                # Grow vectors if needed
                if (idx > length(tx2gene_transcripts)) {
                  tx2gene_transcripts <- c(tx2gene_transcripts, rep(NA_character_,
                    10000))
                  tx2gene_genes <- c(tx2gene_genes, rep(NA_character_, 10000))
                }
                tx2gene_transcripts[idx] <- transcript_id
                tx2gene_genes[idx] <- gene_id
            }
        }
    }

    # Trim to actual size and create data frame (empty if no mappings found)
    if (idx == 0) {
        tx2gene_df <- data.frame(Transcript = character(0), Gene = character(0),
            stringsAsFactors = FALSE)
    } else {
        tx2gene_df <- data.frame(Transcript = tx2gene_transcripts[seq_len(idx)],
            Gene = tx2gene_genes[seq_len(idx)], stringsAsFactors = FALSE)
    }
    rownames(tx2gene_df) <- NULL

    return(tx2gene_df)
}

## Helper: build SummarizedExperiment from readcounts + tx2gene
#' Build a SummarizedExperiment from transcript readcounts and tx->gene map
#'
#' This helper creates a `SummarizedExperiment` with an assay named
#' `counts`, stores the `tx2gene` table and the raw `readcounts` in
#' `metadata()`, and extracts gene assignments from the tx2gene mapping
#' to populate `rowData(se)$genes`.
#'
#' The function accepts three different input types for the tx2gene mapping:
#' a file path (TSV or GFF3 format) or an in-memory data.frame.
#'
#' @param readcounts Numeric matrix or data.frame of transcript-level counts
#'   (rows = transcripts, columns = samples). Rownames should contain
#'   transcript IDs that match the first column of tx2gene.
#'
#' @param tx2gene Transcript-to-gene mapping. Can be one of:
#'   \describe{
#'     \item{\strong{TSV File Path}}{A path to a tab-separated file with at least
#'       two columns: Transcript ID (first column) and Gene ID (second column).
#'       The file should have a header row with column names.
#'       File extension should be .tsv, .txt, or similar.
#'       Example:
#'       \preformatted{Transcript\tGene
#'       ENST00000001\tENSG00000101
#'       ENST00000002\tENSG00000102}
#'     }
#'     \item{\strong{GFF3 File Path}}{A path to a GFF3 annotation file (.gff3 or
#'       .gff3.gz format). The file should contain transcript/mRNA features with
#'       ID and Parent attributes. Parent attributes reference gene IDs.
#'       Example GFF3 line:
#'       \preformatted{chr1\tgencode\ttranscript\t100\t2100\t.\t+\t.\t
#'       ID=ENST00000001;Parent=ENSG00000101}
#'       The function automatically detects GFF3 format by file extension
#'       (.gff3 or .gff3.gz) and parses accordingly.
#'     }
#'     \item{\strong{Data Frame}}{An in-memory data.frame with Transcript
#'       and Gene columns. Useful when mapping data is already loaded in R.
#'       Example:
#'       \preformatted{  Transcript        Gene
#'       1 ENST00000001 ENSG00000101
#'       2 ENST00000002 ENSG00000102}
#'     }
#'   }
#'
#' @param assay_name Name for the assay to store readcounts (default: 'counts').
#'
#' @return A `SummarizedExperiment` with assay, `metadata()$tx2gene`,
#'   `metadata()$readcounts` and `rowData(se)$genes` populated.
#'
#' @details
#' \strong{Input Format Detection:}
#' \itemize{
#'   \item If tx2gene is a character string ending in .gff3 or .gff3.gz,
#'         it is parsed as a GFF3 file.
#'   \item If tx2gene is a character string with any other extension or
#'         no extension, it is parsed as a tab-separated file.
#'   \item If tx2gene is a data.frame, it is used directly.
#' }
#'
#' \strong{Transcript Matching:}
#' All transcript IDs in the readcounts object (rownames) must have a
#' corresponding entry in the tx2gene mapping. If any transcript is missing,
#' an error is raised.
#'
#' \strong{Performance:}
#' \itemize{
#'   \item GFF3 files are processed efficiently even for large annotations
#'         (e.g., full GENCODE with 100k+ transcripts)
#'   \item TSV files are standard tab-separated format for fast parsing
#'   \item Data.frame inputs have no I/O overhead
#' }
#'
#' @export
#'
#' @examples
#' # Example 1: Using data.frame (in-memory mapping)
#' tx2gene <- data.frame(
#'     Transcript = c('ENST00000001', 'ENST00000002'),
#'     Gene = c('ENSG00000101', 'ENSG00000102')
#' )
#' readcounts <- matrix(c(10, 5, 2, 3),
#'     nrow = 2,
#'     dimnames = list(c('ENST00000001', 'ENST00000002'), c('s1', 's2'))
#' )
#' se <- build_se(readcounts, tx2gene)
#'
#' # Example 2: Using TSV file path
#' # Assuming you have a file 'tx2gene.tsv' with Transcript and Gene columns
#' # se <- build_se(readcounts, 'path/to/tx2gene.tsv')
#'
#' # Example 3: Using GFF3.gz file path
#' # Assuming you have a file 'annotation.gff3.gz' with transcript features
#' # se <- build_se(readcounts, 'path/to/annotation.gff3.gz')
build_se <- function(readcounts, tx2gene, assay_name = "counts") {
    if (is.character(tx2gene) && length(tx2gene) == 1) {
        if (!file.exists(tx2gene)) {
            stop("tx2gene file not found: ", tx2gene, call. = FALSE)
        }

        # Detect file type and parse accordingly
        if (grepl("\\.gff3(\\.gz)?$", tx2gene, ignore.case = TRUE)) {
            message("Detected GFF3 format. Extracting transcript-to-gene mapping...")
            tx2gene_df <- extract_tx2gene_from_gff3(tx2gene)
        } else {
            # Assume TSV format (backward compatible)
            tx2gene_df <- utils::read.table(tx2gene, header = TRUE, sep = "\t", stringsAsFactors = FALSE)
        }
    } else if (is.data.frame(tx2gene)) {
        tx2gene_df <- tx2gene
    } else {
        stop("'tx2gene' must be a path (TSV or GFF3) or a data.frame.", call. = FALSE)
    }

    if (is.data.frame(readcounts)) {
        readcounts <- as.matrix(readcounts)
    }
    if (!is.matrix(readcounts) || !is.numeric(readcounts)) {
        stop("'readcounts' must be a numeric matrix or numeric data.frame.", call. = FALSE)
    }

    # Extract transcript IDs from rownames or use row indices
    tx_ids <- rownames(readcounts)
    if (is.null(tx_ids)) {
        stop("'readcounts' must have transcript IDs as rownames.", call. = FALSE)
    }

    # Extract gene names from tx2gene based on transcript IDs
    tx_col <- if ("Transcript" %in% colnames(tx2gene_df))
        "Transcript" else colnames(tx2gene_df)[1]
    genes <- tx2gene_df$Gene[match(tx_ids, tx2gene_df[[tx_col]])]

    if (any(is.na(genes))) {
        stop("Some transcript IDs in readcounts were not found in tx2gene mapping.",
            call. = FALSE)
    }

    assays_list <- S4Vectors::SimpleList()
    assays_list[[assay_name]] <- readcounts

    se <- SummarizedExperiment::SummarizedExperiment(assays = assays_list)
    S4Vectors::metadata(se)$tx2gene <- tx2gene_df
    S4Vectors::metadata(se)$readcounts <- readcounts

    # Populate rowData with gene assignments and transcript IDs
    SummarizedExperiment::rowData(se) <- S4Vectors::DataFrame(genes = genes, row.names = tx_ids)

    return(se)
}

#' Calculate Tsallis diversity per gene across samples
#'
#' @param x A numeric matrix or data.frame of transcript-level expression
#' values (rows = transcripts, columns = samples), or a SummarizedExperiment-
#' like object.
#' @param tpm Logical. If TRUE and `x` is a tximport-style list, use the
#' `$abundance` matrix instead of `$counts`.
#' @param genes Character vector assigning each transcript (row) to a gene.
#' Must have length equal to nrow(x) or the number of transcripts in `x`.
#' @param norm Logical; if TRUE, normalize Tsallis entropy to [0,1] per gene.
#' @param assayno Integer assay index to use when `x` is a SummarizedExperiment.
#' @param verbose Logical; print diagnostic messages when TRUE.
#' @param q Numeric scalar or vector of Tsallis q values to evaluate (q > 0).
#' If length(q) > 1, the result will contain separate columns per sample and
#' q.
#' @param what Which quantity to return: 'S' for Tsallis entropy or 'D' for Hill
#' numbers.
#' @param nthreads Number of threads for parallel processing (default: 1).
#' Set to > 1 to parallelize per-gene entropy calculations.
#' @return A \link[SummarizedExperiment]{SummarizedExperiment} with assay
#' `diversity` containing per-gene diversity values.
#' @import methods
#' @importFrom SummarizedExperiment SummarizedExperiment assays assay rowData
#' colData
#' @export
#' @examples
#' data('tcga_brca_luma', package = 'TSENAT')
#' rc <- as.matrix(tcga_brca_luma[1:20, -1, drop = FALSE])
#' gs <- tcga_brca_luma[1:20, 1]
#' se <- calculate_diversity(rc, gs, q = 0.1, norm = TRUE)
#' SummarizedExperiment::assay(se)[1:3, 1:3]
calculate_diversity <- function(x, genes = NULL, norm = TRUE, tpm = FALSE, assayno = 1,
    verbose = FALSE, q = 2, what = c("S", "D"), nthreads = 1) {
    # Normalize and validate input data, extract matrix and gene mapping
    inp <- .tsenat_prepare_diversity_input(x = x, genes = genes, tpm = tpm, assayno = assayno,
        verbose = verbose)
    x <- inp$x
    genes <- inp$genes
    se_assay_mat <- inp$se_assay_mat

    if (!is.numeric(x)) {
        stop("Input data  must be numeric!", call. = FALSE)
    }

    if (any(is.na(x))) {
        stop("The data contains NA as expression values. NAs are not allowed", " in the input.",
            call. = FALSE)
    }

    if (nrow(x) != length(genes)) {
        stop("The number of rows is not equal to the given gene set.", call. = FALSE)
    }

    what <- match.arg(what)
    # validate q values (Tsallis parameter must be > 0)
    if (!is.numeric(q) || any(q <= 0)) {
        stop("Argument 'q' must be numeric and greater than 0.", call. = FALSE)
    }
    # keep a copy of transcript-level counts when available (non-null)
    if (!exists("se_assay_mat")) {
        se_assay_mat <- x
    }

    result <- calculate_method(x, genes, norm, verbose = verbose, q = q, what = what,
        nthreads = nthreads)

    # Prepare assay and row/col data
    result_assay <- result[, -1, drop = FALSE]
    result_rowData <- data.frame(genes = result[, 1], row.names = result[, 1])

    if (length(q) > 1) {
        col_split <- do.call(rbind, strsplit(colnames(result)[-1], "_q="))
        col_ids <- paste(col_split[, 1], "_q=", col_split[, 2], sep = "")
        row_ids <- as.character(result[, 1])
        result_colData <- data.frame(samples = as.character(col_split[, 1]), q = as.numeric(col_split[,
            2]), row.names = col_ids, stringsAsFactors = FALSE)
        colnames(result_assay) <- col_ids
        rownames(result_assay) <- row_ids
    } else {
        col_ids <- colnames(x)
        row_ids <- as.character(result[, 1])
        result_colData <- data.frame(samples = col_ids, row.names = col_ids)
        colnames(result_assay) <- col_ids
        rownames(result_assay) <- row_ids
    }

    result_metadata <- list(method = "tsallis", norm = norm, q = q, what = what)

    # if we have preserved the original transcript-level matrix, build a
    # tx->gene mapping and make both available in metadata so downstream
    # plotting helpers can find transcript IDs and their parent genes
    tx2gene_map <- NULL
    if (exists("se_assay_mat") && !is.null(rownames(se_assay_mat)) && length(genes) ==
        nrow(se_assay_mat)) {
        tx2gene_map <- data.frame(Transcript = rownames(se_assay_mat), Gen = as.character(genes),
            stringsAsFactors = FALSE)
    }

    assays_list <- list()
    if (what == "S") {
        assays_list$diversity <- result_assay
    } else if (what == "D") {
        assays_list$hill <- result_assay
    }

    result <- SummarizedExperiment::SummarizedExperiment(assays = assays_list, rowData = result_rowData,
        colData = result_colData, metadata = c(result_metadata, list(readcounts = if (exists("se_assay_mat")) se_assay_mat else NULL,
            tx2gene = tx2gene_map)))

    return(result)
}

# Internal helpers for calculate_method

.tsenat_tsallis_row <- function(x, genes, gene, q, norm, what) {
    idx <- which(genes == gene)
    out <- unlist(lapply(seq_len(ncol(x)), function(j) {
        v <- calculate_tsallis_entropy(x[idx, j], q = q, norm = norm, what = what)
        if (length(v) == length(q) && all(is.finite(v) | is.na(v))) {
            v
        } else {
            names_vec <- paste0("q=", q)
            setNames(rep(NA_real_, length(q)), names_vec)
        }
    }))
    out
}

# Internal plot helpers

.tsenat_format_label <- function(lbl) {
    if (is.null(lbl)) {
        return(NULL)
    }
    s <- gsub("_", " ", lbl)
    s <- gsub("\\s+", " ", s)
    s <- trimws(s)
    s <- tolower(s)
    if (nchar(s) == 0) {
        return(s)
    }
    if (nchar(s) == 1) {
        return(toupper(s))
    }
    paste0(toupper(substr(s, 1, 1)), substr(s, 2, nchar(s)))
}

.tsenat_prepare_ma_plot_df <- function(df, fold_col, mean_cols, x_label, y_label) {
    # Detect x-axis values
    if (length(mean_cols) >= 2) {
        xvals <- rowMeans(df[, mean_cols[seq_len(2)], drop = FALSE], na.rm = TRUE)
        x_label <- x_label %||% paste0(mean_cols[1], " vs ", mean_cols[2])
    } else if (length(mean_cols) == 1) {
        xvals <- as.numeric(df[[mean_cols[1]]])
        x_label <- x_label %||% mean_cols[1]
    } else if ("mean" %in% colnames(df)) {
        xvals <- as.numeric(df$mean)
        x_label <- x_label %||% "Mean"
    } else {
        xvals <- seq_len(nrow(df))
        x_label <- x_label %||% "Index"
    }

    yvals <- as.numeric(df[[fold_col]])

    padj_candidates <- c("adjusted_p_values", "adj_p_value", "adj_p", "padj", "p.adjust")
    padj_col <- intersect(padj_candidates, colnames(df))
    padj_col <- if (length(padj_col)) {
        padj_col[1]
    } else {
        NULL
    }

    padj <- if (!is.null(padj_col)) {
        as.numeric(df[[padj_col]])
    } else {
        rep(1, length(yvals))
    }
    padj[is.na(padj)] <- 1

    sig_flag <- ifelse(abs(yvals) > 0 & padj < 0.05, "significant", "non-significant")

    plot_df <- data.frame(genes = df$genes, x = xvals, y = yvals, padj = padj, significant = sig_flag,
        stringsAsFactors = FALSE)

    list(plot_df = plot_df, x_label = x_label, y_label = y_label)
}

.tsenat_prepare_volcano_df <- function(diff_df, x_col = NULL, padj_col = "adjusted_p_values",
    label_thresh = 0.1, padj_thresh = 0.05, title = NULL) {
    df <- as.data.frame(diff_df)
    cn <- colnames(df)

    # Auto-detect x-axis column if not specified
    if (is.null(x_col)) {
        diff_cols <- grep("_difference$", cn, value = TRUE, ignore.case = TRUE)
        if (length(diff_cols) > 0) {
            x_col <- diff_cols[1]
        } else {
            numeric_cols <- vapply(df, is.numeric, logical(1))
            p_cols <- grep("p_value|p.value", cn, ignore.case = TRUE)
            numeric_cols[p_cols] <- FALSE
            if (any(numeric_cols)) {
                x_col <- cn[which(numeric_cols)[1]]
            } else {
                stop("Could not find suitable column for x-axis. Specify 'x_col' explicitly.")
            }
        }
    }

    # Verify columns
    if (!(x_col %in% cn)) {
        stop(sprintf("Column '%s' not found in diff_df", x_col))
    }
    if (!(padj_col %in% cn)) {
        stop(sprintf("Column '%s' not found in diff_df", padj_col))
    }

    df$xval <- as.numeric(df[[x_col]])
    df$padj <- as.numeric(df[[padj_col]])
    df$padj[is.na(df$padj)] <- 1
    df$padj[df$padj <= 0] <- .Machine$double.xmin

    df$significant <- ifelse(abs(df$xval) >= label_thresh & df$padj < padj_thresh,
        "significant", "non-significant")
    df <- df[is.finite(df$xval) & is.finite(df$padj), ]

    if (nrow(df) == 0) {
        stop("No valid points to plot")
    }

    metric_label <- if (grepl("median", x_col, ignore.case = TRUE)) {
        "Median"
    } else if (grepl("mean", x_col, ignore.case = TRUE)) {
        "Mean"
    } else {
        "Value"
    }

    title_use <- title %||% "Volcano plot: fold-change vs significance"

    x_label_formatted <- .tsenat_format_label(x_col)
    padj_label_formatted <- .tsenat_format_label(padj_col)

    list(df = df, x_col = x_col, padj_col = padj_col, x_label_formatted = x_label_formatted,
        padj_label_formatted = padj_label_formatted, title_use = title_use)
}

# # Helpers for generating diversity plots (density, violin, MA)

#' @importFrom utils head
#' @importFrom grDevices png dev.off
if (getRversion() >= "2.15.1") {
    utils::globalVariables(
        c(
            "Gene",
            "diversity",
            "sample",
            "sample_q",
            "sample_type",
            "fold",
            "significant",
            "value",
            ".",
            "group",
            "tsallis",
            "q",
            "median",
            "IQR",
            "padj_num",
            "padj_clean",
            "xval",
            "label_flag",
            "genes",
            "mean",
            # transcript plotting globals
            "tx",
            "expr",
            "tx_cond",
            "log2expr",
            # generic plotting helpers
            "x",
            "y",
            "padj"
        )
    )

    require_pkgs <- function(pkgs) {
        missing <- pkgs[!vapply(pkgs, requireNamespace, logical(1), quietly = TRUE)]
        if (length(missing)) {
            stop(
                sprintf(
                    "%s required",
                    paste(missing, collapse = ", ")
                )
            )
        }
        invisible(TRUE)
    }
}

# Small utility helpers to reduce repeated code paths when extracting
# samples, readcounts and tx->gene mappings from a
# SummarizedExperiment. Keeping these as focused helpers improves
# readability of the longer plotting functions below.
infer_samples_from_se <- function(se, samples = NULL, sample_type_col = "sample_type") {
    if (!is.null(samples)) {
        return(as.character(samples))
    }
    cd <- NULL
    try(cd <- SummarizedExperiment::colData(se), silent = TRUE)
    if (is.null(cd)) {
        return(NULL)
    }

    # Common column names to try
    candidates <- c(
        sample_type_col,
        "condition",
        "group",
        "sample_group",
        "sampleType",
        "class",
        "status",
        "phenotype"
    )
    for (nm in candidates) {
        if (nm %in% colnames(cd)) {
            return(as.character(cd[[nm]]))
        }
    }

    # Fallback: choose column with smallest >1 unique values
    uniq_counts <- vapply(cd, function(col) length(unique(na.omit(col))), integer(1))
    valid_cols <- names(uniq_counts[uniq_counts > 1])
    if (length(valid_cols) > 0) {
        bin_cols <- valid_cols[uniq_counts[valid_cols] == 2]
        pick <- if (length(bin_cols) > 0) bin_cols[1] else valid_cols[which.min(uniq_counts[valid_cols])]
        return(as.character(cd[[pick]]))
    }

    NULL
}

get_readcounts_from_se <- function(se, readcounts_arg = NULL) {
    # If user provided a readcounts object/path, accept it first
    if (!is.null(readcounts_arg)) {
        if (is.character(readcounts_arg) && length(readcounts_arg) == 1) {
            if (!file.exists(readcounts_arg)) stop("readcounts file not found: ", readcounts_arg)
            rc_df <- utils::read.delim(readcounts_arg, header = TRUE, stringsAsFactors = FALSE)
            if (!is.null(colnames(rc_df)) && ncol(rc_df) > 1) {
                counts <- as.matrix(rc_df[, -1, drop = FALSE])
                rownames(counts) <- rc_df[[1]]
            } else {
                counts <- as.matrix(rc_df)
            }
            return(counts)
        } else if (is.matrix(readcounts_arg) || is.data.frame(readcounts_arg)) {
            return(as.matrix(readcounts_arg))
        } else {
            stop("`readcounts` must be a matrix/data.frame or path to a file")
        }
    }

    md <- NULL
    try(md <- S4Vectors::metadata(se), silent = TRUE)
    if (!is.null(md) && !is.null(md$readcounts)) {
        return(as.matrix(md$readcounts))
    }

    assay_names <- SummarizedExperiment::assayNames(se)
    preferred_assays <- c("readcounts", "counts", "tx_counts", "counts_tx")
    chosen <- intersect(preferred_assays, assay_names)
    if (length(chosen) > 0) {
        return(as.matrix(SummarizedExperiment::assay(se, chosen[1])))
    }

    # fallback to first assay with a warning
    warning(
        "Using first assay from SummarizedExperiment to compute",
        " expression-based fold changes; ensure it contains",
        " transcript-level readcounts or provide metadata$readcounts"
    )
    as.matrix(SummarizedExperiment::assay(se))
}

get_tx2gene_from_se <- function(se, readcounts_mat = NULL) {
    md <- NULL
    try(md <- S4Vectors::metadata(se), silent = TRUE)
    # prefer explicit tx2gene in metadata
    if (!is.null(md) && !is.null(md$tx2gene) && is.data.frame(md$tx2gene)) {
        txmap <- md$tx2gene
        # attempt to find sensible columns
        tx_col <- if ("Transcript" %in% colnames(txmap)) "Transcript" else colnames(txmap)[1]
        gene_col <- if ("Gen" %in% colnames(txmap)) "Gen" else colnames(txmap)[2]
        return(list(
            type = "vector",
            mapping = as.character(txmap[[gene_col]][match(rownames(readcounts_mat), txmap[[tx_col]])])
        ))
    }

    # fallback: try rowData mapping
    rdata <- SummarizedExperiment::rowData(se)
    if (!is.null(rdata) && "genes" %in% colnames(rdata)) {
        genes_vec <- as.character(rdata$genes)
        if (!is.null(readcounts_mat) && length(genes_vec) == nrow(readcounts_mat)) {
            return(list(type = "vector", mapping = genes_vec))
        }
    }

    # last resort: use rownames of readcounts as gene identifiers
    if (!is.null(readcounts_mat)) {
        return(list(type = "vector", mapping = rownames(readcounts_mat)))
    }

    NULL
}

validate_control_in_samples <- function(control, samples) {
    uniq <- unique(samples)
    if (!is.null(control) && control %in% uniq) {
        return(control)
    }
    if ("Normal" %in% uniq) {
        return("Normal")
    }
    # fallback to first level and message
    chosen <- uniq[1]
    message(sprintf("`control` not found; using '%s' instead", chosen))
    chosen
}

#' Plot q-curve profile for a single gene comparing groups
#'
#' For a selected gene, plot per-sample Tsallis entropy across q values and
#' overlay per-group median +/- IQR ribbons so group-level differences are
#' easy to compare. Expects a `SummarizedExperiment` produced by
#' `calculate_diversity()` with `_q=` suffixes in column names.
#'
#' @param se A `SummarizedExperiment` from `calculate_diversity()`.
#' @param gene Character scalar or vector; gene symbol(s) to plot. If NULL and
#'   `lm_res` is supplied, the top `n_top` genes from `lm_res` (by
#'   `adj_p_interaction` or `p_interaction`) are used.
#' @param lm_res Optional data.frame result from `calculate_lm_interaction()`.
#'   When supplied and `gene` is NULL, the top `n_top` significant genes will
#'   be plotted.
#' @param n_top Number of top genes to plot when `lm_res` is provided (default: 10).
#' @param assay_name Name of the assay to use (default: "diversity").
#' @param sample_type_col Column name in `colData(se)` with sample type labels
#'   (default: "sample_type"). If missing, a single-group fallback is used.
#' @param show_samples Logical; if TRUE, draw per-sample lines in the
#'   background (default: FALSE).
#' @return A `ggplot` object when a single gene is requested, or a named list
#'   of `ggplot` objects when multiple genes are requested.
#' @examples
#' mat <- matrix(runif(8), nrow = 2, dimnames = list(c("g1", "g2"), c("s1_q=0.1", "s1_q=1", "s2_q=0.1", "s2_q=1")))
#' se <- SummarizedExperiment::SummarizedExperiment(assays = list(diversity = mat))
#' plot_tsallis_gene_profile(se, gene = "g1")
#' @export
plot_tsallis_gene_profile <- function(se,
                                      gene = NULL,
                                      lm_res = NULL,
                                      n_top = 10,
                                      assay_name = "diversity",
                                      sample_type_col = "sample_type",
                                      show_samples = FALSE) {
    if (!requireNamespace("ggplot2", quietly = TRUE)) stop("ggplot2 required")
    require_pkgs(c("dplyr", "tidyr", "SummarizedExperiment"))

    long <- prepare_tsallis_long(se, assay_name = assay_name, sample_type_col = sample_type_col)
    if (!("Gene" %in% colnames(long))) stop("prepare_tsallis_long did not return Gene column")

    # Resolve genes to plot: accept NULL (use lm_res), a single name, or a vector/list
    if (is.null(gene)) {
        if (is.null(lm_res)) stop("Either 'gene' or 'lm_res' must be provided")
        if (!is.data.frame(lm_res) || !("gene" %in% colnames(lm_res))) stop("'lm_res' must be a data.frame with a 'gene' column")
        # prefer adj_p_interaction if present
        pcol <- if ("adj_p_interaction" %in% colnames(lm_res)) "adj_p_interaction" else if ("p_interaction" %in% colnames(lm_res)) "p_interaction" else NULL
        if (is.null(pcol)) stop("'lm_res' must contain 'adj_p_interaction' or 'p_interaction' columns")
        genes_ordered <- unique(as.character(lm_res$gene[order(lm_res[[pcol]])]))
        genes <- head(genes_ordered, n_top)
    } else {
        genes <- as.character(unlist(gene))
    }

    if (length(genes) == 0) stop("No genes selected for plotting")

    # helper to build single plot for a gene
    make_plot_for_gene <- function(sel) {
        long_g <- long[as.character(long$Gene) == sel, , drop = FALSE]
        if (nrow(long_g) == 0) stop("Gene not found in assay: ", sel)
        long_g$qnum <- as.numeric(as.character(long_g$q))
        stats_df <- dplyr::summarise(dplyr::group_by(long_g, group, qnum),
            median = median(tsallis, na.rm = TRUE),
            IQR = stats::IQR(tsallis, na.rm = TRUE), .groups = "drop"
        )

        p <- ggplot2::ggplot() +
            ggplot2::theme_minimal(base_size = 14)

        if (isTRUE(show_samples)) {
            p <- p + ggplot2::geom_line(data = long_g, ggplot2::aes(x = qnum, y = tsallis, group = sample, color = group), alpha = 0.25)
        }

        p <- p +
            ggplot2::geom_ribbon(data = stats_df, ggplot2::aes(x = qnum, ymin = median - IQR / 2, ymax = median + IQR / 2, fill = group), alpha = 0.2, inherit.aes = FALSE) +
            ggplot2::geom_line(data = stats_df, ggplot2::aes(x = qnum, y = median, color = group), linewidth = 1.3) +
            ggplot2::labs(title = paste0(sel, ": Tsallis entropy q-curve profile"), x = "q value", y = "Tsallis entropy", color = "Group", fill = "Group") +
            ggplot2::scale_color_discrete(name = "Group") + ggplot2::scale_fill_discrete(name = "Group") +
            ggplot2::theme(plot.title = ggplot2::element_text(
                hjust = 0.5, size = 16,
                margin = ggplot2::margin(b = 10)
            ))
        p
    }

    # Return single ggplot for single gene, or a named list of ggplots for multiple genes
    if (length(genes) == 1) {
        return(make_plot_for_gene(genes))
    }

    plots <- lapply(genes, make_plot_for_gene)
    names(plots) <- genes
    plots
}

#' Plot diversity distributions (density) by sample type
#' @param se A `SummarizedExperiment` returned by `calculate_diversity`.
#' @param assay_name Name of the assay to use (default: "diversity").
#' @param sample_type_col Optional column name in `colData(se)` with sample
#' types.  If missing, sample types are inferred from column names (suffix after
#' the last underscore) or set to 'Group'.
#' @return A `ggplot` object with layered density plots.
#' @importFrom ggplot2 ggplot aes geom_density facet_grid scale_color_manual
#' guides theme_minimal labs
#' @export
#' @examples
#' data("tcga_brca_luma", package = "TSENAT")
#' rc <- as.matrix(tcga_brca_luma[1:100, -1, drop = FALSE])
#' gs <- tcga_brca_luma[1:100, 1]
#' se <- calculate_diversity(rc, gs, q = 0.1, norm = FALSE)
#' # Manually set sample_type in colData for plotting
#' SummarizedExperiment::colData(se)$sample_type <- 
#'   factor(gsub(".*_", "", colnames(rc)))
#' plot_diversity_density(se)
plot_diversity_density <- function(
  se,
  assay_name = "diversity",
  sample_type_col = NULL
) {
    require_pkgs(c("ggplot2", "tidyr", "dplyr", "SummarizedExperiment"))
    
    # For plot_diversity_density, sample_type is required for faceting
    # Determine which column to use for sample_type
    if (is.null(sample_type_col)) {
        # Check if "sample_type" exists in colData as a fallback
        if (!("sample_type" %in% colnames(SummarizedExperiment::colData(se)))) {
            stop("sample_type column not found in data", call. = FALSE)
        }
        sample_type_col <- "sample_type"
    } else if (!(sample_type_col %in% colnames(SummarizedExperiment::colData(se)))) {
        stop(sprintf("Column '%s' not found in colData.", sample_type_col), call. = FALSE)
    }
    
    # Check for all NA sample_type values
    st_col <- SummarizedExperiment::colData(se)[[sample_type_col]]
    if (all(is.na(st_col))) {
        stop("All sample_type values are NA. Cannot create faceted plot.", call. = FALSE)
    }
    
    long <- get_assay_long(
        se,
        assay_name = assay_name,
        value_name = "diversity",
        sample_type_col = sample_type_col
    )

    # Ensure sample_type column exists and has no NA values
    if (!("sample_type" %in% colnames(long))) {
        stop("sample_type column not found in data", call. = FALSE)
    }
    if (all(is.na(long$sample_type))) {
        stop("All sample_type values are NA. Cannot create faceted plot.", call. = FALSE)
    }

    ggplot2::ggplot(
        long,
        ggplot2::aes(x = diversity, group = sample, color = sample_type)
    ) +
        ggplot2::geom_density(alpha = 0.3) +
        ggplot2::facet_grid(. ~ sample_type) +
        ggplot2::scale_color_manual(values = c("black", "darkorchid4")) +
        ggplot2::guides(color = "none") +
        ggplot2::theme_minimal(base_size = 14) +
        ggplot2::labs(x = "Diversity values", y = "Density")
}


#' Plot violin of per-gene mean diversity by sample type
#' @importFrom magrittr %>%
#' @param se A `SummarizedExperiment` returned by `calculate_diversity`.
#' @param assay_name Name of the assay to use (default: "diversity").
#' @param sample_type_col Optional column name in `colData(se)` containing
#' sample types.
#' @return A `ggplot` violin plot object.
#' @export
#' @examples
#' data("tcga_brca_luma", package = "TSENAT")
#' rc <- as.matrix(tcga_brca_luma[1:100, -1, drop = FALSE])
#' gs <- tcga_brca_luma[1:100, 1]
#' se <- calculate_diversity(rc, gs, q = 0.1, norm = FALSE)
#' plot_mean_violin(se)
plot_mean_violin <- function(
  se,
  assay_name = "diversity",
  sample_type_col = NULL
) {
    require_pkgs(c("ggplot2", "dplyr", "SummarizedExperiment", "tidyr"))
    long <- get_assay_long(
        se,
        assay_name = assay_name,
        value_name = "diversity",
        sample_type_col = sample_type_col
    )

    tmp <- as.data.frame(long)
    plot_df <- stats::aggregate(
        diversity ~ sample_type + Gene,
        data = tmp,
        FUN = function(x) mean(x, na.rm = TRUE)
    )
    colnames(plot_df)[colnames(plot_df) == "diversity"] <- "value"

    ggplot2::ggplot(
        plot_df,
        ggplot2::aes(x = sample_type, y = value, fill = sample_type)
    ) +
        ggplot2::geom_violin(alpha = 0.6) +
        ggplot2::coord_flip() +
        ggplot2::theme_minimal(base_size = 14) +
        ggplot2::labs(x = "Samples", y = "Diversity") +
        ggplot2::scale_fill_viridis_d(name = "Group")
}


# Core MA plotting implementation documentation moved to internal block
#' Plot MA using Tsallis-based fold changes
#'
#' Wrapper around `plot_ma(..., type = "tsallis")` for convenience and
#' clearer API separation.
#'
#' @param x Data.frame from `calculate_difference()`.
#' @param sig_alpha Numeric significance threshold for adjusted p-values (default: 0.05).
#' @param x_label Optional x-axis label passed to `plot_ma`.
#' @param y_label Optional y-axis label passed to `plot_ma`.
#' @param title Optional plot title passed to `plot_ma`.
#' @param ... Additional arguments passed to `plot_ma()`.
#' @return A `ggplot2` object representing the MA plot.
#' @examples
#' x <- data.frame(genes = paste0("g", seq_len(5)), mean = runif(5), log2_fold_change = rnorm(5))
#' plot_ma_tsallis(x)
#' @export
plot_ma_tsallis <- function(x, sig_alpha = 0.05, x_label = NULL, y_label = NULL, title = NULL, ...) {
    title_use <- title %||% "Tsallis-based MA plot"
    x_label_use <- x_label %||% "mean_difference"
    y_label_use <- y_label %||% "Log10 fold-change of entropy"
    .plot_ma_core(x, fc_df = NULL, sig_alpha = sig_alpha, x_label = x_label_use, y_label = y_label_use, title = title_use)
}


#' Plot MA using expression/readcount-based fold changes
#'
#' Wrapper around `plot_ma(..., type = "expression")` that accepts a
#' `SummarizedExperiment` or precomputed fold-change `data.frame`.
#'
#' @param x Data.frame from `calculate_difference()`.
#' @param se A `SummarizedExperiment` or data.frame supplying readcounts or precomputed fold changes.
#' @param samples Optional sample grouping vector (passed to `plot_ma`).
#' @param control Control level name (passed to `plot_ma`).
#' @param fc_method Aggregation method for fold-change calculation (passed to `plot_ma`).
#' @param pseudocount Pseudocount added when computing log ratios (passed to `plot_ma`).
#' @param sig_alpha Numeric significance threshold for adjusted p-values (default: 0.05).
#' @param x_label Optional x-axis label passed to `plot_ma`.
#' @param y_label Optional y-axis label passed to `plot_ma`.
#' @param title Optional plot title passed to `plot_ma`.
#' @param ... Additional arguments passed to `plot_ma()`.
#' @return A `ggplot2` object representing the MA plot.
#' @examples
#' x <- data.frame(genes = paste0("g", seq_len(5)), mean = runif(5))
#' fc <- data.frame(genes = paste0("g", seq_len(5)), log2_fold_change = rnorm(5))
#' plot_ma_expression(x, se = fc)
#' @export
plot_ma_expression <- function(
  x,
  se,
  samples = NULL,
  control = NULL,
  fc_method = "median",
  pseudocount = 0,
  sig_alpha = 0.05,
  x_label = NULL,
  y_label = NULL,
  title = NULL,
  ...
) {
    title_use <- title %||% "Readcounts-based MA plot"
    x_label_use <- x_label %||% "Mean difference"
    y_label_use <- y_label %||% "Log10 fold-change of counts"
    plot_ma_expression_impl(
        x,
        se = se,
        samples = samples,
        control = control,
        fc_method = fc_method,
        pseudocount = pseudocount,
        sig_alpha = sig_alpha,
        x_label = x_label_use,
        y_label = y_label_use,
        title = title_use,
        ...
    )
}


# Core MA plotting implementation used by wrappers above. Accepts a
# differential results `x` (data.frame) and an optional `fc_df` with
# fold-changes (genes as rownames or a `genes` column). Returns a
# `ggplot` MA-plot.
#' Core MA plotting implementation (internal)
#'
#' This is an internal helper used by `plot_ma_tsallis()` and
#' `plot_ma_expression()`. It is documented here for developers but
#' is not exported.
#' @noRd
.plot_ma_core <- function(x,
                          fc_df = NULL,
                          diff_res = NULL,
                          sig_alpha = 0.05,
                          x_label = NULL,
                          y_label = NULL,
                          title = NULL,
                          ...) {
    require_pkgs(c("ggplot2"))

    df <- as.data.frame(x, stringsAsFactors = FALSE)
    # ensure a gene identifier column exists
    if (!("genes" %in% colnames(df))) {
        if (!is.null(rownames(df))) df$genes <- rownames(df)
    }

    # If an external fc_df is provided, merge fold values into df
    if (!is.null(fc_df)) {
        fdf <- as.data.frame(fc_df, stringsAsFactors = FALSE)
        if (!("genes" %in% colnames(fdf)) && !is.null(rownames(fdf))) fdf$genes <- rownames(fdf)
        if (!("log2_fold_change" %in% colnames(fdf))) {
            stop("Provided `fc_df` must contain 'log2_fold_change' column")
        }
        df <- merge(df, fdf[, c("genes", "log2_fold_change")], by = "genes", all.x = TRUE, suffixes = c("", ".fc"))
        # prefer fc_df fold values when present
        if ("log2_fold_change.fc" %in% colnames(df)) df$log2_fold_change <- ifelse(!is.na(df$log2_fold_change.fc), df$log2_fold_change.fc, df$log2_fold_change)
    }

    # Detect fold-change column
    fold_candidates <- c("log2_fold_change", "logFC", "fold", "estimate_interaction", "fold_change")
    fold_col <- intersect(fold_candidates, colnames(df))
    if (length(fold_col) == 0) stop("Could not find a fold-change column in input")
    fold_col <- fold_col[1]

    # Detect mean/average columns for x-axis
    mean_cols <- grep("_mean$|_median$", colnames(df), value = TRUE)
    if (length(mean_cols) >= 2) {
        # ensure the two chosen columns are consistent (both _mean or both _median)
        c1 <- mean_cols[1]
        c2 <- mean_cols[2]
        is_mean1 <- grepl("_mean$", c1)
        is_mean2 <- grepl("_mean$", c2)
        is_med1 <- grepl("_median$", c1)
        is_med2 <- grepl("_median$", c2)
        if (!((is_mean1 && is_mean2) || (is_med1 && is_med2))) {
            stop("Could not find two mean or two median columns")
        }
        xvals <- rowMeans(df[, mean_cols[seq_len(2)],
            drop = FALSE
        ], na.rm = TRUE)
        x_label <- x_label %||% paste0(mean_cols[1], " vs ", mean_cols[2])
    } else if (length(mean_cols) == 1) {
        xvals <- as.numeric(df[[mean_cols[1]]])
        x_label <- x_label %||% mean_cols[1]
    } else if ("mean" %in% colnames(df)) {
        xvals <- as.numeric(df$mean)
        x_label <- x_label %||% "Mean"
    } else {
        # fallback: use rank or index
        xvals <- seq_len(nrow(df))
        x_label <- x_label %||% "Index"
    }

    yvals <- as.numeric(df[[fold_col]])

    # p-value / adjusted p-value detection
    padj_candidates <- c("adjusted_p_values", "adj_p_value", "adj_p", "padj", "p.adjust")
    padj_col <- intersect(padj_candidates, colnames(df))
    padj_col <- if (length(padj_col)) padj_col[1] else NULL

    padj <- if (!is.null(padj_col)) as.numeric(df[[padj_col]]) else rep(1, length(yvals))
    padj[is.na(padj)] <- 1

    sig_flag <- ifelse(abs(yvals) > 0 & padj < sig_alpha, "significant", "non-significant")

    plot_df <- data.frame(genes = df$genes, x = xvals, y = yvals, padj = padj, significant = sig_flag, stringsAsFactors = FALSE)

    prep <- .tsenat_prepare_ma_plot_df(df, fold_col = fold_col, mean_cols = mean_cols, x_label = x_label, y_label = y_label)
    plot_df <- prep$plot_df
    x_label_formatted <- .tsenat_format_label(prep$x_label)
    y_label_raw <- prep$y_label %||% fold_col
    y_label_formatted <- .tsenat_format_label(y_label_raw)
    if (!is.null(y_label_formatted)) {
        y_label_formatted <- sub("\\blog2\\b", "log10", y_label_formatted, ignore.case = TRUE)
    }

    p <- ggplot2::ggplot(plot_df, ggplot2::aes(x = x, y = y, color = significant)) +
        ggplot2::geom_point(alpha = 0.75, size = 3.2) +
        ggplot2::scale_color_manual(
            values = c("non-significant" = "grey40", "significant" = "firebrick3"),
            guide = "none"
        ) +
        ggplot2::theme_minimal(base_size = 14) +
        ggplot2::labs(
            title = title %||% "MA plot: mean vs log10 fold-change",
            x = x_label_formatted,
            y = y_label_formatted
        ) +
        ggplot2::theme(
            plot.title = ggplot2::element_text(hjust = 0.5, size = 16, face = "plain"),
            axis.title = ggplot2::element_text(size = 14),
            axis.text = ggplot2::element_text(size = 12)
        )

    p
}


# Implementation that computes fold-changes from expression/readcounts
# stored in a `SummarizedExperiment` (or matrix) and forwards to core plotter.
plot_ma_expression_impl <- function(
  x,
  se,
  samples = NULL,
  control = NULL,
  fc_method = "median",
  pseudocount = 0,
  sig_alpha = 0.05,
  x_label = NULL,
  y_label = NULL,
  title = NULL,
  ...
) {
    require_pkgs(c("SummarizedExperiment"))

    # extract/readcounts
    if (inherits(se, "SummarizedExperiment")) {
        counts <- get_readcounts_from_se(se)
        samples <- infer_samples_from_se(se, samples)
        if (is.null(samples)) stop("Could not infer 'samples' from SummarizedExperiment; provide `samples`")
        control <- validate_control_in_samples(control, samples)

        # attempt to map transcripts -> genes and aggregate counts per gene
        tx2g <- get_tx2gene_from_se(se, readcounts_mat = counts)
        if (!is.null(tx2g) && tx2g$type == "vector") {
            mapping <- tx2g$mapping
            # ensure mapping length matches rows
            if (length(mapping) == nrow(counts)) {
                # aggregate transcript-level counts to gene-level using rowsum
                agg <- rowsum(counts, group = mapping)
                counts_gene <- as.matrix(agg)
            } else {
                counts_gene <- counts
            }
        } else {
            counts_gene <- counts
        }

        # compute fold-changes using calculate_fc (aggregates per-group)
        fc_res <- calculate_fc(
            counts_gene,
            samples,
            control,
            method = fc_method,
            pseudocount = pseudocount
        )
        # ensure genes column exists
        if (is.null(rownames(fc_res))) {
            fc_res$genes <- seq_len(nrow(fc_res))
        } else {
            fc_res$genes <- rownames(fc_res)
        }
        return(.plot_ma_core(
            x,
            fc_df = fc_res,
            sig_alpha = sig_alpha,
            x_label = x_label,
            y_label = y_label,
            title = title,
            ...
        ))
    }

    # If se is provided as a matrix/data.frame of precomputed fold changes
    if (is.matrix(se) || is.data.frame(se)) {
        fc_res <- as.data.frame(se, stringsAsFactors = FALSE)
        if (!("log2_fold_change" %in% colnames(fc_res))) stop("`se` data.frame must contain 'log2_fold_change' column when providing precomputed fold changes")
        if (!("genes" %in% colnames(fc_res)) && !is.null(rownames(fc_res))) fc_res$genes <- rownames(fc_res)
        return(.plot_ma_core(x, fc_df = fc_res, sig_alpha = sig_alpha, x_label = x_label, y_label = y_label, title = title, ...))
    }

    stop("Unsupported 'se' argument for plot_ma_expression_impl")
}


#' Plot median +- IQR of Tsallis entropy across q values by group
#'
#' This reproduces the `tsallis-q-curve-mean-sd` plot from the vignette: for
#' each q value, compute per-gene Tsallis entropy per sample, summarize across
#' genes by group (median and IQR) and plot median with a ribbon spanning
#' median +- IQR/2.
#' @param se A `SummarizedExperiment` returned by `calculate_diversity`.
#' @param assay_name Name of the assay to use (default: "diversity").
#' @param sample_type_col Column name in `colData(se)` containing sample
#'   type labels (default: "sample_type").
#' @return A `ggplot` object showing median +- IQR across q values by group.
#' @export
#' @examples
#' data("tcga_brca_luma", package = "TSENAT")
#' rc <- as.matrix(tcga_brca_luma[1:40, -1, drop = FALSE])
#' gs <- tcga_brca_luma[1:40, 1]
#' se <- calculate_diversity(rc, gs,
#'     q = seq(0.01, 0.1, by = 0.03), norm = FALSE
#' )
#' p <- plot_tsallis_q_curve(se)
#' p
plot_tsallis_q_curve <- function(
  se,
  assay_name = "diversity",
  sample_type_col = "sample_type"
) {
    # SE-first API: require a SummarizedExperiment with per-column sample
    # type mapping in `colData(se)[, sample_type_col]` (or allow a single
    # group dataset where `sample_type` is omitted).
    if (inherits(se, "SummarizedExperiment")) {
        require_pkgs(c("ggplot2", "dplyr", "tidyr", "SummarizedExperiment"))
        long <- prepare_tsallis_long(se, assay_name = assay_name, sample_type_col = sample_type_col)
        y_label <- "Tsallis entropy (S_q)"
        if (nrow(long) == 0) stop("No tsallis values found in SummarizedExperiment")
        # long$q may be a factor; convert to numeric for plotting
        long$qnum <- as.numeric(as.character(long$q))
        stats_df <- dplyr::summarise(dplyr::group_by(long, group, qnum),
            median = median(tsallis, na.rm = TRUE),
            IQR = stats::IQR(tsallis, na.rm = TRUE), .groups = "drop"
        )
        p <- ggplot2::ggplot(
            stats_df,
            ggplot2::aes(x = qnum, y = median, color = group, fill = group)
        ) +
            ggplot2::geom_line(linewidth = 1.3) +
            ggplot2::geom_ribbon(
                ggplot2::aes(ymin = median - IQR / 2, ymax = median + IQR / 2),
                alpha = 0.2,
                color = NA
            ) +
            ggplot2::theme_minimal(base_size = 14) +
            ggplot2::labs(
                title = "Global Tsallis q-curve: entropy across all genes",
                x = "q value",
                y = y_label,
                color = "Group",
                fill = "Group"
            ) +
            ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, face = "plain", size = 16))
        # use default discrete ggplot2 colours (not viridis)
        p <- p + ggplot2::scale_color_discrete(name = "Group") +
            ggplot2::scale_fill_discrete(name = "Group")
        # If there is only a single group present, hide the legend/Group label
        if (length(unique(stats_df$group)) == 1) {
            p <- p + ggplot2::theme(legend.position = "none")
        }

        return(p)
    }

    # Matrix/data.frame input is no longer supported for this plot function.
    stop(
        "plot_tsallis_q_curve requires a SummarizedExperiment from calculate_diversity."
    )
}
#' Violin plot of Tsallis entropy for multiple q values

#' @param se A `SummarizedExperiment` returned by `calculate_diversity` with
#' multiple q values (column names contain `_q=`).
#' @param assay_name Name of the assay to use (default: "diversity").
#' @return A `ggplot` violin plot object faceted/colored by group and q.
#' @export
#' @examples
#' data("tcga_brca_luma", package = "TSENAT")
#' rc <- as.matrix(tcga_brca_luma[1:20, -1, drop = FALSE])
#' gs <- tcga_brca_luma[1:20, 1]
#' se <- calculate_diversity(rc, gs, q = c(0.1, 1), norm = TRUE)
#' plot_tsallis_violin_multq(se)
plot_tsallis_violin_multq <- function(se, assay_name = "diversity") {
    require_pkgs(c("ggplot2", "tidyr", "dplyr"))
    long <- prepare_tsallis_long(se, assay_name = assay_name)

    ggplot2::ggplot(
        long,
        ggplot2::aes(x = q, y = tsallis, fill = group)
    ) +
        ggplot2::geom_violin(
            alpha = 0.5, width = 0.9,
            position = ggplot2::position_dodge(width = 0.8)
        ) +
        ggplot2::geom_boxplot(
            width = 0.15,
            position = ggplot2::position_dodge(width = 0.8),
            outlier.shape = NA
        ) +
        ggplot2::theme_minimal(base_size = 14) +
        ggplot2::scale_fill_discrete(name = "Group") +
        ggplot2::labs(
            title = "Violin plot: Tsallis entropy distribution across multiple q values",
            x = "q value",
            y = "Tsallis entropy",
            fill = "Group"
        ) +
        ggplot2::theme(plot.title = ggplot2::element_text(
            hjust = 0.5, size = 16, face = "plain",
            margin = ggplot2::margin(b = 10)
        ))
}
#' Density plot of Tsallis entropy for multiple q values

#' @param se A `SummarizedExperiment` returned by `calculate_diversity` with
#' multiple q values (column names contain `_q=`).
#' @param assay_name Name of the assay to use (default: "diversity").
#' @return A `ggplot` density plot object faceted by q and colored by group.
#' @export
#' @examples
#' data("tcga_brca_luma", package = "TSENAT")
#' rc <- as.matrix(tcga_brca_luma[1:20, -1, drop = FALSE])
#' gs <- tcga_brca_luma[1:20, 1]
#' se <- calculate_diversity(rc, gs, q = c(0.1, 1), norm = FALSE)
#' plot_tsallis_density_multq(se)
plot_tsallis_density_multq <- function(se, assay_name = "diversity") {
    require_pkgs(c("ggplot2", "tidyr", "dplyr"))
    long <- prepare_tsallis_long(se, assay_name = assay_name)

    ggplot2::ggplot(
        long,
        ggplot2::aes(x = tsallis, color = group, fill = group)
    ) +
        ggplot2::geom_density(alpha = 0.3) +
        ggplot2::facet_wrap(~q, scales = "free_y") +
        ggplot2::theme_minimal(base_size = 14) +
        ggplot2::scale_color_discrete(name = "Group") +
        ggplot2::scale_fill_discrete(name = "Group") +
        ggplot2::labs(
            title = "Density plot: Tsallis entropy distribution across multiple q values",
            x = "Tsallis entropy",
            y = "Density",
            color = "Group",
            fill = "Group"
        ) +
        ggplot2::theme(plot.title = ggplot2::element_text(
            hjust = 0.5, size = 16, face = "plain",
            margin = ggplot2::margin(b = 10)
        ))
}


#' Volcano plot for differential results
#'
#' Create a volcano plot showing fold-change (x-axis) versus adjusted
#' p-value significance (y-axis). The function auto-detects a suitable x-axis
#' column if one is not provided and expects an adjusted p-value column for
#' significance coloring.
#'
#' @param diff_df Data.frame from `test_differential()` or similar results.
#'   Should contain p-values and optionally fold-change columns.
#' @param x_col Optional column name for the x-axis. If `NULL`, the function
#'   will try to auto-detect a suitable numeric column (excluding p-values).
#' @param padj_col Adjusted p-value column name (default: "adjusted_p_values").
#' @param label_thresh Fold-change threshold used to annotate points (default: 0.1).
#' @param padj_thresh Adjusted p-value cutoff for significance (default: 0.05).
#' @param top_n Number of top significant genes to label (default: 5).
#' @param title Optional plot title; if `NULL` a default title is used.
#'
#' @return A `ggplot2` object.
#' @export
#' @examples
#' df <- data.frame(
#'     gene = paste0("g", seq_len(10)),
#'     mean_difference = runif(10),
#'     adjusted_p_values = runif(10)
#' )
#' # plot_volcano(df, x_col = "mean_difference", padj_col = "adjusted_p_values")
plot_volcano <- function(
  diff_df,
  x_col = NULL,
  padj_col = "adjusted_p_values",
  label_thresh = 0.1,
  padj_thresh = 0.05,
  top_n = 5,
  title = NULL
) {
    if (!requireNamespace("ggplot2", quietly = TRUE)) {
        stop("ggplot2 required")
    }

    prep_volcano <- .tsenat_prepare_volcano_df(diff_df = diff_df, x_col = x_col, padj_col = padj_col, label_thresh = label_thresh, padj_thresh = padj_thresh, title = title)
    df <- prep_volcano$df
    x_col <- prep_volcano$x_col
    padj_col <- prep_volcano$padj_col
    x_label_formatted <- prep_volcano$x_label_formatted
    padj_label_formatted <- prep_volcano$padj_label_formatted
    title_use <- prep_volcano$title_use

    p <- ggplot2::ggplot(
        df,
        ggplot2::aes(x = xval, y = -log10(padj), color = significant)
    ) +
        ggplot2::geom_point(alpha = 0.75, size = 3.4) +
        ggplot2::scale_color_manual(
            values = c("non-significant" = "black", "significant" = "red"),
            guide = "none"
        ) +
        ggplot2::geom_hline(
            yintercept = -log10(padj_thresh),
            linetype = "dashed",
            color = "gray50"
        ) +
        ggplot2::geom_vline(
            xintercept = c(-label_thresh, label_thresh),
            linetype = "dashed",
            color = "gray50"
        ) +
        ggplot2::theme_minimal(base_size = 14) +
        ggplot2::labs(
            title = title_use,
            x = x_label_formatted,
            y = paste0("-Log10(", padj_label_formatted, ")")
        ) +
        ggplot2::theme(
            plot.title = ggplot2::element_text(hjust = 0.5, size = 16, face = "plain", margin = ggplot2::margin(b = 4)),
            axis.title = ggplot2::element_text(size = 14),
            axis.text = ggplot2::element_text(size = 12)
        )

    p
}


#' Internal helper to compute fill limits across multiple genes (not exported)
#' @noRd
.compute_transcript_fill_limits <- function(genes, mapping, counts, samples, top_n, agg_fun, pseudocount) {
    mins <- maxs <- c()
    for (g in genes) {
        txs <- mapping$Transcript[mapping$Gen == g]
        txs <- intersect(txs, rownames(counts))
        if (length(txs) == 0) next
        if (!is.null(top_n)) txs <- head(txs, top_n)
        mat <- counts[txs, , drop = FALSE]
        df_all <- as.data.frame(mat)
        df_all$tx <- rownames(mat)
        df_long <- tidyr::pivot_longer(df_all, -tx, names_to = "sample", values_to = "expr")
        df_long$group <- rep(samples, times = length(txs))
        df_summary <- stats::aggregate(expr ~ tx + group, data = df_long, FUN = agg_fun)
        df_summary$log2expr <- log2(df_summary$expr + pseudocount)
        mins <- c(mins, min(df_summary$log2expr, na.rm = TRUE))
        maxs <- c(maxs, max(df_summary$log2expr, na.rm = TRUE))
    }
    if (length(mins) == 0) stop("No transcripts found for provided genes")
    c(min(mins, na.rm = TRUE), max(maxs, na.rm = TRUE))
}

#' Internal helper to draw grid layout with title, plots, and legend using base grid
#' @noRd
.draw_transcript_grid <- function(grobs, title, legend_grob, n, heights, to_file = NULL) {
    # If no output file is provided and no graphics device is open, render to a
    # temporary pdf device so that plotting in non-interactive sessions does not
    # create `Rplots.pdf` in the working directory.
    temp_dev <- FALSE
    # Only open a temporary PDF device when:
    #  - caller did not supply an output file (`to_file` is NULL),
    #  - the session is non-interactive, and
    #  - no graphics device is currently open (dev.cur() == 1)
    if (is.null(to_file) && !interactive() && grDevices::dev.cur() == 1L) {
        tmp <- tempfile("TSENAT_plot_", fileext = ".pdf")
        grDevices::pdf(tmp)
        temp_dev <- TRUE
        # Ensure device is closed and temporary file removed on exit
        on.exit(
            {
                try(grDevices::dev.off(), silent = TRUE)
                if (file.exists(tmp)) unlink(tmp)
            },
            add = TRUE
        )
    }

    grid::grid.newpage()
    grid::pushViewport(
        grid::viewport(layout = grid::grid.layout(3, n, heights = heights))
    )
    # Title row
    vp_title <- grid::viewport(layout.pos.row = 1, layout.pos.col = seq_len(n))
    grid::pushViewport(vp_title)
    grid::grid.text(title, x = 0.5, gp = grid::gpar(fontsize = 12))
    grid::upViewport()
    # Plot rows
    for (i in seq_along(grobs)) {
        vp <- grid::viewport(layout.pos.row = 2, layout.pos.col = i)
        grid::pushViewport(vp)
        grid::grid.draw(grobs[[i]])
        grid::upViewport()
    }
    # Legend row
    if (!is.null(legend_grob)) {
        vp_leg <- grid::viewport(layout.pos.row = 3, layout.pos.col = seq_len(n))
        grid::pushViewport(vp_leg)
        grid::grid.draw(legend_grob)
        grid::upViewport()
    }
    grid::upViewport()
    # If caller supplied a file (caller likely opened a device), close it here.
    if (!is.null(to_file)) grDevices::dev.off()
    invisible(NULL)
}

## Internal helpers for `plot_top_transcripts` refactor
## Create per-gene plot and combine multiple gene plots into final output
.ptt_make_plot_for_gene <- function(gene_single, mapping, counts, samples, top_n, agg_fun, pseudocount, agg_label_unique, fill_limits = NULL) {
    require_pkgs(c("ggplot2", "tidyr"))
    built <- .ptt_build_tx_long(gene_single, mapping, counts, samples, top_n)
    df_summary <- .ptt_aggregate_df_long(built$df_long, agg_fun, pseudocount)
    .ptt_build_plot_from_summary(df_summary, agg_label_unique, fill_limits)
}

.ptt_combine_plots <- function(plots, output_file = NULL, agg_label_unique = NULL) {
    require_pkgs(c("ggplot2"))
    # Allow callers to pass a single character second argument as the
    # `agg_label_unique` for convenience (legacy test call patterns).
    if (is.null(agg_label_unique) && !is.null(output_file) && is.character(output_file) && length(output_file) == 1) {
        agg_label_unique <- output_file
        output_file <- NULL
    }
    if (requireNamespace("patchwork", quietly = TRUE)) {
        .ptt_combine_patchwork(plots, agg_label_unique)
    } else if (requireNamespace("cowplot", quietly = TRUE)) {
        .ptt_combine_cowplot(plots, output_file = output_file, agg_label_unique = agg_label_unique)
    } else {
        .ptt_combine_grid(plots, output_file = output_file, agg_label_unique = agg_label_unique)
    }
}

## Prepare and validate inputs for `plot_top_transcripts`
.ptt_prepare_inputs <- function(counts, readcounts = NULL, samples = NULL, coldata = NULL, sample_type_col = "sample_type", tx2gene = NULL, res = NULL, top_n = NULL, pseudocount = 0, output_file = NULL, metric = c("median", "mean", "variance", "iqr")) {
    # handle selecting genes from `res` is left to caller; this function focuses
    # on normalizing counts, samples and tx2gene mapping and preparing agg functions
    if (inherits(counts, "SummarizedExperiment")) {
        require_pkgs(c("SummarizedExperiment", "S4Vectors"))
        se <- counts
        counts_mat <- get_readcounts_from_se(se, readcounts)
        counts <- as.matrix(counts_mat)
        samples <- infer_samples_from_se(se, samples, sample_type_col = sample_type_col)

        if (is.null(tx2gene)) {
            txres <- get_tx2gene_from_se(se, counts)
            if (!is.null(txres) && !is.null(txres$mapping)) {
                mapping <- data.frame(Transcript = rownames(counts), Gen = as.character(txres$mapping), stringsAsFactors = FALSE)
                tx2gene <- mapping
            }
        }
    }

    if (!is.matrix(counts) && !is.data.frame(counts)) stop("`counts` must be a matrix or data.frame with transcripts as rownames")
    counts <- as.matrix(counts)
    if (is.null(rownames(counts))) stop("`counts` must have rownames corresponding to transcript identifiers")

    # derive samples from coldata if needed
    if (is.null(samples)) {
        if (!is.null(coldata)) {
            if (is.character(coldata) && length(coldata) == 1) {
                if (!file.exists(coldata)) stop("coldata file not found: ", coldata)
                cdf <- utils::read.delim(coldata, header = TRUE, stringsAsFactors = FALSE)
            } else if (is.data.frame(coldata)) {
                cdf <- coldata
            } else {
                stop("`coldata` must be a data.frame or path to a tab-delimited file")
            }

            if (!is.null(rownames(cdf)) && all(colnames(counts) %in% rownames(cdf))) {
                samples <- as.character(cdf[colnames(counts), sample_type_col])
            } else {
                sample_id_cols <- c("sample", "Sample", "sample_id", "id")
                sid <- intersect(sample_id_cols, colnames(cdf))
                if (length(sid) > 0) {
                    sid <- sid[1]
                    if (!all(colnames(counts) %in% as.character(cdf[[sid]]))) stop("coldata sample id column does not match column names of counts")
                    row_ix <- match(colnames(counts), as.character(cdf[[sid]]))
                    samples <- as.character(cdf[[sample_type_col]][row_ix])
                } else {
                    stop("Could not match `coldata` rows to `counts` columns. Provide `samples` or a row-named `coldata`.")
                }
            }
        } else {
            stop("Either 'samples' or 'coldata' must be provided to determine sample groups")
        }
    }

    # normalize tx2gene mapping
    if (is.null(tx2gene)) stop("`tx2gene` must be provided as a file path or data.frame (or include mapping in metadata of provided SummarizedExperiment)")
    if (is.character(tx2gene) && length(tx2gene) == 1) {
        if (!file.exists(tx2gene)) stop("tx2gene file not found: ", tx2gene)
        mapping <- utils::read.delim(tx2gene, stringsAsFactors = FALSE, header = TRUE)
    } else if (is.data.frame(tx2gene)) {
        mapping <- tx2gene
    } else {
        stop("`tx2gene` must be provided as a file path or data.frame (or include mapping in metadata of provided SummarizedExperiment)")
    }

    if (!all(c("Transcript", "Gen") %in% colnames(mapping))) stop("tx2gene must have columns 'Transcript' and 'Gen'")

    if (!requireNamespace("ggplot2", quietly = TRUE)) stop("ggplot2 required for plotting")

    if (!is.null(samples) && length(samples) != ncol(counts)) stop("Length of `samples` must equal number of columns in `counts`")

    metric_choice <- match.arg(metric)
    agg_fun <- switch(metric_choice,
        median = function(x) stats::median(x, na.rm = TRUE),
        mean = function(x) base::mean(x, na.rm = TRUE),
        variance = function(x) stats::var(x, na.rm = TRUE),
        iqr = function(x) stats::IQR(x, na.rm = TRUE)
    )
    agg_label_metric <- if (metric_choice == "iqr") "IQR" else metric_choice
    agg_label <- sprintf("Transcript-level expression with metric %s", agg_label_metric)
    .cnt <- as.integer(getOption("TSENAT.plot_top_counter", 0)) + 1L
    options(TSENAT.plot_top_counter = .cnt)
    agg_label_unique <- agg_label

    list(counts = counts, samples = samples, mapping = mapping, metric_choice = metric_choice, agg_fun = agg_fun, agg_label_unique = agg_label_unique, top_n = top_n, pseudocount = pseudocount, output_file = output_file)
}

#' Plot top transcripts for a gene
#' @param counts Matrix or data.frame of transcript counts. Rows are transcripts and columns are samples.
#' @param readcounts Optional matrix or data.frame of raw read counts. Used for transcript-level quantification if provided.
#' @param gene Character; gene symbol to inspect.
#' @param samples Character vector of sample group labels (length = ncol(counts)).
#' @param coldata Optional data.frame or file path containing sample metadata. Used to infer sample groups if `samples` is not provided.
#' @param sample_type_col Character; column name in `coldata` or `SummarizedExperiment` colData to use for sample grouping. Default is "sample_type".
#' @param tx2gene Path or data.frame mapping transcripts to genes. Must contain columns `Transcript` and `Gen`.
#' @param res Optional result data.frame from a differential analysis. If provided and `gene` is NULL, top genes are selected by adjusted p-value.
#' @param top_n Integer number of transcripts to show (default = 3). Use NULL to plot all transcripts for the gene.
#' @param pseudocount Numeric pseudocount added before log2 (default = 1e-6) to avoid division by zero.
#' @param output_file Optional file path to save the plot. If `NULL`, the `ggplot` object is returned.
#' @param metric Aggregation metric used to summarize transcript expression per group when plotting. One of c("median", "mean", "variance", "iqr"). Use "iqr" to compute the interquartile range. Defaults to "median".
#' @return If \code{output_file} is \code{NULL}, returns a \code{ggplot} object. Otherwise, the plot is saved to the specified file and the function returns \code{NULL} invisibly.
#' @examples
#' tx_counts <- matrix(sample(1:100, 24, replace = TRUE), nrow = 6)
#' rownames(tx_counts) <- paste0("tx", seq_len(nrow(tx_counts)))
#' colnames(tx_counts) <- paste0("S", seq_len(ncol(tx_counts)))
#' tx2gene <- data.frame(Transcript = rownames(tx_counts), Gen = rep(paste0("G", seq_len(3)), each = 2), stringsAsFactors = FALSE)
#' samples <- rep(c("Normal", "Tumor"), length.out = ncol(tx_counts))
#' plot_top_transcripts(tx_counts, gene = c("G1", "G2"), samples = samples, tx2gene = tx2gene, top_n = 2)
#' @export
plot_top_transcripts <- function(
  counts,
  readcounts = NULL, # Optional matrix or data.frame of raw read counts. Used for transcript-level quantification if provided.
  gene = NULL,
  samples = NULL,
  coldata = NULL, # Optional data.frame or file path containing sample metadata. Used to infer sample groups if `samples` is not provided.
  sample_type_col = "sample_type", # Column name in `coldata` or `SummarizedExperiment` colData to use for sample grouping. Default is "sample_type".
  tx2gene = NULL,
  res = NULL, # Optional result data.frame from a differential analysis. If provided and `gene` is NULL, top genes are selected by adjusted p-value.
  top_n = 3,
  pseudocount = 1e-6,
  output_file = NULL,
  metric = c("median", "mean", "variance", "iqr")
) {
    # If `gene` is not provided, select top genes from `res` using `top_n`.
    if (is.null(gene)) {
        gene <- .ptt_select_genes_from_res(res, top_n)
    }
    per_gene_top_n <- top_n

    ## Prepare inputs and normalization via helper
    prep <- .ptt_prepare_inputs(counts = counts, readcounts = readcounts, samples = samples, coldata = coldata, sample_type_col = sample_type_col, tx2gene = tx2gene, res = res, top_n = per_gene_top_n, pseudocount = pseudocount, output_file = output_file, metric = metric)

    # if prep returned without gene selection, caller will check `gene`
    counts <- prep$counts
    samples <- prep$samples
    mapping <- prep$mapping
    metric_choice <- prep$metric_choice
    agg_fun <- prep$agg_fun
    agg_label_unique <- prep$agg_label_unique
    top_n <- prep$top_n
    pseudocount <- prep$pseudocount
    output_file <- prep$output_file

    make_plot_for_gene <- function(gene_single, fill_limits = NULL) {
        .ptt_make_plot_for_gene(gene_single, mapping, counts, samples, top_n, agg_fun, pseudocount, agg_label_unique, fill_limits)
    }

    # Produce plots (single or multiple). Do not save inside helper - save once
    # below.
    if (length(gene) > 1) {
        fill_limits <- .compute_transcript_fill_limits(gene, mapping, counts, samples, top_n, agg_fun, pseudocount)

        plots <- lapply(seq_along(gene), function(i) {
            gname <- gene[i]
            pp <- make_plot_for_gene(gname, fill_limits = fill_limits)
            per_gene_title <- if (!is.na(gname) && nzchar(as.character(gname))) as.character(gname) else ""
            pp <- pp + ggplot2::labs(title = per_gene_title) + ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 16))
            pp
        })

        result_plot <- .ptt_combine_plots(plots, output_file = output_file, agg_label_unique = agg_label_unique)
    } else {
        result_plot <- make_plot_for_gene(gene)
    }

    if (!is.null(output_file)) {
        ggplot2::ggsave(output_file, result_plot)
        invisible(NULL)
    } else {
        result_plot
    }
}

## Filter SummarizedExperiment by low-expression transcripts
#' Filter transcripts in a `SummarizedExperiment` by minimum count/sample
#'
#' Subset a `SummarizedExperiment` to keep transcripts with more than
#' `min_count` reads in strictly more than `min_samples` samples. The
#' function updates assays, `rowData`, and relevant entries in
#' `metadata()` (for example `readcounts` and `tx2gene`) so downstream
#' helpers receive a consistent object.
#'
#' @param se A `SummarizedExperiment` object containing transcript-level
#'   assays (rows = transcripts, columns = samples).
#' @param min_count Integer threshold for per-sample counts (default 5L).
#' @param min_samples Integer minimum number of samples exceeding
#'   `min_count` required to keep a transcript (default 5L).
#' @param assay_name Name or index of the assay to use for filtering
#'   (default: 'counts'). If not present, the first assay is used.
#' @param verbose Logical; print before/after counts when TRUE.
#' @return A filtered `SummarizedExperiment`.
#' @export
#' @examples
#' mat <- matrix(c(0, 6, 7, 2, 8, 9), nrow = 3, dimnames = list(paste0('tx', 1:3), paste0('S', 1:2)))
#' se <- SummarizedExperiment::SummarizedExperiment(assays = list(counts = mat))
#' filt <- filter_se(se, min_count = 5, min_samples = 1)
#' class(filt)
filter_se <- function(se, min_count = 5L, min_samples = 5L, assay_name = "counts",
    verbose = TRUE) {
    if (!is(se, "SummarizedExperiment")) {
        stop("'se' must be a SummarizedExperiment", call. = FALSE)
    }

    assays_list <- SummarizedExperiment::assays(se)
    if (is.character(assay_name) && assay_name %in% names(assays_list)) {
        assay_mat <- as.matrix(assays_list[[assay_name]])
    } else if (is.numeric(assay_name) && assay_name >= 1 && assay_name <= length(assays_list)) {
        assay_mat <- as.matrix(assays_list[[assay_name]])
    } else {
        # fallback to first assay
        assay_mat <- as.matrix(assays_list[[1]])
        warning("Requested assay not found; using first assay.", call. = FALSE)
    }

    if (!is.numeric(assay_mat)) {
        stop("Assay data must be numeric.", call. = FALSE)
    }

    before <- nrow(assay_mat)
    # Cap min_samples to the number of available samples to avoid impossible
    # thresholds on small datasets and use an inclusive threshold so callers
    # requesting e.g. 'min_samples = ncol' keep rows present in all samples.
    eff_min_samples <- min(as.integer(min_samples), ncol(assay_mat))
    tokeep <- rowSums(assay_mat > min_count) >= eff_min_samples
    after <- sum(tokeep)

    if (verbose) {
        message(sprintf("Transcripts: before = %d, after = %d", before, after))
    }

    if (after == 0L) {
        warning("Filtering removed all transcripts; returning empty SummarizedExperiment.",
            call. = FALSE)
    }

    # subset all assays
    new_assays <- S4Vectors::SimpleList(lapply(assays_list, function(a) {
        if (is.matrix(a) || is.data.frame(a)) {
            as.matrix(a)[tokeep, , drop = FALSE]
        } else {
            a
        }
    }))
    names(new_assays) <- names(assays_list)

    # subset rowData if present
    rd <- NULL
    if (nrow(SummarizedExperiment::rowData(se)) > 0) {
        rd <- SummarizedExperiment::rowData(se)[tokeep, , drop = FALSE]
    }

    # subset metadata readcounts and tx2gene if present
    md <- S4Vectors::metadata(se)
    if (!is.null(md$readcounts) && is.matrix(md$readcounts)) {
        md$readcounts <- as.matrix(md$readcounts)[tokeep, , drop = FALSE]
    }
    if (!is.null(md$tx2gene) && is.data.frame(md$tx2gene)) {
        txmap <- md$tx2gene
        txcol <- colnames(txmap)[1]
        md$tx2gene <- txmap[txmap[[txcol]] %in% rownames(assay_mat)[tokeep], , drop = FALSE]
    }

    # construct new SE with same colData
    new_se <- SummarizedExperiment::SummarizedExperiment(assays = new_assays, rowData = if (!is.null(rd)) {
        rd
    } else {
        S4Vectors::DataFrame()
    }, colData = SummarizedExperiment::colData(se), metadata = c(S4Vectors::metadata(se),
        list(filtered = list(min_count = min_count, min_samples = min_samples))))

    # attach updated metadata pieces
    S4Vectors::metadata(new_se)$readcounts <- if (!is.null(md$readcounts)) {
        md$readcounts
    } else {
        NULL
    }
    S4Vectors::metadata(new_se)$tx2gene <- if (!is.null(md$tx2gene)) {
        md$tx2gene
    } else {
        NULL
    }

    return(new_se)
}

#' Calculate splicing diversity changes between two conditions.
#'
#' @param x A \code{matrix} with the splicing diversity values.
#' @param samples Character vector with an equal length to the number of columns
#' in the input dataset, specifying the category of each sample.
#' @param control Name of the control sample category, defined in the
#' \code{samples} vector, e.g. \code{control = 'Normal'} or \code{control =
#' 'WT'}.
#' @param method Method to use for calculating the average splicing diversity
#' value in a condition. Can be \code{'mean'} or \code{'median'}.
#' @param pseudocount Numeric scalar. Small value added to non-positive
#' observed group summaries to avoid zeros when computing differences and
#' log2 fold-changes. If \code{pseudocount <= 0} the function will automatically
#' choose a scale-aware value equal to half the smallest positive observed
#' group summary (i.e. half the smallest observed mean/median across groups);
#' if no positive values are present the fallback is \code{1e-6}. Rows with
#' insufficient observations remain \code{NA} and are not imputed.
#' @return A \code{data.frame} with mean or median value of splicing diversity
#' across sample categories, the difference between these values and the log2
#' fold change values.
#' @details The function uses a matrix of splicing diversity values in order to
#' calculate mean or median differences and log2 fold changes between two
#' conditions.
#' @import stats
#' @export
#' @examples
#' # Simulate splicing diversity matrix (4 genes x 4 samples)
#' mat <- matrix(c(
#'   0.5, 0.6, 0.8, 0.9,  # gene1: low control, high treatment
#'   0.7, 0.75, 0.6, 0.5  # gene2: high control, low treatment
#' ), nrow = 2, byrow = TRUE)
#' samples <- c('Normal', 'Normal', 'Tumor', 'Tumor')
#' result <- calculate_fc(mat, samples, control = 'Normal', method = 'mean')
#' head(result)
calculate_fc <- function(x, samples, control, method = "mean", pseudocount = 0) {
    # validate control and samples inputs
    if (is.null(control) || !nzchar(control)) {
        stop("`control` must be provided to calculate_fc", call. = FALSE)
    }
    if (length(samples) != ncol(x)) {
        stop("Length of 'samples' must equal number of columns in 'x'", call. = FALSE)
    }
    if (!(control %in% samples)) {
        stop("Control sample type not found in samples.", call. = FALSE)
    }
    agg <- .tsenat_aggregate_fc_values(x = x, samples = samples, method = method,
        control = control)
    value <- agg$value
    sorted <- agg$sorted

    # Defensive numeric coercion: ensure group means are numeric and mark any
    # non-finite or non-positive values as NA. This prevents Inf/NaN when
    # computing log2 fold changes downstream.
    value <- matrix(as.numeric(value), nrow = nrow(value), ncol = ncol(value), dimnames = dimnames(value))
    value[!is.finite(value)] <- NA

    # compute and apply pseudocount based on observed group summaries
    value <- .tsenat_apply_pseudocount(value, pseudocount)

    # compute difference and log2 fold-change with NA-safe handling
    diff_vec <- value[, 1] - value[, 2]
    na_mask <- is.na(value[, 1]) | is.na(value[, 2])
    diff_vec[na_mask] <- NA

    log2fc_vec <- log2(value[, 1]/value[, 2])
    log2fc_vec[na_mask] <- NA

    result <- data.frame(value, difference = diff_vec, log2_fold_change = log2fc_vec,
        check.names = FALSE, stringsAsFactors = FALSE)
    colnames(result) <- c(paste(sorted[1, 1], "_", method, sep = ""), paste(sorted[2,
        1], "_", method, sep = ""), paste(method, "_difference", sep = ""), "log2_fold_change")
    return(result)
}

#' Calculate p-values using Wilcoxon rank sum test.
#'
#' @param x A \code{matrix} with the splicing diversity values.
#' @param samples Character vector with an equal length to the number of columns
#' in the input dataset, specifying the category of each sample.
#' @param pcorr P-value correction method applied to the results, as defined in
#' the \code{p.adjust} function.
#' @param paired If \code{TRUE}, the Wilcox-test will be paired, and therefore
#' it will be a signed rank test instead of the rank sum test.
#' @param exact If \code{TRUE}, an exact p-value will be computed.
#' @param nthreads Number of threads for parallel processing (default: 1).
#' Set to > 1 to parallelize per-feature Wilcoxon tests.
#' @return Raw and corrected p-values in a matrix.
#' @import stats
#' @export
#' @examples
#' # Create a matrix of splicing diversity values (3 genes x 6 samples)
#' mat <- matrix(rnorm(18), nrow = 3)
#' samples <- rep(c('Control', 'Treatment'), each = 3)
#' 
#' # Run Wilcoxon test
#' result <- wilcoxon(mat, samples, pcorr = 'BH')
#' head(result)
wilcoxon <- function(x, samples, pcorr = "BH", paired = FALSE, exact = FALSE, nthreads = 1) {
    # Determine group indices (two groups expected)
    groups <- unique(sort(samples))
    if (length(groups) != 2) {
        stop("`samples` must contain exactly two groups for Wilcoxon tests.")
    }
    g1_idx <- as.numeric(which(samples %in% groups[1]))
    g2_idx <- as.numeric(which(samples %in% groups[2]))

    if (isTRUE(paired)) {
        if (length(g1_idx) != length(g2_idx)) {
            stop("Paired Wilcoxon requires equal numbers of samples in each ", "group.")
        }
        # Paired tests assume columns are already ordered/aligned by the caller
        # (e.g., via `map_metadata()`); do not attempt to infer pairing here.
    }

    # Function to compute Wilcoxon test for a single feature
    .wilcox_one <- function(i) {
        tryCatch({
            wilcox.test(x[i, g1_idx], x[i, g2_idx], paired = paired, exact = exact)$p.value
        }, error = function(e) {
            NA_real_
        }, warning = function(w) {
            # swallow specific warnings but return NA on unusual states
            NA_real_
        })
    }

    # Apply in parallel
    p_values <- .tsenat_bplapply(seq_len(nrow(x)), .wilcox_one, nthreads = nthreads)

    raw_p_values <- ifelse(is.na(vapply(p_values, c, numeric(1))), 1, vapply(p_values,
        c, numeric(1)))
    adjusted_p_values <- p.adjust(raw_p_values, method = pcorr)
    out <- cbind(raw_p_values, adjusted_p_values)
    colnames(out) <- c("raw_p_values", "adjusted_p_values")
    return(out)
}

#' Calculate p-values using label shuffling.
#'
#' @param x A \code{matrix} with the splicing diversity values.
#' @param samples Character vector with an equal length to the number of columns
#' in the input dataset, specifying the category of each sample.
#' @param control Name of the control sample category, defined in the
#' \code{samples} vector, e.g. \code{control = 'Normal'} or \code{control =
#' 'WT'}.
#' @param method Method to use for calculating the average splicing diversity
#' value in a condition. Can be \code{'mean'} or \code{'median'}.
#' @param randomizations The number of random shuffles.
#' @param pcorr P-value correction method applied to the results, as defined in
#' the \code{p.adjust()} function.
#' @param paired Logical; if \code{TRUE} perform a paired permutation scheme
#'   (default: \code{FALSE}). When paired is \code{TRUE}, permutations
#'   should preserve pairing between samples; the function currently permutes
#'   sample labels and therefore paired analyses are only meaningful when the
#'   caller has arranged \code{samples} accordingly.
#' @param paired_method Character; method for paired permutations. One of
#'   \code{'swap'} (randomly swap labels within pairs) or \code{'signflip'}
#'   (perform sign-flip permutations; can enumerate all 2^n_pairs combinations
#'   for an exact test when \code{randomizations = 0} or \code{randomizations >= 2^n_pairs}).
#' @param nthreads Number of threads for parallel processing (default: 1).
#' Set to > 1 to parallelize per-feature p-value computation.
#' @return Raw and corrected p-values.
#' @details The permutation p-values are computed two-sided as the proportion
#' of permuted log2 fold-changes at least as extreme as the observed value,
#' with a pseudocount added: (count + 1) / (n_perm + 1).
#' @import stats
#' @note The permutation test returns two-sided empirical p-values using a
#' pseudocount to avoid zero p-values for small numbers of permutations. See
#' the function documentation for details.
#' @export
#' @examples
#' set.seed(123)
#' # Create a matrix of splicing diversity values (2 genes x 4 samples)
#' mat <- matrix(rnorm(8), nrow = 2)
#' samples <- c('Normal', 'Normal', 'Tumor', 'Tumor')
#' 
#' # Run label shuffling test (100 permutations)
#' result <- label_shuffling(mat, samples, control = 'Normal', 
#'                           method = 'mean', randomizations = 100, pcorr = 'BH')
#' head(result)
label_shuffling <- function(x, samples, control, method, randomizations = 100, pcorr = "BH",
    paired = FALSE, paired_method = c("swap", "signflip"), nthreads = 1) {
    paired_method <- match.arg(paired_method)
    # observed log2 fold changes
    log2_fc <- calculate_fc(x, samples, control, method)[, 4]

    # build permutation/null distribution of log2 fold changes
    if (isTRUE(paired)) {
        perm_mat <- .tsenat_permute_paired(x = x, samples = samples, control = control,
            method = method, randomizations = randomizations, paired_method = paired_method)
    } else {
        # Generate permutations
        permuted <- replicate(randomizations, calculate_fc(x, sample(samples), control,
            method), simplify = FALSE)
        # each element is a data.frame/matrix; extract log2_fold_change column
        # (4th column)
        perm_mat <- vapply(permuted, function(z) as.numeric(z[, 4]), numeric(nrow(x)))
    }

    # Function to compute p-value for a single feature
    .compute_pval <- function(i) {
        obs <- log2_fc[i]
        nulls <- perm_mat[i, ]
        if (is.na(obs) || all(is.na(nulls))) {
            return(1)
        }
        nulls_non_na <- nulls[!is.na(nulls)]
        n_non_na <- length(nulls_non_na)
        if (n_non_na == 0) {
            return(1)
        }
        cnt <- sum(abs(nulls_non_na) >= abs(obs))
        pval <- (cnt + 1)/(n_non_na + 1)
        return(pval)
    }

    # compute two-sided permutation p-value with pseudocount, in parallel
    raw_p_values <- unlist(.tsenat_bplapply(seq_len(nrow(perm_mat)), .compute_pval,
        nthreads = nthreads))

    adjusted_p_values <- p.adjust(raw_p_values, method = pcorr)
    out <- cbind(raw_p_values, adjusted_p_values)
    colnames(out) <- c("raw_p_values", "adjusted_p_values")
    return(out)
}


#' Run a differential test by name: Wilcoxon or label-shuffle
#'
#' Thin wrapper that selects between `wilcoxon()` and `label_shuffling()`.
#'
#' @param x A matrix with splicing diversity values (rows = features).
#' @param samples Character vector of sample group labels (length = ncol(x)).
#' @param control Name of the control group (required for label-shuffle).
#' @param method Character; one of `'wilcoxon'` or `'shuffle'`.
#' @param fc_method Character; aggregation method used by the permutation
#'   test when `method = 'shuffle'` ('mean' or 'median').
#' @param paired Logical passed to `wilcoxon()` when using the Wilcoxon test.
#' @param exact Logical passed to `wilcoxon()` to request exact p-values.
#' @param randomizations Integer number of permutations for `label_shuffling()`.
#' @param pcorr P-value adjustment method (passed to `p.adjust`).
#' @param seed Integer seed used to make permutations reproducible (default
#'   123). The function sets a temporary RNG seed via `withr::local_seed(seed)`
#'   before running `label_shuffling()` when `method = 'shuffle'`.
#' @param nthreads Number of threads for parallel processing (default: 1).
#'   Set to > 1 to parallelize per-feature statistical tests in
#'   `wilcoxon()` or `label_shuffling()`.
#' @return A two-column matrix with raw and adjusted p-values (as returned by
#'   the underlying functions).
#' @export
#' @examples
#' mat <- matrix(rnorm(20), nrow = 5)
#' samples <- rep(c('A', 'B'), length.out = ncol(mat))
#' test_differential(mat, samples, control = 'A', method = 'wilcoxon')
#'
#' @param paired_method Character; forwarded to `label_shuffling()` when
#'   `method = 'shuffle'`. See `label_shuffling()` for details.
test_differential <- function(x, samples, control = NULL, method = c("wilcoxon",
    "shuffle"), fc_method = "mean", paired = FALSE, exact = FALSE, randomizations = 100,
    pcorr = "BH", seed = 123L, paired_method = c("swap", "signflip"), nthreads = 1) {
    paired_method <- match.arg(paired_method)
    method <- match.arg(method)
    if (method == "wilcoxon") {
        return(wilcoxon(x, samples, pcorr = pcorr, paired = paired, exact = exact,
            nthreads = nthreads))
    }

    # shuffle / permutation-based test
    if (is.null(control) || !nzchar(control)) {
        stop("`control` must be provided when method = 'shuffle'", call. = FALSE)
    }
    if (!is.null(seed)) {
        if (!is.numeric(seed) || length(seed) != 1) {
            stop("`seed` must be a single numeric value", call. = FALSE)
        }
        # Use withr::local_seed to temporarily set RNG state for reproducible
        # permutation tests without permanently modifying the global RNG.
        withr::local_seed(as.integer(seed))
    }
    return(label_shuffling(x, samples, control, fc_method, randomizations = randomizations,
        pcorr = pcorr, paired = paired, paired_method = paired_method, nthreads = nthreads))
}

#' Calculate splicing diversity changes between two conditions.
#' @param x A \code{SummarizedExperiment} with splicing diversity values for
#' each gene in each sample or a \code{data.frame} with gene names in the  first
#' column and splicing diversity values for each sample in additional  columns.
#' @param samples A vector of length one, specifying the column name of the
#' \code{colData} annotation column from the \code{SummarizedExperiment}
#' object, that should be used as the category column or a character vector
#' with an equal length to the number of columns in the input dataset,
#' specifying the category of each sample in the case of a \code{data.frame}
#' input.
#' @param control Name of the control sample category, defined in the
#' \code{samples} vector, e.g. \code{control = 'Normal'} or \code{control =
#' 'WT'}.
#' @param method Method to use for calculating the average splicing diversity
#' value in a condition. Can be \code{'mean'} or \code{'median'}.
#' @param test Method to use for p-value calculation: use \code{'wilcoxon'} for
#' Wilcoxon rank sum test or \code{'shuffle'} for a label shuffling test.
#' @param randomizations Number of random shuffles, used for the label shuffling
#' test (default = 100).
#' @param pcorr P-value correction method applied to the Wilcoxon rank sum test
#' or label shuffling test results, as defined in the \code{p.adjust}  function.
#' @param assayno An integer value. In case of multiple assays in a
#' \code{SummarizedExperiment} input, the argument specifies the assay number
#' to use for difference calculations.
#' @param verbose If \code{TRUE}, the function will print additional diagnostic
#' messages.
#' @param pseudocount Numeric scalar. Passed to \code{calculate_fc} and used to
#' add a small value to non-positive group summaries before computing
#' differences and log2 fold-changes. Default \code{1e-6}. Rows excluded for
#' low sample counts remain \code{NA}.
#' @param paired Logical; if `TRUE`, run paired versions of tests when
#'   supported (default: `FALSE`).
#' @param exact Logical; passed to the Wilcoxon test to request exact p-values
#'   when supported (default: `FALSE`).
#' @param nthreads Number of threads for parallel processing (default: 1).
#'   Set to > 1 to parallelize per-feature statistical tests.
#' @return A \code{data.frame} with the mean or median values of splicing
#' diversity across sample categories and all samples, log2(fold change) of  the
#' two different conditions, raw and corrected p-values.
#' @import methods
#' @importFrom SummarizedExperiment SummarizedExperiment assays assay colData
#' @export
#' @details The function calculates diversity changes between two sample
#' conditions. It uses the output of the diversity calculation function, which
#' is a \code{SummarizedExperiment} object of splicing diversity values.
#' Additionally, it can use a \code{data.frame} as input, where the first column
#' contains gene names, and all additional columns contain splicing diversity
#' values for each sample. A vector of sample conditions also serves as input,
#' used for aggregating the samples by condition.   It calculates the mean or
#' median of the splicing diversity data per sample  condition, the difference
#' of these values and the log2 fold change of the two  conditions. Furthermore,
#' the user can select a statistical method to  calculate the significance of
#' the changes. The p-values and adjusted p-values  are calculated using a
#' Wilcoxon sum rank test or label shuffling test.   The function will exclude
#' genes of low sample size from the significance  calculation, depending on
#' which statistical test is applied.
#' @examples
#' x <- data.frame(Genes = letters[seq_len(10)], matrix(runif(80), ncol = 8))
#' samples <- c(rep('Healthy', 4), rep('Pathogenic', 4))
#' calculate_difference(x, samples,
#'     control = 'Healthy', method = 'mean', test =
#'         'wilcoxon'
#' )
calculate_difference <- function(x, samples = NULL, control, method = "mean", test = "wilcoxon",
    randomizations = 100, pcorr = "BH", assayno = 1, verbose = TRUE, paired = FALSE,
    exact = FALSE, pseudocount = 0, nthreads = 1) {
    # internal small helpers (kept here to avoid adding new files)
    .tsenat_prepare_df <- function(x, samples, assayno) {
        if (inherits(x, "RangedSummarizedExperiment") || inherits(x, "SummarizedExperiment")) {
            # allow samples to be NULL (use default 'sample_type' col)
            if (is.null(samples)) {
                if ("sample_type" %in% colnames(SummarizedExperiment::colData(x))) {
                  samples_col <- "sample_type"
                } else {
                  stop("When providing a SummarizedExperiment, supply 'samples' as a colData column name or call map_metadata() to populate 'sample_type'",
                    call. = FALSE)
                }
            } else {
                if (length(samples) != 1) {
                  stop("'samples' must be a single colData column.", call. = FALSE)
                }
                samples_col <- samples
            }
            samples_vec <- SummarizedExperiment::colData(x)[[samples_col]]
            if (!is.numeric(assayno) || length(SummarizedExperiment::assays(x)) <
                assayno) {
                stop("Invalid 'assayno'.", call. = FALSE)
            }
            df <- as.data.frame(SummarizedExperiment::assays(x)[[assayno]])
            genes <- rownames(df)
            df <- cbind(genes = genes, df)
            list(df = df, samples = samples_vec)
        } else {
            df <- as.data.frame(x)
            list(df = df, samples = samples)
        }
    }

    .tsenat_sample_matrix <- function(dfr) {
        as.matrix(dfr[, -c(1, ncol(dfr) - 1, ncol(dfr)), drop = FALSE])
    }

    # Validate input container Reject matrices explicitly (tests expect this
    # error for matrix input)
    if (is.matrix(x)) {
        stop("Input type unsupported; see ?calculate_difference.", call. = FALSE)
    }
    if (!(is.data.frame(x) || inherits(x, "RangedSummarizedExperiment") || inherits(x,
        "SummarizedExperiment"))) {
        stop("Input data type not supported; see ?calculate_difference.", call. = FALSE)
    }

    # prepare data.frame and sample vector (handles SummarizedExperiment)
    pd <- .tsenat_prepare_df(x, samples, assayno)
    df <- pd$df
    samples <- pd$samples

    # Partition and validate inputs
    part <- .tsenat_calculate_difference_partition(df = df, samples = samples, control = control,
        method = method, test = test, pcorr = pcorr, randomizations = randomizations,
        verbose = verbose)
    df <- part$df
    samples <- part$samples
    groups <- part$groups
    idx1 <- part$idx1
    idx2 <- part$idx2
    df_keep <- part$df_keep
    df_small <- part$df_small

    result_list <- list()

    # Helper to extract the numeric matrix of sample columns (keeps original
    # order)
    sample_matrix <- .tsenat_sample_matrix

    if (nrow(df_keep) > 0) {
        if (nrow(df_small) > 0 && verbose) {
            message(sprintf("Note: %d genes excluded due to low sample counts.",
                nrow(df_small)))
        }
        ymat <- sample_matrix(df_keep)
        # p-value calculation
        if (test == "wilcoxon") {
            ptab <- wilcoxon(ymat, samples, pcorr = pcorr, paired = paired, exact = exact,
                nthreads = nthreads)
            # wilcoxon should return a data.frame of p-values named
            # appropriately
        } else {
            ptab <- label_shuffling(ymat, samples, control, method, randomizations = randomizations,
                pcorr = pcorr, paired = paired, nthreads = nthreads)
        }
        result_list$tested <- data.frame(genes = df_keep[, 1], calculate_fc(ymat,
            samples, control, method, pseudocount = pseudocount), ptab, stringsAsFactors = FALSE)
    }

    if (nrow(df_small) > 0) {
        small_mat <- sample_matrix(df_small)
        result_list$small <- data.frame(genes = df_small[, 1], calculate_fc(small_mat,
            samples, control, method, pseudocount = pseudocount), raw_p_values = NA,
            adjusted_p_values = NA, stringsAsFactors = FALSE)
    }

    # Combine results preserving tested rows first
    if (length(result_list) == 0) {
        return(data.frame())
    }
    res <- do.call(rbind, result_list)
    rownames(res) <- NULL
    if (!("log2_fold_change" %in% colnames(res)) && ("log2FC" %in% colnames(res))) {
        res$log2_fold_change <- res$log2FC
    }
    return(res)
}


#' Linear-model interaction test for Tsallis entropy   For each gene, fit a
#' linear model of the form `entropy ~ q * group` and  extract the p-value for
#' the interaction term (whether the effect of `q`  differs between groups). The
#' function expects a `SummarizedExperiment`  produced by
#' `calculate_diversity()` when multiple `q` values have been  computed (column
#' names contain `_q=`).
#' @param se A `SummarizedExperiment` containing a `diversity` assay produced
#' by `calculate_diversity(..., q = <vector>)`.
#' @param sample_type_col Optional column name in `colData(se)` that contains
#' a grouping factor for samples (character). If `NULL`, the function will
#' attempt to infer group from column names (suffix `_N` interpreted as
#' 'Normal').
#' @param min_obs Minimum number of non-NA observations required to fit a
#' model for a gene (default: 10).
#' @param method Modeling method to use for interaction testing: one of
#' \code{c('linear', 'lmm', 'gam', 'fpca')} (default: 'linear').
#' @param pvalue Type of p-value to compute for linear mixed models: one of
#' \code{c('satterthwaite', 'lrt', 'both')} (default: 'satterthwaite').
#' @param subject_col Optional column name in `colData(se)` that contains
#' subject/individual identifiers for paired or repeated-measures designs
#' (character). If provided with `method = 'lmm'`, used as random effect.
#' @param paired Logical; whether samples are paired (default: FALSE).
#' @param nthreads Number of threads (mc.cores) to use for parallel processing
#' (default: 1).
#' @param assay_name Name of the assay in the SummarizedExperiment to use
#' (default: 'diversity').
#' @param verbose Logical; whether to print progress messages during execution
#' (default: FALSE).
#' @return A data.frame with columns `gene`, `p_interaction`, and
#' `adj_p_interaction`, ordered by ascending `p_interaction`.
#' @export
#' @examples
#' data('tcga_brca_luma', package = 'TSENAT')
#' rc <- as.matrix(tcga_brca_luma[1:20, -1, drop = FALSE])
#' gs <- tcga_brca_luma[1:20, 1]
#' se <- calculate_diversity(rc, gs, q = c(0.1, 1), norm = TRUE)
#' # Provide a minimal sample-type mapping so the example runs during checks
#' SummarizedExperiment::colData(se) <- S4Vectors::DataFrame(
#'     sample_type = rep(c('Normal', 'Tumor'), length.out = ncol(se)),
#'     row.names = colnames(se)
#' )
#' calculate_lm_interaction(se, sample_type_col = 'sample_type')
calculate_lm_interaction <- function(se, sample_type_col = NULL, min_obs = 10, method = c("linear",
    "lmm", "gam", "fpca"), pvalue = c("satterthwaite", "lrt", "both"), subject_col = NULL,
    paired = FALSE, nthreads = 1, assay_name = "diversity", verbose = FALSE) {
    method <- match.arg(method)
    pvalue <- match.arg(pvalue)
    if (verbose) {
        message("[calculate_lm_interaction] method=", method)
    }
    # internal flags: keep these internal to avoid documenting them in Rd
    suppress_lme4_warnings <- TRUE
    progress <- FALSE
    if (!requireNamespace("SummarizedExperiment", quietly = TRUE)) {
        stop("SummarizedExperiment required")
    }

    mat <- SummarizedExperiment::assay(se, assay_name)
    if (is.null(mat)) {
        stop(sprintf("Assay '%s' not found in SummarizedExperiment", assay_name))
    }

    sample_q <- colnames(mat)
    if (is.null(sample_q) || length(sample_q) == 0) {
        stop("No column names found on diversity assay")
    }

    # parse sample names and q values from column names like 'Sample_q=0.01'
    sample_names <- sub("_q=.*", "", sample_q)
    has_q <- grepl("_q=", sample_q)
    if (!any(has_q)) {
        stop("Could not parse q values; expected '_q=' in column names", call. = FALSE)
    }
    if (!all(has_q)) {
        stop("Some column names are missing '_q='; ensure all diversity columns include a q value",
            call. = FALSE)
    }
    q_vals <- as.numeric(sub(".*_q=", "", sample_q))

    # determine group for each sample
    sample_type_in_coldata <- !is.null(sample_type_col) && sample_type_col %in% colnames(SummarizedExperiment::colData(se))
    if (sample_type_in_coldata) {
        st <- as.character(SummarizedExperiment::colData(se)[, sample_type_col])
        names(st) <- SummarizedExperiment::colData(se)$samples %||% colnames(mat)
        # when user provides a sample_type_col, index by the sample names
        # (strip q suffix)
        group_vec <- unname(st[sample_names])
    } else {
        stop("No sample grouping found: please supply `sample_type_col` or map sample",
            "types into `colData(se)` before calling calculate_lm_interaction().",
            call. = FALSE)
    }

    if (verbose && progress) {
        message("[calculate_lm_interaction] parsed samples and groups")
    }
    all_results <- list()
    fit_one <- function(g) {
        .tsenat_fit_one_interaction(g = g, se = se, mat = mat, q_vals = q_vals, sample_names = sample_names,
            group_vec = group_vec, method = method, pvalue = pvalue, subject_col = subject_col,
            paired = paired, min_obs = min_obs, verbose = verbose, suppress_lme4_warnings = suppress_lme4_warnings,
            progress = progress)
    }

    if (nthreads > 1) {
        res_list <- .tsenat_bplapply(rownames(mat), fit_one, nthreads = nthreads)
    } else {
        res_list <- lapply(rownames(mat), fit_one)
    }
    all_results <- Filter(Negate(is.null), res_list)

    if (length(all_results) == 0) {
        return(data.frame())
    }
    res <- do.call(rbind, all_results)
    res$adj_p_interaction <- stats::p.adjust(res$p_interaction, method = "BH")
    # Sort first by adjusted p-values, then by raw p-values for stable ordering
    res <- res[order(res$adj_p_interaction, res$p_interaction), , drop = FALSE]
    rownames(res) <- NULL

    .tsenat_report_fit_summary(res, verbose = verbose)

    # Return the result data.frame (do not attach to or return a
    # SummarizedExperiment)
    return(res)
}

# small helper (replacement for `%||%`) to provide default when NULL
`%||%` <- function(a, b) if (is.null(a)) b else a

#' Calculate Tsallis diversity values for transcripts grouped by gene
#' @param x Numeric matrix or data.frame of transcript-level expression
#' values (rows = transcripts, columns = samples).
#' @param genes Character vector with length equal to nrow(x) assigning each
#' transcript to a gene.
#' @param norm Logical; if TRUE normalize Tsallis entropy values per gene.
#' @param q Numeric scalar or vector of q values to evaluate.
#' @param verbose Logical; show diagnostic messages when TRUE.
#' @param what Which quantity to return from `calculate_tsallis_entropy`:
#' 'S' (Tsallis entropy) or 'D' (Hill numbers) (default: 'S').
#' @param nthreads Number of threads for parallel processing (default: 1).
#' Set to > 1 to parallelize per-gene entropy calculations.
#' @return A data.frame with genes in the first column and per-sample (and
#' per-q) Tsallis entropy values in subsequent columns.
#' @import stats
#' @export
#' @examples
#' # Create a small transcript expression matrix (4 transcripts x 2 samples)
#' mat <- matrix(c(10, 5, 0, 0, 2, 8, 3, 7), nrow = 4, byrow = TRUE)
#' colnames(mat) <- c('Sample1', 'Sample2')
#' genes <- c('geneA', 'geneA', 'geneB', 'geneB')
#' 
#' # Calculate Tsallis diversity for q=1
#' result <- calculate_method(mat, genes, norm = TRUE, q = 1)
#' result
calculate_method <- function(x, genes, norm = TRUE, verbose = FALSE, q = 2, what = c("S",
    "D"), nthreads = 1) {
    what <- match.arg(what)
    # validate q
    if (!is.numeric(q) || any(q <= 0)) {
        stop("Argument 'q' must be numeric and greater than 0.", call. = FALSE)
    }
    # cannot use aggregate because calculate_tsallis_entropy may return
    # multiple values when length(q) > 1
    gene_levels <- unique(genes)
    # ensure column names order matches the order used when constructing the
    # result matrix (samples vary outer, q varies inner). If sample names are
    # missing, synthesize deterministic names so column creation still works.
    sample_names <- colnames(x)
    if (is.null(sample_names)) {
        sample_names <- paste0("Sample", seq_len(ncol(x)))
    }
    coln <- as.vector(t(outer(sample_names, q, function(s, qq) paste0(s, "_q=", qq))))
    rown <- gene_levels

    # compute requested quantity ('S' or 'D') in parallel
    result_list <- .tsenat_bplapply(gene_levels, function(gene) {
        .tsenat_tsallis_row(x = x, genes = genes, gene = gene, q = q, norm = norm,
            what = what)
    }, nthreads = nthreads)

    # Convert list to matrix (each element is a named vector) result_list is a
    # list of vectors; combine them into a matrix
    result_mat <- t(vapply(result_list, identity, FUN.VALUE = setNames(numeric(length(coln)),
        coln)))
    colnames(result_mat) <- coln
    rownames(result_mat) <- rown
    out_df <- data.frame(Gene = rown, result_mat, check.names = FALSE)
    if (all(rowSums(!is.na(result_mat)) == 0)) {
        out_df <- data.frame(Gene = character(0))
        for (nm in coln) out_df[[nm]] <- numeric(0)
        x <- out_df
        return(x)
    }
    x <- out_df
    y <- x[apply(x[2:ncol(x)], 1, function(X) all(is.finite(X))), ]
    if (nrow(x) - nrow(y) > 0 && verbose == TRUE) {
        message(sprintf("Note: %d genes excluded.", nrow(x) - nrow(y)))
    }
    colnames(y)[1] <- "Gene"
    return(y)
}

# Helper functions for parallel processing using BiocParallel and parallel
# package

# Select and initialize parallel backend @param nthreads Number of threads to
# use (default: 1) @return BiocParallel BPPARAM object
.tsenat_get_bpparam <- function(nthreads = 1) {
    if (nthreads <= 1) {
        return(BiocParallel::SerialParam())
    }

    # On Unix-like systems, use MulticoreParam; on Windows use SnowParam
    if (.Platform$OS.type == "unix") {
        return(BiocParallel::MulticoreParam(workers = nthreads))
    } else {
        return(BiocParallel::SnowParam(workers = nthreads))
    }
}

# Apply function in parallel using the best available backend @param X Vector
# or list to iterate over @param FUN Function to apply @param nthreads Number
# of threads (default: 1) @param SIMPLIFY Whether to simplify results (default:
# TRUE) @param FUN.VALUE Template for vapply (optional)
.tsenat_bplapply <- function(X, FUN, nthreads = 1, SIMPLIFY = TRUE, FUN.VALUE = NULL) {
    if (nthreads <= 1) {
        # Serial execution
        if (is.null(FUN.VALUE)) {
            return(lapply(X, FUN))
        } else {
            return(unname(vapply(X, FUN, FUN.VALUE = FUN.VALUE)))
        }
    }

    # Parallel execution
    bpparam <- .tsenat_get_bpparam(nthreads)

    if (is.null(FUN.VALUE)) {
        return(BiocParallel::bplapply(X, FUN, BPPARAM = bpparam))
    } else {
        # Use bplapply and then simplify with vapply
        result_list <- BiocParallel::bplapply(X, FUN, BPPARAM = bpparam)
        return(vapply(result_list, identity, FUN.VALUE = FUN.VALUE))
    }
}

# Apply function over two vectors in parallel @param X First vector or list
# @param Y Second vector or list @param FUN Function to apply (takes two
# arguments) @param nthreads Number of threads (default: 1)
.tsenat_bpmapply <- function(X, Y, FUN, nthreads = 1) {
    if (nthreads <= 1) {
        return(unname(mapply(FUN, X, Y, SIMPLIFY = FALSE)))
    }

    bpparam <- .tsenat_get_bpparam(nthreads)
    return(unname(BiocParallel::bpmapply(FUN, X, Y, BPPARAM = bpparam, SIMPLIFY = FALSE)))
}

1		# Helpers for Tsallis entropy calculations
2
3		.tsenat_calc_S <- function(p, q, tol, n, log_base, norm) {
4	4839x	vapply(q, function(qi) {
5	5707x	if (abs(qi - 1) < tol) {
6	316x	sh <- -sum(ifelse(p > 0, p * log(p, base = log_base), 0))
7	316x	if (norm) {
8	313x	if (n <= 1) {
9		# Single isoform: normalization is undefined (0/0) Return NaN
10		# to indicate undefined normalization
11	26x	sh <- NaN
12		} else {
13	287x	sh <- sh/log(n, base = log_base)
14		}
15		}
16	316x	return(sh)
17		} else {
18	5391x	ts <- (1 - sum(p^qi))/(qi - 1)
19	5391x	if (norm) {
20	5384x	max_ts <- (1 - n^(1 - qi))/(qi - 1)
21		# Handle single-isoform case: ts is 0, max_ts is also 0, so 0/0
22		# is undefined Return NaN to indicate undefined normalization
23	5384x	if (abs(max_ts) < tol \|\| n <= 1) {
24	19x	ts <- NaN
25		} else {
26	5365x	ts <- ts/max_ts
27		}
28		}
29	5391x	return(ts)
30		}
31	4839x	}, numeric(1))
32		}
33
34		.tsenat_calc_D <- function(p, q, tol, log_base) {
35	4839x	vapply(q, function(qi) {
36	5707x	if (abs(qi - 1) < tol) {
37	316x	sh <- -sum(ifelse(p > 0, p * log(p, base = log_base), 0))
38	316x	D1 <- (log_base)^(sh)
39	316x	return(D1)
40		} else {
41	5391x	spq <- sum(p^qi)
42	5391x	Dq <- spq^(1/(1 - qi))
43	5391x	return(Dq)
44		}
45	4839x	}, numeric(1))
46		}
47		# Input preparation
48		.tsenat_prepare_diversity_input <- function(x, genes = NULL, tpm = FALSE, assayno = 1,
49		verbose = FALSE) {
50	45x	if (!(is.matrix(x) \|\| is.data.frame(x) \|\| is.list(x) \|\| is(x, "DGEList") \|\| is(x,
51	45x	"RangedSummarizedExperiment") \|\| is(x, "SummarizedExperiment"))) {
52	2x	stop("Input data type is not supported! Please use ?calculate_diversity to see the possible arguments and details.",
53	2x	call. = FALSE)
54		}
55
56	43x	if (is(x, "data.frame")) {
57	3x	x <- as.matrix(x)
58		}
59
60	43x	if (tpm == TRUE && !is.list(x) && verbose == TRUE) {
61	2x	message("Note: tpm as a logical argument is only interpreted in case of",
62	2x	" tximport lists.")
63		}
64
65	43x	se_assay_mat <- NULL
66	43x	if (is.list(x)) {
67	9x	if (length(x) == 4 && "counts" %in% names(x)) {
68	4x	if (tpm == FALSE) {
69	2x	x <- as.matrix(x$counts)
70		}
71	4x	if (tpm == TRUE) {
72	2x	x <- as.matrix(x$abundance)
73		}
74	5x	} else if (is(x, "DGEList")) {
75	3x	x <- as.matrix(x$counts)
76	3x	if (verbose == TRUE) {
77	2x	message("Note: calculate_diversity methods are only applicable",
78	2x	" if your DGEList contains transcript-level expression", " data.")
79		}
80	3x	if (tpm == TRUE && verbose == TRUE) {
81	1x	message("Note: tpm as a logical argument is only interpreted", " in case of tximport lists.")
82		}
83		} else {
84	2x	stop("The package cannot find any expression data in your input.", call. = FALSE)
85		}
86		}
87
88	41x	if (is(x, "RangedSummarizedExperiment") \|\| is(x, "SummarizedExperiment")) {
89	8x	md <- NULL
90	8x	try(md <- S4Vectors::metadata(x), silent = TRUE)
91	8x	if (!is.null(md) && !is.null(md$readcounts)) {
92	4x	se_assay_mat <- as.matrix(md$readcounts)
93	4x	x <- se_assay_mat
94		} else {
95	4x	assays_len <- length(SummarizedExperiment::assays(x))
96	4x	if (!is.numeric(assayno) \|\| assays_len < assayno) {
97	3x	stop("Please provide a valid assay number.", call. = FALSE)
98		}
99	1x	se_assay_mat <- as.matrix(SummarizedExperiment::assays(x)[[assayno]])
100	1x	x <- se_assay_mat
101		}
102	5x	if (is.null(genes)) {
103	5x	if (exists("se_assay_mat") && !is.null(md) && !is.null(md$tx2gene) &&
104	5x	is.data.frame(md$tx2gene)) {
105	4x	txmap <- md$tx2gene
106	4x	tx_col <- if ("Transcript" %in% colnames(txmap)) {
107	3x	"Transcript"
108		} else {
109	1x	colnames(txmap)[1]
110		}
111	4x	gene_col <- if ("Gen" %in% colnames(txmap)) {
112	3x	"Gen"
113		} else {
114	1x	colnames(txmap)[2]
115		}
116	4x	genes <- as.character(txmap[[gene_col]][match(rownames(se_assay_mat),
117	4x	txmap[[tx_col]])])
118	4x	rownames(x) <- NULL
119		} else {
120	1x	genes <- rownames(x)
121		# keep transcript rownames available for downstream metadata
122		# (we will store original transcript-level counts separately)
123	1x	rownames(x) <- NULL
124		}
125
126	5x	if (is.null(genes)) {
127	!	stop("Please construct a valid gene set for your ", "SummarizedExperiment.",
128	!	call. = FALSE)
129		}
130		}
131		}
132
133	38x	list(x = x, genes = genes, se_assay_mat = se_assay_mat)
134		}

1		# Helper utilities for calculate_difference
2
3		.tsenat_calculate_difference_partition <- function(df, samples, control, method,
4		test, pcorr, randomizations, verbose) {
5	36x	if (ncol(df) - 1 != length(samples)) {
6	4x	stop("Column count doesn't match length(samples).", call. = FALSE)
7		}
8	32x	uniq_groups <- unique(as.character(samples))
9	32x	if (length(uniq_groups) > 2) {
10	4x	stop("More than two conditions; provide exactly two.", call. = FALSE)
11		}
12	28x	if (length(uniq_groups) < 2) {
13	4x	stop("Fewer than two conditions; provide exactly two.", call. = FALSE)
14		}
15	24x	if (!(control %in% uniq_groups)) {
16	4x	stop("Control sample type not found in samples.", call. = FALSE)
17		}
18
19	20x	case_label <- setdiff(uniq_groups, control)
20	20x	groups <- c(case_label, control)
21
22	20x	if (!(method %in% c("mean", "median"))) {
23	1x	stop("Invalid method; see ?calculate_difference.", call. = FALSE)
24		}
25	19x	if (!(test %in% c("wilcoxon", "shuffle"))) {
26	1x	stop("Invalid test method; see ?calculate_difference.", call. = FALSE)
27		}
28	18x	valid_pcorr <- c("holm", "hochberg", "hommel", "bonferroni", "BH", "BY", "fdr",
29	18x	"none")
30	18x	if (!(pcorr %in% valid_pcorr)) {
31	1x	stop("Invalid p-value correction; see ?calculate_difference.", call. = FALSE)
32		}
33
34	17x	tab <- table(samples)
35	17x	if (test == "wilcoxon") {
36	12x	if (randomizations != 100 && verbose) {
37	2x	message("'randomizations' ignored for wilcoxon.")
38		}
39	12x	if (any(tab < 3) \|\| sum(tab) < 8) {
40	3x	warning("Low sample size for wilcoxon.", call. = FALSE)
41		}
42		}
43	17x	if (test == "shuffle") {
44	5x	if (sum(tab) <= 5) {
45	2x	warning("Low sample size for label shuffling.", call. = FALSE)
46		}
47	5x	if (sum(tab) > 5 && sum(tab) < 10) {
48	2x	warning("Label shuffling may be unreliable.", call. = FALSE)
49		}
50		}
51
52	17x	idx_case <- which(samples == groups[1])
53	17x	idx_control <- which(samples == groups[2])
54	17x	idx1 <- idx_case
55	17x	idx2 <- idx_control
56
57	17x	df$cond_1 <- rowSums(!is.na(df[, idx1 + 1, drop = FALSE]))
58	17x	df$cond_2 <- rowSums(!is.na(df[, idx2 + 1, drop = FALSE]))
59
60	17x	if (test == "wilcoxon") {
61	12x	keep_mask <- (df$cond_1 >= 3 & df$cond_2 >= 3 & (df$cond_1 + df$cond_2) >=
62	12x	8)
63		} else {
64	5x	keep_mask <- (df$cond_1 + df$cond_2) >= 5
65		}
66
67	17x	df_keep <- df[keep_mask, , drop = FALSE]
68	17x	df_small <- df[!keep_mask, , drop = FALSE]
69
70	17x	list(df = df, samples = samples, groups = groups, idx1 = idx1, idx2 = idx2, df_keep = df_keep,
71	17x	df_small = df_small)
72		}
73
74		# Helpers for calculate_fc
75		.tsenat_aggregate_fc_values <- function(x, samples, method, control) {
76	1796x	if (method == "mean") {
77	1590x	value <- aggregate(t(x), by = list(samples), mean, na.rm = TRUE)
78	206x	} else if (method == "median") {
79	206x	value <- aggregate(t(x), by = list(samples), median, na.rm = TRUE)
80		} else {
81	!	stop("Invalid method; must be 'mean' or 'median'")
82		}
83
84	1796x	sorted <- value[value$Group.1 != control, ]
85	1796x	sorted[2, ] <- value[value$Group.1 == control, ]
86	1796x	value <- t(sorted[, -1])
87	1796x	value[is.na(value[, 1]), c(1)] <- NA
88	1796x	value[is.na(value[, 2]), c(2)] <- NA
89	1796x	return(list(value = value, sorted = sorted))
90		}
91
92		.tsenat_apply_pseudocount <- function(value, pseudocount) {
93	1798x	if (!is.numeric(pseudocount) \|\| length(pseudocount) != 1) {
94	!	pseudocount <- 0
95		}
96	1798x	if (pseudocount <= 0) {
97	1796x	pos_vals <- value[!is.na(value) & value > 0]
98	1796x	if (length(pos_vals) > 0) {
99	1794x	pc <- min(pos_vals, na.rm = TRUE)/2
100		} else {
101	2x	pc <- 1e-06
102		}
103		} else {
104	2x	pc <- pseudocount
105		}
106	1798x	replace_idx <- !is.na(value) & value <= 0
107	1798x	value[replace_idx] <- pc
108	1798x	return(value)
109		}
110
111		# Paired permutation helpers
112		.tsenat_permute_paired <- function(x, samples, control, method, randomizations, paired_method) {
113	14x	ncols <- ncol(x)
114	14x	if (ncols%%2 != 0) {
115	1x	stop("Paired permutation requires an even number of samples and paired column ordering",
116	1x	call. = FALSE)
117		}
118	13x	npairs <- ncols/2
119	13x	if (paired_method == "swap") {
120	5x	perm_mat <- matrix(NA_real_, nrow = nrow(x), ncol = randomizations)
121	5x	for (r in seq_len(randomizations)) {
122	310x	swap <- sample(c(TRUE, FALSE), size = npairs, replace = TRUE)
123	310x	perm_samples <- samples
124	310x	for (p in seq_len(npairs)) {
125	820x	if (swap[p]) {
126	431x	i1 <- (p - 1) * 2 + 1
127	431x	i2 <- i1 + 1
128	431x	perm_samples[c(i1, i2)] <- perm_samples[c(i2, i1)]
129		}
130		}
131	310x	df_perm <- calculate_fc(x, perm_samples, control, method)
132	310x	perm_mat[, r] <- as.numeric(df_perm[, 4])
133		}
134	5x	return(perm_mat)
135	8x	} else if (paired_method == "signflip") {
136	8x	total_comb <- 2^npairs
137	8x	if (randomizations <= 0 \|\| randomizations >= total_comb) {
138	7x	combos <- expand.grid(rep(list(c(0, 1)), npairs))
139	7x	nrep <- nrow(combos)
140	7x	perm_mat <- matrix(NA_real_, nrow = nrow(x), ncol = nrep)
141	7x	for (r in seq_len(nrep)) {
142	36x	swap <- as.logical(as.integer(combos[r, ]))
143	36x	perm_samples <- samples
144	36x	for (p in seq_len(npairs)) {
145	88x	if (swap[p]) {
146	44x	i1 <- (p - 1) * 2 + 1
147	44x	i2 <- i1 + 1
148	44x	perm_samples[c(i1, i2)] <- perm_samples[c(i2, i1)]
149		}
150		}
151	36x	df_perm <- calculate_fc(x, perm_samples, control, method)
152	36x	perm_mat[, r] <- as.numeric(df_perm[, 4])
153		}
154	7x	return(perm_mat)
155		} else {
156	1x	perm_mat <- matrix(NA_real_, nrow = nrow(x), ncol = randomizations)
157	1x	for (r in seq_len(randomizations)) {
158	2x	swap <- sample(c(TRUE, FALSE), size = npairs, replace = TRUE)
159	2x	perm_samples <- samples
160	2x	for (p in seq_len(npairs)) {
161	4x	if (swap[p]) {
162	1x	i1 <- (p - 1) * 2 + 1
163	1x	i2 <- i1 + 1
164	1x	perm_samples[c(i1, i2)] <- perm_samples[c(i2, i1)]
165		}
166		}
167	2x	df_perm <- calculate_fc(x, perm_samples, control, method)
168	2x	perm_mat[, r] <- as.numeric(df_perm[, 4])
169		}
170	1x	return(perm_mat)
171		}
172		}
173		}

1		# Helpers for plot_top_transcripts internals
2
3		.ptt_select_genes_from_res <- function(res, top_n) {
4	9x	if (is.null(res)) {
5	2x	stop("Either 'gene' or 'res' must be provided")
6		}
7	7x	if (!("genes" %in% colnames(res))) {
8	2x	stop("Provided 'res' must contain a 'genes' column")
9		}
10	5x	if ("adjusted_p_values" %in% colnames(res)) {
11	3x	ord <- order(res$adjusted_p_values, na.last = NA)
12	2x	} else if ("raw_p_values" %in% colnames(res)) {
13	2x	ord <- order(res$raw_p_values, na.last = NA)
14		} else {
15	!	ord <- seq_len(nrow(res))
16		}
17	5x	genes_sel <- as.character(res$genes[ord])
18	5x	genes_sel <- unique(genes_sel)
19	5x	head(genes_sel, top_n)
20		}
21
22		.ptt_infer_samples_from_coldata <- function(coldata, counts, sample_type_col) {
23	7x	if (is.character(coldata) && length(coldata) == 1) {
24	1x	if (!file.exists(coldata)) {
25	!	stop("coldata file not found: ", coldata)
26		}
27	1x	cdf <- utils::read.delim(coldata, header = TRUE, stringsAsFactors = FALSE)
28	6x	} else if (is.data.frame(coldata)) {
29	5x	cdf <- coldata
30		} else {
31	1x	stop("`coldata` must be a data.frame or path to a tab-delimited file")
32		}
33
34	6x	if (!is.null(rownames(cdf)) && all(colnames(counts) %in% rownames(cdf))) {
35	2x	as.character(cdf[colnames(counts), sample_type_col])
36		} else {
37	4x	sample_id_cols <- c("sample", "Sample", "sample_id", "id")
38	4x	sid <- intersect(sample_id_cols, colnames(cdf))
39	4x	if (length(sid) > 0) {
40	3x	sid <- sid[1]
41	3x	if (!all(colnames(counts) %in% as.character(cdf[[sid]]))) {
42	1x	stop("coldata sample id column does not match column names of counts")
43		}
44	2x	row_ix <- match(colnames(counts), as.character(cdf[[sid]]))
45	2x	as.character(cdf[[sample_type_col]][row_ix])
46		} else {
47	1x	stop("Could not match `coldata` rows to `counts` columns. Provide `samples` or a row-named `coldata`.")
48		}
49		}
50		}
51
52		.ptt_read_tx2gene <- function(tx2gene) {
53	8x	if (is.null(tx2gene)) {
54	1x	stop("`tx2gene` must be provided as a file path or data.frame (or include mapping in metadata of provided SummarizedExperiment)")
55		}
56	7x	if (is.character(tx2gene) && length(tx2gene) == 1) {
57	3x	if (!file.exists(tx2gene)) {
58	1x	stop("tx2gene file not found: ", tx2gene)
59		}
60	2x	mapping <- utils::read.delim(tx2gene, stringsAsFactors = FALSE, header = TRUE)
61	4x	} else if (is.data.frame(tx2gene)) {
62	4x	mapping <- tx2gene
63		} else {
64	!	stop("`tx2gene` must be provided as a file path or data.frame (or include mapping in metadata of provided SummarizedExperiment)")
65		}
66	6x	if (!all(c("Transcript", "Gen") %in% colnames(mapping))) {
67	2x	stop("tx2gene must have columns 'Transcript' and 'Gen'")
68		}
69	4x	mapping
70		}
71
72		.ptt_make_agg <- function(metric = c("median", "mean", "variance", "iqr")) {
73	7x	metric_choice <- match.arg(metric)
74	7x	agg_fun <- switch(metric_choice, median = function(x) stats::median(x, na.rm = TRUE),
75	7x	mean = function(x) base::mean(x, na.rm = TRUE), variance = function(x) {
76	1x	stats::var(x, na.rm = TRUE)
77	7x	}, iqr = function(x) stats::IQR(x, na.rm = TRUE))
78	7x	agg_label_metric <- if (metric_choice == "iqr") {
79	2x	"IQR"
80		} else {
81	5x	metric_choice
82		}
83	7x	agg_label <- sprintf("Transcript-level expression with metric %s", agg_label_metric)
84	7x	.cnt <- as.integer(getOption("TSENAT.plot_top_counter", 0)) + 1L
85	7x	options(TSENAT.plot_top_counter = .cnt)
86	7x	agg_label_unique <- agg_label
87	7x	list(metric_choice = metric_choice, agg_fun = agg_fun, agg_label_unique = agg_label_unique)
88		}
89
90		.ptt_build_tx_long <- function(gene_single, mapping, counts, samples, top_n) {
91	13x	txs <- mapping$Transcript[mapping$Gen == gene_single]
92	13x	txs <- intersect(txs, rownames(counts))
93	13x	if (length(txs) == 0) {
94	1x	stop("No transcripts found for gene: ", gene_single)
95		}
96	12x	if (!is.null(top_n)) {
97	12x	txs <- head(txs, top_n)
98		}
99	12x	mat <- counts[txs, , drop = FALSE]
100	12x	df_all <- as.data.frame(mat)
101	12x	df_all$tx <- rownames(mat)
102	12x	df_long <- tidyr::pivot_longer(df_all, -tx, names_to = "sample", values_to = "expr")
103	12x	df_long$group <- rep(samples, times = length(txs))
104	12x	list(df_long = df_long, txs = txs)
105		}
106
107		.ptt_aggregate_df_long <- function(df_long, agg_fun, pseudocount) {
108	12x	df_summary <- stats::aggregate(expr ~ tx + group, data = df_long, FUN = agg_fun)
109	12x	df_summary$log2expr <- log2(df_summary$expr + pseudocount)
110	12x	df_summary$tx <- factor(df_summary$tx, levels = unique(df_summary$tx))
111	12x	df_summary
112		}
113
114		.ptt_build_plot_from_summary <- function(df_summary, agg_label_unique, fill_limits = NULL) {
115	12x	p <- ggplot2::ggplot(df_summary, ggplot2::aes(x = group, y = tx, fill = log2expr)) +
116	12x	ggplot2::geom_tile(color = "white", width = 0.95, height = 0.95) + ggplot2::scale_fill_viridis_c(option = "viridis",
117	12x	direction = -1, na.value = "grey80", limits = fill_limits) + ggplot2::theme_minimal(base_size = 14) +
118	12x	ggplot2::labs(title = agg_label_unique, x = NULL, y = NULL, fill = "log2(expr)") +
119	12x	ggplot2::theme(axis.text.y = ggplot2::element_text(size = 12), axis.text.x = ggplot2::element_text(size = 12),
120	12x	plot.title = ggplot2::element_text(size = 16, hjust = 0.6), legend.position = "bottom",
121	12x	legend.key.width = ggplot2::unit(1.2, "cm"), plot.margin = ggplot2::margin(4,
122	12x	4, 4, 4)) + ggplot2::guides(fill = ggplot2::guide_colorbar(title.position = "top",
123	12x	barwidth = 6, barheight = 0.35))
124	12x	p
125		}
126
127		.ptt_combine_patchwork <- function(plots, agg_label_unique) {
128	8x	combined <- Reduce(`+`, plots) + patchwork::plot_layout(nrow = 1, guides = "collect") &
129	8x	ggplot2::theme(legend.position = "bottom")
130	8x	combined <- combined + patchwork::plot_annotation(title = agg_label_unique, theme = ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.6,
131	8x	size = 16, margin = ggplot2::margin(b = 10))))
132	8x	combined
133		}
134
135		.ptt_combine_cowplot <- function(plots, output_file = NULL, agg_label_unique) {
136	1x	p_for_legend <- plots[[1]] + ggplot2::theme(legend.position = "bottom")
137	1x	legend <- cowplot::get_legend(p_for_legend)
138	1x	plots_nolegend <- lapply(plots, function(pp) pp + ggplot2::theme(legend.position = "none"))
139	1x	grid <- cowplot::plot_grid(plotlist = plots_nolegend, nrow = 1, align = "hv")
140	1x	title_grob <- cowplot::ggdraw() + cowplot::draw_label(agg_label_unique, fontface = "plain",
141	1x	x = 0.6, hjust = 0.5, size = 16)
142	1x	result_plot <- cowplot::plot_grid(title_grob, grid, legend, ncol = 1, rel_heights = c(0.08,
143	1x	1, 0.08))
144	1x	if (!is.null(output_file)) {
145	!	ggplot2::ggsave(output_file, result_plot)
146	!	invisible(NULL)
147		}
148	1x	result_plot
149		}
150
151		.ptt_combine_grid <- function(plots, output_file = NULL, agg_label_unique) {
152	2x	plots_nolegend <- lapply(plots, function(pp) pp + ggplot2::theme(legend.position = "none"))
153	2x	grobs <- lapply(plots_nolegend, ggplot2::ggplotGrob)
154	2x	g_full <- ggplot2::ggplotGrob(plots[[1]])
155	2x	legend_idx <- which(vapply(g_full$grobs, function(x) x$name, character(1)) ==
156	2x	"guide-box")
157	2x	if (length(legend_idx)) {
158	1x	legend_grob <- g_full$grobs[[legend_idx[1]]]
159		} else {
160	1x	legend_grob <- NULL
161		}
162	2x	n <- length(grobs)
163	2x	heights <- grid::unit.c(grid::unit(0.6, "cm"), grid::unit(1, "null"), grid::unit(0.7,
164	2x	"cm"))
165	2x	if (!is.null(output_file)) {
166	1x	png(filename = output_file, width = 800 * n, height = 480, res = 150)
167	1x	.draw_transcript_grid(grobs, agg_label_unique, legend_grob, n, heights, to_file = output_file)
168	1x	invisible(NULL)
169		} else {
170	1x	.draw_transcript_grid(grobs, agg_label_unique, legend_grob, n, heights)
171	1x	invisible(NULL)
172		}
173		}

1		# # Minimalized diversity functions: only Tsallis entropy retained
2
3		#' Calculate Tsallis entropy for a vector of transcript-level
4		#' expression values of one gene.
5		#'
6		#' @param x Vector of (non-negative) expression values.
7		#' @param q Tsallis entropy parameter (q > 0). Scalar or numeric vector
8		#' (default: 2).
9		#' @param norm Logical; if TRUE, normalize entropy by its theoretical maximum
10		#' (values in [0,1]).
11		#' @param what Which quantity to return: 'S' (Tsallis entropy), 'D' (Hill
12		#' numbers), or 'both'.
13		#' @param log_base Base of the logarithm used for Shannon limits and
14		#' normalization
15		#' (default: \code{exp(1)}).
16		#' @export
17		#' @return For `what = 'S'` or `what = 'D'`: a numeric vector
18		#' (named when length(q) > 1). For `what = 'both'`: a list with
19		#' components `$S` and `$D`.
20		#' @details
21		#' Implements S_q = (1 - sum p^q)/(q - 1) and D_q = (sum p^q)^(1/(1-q)).
22		#' Uses the q->1 limits (Shannon entropy and its exponential). Natural
23		#' logarithms are used for the q->1 limit and for normalization.
24		#' @examples
25		#' x <- c(10, 5, 0)
26		#' calculate_tsallis_entropy(x, q = c(0.5, 1, 2), norm = TRUE)
27		calculate_tsallis_entropy <- function(x, q = 2, norm = TRUE, what = c("S", "D", "both"),
28		log_base = exp(1)) {
29	4875x	what <- match.arg(what)
30	4875x	if (!is.numeric(q)) {
31	1x	stop("q must be numeric.")
32		}
33	4874x	if (any(q <= 0)) {
34	5x	stop("q must be greater than 0.")
35		}
36	4869x	if (!is.numeric(x)) {
37	1x	stop("x must be numeric")
38		}
39
40	4868x	n <- length(x)
41		# If all counts sum to zero, return NA; allow single-element vectors to
42		# proceed
43	4868x	if (sum(x, na.rm = TRUE) <= 0) {
44	31x	if (what == "both") {
45	1x	return(list(S = rep(NA_real_, length(q)), D = rep(NA_real_, length(q))))
46		}
47	30x	return(rep(NA_real_, length(q)))
48		}
49
50	4837x	p <- x/sum(x)
51
52	4837x	tol <- sqrt(.Machine$double.eps)
53	4837x	S_vec <- .tsenat_calc_S(p = p, q = q, tol = tol, n = n, log_base = log_base,
54	4837x	norm = norm)
55	4837x	D_vec <- .tsenat_calc_D(p = p, q = q, tol = tol, log_base = log_base)
56
57	4837x	if (what == "S") {
58	4822x	out <- S_vec
59	4822x	if (length(q) > 1) {
60	413x	names(out) <- paste0("q=", q)
61		}
62	4822x	if (length(q) == 1) {
63	4409x	return(unname(out))
64		}
65	413x	return(out)
66		}
67	15x	if (what == "D") {
68	15x	out <- D_vec
69	15x	if (length(q) > 1) {
70	9x	names(out) <- paste0("q=", q)
71		}
72	15x	if (length(q) == 1) {
73	6x	return(unname(out))
74		}
75	9x	return(out)
76		}
77		# both
78	!	names(S_vec) <- paste0("q=", q)
79	!	names(D_vec) <- paste0("q=", q)
80	!	return(list(S = S_vec, D = D_vec))
81		}

1		#' Calculate Tsallis diversity per gene across samples
2		#'
3		#' @param x A numeric matrix or data.frame of transcript-level expression
4		#' values (rows = transcripts, columns = samples), or a SummarizedExperiment-
5		#' like object.
6		#' @param tpm Logical. If TRUE and `x` is a tximport-style list, use the
7		#' `$abundance` matrix instead of `$counts`.
8		#' @param genes Character vector assigning each transcript (row) to a gene.
9		#' Must have length equal to nrow(x) or the number of transcripts in `x`.
10		#' @param norm Logical; if TRUE, normalize Tsallis entropy to [0,1] per gene.
11		#' @param assayno Integer assay index to use when `x` is a SummarizedExperiment.
12		#' @param verbose Logical; print diagnostic messages when TRUE.
13		#' @param q Numeric scalar or vector of Tsallis q values to evaluate (q > 0).
14		#' If length(q) > 1, the result will contain separate columns per sample and
15		#' q.
16		#' @param what Which quantity to return: 'S' for Tsallis entropy or 'D' for Hill
17		#' numbers.
18		#' @param nthreads Number of threads for parallel processing (default: 1).
19		#' Set to > 1 to parallelize per-gene entropy calculations.
20		#' @return A \link[SummarizedExperiment]{SummarizedExperiment} with assay
21		#' `diversity` containing per-gene diversity values.
22		#' @import methods
23		#' @importFrom SummarizedExperiment SummarizedExperiment assays assay rowData
24		#' colData
25		#' @export
26		#' @examples
27		#' data('tcga_brca_luma', package = 'TSENAT')
28		#' rc <- as.matrix(tcga_brca_luma[1:20, -1, drop = FALSE])
29		#' gs <- tcga_brca_luma[1:20, 1]
30		#' se <- calculate_diversity(rc, gs, q = 0.1, norm = TRUE)
31		#' SummarizedExperiment::assay(se)[1:3, 1:3]
32		calculate_diversity <- function(x, genes = NULL, norm = TRUE, tpm = FALSE, assayno = 1,
33		verbose = FALSE, q = 2, what = c("S", "D"), nthreads = 1) {
34		# Normalize and validate input data, extract matrix and gene mapping
35	30x	inp <- .tsenat_prepare_diversity_input(x = x, genes = genes, tpm = tpm, assayno = assayno,
36	30x	verbose = verbose)
37	28x	x <- inp$x
38	28x	genes <- inp$genes
39	28x	se_assay_mat <- inp$se_assay_mat
40
41	28x	if (!is.numeric(x)) {
42	1x	stop("Input data must be numeric!", call. = FALSE)
43		}
44
45	27x	if (any(is.na(x))) {
46	1x	stop("The data contains NA as expression values. NAs are not allowed", " in the input.",
47	1x	call. = FALSE)
48		}
49
50	26x	if (nrow(x) != length(genes)) {
51	1x	stop("The number of rows is not equal to the given gene set.", call. = FALSE)
52		}
53
54	25x	what <- match.arg(what)
55		# validate q values (Tsallis parameter must be > 0)
56	25x	if (!is.numeric(q) \|\| any(q <= 0)) {
57	3x	stop("Argument 'q' must be numeric and greater than 0.", call. = FALSE)
58		}
59		# keep a copy of transcript-level counts when available (non-null)
60	22x	if (!exists("se_assay_mat")) {
61	!	se_assay_mat <- x
62		}
63
64	22x	result <- calculate_method(x, genes, norm, verbose = verbose, q = q, what = what,
65	22x	nthreads = nthreads)
66
67		# Prepare assay and row/col data
68	22x	result_assay <- result[, -1, drop = FALSE]
69	22x	result_rowData <- data.frame(genes = result[, 1], row.names = result[, 1])
70
71	22x	if (length(q) > 1) {
72	8x	col_split <- do.call(rbind, strsplit(colnames(result)[-1], "_q="))
73	8x	col_ids <- paste(col_split[, 1], "_q=", col_split[, 2], sep = "")
74	8x	row_ids <- as.character(result[, 1])
75	8x	result_colData <- data.frame(samples = as.character(col_split[, 1]), q = as.numeric(col_split[,
76	8x	2]), row.names = col_ids, stringsAsFactors = FALSE)
77	8x	colnames(result_assay) <- col_ids
78	8x	rownames(result_assay) <- row_ids
79		} else {
80	14x	col_ids <- colnames(x)
81	14x	row_ids <- as.character(result[, 1])
82	14x	result_colData <- data.frame(samples = col_ids, row.names = col_ids)
83	14x	colnames(result_assay) <- col_ids
84	14x	rownames(result_assay) <- row_ids
85		}
86
87	22x	result_metadata <- list(method = "tsallis", norm = norm, q = q, what = what)
88
89		# if we have preserved the original transcript-level matrix, build a
90		# tx->gene mapping and make both available in metadata so downstream
91		# plotting helpers can find transcript IDs and their parent genes
92	22x	tx2gene_map <- NULL
93	22x	if (exists("se_assay_mat") && !is.null(rownames(se_assay_mat)) && length(genes) ==
94	22x	nrow(se_assay_mat)) {
95	2x	tx2gene_map <- data.frame(Transcript = rownames(se_assay_mat), Gen = as.character(genes),
96	2x	stringsAsFactors = FALSE)
97		}
98
99	22x	assays_list <- list()
100	22x	if (what == "S") {
101	21x	assays_list$diversity <- result_assay
102	1x	} else if (what == "D") {
103	1x	assays_list$hill <- result_assay
104		}
105
106	22x	result <- SummarizedExperiment::SummarizedExperiment(assays = assays_list, rowData = result_rowData,
107	22x	colData = result_colData, metadata = c(result_metadata, list(readcounts = if (exists("se_assay_mat")) se_assay_mat else NULL,
108	22x	tx2gene = tx2gene_map)))
109
110	22x	return(result)
111		}

1		#' Map external coldata into a SummarizedExperiment
2		#'
3		#' Map an external `coldata` table (sample IDs and a condition/label column)
4		#' into a `SummarizedExperiment` produced by `calculate_diversity()`. The
5		#' helper sets `colData(ts_se)$sample_type` when possible and records a
6		#' `sample_base` identifier for each column. Unmatched entries remain NA and
7		#' no automatic inference is attempted.
8		#' @param ts_se A `SummarizedExperiment` whose assay columns are named with
9		#' sample identifiers. Names may include per-sample annotations; mapping is
10		#' exact and case-sensitive unless you normalize identifiers beforehand.
11		#' @param coldata A data.frame with sample metadata (or `NULL`).
12		#' @param coldata_sample_col Name of the column in `coldata` containing sample
13		#' identifiers (default: 'Sample').
14		#' @param coldata_condition_col Name of the column in `coldata` with
15		#' condition/labels (default: 'Condition').
16		#' @param paired Logical; if `TRUE`, validate pairing and reorder columns so
17		#' that matched samples for each base are adjacent (default: `FALSE`). Use
18		#' `paired = TRUE` for paired analyses so downstream paired tests see
19		#' aligned samples regardless of the original `coldata` order.
20		#' @return The input `ts_se` with `colData(ts_se)$sample_type` populated when
21		#' possible. `colData(ts_se)$sample_base` is also added containing the base
22		#' sample identifier; rownames of `colData(ts_se)` are aligned to the assay
23		#' column names.
24		#' @export
25		#' @examples
26		#' data('tcga_brca_luma', package = 'TSENAT')
27		#' rc <- as.matrix(tcga_brca_luma[1:20, -1, drop = FALSE])
28		#' gs <- tcga_brca_luma[1:20, 1]
29		#' se <- calculate_diversity(rc, gs, q = 0.1, norm = TRUE)
30		#' sample_names <- sub('_q=.*', '', colnames(SummarizedExperiment::assay(se)))
31		#' coldata_df <- data.frame(Sample = sample_names, Condition = rep(c('A', 'B'),
32		#' length.out = ncol(se)
33		#' ))
34		#' map_metadata(se, coldata_df)
35		#' # Optionally validate pairs when appropriate
36		#' map_metadata(se, coldata_df, paired = TRUE)
37		map_metadata <- function(ts_se, coldata, coldata_sample_col = "Sample", coldata_condition_col = "Condition",
38		paired = FALSE) {
39	30x	if (is.null(coldata)) {
40	1x	return(ts_se)
41		}
42	29x	if (!is.data.frame(coldata) \|\| !all(c(coldata_sample_col, coldata_condition_col) %in%
43	29x	colnames(coldata))) {
44	2x	return(ts_se)
45		}
46		# Prepare canonical condition order and base extraction. `conds` is sorted
47		# for deterministic ordering when constructing paired column order.
48		# `coldata_base` removes a trailing role suffix (e.g. `_N`/`_T`) from the
49		# `coldata` sample IDs; note that assay column names may contain per-sample
50		# annotations (e.g. per-q tags) which are handled separately below when
51		# building `sample_base_names`.
52	27x	conds <- sort(unique(as.character(coldata[[coldata_condition_col]])))
53	27x	coldata_base <- sub("_[^_]+$", "", as.character(coldata[[coldata_sample_col]]))
54	27x	bases <- unique(coldata_base)
55
56		# Optionally validate that coldata defines pairing between conditions
57	27x	if (isTRUE(paired) && length(conds) >= 2) {
58	6x	unpaired <- vapply(bases, function(b) {
59	24x	length(unique(coldata[[coldata_condition_col]][coldata_base == b]))
60	6x	}, integer(1))
61	6x	bad <- bases[unpaired != length(conds)]
62	6x	if (length(bad) > 0) {
63	3x	bad_list <- paste(bad, collapse = ", ")
64	3x	cond_list <- paste(conds, collapse = ", ")
65	3x	msg <- paste0("Unpaired samples found in coldata for bases: ", bad_list,
66	3x	". Ensure each base has all conditions: ", cond_list)
67	3x	stop(msg, call. = FALSE)
68		}
69		}
70	24x	sample_base_names <- sub("_q=.*", "", colnames(SummarizedExperiment::assay(ts_se)))
71
72		# Reorder the SummarizedExperiment columns.
73	24x	base_names <- sample_base_names
74	24x	idx_list <- integer(0)
75	24x	if (isTRUE(paired)) {
76		# Build paired-aware order: for each base, list samples in the
77		# deterministic `conds` order so matched samples become adjacent.
78	3x	for (b in bases) {
79	6x	for (c in conds) {
80		# find sample names in coldata for this base+condition
81	12x	samples_for_pair <- as.character(coldata[[coldata_sample_col]])[coldata_base ==
82	12x	b & as.character(coldata[[coldata_condition_col]]) == c]
83	12x	if (length(samples_for_pair) == 0) {
84	!	next
85		}
86		# for each sample name, find matching assay columns (after
87		# q-stripping)
88	12x	for (s in samples_for_pair) {
89	12x	matches <- which(base_names == s)
90	12x	if (length(matches) > 0) {
91	12x	idx_list <- c(idx_list, matches)
92		}
93		}
94		}
95		}
96		# Preserve original order for any unmatched columns
97	3x	remaining <- setdiff(seq_along(base_names), idx_list)
98	3x	new_order <- c(idx_list, remaining)
99		} else {
100		# Non-paired: follow the order present in `coldata`
101	21x	ordered_samples <- as.character(coldata[[coldata_sample_col]])
102	21x	for (s in ordered_samples) {
103	103x	matches <- which(base_names == s)
104	103x	if (length(matches) > 0) {
105	91x	idx_list <- c(idx_list, matches)
106		}
107		}
108	21x	remaining <- setdiff(seq_along(base_names), idx_list)
109	21x	new_order <- c(idx_list, remaining)
110		}
111	24x	if (length(new_order) > 0 && !all(new_order == seq_along(base_names))) {
112	4x	ts_se <- ts_se[, new_order, drop = FALSE]
113	4x	sample_base_names <- sub("_q=.*", "", colnames(SummarizedExperiment::assay(ts_se)))
114		}
115	24x	st_map <- setNames(as.character(coldata[[coldata_condition_col]]), as.character(coldata[[coldata_sample_col]]))
116	24x	sample_types <- unname(st_map[sample_base_names])
117	24x	missing_idx <- which(is.na(sample_types))
118	24x	if (length(missing_idx) > 0) {
119	3x	missing_samples <- sample_base_names[missing_idx]
120	3x	msg <- paste0("map_metadata: unmatched samples in 'coldata': ", paste(missing_samples,
121	3x	collapse = ", "), ". Provide matching entries in 'coldata' or populate ",
122	3x	"colData(ts_se)$sample_type beforehand.")
123	3x	stop(msg, call. = FALSE)
124		}
125	21x	SummarizedExperiment::colData(ts_se)$sample_type <- sample_types
126		# Also record the base sample names (without q suffix) per column so
127		# downstream functions can rely on explicit pairing information.
128	21x	SummarizedExperiment::colData(ts_se)$sample_base <- sample_base_names
129		# Ensure rownames of colData match assay column names
130	21x	assay_cols <- colnames(SummarizedExperiment::assay(ts_se))
131	21x	rownames(SummarizedExperiment::colData(ts_se)) <- assay_cols
132		# Attach transcript-level readcounts and tx->gene mapping to metadata if
133		# they are available in the calling environment or globalenv and not
134		# already present in the SummarizedExperiment metadata. This simplifies
135		# downstream plotting helpers that expect these objects.
136	21x	if (requireNamespace("S4Vectors", quietly = TRUE)) {
137	21x	md <- S4Vectors::metadata(ts_se)
138		# prefer existing metadata values; otherwise try common names
139	21x	if (is.null(md$readcounts)) {
140	21x	if (exists("readcounts", envir = parent.frame())) {
141	6x	md$readcounts <- get("readcounts", envir = parent.frame())
142	15x	} else if (exists("readcounts", envir = globalenv())) {
143	!	md$readcounts <- get("readcounts", envir = globalenv())
144		}
145		}
146	21x	if (is.null(md$tx2gene)) {
147		# vignette uses 'txmap' variable name; also accept 'tx2gene'
148	21x	if (exists("txmap", envir = parent.frame())) {
149	2x	md$tx2gene <- get("txmap", envir = parent.frame())
150	19x	} else if (exists("tx2gene", envir = parent.frame())) {
151	4x	md$tx2gene <- get("tx2gene", envir = parent.frame())
152	15x	} else if (exists("txmap", envir = globalenv())) {
153	!	md$tx2gene <- get("txmap", envir = globalenv())
154	15x	} else if (exists("tx2gene", envir = globalenv())) {
155	!	md$tx2gene <- get("tx2gene", envir = globalenv())
156		}
157		}
158	21x	S4Vectors::metadata(ts_se) <- md
159		}
160		# If a diversity assay is present, prepare a simple diversity data.frame
161		# (genes + per-sample diversity values) and store it in metadata so the
162		# vignette and plotting helpers can use a ready-made table.
163	21x	if ("diversity" %in% SummarizedExperiment::assayNames(ts_se)) {
164	15x	div_mat <- as.matrix(SummarizedExperiment::assay(ts_se, "diversity"))
165		# sample base names without per-q suffixes
166	15x	sample_base_names <- sub("_q=.*", "", colnames(div_mat))
167		# prefer explicit sample_type in colData when present
168	15x	samples_vec <- NULL
169	15x	if ("sample_type" %in% colnames(SummarizedExperiment::colData(ts_se))) {
170	15x	samples_vec <- as.character(SummarizedExperiment::colData(ts_se)$sample_type)
171		}
172	15x	div_df <- as.data.frame(div_mat)
173	15x	genes_col <- if (!is.null(SummarizedExperiment::rowData(ts_se)$genes)) {
174	5x	SummarizedExperiment::rowData(ts_se)$genes
175		} else {
176	10x	rownames(div_df)
177		}
178	15x	div_df <- cbind(genes = genes_col, div_df)
179	15x	md2 <- S4Vectors::metadata(ts_se)
180	15x	md2$diversity_df <- div_df
181	15x	md2$sample_base_names <- sample_base_names
182	15x	if (!is.null(samples_vec)) {
183	15x	md2$samples <- samples_vec
184		}
185	15x	S4Vectors::metadata(ts_se) <- md2
186		}
187	21x	return(ts_se)
188		}
189
190		# Map sample names (without '_q=...') to group labels using `colData(se)`.
191		# Mapping must be provided via `colData(se)`; no inference fallback is used.
192		map_samples_to_group <- function(sample_names, se = NULL, sample_type_col = NULL,
193		mat = NULL) {
194		# Prefer explicit mapping from colData(se)[, sample_type_col] when
195		# provided. If `sample_type_col` is not provided, allow a single- condition
196		# dataset by assigning a single default group 'Group' to all samples (this
197		# permits plotting single-condition q-curves).
198	3x	base_names <- sub("_q=.*", "", colnames(if (!is.null(mat)) {
199	1x	mat
200		} else {
201	2x	SummarizedExperiment::assay(se)
202		}))
203
204	3x	if (!is.null(se) && !is.null(sample_type_col) && (sample_type_col %in% colnames(SummarizedExperiment::colData(se)))) {
205	2x	st_vec <- as.character(SummarizedExperiment::colData(se)[, sample_type_col])
206	2x	names(st_vec) <- base_names
207	2x	st_map <- st_vec[!duplicated(names(st_vec))]
208		} else {
209		# No explicit mapping: assume single-group dataset
210	1x	st_map <- setNames(rep("Group", length(base_names)), base_names)
211		}
212
213	3x	mapped <- unname(st_map[sample_names])
214	3x	missing_idx <- which(is.na(mapped))
215	3x	if (length(missing_idx) > 0) {
216	1x	stop(sprintf("Missing sample_type mapping for samples: %s", paste(unique(sample_names[missing_idx]),
217	1x	collapse = ", ")))
218		}
219	2x	mapped
220		}
221
222		# Prepare a long-format data.frame for a simple assay (one value per sample)
223		get_assay_long <- function(se, assay_name = "diversity", value_name = "diversity",
224		sample_type_col = NULL) {
225	11x	if (!requireNamespace("tidyr", quietly = TRUE)) {
226	!	stop("tidyr required")
227		}
228	11x	if (!requireNamespace("dplyr", quietly = TRUE)) {
229	!	stop("dplyr required")
230		}
231	11x	if (!requireNamespace("SummarizedExperiment", quietly = TRUE)) {
232	!	stop("SummarizedExperiment required")
233		}
234
235	11x	mat <- SummarizedExperiment::assay(se, assay_name)
236	10x	if (is.null(mat)) {
237	!	stop("Assay not found: ", assay_name)
238		}
239	10x	df <- as.data.frame(mat)
240	10x	genes_col <- if (!is.null(SummarizedExperiment::rowData(se)$genes)) {
241	8x	SummarizedExperiment::rowData(se)$genes
242		} else {
243	2x	rownames(df)
244		}
245	10x	df <- cbind(df, Gene = genes_col)
246	10x	long <- tidyr::pivot_longer(df, -Gene, names_to = "sample", values_to = value_name)
247
248		# sample_type: prefer explicit colData mapping when available. If not
249		# provided, assume a single-group dataset and set `sample_type` to 'Group'
250		# for all samples.
251	10x	if (!is.null(sample_type_col) && (sample_type_col %in% colnames(SummarizedExperiment::colData(se)))) {
252	6x	st <- as.character(SummarizedExperiment::colData(se)[, sample_type_col])
253	6x	names(st) <- colnames(mat)
254	6x	st_map <- st[!duplicated(names(st))]
255	6x	sample_base <- sub("_q=.*", "", long$sample)
256	6x	long$sample_type <- unname(st_map[sample_base])
257	6x	missing_idx <- which(is.na(long$sample_type))
258	6x	if (length(missing_idx) > 0) {
259	!	stop(sprintf("Missing sample_type mapping for samples: %s", paste(unique(sample_base[missing_idx]),
260	!	collapse = ", ")))
261		}
262		} else {
263	4x	long$sample_type <- rep("Group", nrow(long))
264		}
265
266		# Filter out NA values but retain sample_type information
267	10x	long_filtered <- long[!is.na(long[[value_name]]), , drop = FALSE]
268
269		# Check if any data remains after filtering
270	10x	if (nrow(long_filtered) == 0) {
271	1x	stop(sprintf("No non-NA values found in assay '%s'. All values are NA.",
272	1x	assay_name), call. = FALSE)
273		}
274
275	9x	long_filtered
276		}
277
278		# Internal small helper: prepare long-format tsallis data from a
279		# SummarizedExperiment
280		prepare_tsallis_long <- function(se, assay_name = "diversity", sample_type_col = "sample_type") {
281	24x	if (!requireNamespace("tidyr", quietly = TRUE)) {
282	!	stop("tidyr required")
283		}
284	24x	if (!requireNamespace("dplyr", quietly = TRUE)) {
285	!	stop("dplyr required")
286		}
287	24x	if (!requireNamespace("SummarizedExperiment", quietly = TRUE)) {
288	!	stop("SummarizedExperiment required")
289		}
290
291	24x	mat <- SummarizedExperiment::assay(se, assay_name)
292	24x	if (is.null(mat)) {
293	!	stop("Assay not found: ", assay_name)
294		}
295	24x	df <- as.data.frame(mat)
296	24x	genes_col <- if (!is.null(SummarizedExperiment::rowData(se)$genes)) {
297	14x	SummarizedExperiment::rowData(se)$genes
298		} else {
299	10x	rownames(df)
300		}
301	24x	df <- cbind(df, Gene = genes_col)
302
303	24x	long <- tidyr::pivot_longer(df, -Gene, names_to = "sample_q", values_to = "tsallis")
304	24x	if (any(grepl("_q=", long$sample_q))) {
305	13x	long <- tidyr::separate(long, sample_q, into = c("sample", "q"), sep = "_q=")
306	13x	long$q <- as.factor(as.numeric(long$q))
307		} else {
308	11x	long$sample <- long$sample_q
309	11x	long$q <- NA
310		}
311
312	24x	if (!is.null(sample_type_col) && (sample_type_col %in% colnames(SummarizedExperiment::colData(se)))) {
313	20x	st_vec <- as.character(SummarizedExperiment::colData(se)[, sample_type_col])
314	20x	names(st_vec) <- sub("_q=.*", "", colnames(mat))
315	20x	st_map <- st_vec[!duplicated(names(st_vec))]
316	20x	long$group <- unname(st_map[as.character(long$sample)])
317	20x	missing_idx <- which(is.na(long$group))
318	20x	if (length(missing_idx) > 0) {
319	!	stop(sprintf("Missing sample_type mapping for samples: %s", paste(unique(as.character(long$sample)[missing_idx]),
320	!	collapse = ", ")))
321		}
322		} else {
323	4x	long$group <- rep("Group", nrow(long))
324		}
325
326	24x	as.data.frame(long[!is.na(long$tsallis), , drop = FALSE])
327		}
328
329
330		#' Map transcript IDs from a tx2gene table to a readcounts matrix
331		#'
332		#' Assign transcript identifiers as row names of a transcript-level read
333		#' counts matrix so downstream plotting functions can identify transcripts.
334		#' The tx2gene mapping can be provided as a file path (TSV) or a
335		#' data.frame with a `Transcript` column.
336		#'
337		#' @param readcounts A numeric matrix or data.frame of read counts (rows = transcripts).
338		#' @param tx2gene Either a path to a tab-delimited file or a data.frame with at least a `Transcript` column.
339		#' @param tx_col Name of the transcript ID column in `tx2gene` (default: 'Transcript').
340		#' @param verbose Logical; print informative messages (default: FALSE).
341		#' @return The input `readcounts` with rownames set to the transcript IDs.
342		#' @examples
343		#' readcounts <- matrix(1:4, nrow = 2, dimnames = list(NULL, c('s1', 's2')))
344		#' tx2gene <- data.frame(Transcript = c('tx1', 'tx2'), Gene = c('g1', 'g2'), stringsAsFactors = FALSE)
345		#' rc <- map_tx_to_readcounts(readcounts, tx2gene)
346		#' rownames(rc)
347		#' @export
348		map_tx_to_readcounts <- function(readcounts, tx2gene, tx_col = "Transcript", verbose = FALSE) {
349	7x	if (is.character(tx2gene) && length(tx2gene) == 1) {
350	2x	if (!file.exists(tx2gene)) {
351	!	stop("tx2gene file not found: ", tx2gene, call. = FALSE)
352		}
353	2x	txmap <- utils::read.delim(tx2gene, header = TRUE, stringsAsFactors = FALSE)
354	5x	} else if (is.data.frame(tx2gene)) {
355	5x	txmap <- tx2gene
356		} else {
357	!	stop("'tx2gene' must be a file path or a data.frame.", call. = FALSE)
358		}
359
360	7x	if (!(tx_col %in% colnames(txmap))) {
361	!	stop(sprintf("tx2gene mapping must contain column '%s'", tx_col), call. = FALSE)
362		}
363
364		# Ensure readcounts is a matrix-like object
365	7x	if (is.data.frame(readcounts)) {
366	1x	readcounts <- as.matrix(readcounts)
367		}
368	7x	if (!is.matrix(readcounts)) {
369	!	stop("'readcounts' must be a matrix or data.frame", call. = FALSE)
370		}
371
372	7x	n_rc <- nrow(readcounts)
373	7x	n_tx <- nrow(txmap)
374
375	7x	if (n_tx == n_rc) {
376	3x	rownames(readcounts) <- as.character(txmap[[tx_col]])
377	3x	if (verbose) {
378	!	message(sprintf("Assigned %d transcript rownames from tx2gene.", n_rc))
379		}
380	3x	return(readcounts)
381		}
382
383		# If counts differ, attempt to match by transcript identifiers if present
384	4x	tx_ids <- as.character(txmap[[tx_col]])
385	4x	if (!is.null(rownames(readcounts)) && all(rownames(readcounts) %in% tx_ids)) {
386		# reorder txmap to match readcounts row order and set rownames
387	1x	matched_idx <- match(rownames(readcounts), tx_ids)
388	1x	rownames(readcounts) <- tx_ids[matched_idx]
389	1x	if (verbose) {
390	1x	message("Matched and assigned transcript IDs by existing readcounts rownames.")
391		}
392	1x	return(readcounts)
393		}
394
395	3x	stop(sprintf("Number of transcripts in tx2gene (%d) does not match readcounts rows (%d), and automatic matching failed.",
396	3x	n_tx, n_rc), call. = FALSE)
397		}

1		## Helper: Extract tx2gene mapping from GFF3 file
2		#' @title Extract Transcript-to-Gene Mapping from GFF3 File
3		#' @description Internal function that parses GFF3 or GFF3.gz files to extract
4		#' transcript-to-gene mappings. This function is called internally by \code{build_se()}
5		#' when a GFF3 file path is provided. Users should not call this function directly.
6		#' @param gff3_file Path to a GFF3 or GFF3.gz file containing transcript/mRNA
7		#' features with ID and Parent attributes.
8		#' @return A data.frame with two columns (Transcript, Gene) containing the
9		#' transcript-to-gene mapping extracted from the GFF3 file.
10		#' @details
11		#' The function expects GFF3 format with the following structure:
12		#' \itemize{
13		#' \item 9 tab-separated columns: seqname, source, feature, start, end,
14		#' score, strand, phase, attributes
15		#' \item Feature type column (3rd column) should contain 'transcript' or 'mRNA'
16		#' \item Attributes column (9th column) should contain ID and Parent fields
17		#' \item ID field: unique identifier for the transcript
18		#' \item Parent field: references the gene ID that this transcript belongs to
19		#' }
20		#' Example GFF3 line:
21		#' \preformatted{chr1\tgencode\ttranscript\t1000\t3000\t.\t+\t.\t
22		#' ID=ENST00000001;Parent=ENSG00000101;Name=BRCA1-001}
23		#'
24		#' \strong{Performance:} This function is optimized for large GFF3 files:
25		#' \itemize{
26		#' \item Reads files in 10,000-line chunks (not line-by-line)
27		#' \item Uses fast pre-filtering (feature type check before regex)
28		#' \item Employs efficient string operations instead of heavy regex on every line
29		#' \item Handles both compressed (.gz) and uncompressed files seamlessly
30		#' }
31		#' @examples
32		#' \dontrun{
33		#' # Create a temporary GFF3 file with transcript features
34		#' gff3_lines <- c(
35		#' 'chr1\tgencode\ttranscript\t1000\t3000\t.\t+\t.\tID=ENST001;Parent=ENSG001',
36		#' 'chr1\tgencode\ttranscript\t1500\t3500\t.\t+\t.\tID=ENST002;Parent=ENSG001',
37		#' 'chr2\tgencode\ttranscript\t5000\t8000\t.\t-\t.\tID=ENST003;Parent=ENSG002'
38		#' )
39		#' tf <- tempfile(fileext = '.gff3')
40		#' writeLines(gff3_lines, tf)
41		#'
42		#' # Extract transcript-to-gene mapping
43		#' tx2gene <- extract_tx2gene_from_gff3(tf)
44		#' head(tx2gene)
45		#' }
46		extract_tx2gene_from_gff3 <- function(gff3_file) {
47		# Handle both .gff3 and .gff3.gz files
48	25x	if (grepl("\\.gff3\\.gz$", gff3_file)) {
49	4x	con <- gzfile(gff3_file, "rt")
50		} else {
51	21x	con <- file(gff3_file, "r")
52		}
53
54	25x	on.exit(close(con))
55
56		# Pre-allocate vectors for efficiency (start with capacity for 10k
57		# transcripts) This avoids repeatedly growing lists/data.frames which is
58		# slow
59	25x	tx2gene_transcripts <- character(10000)
60	25x	tx2gene_genes <- character(10000)
61	25x	idx <- 0
62	25x	chunk_size <- 10000 # Read in chunks for better performance
63
64	25x	while (TRUE) {
65		# Read lines in chunks instead of one-by-one for better performance
66	50x	lines <- readLines(con, n = chunk_size)
67	50x	if (length(lines) == 0)
68	25x	break
69
70		# Process each line in the chunk
71	25x	for (line in lines) {
72		# Skip comments and empty lines (fast pre-filter)
73	216x	if (startsWith(line, "#") \|\| line == "")
74	40x	next
75
76		# Parse GFF3 line format: seqname source feature start end score
77		# strand phase attributes Use strsplit only once per line
78	176x	fields <- strsplit(line, "\t", fixed = TRUE)[[1]]
79	176x	if (length(fields) < 9)
80	1x	next
81
82		# Extract feature type (3rd column) - do this check first to skip
83		# early
84	175x	feature_type <- fields[3]
85	175x	if (!feature_type %in% c("transcript", "mRNA"))
86	30x	next
87
88		# Only now extract attributes from the (potentially long) 9th
89		# column
90	145x	attributes <- fields[9]
91
92		# Extract ID and Parent using efficient substring operations Find
93		# positions of ID= and Parent= patterns
94	145x	id_start <- regexpr("ID=", attributes, fixed = TRUE) + 3
95	145x	if (id_start > 3) {
96		# ID found, extract until semicolon or end of string
97	144x	id_end <- regexpr(";", substr(attributes, id_start, nchar(attributes)),
98	144x	fixed = TRUE)
99	144x	if (id_end > 0) {
100	142x	transcript_id <- substr(attributes, id_start, id_start + id_end -
101	142x	2)
102		} else {
103	2x	transcript_id <- substr(attributes, id_start, nchar(attributes))
104		}
105		} else {
106	1x	transcript_id <- NA_character_
107		}
108
109	145x	parent_start <- regexpr("Parent=", attributes, fixed = TRUE) + 7
110	145x	if (parent_start > 7) {
111		# Parent found, extract until semicolon or end of string
112	143x	parent_end <- regexpr(";", substr(attributes, parent_start, nchar(attributes)),
113	143x	fixed = TRUE)
114	143x	if (parent_end > 0) {
115	27x	gene_id <- substr(attributes, parent_start, parent_start + parent_end -
116	27x	2)
117		} else {
118	116x	gene_id <- substr(attributes, parent_start, nchar(attributes))
119		}
120		} else {
121	2x	gene_id <- NA_character_
122		}
123
124		# Store mapping if both IDs are present
125	145x	if (!is.na(transcript_id) && !is.na(gene_id)) {
126	142x	idx <- idx + 1
127		# Grow vectors if needed
128	142x	if (idx > length(tx2gene_transcripts)) {
129	!	tx2gene_transcripts <- c(tx2gene_transcripts, rep(NA_character_,
130	!	10000))
131	!	tx2gene_genes <- c(tx2gene_genes, rep(NA_character_, 10000))
132		}
133	142x	tx2gene_transcripts[idx] <- transcript_id
134	142x	tx2gene_genes[idx] <- gene_id
135		}
136		}
137		}
138
139		# Trim to actual size and create data frame (empty if no mappings found)
140	25x	if (idx == 0) {
141	7x	tx2gene_df <- data.frame(Transcript = character(0), Gene = character(0),
142	7x	stringsAsFactors = FALSE)
143		} else {
144	18x	tx2gene_df <- data.frame(Transcript = tx2gene_transcripts[seq_len(idx)],
145	18x	Gene = tx2gene_genes[seq_len(idx)], stringsAsFactors = FALSE)
146		}
147	25x	rownames(tx2gene_df) <- NULL
148
149	25x	return(tx2gene_df)
150		}
151
152		## Helper: build SummarizedExperiment from readcounts + tx2gene
153		#' Build a SummarizedExperiment from transcript readcounts and tx->gene map
154		#'
155		#' This helper creates a `SummarizedExperiment` with an assay named
156		#' `counts`, stores the `tx2gene` table and the raw `readcounts` in
157		#' `metadata()`, and extracts gene assignments from the tx2gene mapping
158		#' to populate `rowData(se)$genes`.
159		#'
160		#' The function accepts three different input types for the tx2gene mapping:
161		#' a file path (TSV or GFF3 format) or an in-memory data.frame.
162		#'
163		#' @param readcounts Numeric matrix or data.frame of transcript-level counts
164		#' (rows = transcripts, columns = samples). Rownames should contain
165		#' transcript IDs that match the first column of tx2gene.
166		#'
167		#' @param tx2gene Transcript-to-gene mapping. Can be one of:
168		#' \describe{
169		#' \item{\strong{TSV File Path}}{A path to a tab-separated file with at least
170		#' two columns: Transcript ID (first column) and Gene ID (second column).
171		#' The file should have a header row with column names.
172		#' File extension should be .tsv, .txt, or similar.
173		#' Example:
174		#' \preformatted{Transcript\tGene
175		#' ENST00000001\tENSG00000101
176		#' ENST00000002\tENSG00000102}
177		#' }
178		#' \item{\strong{GFF3 File Path}}{A path to a GFF3 annotation file (.gff3 or
179		#' .gff3.gz format). The file should contain transcript/mRNA features with
180		#' ID and Parent attributes. Parent attributes reference gene IDs.
181		#' Example GFF3 line:
182		#' \preformatted{chr1\tgencode\ttranscript\t100\t2100\t.\t+\t.\t
183		#' ID=ENST00000001;Parent=ENSG00000101}
184		#' The function automatically detects GFF3 format by file extension
185		#' (.gff3 or .gff3.gz) and parses accordingly.
186		#' }
187		#' \item{\strong{Data Frame}}{An in-memory data.frame with Transcript
188		#' and Gene columns. Useful when mapping data is already loaded in R.
189		#' Example:
190		#' \preformatted{ Transcript Gene
191		#' 1 ENST00000001 ENSG00000101
192		#' 2 ENST00000002 ENSG00000102}
193		#' }
194		#' }
195		#'
196		#' @param assay_name Name for the assay to store readcounts (default: 'counts').
197		#'
198		#' @return A `SummarizedExperiment` with assay, `metadata()$tx2gene`,
199		#' `metadata()$readcounts` and `rowData(se)$genes` populated.
200		#'
201		#' @details
202		#' \strong{Input Format Detection:}
203		#' \itemize{
204		#' \item If tx2gene is a character string ending in .gff3 or .gff3.gz,
205		#' it is parsed as a GFF3 file.
206		#' \item If tx2gene is a character string with any other extension or
207		#' no extension, it is parsed as a tab-separated file.
208		#' \item If tx2gene is a data.frame, it is used directly.
209		#' }
210		#'
211		#' \strong{Transcript Matching:}
212		#' All transcript IDs in the readcounts object (rownames) must have a
213		#' corresponding entry in the tx2gene mapping. If any transcript is missing,
214		#' an error is raised.
215		#'
216		#' \strong{Performance:}
217		#' \itemize{
218		#' \item GFF3 files are processed efficiently even for large annotations
219		#' (e.g., full GENCODE with 100k+ transcripts)
220		#' \item TSV files are standard tab-separated format for fast parsing
221		#' \item Data.frame inputs have no I/O overhead
222		#' }
223		#'
224		#' @export
225		#'
226		#' @examples
227		#' # Example 1: Using data.frame (in-memory mapping)
228		#' tx2gene <- data.frame(
229		#' Transcript = c('ENST00000001', 'ENST00000002'),
230		#' Gene = c('ENSG00000101', 'ENSG00000102')
231		#' )
232		#' readcounts <- matrix(c(10, 5, 2, 3),
233		#' nrow = 2,
234		#' dimnames = list(c('ENST00000001', 'ENST00000002'), c('s1', 's2'))
235		#' )
236		#' se <- build_se(readcounts, tx2gene)
237		#'
238		#' # Example 2: Using TSV file path
239		#' # Assuming you have a file 'tx2gene.tsv' with Transcript and Gene columns
240		#' # se <- build_se(readcounts, 'path/to/tx2gene.tsv')
241		#'
242		#' # Example 3: Using GFF3.gz file path
243		#' # Assuming you have a file 'annotation.gff3.gz' with transcript features
244		#' # se <- build_se(readcounts, 'path/to/annotation.gff3.gz')
245		build_se <- function(readcounts, tx2gene, assay_name = "counts") {
246	38x	if (is.character(tx2gene) && length(tx2gene) == 1) {
247	26x	if (!file.exists(tx2gene)) {
248	3x	stop("tx2gene file not found: ", tx2gene, call. = FALSE)
249		}
250
251		# Detect file type and parse accordingly
252	23x	if (grepl("\\.gff3(\\.gz)?$", tx2gene, ignore.case = TRUE)) {
253	21x	message("Detected GFF3 format. Extracting transcript-to-gene mapping...")
254	21x	tx2gene_df <- extract_tx2gene_from_gff3(tx2gene)
255		} else {
256		# Assume TSV format (backward compatible)
257	2x	tx2gene_df <- utils::read.table(tx2gene, header = TRUE, sep = "\t", stringsAsFactors = FALSE)
258		}
259	12x	} else if (is.data.frame(tx2gene)) {
260	10x	tx2gene_df <- tx2gene
261		} else {
262	2x	stop("'tx2gene' must be a path (TSV or GFF3) or a data.frame.", call. = FALSE)
263		}
264
265	33x	if (is.data.frame(readcounts)) {
266	1x	readcounts <- as.matrix(readcounts)
267		}
268	33x	if (!is.matrix(readcounts) \|\| !is.numeric(readcounts)) {
269	1x	stop("'readcounts' must be a numeric matrix or numeric data.frame.", call. = FALSE)
270		}
271
272		# Extract transcript IDs from rownames or use row indices
273	32x	tx_ids <- rownames(readcounts)
274	32x	if (is.null(tx_ids)) {
275	!	stop("'readcounts' must have transcript IDs as rownames.", call. = FALSE)
276		}
277
278		# Extract gene names from tx2gene based on transcript IDs
279	32x	tx_col <- if ("Transcript" %in% colnames(tx2gene_df))
280	32x	"Transcript" else colnames(tx2gene_df)[1]
281	32x	genes <- tx2gene_df$Gene[match(tx_ids, tx2gene_df[[tx_col]])]
282
283	32x	if (any(is.na(genes))) {
284	9x	stop("Some transcript IDs in readcounts were not found in tx2gene mapping.",
285	9x	call. = FALSE)
286		}
287
288	23x	assays_list <- S4Vectors::SimpleList()
289	23x	assays_list[[assay_name]] <- readcounts
290
291	23x	se <- SummarizedExperiment::SummarizedExperiment(assays = assays_list)
292	23x	S4Vectors::metadata(se)$tx2gene <- tx2gene_df
293	23x	S4Vectors::metadata(se)$readcounts <- readcounts
294
295		# Populate rowData with gene assignments and transcript IDs
296	23x	SummarizedExperiment::rowData(se) <- S4Vectors::DataFrame(genes = genes, row.names = tx_ids)
297
298	23x	return(se)
299		}

1		# Internal helpers for calculate_method
2
3		.tsenat_tsallis_row <- function(x, genes, gene, q, norm, what) {
4	687x	idx <- which(genes == gene)
5	687x	out <- unlist(lapply(seq_len(ncol(x)), function(j) {
6	4817x	v <- calculate_tsallis_entropy(x[idx, j], q = q, norm = norm, what = what)
7	4817x	if (length(v) == length(q) && all(is.finite(v) \| is.na(v))) {
8	4815x	v
9		} else {
10	2x	names_vec <- paste0("q=", q)
11	2x	setNames(rep(NA_real_, length(q)), names_vec)
12		}
13		}))
14	687x	out
15		}

1		# Internal plot helpers
2
3		.tsenat_format_label <- function(lbl) {
4	41x	if (is.null(lbl)) {
5	1x	return(NULL)
6		}
7	40x	s <- gsub("_", " ", lbl)
8	40x	s <- gsub("\\s+", " ", s)
9	40x	s <- trimws(s)
10	40x	s <- tolower(s)
11	40x	if (nchar(s) == 0) {
12	1x	return(s)
13		}
14	39x	if (nchar(s) == 1) {
15	1x	return(toupper(s))
16		}
17	38x	paste0(toupper(substr(s, 1, 1)), substr(s, 2, nchar(s)))
18		}
19
20		.tsenat_prepare_ma_plot_df <- function(df, fold_col, mean_cols, x_label, y_label) {
21		# Detect x-axis values
22	15x	if (length(mean_cols) >= 2) {
23	3x	xvals <- rowMeans(df[, mean_cols[seq_len(2)], drop = FALSE], na.rm = TRUE)
24	3x	x_label <- x_label %\|\|% paste0(mean_cols[1], " vs ", mean_cols[2])
25	12x	} else if (length(mean_cols) == 1) {
26	1x	xvals <- as.numeric(df[[mean_cols[1]]])
27	1x	x_label <- x_label %\|\|% mean_cols[1]
28	11x	} else if ("mean" %in% colnames(df)) {
29	9x	xvals <- as.numeric(df$mean)
30	9x	x_label <- x_label %\|\|% "Mean"
31		} else {
32	2x	xvals <- seq_len(nrow(df))
33	2x	x_label <- x_label %\|\|% "Index"
34		}
35
36	15x	yvals <- as.numeric(df[[fold_col]])
37
38	15x	padj_candidates <- c("adjusted_p_values", "adj_p_value", "adj_p", "padj", "p.adjust")
39	15x	padj_col <- intersect(padj_candidates, colnames(df))
40	15x	padj_col <- if (length(padj_col)) {
41	5x	padj_col[1]
42		} else {
43	10x	NULL
44		}
45
46	15x	padj <- if (!is.null(padj_col)) {
47	5x	as.numeric(df[[padj_col]])
48		} else {
49	10x	rep(1, length(yvals))
50		}
51	15x	padj[is.na(padj)] <- 1
52
53	15x	sig_flag <- ifelse(abs(yvals) > 0 & padj < 0.05, "significant", "non-significant")
54
55	15x	plot_df <- data.frame(genes = df$genes, x = xvals, y = yvals, padj = padj, significant = sig_flag,
56	15x	stringsAsFactors = FALSE)
57
58	15x	list(plot_df = plot_df, x_label = x_label, y_label = y_label)
59		}
60
61		.tsenat_prepare_volcano_df <- function(diff_df, x_col = NULL, padj_col = "adjusted_p_values",
62		label_thresh = 0.1, padj_thresh = 0.05, title = NULL) {
63	14x	df <- as.data.frame(diff_df)
64	14x	cn <- colnames(df)
65
66		# Auto-detect x-axis column if not specified
67	14x	if (is.null(x_col)) {
68	5x	diff_cols <- grep("_difference$", cn, value = TRUE, ignore.case = TRUE)
69	5x	if (length(diff_cols) > 0) {
70	2x	x_col <- diff_cols[1]
71		} else {
72	3x	numeric_cols <- vapply(df, is.numeric, logical(1))
73	3x	p_cols <- grep("p_value\|p.value", cn, ignore.case = TRUE)
74	3x	numeric_cols[p_cols] <- FALSE
75	3x	if (any(numeric_cols)) {
76	3x	x_col <- cn[which(numeric_cols)[1]]
77		} else {
78	!	stop("Could not find suitable column for x-axis. Specify 'x_col' explicitly.")
79		}
80		}
81		}
82
83		# Verify columns
84	14x	if (!(x_col %in% cn)) {
85	1x	stop(sprintf("Column '%s' not found in diff_df", x_col))
86		}
87	13x	if (!(padj_col %in% cn)) {
88	5x	stop(sprintf("Column '%s' not found in diff_df", padj_col))
89		}
90
91	8x	df$xval <- as.numeric(df[[x_col]])
92	8x	df$padj <- as.numeric(df[[padj_col]])
93	8x	df$padj[is.na(df$padj)] <- 1
94	8x	df$padj[df$padj <= 0] <- .Machine$double.xmin
95
96	8x	df$significant <- ifelse(abs(df$xval) >= label_thresh & df$padj < padj_thresh,
97	8x	"significant", "non-significant")
98	8x	df <- df[is.finite(df$xval) & is.finite(df$padj), ]
99
100	8x	if (nrow(df) == 0) {
101	1x	stop("No valid points to plot")
102		}
103
104	7x	metric_label <- if (grepl("median", x_col, ignore.case = TRUE)) {
105	1x	"Median"
106	7x	} else if (grepl("mean", x_col, ignore.case = TRUE)) {
107	3x	"Mean"
108		} else {
109	3x	"Value"
110		}
111
112	7x	title_use <- title %\|\|% "Volcano plot: fold-change vs significance"
113
114	7x	x_label_formatted <- .tsenat_format_label(x_col)
115	7x	padj_label_formatted <- .tsenat_format_label(padj_col)
116
117	7x	list(df = df, x_col = x_col, padj_col = padj_col, x_label_formatted = x_label_formatted,
118	7x	padj_label_formatted = padj_label_formatted, title_use = title_use)
119		}

1		## Filter SummarizedExperiment by low-expression transcripts
2		#' Filter transcripts in a `SummarizedExperiment` by minimum count/sample
3		#'
4		#' Subset a `SummarizedExperiment` to keep transcripts with more than
5		#' `min_count` reads in strictly more than `min_samples` samples. The
6		#' function updates assays, `rowData`, and relevant entries in
7		#' `metadata()` (for example `readcounts` and `tx2gene`) so downstream
8		#' helpers receive a consistent object.
9		#'
10		#' @param se A `SummarizedExperiment` object containing transcript-level
11		#' assays (rows = transcripts, columns = samples).
12		#' @param min_count Integer threshold for per-sample counts (default 5L).
13		#' @param min_samples Integer minimum number of samples exceeding
14		#' `min_count` required to keep a transcript (default 5L).
15		#' @param assay_name Name or index of the assay to use for filtering
16		#' (default: 'counts'). If not present, the first assay is used.
17		#' @param verbose Logical; print before/after counts when TRUE.
18		#' @return A filtered `SummarizedExperiment`.
19		#' @export
20		#' @examples
21		#' mat <- matrix(c(0, 6, 7, 2, 8, 9), nrow = 3, dimnames = list(paste0('tx', 1:3), paste0('S', 1:2)))
22		#' se <- SummarizedExperiment::SummarizedExperiment(assays = list(counts = mat))
23		#' filt <- filter_se(se, min_count = 5, min_samples = 1)
24		#' class(filt)
25		filter_se <- function(se, min_count = 5L, min_samples = 5L, assay_name = "counts",
26		verbose = TRUE) {
27	7x	if (!is(se, "SummarizedExperiment")) {
28	1x	stop("'se' must be a SummarizedExperiment", call. = FALSE)
29		}
30
31	6x	assays_list <- SummarizedExperiment::assays(se)
32	6x	if (is.character(assay_name) && assay_name %in% names(assays_list)) {
33	5x	assay_mat <- as.matrix(assays_list[[assay_name]])
34	1x	} else if (is.numeric(assay_name) && assay_name >= 1 && assay_name <= length(assays_list)) {
35	!	assay_mat <- as.matrix(assays_list[[assay_name]])
36		} else {
37		# fallback to first assay
38	1x	assay_mat <- as.matrix(assays_list[[1]])
39	1x	warning("Requested assay not found; using first assay.", call. = FALSE)
40		}
41
42	6x	if (!is.numeric(assay_mat)) {
43	!	stop("Assay data must be numeric.", call. = FALSE)
44		}
45
46	6x	before <- nrow(assay_mat)
47		# Cap min_samples to the number of available samples to avoid impossible
48		# thresholds on small datasets and use an inclusive threshold so callers
49		# requesting e.g. 'min_samples = ncol' keep rows present in all samples.
50	6x	eff_min_samples <- min(as.integer(min_samples), ncol(assay_mat))
51	6x	tokeep <- rowSums(assay_mat > min_count) >= eff_min_samples
52	6x	after <- sum(tokeep)
53
54	6x	if (verbose) {
55	!	message(sprintf("Transcripts: before = %d, after = %d", before, after))
56		}
57
58	6x	if (after == 0L) {
59	2x	warning("Filtering removed all transcripts; returning empty SummarizedExperiment.",
60	2x	call. = FALSE)
61		}
62
63		# subset all assays
64	6x	new_assays <- S4Vectors::SimpleList(lapply(assays_list, function(a) {
65	8x	if (is.matrix(a) \|\| is.data.frame(a)) {
66	8x	as.matrix(a)[tokeep, , drop = FALSE]
67		} else {
68	!	a
69		}
70		}))
71	6x	names(new_assays) <- names(assays_list)
72
73		# subset rowData if present
74	6x	rd <- NULL
75	6x	if (nrow(SummarizedExperiment::rowData(se)) > 0) {
76	6x	rd <- SummarizedExperiment::rowData(se)[tokeep, , drop = FALSE]
77		}
78
79		# subset metadata readcounts and tx2gene if present
80	6x	md <- S4Vectors::metadata(se)
81	6x	if (!is.null(md$readcounts) && is.matrix(md$readcounts)) {
82	3x	md$readcounts <- as.matrix(md$readcounts)[tokeep, , drop = FALSE]
83		}
84	6x	if (!is.null(md$tx2gene) && is.data.frame(md$tx2gene)) {
85	3x	txmap <- md$tx2gene
86	3x	txcol <- colnames(txmap)[1]
87	3x	md$tx2gene <- txmap[txmap[[txcol]] %in% rownames(assay_mat)[tokeep], , drop = FALSE]
88		}
89
90		# construct new SE with same colData
91	6x	new_se <- SummarizedExperiment::SummarizedExperiment(assays = new_assays, rowData = if (!is.null(rd)) {
92	6x	rd
93		} else {
94	!	S4Vectors::DataFrame()
95	6x	}, colData = SummarizedExperiment::colData(se), metadata = c(S4Vectors::metadata(se),
96	6x	list(filtered = list(min_count = min_count, min_samples = min_samples))))
97
98		# attach updated metadata pieces
99	6x	S4Vectors::metadata(new_se)$readcounts <- if (!is.null(md$readcounts)) {
100	3x	md$readcounts
101		} else {
102	3x	NULL
103		}
104	6x	S4Vectors::metadata(new_se)$tx2gene <- if (!is.null(md$tx2gene)) {
105	3x	md$tx2gene
106		} else {
107	3x	NULL
108		}
109
110	6x	return(new_se)
111		}

1		#' Calculate splicing diversity changes between two conditions.
2		#'
3		#' @param x A \code{matrix} with the splicing diversity values.
4		#' @param samples Character vector with an equal length to the number of columns
5		#' in the input dataset, specifying the category of each sample.
6		#' @param control Name of the control sample category, defined in the
7		#' \code{samples} vector, e.g. \code{control = 'Normal'} or \code{control =
8		#' 'WT'}.
9		#' @param method Method to use for calculating the average splicing diversity
10		#' value in a condition. Can be \code{'mean'} or \code{'median'}.
11		#' @param pseudocount Numeric scalar. Small value added to non-positive
12		#' observed group summaries to avoid zeros when computing differences and
13		#' log2 fold-changes. If \code{pseudocount <= 0} the function will automatically
14		#' choose a scale-aware value equal to half the smallest positive observed
15		#' group summary (i.e. half the smallest observed mean/median across groups);
16		#' if no positive values are present the fallback is \code{1e-6}. Rows with
17		#' insufficient observations remain \code{NA} and are not imputed.
18		#' @return A \code{data.frame} with mean or median value of splicing diversity
19		#' across sample categories, the difference between these values and the log2
20		#' fold change values.
21		#' @details The function uses a matrix of splicing diversity values in order to
22		#' calculate mean or median differences and log2 fold changes between two
23		#' conditions.
24		#' @import stats
25		#' @export
26		#' @examples
27		#' # Simulate splicing diversity matrix (4 genes x 4 samples)
28		#' mat <- matrix(c(
29		#' 0.5, 0.6, 0.8, 0.9, # gene1: low control, high treatment
30		#' 0.7, 0.75, 0.6, 0.5 # gene2: high control, low treatment
31		#' ), nrow = 2, byrow = TRUE)
32		#' samples <- c('Normal', 'Normal', 'Tumor', 'Tumor')
33		#' result <- calculate_fc(mat, samples, control = 'Normal', method = 'mean')
34		#' head(result)
35		calculate_fc <- function(x, samples, control, method = "mean", pseudocount = 0) {
36		# validate control and samples inputs
37	1802x	if (is.null(control) \|\| !nzchar(control)) {
38	2x	stop("`control` must be provided to calculate_fc", call. = FALSE)
39		}
40	1800x	if (length(samples) != ncol(x)) {
41	3x	stop("Length of 'samples' must equal number of columns in 'x'", call. = FALSE)
42		}
43	1797x	if (!(control %in% samples)) {
44	2x	stop("Control sample type not found in samples.", call. = FALSE)
45		}
46	1795x	agg <- .tsenat_aggregate_fc_values(x = x, samples = samples, method = method,
47	1795x	control = control)
48	1795x	value <- agg$value
49	1795x	sorted <- agg$sorted
50
51		# Defensive numeric coercion: ensure group means are numeric and mark any
52		# non-finite or non-positive values as NA. This prevents Inf/NaN when
53		# computing log2 fold changes downstream.
54	1795x	value <- matrix(as.numeric(value), nrow = nrow(value), ncol = ncol(value), dimnames = dimnames(value))
55	1795x	value[!is.finite(value)] <- NA
56
57		# compute and apply pseudocount based on observed group summaries
58	1795x	value <- .tsenat_apply_pseudocount(value, pseudocount)
59
60		# compute difference and log2 fold-change with NA-safe handling
61	1795x	diff_vec <- value[, 1] - value[, 2]
62	1795x	na_mask <- is.na(value[, 1]) \| is.na(value[, 2])
63	1795x	diff_vec[na_mask] <- NA
64
65	1795x	log2fc_vec <- log2(value[, 1]/value[, 2])
66	1795x	log2fc_vec[na_mask] <- NA
67
68	1795x	result <- data.frame(value, difference = diff_vec, log2_fold_change = log2fc_vec,
69	1795x	check.names = FALSE, stringsAsFactors = FALSE)
70	1795x	colnames(result) <- c(paste(sorted[1, 1], "_", method, sep = ""), paste(sorted[2,
71	1795x	1], "_", method, sep = ""), paste(method, "_difference", sep = ""), "log2_fold_change")
72	1795x	return(result)
73		}
74
75		#' Calculate p-values using Wilcoxon rank sum test.
76		#'
77		#' @param x A \code{matrix} with the splicing diversity values.
78		#' @param samples Character vector with an equal length to the number of columns
79		#' in the input dataset, specifying the category of each sample.
80		#' @param pcorr P-value correction method applied to the results, as defined in
81		#' the \code{p.adjust} function.
82		#' @param paired If \code{TRUE}, the Wilcox-test will be paired, and therefore
83		#' it will be a signed rank test instead of the rank sum test.
84		#' @param exact If \code{TRUE}, an exact p-value will be computed.
85		#' @param nthreads Number of threads for parallel processing (default: 1).
86		#' Set to > 1 to parallelize per-feature Wilcoxon tests.
87		#' @return Raw and corrected p-values in a matrix.
88		#' @import stats
89		#' @export
90		#' @examples
91		#' # Create a matrix of splicing diversity values (3 genes x 6 samples)
92		#' mat <- matrix(rnorm(18), nrow = 3)
93		#' samples <- rep(c('Control', 'Treatment'), each = 3)
94		#'
95		#' # Run Wilcoxon test
96		#' result <- wilcoxon(mat, samples, pcorr = 'BH')
97		#' head(result)
98		wilcoxon <- function(x, samples, pcorr = "BH", paired = FALSE, exact = FALSE, nthreads = 1) {
99		# Determine group indices (two groups expected)
100	34x	groups <- unique(sort(samples))
101	34x	if (length(groups) != 2) {
102	!	stop("`samples` must contain exactly two groups for Wilcoxon tests.")
103		}
104	34x	g1_idx <- as.numeric(which(samples %in% groups[1]))
105	34x	g2_idx <- as.numeric(which(samples %in% groups[2]))
106
107	34x	if (isTRUE(paired)) {
108	8x	if (length(g1_idx) != length(g2_idx)) {
109	1x	stop("Paired Wilcoxon requires equal numbers of samples in each ", "group.")
110		}
111		# Paired tests assume columns are already ordered/aligned by the caller
112		# (e.g., via `map_metadata()`); do not attempt to infer pairing here.
113		}
114
115		# Function to compute Wilcoxon test for a single feature
116	33x	.wilcox_one <- function(i) {
117	2555x	tryCatch({
118	2555x	wilcox.test(x[i, g1_idx], x[i, g2_idx], paired = paired, exact = exact)$p.value
119	2555x	}, error = function(e) {
120	2555x	NA_real_
121	2555x	}, warning = function(w) {
122		# swallow specific warnings but return NA on unusual states
123	2555x	NA_real_
124		})
125		}
126
127		# Apply in parallel
128	33x	p_values <- .tsenat_bplapply(seq_len(nrow(x)), .wilcox_one, nthreads = nthreads)
129
130	33x	raw_p_values <- ifelse(is.na(vapply(p_values, c, numeric(1))), 1, vapply(p_values,
131	33x	c, numeric(1)))
132	33x	adjusted_p_values <- p.adjust(raw_p_values, method = pcorr)
133	33x	out <- cbind(raw_p_values, adjusted_p_values)
134	33x	colnames(out) <- c("raw_p_values", "adjusted_p_values")
135	33x	return(out)
136		}
137
138		#' Calculate p-values using label shuffling.
139		#'
140		#' @param x A \code{matrix} with the splicing diversity values.
141		#' @param samples Character vector with an equal length to the number of columns
142		#' in the input dataset, specifying the category of each sample.
143		#' @param control Name of the control sample category, defined in the
144		#' \code{samples} vector, e.g. \code{control = 'Normal'} or \code{control =
145		#' 'WT'}.
146		#' @param method Method to use for calculating the average splicing diversity
147		#' value in a condition. Can be \code{'mean'} or \code{'median'}.
148		#' @param randomizations The number of random shuffles.
149		#' @param pcorr P-value correction method applied to the results, as defined in
150		#' the \code{p.adjust()} function.
151		#' @param paired Logical; if \code{TRUE} perform a paired permutation scheme
152		#' (default: \code{FALSE}). When paired is \code{TRUE}, permutations
153		#' should preserve pairing between samples; the function currently permutes
154		#' sample labels and therefore paired analyses are only meaningful when the
155		#' caller has arranged \code{samples} accordingly.
156		#' @param paired_method Character; method for paired permutations. One of
157		#' \code{'swap'} (randomly swap labels within pairs) or \code{'signflip'}
158		#' (perform sign-flip permutations; can enumerate all 2^n_pairs combinations
159		#' for an exact test when \code{randomizations = 0} or \code{randomizations >= 2^n_pairs}).
160		#' @param nthreads Number of threads for parallel processing (default: 1).
161		#' Set to > 1 to parallelize per-feature p-value computation.
162		#' @return Raw and corrected p-values.
163		#' @details The permutation p-values are computed two-sided as the proportion
164		#' of permuted log2 fold-changes at least as extreme as the observed value,
165		#' with a pseudocount added: (count + 1) / (n_perm + 1).
166		#' @import stats
167		#' @note The permutation test returns two-sided empirical p-values using a
168		#' pseudocount to avoid zero p-values for small numbers of permutations. See
169		#' the function documentation for details.
170		#' @export
171		#' @examples
172		#' set.seed(123)
173		#' # Create a matrix of splicing diversity values (2 genes x 4 samples)
174		#' mat <- matrix(rnorm(8), nrow = 2)
175		#' samples <- c('Normal', 'Normal', 'Tumor', 'Tumor')
176		#'
177		#' # Run label shuffling test (100 permutations)
178		#' result <- label_shuffling(mat, samples, control = 'Normal',
179		#' method = 'mean', randomizations = 100, pcorr = 'BH')
180		#' head(result)
181		label_shuffling <- function(x, samples, control, method, randomizations = 100, pcorr = "BH",
182		paired = FALSE, paired_method = c("swap", "signflip"), nthreads = 1) {
183	32x	paired_method <- match.arg(paired_method)
184		# observed log2 fold changes
185	32x	log2_fc <- calculate_fc(x, samples, control, method)[, 4]
186
187		# build permutation/null distribution of log2 fold changes
188	32x	if (isTRUE(paired)) {
189	10x	perm_mat <- .tsenat_permute_paired(x = x, samples = samples, control = control,
190	10x	method = method, randomizations = randomizations, paired_method = paired_method)
191		} else {
192		# Generate permutations
193	22x	permuted <- replicate(randomizations, calculate_fc(x, sample(samples), control,
194	22x	method), simplify = FALSE)
195		# each element is a data.frame/matrix; extract log2_fold_change column
196		# (4th column)
197	22x	perm_mat <- vapply(permuted, function(z) as.numeric(z[, 4]), numeric(nrow(x)))
198		}
199
200		# Function to compute p-value for a single feature
201	31x	.compute_pval <- function(i) {
202	936x	obs <- log2_fc[i]
203	936x	nulls <- perm_mat[i, ]
204	936x	if (is.na(obs) \|\| all(is.na(nulls))) {
205	!	return(1)
206		}
207	936x	nulls_non_na <- nulls[!is.na(nulls)]
208	936x	n_non_na <- length(nulls_non_na)
209	936x	if (n_non_na == 0) {
210	!	return(1)
211		}
212	936x	cnt <- sum(abs(nulls_non_na) >= abs(obs))
213	936x	pval <- (cnt + 1)/(n_non_na + 1)
214	936x	return(pval)
215		}
216
217		# compute two-sided permutation p-value with pseudocount, in parallel
218	31x	raw_p_values <- unlist(.tsenat_bplapply(seq_len(nrow(perm_mat)), .compute_pval,
219	31x	nthreads = nthreads))
220
221	31x	adjusted_p_values <- p.adjust(raw_p_values, method = pcorr)
222	31x	out <- cbind(raw_p_values, adjusted_p_values)
223	31x	colnames(out) <- c("raw_p_values", "adjusted_p_values")
224	31x	return(out)
225		}
226
227
228		#' Run a differential test by name: Wilcoxon or label-shuffle
229		#'
230		#' Thin wrapper that selects between `wilcoxon()` and `label_shuffling()`.
231		#'
232		#' @param x A matrix with splicing diversity values (rows = features).
233		#' @param samples Character vector of sample group labels (length = ncol(x)).
234		#' @param control Name of the control group (required for label-shuffle).
235		#' @param method Character; one of `'wilcoxon'` or `'shuffle'`.
236		#' @param fc_method Character; aggregation method used by the permutation
237		#' test when `method = 'shuffle'` ('mean' or 'median').
238		#' @param paired Logical passed to `wilcoxon()` when using the Wilcoxon test.
239		#' @param exact Logical passed to `wilcoxon()` to request exact p-values.
240		#' @param randomizations Integer number of permutations for `label_shuffling()`.
241		#' @param pcorr P-value adjustment method (passed to `p.adjust`).
242		#' @param seed Integer seed used to make permutations reproducible (default
243		#' 123). The function sets a temporary RNG seed via `withr::local_seed(seed)`
244		#' before running `label_shuffling()` when `method = 'shuffle'`.
245		#' @param nthreads Number of threads for parallel processing (default: 1).
246		#' Set to > 1 to parallelize per-feature statistical tests in
247		#' `wilcoxon()` or `label_shuffling()`.
248		#' @return A two-column matrix with raw and adjusted p-values (as returned by
249		#' the underlying functions).
250		#' @export
251		#' @examples
252		#' mat <- matrix(rnorm(20), nrow = 5)
253		#' samples <- rep(c('A', 'B'), length.out = ncol(mat))
254		#' test_differential(mat, samples, control = 'A', method = 'wilcoxon')
255		#'
256		#' @param paired_method Character; forwarded to `label_shuffling()` when
257		#' `method = 'shuffle'`. See `label_shuffling()` for details.
258		test_differential <- function(x, samples, control = NULL, method = c("wilcoxon",
259		"shuffle"), fc_method = "mean", paired = FALSE, exact = FALSE, randomizations = 100,
260		pcorr = "BH", seed = 123L, paired_method = c("swap", "signflip"), nthreads = 1) {
261	21x	paired_method <- match.arg(paired_method)
262	21x	method <- match.arg(method)
263	21x	if (method == "wilcoxon") {
264	6x	return(wilcoxon(x, samples, pcorr = pcorr, paired = paired, exact = exact,
265	6x	nthreads = nthreads))
266		}
267
268		# shuffle / permutation-based test
269	15x	if (is.null(control) \|\| !nzchar(control)) {
270	1x	stop("`control` must be provided when method = 'shuffle'", call. = FALSE)
271		}
272	14x	if (!is.null(seed)) {
273	14x	if (!is.numeric(seed) \|\| length(seed) != 1) {
274	1x	stop("`seed` must be a single numeric value", call. = FALSE)
275		}
276		# Use withr::local_seed to temporarily set RNG state for reproducible
277		# permutation tests without permanently modifying the global RNG.
278	13x	withr::local_seed(as.integer(seed))
279		}
280	13x	return(label_shuffling(x, samples, control, fc_method, randomizations = randomizations,
281	13x	pcorr = pcorr, paired = paired, paired_method = paired_method, nthreads = nthreads))
282		}

1		#' Calculate splicing diversity changes between two conditions.
2		#' @param x A \code{SummarizedExperiment} with splicing diversity values for
3		#' each gene in each sample or a \code{data.frame} with gene names in the first
4		#' column and splicing diversity values for each sample in additional columns.
5		#' @param samples A vector of length one, specifying the column name of the
6		#' \code{colData} annotation column from the \code{SummarizedExperiment}
7		#' object, that should be used as the category column or a character vector
8		#' with an equal length to the number of columns in the input dataset,
9		#' specifying the category of each sample in the case of a \code{data.frame}
10		#' input.
11		#' @param control Name of the control sample category, defined in the
12		#' \code{samples} vector, e.g. \code{control = 'Normal'} or \code{control =
13		#' 'WT'}.
14		#' @param method Method to use for calculating the average splicing diversity
15		#' value in a condition. Can be \code{'mean'} or \code{'median'}.
16		#' @param test Method to use for p-value calculation: use \code{'wilcoxon'} for
17		#' Wilcoxon rank sum test or \code{'shuffle'} for a label shuffling test.
18		#' @param randomizations Number of random shuffles, used for the label shuffling
19		#' test (default = 100).
20		#' @param pcorr P-value correction method applied to the Wilcoxon rank sum test
21		#' or label shuffling test results, as defined in the \code{p.adjust} function.
22		#' @param assayno An integer value. In case of multiple assays in a
23		#' \code{SummarizedExperiment} input, the argument specifies the assay number
24		#' to use for difference calculations.
25		#' @param verbose If \code{TRUE}, the function will print additional diagnostic
26		#' messages.
27		#' @param pseudocount Numeric scalar. Passed to \code{calculate_fc} and used to
28		#' add a small value to non-positive group summaries before computing
29		#' differences and log2 fold-changes. Default \code{1e-6}. Rows excluded for
30		#' low sample counts remain \code{NA}.
31		#' @param paired Logical; if `TRUE`, run paired versions of tests when
32		#' supported (default: `FALSE`).
33		#' @param exact Logical; passed to the Wilcoxon test to request exact p-values
34		#' when supported (default: `FALSE`).
35		#' @param nthreads Number of threads for parallel processing (default: 1).
36		#' Set to > 1 to parallelize per-feature statistical tests.
37		#' @return A \code{data.frame} with the mean or median values of splicing
38		#' diversity across sample categories and all samples, log2(fold change) of the
39		#' two different conditions, raw and corrected p-values.
40		#' @import methods
41		#' @importFrom SummarizedExperiment SummarizedExperiment assays assay colData
42		#' @export
43		#' @details The function calculates diversity changes between two sample
44		#' conditions. It uses the output of the diversity calculation function, which
45		#' is a \code{SummarizedExperiment} object of splicing diversity values.
46		#' Additionally, it can use a \code{data.frame} as input, where the first column
47		#' contains gene names, and all additional columns contain splicing diversity
48		#' values for each sample. A vector of sample conditions also serves as input,
49		#' used for aggregating the samples by condition. It calculates the mean or
50		#' median of the splicing diversity data per sample condition, the difference
51		#' of these values and the log2 fold change of the two conditions. Furthermore,
52		#' the user can select a statistical method to calculate the significance of
53		#' the changes. The p-values and adjusted p-values are calculated using a
54		#' Wilcoxon sum rank test or label shuffling test. The function will exclude
55		#' genes of low sample size from the significance calculation, depending on
56		#' which statistical test is applied.
57		#' @examples
58		#' x <- data.frame(Genes = letters[seq_len(10)], matrix(runif(80), ncol = 8))
59		#' samples <- c(rep('Healthy', 4), rep('Pathogenic', 4))
60		#' calculate_difference(x, samples,
61		#' control = 'Healthy', method = 'mean', test =
62		#' 'wilcoxon'
63		#' )
64		calculate_difference <- function(x, samples = NULL, control, method = "mean", test = "wilcoxon",
65		randomizations = 100, pcorr = "BH", assayno = 1, verbose = TRUE, paired = FALSE,
66		exact = FALSE, pseudocount = 0, nthreads = 1) {
67		# internal small helpers (kept here to avoid adding new files)
68	43x	.tsenat_prepare_df <- function(x, samples, assayno) {
69	39x	if (inherits(x, "RangedSummarizedExperiment") \|\| inherits(x, "SummarizedExperiment")) {
70		# allow samples to be NULL (use default 'sample_type' col)
71	4x	if (is.null(samples)) {
72	3x	if ("sample_type" %in% colnames(SummarizedExperiment::colData(x))) {
73	2x	samples_col <- "sample_type"
74		} else {
75	1x	stop("When providing a SummarizedExperiment, supply 'samples' as a colData column name or call map_metadata() to populate 'sample_type'",
76	1x	call. = FALSE)
77		}
78		} else {
79	1x	if (length(samples) != 1) {
80	1x	stop("'samples' must be a single colData column.", call. = FALSE)
81		}
82	!	samples_col <- samples
83		}
84	2x	samples_vec <- SummarizedExperiment::colData(x)[[samples_col]]
85	2x	if (!is.numeric(assayno) \|\| length(SummarizedExperiment::assays(x)) <
86	2x	assayno) {
87	1x	stop("Invalid 'assayno'.", call. = FALSE)
88		}
89	1x	df <- as.data.frame(SummarizedExperiment::assays(x)[[assayno]])
90	1x	genes <- rownames(df)
91	1x	df <- cbind(genes = genes, df)
92	1x	list(df = df, samples = samples_vec)
93		} else {
94	35x	df <- as.data.frame(x)
95	35x	list(df = df, samples = samples)
96		}
97		}
98
99	43x	.tsenat_sample_matrix <- function(dfr) {
100	17x	as.matrix(dfr[, -c(1, ncol(dfr) - 1, ncol(dfr)), drop = FALSE])
101		}
102
103		# Validate input container Reject matrices explicitly (tests expect this
104		# error for matrix input)
105	43x	if (is.matrix(x)) {
106	4x	stop("Input type unsupported; see ?calculate_difference.", call. = FALSE)
107		}
108	39x	if (!(is.data.frame(x) \|\| inherits(x, "RangedSummarizedExperiment") \|\| inherits(x,
109	39x	"SummarizedExperiment"))) {
110	!	stop("Input data type not supported; see ?calculate_difference.", call. = FALSE)
111		}
112
113		# prepare data.frame and sample vector (handles SummarizedExperiment)
114	39x	pd <- .tsenat_prepare_df(x, samples, assayno)
115	36x	df <- pd$df
116	36x	samples <- pd$samples
117
118		# Partition and validate inputs
119	36x	part <- .tsenat_calculate_difference_partition(df = df, samples = samples, control = control,
120	36x	method = method, test = test, pcorr = pcorr, randomizations = randomizations,
121	36x	verbose = verbose)
122	17x	df <- part$df
123	17x	samples <- part$samples
124	17x	groups <- part$groups
125	17x	idx1 <- part$idx1
126	17x	idx2 <- part$idx2
127	17x	df_keep <- part$df_keep
128	17x	df_small <- part$df_small
129
130	17x	result_list <- list()
131
132		# Helper to extract the numeric matrix of sample columns (keeps original
133		# order)
134	17x	sample_matrix <- .tsenat_sample_matrix
135
136	17x	if (nrow(df_keep) > 0) {
137	11x	if (nrow(df_small) > 0 && verbose) {
138	1x	message(sprintf("Note: %d genes excluded due to low sample counts.",
139	1x	nrow(df_small)))
140		}
141	11x	ymat <- sample_matrix(df_keep)
142		# p-value calculation
143	11x	if (test == "wilcoxon") {
144	8x	ptab <- wilcoxon(ymat, samples, pcorr = pcorr, paired = paired, exact = exact,
145	8x	nthreads = nthreads)
146		# wilcoxon should return a data.frame of p-values named
147		# appropriately
148		} else {
149	3x	ptab <- label_shuffling(ymat, samples, control, method, randomizations = randomizations,
150	3x	pcorr = pcorr, paired = paired, nthreads = nthreads)
151		}
152	11x	result_list$tested <- data.frame(genes = df_keep[, 1], calculate_fc(ymat,
153	11x	samples, control, method, pseudocount = pseudocount), ptab, stringsAsFactors = FALSE)
154		}
155
156	17x	if (nrow(df_small) > 0) {
157	6x	small_mat <- sample_matrix(df_small)
158	6x	result_list$small <- data.frame(genes = df_small[, 1], calculate_fc(small_mat,
159	6x	samples, control, method, pseudocount = pseudocount), raw_p_values = NA,
160	6x	adjusted_p_values = NA, stringsAsFactors = FALSE)
161		}
162
163		# Combine results preserving tested rows first
164	17x	if (length(result_list) == 0) {
165	1x	return(data.frame())
166		}
167	16x	res <- do.call(rbind, result_list)
168	16x	rownames(res) <- NULL
169	16x	if (!("log2_fold_change" %in% colnames(res)) && ("log2FC" %in% colnames(res))) {
170	!	res$log2_fold_change <- res$log2FC
171		}
172	16x	return(res)
173		}
174
175
176		#' Linear-model interaction test for Tsallis entropy For each gene, fit a
177		#' linear model of the form `entropy ~ q * group` and extract the p-value for
178		#' the interaction term (whether the effect of `q` differs between groups). The
179		#' function expects a `SummarizedExperiment` produced by
180		#' `calculate_diversity()` when multiple `q` values have been computed (column
181		#' names contain `_q=`).
182		#' @param se A `SummarizedExperiment` containing a `diversity` assay produced
183		#' by `calculate_diversity(..., q = <vector>)`.
184		#' @param sample_type_col Optional column name in `colData(se)` that contains
185		#' a grouping factor for samples (character). If `NULL`, the function will
186		#' attempt to infer group from column names (suffix `_N` interpreted as
187		#' 'Normal').
188		#' @param min_obs Minimum number of non-NA observations required to fit a
189		#' model for a gene (default: 10).
190		#' @param method Modeling method to use for interaction testing: one of
191		#' \code{c('linear', 'lmm', 'gam', 'fpca')} (default: 'linear').
192		#' @param pvalue Type of p-value to compute for linear mixed models: one of
193		#' \code{c('satterthwaite', 'lrt', 'both')} (default: 'satterthwaite').
194		#' @param subject_col Optional column name in `colData(se)` that contains
195		#' subject/individual identifiers for paired or repeated-measures designs
196		#' (character). If provided with `method = 'lmm'`, used as random effect.
197		#' @param paired Logical; whether samples are paired (default: FALSE).
198		#' @param nthreads Number of threads (mc.cores) to use for parallel processing
199		#' (default: 1).
200		#' @param assay_name Name of the assay in the SummarizedExperiment to use
201		#' (default: 'diversity').
202		#' @param verbose Logical; whether to print progress messages during execution
203		#' (default: FALSE).
204		#' @return A data.frame with columns `gene`, `p_interaction`, and
205		#' `adj_p_interaction`, ordered by ascending `p_interaction`.
206		#' @export
207		#' @examples
208		#' data('tcga_brca_luma', package = 'TSENAT')
209		#' rc <- as.matrix(tcga_brca_luma[1:20, -1, drop = FALSE])
210		#' gs <- tcga_brca_luma[1:20, 1]
211		#' se <- calculate_diversity(rc, gs, q = c(0.1, 1), norm = TRUE)
212		#' # Provide a minimal sample-type mapping so the example runs during checks
213		#' SummarizedExperiment::colData(se) <- S4Vectors::DataFrame(
214		#' sample_type = rep(c('Normal', 'Tumor'), length.out = ncol(se)),
215		#' row.names = colnames(se)
216		#' )
217		#' calculate_lm_interaction(se, sample_type_col = 'sample_type')
218		calculate_lm_interaction <- function(se, sample_type_col = NULL, min_obs = 10, method = c("linear",
219		"lmm", "gam", "fpca"), pvalue = c("satterthwaite", "lrt", "both"), subject_col = NULL,
220		paired = FALSE, nthreads = 1, assay_name = "diversity", verbose = FALSE) {
221	15x	method <- match.arg(method)
222	14x	pvalue <- match.arg(pvalue)
223	13x	if (verbose) {
224	!	message("[calculate_lm_interaction] method=", method)
225		}
226		# internal flags: keep these internal to avoid documenting them in Rd
227	13x	suppress_lme4_warnings <- TRUE
228	13x	progress <- FALSE
229	13x	if (!requireNamespace("SummarizedExperiment", quietly = TRUE)) {
230	!	stop("SummarizedExperiment required")
231		}
232
233	13x	mat <- SummarizedExperiment::assay(se, assay_name)
234	13x	if (is.null(mat)) {
235	!	stop(sprintf("Assay '%s' not found in SummarizedExperiment", assay_name))
236		}
237
238	13x	sample_q <- colnames(mat)
239	13x	if (is.null(sample_q) \|\| length(sample_q) == 0) {
240	!	stop("No column names found on diversity assay")
241		}
242
243		# parse sample names and q values from column names like 'Sample_q=0.01'
244	13x	sample_names <- sub("_q=.*", "", sample_q)
245	13x	has_q <- grepl("_q=", sample_q)
246	13x	if (!any(has_q)) {
247	1x	stop("Could not parse q values; expected '_q=' in column names", call. = FALSE)
248		}
249	12x	if (!all(has_q)) {
250	!	stop("Some column names are missing '_q='; ensure all diversity columns include a q value",
251	!	call. = FALSE)
252		}
253	12x	q_vals <- as.numeric(sub(".*_q=", "", sample_q))
254
255		# determine group for each sample
256	12x	sample_type_in_coldata <- !is.null(sample_type_col) && sample_type_col %in% colnames(SummarizedExperiment::colData(se))
257	12x	if (sample_type_in_coldata) {
258	11x	st <- as.character(SummarizedExperiment::colData(se)[, sample_type_col])
259	11x	names(st) <- SummarizedExperiment::colData(se)$samples %\|\|% colnames(mat)
260		# when user provides a sample_type_col, index by the sample names
261		# (strip q suffix)
262	11x	group_vec <- unname(st[sample_names])
263		} else {
264	1x	stop("No sample grouping found: please supply `sample_type_col` or map sample",
265	1x	"types into `colData(se)` before calling calculate_lm_interaction().",
266	1x	call. = FALSE)
267		}
268
269	11x	if (verbose && progress) {
270	!	message("[calculate_lm_interaction] parsed samples and groups")
271		}
272	11x	all_results <- list()
273	11x	fit_one <- function(g) {
274	69x	.tsenat_fit_one_interaction(g = g, se = se, mat = mat, q_vals = q_vals, sample_names = sample_names,
275	69x	group_vec = group_vec, method = method, pvalue = pvalue, subject_col = subject_col,
276	69x	paired = paired, min_obs = min_obs, verbose = verbose, suppress_lme4_warnings = suppress_lme4_warnings,
277	69x	progress = progress)
278		}
279
280	11x	if (nthreads > 1) {
281	1x	res_list <- .tsenat_bplapply(rownames(mat), fit_one, nthreads = nthreads)
282		} else {
283	10x	res_list <- lapply(rownames(mat), fit_one)
284		}
285	11x	all_results <- Filter(Negate(is.null), res_list)
286
287	11x	if (length(all_results) == 0) {
288	2x	return(data.frame())
289		}
290	9x	res <- do.call(rbind, all_results)
291	9x	res$adj_p_interaction <- stats::p.adjust(res$p_interaction, method = "BH")
292		# Sort first by adjusted p-values, then by raw p-values for stable ordering
293	9x	res <- res[order(res$adj_p_interaction, res$p_interaction), , drop = FALSE]
294	9x	rownames(res) <- NULL
295
296	9x	.tsenat_report_fit_summary(res, verbose = verbose)
297
298		# Return the result data.frame (do not attach to or return a
299		# SummarizedExperiment)
300	9x	return(res)
301		}
302
303		# small helper (replacement for `%\|\|%`) to provide default when NULL
304	87x	`%\|\|%` <- function(a, b) if (is.null(a)) b else a

1		#' Calculate Tsallis diversity values for transcripts grouped by gene
2		#' @param x Numeric matrix or data.frame of transcript-level expression
3		#' values (rows = transcripts, columns = samples).
4		#' @param genes Character vector with length equal to nrow(x) assigning each
5		#' transcript to a gene.
6		#' @param norm Logical; if TRUE normalize Tsallis entropy values per gene.
7		#' @param q Numeric scalar or vector of q values to evaluate.
8		#' @param verbose Logical; show diagnostic messages when TRUE.
9		#' @param what Which quantity to return from `calculate_tsallis_entropy`:
10		#' 'S' (Tsallis entropy) or 'D' (Hill numbers) (default: 'S').
11		#' @param nthreads Number of threads for parallel processing (default: 1).
12		#' Set to > 1 to parallelize per-gene entropy calculations.
13		#' @return A data.frame with genes in the first column and per-sample (and
14		#' per-q) Tsallis entropy values in subsequent columns.
15		#' @import stats
16		#' @export
17		#' @examples
18		#' # Create a small transcript expression matrix (4 transcripts x 2 samples)
19		#' mat <- matrix(c(10, 5, 0, 0, 2, 8, 3, 7), nrow = 4, byrow = TRUE)
20		#' colnames(mat) <- c('Sample1', 'Sample2')
21		#' genes <- c('geneA', 'geneA', 'geneB', 'geneB')
22		#'
23		#' # Calculate Tsallis diversity for q=1
24		#' result <- calculate_method(mat, genes, norm = TRUE, q = 1)
25		#' result
26		calculate_method <- function(x, genes, norm = TRUE, verbose = FALSE, q = 2, what = c("S",
27		"D"), nthreads = 1) {
28	35x	what <- match.arg(what)
29		# validate q
30	35x	if (!is.numeric(q) \|\| any(q <= 0)) {
31	!	stop("Argument 'q' must be numeric and greater than 0.", call. = FALSE)
32		}
33		# cannot use aggregate because calculate_tsallis_entropy may return
34		# multiple values when length(q) > 1
35	35x	gene_levels <- unique(genes)
36		# ensure column names order matches the order used when constructing the
37		# result matrix (samples vary outer, q varies inner). If sample names are
38		# missing, synthesize deterministic names so column creation still works.
39	35x	sample_names <- colnames(x)
40	35x	if (is.null(sample_names)) {
41	1x	sample_names <- paste0("Sample", seq_len(ncol(x)))
42		}
43	35x	coln <- as.vector(t(outer(sample_names, q, function(s, qq) paste0(s, "_q=", qq))))
44	35x	rown <- gene_levels
45
46		# compute requested quantity ('S' or 'D') in parallel
47	35x	result_list <- .tsenat_bplapply(gene_levels, function(gene) {
48	681x	.tsenat_tsallis_row(x = x, genes = genes, gene = gene, q = q, norm = norm,
49	681x	what = what)
50	35x	}, nthreads = nthreads)
51
52		# Convert list to matrix (each element is a named vector) result_list is a
53		# list of vectors; combine them into a matrix
54	35x	result_mat <- t(vapply(result_list, identity, FUN.VALUE = setNames(numeric(length(coln)),
55	35x	coln)))
56	35x	colnames(result_mat) <- coln
57	35x	rownames(result_mat) <- rown
58	35x	out_df <- data.frame(Gene = rown, result_mat, check.names = FALSE)
59	35x	if (all(rowSums(!is.na(result_mat)) == 0)) {
60	3x	out_df <- data.frame(Gene = character(0))
61	3x	for (nm in coln) out_df[[nm]] <- numeric(0)
62	3x	x <- out_df
63	3x	return(x)
64		}
65	32x	x <- out_df
66	32x	y <- x[apply(x[2:ncol(x)], 1, function(X) all(is.finite(X))), ]
67	32x	if (nrow(x) - nrow(y) > 0 && verbose == TRUE) {
68	1x	message(sprintf("Note: %d genes excluded.", nrow(x) - nrow(y)))
69		}
70	32x	colnames(y)[1] <- "Gene"
71	32x	return(y)
72		}

1		# Helper functions for parallel processing using BiocParallel and parallel
2		# package
3
4		# Select and initialize parallel backend @param nthreads Number of threads to
5		# use (default: 1) @return BiocParallel BPPARAM object
6		.tsenat_get_bpparam <- function(nthreads = 1) {
7	41x	if (nthreads <= 1) {
8	1x	return(BiocParallel::SerialParam())
9		}
10
11		# On Unix-like systems, use MulticoreParam; on Windows use SnowParam
12	40x	if (.Platform$OS.type == "unix") {
13	40x	return(BiocParallel::MulticoreParam(workers = nthreads))
14		} else {
15	!	return(BiocParallel::SnowParam(workers = nthreads))
16		}
17		}
18
19		# Apply function in parallel using the best available backend @param X Vector
20		# or list to iterate over @param FUN Function to apply @param nthreads Number
21		# of threads (default: 1) @param SIMPLIFY Whether to simplify results (default:
22		# TRUE) @param FUN.VALUE Template for vapply (optional)
23		.tsenat_bplapply <- function(X, FUN, nthreads = 1, SIMPLIFY = TRUE, FUN.VALUE = NULL) {
24	119x	if (nthreads <= 1) {
25		# Serial execution
26	90x	if (is.null(FUN.VALUE)) {
27	81x	return(lapply(X, FUN))
28		} else {
29	9x	return(unname(vapply(X, FUN, FUN.VALUE = FUN.VALUE)))
30		}
31		}
32
33		# Parallel execution
34	29x	bpparam <- .tsenat_get_bpparam(nthreads)
35
36	29x	if (is.null(FUN.VALUE)) {
37	22x	return(BiocParallel::bplapply(X, FUN, BPPARAM = bpparam))
38		} else {
39		# Use bplapply and then simplify with vapply
40	7x	result_list <- BiocParallel::bplapply(X, FUN, BPPARAM = bpparam)
41	7x	return(vapply(result_list, identity, FUN.VALUE = FUN.VALUE))
42		}
43		}
44
45		# Apply function over two vectors in parallel @param X First vector or list
46		# @param Y Second vector or list @param FUN Function to apply (takes two
47		# arguments) @param nthreads Number of threads (default: 1)
48		.tsenat_bpmapply <- function(X, Y, FUN, nthreads = 1) {
49	19x	if (nthreads <= 1) {
50	9x	return(unname(mapply(FUN, X, Y, SIMPLIFY = FALSE)))
51		}
52
53	10x	bpparam <- .tsenat_get_bpparam(nthreads)
54	10x	return(unname(BiocParallel::bpmapply(FUN, X, Y, BPPARAM = bpparam, SIMPLIFY = FALSE)))
55		}