| 1 |
# Helpers for Tsallis entropy calculations |
|
| 2 | ||
| 3 |
.tsenat_calc_S <- function(p, q, tol, n, log_base, norm) {
|
|
| 4 | 4839x |
vapply(q, function(qi) {
|
| 5 | 5707x |
if (abs(qi - 1) < tol) {
|
| 6 | 316x |
sh <- -sum(ifelse(p > 0, p * log(p, base = log_base), 0)) |
| 7 | 316x |
if (norm) {
|
| 8 | 313x |
if (n <= 1) {
|
| 9 |
# Single isoform: normalization is undefined (0/0) Return NaN |
|
| 10 |
# to indicate undefined normalization |
|
| 11 | 26x |
sh <- NaN |
| 12 |
} else {
|
|
| 13 | 287x |
sh <- sh/log(n, base = log_base) |
| 14 |
} |
|
| 15 |
} |
|
| 16 | 316x |
return(sh) |
| 17 |
} else {
|
|
| 18 | 5391x |
ts <- (1 - sum(p^qi))/(qi - 1) |
| 19 | 5391x |
if (norm) {
|
| 20 | 5384x |
max_ts <- (1 - n^(1 - qi))/(qi - 1) |
| 21 |
# Handle single-isoform case: ts is 0, max_ts is also 0, so 0/0 |
|
| 22 |
# is undefined Return NaN to indicate undefined normalization |
|
| 23 | 5384x |
if (abs(max_ts) < tol || n <= 1) {
|
| 24 | 19x |
ts <- NaN |
| 25 |
} else {
|
|
| 26 | 5365x |
ts <- ts/max_ts |
| 27 |
} |
|
| 28 |
} |
|
| 29 | 5391x |
return(ts) |
| 30 |
} |
|
| 31 | 4839x |
}, numeric(1)) |
| 32 |
} |
|
| 33 | ||
| 34 |
.tsenat_calc_D <- function(p, q, tol, log_base) {
|
|
| 35 | 4839x |
vapply(q, function(qi) {
|
| 36 | 5707x |
if (abs(qi - 1) < tol) {
|
| 37 | 316x |
sh <- -sum(ifelse(p > 0, p * log(p, base = log_base), 0)) |
| 38 | 316x |
D1 <- (log_base)^(sh) |
| 39 | 316x |
return(D1) |
| 40 |
} else {
|
|
| 41 | 5391x |
spq <- sum(p^qi) |
| 42 | 5391x |
Dq <- spq^(1/(1 - qi)) |
| 43 | 5391x |
return(Dq) |
| 44 |
} |
|
| 45 | 4839x |
}, numeric(1)) |
| 46 |
} |
|
| 47 |
# Input preparation |
|
| 48 |
.tsenat_prepare_diversity_input <- function(x, genes = NULL, tpm = FALSE, assayno = 1, |
|
| 49 |
verbose = FALSE) {
|
|
| 50 | 45x |
if (!(is.matrix(x) || is.data.frame(x) || is.list(x) || is(x, "DGEList") || is(x, |
| 51 | 45x |
"RangedSummarizedExperiment") || is(x, "SummarizedExperiment"))) {
|
| 52 | 2x |
stop("Input data type is not supported! Please use ?calculate_diversity to see the possible arguments and details.",
|
| 53 | 2x |
call. = FALSE) |
| 54 |
} |
|
| 55 | ||
| 56 | 43x |
if (is(x, "data.frame")) {
|
| 57 | 3x |
x <- as.matrix(x) |
| 58 |
} |
|
| 59 | ||
| 60 | 43x |
if (tpm == TRUE && !is.list(x) && verbose == TRUE) {
|
| 61 | 2x |
message("Note: tpm as a logical argument is only interpreted in case of",
|
| 62 | 2x |
" tximport lists.") |
| 63 |
} |
|
| 64 | ||
| 65 | 43x |
se_assay_mat <- NULL |
| 66 | 43x |
if (is.list(x)) {
|
| 67 | 9x |
if (length(x) == 4 && "counts" %in% names(x)) {
|
| 68 | 4x |
if (tpm == FALSE) {
|
| 69 | 2x |
x <- as.matrix(x$counts) |
| 70 |
} |
|
| 71 | 4x |
if (tpm == TRUE) {
|
| 72 | 2x |
x <- as.matrix(x$abundance) |
| 73 |
} |
|
| 74 | 5x |
} else if (is(x, "DGEList")) {
|
| 75 | 3x |
x <- as.matrix(x$counts) |
| 76 | 3x |
if (verbose == TRUE) {
|
| 77 | 2x |
message("Note: calculate_diversity methods are only applicable",
|
| 78 | 2x |
" if your DGEList contains transcript-level expression", " data.") |
| 79 |
} |
|
| 80 | 3x |
if (tpm == TRUE && verbose == TRUE) {
|
| 81 | 1x |
message("Note: tpm as a logical argument is only interpreted", " in case of tximport lists.")
|
| 82 |
} |
|
| 83 |
} else {
|
|
| 84 | 2x |
stop("The package cannot find any expression data in your input.", call. = FALSE)
|
| 85 |
} |
|
| 86 |
} |
|
| 87 | ||
| 88 | 41x |
if (is(x, "RangedSummarizedExperiment") || is(x, "SummarizedExperiment")) {
|
| 89 | 8x |
md <- NULL |
| 90 | 8x |
try(md <- S4Vectors::metadata(x), silent = TRUE) |
| 91 | 8x |
if (!is.null(md) && !is.null(md$readcounts)) {
|
| 92 | 4x |
se_assay_mat <- as.matrix(md$readcounts) |
| 93 | 4x |
x <- se_assay_mat |
| 94 |
} else {
|
|
| 95 | 4x |
assays_len <- length(SummarizedExperiment::assays(x)) |
| 96 | 4x |
if (!is.numeric(assayno) || assays_len < assayno) {
|
| 97 | 3x |
stop("Please provide a valid assay number.", call. = FALSE)
|
| 98 |
} |
|
| 99 | 1x |
se_assay_mat <- as.matrix(SummarizedExperiment::assays(x)[[assayno]]) |
| 100 | 1x |
x <- se_assay_mat |
| 101 |
} |
|
| 102 | 5x |
if (is.null(genes)) {
|
| 103 | 5x |
if (exists("se_assay_mat") && !is.null(md) && !is.null(md$tx2gene) &&
|
| 104 | 5x |
is.data.frame(md$tx2gene)) {
|
| 105 | 4x |
txmap <- md$tx2gene |
| 106 | 4x |
tx_col <- if ("Transcript" %in% colnames(txmap)) {
|
| 107 | 3x |
"Transcript" |
| 108 |
} else {
|
|
| 109 | 1x |
colnames(txmap)[1] |
| 110 |
} |
|
| 111 | 4x |
gene_col <- if ("Gen" %in% colnames(txmap)) {
|
| 112 | 3x |
"Gen" |
| 113 |
} else {
|
|
| 114 | 1x |
colnames(txmap)[2] |
| 115 |
} |
|
| 116 | 4x |
genes <- as.character(txmap[[gene_col]][match(rownames(se_assay_mat), |
| 117 | 4x |
txmap[[tx_col]])]) |
| 118 | 4x |
rownames(x) <- NULL |
| 119 |
} else {
|
|
| 120 | 1x |
genes <- rownames(x) |
| 121 |
# keep transcript rownames available for downstream metadata |
|
| 122 |
# (we will store original transcript-level counts separately) |
|
| 123 | 1x |
rownames(x) <- NULL |
| 124 |
} |
|
| 125 | ||
| 126 | 5x |
if (is.null(genes)) {
|
| 127 | ! |
stop("Please construct a valid gene set for your ", "SummarizedExperiment.",
|
| 128 | ! |
call. = FALSE) |
| 129 |
} |
|
| 130 |
} |
|
| 131 |
} |
|
| 132 | ||
| 133 | 38x |
list(x = x, genes = genes, se_assay_mat = se_assay_mat) |
| 134 |
} |
| 1 |
#' Map external coldata into a SummarizedExperiment |
|
| 2 |
#' |
|
| 3 |
#' Map an external `coldata` table (sample IDs and a condition/label column) |
|
| 4 |
#' into a `SummarizedExperiment` produced by `calculate_diversity()`. The |
|
| 5 |
#' helper sets `colData(ts_se)$sample_type` when possible and records a |
|
| 6 |
#' `sample_base` identifier for each column. Unmatched entries remain NA and |
|
| 7 |
#' no automatic inference is attempted. |
|
| 8 |
#' @param ts_se A `SummarizedExperiment` whose assay columns are named with |
|
| 9 |
#' sample identifiers. Names may include per-sample annotations; mapping is |
|
| 10 |
#' exact and case-sensitive unless you normalize identifiers beforehand. |
|
| 11 |
#' @param coldata A data.frame with sample metadata (or `NULL`). |
|
| 12 |
#' @param coldata_sample_col Name of the column in `coldata` containing sample |
|
| 13 |
#' identifiers (default: 'Sample'). |
|
| 14 |
#' @param coldata_condition_col Name of the column in `coldata` with |
|
| 15 |
#' condition/labels (default: 'Condition'). |
|
| 16 |
#' @param paired Logical; if `TRUE`, validate pairing and reorder columns so |
|
| 17 |
#' that matched samples for each base are adjacent (default: `FALSE`). Use |
|
| 18 |
#' `paired = TRUE` for paired analyses so downstream paired tests see |
|
| 19 |
#' aligned samples regardless of the original `coldata` order. |
|
| 20 |
#' @return The input `ts_se` with `colData(ts_se)$sample_type` populated when |
|
| 21 |
#' possible. `colData(ts_se)$sample_base` is also added containing the base |
|
| 22 |
#' sample identifier; rownames of `colData(ts_se)` are aligned to the assay |
|
| 23 |
#' column names. |
|
| 24 |
#' @export |
|
| 25 |
#' @examples |
|
| 26 |
#' data('tcga_brca_luma', package = 'TSENAT')
|
|
| 27 |
#' rc <- as.matrix(tcga_brca_luma[1:20, -1, drop = FALSE]) |
|
| 28 |
#' gs <- tcga_brca_luma[1:20, 1] |
|
| 29 |
#' se <- calculate_diversity(rc, gs, q = 0.1, norm = TRUE) |
|
| 30 |
#' sample_names <- sub('_q=.*', '', colnames(SummarizedExperiment::assay(se)))
|
|
| 31 |
#' coldata_df <- data.frame(Sample = sample_names, Condition = rep(c('A', 'B'),
|
|
| 32 |
#' length.out = ncol(se) |
|
| 33 |
#' )) |
|
| 34 |
#' map_metadata(se, coldata_df) |
|
| 35 |
#' # Optionally validate pairs when appropriate |
|
| 36 |
#' map_metadata(se, coldata_df, paired = TRUE) |
|
| 37 |
map_metadata <- function(ts_se, coldata, coldata_sample_col = "Sample", coldata_condition_col = "Condition", |
|
| 38 |
paired = FALSE) {
|
|
| 39 | 30x |
if (is.null(coldata)) {
|
| 40 | 1x |
return(ts_se) |
| 41 |
} |
|
| 42 | 29x |
if (!is.data.frame(coldata) || !all(c(coldata_sample_col, coldata_condition_col) %in% |
| 43 | 29x |
colnames(coldata))) {
|
| 44 | 2x |
return(ts_se) |
| 45 |
} |
|
| 46 |
# Prepare canonical condition order and base extraction. `conds` is sorted |
|
| 47 |
# for deterministic ordering when constructing paired column order. |
|
| 48 |
# `coldata_base` removes a trailing role suffix (e.g. `_N`/`_T`) from the |
|
| 49 |
# `coldata` sample IDs; note that assay column names may contain per-sample |
|
| 50 |
# annotations (e.g. per-q tags) which are handled separately below when |
|
| 51 |
# building `sample_base_names`. |
|
| 52 | 27x |
conds <- sort(unique(as.character(coldata[[coldata_condition_col]]))) |
| 53 | 27x |
coldata_base <- sub("_[^_]+$", "", as.character(coldata[[coldata_sample_col]]))
|
| 54 | 27x |
bases <- unique(coldata_base) |
| 55 | ||
| 56 |
# Optionally validate that coldata defines pairing between conditions |
|
| 57 | 27x |
if (isTRUE(paired) && length(conds) >= 2) {
|
| 58 | 6x |
unpaired <- vapply(bases, function(b) {
|
| 59 | 24x |
length(unique(coldata[[coldata_condition_col]][coldata_base == b])) |
| 60 | 6x |
}, integer(1)) |
| 61 | 6x |
bad <- bases[unpaired != length(conds)] |
| 62 | 6x |
if (length(bad) > 0) {
|
| 63 | 3x |
bad_list <- paste(bad, collapse = ", ") |
| 64 | 3x |
cond_list <- paste(conds, collapse = ", ") |
| 65 | 3x |
msg <- paste0("Unpaired samples found in coldata for bases: ", bad_list,
|
| 66 | 3x |
". Ensure each base has all conditions: ", cond_list) |
| 67 | 3x |
stop(msg, call. = FALSE) |
| 68 |
} |
|
| 69 |
} |
|
| 70 | 24x |
sample_base_names <- sub("_q=.*", "", colnames(SummarizedExperiment::assay(ts_se)))
|
| 71 | ||
| 72 |
# Reorder the SummarizedExperiment columns. |
|
| 73 | 24x |
base_names <- sample_base_names |
| 74 | 24x |
idx_list <- integer(0) |
| 75 | 24x |
if (isTRUE(paired)) {
|
| 76 |
# Build paired-aware order: for each base, list samples in the |
|
| 77 |
# deterministic `conds` order so matched samples become adjacent. |
|
| 78 | 3x |
for (b in bases) {
|
| 79 | 6x |
for (c in conds) {
|
| 80 |
# find sample names in coldata for this base+condition |
|
| 81 | 12x |
samples_for_pair <- as.character(coldata[[coldata_sample_col]])[coldata_base == |
| 82 | 12x |
b & as.character(coldata[[coldata_condition_col]]) == c] |
| 83 | 12x |
if (length(samples_for_pair) == 0) {
|
| 84 | ! |
next |
| 85 |
} |
|
| 86 |
# for each sample name, find matching assay columns (after |
|
| 87 |
# q-stripping) |
|
| 88 | 12x |
for (s in samples_for_pair) {
|
| 89 | 12x |
matches <- which(base_names == s) |
| 90 | 12x |
if (length(matches) > 0) {
|
| 91 | 12x |
idx_list <- c(idx_list, matches) |
| 92 |
} |
|
| 93 |
} |
|
| 94 |
} |
|
| 95 |
} |
|
| 96 |
# Preserve original order for any unmatched columns |
|
| 97 | 3x |
remaining <- setdiff(seq_along(base_names), idx_list) |
| 98 | 3x |
new_order <- c(idx_list, remaining) |
| 99 |
} else {
|
|
| 100 |
# Non-paired: follow the order present in `coldata` |
|
| 101 | 21x |
ordered_samples <- as.character(coldata[[coldata_sample_col]]) |
| 102 | 21x |
for (s in ordered_samples) {
|
| 103 | 103x |
matches <- which(base_names == s) |
| 104 | 103x |
if (length(matches) > 0) {
|
| 105 | 91x |
idx_list <- c(idx_list, matches) |
| 106 |
} |
|
| 107 |
} |
|
| 108 | 21x |
remaining <- setdiff(seq_along(base_names), idx_list) |
| 109 | 21x |
new_order <- c(idx_list, remaining) |
| 110 |
} |
|
| 111 | 24x |
if (length(new_order) > 0 && !all(new_order == seq_along(base_names))) {
|
| 112 | 4x |
ts_se <- ts_se[, new_order, drop = FALSE] |
| 113 | 4x |
sample_base_names <- sub("_q=.*", "", colnames(SummarizedExperiment::assay(ts_se)))
|
| 114 |
} |
|
| 115 | 24x |
st_map <- setNames(as.character(coldata[[coldata_condition_col]]), as.character(coldata[[coldata_sample_col]])) |
| 116 | 24x |
sample_types <- unname(st_map[sample_base_names]) |
| 117 | 24x |
missing_idx <- which(is.na(sample_types)) |
| 118 | 24x |
if (length(missing_idx) > 0) {
|
| 119 | 3x |
missing_samples <- sample_base_names[missing_idx] |
| 120 | 3x |
msg <- paste0("map_metadata: unmatched samples in 'coldata': ", paste(missing_samples,
|
| 121 | 3x |
collapse = ", "), ". Provide matching entries in 'coldata' or populate ", |
| 122 | 3x |
"colData(ts_se)$sample_type beforehand.") |
| 123 | 3x |
stop(msg, call. = FALSE) |
| 124 |
} |
|
| 125 | 21x |
SummarizedExperiment::colData(ts_se)$sample_type <- sample_types |
| 126 |
# Also record the base sample names (without q suffix) per column so |
|
| 127 |
# downstream functions can rely on explicit pairing information. |
|
| 128 | 21x |
SummarizedExperiment::colData(ts_se)$sample_base <- sample_base_names |
| 129 |
# Ensure rownames of colData match assay column names |
|
| 130 | 21x |
assay_cols <- colnames(SummarizedExperiment::assay(ts_se)) |
| 131 | 21x |
rownames(SummarizedExperiment::colData(ts_se)) <- assay_cols |
| 132 |
# Attach transcript-level readcounts and tx->gene mapping to metadata if |
|
| 133 |
# they are available in the calling environment or globalenv and not |
|
| 134 |
# already present in the SummarizedExperiment metadata. This simplifies |
|
| 135 |
# downstream plotting helpers that expect these objects. |
|
| 136 | 21x |
if (requireNamespace("S4Vectors", quietly = TRUE)) {
|
| 137 | 21x |
md <- S4Vectors::metadata(ts_se) |
| 138 |
# prefer existing metadata values; otherwise try common names |
|
| 139 | 21x |
if (is.null(md$readcounts)) {
|
| 140 | 21x |
if (exists("readcounts", envir = parent.frame())) {
|
| 141 | 6x |
md$readcounts <- get("readcounts", envir = parent.frame())
|
| 142 | 15x |
} else if (exists("readcounts", envir = globalenv())) {
|
| 143 | ! |
md$readcounts <- get("readcounts", envir = globalenv())
|
| 144 |
} |
|
| 145 |
} |
|
| 146 | 21x |
if (is.null(md$tx2gene)) {
|
| 147 |
# vignette uses 'txmap' variable name; also accept 'tx2gene' |
|
| 148 | 21x |
if (exists("txmap", envir = parent.frame())) {
|
| 149 | 2x |
md$tx2gene <- get("txmap", envir = parent.frame())
|
| 150 | 19x |
} else if (exists("tx2gene", envir = parent.frame())) {
|
| 151 | 4x |
md$tx2gene <- get("tx2gene", envir = parent.frame())
|
| 152 | 15x |
} else if (exists("txmap", envir = globalenv())) {
|
| 153 | ! |
md$tx2gene <- get("txmap", envir = globalenv())
|
| 154 | 15x |
} else if (exists("tx2gene", envir = globalenv())) {
|
| 155 | ! |
md$tx2gene <- get("tx2gene", envir = globalenv())
|
| 156 |
} |
|
| 157 |
} |
|
| 158 | 21x |
S4Vectors::metadata(ts_se) <- md |
| 159 |
} |
|
| 160 |
# If a diversity assay is present, prepare a simple diversity data.frame |
|
| 161 |
# (genes + per-sample diversity values) and store it in metadata so the |
|
| 162 |
# vignette and plotting helpers can use a ready-made table. |
|
| 163 | 21x |
if ("diversity" %in% SummarizedExperiment::assayNames(ts_se)) {
|
| 164 | 15x |
div_mat <- as.matrix(SummarizedExperiment::assay(ts_se, "diversity")) |
| 165 |
# sample base names without per-q suffixes |
|
| 166 | 15x |
sample_base_names <- sub("_q=.*", "", colnames(div_mat))
|
| 167 |
# prefer explicit sample_type in colData when present |
|
| 168 | 15x |
samples_vec <- NULL |
| 169 | 15x |
if ("sample_type" %in% colnames(SummarizedExperiment::colData(ts_se))) {
|
| 170 | 15x |
samples_vec <- as.character(SummarizedExperiment::colData(ts_se)$sample_type) |
| 171 |
} |
|
| 172 | 15x |
div_df <- as.data.frame(div_mat) |
| 173 | 15x |
genes_col <- if (!is.null(SummarizedExperiment::rowData(ts_se)$genes)) {
|
| 174 | 5x |
SummarizedExperiment::rowData(ts_se)$genes |
| 175 |
} else {
|
|
| 176 | 10x |
rownames(div_df) |
| 177 |
} |
|
| 178 | 15x |
div_df <- cbind(genes = genes_col, div_df) |
| 179 | 15x |
md2 <- S4Vectors::metadata(ts_se) |
| 180 | 15x |
md2$diversity_df <- div_df |
| 181 | 15x |
md2$sample_base_names <- sample_base_names |
| 182 | 15x |
if (!is.null(samples_vec)) {
|
| 183 | 15x |
md2$samples <- samples_vec |
| 184 |
} |
|
| 185 | 15x |
S4Vectors::metadata(ts_se) <- md2 |
| 186 |
} |
|
| 187 | 21x |
return(ts_se) |
| 188 |
} |
|
| 189 | ||
| 190 |
# Map sample names (without '_q=...') to group labels using `colData(se)`. |
|
| 191 |
# Mapping must be provided via `colData(se)`; no inference fallback is used. |
|
| 192 |
map_samples_to_group <- function(sample_names, se = NULL, sample_type_col = NULL, |
|
| 193 |
mat = NULL) {
|
|
| 194 |
# Prefer explicit mapping from colData(se)[, sample_type_col] when |
|
| 195 |
# provided. If `sample_type_col` is not provided, allow a single- condition |
|
| 196 |
# dataset by assigning a single default group 'Group' to all samples (this |
|
| 197 |
# permits plotting single-condition q-curves). |
|
| 198 | 3x |
base_names <- sub("_q=.*", "", colnames(if (!is.null(mat)) {
|
| 199 | 1x |
mat |
| 200 |
} else {
|
|
| 201 | 2x |
SummarizedExperiment::assay(se) |
| 202 |
})) |
|
| 203 | ||
| 204 | 3x |
if (!is.null(se) && !is.null(sample_type_col) && (sample_type_col %in% colnames(SummarizedExperiment::colData(se)))) {
|
| 205 | 2x |
st_vec <- as.character(SummarizedExperiment::colData(se)[, sample_type_col]) |
| 206 | 2x |
names(st_vec) <- base_names |
| 207 | 2x |
st_map <- st_vec[!duplicated(names(st_vec))] |
| 208 |
} else {
|
|
| 209 |
# No explicit mapping: assume single-group dataset |
|
| 210 | 1x |
st_map <- setNames(rep("Group", length(base_names)), base_names)
|
| 211 |
} |
|
| 212 | ||
| 213 | 3x |
mapped <- unname(st_map[sample_names]) |
| 214 | 3x |
missing_idx <- which(is.na(mapped)) |
| 215 | 3x |
if (length(missing_idx) > 0) {
|
| 216 | 1x |
stop(sprintf("Missing sample_type mapping for samples: %s", paste(unique(sample_names[missing_idx]),
|
| 217 | 1x |
collapse = ", "))) |
| 218 |
} |
|
| 219 | 2x |
mapped |
| 220 |
} |
|
| 221 | ||
| 222 |
# Prepare a long-format data.frame for a simple assay (one value per sample) |
|
| 223 |
get_assay_long <- function(se, assay_name = "diversity", value_name = "diversity", |
|
| 224 |
sample_type_col = NULL) {
|
|
| 225 | 11x |
if (!requireNamespace("tidyr", quietly = TRUE)) {
|
| 226 | ! |
stop("tidyr required")
|
| 227 |
} |
|
| 228 | 11x |
if (!requireNamespace("dplyr", quietly = TRUE)) {
|
| 229 | ! |
stop("dplyr required")
|
| 230 |
} |
|
| 231 | 11x |
if (!requireNamespace("SummarizedExperiment", quietly = TRUE)) {
|
| 232 | ! |
stop("SummarizedExperiment required")
|
| 233 |
} |
|
| 234 | ||
| 235 | 11x |
mat <- SummarizedExperiment::assay(se, assay_name) |
| 236 | 10x |
if (is.null(mat)) {
|
| 237 | ! |
stop("Assay not found: ", assay_name)
|
| 238 |
} |
|
| 239 | 10x |
df <- as.data.frame(mat) |
| 240 | 10x |
genes_col <- if (!is.null(SummarizedExperiment::rowData(se)$genes)) {
|
| 241 | 8x |
SummarizedExperiment::rowData(se)$genes |
| 242 |
} else {
|
|
| 243 | 2x |
rownames(df) |
| 244 |
} |
|
| 245 | 10x |
df <- cbind(df, Gene = genes_col) |
| 246 | 10x |
long <- tidyr::pivot_longer(df, -Gene, names_to = "sample", values_to = value_name) |
| 247 | ||
| 248 |
# sample_type: prefer explicit colData mapping when available. If not |
|
| 249 |
# provided, assume a single-group dataset and set `sample_type` to 'Group' |
|
| 250 |
# for all samples. |
|
| 251 | 10x |
if (!is.null(sample_type_col) && (sample_type_col %in% colnames(SummarizedExperiment::colData(se)))) {
|
| 252 | 6x |
st <- as.character(SummarizedExperiment::colData(se)[, sample_type_col]) |
| 253 | 6x |
names(st) <- colnames(mat) |
| 254 | 6x |
st_map <- st[!duplicated(names(st))] |
| 255 | 6x |
sample_base <- sub("_q=.*", "", long$sample)
|
| 256 | 6x |
long$sample_type <- unname(st_map[sample_base]) |
| 257 | 6x |
missing_idx <- which(is.na(long$sample_type)) |
| 258 | 6x |
if (length(missing_idx) > 0) {
|
| 259 | ! |
stop(sprintf("Missing sample_type mapping for samples: %s", paste(unique(sample_base[missing_idx]),
|
| 260 | ! |
collapse = ", "))) |
| 261 |
} |
|
| 262 |
} else {
|
|
| 263 | 4x |
long$sample_type <- rep("Group", nrow(long))
|
| 264 |
} |
|
| 265 | ||
| 266 |
# Filter out NA values but retain sample_type information |
|
| 267 | 10x |
long_filtered <- long[!is.na(long[[value_name]]), , drop = FALSE] |
| 268 | ||
| 269 |
# Check if any data remains after filtering |
|
| 270 | 10x |
if (nrow(long_filtered) == 0) {
|
| 271 | 1x |
stop(sprintf("No non-NA values found in assay '%s'. All values are NA.",
|
| 272 | 1x |
assay_name), call. = FALSE) |
| 273 |
} |
|
| 274 | ||
| 275 | 9x |
long_filtered |
| 276 |
} |
|
| 277 | ||
| 278 |
# Internal small helper: prepare long-format tsallis data from a |
|
| 279 |
# SummarizedExperiment |
|
| 280 |
prepare_tsallis_long <- function(se, assay_name = "diversity", sample_type_col = "sample_type") {
|
|
| 281 | 24x |
if (!requireNamespace("tidyr", quietly = TRUE)) {
|
| 282 | ! |
stop("tidyr required")
|
| 283 |
} |
|
| 284 | 24x |
if (!requireNamespace("dplyr", quietly = TRUE)) {
|
| 285 | ! |
stop("dplyr required")
|
| 286 |
} |
|
| 287 | 24x |
if (!requireNamespace("SummarizedExperiment", quietly = TRUE)) {
|
| 288 | ! |
stop("SummarizedExperiment required")
|
| 289 |
} |
|
| 290 | ||
| 291 | 24x |
mat <- SummarizedExperiment::assay(se, assay_name) |
| 292 | 24x |
if (is.null(mat)) {
|
| 293 | ! |
stop("Assay not found: ", assay_name)
|
| 294 |
} |
|
| 295 | 24x |
df <- as.data.frame(mat) |
| 296 | 24x |
genes_col <- if (!is.null(SummarizedExperiment::rowData(se)$genes)) {
|
| 297 | 14x |
SummarizedExperiment::rowData(se)$genes |
| 298 |
} else {
|
|
| 299 | 10x |
rownames(df) |
| 300 |
} |
|
| 301 | 24x |
df <- cbind(df, Gene = genes_col) |
| 302 | ||
| 303 | 24x |
long <- tidyr::pivot_longer(df, -Gene, names_to = "sample_q", values_to = "tsallis") |
| 304 | 24x |
if (any(grepl("_q=", long$sample_q))) {
|
| 305 | 13x |
long <- tidyr::separate(long, sample_q, into = c("sample", "q"), sep = "_q=")
|
| 306 | 13x |
long$q <- as.factor(as.numeric(long$q)) |
| 307 |
} else {
|
|
| 308 | 11x |
long$sample <- long$sample_q |
| 309 | 11x |
long$q <- NA |
| 310 |
} |
|
| 311 | ||
| 312 | 24x |
if (!is.null(sample_type_col) && (sample_type_col %in% colnames(SummarizedExperiment::colData(se)))) {
|
| 313 | 20x |
st_vec <- as.character(SummarizedExperiment::colData(se)[, sample_type_col]) |
| 314 | 20x |
names(st_vec) <- sub("_q=.*", "", colnames(mat))
|
| 315 | 20x |
st_map <- st_vec[!duplicated(names(st_vec))] |
| 316 | 20x |
long$group <- unname(st_map[as.character(long$sample)]) |
| 317 | 20x |
missing_idx <- which(is.na(long$group)) |
| 318 | 20x |
if (length(missing_idx) > 0) {
|
| 319 | ! |
stop(sprintf("Missing sample_type mapping for samples: %s", paste(unique(as.character(long$sample)[missing_idx]),
|
| 320 | ! |
collapse = ", "))) |
| 321 |
} |
|
| 322 |
} else {
|
|
| 323 | 4x |
long$group <- rep("Group", nrow(long))
|
| 324 |
} |
|
| 325 | ||
| 326 | 24x |
as.data.frame(long[!is.na(long$tsallis), , drop = FALSE]) |
| 327 |
} |
|
| 328 | ||
| 329 | ||
| 330 |
#' Map transcript IDs from a tx2gene table to a readcounts matrix |
|
| 331 |
#' |
|
| 332 |
#' Assign transcript identifiers as row names of a transcript-level read |
|
| 333 |
#' counts matrix so downstream plotting functions can identify transcripts. |
|
| 334 |
#' The tx2gene mapping can be provided as a file path (TSV) or a |
|
| 335 |
#' data.frame with a `Transcript` column. |
|
| 336 |
#' |
|
| 337 |
#' @param readcounts A numeric matrix or data.frame of read counts (rows = transcripts). |
|
| 338 |
#' @param tx2gene Either a path to a tab-delimited file or a data.frame with at least a `Transcript` column. |
|
| 339 |
#' @param tx_col Name of the transcript ID column in `tx2gene` (default: 'Transcript'). |
|
| 340 |
#' @param verbose Logical; print informative messages (default: FALSE). |
|
| 341 |
#' @return The input `readcounts` with rownames set to the transcript IDs. |
|
| 342 |
#' @examples |
|
| 343 |
#' readcounts <- matrix(1:4, nrow = 2, dimnames = list(NULL, c('s1', 's2')))
|
|
| 344 |
#' tx2gene <- data.frame(Transcript = c('tx1', 'tx2'), Gene = c('g1', 'g2'), stringsAsFactors = FALSE)
|
|
| 345 |
#' rc <- map_tx_to_readcounts(readcounts, tx2gene) |
|
| 346 |
#' rownames(rc) |
|
| 347 |
#' @export |
|
| 348 |
map_tx_to_readcounts <- function(readcounts, tx2gene, tx_col = "Transcript", verbose = FALSE) {
|
|
| 349 | 7x |
if (is.character(tx2gene) && length(tx2gene) == 1) {
|
| 350 | 2x |
if (!file.exists(tx2gene)) {
|
| 351 | ! |
stop("tx2gene file not found: ", tx2gene, call. = FALSE)
|
| 352 |
} |
|
| 353 | 2x |
txmap <- utils::read.delim(tx2gene, header = TRUE, stringsAsFactors = FALSE) |
| 354 | 5x |
} else if (is.data.frame(tx2gene)) {
|
| 355 | 5x |
txmap <- tx2gene |
| 356 |
} else {
|
|
| 357 | ! |
stop("'tx2gene' must be a file path or a data.frame.", call. = FALSE)
|
| 358 |
} |
|
| 359 | ||
| 360 | 7x |
if (!(tx_col %in% colnames(txmap))) {
|
| 361 | ! |
stop(sprintf("tx2gene mapping must contain column '%s'", tx_col), call. = FALSE)
|
| 362 |
} |
|
| 363 | ||
| 364 |
# Ensure readcounts is a matrix-like object |
|
| 365 | 7x |
if (is.data.frame(readcounts)) {
|
| 366 | 1x |
readcounts <- as.matrix(readcounts) |
| 367 |
} |
|
| 368 | 7x |
if (!is.matrix(readcounts)) {
|
| 369 | ! |
stop("'readcounts' must be a matrix or data.frame", call. = FALSE)
|
| 370 |
} |
|
| 371 | ||
| 372 | 7x |
n_rc <- nrow(readcounts) |
| 373 | 7x |
n_tx <- nrow(txmap) |
| 374 | ||
| 375 | 7x |
if (n_tx == n_rc) {
|
| 376 | 3x |
rownames(readcounts) <- as.character(txmap[[tx_col]]) |
| 377 | 3x |
if (verbose) {
|
| 378 | ! |
message(sprintf("Assigned %d transcript rownames from tx2gene.", n_rc))
|
| 379 |
} |
|
| 380 | 3x |
return(readcounts) |
| 381 |
} |
|
| 382 | ||
| 383 |
# If counts differ, attempt to match by transcript identifiers if present |
|
| 384 | 4x |
tx_ids <- as.character(txmap[[tx_col]]) |
| 385 | 4x |
if (!is.null(rownames(readcounts)) && all(rownames(readcounts) %in% tx_ids)) {
|
| 386 |
# reorder txmap to match readcounts row order and set rownames |
|
| 387 | 1x |
matched_idx <- match(rownames(readcounts), tx_ids) |
| 388 | 1x |
rownames(readcounts) <- tx_ids[matched_idx] |
| 389 | 1x |
if (verbose) {
|
| 390 | 1x |
message("Matched and assigned transcript IDs by existing readcounts rownames.")
|
| 391 |
} |
|
| 392 | 1x |
return(readcounts) |
| 393 |
} |
|
| 394 | ||
| 395 | 3x |
stop(sprintf("Number of transcripts in tx2gene (%d) does not match readcounts rows (%d), and automatic matching failed.",
|
| 396 | 3x |
n_tx, n_rc), call. = FALSE) |
| 397 |
} |
| 1 |
# Summary reporting helper |
|
| 2 |
.tsenat_report_fit_summary <- function(res, verbose = TRUE) {
|
|
| 3 | 12x |
if (verbose && "fit_method" %in% colnames(res)) {
|
| 4 | 2x |
total_genes <- nrow(res) |
| 5 | 2x |
fallback_mask <- !is.na(res$fit_method) & res$fit_method != "lmer" |
| 6 | 2x |
n_fallback <- sum(fallback_mask) |
| 7 | 2x |
if (n_fallback > 0) {
|
| 8 | 2x |
tab <- table(res$fit_method[fallback_mask]) |
| 9 | 2x |
tab_str <- paste(sprintf("%s=%d", names(tab), as.integer(tab)), collapse = ", ")
|
| 10 | 2x |
message(sprintf("[calculate_lm_interaction] fallback fits used: %d/%d genes (%s)",
|
| 11 | 2x |
n_fallback, total_genes, tab_str)) |
| 12 |
} |
|
| 13 | 2x |
if ("singular" %in% colnames(res)) {
|
| 14 | 2x |
n_sing <- sum(as.logical(res$singular), na.rm = TRUE) |
| 15 | 2x |
if (n_sing > 0) {
|
| 16 | 2x |
message(sprintf("[calculate_lm_interaction] singular fits detected: %d/%d genes",
|
| 17 | 2x |
n_sing, total_genes)) |
| 18 |
} |
|
| 19 |
} |
|
| 20 |
} |
|
| 21 |
} |
|
| 22 |
# GAM interaction helper |
|
| 23 |
.tsenat_gam_interaction <- function(df, q_vals, g, min_obs = 10) {
|
|
| 24 | 12x |
if (!requireNamespace("mgcv", quietly = TRUE)) {
|
| 25 | ! |
stop("Package 'mgcv' is required for method = 'gam'")
|
| 26 |
} |
|
| 27 | 12x |
uq_len <- length(unique(na.omit(q_vals))) |
| 28 | 12x |
k_q <- max(2, min(10, uq_len - 1)) |
| 29 | 12x |
fit_null <- try(mgcv::gam(entropy ~ group + s(q, k = k_q), data = df), silent = TRUE) |
| 30 | 12x |
fit_alt <- try(mgcv::gam(entropy ~ group + s(q, by = group, k = k_q), data = df), |
| 31 | 12x |
silent = TRUE) |
| 32 | 12x |
if (inherits(fit_null, "try-error") || inherits(fit_alt, "try-error")) {
|
| 33 | 5x |
return(NULL) |
| 34 |
} |
|
| 35 | 7x |
an <- try(mgcv::anova.gam(fit_null, fit_alt, test = "F"), silent = TRUE) |
| 36 | 7x |
if (inherits(an, "try-error")) {
|
| 37 | ! |
return(NULL) |
| 38 |
} |
|
| 39 | 7x |
p_interaction <- NA_real_ |
| 40 | 7x |
if (nrow(an) >= 2) {
|
| 41 | 7x |
if ("Pr(F)" %in% colnames(an)) {
|
| 42 | ! |
p_interaction <- an[2, "Pr(F)"] |
| 43 | 7x |
} else if ("Pr(>F)" %in% colnames(an)) {
|
| 44 | 4x |
p_interaction <- an[2, "Pr(>F)"] |
| 45 | 3x |
} else if ("p-value" %in% colnames(an)) {
|
| 46 | ! |
p_interaction <- an[2, "p-value"] |
| 47 |
} |
|
| 48 |
} |
|
| 49 | 7x |
return(data.frame(gene = g, p_interaction = p_interaction, stringsAsFactors = FALSE)) |
| 50 |
} |
|
| 51 | ||
| 52 |
# FPCA interaction helper |
|
| 53 |
.tsenat_fpca_interaction <- function(mat, q_vals, sample_names, group_vec, g, min_obs = 10) {
|
|
| 54 | 12x |
uq <- sort(unique(q_vals)) |
| 55 | 12x |
samples_u <- unique(sample_names) |
| 56 | 12x |
curve_mat <- matrix(NA_real_, nrow = length(samples_u), ncol = length(uq)) |
| 57 | 12x |
if (length(samples_u) > 0) {
|
| 58 | 12x |
rownames(curve_mat) <- samples_u |
| 59 |
} |
|
| 60 | 12x |
for (i in seq_along(sample_names)) {
|
| 61 | 116x |
s <- sample_names[i] |
| 62 | 116x |
qv <- q_vals[i] |
| 63 | 116x |
qi <- match(qv, uq) |
| 64 | 116x |
if (is.na(qi)) {
|
| 65 | 2x |
next |
| 66 |
} |
|
| 67 | 114x |
if (s %in% rownames(curve_mat)) {
|
| 68 | 114x |
curve_mat[s, qi] <- as.numeric(mat[g, i]) |
| 69 |
} |
|
| 70 |
} |
|
| 71 | 12x |
good_rows <- which(rowSums(!is.na(curve_mat)) >= max(2, ceiling(ncol(curve_mat)/2))) |
| 72 | 12x |
if (length(good_rows) < min_obs) {
|
| 73 | 6x |
return(NULL) |
| 74 |
} |
|
| 75 | 6x |
mat_sub <- curve_mat[good_rows, , drop = FALSE] |
| 76 | 6x |
col_means <- apply(mat_sub, 2, function(col) mean(col, na.rm = TRUE)) |
| 77 | 6x |
for (r in seq_len(nrow(mat_sub))) mat_sub[r, is.na(mat_sub[r, ])] <- col_means[is.na(mat_sub[r, |
| 78 |
])] |
|
| 79 | 6x |
pca <- try(stats::prcomp(mat_sub, center = TRUE, scale. = FALSE), silent = TRUE) |
| 80 | 6x |
if (inherits(pca, "try-error")) {
|
| 81 | ! |
return(NULL) |
| 82 |
} |
|
| 83 | 6x |
if (ncol(pca$x) < 1) {
|
| 84 | ! |
return(NULL) |
| 85 |
} |
|
| 86 | 6x |
pc1 <- pca$x[, 1] |
| 87 | 6x |
used_samples <- rownames(mat_sub) |
| 88 | 6x |
grp_vals <- group_vec[match(used_samples, sample_names)] |
| 89 | 6x |
if (length(unique(na.omit(grp_vals))) < 2) {
|
| 90 | 1x |
return(NULL) |
| 91 |
} |
|
| 92 | 5x |
g1 <- unique(na.omit(grp_vals))[1] |
| 93 | 5x |
g2 <- unique(na.omit(grp_vals))[2] |
| 94 | 5x |
x1 <- pc1[grp_vals == g1] |
| 95 | 5x |
x2 <- pc1[grp_vals == g2] |
| 96 | 5x |
if (length(x1) < 2 || length(x2) < 2) {
|
| 97 | 1x |
return(NULL) |
| 98 |
} |
|
| 99 | 4x |
t_res <- try(stats::t.test(x1, x2), silent = TRUE) |
| 100 | 4x |
if (inherits(t_res, "try-error")) {
|
| 101 | ! |
return(NULL) |
| 102 |
} |
|
| 103 | 4x |
pval <- as.numeric(t_res$p.value) |
| 104 | 4x |
return(data.frame(gene = g, p_interaction = pval, stringsAsFactors = FALSE)) |
| 105 |
} |
|
| 106 | ||
| 107 |
# Fit function extracted from calculate_lm_interaction |
|
| 108 |
.tsenat_fit_one_interaction <- function(g, se, mat, q_vals, sample_names, group_vec, |
|
| 109 |
method, pvalue, subject_col, paired, min_obs, verbose, suppress_lme4_warnings, |
|
| 110 |
progress) {
|
|
| 111 | 81x |
vals <- as.numeric(mat[g, ]) |
| 112 | 81x |
df <- data.frame(entropy = vals, q = q_vals, group = factor(group_vec)) |
| 113 | 81x |
if (sum(!is.na(df$entropy)) < min_obs) {
|
| 114 | 2x |
return(NULL) |
| 115 |
} |
|
| 116 | 79x |
if (length(unique(na.omit(df$group))) < 2) {
|
| 117 | 51x |
return(NULL) |
| 118 |
} |
|
| 119 | ||
| 120 | 28x |
if (method == "linear") {
|
| 121 | 7x |
fit <- try(stats::lm(entropy ~ q * group, data = df), silent = TRUE) |
| 122 | 7x |
if (inherits(fit, "try-error")) {
|
| 123 | ! |
return(NULL) |
| 124 |
} |
|
| 125 | 7x |
coefs <- summary(fit)$coefficients |
| 126 | 7x |
ia_idx <- grep("^q:group", rownames(coefs))
|
| 127 | 7x |
if (length(ia_idx) == 0) {
|
| 128 | ! |
return(NULL) |
| 129 |
} |
|
| 130 | 7x |
p_interaction <- coefs[ia_idx[1], "Pr(>|t|)"] |
| 131 | 7x |
return(data.frame(gene = g, p_interaction = p_interaction, stringsAsFactors = FALSE)) |
| 132 |
} |
|
| 133 | ||
| 134 | 21x |
if (method == "lmm") {
|
| 135 | 15x |
if (!requireNamespace("lme4", quietly = TRUE)) {
|
| 136 | ! |
stop("Package 'lme4' is required for method = 'lmm'")
|
| 137 |
} |
|
| 138 |
# determine subject IDs for random effect |
|
| 139 | 15x |
subject <- NULL |
| 140 | 15x |
if (!is.null(subject_col)) {
|
| 141 | 3x |
if (!(subject_col %in% colnames(SummarizedExperiment::colData(se)))) {
|
| 142 | 1x |
stop(sprintf("subject_col '%s' not found in colData(se)", subject_col))
|
| 143 |
} |
|
| 144 | 2x |
subj_full <- as.character(SummarizedExperiment::colData(se)[, subject_col]) |
| 145 | 2x |
names(subj_full) <- SummarizedExperiment::colData(se)$samples %||% colnames(mat) |
| 146 | 2x |
subject <- unname(subj_full[sample_names]) |
| 147 | 12x |
} else if (paired) {
|
| 148 |
# require a paired identifier (sample_base) in colData |
|
| 149 | 2x |
if ("sample_base" %in% colnames(SummarizedExperiment::colData(se))) {
|
| 150 | 1x |
sb <- as.character(SummarizedExperiment::colData(se)[, "sample_base"]) |
| 151 | 1x |
names(sb) <- SummarizedExperiment::colData(se)$samples %||% colnames(mat) |
| 152 | 1x |
subject <- unname(sb[sample_names]) |
| 153 |
} else {
|
|
| 154 | 1x |
stop("paired = TRUE but no 'sample_base' column found in colData(se); call map_metadata(..., paired = TRUE) or supply subject_col")
|
| 155 |
} |
|
| 156 |
} else {
|
|
| 157 |
# fallback to sample base derived from column names |
|
| 158 | 10x |
subject <- sample_names |
| 159 |
} |
|
| 160 | 13x |
df$subject <- factor(subject) |
| 161 |
# require at least two subjects and at least two groups represented |
|
| 162 | 13x |
if (length(unique(na.omit(df$subject))) < 2) {
|
| 163 | 1x |
return(NULL) |
| 164 |
} |
|
| 165 | 12x |
if (length(unique(na.omit(df$group))) < 2) {
|
| 166 | ! |
return(NULL) |
| 167 |
} |
|
| 168 |
# fit null (no interaction) and alternative (with q:group interaction) |
|
| 169 | 12x |
mm_suppress_pattern <- "boundary \\(singular\\) fit|Computed variance-covariance matrix problem|not a positive definite matrix" |
| 170 | ||
| 171 |
# attempt to fit mixed models (null and alternative) using helper |
|
| 172 | 12x |
fit0 <- .tsenat_try_lmer(entropy ~ q + group + (1 | subject), df, suppress_lme4_warnings = suppress_lme4_warnings, |
| 173 | 12x |
verbose = verbose, mm_suppress_pattern = mm_suppress_pattern) |
| 174 | 12x |
fit1 <- .tsenat_try_lmer(entropy ~ q * group + (1 | subject), df, suppress_lme4_warnings = suppress_lme4_warnings, |
| 175 | 12x |
verbose = verbose, mm_suppress_pattern = mm_suppress_pattern) |
| 176 | ||
| 177 |
# If either fit failed, try falling back to simpler approaches |
|
| 178 | 12x |
fallback_lm <- NULL |
| 179 | 12x |
used_fit_method <- "lmer" |
| 180 | 12x |
used_singular <- FALSE |
| 181 | 12x |
if (inherits(fit0, "try-error") || inherits(fit1, "try-error") || (inherits(fit0, |
| 182 | 12x |
"lmerMod") && attr(fit0, "singular") == TRUE) || (inherits(fit1, "lmerMod") && |
| 183 | 12x |
attr(fit1, "singular") == TRUE)) {
|
| 184 | 11x |
used_fit_method <- "fallback" |
| 185 | 11x |
used_singular <- (inherits(fit0, "lmerMod") && attr(fit0, "singular") == |
| 186 | 11x |
TRUE) || (inherits(fit1, "lmerMod") && attr(fit1, "singular") == |
| 187 | 11x |
TRUE) |
| 188 | 11x |
if ((verbose && progress) || (!verbose && progress)) {
|
| 189 | 1x |
message("[calculate_lm_interaction] mixed model singular or failed; trying simpler fixed-effects fallback")
|
| 190 |
} |
|
| 191 | 11x |
fb <- .tsenat_try_lm_fallbacks(df, verbose = verbose) |
| 192 | 11x |
if (!is.null(fb)) {
|
| 193 | 11x |
fallback_lm <- fb |
| 194 | 11x |
used_fit_method <- fb$method |
| 195 |
} |
|
| 196 |
} else {
|
|
| 197 | 1x |
used_singular <- (inherits(fit0, "lmerMod") && attr(fit0, "singular") == |
| 198 | 1x |
TRUE) || (inherits(fit1, "lmerMod") && attr(fit1, "singular") == |
| 199 | 1x |
TRUE) |
| 200 | 1x |
if (used_singular) {
|
| 201 | ! |
used_fit_method <- "lmer_singular" |
| 202 |
} else {
|
|
| 203 | 1x |
used_fit_method <- "lmer" |
| 204 |
} |
|
| 205 |
} |
|
| 206 | ||
| 207 | 12x |
lrt_p <- NA_real_ |
| 208 | 12x |
if (!is.null(fallback_lm)) {
|
| 209 | 11x |
lrt_p <- .tsenat_extract_lrt_p(fallback_lm$fit0, fallback_lm$fit1) |
| 210 |
} else {
|
|
| 211 | 1x |
lrt_p <- .tsenat_extract_lrt_p(fit0, fit1) |
| 212 |
} |
|
| 213 | ||
| 214 | 12x |
satter_p <- NA_real_ |
| 215 | 12x |
if (pvalue %in% c("satterthwaite", "both")) {
|
| 216 | 8x |
mm_suppress_pattern <- "boundary \\(singular\\) fit|Computed variance-covariance matrix problem|not a positive definite matrix" |
| 217 | 8x |
muffle_cond <- suppress_lme4_warnings || (!verbose) |
| 218 | 8x |
satter_p <- withCallingHandlers(.tsenat_extract_satterthwaite_p(fit1, |
| 219 | 8x |
fallback_lm), warning = function(w) {
|
| 220 | ! |
if (muffle_cond && grepl(mm_suppress_pattern, conditionMessage(w), |
| 221 | ! |
ignore.case = TRUE)) {
|
| 222 | ! |
invokeRestart("muffleWarning")
|
| 223 |
} |
|
| 224 | 8x |
}, message = function(m) {
|
| 225 | ! |
if (muffle_cond && grepl(mm_suppress_pattern, conditionMessage(m), |
| 226 | ! |
ignore.case = TRUE)) {
|
| 227 | ! |
invokeRestart("muffleMessage")
|
| 228 |
} |
|
| 229 |
}) |
|
| 230 |
} |
|
| 231 | ||
| 232 |
# choose primary p_interaction according to user preference |
|
| 233 | 12x |
p_interaction <- NA_real_ |
| 234 | 12x |
if (pvalue == "satterthwaite") {
|
| 235 | 4x |
p_interaction <- satter_p |
| 236 | 8x |
} else if (pvalue == "lrt") {
|
| 237 | 4x |
p_interaction <- lrt_p |
| 238 | 4x |
} else if (pvalue == "both") {
|
| 239 | 4x |
if (!is.na(satter_p)) {
|
| 240 | 2x |
p_interaction <- satter_p |
| 241 |
} else {
|
|
| 242 | 2x |
p_interaction <- lrt_p |
| 243 |
} |
|
| 244 |
} |
|
| 245 | ||
| 246 | 12x |
return(data.frame(gene = g, p_interaction = p_interaction, p_lrt = lrt_p, |
| 247 | 12x |
p_satterthwaite = satter_p, fit_method = used_fit_method, singular = used_singular, |
| 248 | 12x |
stringsAsFactors = FALSE)) |
| 249 |
} |
|
| 250 | ||
| 251 | 6x |
if (method == "gam") {
|
| 252 | 3x |
return(.tsenat_gam_interaction(df, q_vals, g, min_obs = min_obs)) |
| 253 |
} |
|
| 254 | ||
| 255 | 3x |
if (method == "fpca") {
|
| 256 | 3x |
return(.tsenat_fpca_interaction(mat, q_vals, sample_names, group_vec, g, |
| 257 | 3x |
min_obs = min_obs)) |
| 258 |
} |
|
| 259 | ! |
return(NULL) |
| 260 |
} |
|
| 261 |
## All helpers for calculate_lm_interaction |
|
| 262 | ||
| 263 |
# Try lme4::lmer with multiple optimizers and controlled warnings. |
|
| 264 |
.tsenat_try_lmer <- function(formula, data, suppress_lme4_warnings = TRUE, verbose = FALSE, |
|
| 265 |
mm_suppress_pattern = "boundary \\(singular\\) fit|Computed variance-covariance matrix problem|not a positive definite matrix") {
|
|
| 266 | 23x |
if (!requireNamespace("lme4", quietly = TRUE)) {
|
| 267 | ! |
stop("Package 'lme4' is required for mixed-model fitting")
|
| 268 |
} |
|
| 269 | 23x |
opts <- list(list(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05)), list(optimizer = "nloptwrap", |
| 270 | 23x |
optCtrl = list(maxfun = 5e+05))) |
| 271 | 23x |
for (o in opts) {
|
| 272 | 24x |
ctrl <- lme4::lmerControl(optimizer = o$optimizer, optCtrl = o$optCtrl) |
| 273 | 24x |
muffle_cond <- suppress_lme4_warnings || (!verbose) |
| 274 | 24x |
fit_try <- withCallingHandlers(try(lme4::lmer(formula, data = data, REML = FALSE, |
| 275 | 24x |
control = ctrl), silent = TRUE), warning = function(w) {
|
| 276 | ! |
if (muffle_cond && grepl(mm_suppress_pattern, conditionMessage(w), ignore.case = TRUE)) {
|
| 277 | ! |
invokeRestart("muffleWarning")
|
| 278 |
} |
|
| 279 | 24x |
}, message = function(m) {
|
| 280 | 16x |
if (muffle_cond && grepl(mm_suppress_pattern, conditionMessage(m), ignore.case = TRUE)) {
|
| 281 | 15x |
invokeRestart("muffleMessage")
|
| 282 |
} |
|
| 283 |
}) |
|
| 284 | 24x |
if (!inherits(fit_try, "try-error")) {
|
| 285 |
# check singularity if function available |
|
| 286 | 22x |
is_sing <- FALSE |
| 287 | 22x |
if (exists("isSingular", where = asNamespace("lme4"), inherits = FALSE)) {
|
| 288 | 22x |
is_sing <- tryCatch(lme4::isSingular(fit_try, tol = 1e-04), error = function(e) FALSE) |
| 289 |
} |
|
| 290 | 22x |
attr(fit_try, "singular") <- is_sing |
| 291 | 22x |
return(fit_try) |
| 292 |
} |
|
| 293 |
} |
|
| 294 |
# all attempts failed |
|
| 295 | 1x |
return(structure("error", class = "try-error"))
|
| 296 |
} |
|
| 297 | ||
| 298 |
# LM fallback helpers |
|
| 299 |
.tsenat_try_lm_fallbacks <- function(df, verbose = FALSE) {
|
|
| 300 |
fit0_lm <- try(stats::lm(entropy ~ q + group + subject, data = df), silent = TRUE) |
|
| 301 |
fit1_lm <- try(stats::lm(entropy ~ q * group + subject, data = df), silent = TRUE) |
|
| 302 |
if (!inherits(fit0_lm, "try-error") && !inherits(fit1_lm, "try-error")) {
|
|
| 303 |
return(list(fit0 = fit0_lm, fit1 = fit1_lm, method = "lm_subject")) |
|
| 304 |
} |
|
| 305 |
fit0_lm2 <- try(stats::lm(entropy ~ q + group, data = df), silent = TRUE) |
|
| 306 |
fit1_lm2 <- try(stats::lm(entropy ~ q * group, data = df), silent = TRUE) |
|
| 307 |
if (!inherits(fit0_lm2, "try-error") && !inherits(fit1_lm2, "try-error")) {
|
|
| 308 |
return(list(fit0 = fit0_lm2, fit1 = fit1_lm2, method = "lm_nosubject")) |
|
| 309 |
} |
|
| 310 |
return(NULL) |
|
| 311 |
} |
|
| 312 | ||
| 313 |
# LRT p-value extraction |
|
| 314 |
.tsenat_extract_lrt_p <- function(fit0, fit1) {
|
|
| 315 |
an <- try(stats::anova(fit0, fit1), silent = TRUE) |
|
| 316 |
if (!inherits(an, "try-error") && nrow(an) >= 2) {
|
|
| 317 |
pcol <- grep("Pr\\(>F\\)|Pr\\(>Chisq\\)|Pr\\(>Chi\\)", colnames(an), value = TRUE)
|
|
| 318 |
if (length(pcol) == 0) {
|
|
| 319 |
return(as.numeric(an[2, ncol(an)])) |
|
| 320 |
} else {
|
|
| 321 |
return(as.numeric(an[2, pcol[1]])) |
|
| 322 |
} |
|
| 323 |
} |
|
| 324 |
return(NA_real_) |
|
| 325 |
} |
|
| 326 | ||
| 327 |
# Satterthwaite p-value extraction |
|
| 328 |
.tsenat_extract_satterthwaite_p <- function(fit1, fallback_lm = NULL) {
|
|
| 329 |
if (!is.null(fallback_lm)) {
|
|
| 330 |
coefs <- try(summary(fallback_lm$fit1)$coefficients, silent = TRUE) |
|
| 331 |
if (!is.null(coefs) && !inherits(coefs, "try-error")) {
|
|
| 332 |
ia_idx <- grep("^q:group", rownames(coefs))
|
|
| 333 |
if (length(ia_idx) > 0) {
|
|
| 334 |
return(coefs[ia_idx[1], "Pr(>|t|)"]) |
|
| 335 |
} |
|
| 336 |
} |
|
| 337 |
return(NA_real_) |
|
| 338 |
} |
|
| 339 |
if (requireNamespace("lmerTest", quietly = TRUE) && inherits(fit1, "lmerMod") &&
|
|
| 340 |
!isTRUE(attr(fit1, "singular"))) {
|
|
| 341 |
fit_lt <- try(lmerTest::lmer(stats::formula(fit1), data = stats::model.frame(fit1), |
|
| 342 |
REML = FALSE), silent = TRUE) |
|
| 343 |
if (!inherits(fit_lt, "try-error")) {
|
|
| 344 |
coefs <- summary(fit_lt)$coefficients |
|
| 345 |
ia_idx <- grep("^q:group", rownames(coefs))
|
|
| 346 |
if (length(ia_idx) > 0) {
|
|
| 347 |
return(coefs[ia_idx[1], "Pr(>|t|)"]) |
|
| 348 |
} |
|
| 349 |
} |
|
| 350 |
} |
|
| 351 |
return(NA_real_) |
|
| 352 |
} |
|
| 353 |
.tsenat_extract_satterthwaite_p <- function(fit1, fallback_lm = NULL, suppress_lme4_warnings = TRUE, |
|
| 354 |
verbose = FALSE, mm_suppress_pattern = "boundary \\(singular\\) fit|Computed variance-covariance matrix problem|not a positive definite matrix") {
|
|
| 355 |
# If we have a fallback lm, extract from its coefficients |
|
| 356 |
if (!is.null(fallback_lm)) {
|
|
| 357 |
coefs <- try(summary(fallback_lm$fit1)$coefficients, silent = TRUE) |
|
| 358 |
if (!inherits(coefs, "try-error")) {
|
|
| 359 |
ia_idx <- grep("^q:group", rownames(coefs))
|
|
| 360 |
if (length(ia_idx) > 0) {
|
|
| 361 |
return(coefs[ia_idx[1], "Pr(>|t|)"]) |
|
| 362 |
} |
|
| 363 |
} |
|
| 364 |
return(NA_real_) |
|
| 365 |
} |
|
| 366 |
# Prefer lmerTest when available; suppress known lme4/lmerTest warnings |
|
| 367 |
if (requireNamespace("lmerTest", quietly = TRUE) && inherits(fit1, "lmerMod")) {
|
|
| 368 |
muffle_cond <- suppress_lme4_warnings || (!verbose) |
|
| 369 |
fit_lt <- withCallingHandlers(try(lmerTest::lmer(stats::formula(fit1), data = stats::model.frame(fit1), |
|
| 370 |
REML = FALSE), silent = TRUE), warning = function(w) {
|
|
| 371 |
if (muffle_cond && grepl(mm_suppress_pattern, conditionMessage(w), ignore.case = TRUE)) {
|
|
| 372 |
invokeRestart("muffleWarning")
|
|
| 373 |
} |
|
| 374 |
}, message = function(m) {
|
|
| 375 |
if (muffle_cond && grepl(mm_suppress_pattern, conditionMessage(m), ignore.case = TRUE)) {
|
|
| 376 |
invokeRestart("muffleMessage")
|
|
| 377 |
} |
|
| 378 |
}) |
|
| 379 |
if (!inherits(fit_lt, "try-error")) {
|
|
| 380 |
coefs <- summary(fit_lt)$coefficients |
|
| 381 |
ia_idx <- grep("^q:group", rownames(coefs))
|
|
| 382 |
if (length(ia_idx) > 0) {
|
|
| 383 |
return(coefs[ia_idx[1], "Pr(>|t|)"]) |
|
| 384 |
} |
|
| 385 |
} |
|
| 386 |
} |
|
| 387 |
return(NA_real_) |
|
| 388 |
} |
|
| 389 | ||
| 390 |
# FPCA matrix preparation |
|
| 391 |
.tsenat_prepare_fpca_matrix <- function(mat, min_frac = 0.01) {
|
|
| 392 |
if (!is.matrix(mat)) {
|
|
| 393 |
mat <- as.matrix(mat) |
|
| 394 |
} |
|
| 395 |
row_vars <- apply(mat, 1, stats::var, na.rm = TRUE) |
|
| 396 |
keep <- row_vars > (min_frac * max(row_vars, na.rm = TRUE)) |
|
| 397 |
if (sum(keep) == 0) {
|
|
| 398 |
keep <- rep(TRUE, nrow(mat)) |
|
| 399 |
} |
|
| 400 |
m2 <- mat[keep, , drop = FALSE] |
|
| 401 |
m2 <- t(scale(t(m2))) |
|
| 402 |
return(list(mat = m2, keep = keep)) |
|
| 403 |
} |
|
| 404 | ||
| 405 |
## Helpers for calculate_lm_interaction fallbacks, LRT and Satterthwaite |
|
| 406 |
## p-values |
|
| 407 |
.tsenat_try_lm_fallbacks <- function(df, verbose = FALSE) {
|
|
| 408 |
# try lm with subject as fixed effect |
|
| 409 |
fit0_lm <- try(stats::lm(entropy ~ q + group + subject, data = df), silent = TRUE) |
|
| 410 |
fit1_lm <- try(stats::lm(entropy ~ q * group + subject, data = df), silent = TRUE) |
|
| 411 |
if (!inherits(fit0_lm, "try-error") && !inherits(fit1_lm, "try-error")) {
|
|
| 412 |
return(list(fit0 = fit0_lm, fit1 = fit1_lm, method = "lm_subject")) |
|
| 413 |
} |
|
| 414 |
# last resort: drop subject, plain lm |
|
| 415 |
fit0_lm2 <- try(stats::lm(entropy ~ q + group, data = df), silent = TRUE) |
|
| 416 |
fit1_lm2 <- try(stats::lm(entropy ~ q * group, data = df), silent = TRUE) |
|
| 417 |
if (!inherits(fit0_lm2, "try-error") && !inherits(fit1_lm2, "try-error")) {
|
|
| 418 |
return(list(fit0 = fit0_lm2, fit1 = fit1_lm2, method = "lm_nosubject")) |
|
| 419 |
} |
|
| 420 |
return(NULL) |
|
| 421 |
} |
|
| 422 | ||
| 423 |
.tsenat_extract_lrt_p <- function(fit0, fit1) {
|
|
| 424 |
an <- try(stats::anova(fit0, fit1), silent = TRUE) |
|
| 425 |
if (!inherits(an, "try-error") && nrow(an) >= 2) {
|
|
| 426 |
pcol <- grep("Pr\\(>F\\)|Pr\\(>Chisq\\)|Pr\\(>Chi\\)", colnames(an), value = TRUE)
|
|
| 427 |
if (length(pcol) == 0) {
|
|
| 428 |
return(as.numeric(an[2, ncol(an)])) |
|
| 429 |
} else {
|
|
| 430 |
return(as.numeric(an[2, pcol[1]])) |
|
| 431 |
} |
|
| 432 |
} |
|
| 433 |
return(NA_real_) |
|
| 434 |
} |
|
| 435 | ||
| 436 |
.tsenat_extract_satterthwaite_p <- function(fit1, fallback_lm = NULL) {
|
|
| 437 |
# if lmerTest available and fit1 is not singular use that |
|
| 438 |
if (!is.null(fallback_lm)) {
|
|
| 439 |
coefs <- try(summary(fallback_lm$fit1)$coefficients, silent = TRUE) |
|
| 440 |
if (!inherits(coefs, "try-error")) {
|
|
| 441 |
ia_idx <- grep("^q:group", rownames(coefs))
|
|
| 442 |
if (length(ia_idx) > 0) {
|
|
| 443 |
return(coefs[ia_idx[1], "Pr(>|t|)"]) |
|
| 444 |
} |
|
| 445 |
} |
|
| 446 |
return(NA_real_) |
|
| 447 |
} |
|
| 448 |
if (requireNamespace("lmerTest", quietly = TRUE) && inherits(fit1, "lmerMod") &&
|
|
| 449 |
!isTRUE(attr(fit1, "singular"))) {
|
|
| 450 |
fit_lt <- try(lmerTest::lmer(stats::formula(fit1), data = stats::model.frame(fit1), |
|
| 451 |
REML = FALSE), silent = TRUE) |
|
| 452 |
if (!inherits(fit_lt, "try-error")) {
|
|
| 453 |
coefs <- summary(fit_lt)$coefficients |
|
| 454 |
ia_idx <- grep("^q:group", rownames(coefs))
|
|
| 455 |
if (length(ia_idx) > 0) {
|
|
| 456 |
return(coefs[ia_idx[1], "Pr(>|t|)"]) |
|
| 457 |
} |
|
| 458 |
} |
|
| 459 |
} |
|
| 460 |
return(NA_real_) |
|
| 461 |
} |
|
| 462 | ||
| 463 |
.tsenat_prepare_fpca_matrix <- function(mat, min_frac = 0.01) {
|
|
| 464 |
# prepare matrix for FPCA: center, scale, and drop near-constant rows |
|
| 465 |
if (!is.matrix(mat)) {
|
|
| 466 |
mat <- as.matrix(mat) |
|
| 467 |
} |
|
| 468 |
row_vars <- apply(mat, 1, stats::var, na.rm = TRUE) |
|
| 469 |
keep <- row_vars > (min_frac * max(row_vars, na.rm = TRUE)) |
|
| 470 |
if (sum(keep) == 0) {
|
|
| 471 |
keep <- rep(TRUE, nrow(mat)) |
|
| 472 |
} |
|
| 473 |
m2 <- mat[keep, , drop = FALSE] |
|
| 474 |
m2 <- t(scale(t(m2))) |
|
| 475 |
return(list(mat = m2, keep = keep)) |
|
| 476 |
} |
|
| 477 |
## Additional helpers for calculate_lm_interaction |
|
| 478 | ||
| 479 |
# Try simple lm fallbacks when lmer fails. Returns list(fit0, fit1, method) |
|
| 480 |
.tsenat_try_lm_fallbacks <- function(df, verbose = FALSE) {
|
|
| 481 |
# try lm with subject as fixed effect |
|
| 482 | 15x |
fit0_lm <- try(stats::lm(entropy ~ q + group + subject, data = df), silent = TRUE) |
| 483 | 15x |
fit1_lm <- try(stats::lm(entropy ~ q * group + subject, data = df), silent = TRUE) |
| 484 | 15x |
if (!inherits(fit0_lm, "try-error") && !inherits(fit1_lm, "try-error")) {
|
| 485 | 14x |
return(list(fit0 = fit0_lm, fit1 = fit1_lm, method = "lm_subject")) |
| 486 |
} |
|
| 487 |
# last resort: drop subject |
|
| 488 | 1x |
fit0_lm2 <- try(stats::lm(entropy ~ q + group, data = df), silent = TRUE) |
| 489 | 1x |
fit1_lm2 <- try(stats::lm(entropy ~ q * group, data = df), silent = TRUE) |
| 490 | 1x |
if (!inherits(fit0_lm2, "try-error") && !inherits(fit1_lm2, "try-error")) {
|
| 491 | 1x |
return(list(fit0 = fit0_lm2, fit1 = fit1_lm2, method = "lm_nosubject")) |
| 492 |
} |
|
| 493 |
# indicate failure |
|
| 494 | ! |
return(NULL) |
| 495 |
} |
|
| 496 | ||
| 497 |
# Extract LRT p-value from two fitted models (lm or lmer). Returns NA_real_ on |
|
| 498 |
# failure. |
|
| 499 |
.tsenat_extract_lrt_p <- function(fit0, fit1) {
|
|
| 500 | 15x |
an <- try(stats::anova(fit0, fit1), silent = TRUE) |
| 501 | 15x |
if (inherits(an, "try-error") || nrow(an) < 2) {
|
| 502 | 2x |
return(NA_real_) |
| 503 |
} |
|
| 504 | 13x |
pcol <- grep("Pr\\(>F\\)|Pr\\(>Chisq\\)|Pr\\(>Chi\\)", colnames(an), value = TRUE)
|
| 505 | 13x |
if (length(pcol) == 0) {
|
| 506 | ! |
return(as.numeric(an[2, ncol(an)])) |
| 507 |
} |
|
| 508 | 13x |
return(as.numeric(an[2, pcol[1]])) |
| 509 |
} |
|
| 510 | ||
| 511 |
# Extract Satterthwaite p-value for q:group interaction from lmer/lmerTest or |
|
| 512 |
# fallback lm. |
|
| 513 |
.tsenat_extract_satterthwaite_p <- function(fit1, fallback_lm = NULL) {
|
|
| 514 |
# prefer lmerTest when available |
|
| 515 | 13x |
if (!is.null(fallback_lm)) {
|
| 516 | 12x |
coefs <- summary(fallback_lm$fit1)$coefficients |
| 517 | 12x |
ia_idx <- grep("^q:group", rownames(coefs))
|
| 518 | 12x |
if (length(ia_idx) > 0) {
|
| 519 | 8x |
return(coefs[ia_idx[1], "Pr(>|t|)"]) |
| 520 |
} |
|
| 521 | 4x |
return(NA_real_) |
| 522 |
} |
|
| 523 | 1x |
if (requireNamespace("lmerTest", quietly = TRUE) && inherits(fit1, "lmerMod")) {
|
| 524 | 1x |
fit_lt <- try(lmerTest::lmer(as.formula(formula(fit1)), data = model.frame(fit1), |
| 525 | 1x |
REML = FALSE), silent = TRUE) |
| 526 | 1x |
if (!inherits(fit_lt, "try-error")) {
|
| 527 | 1x |
coefs <- summary(fit_lt)$coefficients |
| 528 | 1x |
ia_idx <- grep("^q:group", rownames(coefs))
|
| 529 | 1x |
if (length(ia_idx) > 0) {
|
| 530 | 1x |
return(coefs[ia_idx[1], "Pr(>|t|)"]) |
| 531 |
} |
|
| 532 |
} |
|
| 533 |
} |
|
| 534 | ! |
return(NA_real_) |
| 535 |
} |
|
| 536 | ||
| 537 |
# Helper for FPCA-style preprocessing used in calculate_lm_interaction fpca |
|
| 538 |
# method. Builds curve_mat, filters good rows, imputes column means, and |
|
| 539 |
# returns list(mat_sub, used_samples) |
|
| 540 |
.tsenat_prepare_fpca_matrix <- function(mat, sample_names, q_vals, min_obs = 10) {
|
|
| 541 | 3x |
uq <- sort(unique(q_vals)) |
| 542 | 3x |
samples_u <- unique(sample_names) |
| 543 | 3x |
curve_mat <- matrix(NA_real_, nrow = length(samples_u), ncol = length(uq)) |
| 544 | 3x |
if (length(samples_u) > 0) {
|
| 545 | 3x |
rownames(curve_mat) <- samples_u |
| 546 |
} |
|
| 547 | 3x |
for (i in seq_along(sample_names)) {
|
| 548 | 24x |
s <- sample_names[i] |
| 549 | 24x |
qv <- q_vals[i] |
| 550 | 24x |
qi <- match(qv, uq) |
| 551 | 24x |
if (is.na(qi)) {
|
| 552 | ! |
next |
| 553 |
} |
|
| 554 | 24x |
if (s %in% rownames(curve_mat)) {
|
| 555 | 24x |
curve_mat[s, qi] <- as.numeric(mat[, i]) |
| 556 |
} |
|
| 557 |
} |
|
| 558 |
# keep samples with at least half of q points present |
|
| 559 | 3x |
good_rows <- which(rowSums(!is.na(curve_mat)) >= max(2, ceiling(ncol(curve_mat)/2))) |
| 560 | 3x |
if (length(good_rows) < min_obs) {
|
| 561 | 2x |
return(NULL) |
| 562 |
} |
|
| 563 | 1x |
mat_sub <- curve_mat[good_rows, , drop = FALSE] |
| 564 | 1x |
col_means <- apply(mat_sub, 2, function(col) mean(col, na.rm = TRUE)) |
| 565 | 1x |
for (r in seq_len(nrow(mat_sub))) mat_sub[r, is.na(mat_sub[r, ])] <- col_means[is.na(mat_sub[r, |
| 566 |
])] |
|
| 567 | 1x |
list(mat_sub = mat_sub, used_samples = rownames(mat_sub)) |
| 568 |
} |
| 1 |
# Helper utilities for calculate_difference |
|
| 2 | ||
| 3 |
.tsenat_calculate_difference_partition <- function(df, samples, control, method, |
|
| 4 |
test, pcorr, randomizations, verbose) {
|
|
| 5 | 36x |
if (ncol(df) - 1 != length(samples)) {
|
| 6 | 4x |
stop("Column count doesn't match length(samples).", call. = FALSE)
|
| 7 |
} |
|
| 8 | 32x |
uniq_groups <- unique(as.character(samples)) |
| 9 | 32x |
if (length(uniq_groups) > 2) {
|
| 10 | 4x |
stop("More than two conditions; provide exactly two.", call. = FALSE)
|
| 11 |
} |
|
| 12 | 28x |
if (length(uniq_groups) < 2) {
|
| 13 | 4x |
stop("Fewer than two conditions; provide exactly two.", call. = FALSE)
|
| 14 |
} |
|
| 15 | 24x |
if (!(control %in% uniq_groups)) {
|
| 16 | 4x |
stop("Control sample type not found in samples.", call. = FALSE)
|
| 17 |
} |
|
| 18 | ||
| 19 | 20x |
case_label <- setdiff(uniq_groups, control) |
| 20 | 20x |
groups <- c(case_label, control) |
| 21 | ||
| 22 | 20x |
if (!(method %in% c("mean", "median"))) {
|
| 23 | 1x |
stop("Invalid method; see ?calculate_difference.", call. = FALSE)
|
| 24 |
} |
|
| 25 | 19x |
if (!(test %in% c("wilcoxon", "shuffle"))) {
|
| 26 | 1x |
stop("Invalid test method; see ?calculate_difference.", call. = FALSE)
|
| 27 |
} |
|
| 28 | 18x |
valid_pcorr <- c("holm", "hochberg", "hommel", "bonferroni", "BH", "BY", "fdr",
|
| 29 | 18x |
"none") |
| 30 | 18x |
if (!(pcorr %in% valid_pcorr)) {
|
| 31 | 1x |
stop("Invalid p-value correction; see ?calculate_difference.", call. = FALSE)
|
| 32 |
} |
|
| 33 | ||
| 34 | 17x |
tab <- table(samples) |
| 35 | 17x |
if (test == "wilcoxon") {
|
| 36 | 12x |
if (randomizations != 100 && verbose) {
|
| 37 | 2x |
message("'randomizations' ignored for wilcoxon.")
|
| 38 |
} |
|
| 39 | 12x |
if (any(tab < 3) || sum(tab) < 8) {
|
| 40 | 3x |
warning("Low sample size for wilcoxon.", call. = FALSE)
|
| 41 |
} |
|
| 42 |
} |
|
| 43 | 17x |
if (test == "shuffle") {
|
| 44 | 5x |
if (sum(tab) <= 5) {
|
| 45 | 2x |
warning("Low sample size for label shuffling.", call. = FALSE)
|
| 46 |
} |
|
| 47 | 5x |
if (sum(tab) > 5 && sum(tab) < 10) {
|
| 48 | 2x |
warning("Label shuffling may be unreliable.", call. = FALSE)
|
| 49 |
} |
|
| 50 |
} |
|
| 51 | ||
| 52 | 17x |
idx_case <- which(samples == groups[1]) |
| 53 | 17x |
idx_control <- which(samples == groups[2]) |
| 54 | 17x |
idx1 <- idx_case |
| 55 | 17x |
idx2 <- idx_control |
| 56 | ||
| 57 | 17x |
df$cond_1 <- rowSums(!is.na(df[, idx1 + 1, drop = FALSE])) |
| 58 | 17x |
df$cond_2 <- rowSums(!is.na(df[, idx2 + 1, drop = FALSE])) |
| 59 | ||
| 60 | 17x |
if (test == "wilcoxon") {
|
| 61 | 12x |
keep_mask <- (df$cond_1 >= 3 & df$cond_2 >= 3 & (df$cond_1 + df$cond_2) >= |
| 62 | 12x |
8) |
| 63 |
} else {
|
|
| 64 | 5x |
keep_mask <- (df$cond_1 + df$cond_2) >= 5 |
| 65 |
} |
|
| 66 | ||
| 67 | 17x |
df_keep <- df[keep_mask, , drop = FALSE] |
| 68 | 17x |
df_small <- df[!keep_mask, , drop = FALSE] |
| 69 | ||
| 70 | 17x |
list(df = df, samples = samples, groups = groups, idx1 = idx1, idx2 = idx2, df_keep = df_keep, |
| 71 | 17x |
df_small = df_small) |
| 72 |
} |
|
| 73 | ||
| 74 |
# Helpers for calculate_fc |
|
| 75 |
.tsenat_aggregate_fc_values <- function(x, samples, method, control) {
|
|
| 76 | 1796x |
if (method == "mean") {
|
| 77 | 1590x |
value <- aggregate(t(x), by = list(samples), mean, na.rm = TRUE) |
| 78 | 206x |
} else if (method == "median") {
|
| 79 | 206x |
value <- aggregate(t(x), by = list(samples), median, na.rm = TRUE) |
| 80 |
} else {
|
|
| 81 | ! |
stop("Invalid method; must be 'mean' or 'median'")
|
| 82 |
} |
|
| 83 | ||
| 84 | 1796x |
sorted <- value[value$Group.1 != control, ] |
| 85 | 1796x |
sorted[2, ] <- value[value$Group.1 == control, ] |
| 86 | 1796x |
value <- t(sorted[, -1]) |
| 87 | 1796x |
value[is.na(value[, 1]), c(1)] <- NA |
| 88 | 1796x |
value[is.na(value[, 2]), c(2)] <- NA |
| 89 | 1796x |
return(list(value = value, sorted = sorted)) |
| 90 |
} |
|
| 91 | ||
| 92 |
.tsenat_apply_pseudocount <- function(value, pseudocount) {
|
|
| 93 | 1798x |
if (!is.numeric(pseudocount) || length(pseudocount) != 1) {
|
| 94 | ! |
pseudocount <- 0 |
| 95 |
} |
|
| 96 | 1798x |
if (pseudocount <= 0) {
|
| 97 | 1796x |
pos_vals <- value[!is.na(value) & value > 0] |
| 98 | 1796x |
if (length(pos_vals) > 0) {
|
| 99 | 1794x |
pc <- min(pos_vals, na.rm = TRUE)/2 |
| 100 |
} else {
|
|
| 101 | 2x |
pc <- 1e-06 |
| 102 |
} |
|
| 103 |
} else {
|
|
| 104 | 2x |
pc <- pseudocount |
| 105 |
} |
|
| 106 | 1798x |
replace_idx <- !is.na(value) & value <= 0 |
| 107 | 1798x |
value[replace_idx] <- pc |
| 108 | 1798x |
return(value) |
| 109 |
} |
|
| 110 | ||
| 111 |
# Paired permutation helpers |
|
| 112 |
.tsenat_permute_paired <- function(x, samples, control, method, randomizations, paired_method) {
|
|
| 113 | 14x |
ncols <- ncol(x) |
| 114 | 14x |
if (ncols%%2 != 0) {
|
| 115 | 1x |
stop("Paired permutation requires an even number of samples and paired column ordering",
|
| 116 | 1x |
call. = FALSE) |
| 117 |
} |
|
| 118 | 13x |
npairs <- ncols/2 |
| 119 | 13x |
if (paired_method == "swap") {
|
| 120 | 5x |
perm_mat <- matrix(NA_real_, nrow = nrow(x), ncol = randomizations) |
| 121 | 5x |
for (r in seq_len(randomizations)) {
|
| 122 | 310x |
swap <- sample(c(TRUE, FALSE), size = npairs, replace = TRUE) |
| 123 | 310x |
perm_samples <- samples |
| 124 | 310x |
for (p in seq_len(npairs)) {
|
| 125 | 820x |
if (swap[p]) {
|
| 126 | 431x |
i1 <- (p - 1) * 2 + 1 |
| 127 | 431x |
i2 <- i1 + 1 |
| 128 | 431x |
perm_samples[c(i1, i2)] <- perm_samples[c(i2, i1)] |
| 129 |
} |
|
| 130 |
} |
|
| 131 | 310x |
df_perm <- calculate_fc(x, perm_samples, control, method) |
| 132 | 310x |
perm_mat[, r] <- as.numeric(df_perm[, 4]) |
| 133 |
} |
|
| 134 | 5x |
return(perm_mat) |
| 135 | 8x |
} else if (paired_method == "signflip") {
|
| 136 | 8x |
total_comb <- 2^npairs |
| 137 | 8x |
if (randomizations <= 0 || randomizations >= total_comb) {
|
| 138 | 7x |
combos <- expand.grid(rep(list(c(0, 1)), npairs)) |
| 139 | 7x |
nrep <- nrow(combos) |
| 140 | 7x |
perm_mat <- matrix(NA_real_, nrow = nrow(x), ncol = nrep) |
| 141 | 7x |
for (r in seq_len(nrep)) {
|
| 142 | 36x |
swap <- as.logical(as.integer(combos[r, ])) |
| 143 | 36x |
perm_samples <- samples |
| 144 | 36x |
for (p in seq_len(npairs)) {
|
| 145 | 88x |
if (swap[p]) {
|
| 146 | 44x |
i1 <- (p - 1) * 2 + 1 |
| 147 | 44x |
i2 <- i1 + 1 |
| 148 | 44x |
perm_samples[c(i1, i2)] <- perm_samples[c(i2, i1)] |
| 149 |
} |
|
| 150 |
} |
|
| 151 | 36x |
df_perm <- calculate_fc(x, perm_samples, control, method) |
| 152 | 36x |
perm_mat[, r] <- as.numeric(df_perm[, 4]) |
| 153 |
} |
|
| 154 | 7x |
return(perm_mat) |
| 155 |
} else {
|
|
| 156 | 1x |
perm_mat <- matrix(NA_real_, nrow = nrow(x), ncol = randomizations) |
| 157 | 1x |
for (r in seq_len(randomizations)) {
|
| 158 | 2x |
swap <- sample(c(TRUE, FALSE), size = npairs, replace = TRUE) |
| 159 | 2x |
perm_samples <- samples |
| 160 | 2x |
for (p in seq_len(npairs)) {
|
| 161 | 4x |
if (swap[p]) {
|
| 162 | 1x |
i1 <- (p - 1) * 2 + 1 |
| 163 | 1x |
i2 <- i1 + 1 |
| 164 | 1x |
perm_samples[c(i1, i2)] <- perm_samples[c(i2, i1)] |
| 165 |
} |
|
| 166 |
} |
|
| 167 | 2x |
df_perm <- calculate_fc(x, perm_samples, control, method) |
| 168 | 2x |
perm_mat[, r] <- as.numeric(df_perm[, 4]) |
| 169 |
} |
|
| 170 | 1x |
return(perm_mat) |
| 171 |
} |
|
| 172 |
} |
|
| 173 |
} |
| 1 |
# Helpers for plot_top_transcripts internals |
|
| 2 | ||
| 3 |
.ptt_select_genes_from_res <- function(res, top_n) {
|
|
| 4 | 9x |
if (is.null(res)) {
|
| 5 | 2x |
stop("Either 'gene' or 'res' must be provided")
|
| 6 |
} |
|
| 7 | 7x |
if (!("genes" %in% colnames(res))) {
|
| 8 | 2x |
stop("Provided 'res' must contain a 'genes' column")
|
| 9 |
} |
|
| 10 | 5x |
if ("adjusted_p_values" %in% colnames(res)) {
|
| 11 | 3x |
ord <- order(res$adjusted_p_values, na.last = NA) |
| 12 | 2x |
} else if ("raw_p_values" %in% colnames(res)) {
|
| 13 | 2x |
ord <- order(res$raw_p_values, na.last = NA) |
| 14 |
} else {
|
|
| 15 | ! |
ord <- seq_len(nrow(res)) |
| 16 |
} |
|
| 17 | 5x |
genes_sel <- as.character(res$genes[ord]) |
| 18 | 5x |
genes_sel <- unique(genes_sel) |
| 19 | 5x |
head(genes_sel, top_n) |
| 20 |
} |
|
| 21 | ||
| 22 |
.ptt_infer_samples_from_coldata <- function(coldata, counts, sample_type_col) {
|
|
| 23 | 7x |
if (is.character(coldata) && length(coldata) == 1) {
|
| 24 | 1x |
if (!file.exists(coldata)) {
|
| 25 | ! |
stop("coldata file not found: ", coldata)
|
| 26 |
} |
|
| 27 | 1x |
cdf <- utils::read.delim(coldata, header = TRUE, stringsAsFactors = FALSE) |
| 28 | 6x |
} else if (is.data.frame(coldata)) {
|
| 29 | 5x |
cdf <- coldata |
| 30 |
} else {
|
|
| 31 | 1x |
stop("`coldata` must be a data.frame or path to a tab-delimited file")
|
| 32 |
} |
|
| 33 | ||
| 34 | 6x |
if (!is.null(rownames(cdf)) && all(colnames(counts) %in% rownames(cdf))) {
|
| 35 | 2x |
as.character(cdf[colnames(counts), sample_type_col]) |
| 36 |
} else {
|
|
| 37 | 4x |
sample_id_cols <- c("sample", "Sample", "sample_id", "id")
|
| 38 | 4x |
sid <- intersect(sample_id_cols, colnames(cdf)) |
| 39 | 4x |
if (length(sid) > 0) {
|
| 40 | 3x |
sid <- sid[1] |
| 41 | 3x |
if (!all(colnames(counts) %in% as.character(cdf[[sid]]))) {
|
| 42 | 1x |
stop("coldata sample id column does not match column names of counts")
|
| 43 |
} |
|
| 44 | 2x |
row_ix <- match(colnames(counts), as.character(cdf[[sid]])) |
| 45 | 2x |
as.character(cdf[[sample_type_col]][row_ix]) |
| 46 |
} else {
|
|
| 47 | 1x |
stop("Could not match `coldata` rows to `counts` columns. Provide `samples` or a row-named `coldata`.")
|
| 48 |
} |
|
| 49 |
} |
|
| 50 |
} |
|
| 51 | ||
| 52 |
.ptt_read_tx2gene <- function(tx2gene) {
|
|
| 53 | 8x |
if (is.null(tx2gene)) {
|
| 54 | 1x |
stop("`tx2gene` must be provided as a file path or data.frame (or include mapping in metadata of provided SummarizedExperiment)")
|
| 55 |
} |
|
| 56 | 7x |
if (is.character(tx2gene) && length(tx2gene) == 1) {
|
| 57 | 3x |
if (!file.exists(tx2gene)) {
|
| 58 | 1x |
stop("tx2gene file not found: ", tx2gene)
|
| 59 |
} |
|
| 60 | 2x |
mapping <- utils::read.delim(tx2gene, stringsAsFactors = FALSE, header = TRUE) |
| 61 | 4x |
} else if (is.data.frame(tx2gene)) {
|
| 62 | 4x |
mapping <- tx2gene |
| 63 |
} else {
|
|
| 64 | ! |
stop("`tx2gene` must be provided as a file path or data.frame (or include mapping in metadata of provided SummarizedExperiment)")
|
| 65 |
} |
|
| 66 | 6x |
if (!all(c("Transcript", "Gen") %in% colnames(mapping))) {
|
| 67 | 2x |
stop("tx2gene must have columns 'Transcript' and 'Gen'")
|
| 68 |
} |
|
| 69 | 4x |
mapping |
| 70 |
} |
|
| 71 | ||
| 72 |
.ptt_make_agg <- function(metric = c("median", "mean", "variance", "iqr")) {
|
|
| 73 | 7x |
metric_choice <- match.arg(metric) |
| 74 | 7x |
agg_fun <- switch(metric_choice, median = function(x) stats::median(x, na.rm = TRUE), |
| 75 | 7x |
mean = function(x) base::mean(x, na.rm = TRUE), variance = function(x) {
|
| 76 | 1x |
stats::var(x, na.rm = TRUE) |
| 77 | 7x |
}, iqr = function(x) stats::IQR(x, na.rm = TRUE)) |
| 78 | 7x |
agg_label_metric <- if (metric_choice == "iqr") {
|
| 79 | 2x |
"IQR" |
| 80 |
} else {
|
|
| 81 | 5x |
metric_choice |
| 82 |
} |
|
| 83 | 7x |
agg_label <- sprintf("Transcript-level expression with metric %s", agg_label_metric)
|
| 84 | 7x |
.cnt <- as.integer(getOption("TSENAT.plot_top_counter", 0)) + 1L
|
| 85 | 7x |
options(TSENAT.plot_top_counter = .cnt) |
| 86 | 7x |
agg_label_unique <- agg_label |
| 87 | 7x |
list(metric_choice = metric_choice, agg_fun = agg_fun, agg_label_unique = agg_label_unique) |
| 88 |
} |
|
| 89 | ||
| 90 |
.ptt_build_tx_long <- function(gene_single, mapping, counts, samples, top_n) {
|
|
| 91 | 13x |
txs <- mapping$Transcript[mapping$Gen == gene_single] |
| 92 | 13x |
txs <- intersect(txs, rownames(counts)) |
| 93 | 13x |
if (length(txs) == 0) {
|
| 94 | 1x |
stop("No transcripts found for gene: ", gene_single)
|
| 95 |
} |
|
| 96 | 12x |
if (!is.null(top_n)) {
|
| 97 | 12x |
txs <- head(txs, top_n) |
| 98 |
} |
|
| 99 | 12x |
mat <- counts[txs, , drop = FALSE] |
| 100 | 12x |
df_all <- as.data.frame(mat) |
| 101 | 12x |
df_all$tx <- rownames(mat) |
| 102 | 12x |
df_long <- tidyr::pivot_longer(df_all, -tx, names_to = "sample", values_to = "expr") |
| 103 | 12x |
df_long$group <- rep(samples, times = length(txs)) |
| 104 | 12x |
list(df_long = df_long, txs = txs) |
| 105 |
} |
|
| 106 | ||
| 107 |
.ptt_aggregate_df_long <- function(df_long, agg_fun, pseudocount) {
|
|
| 108 | 12x |
df_summary <- stats::aggregate(expr ~ tx + group, data = df_long, FUN = agg_fun) |
| 109 | 12x |
df_summary$log2expr <- log2(df_summary$expr + pseudocount) |
| 110 | 12x |
df_summary$tx <- factor(df_summary$tx, levels = unique(df_summary$tx)) |
| 111 | 12x |
df_summary |
| 112 |
} |
|
| 113 | ||
| 114 |
.ptt_build_plot_from_summary <- function(df_summary, agg_label_unique, fill_limits = NULL) {
|
|
| 115 | 12x |
p <- ggplot2::ggplot(df_summary, ggplot2::aes(x = group, y = tx, fill = log2expr)) + |
| 116 | 12x |
ggplot2::geom_tile(color = "white", width = 0.95, height = 0.95) + ggplot2::scale_fill_viridis_c(option = "viridis", |
| 117 | 12x |
direction = -1, na.value = "grey80", limits = fill_limits) + ggplot2::theme_minimal(base_size = 14) + |
| 118 | 12x |
ggplot2::labs(title = agg_label_unique, x = NULL, y = NULL, fill = "log2(expr)") + |
| 119 | 12x |
ggplot2::theme(axis.text.y = ggplot2::element_text(size = 12), axis.text.x = ggplot2::element_text(size = 12), |
| 120 | 12x |
plot.title = ggplot2::element_text(size = 16, hjust = 0.6), legend.position = "bottom", |
| 121 | 12x |
legend.key.width = ggplot2::unit(1.2, "cm"), plot.margin = ggplot2::margin(4, |
| 122 | 12x |
4, 4, 4)) + ggplot2::guides(fill = ggplot2::guide_colorbar(title.position = "top", |
| 123 | 12x |
barwidth = 6, barheight = 0.35)) |
| 124 | 12x |
p |
| 125 |
} |
|
| 126 | ||
| 127 |
.ptt_combine_patchwork <- function(plots, agg_label_unique) {
|
|
| 128 | 8x |
combined <- Reduce(`+`, plots) + patchwork::plot_layout(nrow = 1, guides = "collect") & |
| 129 | 8x |
ggplot2::theme(legend.position = "bottom") |
| 130 | 8x |
combined <- combined + patchwork::plot_annotation(title = agg_label_unique, theme = ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.6, |
| 131 | 8x |
size = 16, margin = ggplot2::margin(b = 10)))) |
| 132 | 8x |
combined |
| 133 |
} |
|
| 134 | ||
| 135 |
.ptt_combine_cowplot <- function(plots, output_file = NULL, agg_label_unique) {
|
|
| 136 | 1x |
p_for_legend <- plots[[1]] + ggplot2::theme(legend.position = "bottom") |
| 137 | 1x |
legend <- cowplot::get_legend(p_for_legend) |
| 138 | 1x |
plots_nolegend <- lapply(plots, function(pp) pp + ggplot2::theme(legend.position = "none")) |
| 139 | 1x |
grid <- cowplot::plot_grid(plotlist = plots_nolegend, nrow = 1, align = "hv") |
| 140 | 1x |
title_grob <- cowplot::ggdraw() + cowplot::draw_label(agg_label_unique, fontface = "plain", |
| 141 | 1x |
x = 0.6, hjust = 0.5, size = 16) |
| 142 | 1x |
result_plot <- cowplot::plot_grid(title_grob, grid, legend, ncol = 1, rel_heights = c(0.08, |
| 143 | 1x |
1, 0.08)) |
| 144 | 1x |
if (!is.null(output_file)) {
|
| 145 | ! |
ggplot2::ggsave(output_file, result_plot) |
| 146 | ! |
invisible(NULL) |
| 147 |
} |
|
| 148 | 1x |
result_plot |
| 149 |
} |
|
| 150 | ||
| 151 |
.ptt_combine_grid <- function(plots, output_file = NULL, agg_label_unique) {
|
|
| 152 | 2x |
plots_nolegend <- lapply(plots, function(pp) pp + ggplot2::theme(legend.position = "none")) |
| 153 | 2x |
grobs <- lapply(plots_nolegend, ggplot2::ggplotGrob) |
| 154 | 2x |
g_full <- ggplot2::ggplotGrob(plots[[1]]) |
| 155 | 2x |
legend_idx <- which(vapply(g_full$grobs, function(x) x$name, character(1)) == |
| 156 | 2x |
"guide-box") |
| 157 | 2x |
if (length(legend_idx)) {
|
| 158 | 1x |
legend_grob <- g_full$grobs[[legend_idx[1]]] |
| 159 |
} else {
|
|
| 160 | 1x |
legend_grob <- NULL |
| 161 |
} |
|
| 162 | 2x |
n <- length(grobs) |
| 163 | 2x |
heights <- grid::unit.c(grid::unit(0.6, "cm"), grid::unit(1, "null"), grid::unit(0.7, |
| 164 | 2x |
"cm")) |
| 165 | 2x |
if (!is.null(output_file)) {
|
| 166 | 1x |
png(filename = output_file, width = 800 * n, height = 480, res = 150) |
| 167 | 1x |
.draw_transcript_grid(grobs, agg_label_unique, legend_grob, n, heights, to_file = output_file) |
| 168 | 1x |
invisible(NULL) |
| 169 |
} else {
|
|
| 170 | 1x |
.draw_transcript_grid(grobs, agg_label_unique, legend_grob, n, heights) |
| 171 | 1x |
invisible(NULL) |
| 172 |
} |
|
| 173 |
} |
| 1 |
# # Minimalized diversity functions: only Tsallis entropy retained |
|
| 2 | ||
| 3 |
#' Calculate Tsallis entropy for a vector of transcript-level |
|
| 4 |
#' expression values of one gene. |
|
| 5 |
#' |
|
| 6 |
#' @param x Vector of (non-negative) expression values. |
|
| 7 |
#' @param q Tsallis entropy parameter (q > 0). Scalar or numeric vector |
|
| 8 |
#' (default: 2). |
|
| 9 |
#' @param norm Logical; if TRUE, normalize entropy by its theoretical maximum |
|
| 10 |
#' (values in [0,1]). |
|
| 11 |
#' @param what Which quantity to return: 'S' (Tsallis entropy), 'D' (Hill |
|
| 12 |
#' numbers), or 'both'. |
|
| 13 |
#' @param log_base Base of the logarithm used for Shannon limits and |
|
| 14 |
#' normalization |
|
| 15 |
#' (default: \code{exp(1)}).
|
|
| 16 |
#' @export |
|
| 17 |
#' @return For `what = 'S'` or `what = 'D'`: a numeric vector |
|
| 18 |
#' (named when length(q) > 1). For `what = 'both'`: a list with |
|
| 19 |
#' components `$S` and `$D`. |
|
| 20 |
#' @details |
|
| 21 |
#' Implements S_q = (1 - sum p^q)/(q - 1) and D_q = (sum p^q)^(1/(1-q)). |
|
| 22 |
#' Uses the q->1 limits (Shannon entropy and its exponential). Natural |
|
| 23 |
#' logarithms are used for the q->1 limit and for normalization. |
|
| 24 |
#' @examples |
|
| 25 |
#' x <- c(10, 5, 0) |
|
| 26 |
#' calculate_tsallis_entropy(x, q = c(0.5, 1, 2), norm = TRUE) |
|
| 27 |
calculate_tsallis_entropy <- function(x, q = 2, norm = TRUE, what = c("S", "D", "both"),
|
|
| 28 |
log_base = exp(1)) {
|
|
| 29 | 4875x |
what <- match.arg(what) |
| 30 | 4875x |
if (!is.numeric(q)) {
|
| 31 | 1x |
stop("q must be numeric.")
|
| 32 |
} |
|
| 33 | 4874x |
if (any(q <= 0)) {
|
| 34 | 5x |
stop("q must be greater than 0.")
|
| 35 |
} |
|
| 36 | 4869x |
if (!is.numeric(x)) {
|
| 37 | 1x |
stop("x must be numeric")
|
| 38 |
} |
|
| 39 | ||
| 40 | 4868x |
n <- length(x) |
| 41 |
# If all counts sum to zero, return NA; allow single-element vectors to |
|
| 42 |
# proceed |
|
| 43 | 4868x |
if (sum(x, na.rm = TRUE) <= 0) {
|
| 44 | 31x |
if (what == "both") {
|
| 45 | 1x |
return(list(S = rep(NA_real_, length(q)), D = rep(NA_real_, length(q)))) |
| 46 |
} |
|
| 47 | 30x |
return(rep(NA_real_, length(q))) |
| 48 |
} |
|
| 49 | ||
| 50 | 4837x |
p <- x/sum(x) |
| 51 | ||
| 52 | 4837x |
tol <- sqrt(.Machine$double.eps) |
| 53 | 4837x |
S_vec <- .tsenat_calc_S(p = p, q = q, tol = tol, n = n, log_base = log_base, |
| 54 | 4837x |
norm = norm) |
| 55 | 4837x |
D_vec <- .tsenat_calc_D(p = p, q = q, tol = tol, log_base = log_base) |
| 56 | ||
| 57 | 4837x |
if (what == "S") {
|
| 58 | 4822x |
out <- S_vec |
| 59 | 4822x |
if (length(q) > 1) {
|
| 60 | 413x |
names(out) <- paste0("q=", q)
|
| 61 |
} |
|
| 62 | 4822x |
if (length(q) == 1) {
|
| 63 | 4409x |
return(unname(out)) |
| 64 |
} |
|
| 65 | 413x |
return(out) |
| 66 |
} |
|
| 67 | 15x |
if (what == "D") {
|
| 68 | 15x |
out <- D_vec |
| 69 | 15x |
if (length(q) > 1) {
|
| 70 | 9x |
names(out) <- paste0("q=", q)
|
| 71 |
} |
|
| 72 | 15x |
if (length(q) == 1) {
|
| 73 | 6x |
return(unname(out)) |
| 74 |
} |
|
| 75 | 9x |
return(out) |
| 76 |
} |
|
| 77 |
# both |
|
| 78 | ! |
names(S_vec) <- paste0("q=", q)
|
| 79 | ! |
names(D_vec) <- paste0("q=", q)
|
| 80 | ! |
return(list(S = S_vec, D = D_vec)) |
| 81 |
} |
| 1 |
## Helper: Extract tx2gene mapping from GFF3 file |
|
| 2 |
#' @title Extract Transcript-to-Gene Mapping from GFF3 File |
|
| 3 |
#' @description Internal function that parses GFF3 or GFF3.gz files to extract |
|
| 4 |
#' transcript-to-gene mappings. This function is called internally by \code{build_se()}
|
|
| 5 |
#' when a GFF3 file path is provided. Users should not call this function directly. |
|
| 6 |
#' @param gff3_file Path to a GFF3 or GFF3.gz file containing transcript/mRNA |
|
| 7 |
#' features with ID and Parent attributes. |
|
| 8 |
#' @return A data.frame with two columns (Transcript, Gene) containing the |
|
| 9 |
#' transcript-to-gene mapping extracted from the GFF3 file. |
|
| 10 |
#' @details |
|
| 11 |
#' The function expects GFF3 format with the following structure: |
|
| 12 |
#' \itemize{
|
|
| 13 |
#' \item 9 tab-separated columns: seqname, source, feature, start, end, |
|
| 14 |
#' score, strand, phase, attributes |
|
| 15 |
#' \item Feature type column (3rd column) should contain 'transcript' or 'mRNA' |
|
| 16 |
#' \item Attributes column (9th column) should contain ID and Parent fields |
|
| 17 |
#' \item ID field: unique identifier for the transcript |
|
| 18 |
#' \item Parent field: references the gene ID that this transcript belongs to |
|
| 19 |
#' } |
|
| 20 |
#' Example GFF3 line: |
|
| 21 |
#' \preformatted{chr1\tgencode\ttranscript\t1000\t3000\t.\t+\t.\t
|
|
| 22 |
#' ID=ENST00000001;Parent=ENSG00000101;Name=BRCA1-001} |
|
| 23 |
#' |
|
| 24 |
#' \strong{Performance:} This function is optimized for large GFF3 files:
|
|
| 25 |
#' \itemize{
|
|
| 26 |
#' \item Reads files in 10,000-line chunks (not line-by-line) |
|
| 27 |
#' \item Uses fast pre-filtering (feature type check before regex) |
|
| 28 |
#' \item Employs efficient string operations instead of heavy regex on every line |
|
| 29 |
#' \item Handles both compressed (.gz) and uncompressed files seamlessly |
|
| 30 |
#' } |
|
| 31 |
#' @examples |
|
| 32 |
#' \dontrun{
|
|
| 33 |
#' # Create a temporary GFF3 file with transcript features |
|
| 34 |
#' gff3_lines <- c( |
|
| 35 |
#' 'chr1\tgencode\ttranscript\t1000\t3000\t.\t+\t.\tID=ENST001;Parent=ENSG001', |
|
| 36 |
#' 'chr1\tgencode\ttranscript\t1500\t3500\t.\t+\t.\tID=ENST002;Parent=ENSG001', |
|
| 37 |
#' 'chr2\tgencode\ttranscript\t5000\t8000\t.\t-\t.\tID=ENST003;Parent=ENSG002' |
|
| 38 |
#' ) |
|
| 39 |
#' tf <- tempfile(fileext = '.gff3') |
|
| 40 |
#' writeLines(gff3_lines, tf) |
|
| 41 |
#' |
|
| 42 |
#' # Extract transcript-to-gene mapping |
|
| 43 |
#' tx2gene <- extract_tx2gene_from_gff3(tf) |
|
| 44 |
#' head(tx2gene) |
|
| 45 |
#' } |
|
| 46 |
extract_tx2gene_from_gff3 <- function(gff3_file) {
|
|
| 47 |
# Handle both .gff3 and .gff3.gz files |
|
| 48 | 25x |
if (grepl("\\.gff3\\.gz$", gff3_file)) {
|
| 49 | 4x |
con <- gzfile(gff3_file, "rt") |
| 50 |
} else {
|
|
| 51 | 21x |
con <- file(gff3_file, "r") |
| 52 |
} |
|
| 53 | ||
| 54 | 25x |
on.exit(close(con)) |
| 55 | ||
| 56 |
# Pre-allocate vectors for efficiency (start with capacity for 10k |
|
| 57 |
# transcripts) This avoids repeatedly growing lists/data.frames which is |
|
| 58 |
# slow |
|
| 59 | 25x |
tx2gene_transcripts <- character(10000) |
| 60 | 25x |
tx2gene_genes <- character(10000) |
| 61 | 25x |
idx <- 0 |
| 62 | 25x |
chunk_size <- 10000 # Read in chunks for better performance |
| 63 | ||
| 64 | 25x |
while (TRUE) {
|
| 65 |
# Read lines in chunks instead of one-by-one for better performance |
|
| 66 | 50x |
lines <- readLines(con, n = chunk_size) |
| 67 | 50x |
if (length(lines) == 0) |
| 68 | 25x |
break |
| 69 | ||
| 70 |
# Process each line in the chunk |
|
| 71 | 25x |
for (line in lines) {
|
| 72 |
# Skip comments and empty lines (fast pre-filter) |
|
| 73 | 216x |
if (startsWith(line, "#") || line == "") |
| 74 | 40x |
next |
| 75 | ||
| 76 |
# Parse GFF3 line format: seqname source feature start end score |
|
| 77 |
# strand phase attributes Use strsplit only once per line |
|
| 78 | 176x |
fields <- strsplit(line, "\t", fixed = TRUE)[[1]] |
| 79 | 176x |
if (length(fields) < 9) |
| 80 | 1x |
next |
| 81 | ||
| 82 |
# Extract feature type (3rd column) - do this check first to skip |
|
| 83 |
# early |
|
| 84 | 175x |
feature_type <- fields[3] |
| 85 | 175x |
if (!feature_type %in% c("transcript", "mRNA"))
|
| 86 | 30x |
next |
| 87 | ||
| 88 |
# Only now extract attributes from the (potentially long) 9th |
|
| 89 |
# column |
|
| 90 | 145x |
attributes <- fields[9] |
| 91 | ||
| 92 |
# Extract ID and Parent using efficient substring operations Find |
|
| 93 |
# positions of ID= and Parent= patterns |
|
| 94 | 145x |
id_start <- regexpr("ID=", attributes, fixed = TRUE) + 3
|
| 95 | 145x |
if (id_start > 3) {
|
| 96 |
# ID found, extract until semicolon or end of string |
|
| 97 | 144x |
id_end <- regexpr(";", substr(attributes, id_start, nchar(attributes)),
|
| 98 | 144x |
fixed = TRUE) |
| 99 | 144x |
if (id_end > 0) {
|
| 100 | 142x |
transcript_id <- substr(attributes, id_start, id_start + id_end - |
| 101 | 142x |
2) |
| 102 |
} else {
|
|
| 103 | 2x |
transcript_id <- substr(attributes, id_start, nchar(attributes)) |
| 104 |
} |
|
| 105 |
} else {
|
|
| 106 | 1x |
transcript_id <- NA_character_ |
| 107 |
} |
|
| 108 | ||
| 109 | 145x |
parent_start <- regexpr("Parent=", attributes, fixed = TRUE) + 7
|
| 110 | 145x |
if (parent_start > 7) {
|
| 111 |
# Parent found, extract until semicolon or end of string |
|
| 112 | 143x |
parent_end <- regexpr(";", substr(attributes, parent_start, nchar(attributes)),
|
| 113 | 143x |
fixed = TRUE) |
| 114 | 143x |
if (parent_end > 0) {
|
| 115 | 27x |
gene_id <- substr(attributes, parent_start, parent_start + parent_end - |
| 116 | 27x |
2) |
| 117 |
} else {
|
|
| 118 | 116x |
gene_id <- substr(attributes, parent_start, nchar(attributes)) |
| 119 |
} |
|
| 120 |
} else {
|
|
| 121 | 2x |
gene_id <- NA_character_ |
| 122 |
} |
|
| 123 | ||
| 124 |
# Store mapping if both IDs are present |
|
| 125 | 145x |
if (!is.na(transcript_id) && !is.na(gene_id)) {
|
| 126 | 142x |
idx <- idx + 1 |
| 127 |
# Grow vectors if needed |
|
| 128 | 142x |
if (idx > length(tx2gene_transcripts)) {
|
| 129 | ! |
tx2gene_transcripts <- c(tx2gene_transcripts, rep(NA_character_, |
| 130 | ! |
10000)) |
| 131 | ! |
tx2gene_genes <- c(tx2gene_genes, rep(NA_character_, 10000)) |
| 132 |
} |
|
| 133 | 142x |
tx2gene_transcripts[idx] <- transcript_id |
| 134 | 142x |
tx2gene_genes[idx] <- gene_id |
| 135 |
} |
|
| 136 |
} |
|
| 137 |
} |
|
| 138 | ||
| 139 |
# Trim to actual size and create data frame (empty if no mappings found) |
|
| 140 | 25x |
if (idx == 0) {
|
| 141 | 7x |
tx2gene_df <- data.frame(Transcript = character(0), Gene = character(0), |
| 142 | 7x |
stringsAsFactors = FALSE) |
| 143 |
} else {
|
|
| 144 | 18x |
tx2gene_df <- data.frame(Transcript = tx2gene_transcripts[seq_len(idx)], |
| 145 | 18x |
Gene = tx2gene_genes[seq_len(idx)], stringsAsFactors = FALSE) |
| 146 |
} |
|
| 147 | 25x |
rownames(tx2gene_df) <- NULL |
| 148 | ||
| 149 | 25x |
return(tx2gene_df) |
| 150 |
} |
|
| 151 | ||
| 152 |
## Helper: build SummarizedExperiment from readcounts + tx2gene |
|
| 153 |
#' Build a SummarizedExperiment from transcript readcounts and tx->gene map |
|
| 154 |
#' |
|
| 155 |
#' This helper creates a `SummarizedExperiment` with an assay named |
|
| 156 |
#' `counts`, stores the `tx2gene` table and the raw `readcounts` in |
|
| 157 |
#' `metadata()`, and extracts gene assignments from the tx2gene mapping |
|
| 158 |
#' to populate `rowData(se)$genes`. |
|
| 159 |
#' |
|
| 160 |
#' The function accepts three different input types for the tx2gene mapping: |
|
| 161 |
#' a file path (TSV or GFF3 format) or an in-memory data.frame. |
|
| 162 |
#' |
|
| 163 |
#' @param readcounts Numeric matrix or data.frame of transcript-level counts |
|
| 164 |
#' (rows = transcripts, columns = samples). Rownames should contain |
|
| 165 |
#' transcript IDs that match the first column of tx2gene. |
|
| 166 |
#' |
|
| 167 |
#' @param tx2gene Transcript-to-gene mapping. Can be one of: |
|
| 168 |
#' \describe{
|
|
| 169 |
#' \item{\strong{TSV File Path}}{A path to a tab-separated file with at least
|
|
| 170 |
#' two columns: Transcript ID (first column) and Gene ID (second column). |
|
| 171 |
#' The file should have a header row with column names. |
|
| 172 |
#' File extension should be .tsv, .txt, or similar. |
|
| 173 |
#' Example: |
|
| 174 |
#' \preformatted{Transcript\tGene
|
|
| 175 |
#' ENST00000001\tENSG00000101 |
|
| 176 |
#' ENST00000002\tENSG00000102} |
|
| 177 |
#' } |
|
| 178 |
#' \item{\strong{GFF3 File Path}}{A path to a GFF3 annotation file (.gff3 or
|
|
| 179 |
#' .gff3.gz format). The file should contain transcript/mRNA features with |
|
| 180 |
#' ID and Parent attributes. Parent attributes reference gene IDs. |
|
| 181 |
#' Example GFF3 line: |
|
| 182 |
#' \preformatted{chr1\tgencode\ttranscript\t100\t2100\t.\t+\t.\t
|
|
| 183 |
#' ID=ENST00000001;Parent=ENSG00000101} |
|
| 184 |
#' The function automatically detects GFF3 format by file extension |
|
| 185 |
#' (.gff3 or .gff3.gz) and parses accordingly. |
|
| 186 |
#' } |
|
| 187 |
#' \item{\strong{Data Frame}}{An in-memory data.frame with Transcript
|
|
| 188 |
#' and Gene columns. Useful when mapping data is already loaded in R. |
|
| 189 |
#' Example: |
|
| 190 |
#' \preformatted{ Transcript Gene
|
|
| 191 |
#' 1 ENST00000001 ENSG00000101 |
|
| 192 |
#' 2 ENST00000002 ENSG00000102} |
|
| 193 |
#' } |
|
| 194 |
#' } |
|
| 195 |
#' |
|
| 196 |
#' @param assay_name Name for the assay to store readcounts (default: 'counts'). |
|
| 197 |
#' |
|
| 198 |
#' @return A `SummarizedExperiment` with assay, `metadata()$tx2gene`, |
|
| 199 |
#' `metadata()$readcounts` and `rowData(se)$genes` populated. |
|
| 200 |
#' |
|
| 201 |
#' @details |
|
| 202 |
#' \strong{Input Format Detection:}
|
|
| 203 |
#' \itemize{
|
|
| 204 |
#' \item If tx2gene is a character string ending in .gff3 or .gff3.gz, |
|
| 205 |
#' it is parsed as a GFF3 file. |
|
| 206 |
#' \item If tx2gene is a character string with any other extension or |
|
| 207 |
#' no extension, it is parsed as a tab-separated file. |
|
| 208 |
#' \item If tx2gene is a data.frame, it is used directly. |
|
| 209 |
#' } |
|
| 210 |
#' |
|
| 211 |
#' \strong{Transcript Matching:}
|
|
| 212 |
#' All transcript IDs in the readcounts object (rownames) must have a |
|
| 213 |
#' corresponding entry in the tx2gene mapping. If any transcript is missing, |
|
| 214 |
#' an error is raised. |
|
| 215 |
#' |
|
| 216 |
#' \strong{Performance:}
|
|
| 217 |
#' \itemize{
|
|
| 218 |
#' \item GFF3 files are processed efficiently even for large annotations |
|
| 219 |
#' (e.g., full GENCODE with 100k+ transcripts) |
|
| 220 |
#' \item TSV files are standard tab-separated format for fast parsing |
|
| 221 |
#' \item Data.frame inputs have no I/O overhead |
|
| 222 |
#' } |
|
| 223 |
#' |
|
| 224 |
#' @export |
|
| 225 |
#' |
|
| 226 |
#' @examples |
|
| 227 |
#' # Example 1: Using data.frame (in-memory mapping) |
|
| 228 |
#' tx2gene <- data.frame( |
|
| 229 |
#' Transcript = c('ENST00000001', 'ENST00000002'),
|
|
| 230 |
#' Gene = c('ENSG00000101', 'ENSG00000102')
|
|
| 231 |
#' ) |
|
| 232 |
#' readcounts <- matrix(c(10, 5, 2, 3), |
|
| 233 |
#' nrow = 2, |
|
| 234 |
#' dimnames = list(c('ENST00000001', 'ENST00000002'), c('s1', 's2'))
|
|
| 235 |
#' ) |
|
| 236 |
#' se <- build_se(readcounts, tx2gene) |
|
| 237 |
#' |
|
| 238 |
#' # Example 2: Using TSV file path |
|
| 239 |
#' # Assuming you have a file 'tx2gene.tsv' with Transcript and Gene columns |
|
| 240 |
#' # se <- build_se(readcounts, 'path/to/tx2gene.tsv') |
|
| 241 |
#' |
|
| 242 |
#' # Example 3: Using GFF3.gz file path |
|
| 243 |
#' # Assuming you have a file 'annotation.gff3.gz' with transcript features |
|
| 244 |
#' # se <- build_se(readcounts, 'path/to/annotation.gff3.gz') |
|
| 245 |
build_se <- function(readcounts, tx2gene, assay_name = "counts") {
|
|
| 246 | 38x |
if (is.character(tx2gene) && length(tx2gene) == 1) {
|
| 247 | 26x |
if (!file.exists(tx2gene)) {
|
| 248 | 3x |
stop("tx2gene file not found: ", tx2gene, call. = FALSE)
|
| 249 |
} |
|
| 250 | ||
| 251 |
# Detect file type and parse accordingly |
|
| 252 | 23x |
if (grepl("\\.gff3(\\.gz)?$", tx2gene, ignore.case = TRUE)) {
|
| 253 | 21x |
message("Detected GFF3 format. Extracting transcript-to-gene mapping...")
|
| 254 | 21x |
tx2gene_df <- extract_tx2gene_from_gff3(tx2gene) |
| 255 |
} else {
|
|
| 256 |
# Assume TSV format (backward compatible) |
|
| 257 | 2x |
tx2gene_df <- utils::read.table(tx2gene, header = TRUE, sep = "\t", stringsAsFactors = FALSE) |
| 258 |
} |
|
| 259 | 12x |
} else if (is.data.frame(tx2gene)) {
|
| 260 | 10x |
tx2gene_df <- tx2gene |
| 261 |
} else {
|
|
| 262 | 2x |
stop("'tx2gene' must be a path (TSV or GFF3) or a data.frame.", call. = FALSE)
|
| 263 |
} |
|
| 264 | ||
| 265 | 33x |
if (is.data.frame(readcounts)) {
|
| 266 | 1x |
readcounts <- as.matrix(readcounts) |
| 267 |
} |
|
| 268 | 33x |
if (!is.matrix(readcounts) || !is.numeric(readcounts)) {
|
| 269 | 1x |
stop("'readcounts' must be a numeric matrix or numeric data.frame.", call. = FALSE)
|
| 270 |
} |
|
| 271 | ||
| 272 |
# Extract transcript IDs from rownames or use row indices |
|
| 273 | 32x |
tx_ids <- rownames(readcounts) |
| 274 | 32x |
if (is.null(tx_ids)) {
|
| 275 | ! |
stop("'readcounts' must have transcript IDs as rownames.", call. = FALSE)
|
| 276 |
} |
|
| 277 | ||
| 278 |
# Extract gene names from tx2gene based on transcript IDs |
|
| 279 | 32x |
tx_col <- if ("Transcript" %in% colnames(tx2gene_df))
|
| 280 | 32x |
"Transcript" else colnames(tx2gene_df)[1] |
| 281 | 32x |
genes <- tx2gene_df$Gene[match(tx_ids, tx2gene_df[[tx_col]])] |
| 282 | ||
| 283 | 32x |
if (any(is.na(genes))) {
|
| 284 | 9x |
stop("Some transcript IDs in readcounts were not found in tx2gene mapping.",
|
| 285 | 9x |
call. = FALSE) |
| 286 |
} |
|
| 287 | ||
| 288 | 23x |
assays_list <- S4Vectors::SimpleList() |
| 289 | 23x |
assays_list[[assay_name]] <- readcounts |
| 290 | ||
| 291 | 23x |
se <- SummarizedExperiment::SummarizedExperiment(assays = assays_list) |
| 292 | 23x |
S4Vectors::metadata(se)$tx2gene <- tx2gene_df |
| 293 | 23x |
S4Vectors::metadata(se)$readcounts <- readcounts |
| 294 | ||
| 295 |
# Populate rowData with gene assignments and transcript IDs |
|
| 296 | 23x |
SummarizedExperiment::rowData(se) <- S4Vectors::DataFrame(genes = genes, row.names = tx_ids) |
| 297 | ||
| 298 | 23x |
return(se) |
| 299 |
} |
| 1 |
#' Calculate Tsallis diversity per gene across samples |
|
| 2 |
#' |
|
| 3 |
#' @param x A numeric matrix or data.frame of transcript-level expression |
|
| 4 |
#' values (rows = transcripts, columns = samples), or a SummarizedExperiment- |
|
| 5 |
#' like object. |
|
| 6 |
#' @param tpm Logical. If TRUE and `x` is a tximport-style list, use the |
|
| 7 |
#' `$abundance` matrix instead of `$counts`. |
|
| 8 |
#' @param genes Character vector assigning each transcript (row) to a gene. |
|
| 9 |
#' Must have length equal to nrow(x) or the number of transcripts in `x`. |
|
| 10 |
#' @param norm Logical; if TRUE, normalize Tsallis entropy to [0,1] per gene. |
|
| 11 |
#' @param assayno Integer assay index to use when `x` is a SummarizedExperiment. |
|
| 12 |
#' @param verbose Logical; print diagnostic messages when TRUE. |
|
| 13 |
#' @param q Numeric scalar or vector of Tsallis q values to evaluate (q > 0). |
|
| 14 |
#' If length(q) > 1, the result will contain separate columns per sample and |
|
| 15 |
#' q. |
|
| 16 |
#' @param what Which quantity to return: 'S' for Tsallis entropy or 'D' for Hill |
|
| 17 |
#' numbers. |
|
| 18 |
#' @param nthreads Number of threads for parallel processing (default: 1). |
|
| 19 |
#' Set to > 1 to parallelize per-gene entropy calculations. |
|
| 20 |
#' @return A \link[SummarizedExperiment]{SummarizedExperiment} with assay
|
|
| 21 |
#' `diversity` containing per-gene diversity values. |
|
| 22 |
#' @import methods |
|
| 23 |
#' @importFrom SummarizedExperiment SummarizedExperiment assays assay rowData |
|
| 24 |
#' colData |
|
| 25 |
#' @export |
|
| 26 |
#' @examples |
|
| 27 |
#' data('tcga_brca_luma', package = 'TSENAT')
|
|
| 28 |
#' rc <- as.matrix(tcga_brca_luma[1:20, -1, drop = FALSE]) |
|
| 29 |
#' gs <- tcga_brca_luma[1:20, 1] |
|
| 30 |
#' se <- calculate_diversity(rc, gs, q = 0.1, norm = TRUE) |
|
| 31 |
#' SummarizedExperiment::assay(se)[1:3, 1:3] |
|
| 32 |
calculate_diversity <- function(x, genes = NULL, norm = TRUE, tpm = FALSE, assayno = 1, |
|
| 33 |
verbose = FALSE, q = 2, what = c("S", "D"), nthreads = 1) {
|
|
| 34 |
# Normalize and validate input data, extract matrix and gene mapping |
|
| 35 | 30x |
inp <- .tsenat_prepare_diversity_input(x = x, genes = genes, tpm = tpm, assayno = assayno, |
| 36 | 30x |
verbose = verbose) |
| 37 | 28x |
x <- inp$x |
| 38 | 28x |
genes <- inp$genes |
| 39 | 28x |
se_assay_mat <- inp$se_assay_mat |
| 40 | ||
| 41 | 28x |
if (!is.numeric(x)) {
|
| 42 | 1x |
stop("Input data must be numeric!", call. = FALSE)
|
| 43 |
} |
|
| 44 | ||
| 45 | 27x |
if (any(is.na(x))) {
|
| 46 | 1x |
stop("The data contains NA as expression values. NAs are not allowed", " in the input.",
|
| 47 | 1x |
call. = FALSE) |
| 48 |
} |
|
| 49 | ||
| 50 | 26x |
if (nrow(x) != length(genes)) {
|
| 51 | 1x |
stop("The number of rows is not equal to the given gene set.", call. = FALSE)
|
| 52 |
} |
|
| 53 | ||
| 54 | 25x |
what <- match.arg(what) |
| 55 |
# validate q values (Tsallis parameter must be > 0) |
|
| 56 | 25x |
if (!is.numeric(q) || any(q <= 0)) {
|
| 57 | 3x |
stop("Argument 'q' must be numeric and greater than 0.", call. = FALSE)
|
| 58 |
} |
|
| 59 |
# keep a copy of transcript-level counts when available (non-null) |
|
| 60 | 22x |
if (!exists("se_assay_mat")) {
|
| 61 | ! |
se_assay_mat <- x |
| 62 |
} |
|
| 63 | ||
| 64 | 22x |
result <- calculate_method(x, genes, norm, verbose = verbose, q = q, what = what, |
| 65 | 22x |
nthreads = nthreads) |
| 66 | ||
| 67 |
# Prepare assay and row/col data |
|
| 68 | 22x |
result_assay <- result[, -1, drop = FALSE] |
| 69 | 22x |
result_rowData <- data.frame(genes = result[, 1], row.names = result[, 1]) |
| 70 | ||
| 71 | 22x |
if (length(q) > 1) {
|
| 72 | 8x |
col_split <- do.call(rbind, strsplit(colnames(result)[-1], "_q=")) |
| 73 | 8x |
col_ids <- paste(col_split[, 1], "_q=", col_split[, 2], sep = "") |
| 74 | 8x |
row_ids <- as.character(result[, 1]) |
| 75 | 8x |
result_colData <- data.frame(samples = as.character(col_split[, 1]), q = as.numeric(col_split[, |
| 76 | 8x |
2]), row.names = col_ids, stringsAsFactors = FALSE) |
| 77 | 8x |
colnames(result_assay) <- col_ids |
| 78 | 8x |
rownames(result_assay) <- row_ids |
| 79 |
} else {
|
|
| 80 | 14x |
col_ids <- colnames(x) |
| 81 | 14x |
row_ids <- as.character(result[, 1]) |
| 82 | 14x |
result_colData <- data.frame(samples = col_ids, row.names = col_ids) |
| 83 | 14x |
colnames(result_assay) <- col_ids |
| 84 | 14x |
rownames(result_assay) <- row_ids |
| 85 |
} |
|
| 86 | ||
| 87 | 22x |
result_metadata <- list(method = "tsallis", norm = norm, q = q, what = what) |
| 88 | ||
| 89 |
# if we have preserved the original transcript-level matrix, build a |
|
| 90 |
# tx->gene mapping and make both available in metadata so downstream |
|
| 91 |
# plotting helpers can find transcript IDs and their parent genes |
|
| 92 | 22x |
tx2gene_map <- NULL |
| 93 | 22x |
if (exists("se_assay_mat") && !is.null(rownames(se_assay_mat)) && length(genes) ==
|
| 94 | 22x |
nrow(se_assay_mat)) {
|
| 95 | 2x |
tx2gene_map <- data.frame(Transcript = rownames(se_assay_mat), Gen = as.character(genes), |
| 96 | 2x |
stringsAsFactors = FALSE) |
| 97 |
} |
|
| 98 | ||
| 99 | 22x |
assays_list <- list() |
| 100 | 22x |
if (what == "S") {
|
| 101 | 21x |
assays_list$diversity <- result_assay |
| 102 | 1x |
} else if (what == "D") {
|
| 103 | 1x |
assays_list$hill <- result_assay |
| 104 |
} |
|
| 105 | ||
| 106 | 22x |
result <- SummarizedExperiment::SummarizedExperiment(assays = assays_list, rowData = result_rowData, |
| 107 | 22x |
colData = result_colData, metadata = c(result_metadata, list(readcounts = if (exists("se_assay_mat")) se_assay_mat else NULL,
|
| 108 | 22x |
tx2gene = tx2gene_map))) |
| 109 | ||
| 110 | 22x |
return(result) |
| 111 |
} |
| 1 |
# Internal helpers for calculate_method |
|
| 2 | ||
| 3 |
.tsenat_tsallis_row <- function(x, genes, gene, q, norm, what) {
|
|
| 4 | 687x |
idx <- which(genes == gene) |
| 5 | 687x |
out <- unlist(lapply(seq_len(ncol(x)), function(j) {
|
| 6 | 4817x |
v <- calculate_tsallis_entropy(x[idx, j], q = q, norm = norm, what = what) |
| 7 | 4817x |
if (length(v) == length(q) && all(is.finite(v) | is.na(v))) {
|
| 8 | 4815x |
v |
| 9 |
} else {
|
|
| 10 | 2x |
names_vec <- paste0("q=", q)
|
| 11 | 2x |
setNames(rep(NA_real_, length(q)), names_vec) |
| 12 |
} |
|
| 13 |
})) |
|
| 14 | 687x |
out |
| 15 |
} |
| 1 |
# Internal plot helpers |
|
| 2 | ||
| 3 |
.tsenat_format_label <- function(lbl) {
|
|
| 4 | 41x |
if (is.null(lbl)) {
|
| 5 | 1x |
return(NULL) |
| 6 |
} |
|
| 7 | 40x |
s <- gsub("_", " ", lbl)
|
| 8 | 40x |
s <- gsub("\\s+", " ", s)
|
| 9 | 40x |
s <- trimws(s) |
| 10 | 40x |
s <- tolower(s) |
| 11 | 40x |
if (nchar(s) == 0) {
|
| 12 | 1x |
return(s) |
| 13 |
} |
|
| 14 | 39x |
if (nchar(s) == 1) {
|
| 15 | 1x |
return(toupper(s)) |
| 16 |
} |
|
| 17 | 38x |
paste0(toupper(substr(s, 1, 1)), substr(s, 2, nchar(s))) |
| 18 |
} |
|
| 19 | ||
| 20 |
.tsenat_prepare_ma_plot_df <- function(df, fold_col, mean_cols, x_label, y_label) {
|
|
| 21 |
# Detect x-axis values |
|
| 22 | 15x |
if (length(mean_cols) >= 2) {
|
| 23 | 3x |
xvals <- rowMeans(df[, mean_cols[seq_len(2)], drop = FALSE], na.rm = TRUE) |
| 24 | 3x |
x_label <- x_label %||% paste0(mean_cols[1], " vs ", mean_cols[2]) |
| 25 | 12x |
} else if (length(mean_cols) == 1) {
|
| 26 | 1x |
xvals <- as.numeric(df[[mean_cols[1]]]) |
| 27 | 1x |
x_label <- x_label %||% mean_cols[1] |
| 28 | 11x |
} else if ("mean" %in% colnames(df)) {
|
| 29 | 9x |
xvals <- as.numeric(df$mean) |
| 30 | 9x |
x_label <- x_label %||% "Mean" |
| 31 |
} else {
|
|
| 32 | 2x |
xvals <- seq_len(nrow(df)) |
| 33 | 2x |
x_label <- x_label %||% "Index" |
| 34 |
} |
|
| 35 | ||
| 36 | 15x |
yvals <- as.numeric(df[[fold_col]]) |
| 37 | ||
| 38 | 15x |
padj_candidates <- c("adjusted_p_values", "adj_p_value", "adj_p", "padj", "p.adjust")
|
| 39 | 15x |
padj_col <- intersect(padj_candidates, colnames(df)) |
| 40 | 15x |
padj_col <- if (length(padj_col)) {
|
| 41 | 5x |
padj_col[1] |
| 42 |
} else {
|
|
| 43 | 10x |
NULL |
| 44 |
} |
|
| 45 | ||
| 46 | 15x |
padj <- if (!is.null(padj_col)) {
|
| 47 | 5x |
as.numeric(df[[padj_col]]) |
| 48 |
} else {
|
|
| 49 | 10x |
rep(1, length(yvals)) |
| 50 |
} |
|
| 51 | 15x |
padj[is.na(padj)] <- 1 |
| 52 | ||
| 53 | 15x |
sig_flag <- ifelse(abs(yvals) > 0 & padj < 0.05, "significant", "non-significant") |
| 54 | ||
| 55 | 15x |
plot_df <- data.frame(genes = df$genes, x = xvals, y = yvals, padj = padj, significant = sig_flag, |
| 56 | 15x |
stringsAsFactors = FALSE) |
| 57 | ||
| 58 | 15x |
list(plot_df = plot_df, x_label = x_label, y_label = y_label) |
| 59 |
} |
|
| 60 | ||
| 61 |
.tsenat_prepare_volcano_df <- function(diff_df, x_col = NULL, padj_col = "adjusted_p_values", |
|
| 62 |
label_thresh = 0.1, padj_thresh = 0.05, title = NULL) {
|
|
| 63 | 14x |
df <- as.data.frame(diff_df) |
| 64 | 14x |
cn <- colnames(df) |
| 65 | ||
| 66 |
# Auto-detect x-axis column if not specified |
|
| 67 | 14x |
if (is.null(x_col)) {
|
| 68 | 5x |
diff_cols <- grep("_difference$", cn, value = TRUE, ignore.case = TRUE)
|
| 69 | 5x |
if (length(diff_cols) > 0) {
|
| 70 | 2x |
x_col <- diff_cols[1] |
| 71 |
} else {
|
|
| 72 | 3x |
numeric_cols <- vapply(df, is.numeric, logical(1)) |
| 73 | 3x |
p_cols <- grep("p_value|p.value", cn, ignore.case = TRUE)
|
| 74 | 3x |
numeric_cols[p_cols] <- FALSE |
| 75 | 3x |
if (any(numeric_cols)) {
|
| 76 | 3x |
x_col <- cn[which(numeric_cols)[1]] |
| 77 |
} else {
|
|
| 78 | ! |
stop("Could not find suitable column for x-axis. Specify 'x_col' explicitly.")
|
| 79 |
} |
|
| 80 |
} |
|
| 81 |
} |
|
| 82 | ||
| 83 |
# Verify columns |
|
| 84 | 14x |
if (!(x_col %in% cn)) {
|
| 85 | 1x |
stop(sprintf("Column '%s' not found in diff_df", x_col))
|
| 86 |
} |
|
| 87 | 13x |
if (!(padj_col %in% cn)) {
|
| 88 | 5x |
stop(sprintf("Column '%s' not found in diff_df", padj_col))
|
| 89 |
} |
|
| 90 | ||
| 91 | 8x |
df$xval <- as.numeric(df[[x_col]]) |
| 92 | 8x |
df$padj <- as.numeric(df[[padj_col]]) |
| 93 | 8x |
df$padj[is.na(df$padj)] <- 1 |
| 94 | 8x |
df$padj[df$padj <= 0] <- .Machine$double.xmin |
| 95 | ||
| 96 | 8x |
df$significant <- ifelse(abs(df$xval) >= label_thresh & df$padj < padj_thresh, |
| 97 | 8x |
"significant", "non-significant") |
| 98 | 8x |
df <- df[is.finite(df$xval) & is.finite(df$padj), ] |
| 99 | ||
| 100 | 8x |
if (nrow(df) == 0) {
|
| 101 | 1x |
stop("No valid points to plot")
|
| 102 |
} |
|
| 103 | ||
| 104 | 7x |
metric_label <- if (grepl("median", x_col, ignore.case = TRUE)) {
|
| 105 | 1x |
"Median" |
| 106 | 7x |
} else if (grepl("mean", x_col, ignore.case = TRUE)) {
|
| 107 | 3x |
"Mean" |
| 108 |
} else {
|
|
| 109 | 3x |
"Value" |
| 110 |
} |
|
| 111 | ||
| 112 | 7x |
title_use <- title %||% "Volcano plot: fold-change vs significance" |
| 113 | ||
| 114 | 7x |
x_label_formatted <- .tsenat_format_label(x_col) |
| 115 | 7x |
padj_label_formatted <- .tsenat_format_label(padj_col) |
| 116 | ||
| 117 | 7x |
list(df = df, x_col = x_col, padj_col = padj_col, x_label_formatted = x_label_formatted, |
| 118 | 7x |
padj_label_formatted = padj_label_formatted, title_use = title_use) |
| 119 |
} |
| 1 |
# # Helpers for generating diversity plots (density, violin, MA) |
|
| 2 | ||
| 3 |
#' @importFrom utils head |
|
| 4 |
#' @importFrom grDevices png dev.off |
|
| 5 |
if (getRversion() >= "2.15.1") {
|
|
| 6 |
utils::globalVariables( |
|
| 7 |
c( |
|
| 8 |
"Gene", |
|
| 9 |
"diversity", |
|
| 10 |
"sample", |
|
| 11 |
"sample_q", |
|
| 12 |
"sample_type", |
|
| 13 |
"fold", |
|
| 14 |
"significant", |
|
| 15 |
"value", |
|
| 16 |
".", |
|
| 17 |
"group", |
|
| 18 |
"tsallis", |
|
| 19 |
"q", |
|
| 20 |
"median", |
|
| 21 |
"IQR", |
|
| 22 |
"padj_num", |
|
| 23 |
"padj_clean", |
|
| 24 |
"xval", |
|
| 25 |
"label_flag", |
|
| 26 |
"genes", |
|
| 27 |
"mean", |
|
| 28 |
# transcript plotting globals |
|
| 29 |
"tx", |
|
| 30 |
"expr", |
|
| 31 |
"tx_cond", |
|
| 32 |
"log2expr", |
|
| 33 |
# generic plotting helpers |
|
| 34 |
"x", |
|
| 35 |
"y", |
|
| 36 |
"padj" |
|
| 37 |
) |
|
| 38 |
) |
|
| 39 | ||
| 40 |
require_pkgs <- function(pkgs) {
|
|
| 41 | 71x |
missing <- pkgs[!vapply(pkgs, requireNamespace, logical(1), quietly = TRUE)] |
| 42 | 71x |
if (length(missing)) {
|
| 43 | 1x |
stop( |
| 44 | 1x |
sprintf( |
| 45 | 1x |
"%s required", |
| 46 | 1x |
paste(missing, collapse = ", ") |
| 47 |
) |
|
| 48 |
) |
|
| 49 |
} |
|
| 50 | 70x |
invisible(TRUE) |
| 51 |
} |
|
| 52 |
} |
|
| 53 | ||
| 54 |
# Small utility helpers to reduce repeated code paths when extracting |
|
| 55 |
# samples, readcounts and tx->gene mappings from a |
|
| 56 |
# SummarizedExperiment. Keeping these as focused helpers improves |
|
| 57 |
# readability of the longer plotting functions below. |
|
| 58 |
infer_samples_from_se <- function(se, samples = NULL, sample_type_col = "sample_type") {
|
|
| 59 | 8x |
if (!is.null(samples)) {
|
| 60 | 1x |
return(as.character(samples)) |
| 61 |
} |
|
| 62 | 7x |
cd <- NULL |
| 63 | 7x |
try(cd <- SummarizedExperiment::colData(se), silent = TRUE) |
| 64 | 7x |
if (is.null(cd)) {
|
| 65 | ! |
return(NULL) |
| 66 |
} |
|
| 67 | ||
| 68 |
# Common column names to try |
|
| 69 | 7x |
candidates <- c( |
| 70 | 7x |
sample_type_col, |
| 71 | 7x |
"condition", |
| 72 | 7x |
"group", |
| 73 | 7x |
"sample_group", |
| 74 | 7x |
"sampleType", |
| 75 | 7x |
"class", |
| 76 | 7x |
"status", |
| 77 | 7x |
"phenotype" |
| 78 |
) |
|
| 79 | 7x |
for (nm in candidates) {
|
| 80 | 49x |
if (nm %in% colnames(cd)) {
|
| 81 | 1x |
return(as.character(cd[[nm]])) |
| 82 |
} |
|
| 83 |
} |
|
| 84 | ||
| 85 |
# Fallback: choose column with smallest >1 unique values |
|
| 86 | 6x |
uniq_counts <- vapply(cd, function(col) length(unique(na.omit(col))), integer(1)) |
| 87 | 6x |
valid_cols <- names(uniq_counts[uniq_counts > 1]) |
| 88 | 6x |
if (length(valid_cols) > 0) {
|
| 89 | 3x |
bin_cols <- valid_cols[uniq_counts[valid_cols] == 2] |
| 90 | 3x |
pick <- if (length(bin_cols) > 0) bin_cols[1] else valid_cols[which.min(uniq_counts[valid_cols])] |
| 91 | 3x |
return(as.character(cd[[pick]])) |
| 92 |
} |
|
| 93 | ||
| 94 | 3x |
NULL |
| 95 |
} |
|
| 96 | ||
| 97 |
get_readcounts_from_se <- function(se, readcounts_arg = NULL) {
|
|
| 98 |
# If user provided a readcounts object/path, accept it first |
|
| 99 | 14x |
if (!is.null(readcounts_arg)) {
|
| 100 | 6x |
if (is.character(readcounts_arg) && length(readcounts_arg) == 1) {
|
| 101 | ! |
if (!file.exists(readcounts_arg)) stop("readcounts file not found: ", readcounts_arg)
|
| 102 | 3x |
rc_df <- utils::read.delim(readcounts_arg, header = TRUE, stringsAsFactors = FALSE) |
| 103 | 3x |
if (!is.null(colnames(rc_df)) && ncol(rc_df) > 1) {
|
| 104 | 2x |
counts <- as.matrix(rc_df[, -1, drop = FALSE]) |
| 105 | 2x |
rownames(counts) <- rc_df[[1]] |
| 106 |
} else {
|
|
| 107 | 1x |
counts <- as.matrix(rc_df) |
| 108 |
} |
|
| 109 | 3x |
return(counts) |
| 110 | 3x |
} else if (is.matrix(readcounts_arg) || is.data.frame(readcounts_arg)) {
|
| 111 | 2x |
return(as.matrix(readcounts_arg)) |
| 112 |
} else {
|
|
| 113 | 1x |
stop("`readcounts` must be a matrix/data.frame or path to a file")
|
| 114 |
} |
|
| 115 |
} |
|
| 116 | ||
| 117 | 8x |
md <- NULL |
| 118 | 8x |
try(md <- S4Vectors::metadata(se), silent = TRUE) |
| 119 | 8x |
if (!is.null(md) && !is.null(md$readcounts)) {
|
| 120 | 2x |
return(as.matrix(md$readcounts)) |
| 121 |
} |
|
| 122 | ||
| 123 | 6x |
assay_names <- SummarizedExperiment::assayNames(se) |
| 124 | 6x |
preferred_assays <- c("readcounts", "counts", "tx_counts", "counts_tx")
|
| 125 | 6x |
chosen <- intersect(preferred_assays, assay_names) |
| 126 | 6x |
if (length(chosen) > 0) {
|
| 127 | 5x |
return(as.matrix(SummarizedExperiment::assay(se, chosen[1]))) |
| 128 |
} |
|
| 129 | ||
| 130 |
# fallback to first assay with a warning |
|
| 131 | 1x |
warning( |
| 132 | 1x |
"Using first assay from SummarizedExperiment to compute", |
| 133 | 1x |
" expression-based fold changes; ensure it contains", |
| 134 | 1x |
" transcript-level readcounts or provide metadata$readcounts" |
| 135 |
) |
|
| 136 | 1x |
as.matrix(SummarizedExperiment::assay(se)) |
| 137 |
} |
|
| 138 | ||
| 139 |
get_tx2gene_from_se <- function(se, readcounts_mat = NULL) {
|
|
| 140 | 7x |
md <- NULL |
| 141 | 7x |
try(md <- S4Vectors::metadata(se), silent = TRUE) |
| 142 |
# prefer explicit tx2gene in metadata |
|
| 143 | 7x |
if (!is.null(md) && !is.null(md$tx2gene) && is.data.frame(md$tx2gene)) {
|
| 144 | 3x |
txmap <- md$tx2gene |
| 145 |
# attempt to find sensible columns |
|
| 146 | 3x |
tx_col <- if ("Transcript" %in% colnames(txmap)) "Transcript" else colnames(txmap)[1]
|
| 147 | 3x |
gene_col <- if ("Gen" %in% colnames(txmap)) "Gen" else colnames(txmap)[2]
|
| 148 | 3x |
return(list( |
| 149 | 3x |
type = "vector", |
| 150 | 3x |
mapping = as.character(txmap[[gene_col]][match(rownames(readcounts_mat), txmap[[tx_col]])]) |
| 151 |
)) |
|
| 152 |
} |
|
| 153 | ||
| 154 |
# fallback: try rowData mapping |
|
| 155 | 4x |
rdata <- SummarizedExperiment::rowData(se) |
| 156 | 4x |
if (!is.null(rdata) && "genes" %in% colnames(rdata)) {
|
| 157 | 2x |
genes_vec <- as.character(rdata$genes) |
| 158 | 2x |
if (!is.null(readcounts_mat) && length(genes_vec) == nrow(readcounts_mat)) {
|
| 159 | 2x |
return(list(type = "vector", mapping = genes_vec)) |
| 160 |
} |
|
| 161 |
} |
|
| 162 | ||
| 163 |
# last resort: use rownames of readcounts as gene identifiers |
|
| 164 | 2x |
if (!is.null(readcounts_mat)) {
|
| 165 | 2x |
return(list(type = "vector", mapping = rownames(readcounts_mat))) |
| 166 |
} |
|
| 167 | ||
| 168 | ! |
NULL |
| 169 |
} |
|
| 170 | ||
| 171 |
validate_control_in_samples <- function(control, samples) {
|
|
| 172 | 4x |
uniq <- unique(samples) |
| 173 | 4x |
if (!is.null(control) && control %in% uniq) {
|
| 174 | 1x |
return(control) |
| 175 |
} |
|
| 176 | 3x |
if ("Normal" %in% uniq) {
|
| 177 | 2x |
return("Normal")
|
| 178 |
} |
|
| 179 |
# fallback to first level and message |
|
| 180 | 1x |
chosen <- uniq[1] |
| 181 | 1x |
message(sprintf("`control` not found; using '%s' instead", chosen))
|
| 182 | 1x |
chosen |
| 183 |
} |
|
| 184 | ||
| 185 |
#' Plot q-curve profile for a single gene comparing groups |
|
| 186 |
#' |
|
| 187 |
#' For a selected gene, plot per-sample Tsallis entropy across q values and |
|
| 188 |
#' overlay per-group median +/- IQR ribbons so group-level differences are |
|
| 189 |
#' easy to compare. Expects a `SummarizedExperiment` produced by |
|
| 190 |
#' `calculate_diversity()` with `_q=` suffixes in column names. |
|
| 191 |
#' |
|
| 192 |
#' @param se A `SummarizedExperiment` from `calculate_diversity()`. |
|
| 193 |
#' @param gene Character scalar or vector; gene symbol(s) to plot. If NULL and |
|
| 194 |
#' `lm_res` is supplied, the top `n_top` genes from `lm_res` (by |
|
| 195 |
#' `adj_p_interaction` or `p_interaction`) are used. |
|
| 196 |
#' @param lm_res Optional data.frame result from `calculate_lm_interaction()`. |
|
| 197 |
#' When supplied and `gene` is NULL, the top `n_top` significant genes will |
|
| 198 |
#' be plotted. |
|
| 199 |
#' @param n_top Number of top genes to plot when `lm_res` is provided (default: 10). |
|
| 200 |
#' @param assay_name Name of the assay to use (default: "diversity"). |
|
| 201 |
#' @param sample_type_col Column name in `colData(se)` with sample type labels |
|
| 202 |
#' (default: "sample_type"). If missing, a single-group fallback is used. |
|
| 203 |
#' @param show_samples Logical; if TRUE, draw per-sample lines in the |
|
| 204 |
#' background (default: FALSE). |
|
| 205 |
#' @return A `ggplot` object when a single gene is requested, or a named list |
|
| 206 |
#' of `ggplot` objects when multiple genes are requested. |
|
| 207 |
#' @examples |
|
| 208 |
#' mat <- matrix(runif(8), nrow = 2, dimnames = list(c("g1", "g2"), c("s1_q=0.1", "s1_q=1", "s2_q=0.1", "s2_q=1")))
|
|
| 209 |
#' se <- SummarizedExperiment::SummarizedExperiment(assays = list(diversity = mat)) |
|
| 210 |
#' plot_tsallis_gene_profile(se, gene = "g1") |
|
| 211 |
#' @export |
|
| 212 |
plot_tsallis_gene_profile <- function(se, |
|
| 213 |
gene = NULL, |
|
| 214 |
lm_res = NULL, |
|
| 215 |
n_top = 10, |
|
| 216 |
assay_name = "diversity", |
|
| 217 |
sample_type_col = "sample_type", |
|
| 218 |
show_samples = FALSE) {
|
|
| 219 | ! |
if (!requireNamespace("ggplot2", quietly = TRUE)) stop("ggplot2 required")
|
| 220 | 14x |
require_pkgs(c("dplyr", "tidyr", "SummarizedExperiment"))
|
| 221 | ||
| 222 | 14x |
long <- prepare_tsallis_long(se, assay_name = assay_name, sample_type_col = sample_type_col) |
| 223 | ! |
if (!("Gene" %in% colnames(long))) stop("prepare_tsallis_long did not return Gene column")
|
| 224 | ||
| 225 |
# Resolve genes to plot: accept NULL (use lm_res), a single name, or a vector/list |
|
| 226 | 14x |
if (is.null(gene)) {
|
| 227 | ! |
if (is.null(lm_res)) stop("Either 'gene' or 'lm_res' must be provided")
|
| 228 | 2x |
if (!is.data.frame(lm_res) || !("gene" %in% colnames(lm_res))) stop("'lm_res' must be a data.frame with a 'gene' column")
|
| 229 |
# prefer adj_p_interaction if present |
|
| 230 | 5x |
pcol <- if ("adj_p_interaction" %in% colnames(lm_res)) "adj_p_interaction" else if ("p_interaction" %in% colnames(lm_res)) "p_interaction" else NULL
|
| 231 | 1x |
if (is.null(pcol)) stop("'lm_res' must contain 'adj_p_interaction' or 'p_interaction' columns")
|
| 232 | 4x |
genes_ordered <- unique(as.character(lm_res$gene[order(lm_res[[pcol]])])) |
| 233 | 4x |
genes <- head(genes_ordered, n_top) |
| 234 |
} else {
|
|
| 235 | 7x |
genes <- as.character(unlist(gene)) |
| 236 |
} |
|
| 237 | ||
| 238 | ! |
if (length(genes) == 0) stop("No genes selected for plotting")
|
| 239 | ||
| 240 |
# helper to build single plot for a gene |
|
| 241 | 11x |
make_plot_for_gene <- function(sel) {
|
| 242 | 18x |
long_g <- long[as.character(long$Gene) == sel, , drop = FALSE] |
| 243 | 2x |
if (nrow(long_g) == 0) stop("Gene not found in assay: ", sel)
|
| 244 | 16x |
long_g$qnum <- as.numeric(as.character(long_g$q)) |
| 245 | 16x |
stats_df <- dplyr::summarise(dplyr::group_by(long_g, group, qnum), |
| 246 | 16x |
median = median(tsallis, na.rm = TRUE), |
| 247 | 16x |
IQR = stats::IQR(tsallis, na.rm = TRUE), .groups = "drop" |
| 248 |
) |
|
| 249 | ||
| 250 | 16x |
p <- ggplot2::ggplot() + |
| 251 | 16x |
ggplot2::theme_minimal(base_size = 14) |
| 252 | ||
| 253 | 16x |
if (isTRUE(show_samples)) {
|
| 254 | 2x |
p <- p + ggplot2::geom_line(data = long_g, ggplot2::aes(x = qnum, y = tsallis, group = sample, color = group), alpha = 0.25) |
| 255 |
} |
|
| 256 | ||
| 257 | 16x |
p <- p + |
| 258 | 16x |
ggplot2::geom_ribbon(data = stats_df, ggplot2::aes(x = qnum, ymin = median - IQR / 2, ymax = median + IQR / 2, fill = group), alpha = 0.2, inherit.aes = FALSE) + |
| 259 | 16x |
ggplot2::geom_line(data = stats_df, ggplot2::aes(x = qnum, y = median, color = group), linewidth = 1.3) + |
| 260 | 16x |
ggplot2::labs(title = paste0(sel, ": Tsallis entropy q-curve profile"), x = "q value", y = "Tsallis entropy", color = "Group", fill = "Group") + |
| 261 | 16x |
ggplot2::scale_color_discrete(name = "Group") + ggplot2::scale_fill_discrete(name = "Group") + |
| 262 | 16x |
ggplot2::theme(plot.title = ggplot2::element_text( |
| 263 | 16x |
hjust = 0.5, size = 16, |
| 264 | 16x |
margin = ggplot2::margin(b = 10) |
| 265 |
)) |
|
| 266 | 16x |
p |
| 267 |
} |
|
| 268 | ||
| 269 |
# Return single ggplot for single gene, or a named list of ggplots for multiple genes |
|
| 270 | 11x |
if (length(genes) == 1) {
|
| 271 | 7x |
return(make_plot_for_gene(genes)) |
| 272 |
} |
|
| 273 | ||
| 274 | 4x |
plots <- lapply(genes, make_plot_for_gene) |
| 275 | 4x |
names(plots) <- genes |
| 276 | 4x |
plots |
| 277 |
} |
|
| 278 | ||
| 279 |
#' Plot diversity distributions (density) by sample type |
|
| 280 |
#' @param se A `SummarizedExperiment` returned by `calculate_diversity`. |
|
| 281 |
#' @param assay_name Name of the assay to use (default: "diversity"). |
|
| 282 |
#' @param sample_type_col Optional column name in `colData(se)` with sample |
|
| 283 |
#' types. If missing, sample types are inferred from column names (suffix after |
|
| 284 |
#' the last underscore) or set to 'Group'. |
|
| 285 |
#' @return A `ggplot` object with layered density plots. |
|
| 286 |
#' @importFrom ggplot2 ggplot aes geom_density facet_grid scale_color_manual |
|
| 287 |
#' guides theme_minimal labs |
|
| 288 |
#' @export |
|
| 289 |
#' @examples |
|
| 290 |
#' data("tcga_brca_luma", package = "TSENAT")
|
|
| 291 |
#' rc <- as.matrix(tcga_brca_luma[1:100, -1, drop = FALSE]) |
|
| 292 |
#' gs <- tcga_brca_luma[1:100, 1] |
|
| 293 |
#' se <- calculate_diversity(rc, gs, q = 0.1, norm = FALSE) |
|
| 294 |
#' # Manually set sample_type in colData for plotting |
|
| 295 |
#' SummarizedExperiment::colData(se)$sample_type <- |
|
| 296 |
#' factor(gsub(".*_", "", colnames(rc)))
|
|
| 297 |
#' plot_diversity_density(se) |
|
| 298 |
plot_diversity_density <- function( |
|
| 299 |
se, |
|
| 300 |
assay_name = "diversity", |
|
| 301 |
sample_type_col = NULL |
|
| 302 |
) {
|
|
| 303 | 5x |
require_pkgs(c("ggplot2", "tidyr", "dplyr", "SummarizedExperiment"))
|
| 304 |
|
|
| 305 |
# For plot_diversity_density, sample_type is required for faceting |
|
| 306 |
# Determine which column to use for sample_type |
|
| 307 | 5x |
if (is.null(sample_type_col)) {
|
| 308 |
# Check if "sample_type" exists in colData as a fallback |
|
| 309 | 4x |
if (!("sample_type" %in% colnames(SummarizedExperiment::colData(se)))) {
|
| 310 | 1x |
stop("sample_type column not found in data", call. = FALSE)
|
| 311 |
} |
|
| 312 | 3x |
sample_type_col <- "sample_type" |
| 313 | 1x |
} else if (!(sample_type_col %in% colnames(SummarizedExperiment::colData(se)))) {
|
| 314 | ! |
stop(sprintf("Column '%s' not found in colData.", sample_type_col), call. = FALSE)
|
| 315 |
} |
|
| 316 |
|
|
| 317 |
# Check for all NA sample_type values |
|
| 318 | 4x |
st_col <- SummarizedExperiment::colData(se)[[sample_type_col]] |
| 319 | 4x |
if (all(is.na(st_col))) {
|
| 320 | 1x |
stop("All sample_type values are NA. Cannot create faceted plot.", call. = FALSE)
|
| 321 |
} |
|
| 322 |
|
|
| 323 | 3x |
long <- get_assay_long( |
| 324 | 3x |
se, |
| 325 | 3x |
assay_name = assay_name, |
| 326 | 3x |
value_name = "diversity", |
| 327 | 3x |
sample_type_col = sample_type_col |
| 328 |
) |
|
| 329 | ||
| 330 |
# Ensure sample_type column exists and has no NA values |
|
| 331 | 3x |
if (!("sample_type" %in% colnames(long))) {
|
| 332 | ! |
stop("sample_type column not found in data", call. = FALSE)
|
| 333 |
} |
|
| 334 | 3x |
if (all(is.na(long$sample_type))) {
|
| 335 | ! |
stop("All sample_type values are NA. Cannot create faceted plot.", call. = FALSE)
|
| 336 |
} |
|
| 337 | ||
| 338 | 3x |
ggplot2::ggplot( |
| 339 | 3x |
long, |
| 340 | 3x |
ggplot2::aes(x = diversity, group = sample, color = sample_type) |
| 341 |
) + |
|
| 342 | 3x |
ggplot2::geom_density(alpha = 0.3) + |
| 343 | 3x |
ggplot2::facet_grid(. ~ sample_type) + |
| 344 | 3x |
ggplot2::scale_color_manual(values = c("black", "darkorchid4")) +
|
| 345 | 3x |
ggplot2::guides(color = "none") + |
| 346 | 3x |
ggplot2::theme_minimal(base_size = 14) + |
| 347 | 3x |
ggplot2::labs(x = "Diversity values", y = "Density") |
| 348 |
} |
|
| 349 | ||
| 350 | ||
| 351 |
#' Plot violin of per-gene mean diversity by sample type |
|
| 352 |
#' @importFrom magrittr %>% |
|
| 353 |
#' @param se A `SummarizedExperiment` returned by `calculate_diversity`. |
|
| 354 |
#' @param assay_name Name of the assay to use (default: "diversity"). |
|
| 355 |
#' @param sample_type_col Optional column name in `colData(se)` containing |
|
| 356 |
#' sample types. |
|
| 357 |
#' @return A `ggplot` violin plot object. |
|
| 358 |
#' @export |
|
| 359 |
#' @examples |
|
| 360 |
#' data("tcga_brca_luma", package = "TSENAT")
|
|
| 361 |
#' rc <- as.matrix(tcga_brca_luma[1:100, -1, drop = FALSE]) |
|
| 362 |
#' gs <- tcga_brca_luma[1:100, 1] |
|
| 363 |
#' se <- calculate_diversity(rc, gs, q = 0.1, norm = FALSE) |
|
| 364 |
#' plot_mean_violin(se) |
|
| 365 |
plot_mean_violin <- function( |
|
| 366 |
se, |
|
| 367 |
assay_name = "diversity", |
|
| 368 |
sample_type_col = NULL |
|
| 369 |
) {
|
|
| 370 | 3x |
require_pkgs(c("ggplot2", "dplyr", "SummarizedExperiment", "tidyr"))
|
| 371 | 3x |
long <- get_assay_long( |
| 372 | 3x |
se, |
| 373 | 3x |
assay_name = assay_name, |
| 374 | 3x |
value_name = "diversity", |
| 375 | 3x |
sample_type_col = sample_type_col |
| 376 |
) |
|
| 377 | ||
| 378 | 3x |
tmp <- as.data.frame(long) |
| 379 | 3x |
plot_df <- stats::aggregate( |
| 380 | 3x |
diversity ~ sample_type + Gene, |
| 381 | 3x |
data = tmp, |
| 382 | 3x |
FUN = function(x) mean(x, na.rm = TRUE) |
| 383 |
) |
|
| 384 | 3x |
colnames(plot_df)[colnames(plot_df) == "diversity"] <- "value" |
| 385 | ||
| 386 | 3x |
ggplot2::ggplot( |
| 387 | 3x |
plot_df, |
| 388 | 3x |
ggplot2::aes(x = sample_type, y = value, fill = sample_type) |
| 389 |
) + |
|
| 390 | 3x |
ggplot2::geom_violin(alpha = 0.6) + |
| 391 | 3x |
ggplot2::coord_flip() + |
| 392 | 3x |
ggplot2::theme_minimal(base_size = 14) + |
| 393 | 3x |
ggplot2::labs(x = "Samples", y = "Diversity") + |
| 394 | 3x |
ggplot2::scale_fill_viridis_d(name = "Group") |
| 395 |
} |
|
| 396 | ||
| 397 | ||
| 398 |
# Core MA plotting implementation documentation moved to internal block |
|
| 399 |
#' Plot MA using Tsallis-based fold changes |
|
| 400 |
#' |
|
| 401 |
#' Wrapper around `plot_ma(..., type = "tsallis")` for convenience and |
|
| 402 |
#' clearer API separation. |
|
| 403 |
#' |
|
| 404 |
#' @param x Data.frame from `calculate_difference()`. |
|
| 405 |
#' @param sig_alpha Numeric significance threshold for adjusted p-values (default: 0.05). |
|
| 406 |
#' @param x_label Optional x-axis label passed to `plot_ma`. |
|
| 407 |
#' @param y_label Optional y-axis label passed to `plot_ma`. |
|
| 408 |
#' @param title Optional plot title passed to `plot_ma`. |
|
| 409 |
#' @param ... Additional arguments passed to `plot_ma()`. |
|
| 410 |
#' @return A `ggplot2` object representing the MA plot. |
|
| 411 |
#' @examples |
|
| 412 |
#' x <- data.frame(genes = paste0("g", seq_len(5)), mean = runif(5), log2_fold_change = rnorm(5))
|
|
| 413 |
#' plot_ma_tsallis(x) |
|
| 414 |
#' @export |
|
| 415 |
plot_ma_tsallis <- function(x, sig_alpha = 0.05, x_label = NULL, y_label = NULL, title = NULL, ...) {
|
|
| 416 | 4x |
title_use <- title %||% "Tsallis-based MA plot" |
| 417 | 4x |
x_label_use <- x_label %||% "mean_difference" |
| 418 | 4x |
y_label_use <- y_label %||% "Log10 fold-change of entropy" |
| 419 | 4x |
.plot_ma_core(x, fc_df = NULL, sig_alpha = sig_alpha, x_label = x_label_use, y_label = y_label_use, title = title_use) |
| 420 |
} |
|
| 421 | ||
| 422 | ||
| 423 |
#' Plot MA using expression/readcount-based fold changes |
|
| 424 |
#' |
|
| 425 |
#' Wrapper around `plot_ma(..., type = "expression")` that accepts a |
|
| 426 |
#' `SummarizedExperiment` or precomputed fold-change `data.frame`. |
|
| 427 |
#' |
|
| 428 |
#' @param x Data.frame from `calculate_difference()`. |
|
| 429 |
#' @param se A `SummarizedExperiment` or data.frame supplying readcounts or precomputed fold changes. |
|
| 430 |
#' @param samples Optional sample grouping vector (passed to `plot_ma`). |
|
| 431 |
#' @param control Control level name (passed to `plot_ma`). |
|
| 432 |
#' @param fc_method Aggregation method for fold-change calculation (passed to `plot_ma`). |
|
| 433 |
#' @param pseudocount Pseudocount added when computing log ratios (passed to `plot_ma`). |
|
| 434 |
#' @param sig_alpha Numeric significance threshold for adjusted p-values (default: 0.05). |
|
| 435 |
#' @param x_label Optional x-axis label passed to `plot_ma`. |
|
| 436 |
#' @param y_label Optional y-axis label passed to `plot_ma`. |
|
| 437 |
#' @param title Optional plot title passed to `plot_ma`. |
|
| 438 |
#' @param ... Additional arguments passed to `plot_ma()`. |
|
| 439 |
#' @return A `ggplot2` object representing the MA plot. |
|
| 440 |
#' @examples |
|
| 441 |
#' x <- data.frame(genes = paste0("g", seq_len(5)), mean = runif(5))
|
|
| 442 |
#' fc <- data.frame(genes = paste0("g", seq_len(5)), log2_fold_change = rnorm(5))
|
|
| 443 |
#' plot_ma_expression(x, se = fc) |
|
| 444 |
#' @export |
|
| 445 |
plot_ma_expression <- function( |
|
| 446 |
x, |
|
| 447 |
se, |
|
| 448 |
samples = NULL, |
|
| 449 |
control = NULL, |
|
| 450 |
fc_method = "median", |
|
| 451 |
pseudocount = 0, |
|
| 452 |
sig_alpha = 0.05, |
|
| 453 |
x_label = NULL, |
|
| 454 |
y_label = NULL, |
|
| 455 |
title = NULL, |
|
| 456 |
... |
|
| 457 |
) {
|
|
| 458 | 2x |
title_use <- title %||% "Readcounts-based MA plot" |
| 459 | 2x |
x_label_use <- x_label %||% "Mean difference" |
| 460 | 2x |
y_label_use <- y_label %||% "Log10 fold-change of counts" |
| 461 | 2x |
plot_ma_expression_impl( |
| 462 | 2x |
x, |
| 463 | 2x |
se = se, |
| 464 | 2x |
samples = samples, |
| 465 | 2x |
control = control, |
| 466 | 2x |
fc_method = fc_method, |
| 467 | 2x |
pseudocount = pseudocount, |
| 468 | 2x |
sig_alpha = sig_alpha, |
| 469 | 2x |
x_label = x_label_use, |
| 470 | 2x |
y_label = y_label_use, |
| 471 | 2x |
title = title_use, |
| 472 |
... |
|
| 473 |
) |
|
| 474 |
} |
|
| 475 | ||
| 476 | ||
| 477 |
# Core MA plotting implementation used by wrappers above. Accepts a |
|
| 478 |
# differential results `x` (data.frame) and an optional `fc_df` with |
|
| 479 |
# fold-changes (genes as rownames or a `genes` column). Returns a |
|
| 480 |
# `ggplot` MA-plot. |
|
| 481 |
#' Core MA plotting implementation (internal) |
|
| 482 |
#' |
|
| 483 |
#' This is an internal helper used by `plot_ma_tsallis()` and |
|
| 484 |
#' `plot_ma_expression()`. It is documented here for developers but |
|
| 485 |
#' is not exported. |
|
| 486 |
#' @noRd |
|
| 487 |
.plot_ma_core <- function(x, |
|
| 488 |
fc_df = NULL, |
|
| 489 |
diff_res = NULL, |
|
| 490 |
sig_alpha = 0.05, |
|
| 491 |
x_label = NULL, |
|
| 492 |
y_label = NULL, |
|
| 493 |
title = NULL, |
|
| 494 |
...) {
|
|
| 495 | 14x |
require_pkgs(c("ggplot2"))
|
| 496 | ||
| 497 | 14x |
df <- as.data.frame(x, stringsAsFactors = FALSE) |
| 498 |
# ensure a gene identifier column exists |
|
| 499 | 14x |
if (!("genes" %in% colnames(df))) {
|
| 500 | 6x |
if (!is.null(rownames(df))) df$genes <- rownames(df) |
| 501 |
} |
|
| 502 | ||
| 503 |
# If an external fc_df is provided, merge fold values into df |
|
| 504 | 14x |
if (!is.null(fc_df)) {
|
| 505 | 6x |
fdf <- as.data.frame(fc_df, stringsAsFactors = FALSE) |
| 506 | ! |
if (!("genes" %in% colnames(fdf)) && !is.null(rownames(fdf))) fdf$genes <- rownames(fdf)
|
| 507 | 6x |
if (!("log2_fold_change" %in% colnames(fdf))) {
|
| 508 | 1x |
stop("Provided `fc_df` must contain 'log2_fold_change' column")
|
| 509 |
} |
|
| 510 | 5x |
df <- merge(df, fdf[, c("genes", "log2_fold_change")], by = "genes", all.x = TRUE, suffixes = c("", ".fc"))
|
| 511 |
# prefer fc_df fold values when present |
|
| 512 | 3x |
if ("log2_fold_change.fc" %in% colnames(df)) df$log2_fold_change <- ifelse(!is.na(df$log2_fold_change.fc), df$log2_fold_change.fc, df$log2_fold_change)
|
| 513 |
} |
|
| 514 | ||
| 515 |
# Detect fold-change column |
|
| 516 | 13x |
fold_candidates <- c("log2_fold_change", "logFC", "fold", "estimate_interaction", "fold_change")
|
| 517 | 13x |
fold_col <- intersect(fold_candidates, colnames(df)) |
| 518 | 1x |
if (length(fold_col) == 0) stop("Could not find a fold-change column in input")
|
| 519 | 12x |
fold_col <- fold_col[1] |
| 520 | ||
| 521 |
# Detect mean/average columns for x-axis |
|
| 522 | 12x |
mean_cols <- grep("_mean$|_median$", colnames(df), value = TRUE)
|
| 523 | 12x |
if (length(mean_cols) >= 2) {
|
| 524 |
# ensure the two chosen columns are consistent (both _mean or both _median) |
|
| 525 | 3x |
c1 <- mean_cols[1] |
| 526 | 3x |
c2 <- mean_cols[2] |
| 527 | 3x |
is_mean1 <- grepl("_mean$", c1)
|
| 528 | 3x |
is_mean2 <- grepl("_mean$", c2)
|
| 529 | 3x |
is_med1 <- grepl("_median$", c1)
|
| 530 | 3x |
is_med2 <- grepl("_median$", c2)
|
| 531 | 3x |
if (!((is_mean1 && is_mean2) || (is_med1 && is_med2))) {
|
| 532 | 1x |
stop("Could not find two mean or two median columns")
|
| 533 |
} |
|
| 534 | 2x |
xvals <- rowMeans(df[, mean_cols[seq_len(2)], |
| 535 | 2x |
drop = FALSE |
| 536 | 2x |
], na.rm = TRUE) |
| 537 | 2x |
x_label <- x_label %||% paste0(mean_cols[1], " vs ", mean_cols[2]) |
| 538 | 9x |
} else if (length(mean_cols) == 1) {
|
| 539 | ! |
xvals <- as.numeric(df[[mean_cols[1]]]) |
| 540 | ! |
x_label <- x_label %||% mean_cols[1] |
| 541 | 9x |
} else if ("mean" %in% colnames(df)) {
|
| 542 | 8x |
xvals <- as.numeric(df$mean) |
| 543 | 8x |
x_label <- x_label %||% "Mean" |
| 544 |
} else {
|
|
| 545 |
# fallback: use rank or index |
|
| 546 | 1x |
xvals <- seq_len(nrow(df)) |
| 547 | 1x |
x_label <- x_label %||% "Index" |
| 548 |
} |
|
| 549 | ||
| 550 | 11x |
yvals <- as.numeric(df[[fold_col]]) |
| 551 | ||
| 552 |
# p-value / adjusted p-value detection |
|
| 553 | 11x |
padj_candidates <- c("adjusted_p_values", "adj_p_value", "adj_p", "padj", "p.adjust")
|
| 554 | 11x |
padj_col <- intersect(padj_candidates, colnames(df)) |
| 555 | 11x |
padj_col <- if (length(padj_col)) padj_col[1] else NULL |
| 556 | ||
| 557 | 11x |
padj <- if (!is.null(padj_col)) as.numeric(df[[padj_col]]) else rep(1, length(yvals)) |
| 558 | 11x |
padj[is.na(padj)] <- 1 |
| 559 | ||
| 560 | 11x |
sig_flag <- ifelse(abs(yvals) > 0 & padj < sig_alpha, "significant", "non-significant") |
| 561 | ||
| 562 | 11x |
plot_df <- data.frame(genes = df$genes, x = xvals, y = yvals, padj = padj, significant = sig_flag, stringsAsFactors = FALSE) |
| 563 | ||
| 564 | 11x |
prep <- .tsenat_prepare_ma_plot_df(df, fold_col = fold_col, mean_cols = mean_cols, x_label = x_label, y_label = y_label) |
| 565 | 11x |
plot_df <- prep$plot_df |
| 566 | 11x |
x_label_formatted <- .tsenat_format_label(prep$x_label) |
| 567 | 11x |
y_label_raw <- prep$y_label %||% fold_col |
| 568 | 11x |
y_label_formatted <- .tsenat_format_label(y_label_raw) |
| 569 | 11x |
if (!is.null(y_label_formatted)) {
|
| 570 | 11x |
y_label_formatted <- sub("\\blog2\\b", "log10", y_label_formatted, ignore.case = TRUE)
|
| 571 |
} |
|
| 572 | ||
| 573 | 11x |
p <- ggplot2::ggplot(plot_df, ggplot2::aes(x = x, y = y, color = significant)) + |
| 574 | 11x |
ggplot2::geom_point(alpha = 0.75, size = 3.2) + |
| 575 | 11x |
ggplot2::scale_color_manual( |
| 576 | 11x |
values = c("non-significant" = "grey40", "significant" = "firebrick3"),
|
| 577 | 11x |
guide = "none" |
| 578 |
) + |
|
| 579 | 11x |
ggplot2::theme_minimal(base_size = 14) + |
| 580 | 11x |
ggplot2::labs( |
| 581 | 11x |
title = title %||% "MA plot: mean vs log10 fold-change", |
| 582 | 11x |
x = x_label_formatted, |
| 583 | 11x |
y = y_label_formatted |
| 584 |
) + |
|
| 585 | 11x |
ggplot2::theme( |
| 586 | 11x |
plot.title = ggplot2::element_text(hjust = 0.5, size = 16, face = "plain"), |
| 587 | 11x |
axis.title = ggplot2::element_text(size = 14), |
| 588 | 11x |
axis.text = ggplot2::element_text(size = 12) |
| 589 |
) |
|
| 590 | ||
| 591 | 11x |
p |
| 592 |
} |
|
| 593 | ||
| 594 | ||
| 595 |
# Implementation that computes fold-changes from expression/readcounts |
|
| 596 |
# stored in a `SummarizedExperiment` (or matrix) and forwards to core plotter. |
|
| 597 |
plot_ma_expression_impl <- function( |
|
| 598 |
x, |
|
| 599 |
se, |
|
| 600 |
samples = NULL, |
|
| 601 |
control = NULL, |
|
| 602 |
fc_method = "median", |
|
| 603 |
pseudocount = 0, |
|
| 604 |
sig_alpha = 0.05, |
|
| 605 |
x_label = NULL, |
|
| 606 |
y_label = NULL, |
|
| 607 |
title = NULL, |
|
| 608 |
... |
|
| 609 |
) {
|
|
| 610 | 8x |
require_pkgs(c("SummarizedExperiment"))
|
| 611 | ||
| 612 |
# extract/readcounts |
|
| 613 | 8x |
if (inherits(se, "SummarizedExperiment")) {
|
| 614 | 2x |
counts <- get_readcounts_from_se(se) |
| 615 | 2x |
samples <- infer_samples_from_se(se, samples) |
| 616 | 1x |
if (is.null(samples)) stop("Could not infer 'samples' from SummarizedExperiment; provide `samples`")
|
| 617 | 1x |
control <- validate_control_in_samples(control, samples) |
| 618 | ||
| 619 |
# attempt to map transcripts -> genes and aggregate counts per gene |
|
| 620 | 1x |
tx2g <- get_tx2gene_from_se(se, readcounts_mat = counts) |
| 621 | 1x |
if (!is.null(tx2g) && tx2g$type == "vector") {
|
| 622 | 1x |
mapping <- tx2g$mapping |
| 623 |
# ensure mapping length matches rows |
|
| 624 | 1x |
if (length(mapping) == nrow(counts)) {
|
| 625 |
# aggregate transcript-level counts to gene-level using rowsum |
|
| 626 | 1x |
agg <- rowsum(counts, group = mapping) |
| 627 | 1x |
counts_gene <- as.matrix(agg) |
| 628 |
} else {
|
|
| 629 | ! |
counts_gene <- counts |
| 630 |
} |
|
| 631 |
} else {
|
|
| 632 | ! |
counts_gene <- counts |
| 633 |
} |
|
| 634 | ||
| 635 |
# compute fold-changes using calculate_fc (aggregates per-group) |
|
| 636 | 1x |
fc_res <- calculate_fc( |
| 637 | 1x |
counts_gene, |
| 638 | 1x |
samples, |
| 639 | 1x |
control, |
| 640 | 1x |
method = fc_method, |
| 641 | 1x |
pseudocount = pseudocount |
| 642 |
) |
|
| 643 |
# ensure genes column exists |
|
| 644 | 1x |
if (is.null(rownames(fc_res))) {
|
| 645 | ! |
fc_res$genes <- seq_len(nrow(fc_res)) |
| 646 |
} else {
|
|
| 647 | 1x |
fc_res$genes <- rownames(fc_res) |
| 648 |
} |
|
| 649 | 1x |
return(.plot_ma_core( |
| 650 | 1x |
x, |
| 651 | 1x |
fc_df = fc_res, |
| 652 | 1x |
sig_alpha = sig_alpha, |
| 653 | 1x |
x_label = x_label, |
| 654 | 1x |
y_label = y_label, |
| 655 | 1x |
title = title, |
| 656 |
... |
|
| 657 |
)) |
|
| 658 |
} |
|
| 659 | ||
| 660 |
# If se is provided as a matrix/data.frame of precomputed fold changes |
|
| 661 | 6x |
if (is.matrix(se) || is.data.frame(se)) {
|
| 662 | 5x |
fc_res <- as.data.frame(se, stringsAsFactors = FALSE) |
| 663 | 2x |
if (!("log2_fold_change" %in% colnames(fc_res))) stop("`se` data.frame must contain 'log2_fold_change' column when providing precomputed fold changes")
|
| 664 | 1x |
if (!("genes" %in% colnames(fc_res)) && !is.null(rownames(fc_res))) fc_res$genes <- rownames(fc_res)
|
| 665 | 3x |
return(.plot_ma_core(x, fc_df = fc_res, sig_alpha = sig_alpha, x_label = x_label, y_label = y_label, title = title, ...)) |
| 666 |
} |
|
| 667 | ||
| 668 | 1x |
stop("Unsupported 'se' argument for plot_ma_expression_impl")
|
| 669 |
} |
|
| 670 | ||
| 671 | ||
| 672 |
#' Plot median +- IQR of Tsallis entropy across q values by group |
|
| 673 |
#' |
|
| 674 |
#' This reproduces the `tsallis-q-curve-mean-sd` plot from the vignette: for |
|
| 675 |
#' each q value, compute per-gene Tsallis entropy per sample, summarize across |
|
| 676 |
#' genes by group (median and IQR) and plot median with a ribbon spanning |
|
| 677 |
#' median +- IQR/2. |
|
| 678 |
#' @param se A `SummarizedExperiment` returned by `calculate_diversity`. |
|
| 679 |
#' @param assay_name Name of the assay to use (default: "diversity"). |
|
| 680 |
#' @param sample_type_col Column name in `colData(se)` containing sample |
|
| 681 |
#' type labels (default: "sample_type"). |
|
| 682 |
#' @return A `ggplot` object showing median +- IQR across q values by group. |
|
| 683 |
#' @export |
|
| 684 |
#' @examples |
|
| 685 |
#' data("tcga_brca_luma", package = "TSENAT")
|
|
| 686 |
#' rc <- as.matrix(tcga_brca_luma[1:40, -1, drop = FALSE]) |
|
| 687 |
#' gs <- tcga_brca_luma[1:40, 1] |
|
| 688 |
#' se <- calculate_diversity(rc, gs, |
|
| 689 |
#' q = seq(0.01, 0.1, by = 0.03), norm = FALSE |
|
| 690 |
#' ) |
|
| 691 |
#' p <- plot_tsallis_q_curve(se) |
|
| 692 |
#' p |
|
| 693 |
plot_tsallis_q_curve <- function( |
|
| 694 |
se, |
|
| 695 |
assay_name = "diversity", |
|
| 696 |
sample_type_col = "sample_type" |
|
| 697 |
) {
|
|
| 698 |
# SE-first API: require a SummarizedExperiment with per-column sample |
|
| 699 |
# type mapping in `colData(se)[, sample_type_col]` (or allow a single |
|
| 700 |
# group dataset where `sample_type` is omitted). |
|
| 701 | 5x |
if (inherits(se, "SummarizedExperiment")) {
|
| 702 | 4x |
require_pkgs(c("ggplot2", "dplyr", "tidyr", "SummarizedExperiment"))
|
| 703 | 4x |
long <- prepare_tsallis_long(se, assay_name = assay_name, sample_type_col = sample_type_col) |
| 704 | 4x |
y_label <- "Tsallis entropy (S_q)" |
| 705 | 1x |
if (nrow(long) == 0) stop("No tsallis values found in SummarizedExperiment")
|
| 706 |
# long$q may be a factor; convert to numeric for plotting |
|
| 707 | 3x |
long$qnum <- as.numeric(as.character(long$q)) |
| 708 | 3x |
stats_df <- dplyr::summarise(dplyr::group_by(long, group, qnum), |
| 709 | 3x |
median = median(tsallis, na.rm = TRUE), |
| 710 | 3x |
IQR = stats::IQR(tsallis, na.rm = TRUE), .groups = "drop" |
| 711 |
) |
|
| 712 | 3x |
p <- ggplot2::ggplot( |
| 713 | 3x |
stats_df, |
| 714 | 3x |
ggplot2::aes(x = qnum, y = median, color = group, fill = group) |
| 715 |
) + |
|
| 716 | 3x |
ggplot2::geom_line(linewidth = 1.3) + |
| 717 | 3x |
ggplot2::geom_ribbon( |
| 718 | 3x |
ggplot2::aes(ymin = median - IQR / 2, ymax = median + IQR / 2), |
| 719 | 3x |
alpha = 0.2, |
| 720 | 3x |
color = NA |
| 721 |
) + |
|
| 722 | 3x |
ggplot2::theme_minimal(base_size = 14) + |
| 723 | 3x |
ggplot2::labs( |
| 724 | 3x |
title = "Global Tsallis q-curve: entropy across all genes", |
| 725 | 3x |
x = "q value", |
| 726 | 3x |
y = y_label, |
| 727 | 3x |
color = "Group", |
| 728 | 3x |
fill = "Group" |
| 729 |
) + |
|
| 730 | 3x |
ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, face = "plain", size = 16)) |
| 731 |
# use default discrete ggplot2 colours (not viridis) |
|
| 732 | 3x |
p <- p + ggplot2::scale_color_discrete(name = "Group") + |
| 733 | 3x |
ggplot2::scale_fill_discrete(name = "Group") |
| 734 |
# If there is only a single group present, hide the legend/Group label |
|
| 735 | 3x |
if (length(unique(stats_df$group)) == 1) {
|
| 736 | 1x |
p <- p + ggplot2::theme(legend.position = "none") |
| 737 |
} |
|
| 738 | ||
| 739 | 3x |
return(p) |
| 740 |
} |
|
| 741 | ||
| 742 |
# Matrix/data.frame input is no longer supported for this plot function. |
|
| 743 | 1x |
stop( |
| 744 | 1x |
"plot_tsallis_q_curve requires a SummarizedExperiment from calculate_diversity." |
| 745 |
) |
|
| 746 |
} |
|
| 747 |
#' Violin plot of Tsallis entropy for multiple q values |
|
| 748 | ||
| 749 |
#' @param se A `SummarizedExperiment` returned by `calculate_diversity` with |
|
| 750 |
#' multiple q values (column names contain `_q=`). |
|
| 751 |
#' @param assay_name Name of the assay to use (default: "diversity"). |
|
| 752 |
#' @return A `ggplot` violin plot object faceted/colored by group and q. |
|
| 753 |
#' @export |
|
| 754 |
#' @examples |
|
| 755 |
#' data("tcga_brca_luma", package = "TSENAT")
|
|
| 756 |
#' rc <- as.matrix(tcga_brca_luma[1:20, -1, drop = FALSE]) |
|
| 757 |
#' gs <- tcga_brca_luma[1:20, 1] |
|
| 758 |
#' se <- calculate_diversity(rc, gs, q = c(0.1, 1), norm = TRUE) |
|
| 759 |
#' plot_tsallis_violin_multq(se) |
|
| 760 |
plot_tsallis_violin_multq <- function(se, assay_name = "diversity") {
|
|
| 761 | 2x |
require_pkgs(c("ggplot2", "tidyr", "dplyr"))
|
| 762 | 2x |
long <- prepare_tsallis_long(se, assay_name = assay_name) |
| 763 | ||
| 764 | 2x |
ggplot2::ggplot( |
| 765 | 2x |
long, |
| 766 | 2x |
ggplot2::aes(x = q, y = tsallis, fill = group) |
| 767 |
) + |
|
| 768 | 2x |
ggplot2::geom_violin( |
| 769 | 2x |
alpha = 0.5, width = 0.9, |
| 770 | 2x |
position = ggplot2::position_dodge(width = 0.8) |
| 771 |
) + |
|
| 772 | 2x |
ggplot2::geom_boxplot( |
| 773 | 2x |
width = 0.15, |
| 774 | 2x |
position = ggplot2::position_dodge(width = 0.8), |
| 775 | 2x |
outlier.shape = NA |
| 776 |
) + |
|
| 777 | 2x |
ggplot2::theme_minimal(base_size = 14) + |
| 778 | 2x |
ggplot2::scale_fill_discrete(name = "Group") + |
| 779 | 2x |
ggplot2::labs( |
| 780 | 2x |
title = "Violin plot: Tsallis entropy distribution across multiple q values", |
| 781 | 2x |
x = "q value", |
| 782 | 2x |
y = "Tsallis entropy", |
| 783 | 2x |
fill = "Group" |
| 784 |
) + |
|
| 785 | 2x |
ggplot2::theme(plot.title = ggplot2::element_text( |
| 786 | 2x |
hjust = 0.5, size = 16, face = "plain", |
| 787 | 2x |
margin = ggplot2::margin(b = 10) |
| 788 |
)) |
|
| 789 |
} |
|
| 790 |
#' Density plot of Tsallis entropy for multiple q values |
|
| 791 | ||
| 792 |
#' @param se A `SummarizedExperiment` returned by `calculate_diversity` with |
|
| 793 |
#' multiple q values (column names contain `_q=`). |
|
| 794 |
#' @param assay_name Name of the assay to use (default: "diversity"). |
|
| 795 |
#' @return A `ggplot` density plot object faceted by q and colored by group. |
|
| 796 |
#' @export |
|
| 797 |
#' @examples |
|
| 798 |
#' data("tcga_brca_luma", package = "TSENAT")
|
|
| 799 |
#' rc <- as.matrix(tcga_brca_luma[1:20, -1, drop = FALSE]) |
|
| 800 |
#' gs <- tcga_brca_luma[1:20, 1] |
|
| 801 |
#' se <- calculate_diversity(rc, gs, q = c(0.1, 1), norm = FALSE) |
|
| 802 |
#' plot_tsallis_density_multq(se) |
|
| 803 |
plot_tsallis_density_multq <- function(se, assay_name = "diversity") {
|
|
| 804 | 2x |
require_pkgs(c("ggplot2", "tidyr", "dplyr"))
|
| 805 | 2x |
long <- prepare_tsallis_long(se, assay_name = assay_name) |
| 806 | ||
| 807 | 2x |
ggplot2::ggplot( |
| 808 | 2x |
long, |
| 809 | 2x |
ggplot2::aes(x = tsallis, color = group, fill = group) |
| 810 |
) + |
|
| 811 | 2x |
ggplot2::geom_density(alpha = 0.3) + |
| 812 | 2x |
ggplot2::facet_wrap(~q, scales = "free_y") + |
| 813 | 2x |
ggplot2::theme_minimal(base_size = 14) + |
| 814 | 2x |
ggplot2::scale_color_discrete(name = "Group") + |
| 815 | 2x |
ggplot2::scale_fill_discrete(name = "Group") + |
| 816 | 2x |
ggplot2::labs( |
| 817 | 2x |
title = "Density plot: Tsallis entropy distribution across multiple q values", |
| 818 | 2x |
x = "Tsallis entropy", |
| 819 | 2x |
y = "Density", |
| 820 | 2x |
color = "Group", |
| 821 | 2x |
fill = "Group" |
| 822 |
) + |
|
| 823 | 2x |
ggplot2::theme(plot.title = ggplot2::element_text( |
| 824 | 2x |
hjust = 0.5, size = 16, face = "plain", |
| 825 | 2x |
margin = ggplot2::margin(b = 10) |
| 826 |
)) |
|
| 827 |
} |
|
| 828 | ||
| 829 | ||
| 830 |
#' Volcano plot for differential results |
|
| 831 |
#' |
|
| 832 |
#' Create a volcano plot showing fold-change (x-axis) versus adjusted |
|
| 833 |
#' p-value significance (y-axis). The function auto-detects a suitable x-axis |
|
| 834 |
#' column if one is not provided and expects an adjusted p-value column for |
|
| 835 |
#' significance coloring. |
|
| 836 |
#' |
|
| 837 |
#' @param diff_df Data.frame from `test_differential()` or similar results. |
|
| 838 |
#' Should contain p-values and optionally fold-change columns. |
|
| 839 |
#' @param x_col Optional column name for the x-axis. If `NULL`, the function |
|
| 840 |
#' will try to auto-detect a suitable numeric column (excluding p-values). |
|
| 841 |
#' @param padj_col Adjusted p-value column name (default: "adjusted_p_values"). |
|
| 842 |
#' @param label_thresh Fold-change threshold used to annotate points (default: 0.1). |
|
| 843 |
#' @param padj_thresh Adjusted p-value cutoff for significance (default: 0.05). |
|
| 844 |
#' @param top_n Number of top significant genes to label (default: 5). |
|
| 845 |
#' @param title Optional plot title; if `NULL` a default title is used. |
|
| 846 |
#' |
|
| 847 |
#' @return A `ggplot2` object. |
|
| 848 |
#' @export |
|
| 849 |
#' @examples |
|
| 850 |
#' df <- data.frame( |
|
| 851 |
#' gene = paste0("g", seq_len(10)),
|
|
| 852 |
#' mean_difference = runif(10), |
|
| 853 |
#' adjusted_p_values = runif(10) |
|
| 854 |
#' ) |
|
| 855 |
#' # plot_volcano(df, x_col = "mean_difference", padj_col = "adjusted_p_values") |
|
| 856 |
plot_volcano <- function( |
|
| 857 |
diff_df, |
|
| 858 |
x_col = NULL, |
|
| 859 |
padj_col = "adjusted_p_values", |
|
| 860 |
label_thresh = 0.1, |
|
| 861 |
padj_thresh = 0.05, |
|
| 862 |
top_n = 5, |
|
| 863 |
title = NULL |
|
| 864 |
) {
|
|
| 865 | 5x |
if (!requireNamespace("ggplot2", quietly = TRUE)) {
|
| 866 | ! |
stop("ggplot2 required")
|
| 867 |
} |
|
| 868 | ||
| 869 | 5x |
prep_volcano <- .tsenat_prepare_volcano_df(diff_df = diff_df, x_col = x_col, padj_col = padj_col, label_thresh = label_thresh, padj_thresh = padj_thresh, title = title) |
| 870 | 4x |
df <- prep_volcano$df |
| 871 | 4x |
x_col <- prep_volcano$x_col |
| 872 | 4x |
padj_col <- prep_volcano$padj_col |
| 873 | 4x |
x_label_formatted <- prep_volcano$x_label_formatted |
| 874 | 4x |
padj_label_formatted <- prep_volcano$padj_label_formatted |
| 875 | 4x |
title_use <- prep_volcano$title_use |
| 876 | ||
| 877 | 4x |
p <- ggplot2::ggplot( |
| 878 | 4x |
df, |
| 879 | 4x |
ggplot2::aes(x = xval, y = -log10(padj), color = significant) |
| 880 |
) + |
|
| 881 | 4x |
ggplot2::geom_point(alpha = 0.75, size = 3.4) + |
| 882 | 4x |
ggplot2::scale_color_manual( |
| 883 | 4x |
values = c("non-significant" = "black", "significant" = "red"),
|
| 884 | 4x |
guide = "none" |
| 885 |
) + |
|
| 886 | 4x |
ggplot2::geom_hline( |
| 887 | 4x |
yintercept = -log10(padj_thresh), |
| 888 | 4x |
linetype = "dashed", |
| 889 | 4x |
color = "gray50" |
| 890 |
) + |
|
| 891 | 4x |
ggplot2::geom_vline( |
| 892 | 4x |
xintercept = c(-label_thresh, label_thresh), |
| 893 | 4x |
linetype = "dashed", |
| 894 | 4x |
color = "gray50" |
| 895 |
) + |
|
| 896 | 4x |
ggplot2::theme_minimal(base_size = 14) + |
| 897 | 4x |
ggplot2::labs( |
| 898 | 4x |
title = title_use, |
| 899 | 4x |
x = x_label_formatted, |
| 900 | 4x |
y = paste0("-Log10(", padj_label_formatted, ")")
|
| 901 |
) + |
|
| 902 | 4x |
ggplot2::theme( |
| 903 | 4x |
plot.title = ggplot2::element_text(hjust = 0.5, size = 16, face = "plain", margin = ggplot2::margin(b = 4)), |
| 904 | 4x |
axis.title = ggplot2::element_text(size = 14), |
| 905 | 4x |
axis.text = ggplot2::element_text(size = 12) |
| 906 |
) |
|
| 907 | ||
| 908 | 4x |
p |
| 909 |
} |
|
| 910 | ||
| 911 | ||
| 912 |
#' Internal helper to compute fill limits across multiple genes (not exported) |
|
| 913 |
#' @noRd |
|
| 914 |
.compute_transcript_fill_limits <- function(genes, mapping, counts, samples, top_n, agg_fun, pseudocount) {
|
|
| 915 | 5x |
mins <- maxs <- c() |
| 916 | 5x |
for (g in genes) {
|
| 917 | 9x |
txs <- mapping$Transcript[mapping$Gen == g] |
| 918 | 9x |
txs <- intersect(txs, rownames(counts)) |
| 919 | 2x |
if (length(txs) == 0) next |
| 920 | 7x |
if (!is.null(top_n)) txs <- head(txs, top_n) |
| 921 | 7x |
mat <- counts[txs, , drop = FALSE] |
| 922 | 7x |
df_all <- as.data.frame(mat) |
| 923 | 7x |
df_all$tx <- rownames(mat) |
| 924 | 7x |
df_long <- tidyr::pivot_longer(df_all, -tx, names_to = "sample", values_to = "expr") |
| 925 | 7x |
df_long$group <- rep(samples, times = length(txs)) |
| 926 | 7x |
df_summary <- stats::aggregate(expr ~ tx + group, data = df_long, FUN = agg_fun) |
| 927 | 7x |
df_summary$log2expr <- log2(df_summary$expr + pseudocount) |
| 928 | 7x |
mins <- c(mins, min(df_summary$log2expr, na.rm = TRUE)) |
| 929 | 7x |
maxs <- c(maxs, max(df_summary$log2expr, na.rm = TRUE)) |
| 930 |
} |
|
| 931 | 1x |
if (length(mins) == 0) stop("No transcripts found for provided genes")
|
| 932 | 4x |
c(min(mins, na.rm = TRUE), max(maxs, na.rm = TRUE)) |
| 933 |
} |
|
| 934 | ||
| 935 |
#' Internal helper to draw grid layout with title, plots, and legend using base grid |
|
| 936 |
#' @noRd |
|
| 937 |
.draw_transcript_grid <- function(grobs, title, legend_grob, n, heights, to_file = NULL) {
|
|
| 938 |
# If no output file is provided and no graphics device is open, render to a |
|
| 939 |
# temporary pdf device so that plotting in non-interactive sessions does not |
|
| 940 |
# create `Rplots.pdf` in the working directory. |
|
| 941 | 4x |
temp_dev <- FALSE |
| 942 |
# Only open a temporary PDF device when: |
|
| 943 |
# - caller did not supply an output file (`to_file` is NULL), |
|
| 944 |
# - the session is non-interactive, and |
|
| 945 |
# - no graphics device is currently open (dev.cur() == 1) |
|
| 946 | 4x |
if (is.null(to_file) && !interactive() && grDevices::dev.cur() == 1L) {
|
| 947 | ! |
tmp <- tempfile("TSENAT_plot_", fileext = ".pdf")
|
| 948 | ! |
grDevices::pdf(tmp) |
| 949 | ! |
temp_dev <- TRUE |
| 950 |
# Ensure device is closed and temporary file removed on exit |
|
| 951 | ! |
on.exit( |
| 952 |
{
|
|
| 953 | ! |
try(grDevices::dev.off(), silent = TRUE) |
| 954 | ! |
if (file.exists(tmp)) unlink(tmp) |
| 955 |
}, |
|
| 956 | ! |
add = TRUE |
| 957 |
) |
|
| 958 |
} |
|
| 959 | ||
| 960 | 4x |
grid::grid.newpage() |
| 961 | 4x |
grid::pushViewport( |
| 962 | 4x |
grid::viewport(layout = grid::grid.layout(3, n, heights = heights)) |
| 963 |
) |
|
| 964 |
# Title row |
|
| 965 | 4x |
vp_title <- grid::viewport(layout.pos.row = 1, layout.pos.col = seq_len(n)) |
| 966 | 4x |
grid::pushViewport(vp_title) |
| 967 | 4x |
grid::grid.text(title, x = 0.5, gp = grid::gpar(fontsize = 12)) |
| 968 | 4x |
grid::upViewport() |
| 969 |
# Plot rows |
|
| 970 | 4x |
for (i in seq_along(grobs)) {
|
| 971 | 6x |
vp <- grid::viewport(layout.pos.row = 2, layout.pos.col = i) |
| 972 | 6x |
grid::pushViewport(vp) |
| 973 | 6x |
grid::grid.draw(grobs[[i]]) |
| 974 | 6x |
grid::upViewport() |
| 975 |
} |
|
| 976 |
# Legend row |
|
| 977 | 4x |
if (!is.null(legend_grob)) {
|
| 978 | 1x |
vp_leg <- grid::viewport(layout.pos.row = 3, layout.pos.col = seq_len(n)) |
| 979 | 1x |
grid::pushViewport(vp_leg) |
| 980 | 1x |
grid::grid.draw(legend_grob) |
| 981 | 1x |
grid::upViewport() |
| 982 |
} |
|
| 983 | 4x |
grid::upViewport() |
| 984 |
# If caller supplied a file (caller likely opened a device), close it here. |
|
| 985 | 2x |
if (!is.null(to_file)) grDevices::dev.off() |
| 986 | 4x |
invisible(NULL) |
| 987 |
} |
|
| 988 | ||
| 989 |
## Internal helpers for `plot_top_transcripts` refactor |
|
| 990 |
## Create per-gene plot and combine multiple gene plots into final output |
|
| 991 |
.ptt_make_plot_for_gene <- function(gene_single, mapping, counts, samples, top_n, agg_fun, pseudocount, agg_label_unique, fill_limits = NULL) {
|
|
| 992 | 10x |
require_pkgs(c("ggplot2", "tidyr"))
|
| 993 | 10x |
built <- .ptt_build_tx_long(gene_single, mapping, counts, samples, top_n) |
| 994 | 10x |
df_summary <- .ptt_aggregate_df_long(built$df_long, agg_fun, pseudocount) |
| 995 | 10x |
.ptt_build_plot_from_summary(df_summary, agg_label_unique, fill_limits) |
| 996 |
} |
|
| 997 | ||
| 998 |
.ptt_combine_plots <- function(plots, output_file = NULL, agg_label_unique = NULL) {
|
|
| 999 | 7x |
require_pkgs(c("ggplot2"))
|
| 1000 |
# Allow callers to pass a single character second argument as the |
|
| 1001 |
# `agg_label_unique` for convenience (legacy test call patterns). |
|
| 1002 | 7x |
if (is.null(agg_label_unique) && !is.null(output_file) && is.character(output_file) && length(output_file) == 1) {
|
| 1003 | 2x |
agg_label_unique <- output_file |
| 1004 | 2x |
output_file <- NULL |
| 1005 |
} |
|
| 1006 | 7x |
if (requireNamespace("patchwork", quietly = TRUE)) {
|
| 1007 | 7x |
.ptt_combine_patchwork(plots, agg_label_unique) |
| 1008 | ! |
} else if (requireNamespace("cowplot", quietly = TRUE)) {
|
| 1009 | ! |
.ptt_combine_cowplot(plots, output_file = output_file, agg_label_unique = agg_label_unique) |
| 1010 |
} else {
|
|
| 1011 | ! |
.ptt_combine_grid(plots, output_file = output_file, agg_label_unique = agg_label_unique) |
| 1012 |
} |
|
| 1013 |
} |
|
| 1014 | ||
| 1015 |
## Prepare and validate inputs for `plot_top_transcripts` |
|
| 1016 |
.ptt_prepare_inputs <- function(counts, readcounts = NULL, samples = NULL, coldata = NULL, sample_type_col = "sample_type", tx2gene = NULL, res = NULL, top_n = NULL, pseudocount = 0, output_file = NULL, metric = c("median", "mean", "variance", "iqr")) {
|
|
| 1017 |
# handle selecting genes from `res` is left to caller; this function focuses |
|
| 1018 |
# on normalizing counts, samples and tx2gene mapping and preparing agg functions |
|
| 1019 | 21x |
if (inherits(counts, "SummarizedExperiment")) {
|
| 1020 | 1x |
require_pkgs(c("SummarizedExperiment", "S4Vectors"))
|
| 1021 | 1x |
se <- counts |
| 1022 | 1x |
counts_mat <- get_readcounts_from_se(se, readcounts) |
| 1023 | 1x |
counts <- as.matrix(counts_mat) |
| 1024 | 1x |
samples <- infer_samples_from_se(se, samples, sample_type_col = sample_type_col) |
| 1025 | ||
| 1026 | 1x |
if (is.null(tx2gene)) {
|
| 1027 | 1x |
txres <- get_tx2gene_from_se(se, counts) |
| 1028 | 1x |
if (!is.null(txres) && !is.null(txres$mapping)) {
|
| 1029 | 1x |
mapping <- data.frame(Transcript = rownames(counts), Gen = as.character(txres$mapping), stringsAsFactors = FALSE) |
| 1030 | 1x |
tx2gene <- mapping |
| 1031 |
} |
|
| 1032 |
} |
|
| 1033 |
} |
|
| 1034 | ||
| 1035 | 1x |
if (!is.matrix(counts) && !is.data.frame(counts)) stop("`counts` must be a matrix or data.frame with transcripts as rownames")
|
| 1036 | 20x |
counts <- as.matrix(counts) |
| 1037 | 2x |
if (is.null(rownames(counts))) stop("`counts` must have rownames corresponding to transcript identifiers")
|
| 1038 | ||
| 1039 |
# derive samples from coldata if needed |
|
| 1040 | 18x |
if (is.null(samples)) {
|
| 1041 | 6x |
if (!is.null(coldata)) {
|
| 1042 | 5x |
if (is.character(coldata) && length(coldata) == 1) {
|
| 1043 | 1x |
if (!file.exists(coldata)) stop("coldata file not found: ", coldata)
|
| 1044 | 4x |
cdf <- utils::read.delim(coldata, header = TRUE, stringsAsFactors = FALSE) |
| 1045 | ! |
} else if (is.data.frame(coldata)) {
|
| 1046 | ! |
cdf <- coldata |
| 1047 |
} else {
|
|
| 1048 | ! |
stop("`coldata` must be a data.frame or path to a tab-delimited file")
|
| 1049 |
} |
|
| 1050 | ||
| 1051 | 4x |
if (!is.null(rownames(cdf)) && all(colnames(counts) %in% rownames(cdf))) {
|
| 1052 | ! |
samples <- as.character(cdf[colnames(counts), sample_type_col]) |
| 1053 |
} else {
|
|
| 1054 | 4x |
sample_id_cols <- c("sample", "Sample", "sample_id", "id")
|
| 1055 | 4x |
sid <- intersect(sample_id_cols, colnames(cdf)) |
| 1056 | 4x |
if (length(sid) > 0) {
|
| 1057 | 3x |
sid <- sid[1] |
| 1058 | 1x |
if (!all(colnames(counts) %in% as.character(cdf[[sid]]))) stop("coldata sample id column does not match column names of counts")
|
| 1059 | 2x |
row_ix <- match(colnames(counts), as.character(cdf[[sid]])) |
| 1060 | 2x |
samples <- as.character(cdf[[sample_type_col]][row_ix]) |
| 1061 |
} else {
|
|
| 1062 | 1x |
stop("Could not match `coldata` rows to `counts` columns. Provide `samples` or a row-named `coldata`.")
|
| 1063 |
} |
|
| 1064 |
} |
|
| 1065 |
} else {
|
|
| 1066 | 1x |
stop("Either 'samples' or 'coldata' must be provided to determine sample groups")
|
| 1067 |
} |
|
| 1068 |
} |
|
| 1069 | ||
| 1070 |
# normalize tx2gene mapping |
|
| 1071 | 2x |
if (is.null(tx2gene)) stop("`tx2gene` must be provided as a file path or data.frame (or include mapping in metadata of provided SummarizedExperiment)")
|
| 1072 | 12x |
if (is.character(tx2gene) && length(tx2gene) == 1) {
|
| 1073 | 1x |
if (!file.exists(tx2gene)) stop("tx2gene file not found: ", tx2gene)
|
| 1074 | 3x |
mapping <- utils::read.delim(tx2gene, stringsAsFactors = FALSE, header = TRUE) |
| 1075 | 8x |
} else if (is.data.frame(tx2gene)) {
|
| 1076 | 8x |
mapping <- tx2gene |
| 1077 |
} else {
|
|
| 1078 | ! |
stop("`tx2gene` must be provided as a file path or data.frame (or include mapping in metadata of provided SummarizedExperiment)")
|
| 1079 |
} |
|
| 1080 | ||
| 1081 | 1x |
if (!all(c("Transcript", "Gen") %in% colnames(mapping))) stop("tx2gene must have columns 'Transcript' and 'Gen'")
|
| 1082 | ||
| 1083 | ! |
if (!requireNamespace("ggplot2", quietly = TRUE)) stop("ggplot2 required for plotting")
|
| 1084 | ||
| 1085 | 1x |
if (!is.null(samples) && length(samples) != ncol(counts)) stop("Length of `samples` must equal number of columns in `counts`")
|
| 1086 | ||
| 1087 | 9x |
metric_choice <- match.arg(metric) |
| 1088 | 9x |
agg_fun <- switch(metric_choice, |
| 1089 | 9x |
median = function(x) stats::median(x, na.rm = TRUE), |
| 1090 | 9x |
mean = function(x) base::mean(x, na.rm = TRUE), |
| 1091 | 9x |
variance = function(x) stats::var(x, na.rm = TRUE), |
| 1092 | 9x |
iqr = function(x) stats::IQR(x, na.rm = TRUE) |
| 1093 |
) |
|
| 1094 | 9x |
agg_label_metric <- if (metric_choice == "iqr") "IQR" else metric_choice |
| 1095 | 9x |
agg_label <- sprintf("Transcript-level expression with metric %s", agg_label_metric)
|
| 1096 | 9x |
.cnt <- as.integer(getOption("TSENAT.plot_top_counter", 0)) + 1L
|
| 1097 | 9x |
options(TSENAT.plot_top_counter = .cnt) |
| 1098 | 9x |
agg_label_unique <- agg_label |
| 1099 | ||
| 1100 | 9x |
list(counts = counts, samples = samples, mapping = mapping, metric_choice = metric_choice, agg_fun = agg_fun, agg_label_unique = agg_label_unique, top_n = top_n, pseudocount = pseudocount, output_file = output_file) |
| 1101 |
} |
|
| 1102 | ||
| 1103 |
#' Plot top transcripts for a gene |
|
| 1104 |
#' @param counts Matrix or data.frame of transcript counts. Rows are transcripts and columns are samples. |
|
| 1105 |
#' @param readcounts Optional matrix or data.frame of raw read counts. Used for transcript-level quantification if provided. |
|
| 1106 |
#' @param gene Character; gene symbol to inspect. |
|
| 1107 |
#' @param samples Character vector of sample group labels (length = ncol(counts)). |
|
| 1108 |
#' @param coldata Optional data.frame or file path containing sample metadata. Used to infer sample groups if `samples` is not provided. |
|
| 1109 |
#' @param sample_type_col Character; column name in `coldata` or `SummarizedExperiment` colData to use for sample grouping. Default is "sample_type". |
|
| 1110 |
#' @param tx2gene Path or data.frame mapping transcripts to genes. Must contain columns `Transcript` and `Gen`. |
|
| 1111 |
#' @param res Optional result data.frame from a differential analysis. If provided and `gene` is NULL, top genes are selected by adjusted p-value. |
|
| 1112 |
#' @param top_n Integer number of transcripts to show (default = 3). Use NULL to plot all transcripts for the gene. |
|
| 1113 |
#' @param pseudocount Numeric pseudocount added before log2 (default = 1e-6) to avoid division by zero. |
|
| 1114 |
#' @param output_file Optional file path to save the plot. If `NULL`, the `ggplot` object is returned. |
|
| 1115 |
#' @param metric Aggregation metric used to summarize transcript expression per group when plotting. One of c("median", "mean", "variance", "iqr"). Use "iqr" to compute the interquartile range. Defaults to "median".
|
|
| 1116 |
#' @return If \code{output_file} is \code{NULL}, returns a \code{ggplot} object. Otherwise, the plot is saved to the specified file and the function returns \code{NULL} invisibly.
|
|
| 1117 |
#' @examples |
|
| 1118 |
#' tx_counts <- matrix(sample(1:100, 24, replace = TRUE), nrow = 6) |
|
| 1119 |
#' rownames(tx_counts) <- paste0("tx", seq_len(nrow(tx_counts)))
|
|
| 1120 |
#' colnames(tx_counts) <- paste0("S", seq_len(ncol(tx_counts)))
|
|
| 1121 |
#' tx2gene <- data.frame(Transcript = rownames(tx_counts), Gen = rep(paste0("G", seq_len(3)), each = 2), stringsAsFactors = FALSE)
|
|
| 1122 |
#' samples <- rep(c("Normal", "Tumor"), length.out = ncol(tx_counts))
|
|
| 1123 |
#' plot_top_transcripts(tx_counts, gene = c("G1", "G2"), samples = samples, tx2gene = tx2gene, top_n = 2)
|
|
| 1124 |
#' @export |
|
| 1125 |
plot_top_transcripts <- function( |
|
| 1126 |
counts, |
|
| 1127 |
readcounts = NULL, # Optional matrix or data.frame of raw read counts. Used for transcript-level quantification if provided. |
|
| 1128 |
gene = NULL, |
|
| 1129 |
samples = NULL, |
|
| 1130 |
coldata = NULL, # Optional data.frame or file path containing sample metadata. Used to infer sample groups if `samples` is not provided. |
|
| 1131 |
sample_type_col = "sample_type", # Column name in `coldata` or `SummarizedExperiment` colData to use for sample grouping. Default is "sample_type". |
|
| 1132 |
tx2gene = NULL, |
|
| 1133 |
res = NULL, # Optional result data.frame from a differential analysis. If provided and `gene` is NULL, top genes are selected by adjusted p-value. |
|
| 1134 |
top_n = 3, |
|
| 1135 |
pseudocount = 1e-6, |
|
| 1136 |
output_file = NULL, |
|
| 1137 |
metric = c("median", "mean", "variance", "iqr")
|
|
| 1138 |
) {
|
|
| 1139 |
# If `gene` is not provided, select top genes from `res` using `top_n`. |
|
| 1140 | 7x |
if (is.null(gene)) {
|
| 1141 | 1x |
gene <- .ptt_select_genes_from_res(res, top_n) |
| 1142 |
} |
|
| 1143 | 7x |
per_gene_top_n <- top_n |
| 1144 | ||
| 1145 |
## Prepare inputs and normalization via helper |
|
| 1146 | 7x |
prep <- .ptt_prepare_inputs(counts = counts, readcounts = readcounts, samples = samples, coldata = coldata, sample_type_col = sample_type_col, tx2gene = tx2gene, res = res, top_n = per_gene_top_n, pseudocount = pseudocount, output_file = output_file, metric = metric) |
| 1147 | ||
| 1148 |
# if prep returned without gene selection, caller will check `gene` |
|
| 1149 | 6x |
counts <- prep$counts |
| 1150 | 6x |
samples <- prep$samples |
| 1151 | 6x |
mapping <- prep$mapping |
| 1152 | 6x |
metric_choice <- prep$metric_choice |
| 1153 | 6x |
agg_fun <- prep$agg_fun |
| 1154 | 6x |
agg_label_unique <- prep$agg_label_unique |
| 1155 | 6x |
top_n <- prep$top_n |
| 1156 | 6x |
pseudocount <- prep$pseudocount |
| 1157 | 6x |
output_file <- prep$output_file |
| 1158 | ||
| 1159 | 6x |
make_plot_for_gene <- function(gene_single, fill_limits = NULL) {
|
| 1160 | 9x |
.ptt_make_plot_for_gene(gene_single, mapping, counts, samples, top_n, agg_fun, pseudocount, agg_label_unique, fill_limits) |
| 1161 |
} |
|
| 1162 | ||
| 1163 |
# Produce plots (single or multiple). Do not save inside helper - save once |
|
| 1164 |
# below. |
|
| 1165 | 6x |
if (length(gene) > 1) {
|
| 1166 | 3x |
fill_limits <- .compute_transcript_fill_limits(gene, mapping, counts, samples, top_n, agg_fun, pseudocount) |
| 1167 | ||
| 1168 | 3x |
plots <- lapply(seq_along(gene), function(i) {
|
| 1169 | 6x |
gname <- gene[i] |
| 1170 | 6x |
pp <- make_plot_for_gene(gname, fill_limits = fill_limits) |
| 1171 | 6x |
per_gene_title <- if (!is.na(gname) && nzchar(as.character(gname))) as.character(gname) else "" |
| 1172 | 6x |
pp <- pp + ggplot2::labs(title = per_gene_title) + ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, size = 16)) |
| 1173 | 6x |
pp |
| 1174 |
}) |
|
| 1175 | ||
| 1176 | 3x |
result_plot <- .ptt_combine_plots(plots, output_file = output_file, agg_label_unique = agg_label_unique) |
| 1177 |
} else {
|
|
| 1178 | 3x |
result_plot <- make_plot_for_gene(gene) |
| 1179 |
} |
|
| 1180 | ||
| 1181 | 6x |
if (!is.null(output_file)) {
|
| 1182 | 2x |
ggplot2::ggsave(output_file, result_plot) |
| 1183 | 2x |
invisible(NULL) |
| 1184 |
} else {
|
|
| 1185 | 4x |
result_plot |
| 1186 |
} |
|
| 1187 |
} |
| 1 |
## Filter SummarizedExperiment by low-expression transcripts |
|
| 2 |
#' Filter transcripts in a `SummarizedExperiment` by minimum count/sample |
|
| 3 |
#' |
|
| 4 |
#' Subset a `SummarizedExperiment` to keep transcripts with more than |
|
| 5 |
#' `min_count` reads in strictly more than `min_samples` samples. The |
|
| 6 |
#' function updates assays, `rowData`, and relevant entries in |
|
| 7 |
#' `metadata()` (for example `readcounts` and `tx2gene`) so downstream |
|
| 8 |
#' helpers receive a consistent object. |
|
| 9 |
#' |
|
| 10 |
#' @param se A `SummarizedExperiment` object containing transcript-level |
|
| 11 |
#' assays (rows = transcripts, columns = samples). |
|
| 12 |
#' @param min_count Integer threshold for per-sample counts (default 5L). |
|
| 13 |
#' @param min_samples Integer minimum number of samples exceeding |
|
| 14 |
#' `min_count` required to keep a transcript (default 5L). |
|
| 15 |
#' @param assay_name Name or index of the assay to use for filtering |
|
| 16 |
#' (default: 'counts'). If not present, the first assay is used. |
|
| 17 |
#' @param verbose Logical; print before/after counts when TRUE. |
|
| 18 |
#' @return A filtered `SummarizedExperiment`. |
|
| 19 |
#' @export |
|
| 20 |
#' @examples |
|
| 21 |
#' mat <- matrix(c(0, 6, 7, 2, 8, 9), nrow = 3, dimnames = list(paste0('tx', 1:3), paste0('S', 1:2)))
|
|
| 22 |
#' se <- SummarizedExperiment::SummarizedExperiment(assays = list(counts = mat)) |
|
| 23 |
#' filt <- filter_se(se, min_count = 5, min_samples = 1) |
|
| 24 |
#' class(filt) |
|
| 25 |
filter_se <- function(se, min_count = 5L, min_samples = 5L, assay_name = "counts", |
|
| 26 |
verbose = TRUE) {
|
|
| 27 | 7x |
if (!is(se, "SummarizedExperiment")) {
|
| 28 | 1x |
stop("'se' must be a SummarizedExperiment", call. = FALSE)
|
| 29 |
} |
|
| 30 | ||
| 31 | 6x |
assays_list <- SummarizedExperiment::assays(se) |
| 32 | 6x |
if (is.character(assay_name) && assay_name %in% names(assays_list)) {
|
| 33 | 5x |
assay_mat <- as.matrix(assays_list[[assay_name]]) |
| 34 | 1x |
} else if (is.numeric(assay_name) && assay_name >= 1 && assay_name <= length(assays_list)) {
|
| 35 | ! |
assay_mat <- as.matrix(assays_list[[assay_name]]) |
| 36 |
} else {
|
|
| 37 |
# fallback to first assay |
|
| 38 | 1x |
assay_mat <- as.matrix(assays_list[[1]]) |
| 39 | 1x |
warning("Requested assay not found; using first assay.", call. = FALSE)
|
| 40 |
} |
|
| 41 | ||
| 42 | 6x |
if (!is.numeric(assay_mat)) {
|
| 43 | ! |
stop("Assay data must be numeric.", call. = FALSE)
|
| 44 |
} |
|
| 45 | ||
| 46 | 6x |
before <- nrow(assay_mat) |
| 47 |
# Cap min_samples to the number of available samples to avoid impossible |
|
| 48 |
# thresholds on small datasets and use an inclusive threshold so callers |
|
| 49 |
# requesting e.g. 'min_samples = ncol' keep rows present in all samples. |
|
| 50 | 6x |
eff_min_samples <- min(as.integer(min_samples), ncol(assay_mat)) |
| 51 | 6x |
tokeep <- rowSums(assay_mat > min_count) >= eff_min_samples |
| 52 | 6x |
after <- sum(tokeep) |
| 53 | ||
| 54 | 6x |
if (verbose) {
|
| 55 | ! |
message(sprintf("Transcripts: before = %d, after = %d", before, after))
|
| 56 |
} |
|
| 57 | ||
| 58 | 6x |
if (after == 0L) {
|
| 59 | 2x |
warning("Filtering removed all transcripts; returning empty SummarizedExperiment.",
|
| 60 | 2x |
call. = FALSE) |
| 61 |
} |
|
| 62 | ||
| 63 |
# subset all assays |
|
| 64 | 6x |
new_assays <- S4Vectors::SimpleList(lapply(assays_list, function(a) {
|
| 65 | 8x |
if (is.matrix(a) || is.data.frame(a)) {
|
| 66 | 8x |
as.matrix(a)[tokeep, , drop = FALSE] |
| 67 |
} else {
|
|
| 68 | ! |
a |
| 69 |
} |
|
| 70 |
})) |
|
| 71 | 6x |
names(new_assays) <- names(assays_list) |
| 72 | ||
| 73 |
# subset rowData if present |
|
| 74 | 6x |
rd <- NULL |
| 75 | 6x |
if (nrow(SummarizedExperiment::rowData(se)) > 0) {
|
| 76 | 6x |
rd <- SummarizedExperiment::rowData(se)[tokeep, , drop = FALSE] |
| 77 |
} |
|
| 78 | ||
| 79 |
# subset metadata readcounts and tx2gene if present |
|
| 80 | 6x |
md <- S4Vectors::metadata(se) |
| 81 | 6x |
if (!is.null(md$readcounts) && is.matrix(md$readcounts)) {
|
| 82 | 3x |
md$readcounts <- as.matrix(md$readcounts)[tokeep, , drop = FALSE] |
| 83 |
} |
|
| 84 | 6x |
if (!is.null(md$tx2gene) && is.data.frame(md$tx2gene)) {
|
| 85 | 3x |
txmap <- md$tx2gene |
| 86 | 3x |
txcol <- colnames(txmap)[1] |
| 87 | 3x |
md$tx2gene <- txmap[txmap[[txcol]] %in% rownames(assay_mat)[tokeep], , drop = FALSE] |
| 88 |
} |
|
| 89 | ||
| 90 |
# construct new SE with same colData |
|
| 91 | 6x |
new_se <- SummarizedExperiment::SummarizedExperiment(assays = new_assays, rowData = if (!is.null(rd)) {
|
| 92 | 6x |
rd |
| 93 |
} else {
|
|
| 94 | ! |
S4Vectors::DataFrame() |
| 95 | 6x |
}, colData = SummarizedExperiment::colData(se), metadata = c(S4Vectors::metadata(se), |
| 96 | 6x |
list(filtered = list(min_count = min_count, min_samples = min_samples)))) |
| 97 | ||
| 98 |
# attach updated metadata pieces |
|
| 99 | 6x |
S4Vectors::metadata(new_se)$readcounts <- if (!is.null(md$readcounts)) {
|
| 100 | 3x |
md$readcounts |
| 101 |
} else {
|
|
| 102 | 3x |
NULL |
| 103 |
} |
|
| 104 | 6x |
S4Vectors::metadata(new_se)$tx2gene <- if (!is.null(md$tx2gene)) {
|
| 105 | 3x |
md$tx2gene |
| 106 |
} else {
|
|
| 107 | 3x |
NULL |
| 108 |
} |
|
| 109 | ||
| 110 | 6x |
return(new_se) |
| 111 |
} |
| 1 |
#' Calculate splicing diversity changes between two conditions. |
|
| 2 |
#' |
|
| 3 |
#' @param x A \code{matrix} with the splicing diversity values.
|
|
| 4 |
#' @param samples Character vector with an equal length to the number of columns |
|
| 5 |
#' in the input dataset, specifying the category of each sample. |
|
| 6 |
#' @param control Name of the control sample category, defined in the |
|
| 7 |
#' \code{samples} vector, e.g. \code{control = 'Normal'} or \code{control =
|
|
| 8 |
#' 'WT'}. |
|
| 9 |
#' @param method Method to use for calculating the average splicing diversity |
|
| 10 |
#' value in a condition. Can be \code{'mean'} or \code{'median'}.
|
|
| 11 |
#' @param pseudocount Numeric scalar. Small value added to non-positive |
|
| 12 |
#' observed group summaries to avoid zeros when computing differences and |
|
| 13 |
#' log2 fold-changes. If \code{pseudocount <= 0} the function will automatically
|
|
| 14 |
#' choose a scale-aware value equal to half the smallest positive observed |
|
| 15 |
#' group summary (i.e. half the smallest observed mean/median across groups); |
|
| 16 |
#' if no positive values are present the fallback is \code{1e-6}. Rows with
|
|
| 17 |
#' insufficient observations remain \code{NA} and are not imputed.
|
|
| 18 |
#' @return A \code{data.frame} with mean or median value of splicing diversity
|
|
| 19 |
#' across sample categories, the difference between these values and the log2 |
|
| 20 |
#' fold change values. |
|
| 21 |
#' @details The function uses a matrix of splicing diversity values in order to |
|
| 22 |
#' calculate mean or median differences and log2 fold changes between two |
|
| 23 |
#' conditions. |
|
| 24 |
#' @import stats |
|
| 25 |
#' @export |
|
| 26 |
#' @examples |
|
| 27 |
#' # Simulate splicing diversity matrix (4 genes x 4 samples) |
|
| 28 |
#' mat <- matrix(c( |
|
| 29 |
#' 0.5, 0.6, 0.8, 0.9, # gene1: low control, high treatment |
|
| 30 |
#' 0.7, 0.75, 0.6, 0.5 # gene2: high control, low treatment |
|
| 31 |
#' ), nrow = 2, byrow = TRUE) |
|
| 32 |
#' samples <- c('Normal', 'Normal', 'Tumor', 'Tumor')
|
|
| 33 |
#' result <- calculate_fc(mat, samples, control = 'Normal', method = 'mean') |
|
| 34 |
#' head(result) |
|
| 35 |
calculate_fc <- function(x, samples, control, method = "mean", pseudocount = 0) {
|
|
| 36 |
# validate control and samples inputs |
|
| 37 | 1802x |
if (is.null(control) || !nzchar(control)) {
|
| 38 | 2x |
stop("`control` must be provided to calculate_fc", call. = FALSE)
|
| 39 |
} |
|
| 40 | 1800x |
if (length(samples) != ncol(x)) {
|
| 41 | 3x |
stop("Length of 'samples' must equal number of columns in 'x'", call. = FALSE)
|
| 42 |
} |
|
| 43 | 1797x |
if (!(control %in% samples)) {
|
| 44 | 2x |
stop("Control sample type not found in samples.", call. = FALSE)
|
| 45 |
} |
|
| 46 | 1795x |
agg <- .tsenat_aggregate_fc_values(x = x, samples = samples, method = method, |
| 47 | 1795x |
control = control) |
| 48 | 1795x |
value <- agg$value |
| 49 | 1795x |
sorted <- agg$sorted |
| 50 | ||
| 51 |
# Defensive numeric coercion: ensure group means are numeric and mark any |
|
| 52 |
# non-finite or non-positive values as NA. This prevents Inf/NaN when |
|
| 53 |
# computing log2 fold changes downstream. |
|
| 54 | 1795x |
value <- matrix(as.numeric(value), nrow = nrow(value), ncol = ncol(value), dimnames = dimnames(value)) |
| 55 | 1795x |
value[!is.finite(value)] <- NA |
| 56 | ||
| 57 |
# compute and apply pseudocount based on observed group summaries |
|
| 58 | 1795x |
value <- .tsenat_apply_pseudocount(value, pseudocount) |
| 59 | ||
| 60 |
# compute difference and log2 fold-change with NA-safe handling |
|
| 61 | 1795x |
diff_vec <- value[, 1] - value[, 2] |
| 62 | 1795x |
na_mask <- is.na(value[, 1]) | is.na(value[, 2]) |
| 63 | 1795x |
diff_vec[na_mask] <- NA |
| 64 | ||
| 65 | 1795x |
log2fc_vec <- log2(value[, 1]/value[, 2]) |
| 66 | 1795x |
log2fc_vec[na_mask] <- NA |
| 67 | ||
| 68 | 1795x |
result <- data.frame(value, difference = diff_vec, log2_fold_change = log2fc_vec, |
| 69 | 1795x |
check.names = FALSE, stringsAsFactors = FALSE) |
| 70 | 1795x |
colnames(result) <- c(paste(sorted[1, 1], "_", method, sep = ""), paste(sorted[2, |
| 71 | 1795x |
1], "_", method, sep = ""), paste(method, "_difference", sep = ""), "log2_fold_change") |
| 72 | 1795x |
return(result) |
| 73 |
} |
|
| 74 | ||
| 75 |
#' Calculate p-values using Wilcoxon rank sum test. |
|
| 76 |
#' |
|
| 77 |
#' @param x A \code{matrix} with the splicing diversity values.
|
|
| 78 |
#' @param samples Character vector with an equal length to the number of columns |
|
| 79 |
#' in the input dataset, specifying the category of each sample. |
|
| 80 |
#' @param pcorr P-value correction method applied to the results, as defined in |
|
| 81 |
#' the \code{p.adjust} function.
|
|
| 82 |
#' @param paired If \code{TRUE}, the Wilcox-test will be paired, and therefore
|
|
| 83 |
#' it will be a signed rank test instead of the rank sum test. |
|
| 84 |
#' @param exact If \code{TRUE}, an exact p-value will be computed.
|
|
| 85 |
#' @param nthreads Number of threads for parallel processing (default: 1). |
|
| 86 |
#' Set to > 1 to parallelize per-feature Wilcoxon tests. |
|
| 87 |
#' @return Raw and corrected p-values in a matrix. |
|
| 88 |
#' @import stats |
|
| 89 |
#' @export |
|
| 90 |
#' @examples |
|
| 91 |
#' # Create a matrix of splicing diversity values (3 genes x 6 samples) |
|
| 92 |
#' mat <- matrix(rnorm(18), nrow = 3) |
|
| 93 |
#' samples <- rep(c('Control', 'Treatment'), each = 3)
|
|
| 94 |
#' |
|
| 95 |
#' # Run Wilcoxon test |
|
| 96 |
#' result <- wilcoxon(mat, samples, pcorr = 'BH') |
|
| 97 |
#' head(result) |
|
| 98 |
wilcoxon <- function(x, samples, pcorr = "BH", paired = FALSE, exact = FALSE, nthreads = 1) {
|
|
| 99 |
# Determine group indices (two groups expected) |
|
| 100 | 34x |
groups <- unique(sort(samples)) |
| 101 | 34x |
if (length(groups) != 2) {
|
| 102 | ! |
stop("`samples` must contain exactly two groups for Wilcoxon tests.")
|
| 103 |
} |
|
| 104 | 34x |
g1_idx <- as.numeric(which(samples %in% groups[1])) |
| 105 | 34x |
g2_idx <- as.numeric(which(samples %in% groups[2])) |
| 106 | ||
| 107 | 34x |
if (isTRUE(paired)) {
|
| 108 | 8x |
if (length(g1_idx) != length(g2_idx)) {
|
| 109 | 1x |
stop("Paired Wilcoxon requires equal numbers of samples in each ", "group.")
|
| 110 |
} |
|
| 111 |
# Paired tests assume columns are already ordered/aligned by the caller |
|
| 112 |
# (e.g., via `map_metadata()`); do not attempt to infer pairing here. |
|
| 113 |
} |
|
| 114 | ||
| 115 |
# Function to compute Wilcoxon test for a single feature |
|
| 116 | 33x |
.wilcox_one <- function(i) {
|
| 117 | 2555x |
tryCatch({
|
| 118 | 2555x |
wilcox.test(x[i, g1_idx], x[i, g2_idx], paired = paired, exact = exact)$p.value |
| 119 | 2555x |
}, error = function(e) {
|
| 120 | 2555x |
NA_real_ |
| 121 | 2555x |
}, warning = function(w) {
|
| 122 |
# swallow specific warnings but return NA on unusual states |
|
| 123 | 2555x |
NA_real_ |
| 124 |
}) |
|
| 125 |
} |
|
| 126 | ||
| 127 |
# Apply in parallel |
|
| 128 | 33x |
p_values <- .tsenat_bplapply(seq_len(nrow(x)), .wilcox_one, nthreads = nthreads) |
| 129 | ||
| 130 | 33x |
raw_p_values <- ifelse(is.na(vapply(p_values, c, numeric(1))), 1, vapply(p_values, |
| 131 | 33x |
c, numeric(1))) |
| 132 | 33x |
adjusted_p_values <- p.adjust(raw_p_values, method = pcorr) |
| 133 | 33x |
out <- cbind(raw_p_values, adjusted_p_values) |
| 134 | 33x |
colnames(out) <- c("raw_p_values", "adjusted_p_values")
|
| 135 | 33x |
return(out) |
| 136 |
} |
|
| 137 | ||
| 138 |
#' Calculate p-values using label shuffling. |
|
| 139 |
#' |
|
| 140 |
#' @param x A \code{matrix} with the splicing diversity values.
|
|
| 141 |
#' @param samples Character vector with an equal length to the number of columns |
|
| 142 |
#' in the input dataset, specifying the category of each sample. |
|
| 143 |
#' @param control Name of the control sample category, defined in the |
|
| 144 |
#' \code{samples} vector, e.g. \code{control = 'Normal'} or \code{control =
|
|
| 145 |
#' 'WT'}. |
|
| 146 |
#' @param method Method to use for calculating the average splicing diversity |
|
| 147 |
#' value in a condition. Can be \code{'mean'} or \code{'median'}.
|
|
| 148 |
#' @param randomizations The number of random shuffles. |
|
| 149 |
#' @param pcorr P-value correction method applied to the results, as defined in |
|
| 150 |
#' the \code{p.adjust()} function.
|
|
| 151 |
#' @param paired Logical; if \code{TRUE} perform a paired permutation scheme
|
|
| 152 |
#' (default: \code{FALSE}). When paired is \code{TRUE}, permutations
|
|
| 153 |
#' should preserve pairing between samples; the function currently permutes |
|
| 154 |
#' sample labels and therefore paired analyses are only meaningful when the |
|
| 155 |
#' caller has arranged \code{samples} accordingly.
|
|
| 156 |
#' @param paired_method Character; method for paired permutations. One of |
|
| 157 |
#' \code{'swap'} (randomly swap labels within pairs) or \code{'signflip'}
|
|
| 158 |
#' (perform sign-flip permutations; can enumerate all 2^n_pairs combinations |
|
| 159 |
#' for an exact test when \code{randomizations = 0} or \code{randomizations >= 2^n_pairs}).
|
|
| 160 |
#' @param nthreads Number of threads for parallel processing (default: 1). |
|
| 161 |
#' Set to > 1 to parallelize per-feature p-value computation. |
|
| 162 |
#' @return Raw and corrected p-values. |
|
| 163 |
#' @details The permutation p-values are computed two-sided as the proportion |
|
| 164 |
#' of permuted log2 fold-changes at least as extreme as the observed value, |
|
| 165 |
#' with a pseudocount added: (count + 1) / (n_perm + 1). |
|
| 166 |
#' @import stats |
|
| 167 |
#' @note The permutation test returns two-sided empirical p-values using a |
|
| 168 |
#' pseudocount to avoid zero p-values for small numbers of permutations. See |
|
| 169 |
#' the function documentation for details. |
|
| 170 |
#' @export |
|
| 171 |
#' @examples |
|
| 172 |
#' set.seed(123) |
|
| 173 |
#' # Create a matrix of splicing diversity values (2 genes x 4 samples) |
|
| 174 |
#' mat <- matrix(rnorm(8), nrow = 2) |
|
| 175 |
#' samples <- c('Normal', 'Normal', 'Tumor', 'Tumor')
|
|
| 176 |
#' |
|
| 177 |
#' # Run label shuffling test (100 permutations) |
|
| 178 |
#' result <- label_shuffling(mat, samples, control = 'Normal', |
|
| 179 |
#' method = 'mean', randomizations = 100, pcorr = 'BH') |
|
| 180 |
#' head(result) |
|
| 181 |
label_shuffling <- function(x, samples, control, method, randomizations = 100, pcorr = "BH", |
|
| 182 |
paired = FALSE, paired_method = c("swap", "signflip"), nthreads = 1) {
|
|
| 183 | 32x |
paired_method <- match.arg(paired_method) |
| 184 |
# observed log2 fold changes |
|
| 185 | 32x |
log2_fc <- calculate_fc(x, samples, control, method)[, 4] |
| 186 | ||
| 187 |
# build permutation/null distribution of log2 fold changes |
|
| 188 | 32x |
if (isTRUE(paired)) {
|
| 189 | 10x |
perm_mat <- .tsenat_permute_paired(x = x, samples = samples, control = control, |
| 190 | 10x |
method = method, randomizations = randomizations, paired_method = paired_method) |
| 191 |
} else {
|
|
| 192 |
# Generate permutations |
|
| 193 | 22x |
permuted <- replicate(randomizations, calculate_fc(x, sample(samples), control, |
| 194 | 22x |
method), simplify = FALSE) |
| 195 |
# each element is a data.frame/matrix; extract log2_fold_change column |
|
| 196 |
# (4th column) |
|
| 197 | 22x |
perm_mat <- vapply(permuted, function(z) as.numeric(z[, 4]), numeric(nrow(x))) |
| 198 |
} |
|
| 199 | ||
| 200 |
# Function to compute p-value for a single feature |
|
| 201 | 31x |
.compute_pval <- function(i) {
|
| 202 | 936x |
obs <- log2_fc[i] |
| 203 | 936x |
nulls <- perm_mat[i, ] |
| 204 | 936x |
if (is.na(obs) || all(is.na(nulls))) {
|
| 205 | ! |
return(1) |
| 206 |
} |
|
| 207 | 936x |
nulls_non_na <- nulls[!is.na(nulls)] |
| 208 | 936x |
n_non_na <- length(nulls_non_na) |
| 209 | 936x |
if (n_non_na == 0) {
|
| 210 | ! |
return(1) |
| 211 |
} |
|
| 212 | 936x |
cnt <- sum(abs(nulls_non_na) >= abs(obs)) |
| 213 | 936x |
pval <- (cnt + 1)/(n_non_na + 1) |
| 214 | 936x |
return(pval) |
| 215 |
} |
|
| 216 | ||
| 217 |
# compute two-sided permutation p-value with pseudocount, in parallel |
|
| 218 | 31x |
raw_p_values <- unlist(.tsenat_bplapply(seq_len(nrow(perm_mat)), .compute_pval, |
| 219 | 31x |
nthreads = nthreads)) |
| 220 | ||
| 221 | 31x |
adjusted_p_values <- p.adjust(raw_p_values, method = pcorr) |
| 222 | 31x |
out <- cbind(raw_p_values, adjusted_p_values) |
| 223 | 31x |
colnames(out) <- c("raw_p_values", "adjusted_p_values")
|
| 224 | 31x |
return(out) |
| 225 |
} |
|
| 226 | ||
| 227 | ||
| 228 |
#' Run a differential test by name: Wilcoxon or label-shuffle |
|
| 229 |
#' |
|
| 230 |
#' Thin wrapper that selects between `wilcoxon()` and `label_shuffling()`. |
|
| 231 |
#' |
|
| 232 |
#' @param x A matrix with splicing diversity values (rows = features). |
|
| 233 |
#' @param samples Character vector of sample group labels (length = ncol(x)). |
|
| 234 |
#' @param control Name of the control group (required for label-shuffle). |
|
| 235 |
#' @param method Character; one of `'wilcoxon'` or `'shuffle'`. |
|
| 236 |
#' @param fc_method Character; aggregation method used by the permutation |
|
| 237 |
#' test when `method = 'shuffle'` ('mean' or 'median').
|
|
| 238 |
#' @param paired Logical passed to `wilcoxon()` when using the Wilcoxon test. |
|
| 239 |
#' @param exact Logical passed to `wilcoxon()` to request exact p-values. |
|
| 240 |
#' @param randomizations Integer number of permutations for `label_shuffling()`. |
|
| 241 |
#' @param pcorr P-value adjustment method (passed to `p.adjust`). |
|
| 242 |
#' @param seed Integer seed used to make permutations reproducible (default |
|
| 243 |
#' 123). The function sets a temporary RNG seed via `withr::local_seed(seed)` |
|
| 244 |
#' before running `label_shuffling()` when `method = 'shuffle'`. |
|
| 245 |
#' @param nthreads Number of threads for parallel processing (default: 1). |
|
| 246 |
#' Set to > 1 to parallelize per-feature statistical tests in |
|
| 247 |
#' `wilcoxon()` or `label_shuffling()`. |
|
| 248 |
#' @return A two-column matrix with raw and adjusted p-values (as returned by |
|
| 249 |
#' the underlying functions). |
|
| 250 |
#' @export |
|
| 251 |
#' @examples |
|
| 252 |
#' mat <- matrix(rnorm(20), nrow = 5) |
|
| 253 |
#' samples <- rep(c('A', 'B'), length.out = ncol(mat))
|
|
| 254 |
#' test_differential(mat, samples, control = 'A', method = 'wilcoxon') |
|
| 255 |
#' |
|
| 256 |
#' @param paired_method Character; forwarded to `label_shuffling()` when |
|
| 257 |
#' `method = 'shuffle'`. See `label_shuffling()` for details. |
|
| 258 |
test_differential <- function(x, samples, control = NULL, method = c("wilcoxon",
|
|
| 259 |
"shuffle"), fc_method = "mean", paired = FALSE, exact = FALSE, randomizations = 100, |
|
| 260 |
pcorr = "BH", seed = 123L, paired_method = c("swap", "signflip"), nthreads = 1) {
|
|
| 261 | 21x |
paired_method <- match.arg(paired_method) |
| 262 | 21x |
method <- match.arg(method) |
| 263 | 21x |
if (method == "wilcoxon") {
|
| 264 | 6x |
return(wilcoxon(x, samples, pcorr = pcorr, paired = paired, exact = exact, |
| 265 | 6x |
nthreads = nthreads)) |
| 266 |
} |
|
| 267 | ||
| 268 |
# shuffle / permutation-based test |
|
| 269 | 15x |
if (is.null(control) || !nzchar(control)) {
|
| 270 | 1x |
stop("`control` must be provided when method = 'shuffle'", call. = FALSE)
|
| 271 |
} |
|
| 272 | 14x |
if (!is.null(seed)) {
|
| 273 | 14x |
if (!is.numeric(seed) || length(seed) != 1) {
|
| 274 | 1x |
stop("`seed` must be a single numeric value", call. = FALSE)
|
| 275 |
} |
|
| 276 |
# Use withr::local_seed to temporarily set RNG state for reproducible |
|
| 277 |
# permutation tests without permanently modifying the global RNG. |
|
| 278 | 13x |
withr::local_seed(as.integer(seed)) |
| 279 |
} |
|
| 280 | 13x |
return(label_shuffling(x, samples, control, fc_method, randomizations = randomizations, |
| 281 | 13x |
pcorr = pcorr, paired = paired, paired_method = paired_method, nthreads = nthreads)) |
| 282 |
} |
| 1 |
#' Calculate splicing diversity changes between two conditions. |
|
| 2 |
#' @param x A \code{SummarizedExperiment} with splicing diversity values for
|
|
| 3 |
#' each gene in each sample or a \code{data.frame} with gene names in the first
|
|
| 4 |
#' column and splicing diversity values for each sample in additional columns. |
|
| 5 |
#' @param samples A vector of length one, specifying the column name of the |
|
| 6 |
#' \code{colData} annotation column from the \code{SummarizedExperiment}
|
|
| 7 |
#' object, that should be used as the category column or a character vector |
|
| 8 |
#' with an equal length to the number of columns in the input dataset, |
|
| 9 |
#' specifying the category of each sample in the case of a \code{data.frame}
|
|
| 10 |
#' input. |
|
| 11 |
#' @param control Name of the control sample category, defined in the |
|
| 12 |
#' \code{samples} vector, e.g. \code{control = 'Normal'} or \code{control =
|
|
| 13 |
#' 'WT'}. |
|
| 14 |
#' @param method Method to use for calculating the average splicing diversity |
|
| 15 |
#' value in a condition. Can be \code{'mean'} or \code{'median'}.
|
|
| 16 |
#' @param test Method to use for p-value calculation: use \code{'wilcoxon'} for
|
|
| 17 |
#' Wilcoxon rank sum test or \code{'shuffle'} for a label shuffling test.
|
|
| 18 |
#' @param randomizations Number of random shuffles, used for the label shuffling |
|
| 19 |
#' test (default = 100). |
|
| 20 |
#' @param pcorr P-value correction method applied to the Wilcoxon rank sum test |
|
| 21 |
#' or label shuffling test results, as defined in the \code{p.adjust} function.
|
|
| 22 |
#' @param assayno An integer value. In case of multiple assays in a |
|
| 23 |
#' \code{SummarizedExperiment} input, the argument specifies the assay number
|
|
| 24 |
#' to use for difference calculations. |
|
| 25 |
#' @param verbose If \code{TRUE}, the function will print additional diagnostic
|
|
| 26 |
#' messages. |
|
| 27 |
#' @param pseudocount Numeric scalar. Passed to \code{calculate_fc} and used to
|
|
| 28 |
#' add a small value to non-positive group summaries before computing |
|
| 29 |
#' differences and log2 fold-changes. Default \code{1e-6}. Rows excluded for
|
|
| 30 |
#' low sample counts remain \code{NA}.
|
|
| 31 |
#' @param paired Logical; if `TRUE`, run paired versions of tests when |
|
| 32 |
#' supported (default: `FALSE`). |
|
| 33 |
#' @param exact Logical; passed to the Wilcoxon test to request exact p-values |
|
| 34 |
#' when supported (default: `FALSE`). |
|
| 35 |
#' @param nthreads Number of threads for parallel processing (default: 1). |
|
| 36 |
#' Set to > 1 to parallelize per-feature statistical tests. |
|
| 37 |
#' @return A \code{data.frame} with the mean or median values of splicing
|
|
| 38 |
#' diversity across sample categories and all samples, log2(fold change) of the |
|
| 39 |
#' two different conditions, raw and corrected p-values. |
|
| 40 |
#' @import methods |
|
| 41 |
#' @importFrom SummarizedExperiment SummarizedExperiment assays assay colData |
|
| 42 |
#' @export |
|
| 43 |
#' @details The function calculates diversity changes between two sample |
|
| 44 |
#' conditions. It uses the output of the diversity calculation function, which |
|
| 45 |
#' is a \code{SummarizedExperiment} object of splicing diversity values.
|
|
| 46 |
#' Additionally, it can use a \code{data.frame} as input, where the first column
|
|
| 47 |
#' contains gene names, and all additional columns contain splicing diversity |
|
| 48 |
#' values for each sample. A vector of sample conditions also serves as input, |
|
| 49 |
#' used for aggregating the samples by condition. It calculates the mean or |
|
| 50 |
#' median of the splicing diversity data per sample condition, the difference |
|
| 51 |
#' of these values and the log2 fold change of the two conditions. Furthermore, |
|
| 52 |
#' the user can select a statistical method to calculate the significance of |
|
| 53 |
#' the changes. The p-values and adjusted p-values are calculated using a |
|
| 54 |
#' Wilcoxon sum rank test or label shuffling test. The function will exclude |
|
| 55 |
#' genes of low sample size from the significance calculation, depending on |
|
| 56 |
#' which statistical test is applied. |
|
| 57 |
#' @examples |
|
| 58 |
#' x <- data.frame(Genes = letters[seq_len(10)], matrix(runif(80), ncol = 8)) |
|
| 59 |
#' samples <- c(rep('Healthy', 4), rep('Pathogenic', 4))
|
|
| 60 |
#' calculate_difference(x, samples, |
|
| 61 |
#' control = 'Healthy', method = 'mean', test = |
|
| 62 |
#' 'wilcoxon' |
|
| 63 |
#' ) |
|
| 64 |
calculate_difference <- function(x, samples = NULL, control, method = "mean", test = "wilcoxon", |
|
| 65 |
randomizations = 100, pcorr = "BH", assayno = 1, verbose = TRUE, paired = FALSE, |
|
| 66 |
exact = FALSE, pseudocount = 0, nthreads = 1) {
|
|
| 67 |
# internal small helpers (kept here to avoid adding new files) |
|
| 68 | 43x |
.tsenat_prepare_df <- function(x, samples, assayno) {
|
| 69 | 39x |
if (inherits(x, "RangedSummarizedExperiment") || inherits(x, "SummarizedExperiment")) {
|
| 70 |
# allow samples to be NULL (use default 'sample_type' col) |
|
| 71 | 4x |
if (is.null(samples)) {
|
| 72 | 3x |
if ("sample_type" %in% colnames(SummarizedExperiment::colData(x))) {
|
| 73 | 2x |
samples_col <- "sample_type" |
| 74 |
} else {
|
|
| 75 | 1x |
stop("When providing a SummarizedExperiment, supply 'samples' as a colData column name or call map_metadata() to populate 'sample_type'",
|
| 76 | 1x |
call. = FALSE) |
| 77 |
} |
|
| 78 |
} else {
|
|
| 79 | 1x |
if (length(samples) != 1) {
|
| 80 | 1x |
stop("'samples' must be a single colData column.", call. = FALSE)
|
| 81 |
} |
|
| 82 | ! |
samples_col <- samples |
| 83 |
} |
|
| 84 | 2x |
samples_vec <- SummarizedExperiment::colData(x)[[samples_col]] |
| 85 | 2x |
if (!is.numeric(assayno) || length(SummarizedExperiment::assays(x)) < |
| 86 | 2x |
assayno) {
|
| 87 | 1x |
stop("Invalid 'assayno'.", call. = FALSE)
|
| 88 |
} |
|
| 89 | 1x |
df <- as.data.frame(SummarizedExperiment::assays(x)[[assayno]]) |
| 90 | 1x |
genes <- rownames(df) |
| 91 | 1x |
df <- cbind(genes = genes, df) |
| 92 | 1x |
list(df = df, samples = samples_vec) |
| 93 |
} else {
|
|
| 94 | 35x |
df <- as.data.frame(x) |
| 95 | 35x |
list(df = df, samples = samples) |
| 96 |
} |
|
| 97 |
} |
|
| 98 | ||
| 99 | 43x |
.tsenat_sample_matrix <- function(dfr) {
|
| 100 | 17x |
as.matrix(dfr[, -c(1, ncol(dfr) - 1, ncol(dfr)), drop = FALSE]) |
| 101 |
} |
|
| 102 | ||
| 103 |
# Validate input container Reject matrices explicitly (tests expect this |
|
| 104 |
# error for matrix input) |
|
| 105 | 43x |
if (is.matrix(x)) {
|
| 106 | 4x |
stop("Input type unsupported; see ?calculate_difference.", call. = FALSE)
|
| 107 |
} |
|
| 108 | 39x |
if (!(is.data.frame(x) || inherits(x, "RangedSummarizedExperiment") || inherits(x, |
| 109 | 39x |
"SummarizedExperiment"))) {
|
| 110 | ! |
stop("Input data type not supported; see ?calculate_difference.", call. = FALSE)
|
| 111 |
} |
|
| 112 | ||
| 113 |
# prepare data.frame and sample vector (handles SummarizedExperiment) |
|
| 114 | 39x |
pd <- .tsenat_prepare_df(x, samples, assayno) |
| 115 | 36x |
df <- pd$df |
| 116 | 36x |
samples <- pd$samples |
| 117 | ||
| 118 |
# Partition and validate inputs |
|
| 119 | 36x |
part <- .tsenat_calculate_difference_partition(df = df, samples = samples, control = control, |
| 120 | 36x |
method = method, test = test, pcorr = pcorr, randomizations = randomizations, |
| 121 | 36x |
verbose = verbose) |
| 122 | 17x |
df <- part$df |
| 123 | 17x |
samples <- part$samples |
| 124 | 17x |
groups <- part$groups |
| 125 | 17x |
idx1 <- part$idx1 |
| 126 | 17x |
idx2 <- part$idx2 |
| 127 | 17x |
df_keep <- part$df_keep |
| 128 | 17x |
df_small <- part$df_small |
| 129 | ||
| 130 | 17x |
result_list <- list() |
| 131 | ||
| 132 |
# Helper to extract the numeric matrix of sample columns (keeps original |
|
| 133 |
# order) |
|
| 134 | 17x |
sample_matrix <- .tsenat_sample_matrix |
| 135 | ||
| 136 | 17x |
if (nrow(df_keep) > 0) {
|
| 137 | 11x |
if (nrow(df_small) > 0 && verbose) {
|
| 138 | 1x |
message(sprintf("Note: %d genes excluded due to low sample counts.",
|
| 139 | 1x |
nrow(df_small))) |
| 140 |
} |
|
| 141 | 11x |
ymat <- sample_matrix(df_keep) |
| 142 |
# p-value calculation |
|
| 143 | 11x |
if (test == "wilcoxon") {
|
| 144 | 8x |
ptab <- wilcoxon(ymat, samples, pcorr = pcorr, paired = paired, exact = exact, |
| 145 | 8x |
nthreads = nthreads) |
| 146 |
# wilcoxon should return a data.frame of p-values named |
|
| 147 |
# appropriately |
|
| 148 |
} else {
|
|
| 149 | 3x |
ptab <- label_shuffling(ymat, samples, control, method, randomizations = randomizations, |
| 150 | 3x |
pcorr = pcorr, paired = paired, nthreads = nthreads) |
| 151 |
} |
|
| 152 | 11x |
result_list$tested <- data.frame(genes = df_keep[, 1], calculate_fc(ymat, |
| 153 | 11x |
samples, control, method, pseudocount = pseudocount), ptab, stringsAsFactors = FALSE) |
| 154 |
} |
|
| 155 | ||
| 156 | 17x |
if (nrow(df_small) > 0) {
|
| 157 | 6x |
small_mat <- sample_matrix(df_small) |
| 158 | 6x |
result_list$small <- data.frame(genes = df_small[, 1], calculate_fc(small_mat, |
| 159 | 6x |
samples, control, method, pseudocount = pseudocount), raw_p_values = NA, |
| 160 | 6x |
adjusted_p_values = NA, stringsAsFactors = FALSE) |
| 161 |
} |
|
| 162 | ||
| 163 |
# Combine results preserving tested rows first |
|
| 164 | 17x |
if (length(result_list) == 0) {
|
| 165 | 1x |
return(data.frame()) |
| 166 |
} |
|
| 167 | 16x |
res <- do.call(rbind, result_list) |
| 168 | 16x |
rownames(res) <- NULL |
| 169 | 16x |
if (!("log2_fold_change" %in% colnames(res)) && ("log2FC" %in% colnames(res))) {
|
| 170 | ! |
res$log2_fold_change <- res$log2FC |
| 171 |
} |
|
| 172 | 16x |
return(res) |
| 173 |
} |
|
| 174 | ||
| 175 | ||
| 176 |
#' Linear-model interaction test for Tsallis entropy For each gene, fit a |
|
| 177 |
#' linear model of the form `entropy ~ q * group` and extract the p-value for |
|
| 178 |
#' the interaction term (whether the effect of `q` differs between groups). The |
|
| 179 |
#' function expects a `SummarizedExperiment` produced by |
|
| 180 |
#' `calculate_diversity()` when multiple `q` values have been computed (column |
|
| 181 |
#' names contain `_q=`). |
|
| 182 |
#' @param se A `SummarizedExperiment` containing a `diversity` assay produced |
|
| 183 |
#' by `calculate_diversity(..., q = <vector>)`. |
|
| 184 |
#' @param sample_type_col Optional column name in `colData(se)` that contains |
|
| 185 |
#' a grouping factor for samples (character). If `NULL`, the function will |
|
| 186 |
#' attempt to infer group from column names (suffix `_N` interpreted as |
|
| 187 |
#' 'Normal'). |
|
| 188 |
#' @param min_obs Minimum number of non-NA observations required to fit a |
|
| 189 |
#' model for a gene (default: 10). |
|
| 190 |
#' @param method Modeling method to use for interaction testing: one of |
|
| 191 |
#' \code{c('linear', 'lmm', 'gam', 'fpca')} (default: 'linear').
|
|
| 192 |
#' @param pvalue Type of p-value to compute for linear mixed models: one of |
|
| 193 |
#' \code{c('satterthwaite', 'lrt', 'both')} (default: 'satterthwaite').
|
|
| 194 |
#' @param subject_col Optional column name in `colData(se)` that contains |
|
| 195 |
#' subject/individual identifiers for paired or repeated-measures designs |
|
| 196 |
#' (character). If provided with `method = 'lmm'`, used as random effect. |
|
| 197 |
#' @param paired Logical; whether samples are paired (default: FALSE). |
|
| 198 |
#' @param nthreads Number of threads (mc.cores) to use for parallel processing |
|
| 199 |
#' (default: 1). |
|
| 200 |
#' @param assay_name Name of the assay in the SummarizedExperiment to use |
|
| 201 |
#' (default: 'diversity'). |
|
| 202 |
#' @param verbose Logical; whether to print progress messages during execution |
|
| 203 |
#' (default: FALSE). |
|
| 204 |
#' @return A data.frame with columns `gene`, `p_interaction`, and |
|
| 205 |
#' `adj_p_interaction`, ordered by ascending `p_interaction`. |
|
| 206 |
#' @export |
|
| 207 |
#' @examples |
|
| 208 |
#' data('tcga_brca_luma', package = 'TSENAT')
|
|
| 209 |
#' rc <- as.matrix(tcga_brca_luma[1:20, -1, drop = FALSE]) |
|
| 210 |
#' gs <- tcga_brca_luma[1:20, 1] |
|
| 211 |
#' se <- calculate_diversity(rc, gs, q = c(0.1, 1), norm = TRUE) |
|
| 212 |
#' # Provide a minimal sample-type mapping so the example runs during checks |
|
| 213 |
#' SummarizedExperiment::colData(se) <- S4Vectors::DataFrame( |
|
| 214 |
#' sample_type = rep(c('Normal', 'Tumor'), length.out = ncol(se)),
|
|
| 215 |
#' row.names = colnames(se) |
|
| 216 |
#' ) |
|
| 217 |
#' calculate_lm_interaction(se, sample_type_col = 'sample_type') |
|
| 218 |
calculate_lm_interaction <- function(se, sample_type_col = NULL, min_obs = 10, method = c("linear",
|
|
| 219 |
"lmm", "gam", "fpca"), pvalue = c("satterthwaite", "lrt", "both"), subject_col = NULL,
|
|
| 220 |
paired = FALSE, nthreads = 1, assay_name = "diversity", verbose = FALSE) {
|
|
| 221 | 15x |
method <- match.arg(method) |
| 222 | 14x |
pvalue <- match.arg(pvalue) |
| 223 | 13x |
if (verbose) {
|
| 224 | ! |
message("[calculate_lm_interaction] method=", method)
|
| 225 |
} |
|
| 226 |
# internal flags: keep these internal to avoid documenting them in Rd |
|
| 227 | 13x |
suppress_lme4_warnings <- TRUE |
| 228 | 13x |
progress <- FALSE |
| 229 | 13x |
if (!requireNamespace("SummarizedExperiment", quietly = TRUE)) {
|
| 230 | ! |
stop("SummarizedExperiment required")
|
| 231 |
} |
|
| 232 | ||
| 233 | 13x |
mat <- SummarizedExperiment::assay(se, assay_name) |
| 234 | 13x |
if (is.null(mat)) {
|
| 235 | ! |
stop(sprintf("Assay '%s' not found in SummarizedExperiment", assay_name))
|
| 236 |
} |
|
| 237 | ||
| 238 | 13x |
sample_q <- colnames(mat) |
| 239 | 13x |
if (is.null(sample_q) || length(sample_q) == 0) {
|
| 240 | ! |
stop("No column names found on diversity assay")
|
| 241 |
} |
|
| 242 | ||
| 243 |
# parse sample names and q values from column names like 'Sample_q=0.01' |
|
| 244 | 13x |
sample_names <- sub("_q=.*", "", sample_q)
|
| 245 | 13x |
has_q <- grepl("_q=", sample_q)
|
| 246 | 13x |
if (!any(has_q)) {
|
| 247 | 1x |
stop("Could not parse q values; expected '_q=' in column names", call. = FALSE)
|
| 248 |
} |
|
| 249 | 12x |
if (!all(has_q)) {
|
| 250 | ! |
stop("Some column names are missing '_q='; ensure all diversity columns include a q value",
|
| 251 | ! |
call. = FALSE) |
| 252 |
} |
|
| 253 | 12x |
q_vals <- as.numeric(sub(".*_q=", "", sample_q))
|
| 254 | ||
| 255 |
# determine group for each sample |
|
| 256 | 12x |
sample_type_in_coldata <- !is.null(sample_type_col) && sample_type_col %in% colnames(SummarizedExperiment::colData(se)) |
| 257 | 12x |
if (sample_type_in_coldata) {
|
| 258 | 11x |
st <- as.character(SummarizedExperiment::colData(se)[, sample_type_col]) |
| 259 | 11x |
names(st) <- SummarizedExperiment::colData(se)$samples %||% colnames(mat) |
| 260 |
# when user provides a sample_type_col, index by the sample names |
|
| 261 |
# (strip q suffix) |
|
| 262 | 11x |
group_vec <- unname(st[sample_names]) |
| 263 |
} else {
|
|
| 264 | 1x |
stop("No sample grouping found: please supply `sample_type_col` or map sample",
|
| 265 | 1x |
"types into `colData(se)` before calling calculate_lm_interaction().", |
| 266 | 1x |
call. = FALSE) |
| 267 |
} |
|
| 268 | ||
| 269 | 11x |
if (verbose && progress) {
|
| 270 | ! |
message("[calculate_lm_interaction] parsed samples and groups")
|
| 271 |
} |
|
| 272 | 11x |
all_results <- list() |
| 273 | 11x |
fit_one <- function(g) {
|
| 274 | 69x |
.tsenat_fit_one_interaction(g = g, se = se, mat = mat, q_vals = q_vals, sample_names = sample_names, |
| 275 | 69x |
group_vec = group_vec, method = method, pvalue = pvalue, subject_col = subject_col, |
| 276 | 69x |
paired = paired, min_obs = min_obs, verbose = verbose, suppress_lme4_warnings = suppress_lme4_warnings, |
| 277 | 69x |
progress = progress) |
| 278 |
} |
|
| 279 | ||
| 280 | 11x |
if (nthreads > 1) {
|
| 281 | 1x |
res_list <- .tsenat_bplapply(rownames(mat), fit_one, nthreads = nthreads) |
| 282 |
} else {
|
|
| 283 | 10x |
res_list <- lapply(rownames(mat), fit_one) |
| 284 |
} |
|
| 285 | 11x |
all_results <- Filter(Negate(is.null), res_list) |
| 286 | ||
| 287 | 11x |
if (length(all_results) == 0) {
|
| 288 | 2x |
return(data.frame()) |
| 289 |
} |
|
| 290 | 9x |
res <- do.call(rbind, all_results) |
| 291 | 9x |
res$adj_p_interaction <- stats::p.adjust(res$p_interaction, method = "BH") |
| 292 |
# Sort first by adjusted p-values, then by raw p-values for stable ordering |
|
| 293 | 9x |
res <- res[order(res$adj_p_interaction, res$p_interaction), , drop = FALSE] |
| 294 | 9x |
rownames(res) <- NULL |
| 295 | ||
| 296 | 9x |
.tsenat_report_fit_summary(res, verbose = verbose) |
| 297 | ||
| 298 |
# Return the result data.frame (do not attach to or return a |
|
| 299 |
# SummarizedExperiment) |
|
| 300 | 9x |
return(res) |
| 301 |
} |
|
| 302 | ||
| 303 |
# small helper (replacement for `%||%`) to provide default when NULL |
|
| 304 | 87x |
`%||%` <- function(a, b) if (is.null(a)) b else a |
| 1 |
#' Calculate Tsallis diversity values for transcripts grouped by gene |
|
| 2 |
#' @param x Numeric matrix or data.frame of transcript-level expression |
|
| 3 |
#' values (rows = transcripts, columns = samples). |
|
| 4 |
#' @param genes Character vector with length equal to nrow(x) assigning each |
|
| 5 |
#' transcript to a gene. |
|
| 6 |
#' @param norm Logical; if TRUE normalize Tsallis entropy values per gene. |
|
| 7 |
#' @param q Numeric scalar or vector of q values to evaluate. |
|
| 8 |
#' @param verbose Logical; show diagnostic messages when TRUE. |
|
| 9 |
#' @param what Which quantity to return from `calculate_tsallis_entropy`: |
|
| 10 |
#' 'S' (Tsallis entropy) or 'D' (Hill numbers) (default: 'S'). |
|
| 11 |
#' @param nthreads Number of threads for parallel processing (default: 1). |
|
| 12 |
#' Set to > 1 to parallelize per-gene entropy calculations. |
|
| 13 |
#' @return A data.frame with genes in the first column and per-sample (and |
|
| 14 |
#' per-q) Tsallis entropy values in subsequent columns. |
|
| 15 |
#' @import stats |
|
| 16 |
#' @export |
|
| 17 |
#' @examples |
|
| 18 |
#' # Create a small transcript expression matrix (4 transcripts x 2 samples) |
|
| 19 |
#' mat <- matrix(c(10, 5, 0, 0, 2, 8, 3, 7), nrow = 4, byrow = TRUE) |
|
| 20 |
#' colnames(mat) <- c('Sample1', 'Sample2')
|
|
| 21 |
#' genes <- c('geneA', 'geneA', 'geneB', 'geneB')
|
|
| 22 |
#' |
|
| 23 |
#' # Calculate Tsallis diversity for q=1 |
|
| 24 |
#' result <- calculate_method(mat, genes, norm = TRUE, q = 1) |
|
| 25 |
#' result |
|
| 26 |
calculate_method <- function(x, genes, norm = TRUE, verbose = FALSE, q = 2, what = c("S",
|
|
| 27 |
"D"), nthreads = 1) {
|
|
| 28 | 35x |
what <- match.arg(what) |
| 29 |
# validate q |
|
| 30 | 35x |
if (!is.numeric(q) || any(q <= 0)) {
|
| 31 | ! |
stop("Argument 'q' must be numeric and greater than 0.", call. = FALSE)
|
| 32 |
} |
|
| 33 |
# cannot use aggregate because calculate_tsallis_entropy may return |
|
| 34 |
# multiple values when length(q) > 1 |
|
| 35 | 35x |
gene_levels <- unique(genes) |
| 36 |
# ensure column names order matches the order used when constructing the |
|
| 37 |
# result matrix (samples vary outer, q varies inner). If sample names are |
|
| 38 |
# missing, synthesize deterministic names so column creation still works. |
|
| 39 | 35x |
sample_names <- colnames(x) |
| 40 | 35x |
if (is.null(sample_names)) {
|
| 41 | 1x |
sample_names <- paste0("Sample", seq_len(ncol(x)))
|
| 42 |
} |
|
| 43 | 35x |
coln <- as.vector(t(outer(sample_names, q, function(s, qq) paste0(s, "_q=", qq)))) |
| 44 | 35x |
rown <- gene_levels |
| 45 | ||
| 46 |
# compute requested quantity ('S' or 'D') in parallel
|
|
| 47 | 35x |
result_list <- .tsenat_bplapply(gene_levels, function(gene) {
|
| 48 | 681x |
.tsenat_tsallis_row(x = x, genes = genes, gene = gene, q = q, norm = norm, |
| 49 | 681x |
what = what) |
| 50 | 35x |
}, nthreads = nthreads) |
| 51 | ||
| 52 |
# Convert list to matrix (each element is a named vector) result_list is a |
|
| 53 |
# list of vectors; combine them into a matrix |
|
| 54 | 35x |
result_mat <- t(vapply(result_list, identity, FUN.VALUE = setNames(numeric(length(coln)), |
| 55 | 35x |
coln))) |
| 56 | 35x |
colnames(result_mat) <- coln |
| 57 | 35x |
rownames(result_mat) <- rown |
| 58 | 35x |
out_df <- data.frame(Gene = rown, result_mat, check.names = FALSE) |
| 59 | 35x |
if (all(rowSums(!is.na(result_mat)) == 0)) {
|
| 60 | 3x |
out_df <- data.frame(Gene = character(0)) |
| 61 | 3x |
for (nm in coln) out_df[[nm]] <- numeric(0) |
| 62 | 3x |
x <- out_df |
| 63 | 3x |
return(x) |
| 64 |
} |
|
| 65 | 32x |
x <- out_df |
| 66 | 32x |
y <- x[apply(x[2:ncol(x)], 1, function(X) all(is.finite(X))), ] |
| 67 | 32x |
if (nrow(x) - nrow(y) > 0 && verbose == TRUE) {
|
| 68 | 1x |
message(sprintf("Note: %d genes excluded.", nrow(x) - nrow(y)))
|
| 69 |
} |
|
| 70 | 32x |
colnames(y)[1] <- "Gene" |
| 71 | 32x |
return(y) |
| 72 |
} |
| 1 |
# Helper functions for parallel processing using BiocParallel and parallel |
|
| 2 |
# package |
|
| 3 | ||
| 4 |
# Select and initialize parallel backend @param nthreads Number of threads to |
|
| 5 |
# use (default: 1) @return BiocParallel BPPARAM object |
|
| 6 |
.tsenat_get_bpparam <- function(nthreads = 1) {
|
|
| 7 | 41x |
if (nthreads <= 1) {
|
| 8 | 1x |
return(BiocParallel::SerialParam()) |
| 9 |
} |
|
| 10 | ||
| 11 |
# On Unix-like systems, use MulticoreParam; on Windows use SnowParam |
|
| 12 | 40x |
if (.Platform$OS.type == "unix") {
|
| 13 | 40x |
return(BiocParallel::MulticoreParam(workers = nthreads)) |
| 14 |
} else {
|
|
| 15 | ! |
return(BiocParallel::SnowParam(workers = nthreads)) |
| 16 |
} |
|
| 17 |
} |
|
| 18 | ||
| 19 |
# Apply function in parallel using the best available backend @param X Vector |
|
| 20 |
# or list to iterate over @param FUN Function to apply @param nthreads Number |
|
| 21 |
# of threads (default: 1) @param SIMPLIFY Whether to simplify results (default: |
|
| 22 |
# TRUE) @param FUN.VALUE Template for vapply (optional) |
|
| 23 |
.tsenat_bplapply <- function(X, FUN, nthreads = 1, SIMPLIFY = TRUE, FUN.VALUE = NULL) {
|
|
| 24 | 119x |
if (nthreads <= 1) {
|
| 25 |
# Serial execution |
|
| 26 | 90x |
if (is.null(FUN.VALUE)) {
|
| 27 | 81x |
return(lapply(X, FUN)) |
| 28 |
} else {
|
|
| 29 | 9x |
return(unname(vapply(X, FUN, FUN.VALUE = FUN.VALUE))) |
| 30 |
} |
|
| 31 |
} |
|
| 32 | ||
| 33 |
# Parallel execution |
|
| 34 | 29x |
bpparam <- .tsenat_get_bpparam(nthreads) |
| 35 | ||
| 36 | 29x |
if (is.null(FUN.VALUE)) {
|
| 37 | 22x |
return(BiocParallel::bplapply(X, FUN, BPPARAM = bpparam)) |
| 38 |
} else {
|
|
| 39 |
# Use bplapply and then simplify with vapply |
|
| 40 | 7x |
result_list <- BiocParallel::bplapply(X, FUN, BPPARAM = bpparam) |
| 41 | 7x |
return(vapply(result_list, identity, FUN.VALUE = FUN.VALUE)) |
| 42 |
} |
|
| 43 |
} |
|
| 44 | ||
| 45 |
# Apply function over two vectors in parallel @param X First vector or list |
|
| 46 |
# @param Y Second vector or list @param FUN Function to apply (takes two |
|
| 47 |
# arguments) @param nthreads Number of threads (default: 1) |
|
| 48 |
.tsenat_bpmapply <- function(X, Y, FUN, nthreads = 1) {
|
|
| 49 | 19x |
if (nthreads <= 1) {
|
| 50 | 9x |
return(unname(mapply(FUN, X, Y, SIMPLIFY = FALSE))) |
| 51 |
} |
|
| 52 | ||
| 53 | 10x |
bpparam <- .tsenat_get_bpparam(nthreads) |
| 54 | 10x |
return(unname(BiocParallel::bpmapply(FUN, X, Y, BPPARAM = bpparam, SIMPLIFY = FALSE))) |
| 55 |
} |