Mercurial > repos > iuc > deseq2
diff deseq2.R @ 44:1cb33de18af5 draft default tip
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deseq2 commit 5bf5011af827e93ddeecbfba4815fe9b85f02594
| author | iuc |
|---|---|
| date | Tue, 09 Dec 2025 17:43:58 +0000 |
| parents | 621ddf5967d2 |
| children |
line wrap: on
line diff
--- a/deseq2.R Tue Jul 18 14:58:28 2023 +0000 +++ b/deseq2.R Tue Dec 09 17:43:58 2025 +0000 @@ -32,8 +32,8 @@ # setup R error handling to go to stderr options(show.error.messages = FALSE, error = function() { - cat(geterrmessage(), file = stderr()) - q("no", 1, FALSE) + cat(geterrmessage(), file = stderr()) + q("no", 1, FALSE) }) library("getopt") @@ -44,50 +44,59 @@ # get options, using the spec as defined by the enclosed list. # we read the options from the default: commandArgs(TRUE). spec <- matrix(c( - "quiet", "q", 0, "logical", - "help", "h", 0, "logical", - "cores", "s", 0, "integer", - "batch_factors", "w", 1, "character", - "outfile", "o", 1, "character", - "countsfile", "n", 1, "character", - "sizefactorsfile", "F", 1, "character", - "rlogfile", "r", 1, "character", - "vstfile", "v", 1, "character", - "header", "H", 0, "logical", - "factors", "f", 1, "character", - "files_to_labels", "l", 1, "character", - "plots", "p", 1, "character", - "tximport", "i", 0, "logical", - "txtype", "y", 1, "character", - "tx2gene", "x", 1, "character", # a space-sep tx-to-gene map or GTF/GFF3 file - "esf", "e", 1, "character", - "fit_type", "t", 1, "integer", - "many_contrasts", "m", 0, "logical", - "outlier_replace_off", "a", 0, "logical", - "outlier_filter_off", "b", 0, "logical", - "auto_mean_filter_off", "c", 0, "logical", - "use_beta_priors", "d", 0, "logical", - "alpha_ma", "A", 1, "numeric", - "prefilter", "P", 0, "logical", - "prefilter_value", "V", 1, "numeric" + "quiet", "q", 0, "logical", + "help", "h", 0, "logical", + "cores", "s", 0, "integer", + "batch_factors", "w", 1, "character", + "outfile", "o", 1, "character", + "countsfile", "n", 1, "character", + "sizefactorsfile", "F", 1, "character", + "rlogfile", "r", 1, "character", + "vstfile", "v", 1, "character", + "header", "H", 0, "logical", + "factors", "f", 1, "character", + "files_to_labels", "l", 1, "character", + "plots", "p", 1, "character", + "tximport", "i", 0, "logical", + "txtype", "y", 1, "character", + "tx2gene", "x", 1, "character", # a space-sep tx-to-gene map or GTF/GFF3 file + "esf", "e", 1, "character", + "fit_type", "t", 1, "integer", + "many_contrasts", "m", 0, "logical", + "outlier_replace_off", "a", 0, "logical", + "outlier_filter_off", "b", 0, "logical", + "auto_mean_filter_off", "c", 0, "logical", + "use_beta_priors", "d", 0, "logical", + "alpha_ma", "A", 1, "numeric", + "prefilter", "P", 0, "logical", + "prefilter_value", "V", 1, "numeric", + "sample_sheet_mode", "S", 0, "logical", + "sample_sheet", "g", 1, "character", + "factor_columns", "j", 1, "character", + "reference_level", "R", 1, "character", + "target_level", "T", 1, "character", + "collection_files", "C", 1, "character", + "custom_design_formula", "D", 0, "logical", + "design_formula", "G", 1, "character", + "contrast_definition", "K", 1, "character" ), byrow = TRUE, ncol = 4) opt <- getopt(spec) # if help was asked for print a friendly message # and exit with a non-zero error code if (!is.null(opt$help)) { - cat(getopt(spec, usage = TRUE)) - q(status = 1) + cat(getopt(spec, usage = TRUE)) + q(status = 1) } # enforce the following required arguments if (is.null(opt$outfile)) { - cat("'outfile' is required\n") - q(status = 1) + cat("'outfile' is required\n") + q(status = 1) } -if (is.null(opt$factors)) { - cat("'factors' is required\n") - q(status = 1) +if (is.null(opt$factors) && is.null(opt$sample_sheet_mode)) { + cat("'factors' is required when not using sample_sheet_mode\n") + q(status = 1) } verbose <- is.null(opt$quiet) @@ -101,17 +110,17 @@ source_local("get_deseq_dataset.R") suppressPackageStartupMessages({ - library("DESeq2") - library("RColorBrewer") - library("gplots") + library("DESeq2") + library("RColorBrewer") + library("gplots") }) if (opt$cores > 1) { - library("BiocParallel") - register(MulticoreParam(opt$cores)) - parallel <- TRUE + library("BiocParallel") + register(MulticoreParam(opt$cores)) + parallel <- TRUE } else { - parallel <- FALSE + parallel <- FALSE } # build or read sample table @@ -120,97 +129,341 @@ # switch on if 'factors' was provided: library("rjson") -parser <- newJSONParser() -parser$addData(opt$factors) -factor_list <- parser$getObject() -filenames_to_labels <- fromJSON(opt$files_to_labels) + +if (!is.null(opt$sample_sheet_mode)) { + # Sample sheet mode: build factor_list from sample sheet + filenames_to_labels <- fromJSON(opt$files_to_labels) + + # Read sample sheet + sample_sheet <- read.table(opt$sample_sheet, sep = "\t", header = TRUE, stringsAsFactors = FALSE) + + # Parse collection files + collection_files <- strsplit(opt$collection_files, ",")[[1]] + + # First column of sample sheet should contain sample identifiers + sample_id_col <- colnames(sample_sheet)[1] + + if (!is.null(opt$custom_design_formula)) { + # Custom design formula mode + # Extract variable names from formula (remove ~, +, *, :, whitespace) + formula_str <- gsub("^~\\s*", "", opt$design_formula) + formula_vars <- unique(trimws(unlist(strsplit(formula_str, "[+*:]")))) + + # Validate all variables exist in sample sheet + missing_vars <- setdiff(formula_vars, colnames(sample_sheet)) + if (length(missing_vars) > 0) { + cat(paste0("Error: Variables not found in sample sheet: ", paste(missing_vars, collapse = ", "), "\n")) + cat(paste0("Available columns: ", paste(colnames(sample_sheet), collapse = ", "), "\n")) + q(status = 1) + } + + # Use all formula variables as factor columns + factor_col_names <- formula_vars + } else { + # Automatic mode: Parse factor columns (comma-separated column numbers, 1-indexed) + factor_col_nums <- as.integer(strsplit(opt$factor_columns, ",")[[1]]) + factor_col_names <- colnames(sample_sheet)[factor_col_nums] + } + + # Validate sample sheet matches collection before building factor_list + # Get element identifiers from collection + collection_element_ids <- character(length(collection_files)) + for (idx in seq_along(collection_files)) { + file <- collection_files[idx] + file_basename <- basename(file) + if (file_basename %in% names(filenames_to_labels)) { + collection_element_ids[idx] <- filenames_to_labels[[file_basename]] + } else { + cat("Error: Sample sheet validation failed\n") + cat(paste0("Collection file '", file_basename, "' does not have a corresponding element identifier.\n")) + cat("This is an internal error - please report this issue.\n") + q(status = 1) + } + } + + # Get sample identifiers from sample sheet + sample_sheet_ids <- sample_sheet[[sample_id_col]] + + # Check for mismatches + collection_not_in_sheet <- setdiff(collection_element_ids, sample_sheet_ids) + sheet_not_in_collection <- setdiff(sample_sheet_ids, collection_element_ids) + + if (length(collection_not_in_sheet) > 0 || length(sheet_not_in_collection) > 0) { + cat("Error: Sample sheet does not match the input collection\n\n") + + if (length(collection_not_in_sheet) > 0) { + cat("The following samples are in the collection but NOT in the sample sheet:\n") + for (id in collection_not_in_sheet) { + cat(paste0(" - ", id, "\n")) + } + cat("\n") + } + + if (length(sheet_not_in_collection) > 0) { + cat("The following samples are in the sample sheet but NOT in the collection:\n") + for (id in sheet_not_in_collection) { + cat(paste0(" - ", id, "\n")) + } + cat("\n") + } + + cat("Please ensure that:\n") + cat(paste0("1. The first column ('", sample_id_col, "') of the sample sheet contains sample identifiers\n")) + cat("2. These identifiers exactly match the element identifiers in your collection\n") + cat("3. All samples in the collection are listed in the sample sheet\n") + cat("4. All samples in the sample sheet exist in the collection\n") + q(status = 1) + } + + # Determine which factor will be the primary factor for contrasts + # In custom mode, the primary factor is the last one in the formula (rightmost term) + # In automatic mode, the first selected factor is primary (formula gets reversed) + if (!is.null(opt$custom_design_formula)) { + primary_factor_index <- length(factor_col_names) + } else { + primary_factor_index <- 1 + } + + # Build factor_list structure + factor_list <- list() + for (i in seq_along(factor_col_names)) { + factor_name <- factor_col_names[i] + + # Group files by factor level, preserving the order of first appearance + level_to_files <- list() + level_order <- character(0) # Track order of first appearance + for (file in collection_files) { + element_id <- filenames_to_labels[[basename(file)]] + # Find matching row in sample sheet + matching_row <- which(sample_sheet[[sample_id_col]] == element_id) + if (length(matching_row) > 0) { + level <- sample_sheet[[factor_name]][matching_row[1]] + if (!(level %in% names(level_to_files))) { + level_to_files[[level]] <- character(0) + level_order <- c(level_order, level) # Record order of first appearance + } + level_to_files[[level]] <- c(level_to_files[[level]], file) + } + } + + # Get all levels in order of first appearance (not alphabetically sorted) + all_levels <- level_order + + # Handle reference and target levels for the primary contrast factor + # In custom mode: this is the LAST factor in the formula + # In automatic mode: this is the FIRST factor selected (formula will be reversed) + if (i == primary_factor_index) { + # This is the primary factor for contrasts: handle reference/target + # Determine reference level + if (!is.null(opt$reference_level) && nchar(opt$reference_level) > 0) { + ref_level <- trim(opt$reference_level) + # Validate that reference level exists + if (!(ref_level %in% all_levels)) { + cat(paste0("Error: Reference level '", ref_level, "' not found in factor '", factor_name, "'. Available levels: ", paste(all_levels, collapse = ", "), "\n")) + q(status = 1) + } + } else { + # Default: use first level encountered in sample sheet as reference + ref_level <- all_levels[1] + } + + # Build factor levels with reference first, then others + # This applies to both automatic and custom modes + if (!is.null(opt$target_level) && nchar(opt$target_level) > 0) { + target_level <- trim(opt$target_level) + # Validate that target level exists + if (!(target_level %in% all_levels)) { + cat(paste0("Error: Target level '", target_level, "' not found in factor '", factor_name, "'. Available levels: ", paste(all_levels, collapse = ", "), "\n")) + q(status = 1) + } + # Explicit target: reference first, target second + levels_to_use <- c(ref_level, target_level) + } else { + # Reference first, then others in order of appearance + other_levels <- all_levels[all_levels != ref_level] + levels_to_use <- c(ref_level, other_levels) + # Target is the first non-reference level + target_level <- other_levels[1] + } + + # Store reference and target levels for later contrast specification + # This ensures we use the intended levels, not whatever order files happened to be in + primary_ref_level <- ref_level + primary_target_level <- target_level + } else { + # For secondary factors, use all levels in order of appearance + levels_to_use <- all_levels + } + + # Build factor structure in the order specified by levels_to_use + # Following standard DESeq2 convention: reference level is first + factor_levels <- list() + for (level in levels_to_use) { + if (level %in% names(level_to_files)) { + level_entry <- list() + level_entry[[level]] <- level_to_files[[level]] + factor_levels[[length(factor_levels) + 1]] <- level_entry + } + } + + factor_list[[length(factor_list) + 1]] <- list(factor_name, factor_levels) + } + + # Set primary_factor for sample sheet mode + # This is the factor that will be used for contrasts + if (!is.null(opt$custom_design_formula)) { + # Custom mode: primary factor is the last one in the formula (rightmost term) + primary_factor <- factor_col_names[length(factor_col_names)] + } else { + # Automatic mode: primary factor is the first selected factor (formula will be reversed) + primary_factor <- factor_col_names[1] + } + + # Parse contrast_definition if in custom mode + if (!is.null(opt$custom_design_formula) && !is.null(opt$contrast_definition) && nchar(opt$contrast_definition) > 0) { + contrast_parts <- strsplit(opt$contrast_definition, ",")[[1]] + if (length(contrast_parts) != 3) { + cat("Error: contrast_definition must be in format 'factor,target,reference'\n") + q(status = 1) + } + custom_contrast <- list( + factor = trim(contrast_parts[1]), + target = trim(contrast_parts[2]), + reference = trim(contrast_parts[3]) + ) + # Validate the contrast + if (!(custom_contrast$factor %in% factor_col_names)) { + cat(paste0("Error: Contrast factor '", custom_contrast$factor, "' not found in design formula.\n")) + cat(paste0("Available factors: ", paste(factor_col_names, collapse = ", "), "\n")) + q(status = 1) + } + } +} else { + # Original mode: factors provided directly + parser <- newJSONParser() + parser$addData(opt$factors) + factor_list <- parser$getObject() + filenames_to_labels <- fromJSON(opt$files_to_labels) + + # For original mode, extract reference and target levels from the first factor + # In original mode: ref=level1 (denominator), target=level2 (numerator) -> log2(level2/level1) + # So we swap the naming to match the contrast direction used below + primary_factor_data <- factor_list[[1]] + primary_levels <- sapply(primary_factor_data[[2]], function(x) names(x)) + primary_target_level <- primary_levels[1] # First level becomes "target" for the swap below + primary_ref_level <- if (length(primary_levels) >= 2) primary_levels[2] else primary_levels[1] # Second level becomes "ref" +} + factors <- sapply(factor_list, function(x) x[[1]]) -primary_factor <- factors[1] +# For original mode (not sample_sheet_mode), set primary_factor to first factor +# In sample_sheet_mode, it's already set correctly above +if (is.null(opt$sample_sheet_mode)) { + primary_factor <- factors[1] +} filenames_in <- unname(unlist(factor_list[[1]][[2]])) labs <- unname(unlist(filenames_to_labels[basename(filenames_in)])) sample_table <- data.frame( - sample = basename(filenames_in), - filename = filenames_in, - row.names = filenames_in, - stringsAsFactors = FALSE + sample = basename(filenames_in), + filename = filenames_in, + row.names = filenames_in, + stringsAsFactors = FALSE ) for (factor in factor_list) { - factor_name <- trim(factor[[1]]) - sample_table[[factor_name]] <- character(nrow(sample_table)) - lvls <- sapply(factor[[2]], function(x) names(x)) - for (i in seq_along(factor[[2]])) { - files <- factor[[2]][[i]][[1]] - sample_table[files, factor_name] <- trim(lvls[i]) - } - sample_table[[factor_name]] <- factor(sample_table[[factor_name]], levels = lvls) + factor_name <- trim(factor[[1]]) + sample_table[[factor_name]] <- character(nrow(sample_table)) + lvls <- sapply(factor[[2]], function(x) names(x)) + for (i in seq_along(factor[[2]])) { + files <- factor[[2]][[i]][[1]] + sample_table[files, factor_name] <- trim(lvls[i]) + } + sample_table[[factor_name]] <- factor(sample_table[[factor_name]], levels = lvls) + # Explicitly set the reference level using relevel() for the primary factor in sample_sheet_mode + # This ensures DESeq2 knows the reference regardless of factor level order + # In original mode, we don't relevel because the order from factor_list is already correct + # Note: primary_factor is already set correctly in sample_sheet_mode before we get here + if (exists("primary_factor") && factor_name == primary_factor && exists("primary_ref_level") && !is.null(opt$sample_sheet_mode)) { + sample_table[[factor_name]] <- relevel(sample_table[[factor_name]], ref = primary_ref_level) + } } rownames(sample_table) <- labs -design_formula <- as.formula(paste("~", paste(rev(factors), collapse = " + "))) +# Build design formula +if (!is.null(opt$custom_design_formula)) { + # Custom mode: use user-provided formula + design_formula <- as.formula(opt$design_formula) + # In original mode (not sample_sheet), set primary factor to last variable in formula + # In sample_sheet_mode, it's already set correctly above + if (is.null(opt$sample_sheet_mode)) { + primary_factor <- factors[length(factors)] + } +} else { + # Automatic mode: build from selected factors + design_formula <- as.formula(paste("~", paste(rev(factors), collapse = " + "))) +} # these are plots which are made once for each analysis generate_generic_plots <- function(dds, factors) { - library("ggplot2") - library("ggrepel") - library("pheatmap") + library("ggplot2") + library("ggrepel") + library("pheatmap") - rld <- rlog(dds) - p <- plotPCA(rld, intgroup = rev(factors)) - print(p + geom_text_repel(aes_string(x = "PC1", y = "PC2", label = factor(colnames(dds))), size = 3) + geom_point()) - dat <- assay(rld) - dists_rl <- dist(t(dat)) - mat <- as.matrix(dists_rl) - colors <- colorRampPalette(rev(brewer.pal(9, "Blues")))(255) - pheatmap( - mat, - clustering_distance_rows = dists_rl, - clustering_distance_cols = dists_rl, - col = colors, - main = "Sample-to-sample distances" - ) - plotDispEsts(dds, main = "Dispersion estimates") + rld <- rlog(dds) + p <- plotPCA(rld, intgroup = rev(factors)) + print(p + geom_text_repel(aes_string(x = "PC1", y = "PC2", label = factor(colnames(dds))), size = 3) + geom_point()) + dat <- assay(rld) + dists_rl <- dist(t(dat)) + mat <- as.matrix(dists_rl) + colors <- colorRampPalette(rev(brewer.pal(9, "Blues")))(255) + pheatmap( + mat, + clustering_distance_rows = dists_rl, + clustering_distance_cols = dists_rl, + col = colors, + main = "Sample-to-sample distances" + ) + plotDispEsts(dds, main = "Dispersion estimates") } # these are plots which can be made for each comparison, e.g. # once for C vs A and once for B vs A generate_specific_plots <- function(res, threshold, title_suffix) { - use <- res$baseMean > threshold - if (sum(!use) == 0) { - h <- hist(res$pvalue, breaks = 0:50 / 50, plot = FALSE) - barplot( - height = h$counts, - col = "powderblue", - space = 0, - xlab = "p-values", - ylab = "frequency", - main = paste("Histogram of p-values for", title_suffix) - ) - text(x = c(0, length(h$counts)), y = 0, label = paste(c(0, 1)), adj = c(0.5, 1.7), xpd = NA) - } else { - h1 <- hist(res$pvalue[!use], breaks = 0:50 / 50, plot = FALSE) - h2 <- hist(res$pvalue[use], breaks = 0:50 / 50, plot = FALSE) - colori <- c("filtered (low count)" = "khaki", "not filtered" = "powderblue") - barplot( - height = rbind(h1$counts, h2$counts), - beside = FALSE, - col = colori, - space = 0, - xlab = "p-values", - ylab = "frequency", - main = paste("Histogram of p-values for", title_suffix) - ) - text(x = c(0, length(h1$counts)), y = 0, label = paste(c(0, 1)), adj = c(0.5, 1.7), xpd = NA) - legend("topright", fill = rev(colori), legend = rev(names(colori)), bg = "white") - } + use <- res$baseMean > threshold + if (sum(!use) == 0) { + h <- hist(res$pvalue, breaks = 0:50 / 50, plot = FALSE) + barplot( + height = h$counts, + col = "powderblue", + space = 0, + xlab = "p-values", + ylab = "frequency", + main = paste("Histogram of p-values for", title_suffix) + ) + text(x = c(0, length(h$counts)), y = 0, label = paste(c(0, 1)), adj = c(0.5, 1.7), xpd = NA) + } else { + h1 <- hist(res$pvalue[!use], breaks = 0:50 / 50, plot = FALSE) + h2 <- hist(res$pvalue[use], breaks = 0:50 / 50, plot = FALSE) + colori <- c("filtered (low count)" = "khaki", "not filtered" = "powderblue") + barplot( + height = rbind(h1$counts, h2$counts), + beside = FALSE, + col = colori, + space = 0, + xlab = "p-values", + ylab = "frequency", + main = paste("Histogram of p-values for", title_suffix) + ) + text(x = c(0, length(h1$counts)), y = 0, label = paste(c(0, 1)), adj = c(0.5, 1.7), xpd = NA) + legend("topright", fill = rev(colori), legend = rev(names(colori)), bg = "white") + } plotMA(res, main = paste("MA-plot for", title_suffix), ylim = range(res$log2FoldChange, na.rm = TRUE), alpha = opt$alpha_ma) } if (verbose) { - cat(paste("primary factor:", primary_factor, "\n")) - if (length(factors) > 1) { - cat(paste("other factors in design:", paste(factors[-length(factors)], collapse = ","), "\n")) - } - cat("\n---------------------\n") + cat(paste("primary factor:", primary_factor, "\n")) + if (length(factors) > 1) { + cat(paste("other factors in design:", paste(factors[-length(factors)], collapse = ","), "\n")) + } + cat("\n---------------------\n") } dds <- get_deseq_dataset(sample_table, header = opt$header, design_formula = design_formula, tximport = opt$tximport, txtype = opt$txtype, tx2gene = opt$tx2gene) @@ -259,41 +512,49 @@ } apply_batch_factors <- function(dds, batch_factors) { - rownames(batch_factors) <- batch_factors$identifier - batch_factors <- subset(batch_factors, select = -c(identifier, condition)) - dds_samples <- colnames(dds) - batch_samples <- rownames(batch_factors) - if (!setequal(batch_samples, dds_samples)) { - stop("Batch factor names don't correspond to input sample names, check input files") - } - dds_data <- colData(dds) - # Merge dds_data with batch_factors using indexes, which are sample names - # Set sort to False, which maintains the order in dds_data - reordered_batch <- merge(dds_data, batch_factors, by.x = 0, by.y = 0, sort = FALSE) - batch_factors <- reordered_batch[, ncol(dds_data):ncol(reordered_batch)] - for (factor in colnames(batch_factors)) { - dds[[factor]] <- batch_factors[[factor]] - } - colnames(dds) <- reordered_batch[, 1] - return(dds) + rownames(batch_factors) <- batch_factors$identifier + batch_factors <- subset(batch_factors, select = -c(identifier, condition)) + dds_samples <- colnames(dds) + batch_samples <- rownames(batch_factors) + if (!setequal(batch_samples, dds_samples)) { + stop("Batch factor names don't correspond to input sample names, check input files") + } + dds_data <- colData(dds) + # Merge dds_data with batch_factors using indexes, which are sample names + # Set sort to False, which maintains the order in dds_data + reordered_batch <- merge(dds_data, batch_factors, by.x = 0, by.y = 0, sort = FALSE) + batch_factors <- reordered_batch[, ncol(dds_data):ncol(reordered_batch)] + for (factor in colnames(batch_factors)) { + dds[[factor]] <- batch_factors[[factor]] + } + colnames(dds) <- reordered_batch[, 1] + return(dds) } if (!is.null(opt$batch_factors)) { - batch_factors <- read.table(opt$batch_factors, sep = "\t", header = TRUE) - dds <- apply_batch_factors(dds = dds, batch_factors = batch_factors) - batch_design <- colnames(batch_factors)[-c(1, 2)] - design_formula <- as.formula(paste("~", paste(c(batch_design, rev(factors)), collapse = " + "))) - design(dds) <- design_formula + batch_factors <- read.table(opt$batch_factors, sep = "\t", header = TRUE) + dds <- apply_batch_factors(dds = dds, batch_factors = batch_factors) + batch_design <- colnames(batch_factors)[-c(1, 2)] + + if (!is.null(opt$custom_design_formula)) { + # Custom mode: prepend batch factors to user's formula + user_formula_rhs <- gsub("^~\\s*", "", opt$design_formula) + design_formula <- as.formula(paste("~", paste(c(batch_design, user_formula_rhs), collapse = " + "))) + } else { + # Automatic mode: prepend batch factors to generated formula + design_formula <- as.formula(paste("~", paste(c(batch_design, rev(factors)), collapse = " + "))) + } + design(dds) <- design_formula } if (verbose) { - cat("DESeq2 run information\n\n") - cat("sample table:\n") - print(sample_table[, -c(1:2), drop = FALSE]) - cat("\ndesign formula:\n") - print(design_formula) - cat("\n\n") - cat(paste(ncol(dds), "samples with counts over", nrow(dds), "genes\n")) + cat("DESeq2 run information\n\n") + cat("sample table:\n") + print(sample_table[, -c(1:2), drop = FALSE]) + cat("\ndesign formula:\n") + print(design_formula) + cat("\n\n") + cat(paste(ncol(dds), "samples with counts over", nrow(dds), "genes\n")) } # minimal pre-filtering @@ -304,42 +565,43 @@ # optional outlier behavior if (is.null(opt$outlier_replace_off)) { - min_rep <- 7 + min_rep <- 7 } else { - min_rep <- Inf - if (verbose) cat("outlier replacement off\n") + min_rep <- Inf + if (verbose) cat("outlier replacement off\n") } if (is.null(opt$outlier_filter_off)) { - cooks_cutoff <- TRUE + cooks_cutoff <- TRUE } else { - cooks_cutoff <- FALSE - if (verbose) cat("outlier filtering off\n") + cooks_cutoff <- FALSE + if (verbose) cat("outlier filtering off\n") } # optional automatic mean filtering if (is.null(opt$auto_mean_filter_off)) { - independent_filtering <- TRUE + independent_filtering <- TRUE } else { - independent_filtering <- FALSE - if (verbose) cat("automatic filtering on the mean off\n") + independent_filtering <- FALSE + if (verbose) cat("automatic filtering on the mean off\n") } # shrinkage of LFCs if (is.null(opt$use_beta_priors)) { - beta_prior <- FALSE - if (verbose) - cat("Applied default - beta prior off\n") + beta_prior <- FALSE + if (verbose) { + cat("Applied default - beta prior off\n") + } } else { - beta_prior <- opt$use_beta_priors + beta_prior <- opt$use_beta_priors } sprintf("use_beta_prior is set to %s", beta_prior) # dispersion fit type if (is.null(opt$fit_type)) { - fit_type <- "parametric" + fit_type <- "parametric" } else { - fit_type <- c("parametric", "local", "mean")[opt$fit_type] + fit_type <- c("parametric", "local", "mean")[opt$fit_type] } if (verbose) cat(paste("using disperion fit type:", fit_type, "\n")) @@ -349,9 +611,9 @@ # create the generic plots and leave the device open if (!is.null(opt$plots)) { - if (verbose) cat("creating plots\n") - pdf(opt$plots) - generate_generic_plots(dds, factors) + if (verbose) cat("creating plots\n") + pdf(opt$plots) + generate_generic_plots(dds, factors) } n <- nlevels(colData(dds)[[primary_factor]]) @@ -376,72 +638,105 @@ if (is.null(opt$many_contrasts)) { - # only contrast the first and second level of the primary factor - ref <- all_levels[1] - lvl <- all_levels[2] - res <- results( - dds, - contrast = c(primary_factor, lvl, ref), - cooksCutoff = cooks_cutoff, - independentFiltering = independent_filtering - ) - if (verbose) { - cat("summary of results\n") - cat(paste0(primary_factor, ": ", lvl, " vs ", ref, "\n")) - print(summary(res)) - } - res_sorted <- res[order(res$padj), ] - out_df <- as.data.frame(res_sorted) - out_df$geneID <- rownames(out_df) # nolint - out_df <- out_df[, c("geneID", "baseMean", "log2FoldChange", "lfcSE", "stat", "pvalue", "padj")] - filename <- opt$outfile - write.table(out_df, file = filename, sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE) - if (independent_filtering) { - threshold <- unname(attr(res, "filterThreshold")) - } else { - threshold <- 0 - } - title_suffix <- paste0(primary_factor, ": ", lvl, " vs ", ref) - if (!is.null(opt$plots)) { - generate_specific_plots(res, threshold, title_suffix) - } -} else { - # rotate through the possible contrasts of the primary factor - # write out a sorted table of results with the contrast as a suffix - # add contrast specific plots to the device - for (i in seq_len(n - 1)) { - ref <- all_levels[i] - contrast_levels <- all_levels[(i + 1):n] - for (lvl in contrast_levels) { - res <- results( + # Single contrast mode + if (!is.null(opt$custom_design_formula) && exists("custom_contrast")) { + # Use explicit custom contrast + ref <- custom_contrast$reference + lvl <- custom_contrast$target + contrast_factor <- custom_contrast$factor + } else { + # Default: contrast using stored reference and target levels + # This ensures we use the intended reference, not whatever order files happened to be in + + # Check if we have more than 2 levels without explicit target + if (length(all_levels) > 2 && is.null(opt$target_level)) { + cat("Error: Multiple factor levels detected without explicit target_level specification.\n") + cat(paste0("Factor '", primary_factor, "' has ", length(all_levels), " levels: ", paste(all_levels, collapse = ", "), "\n")) + cat("Please either:\n") + cat(" 1. Specify target_level to indicate which level to compare against the reference, or\n") + cat(" 2. Use many_contrasts mode to compare all levels pairwise\n") + q(status = 1) + } + + # Use stored reference and target levels + # contrast = c(factor, level1, level2) computes log2(level1 / level2) + # In original mode: variable names were swapped for backward compatibility + # In sample_sheet_mode: use them directly (ref=denominator, target=numerator) + if (!is.null(opt$sample_sheet_mode)) { + # Sample sheet mode: use natural order + ref <- primary_ref_level # Reference is denominator + lvl <- primary_target_level # Target is numerator + } else { + # Original mode: swap for backward compatibility + ref <- primary_target_level # This becomes the denominator + lvl <- primary_ref_level # This becomes the numerator + } + contrast_factor <- primary_factor + } + + res <- results( dds, - contrast = c(primary_factor, lvl, ref), + contrast = c(contrast_factor, lvl, ref), cooksCutoff = cooks_cutoff, independentFiltering = independent_filtering - ) - res_sorted <- res[order(res$padj), ] - out_df <- as.data.frame(res_sorted) - out_df$geneID <- rownames(out_df) # nolint - out_df <- out_df[, c("geneID", "baseMean", "log2FoldChange", "lfcSE", "stat", "pvalue", "padj")] - filename <- paste0(primary_factor, "_", lvl, "_vs_", ref) - write.table(out_df, file = filename, sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE) - if (independent_filtering) { + ) + if (verbose) { + cat("summary of results\n") + cat(paste0(contrast_factor, ": ", lvl, " vs ", ref, "\n")) + print(summary(res)) + } + res_sorted <- res[order(res$padj), ] + out_df <- as.data.frame(res_sorted) + out_df$geneID <- rownames(out_df) # nolint + out_df <- out_df[, c("geneID", "baseMean", "log2FoldChange", "lfcSE", "stat", "pvalue", "padj")] + filename <- opt$outfile + write.table(out_df, file = filename, sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE) + if (independent_filtering) { threshold <- unname(attr(res, "filterThreshold")) - } else { + } else { threshold <- 0 - } - title_suffix <- paste0(primary_factor, ": ", lvl, " vs ", ref) - if (!is.null(opt$plots)) { + } + title_suffix <- paste0(primary_factor, ": ", lvl, " vs ", ref) + if (!is.null(opt$plots)) { generate_specific_plots(res, threshold, title_suffix) - } } - } +} else { + # rotate through the possible contrasts of the primary factor + # write out a sorted table of results with the contrast as a suffix + # add contrast specific plots to the device + for (i in seq_len(n - 1)) { + ref <- all_levels[i] + contrast_levels <- all_levels[(i + 1):n] + for (lvl in contrast_levels) { + res <- results( + dds, + contrast = c(primary_factor, lvl, ref), + cooksCutoff = cooks_cutoff, + independentFiltering = independent_filtering + ) + res_sorted <- res[order(res$padj), ] + out_df <- as.data.frame(res_sorted) + out_df$geneID <- rownames(out_df) # nolint + out_df <- out_df[, c("geneID", "baseMean", "log2FoldChange", "lfcSE", "stat", "pvalue", "padj")] + filename <- paste0(primary_factor, "_", lvl, "_vs_", ref) + write.table(out_df, file = filename, sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE) + if (independent_filtering) { + threshold <- unname(attr(res, "filterThreshold")) + } else { + threshold <- 0 + } + title_suffix <- paste0(primary_factor, ": ", lvl, " vs ", ref) + if (!is.null(opt$plots)) { + generate_specific_plots(res, threshold, title_suffix) + } + } + } } # close the plot device if (!is.null(opt$plots)) { - cat("closing plot device\n") - dev.off() + cat("closing plot device\n") + dev.off() } cat("Session information:\n\n")
