# HG changeset patch
# User iuc
# Date 1765302238 0
# Node ID 1cb33de18af5638d296ff67fff98bb627c3a0d61
# Parent 621ddf5967d239c2969d737d11492b0f8d801675
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deseq2 commit 5bf5011af827e93ddeecbfba4815fe9b85f02594
diff -r 621ddf5967d2 -r 1cb33de18af5 deseq2.R
--- a/deseq2.R Tue Jul 18 14:58:28 2023 +0000
+++ b/deseq2.R Tue Dec 09 17:43:58 2025 +0000
@@ -32,8 +32,8 @@
# setup R error handling to go to stderr
options(show.error.messages = FALSE, error = function() {
- cat(geterrmessage(), file = stderr())
- q("no", 1, FALSE)
+ cat(geterrmessage(), file = stderr())
+ q("no", 1, FALSE)
})
library("getopt")
@@ -44,50 +44,59 @@
# get options, using the spec as defined by the enclosed list.
# we read the options from the default: commandArgs(TRUE).
spec <- matrix(c(
- "quiet", "q", 0, "logical",
- "help", "h", 0, "logical",
- "cores", "s", 0, "integer",
- "batch_factors", "w", 1, "character",
- "outfile", "o", 1, "character",
- "countsfile", "n", 1, "character",
- "sizefactorsfile", "F", 1, "character",
- "rlogfile", "r", 1, "character",
- "vstfile", "v", 1, "character",
- "header", "H", 0, "logical",
- "factors", "f", 1, "character",
- "files_to_labels", "l", 1, "character",
- "plots", "p", 1, "character",
- "tximport", "i", 0, "logical",
- "txtype", "y", 1, "character",
- "tx2gene", "x", 1, "character", # a space-sep tx-to-gene map or GTF/GFF3 file
- "esf", "e", 1, "character",
- "fit_type", "t", 1, "integer",
- "many_contrasts", "m", 0, "logical",
- "outlier_replace_off", "a", 0, "logical",
- "outlier_filter_off", "b", 0, "logical",
- "auto_mean_filter_off", "c", 0, "logical",
- "use_beta_priors", "d", 0, "logical",
- "alpha_ma", "A", 1, "numeric",
- "prefilter", "P", 0, "logical",
- "prefilter_value", "V", 1, "numeric"
+ "quiet", "q", 0, "logical",
+ "help", "h", 0, "logical",
+ "cores", "s", 0, "integer",
+ "batch_factors", "w", 1, "character",
+ "outfile", "o", 1, "character",
+ "countsfile", "n", 1, "character",
+ "sizefactorsfile", "F", 1, "character",
+ "rlogfile", "r", 1, "character",
+ "vstfile", "v", 1, "character",
+ "header", "H", 0, "logical",
+ "factors", "f", 1, "character",
+ "files_to_labels", "l", 1, "character",
+ "plots", "p", 1, "character",
+ "tximport", "i", 0, "logical",
+ "txtype", "y", 1, "character",
+ "tx2gene", "x", 1, "character", # a space-sep tx-to-gene map or GTF/GFF3 file
+ "esf", "e", 1, "character",
+ "fit_type", "t", 1, "integer",
+ "many_contrasts", "m", 0, "logical",
+ "outlier_replace_off", "a", 0, "logical",
+ "outlier_filter_off", "b", 0, "logical",
+ "auto_mean_filter_off", "c", 0, "logical",
+ "use_beta_priors", "d", 0, "logical",
+ "alpha_ma", "A", 1, "numeric",
+ "prefilter", "P", 0, "logical",
+ "prefilter_value", "V", 1, "numeric",
+ "sample_sheet_mode", "S", 0, "logical",
+ "sample_sheet", "g", 1, "character",
+ "factor_columns", "j", 1, "character",
+ "reference_level", "R", 1, "character",
+ "target_level", "T", 1, "character",
+ "collection_files", "C", 1, "character",
+ "custom_design_formula", "D", 0, "logical",
+ "design_formula", "G", 1, "character",
+ "contrast_definition", "K", 1, "character"
), byrow = TRUE, ncol = 4)
opt <- getopt(spec)
# if help was asked for print a friendly message
# and exit with a non-zero error code
if (!is.null(opt$help)) {
- cat(getopt(spec, usage = TRUE))
- q(status = 1)
+ cat(getopt(spec, usage = TRUE))
+ q(status = 1)
}
# enforce the following required arguments
if (is.null(opt$outfile)) {
- cat("'outfile' is required\n")
- q(status = 1)
+ cat("'outfile' is required\n")
+ q(status = 1)
}
-if (is.null(opt$factors)) {
- cat("'factors' is required\n")
- q(status = 1)
+if (is.null(opt$factors) && is.null(opt$sample_sheet_mode)) {
+ cat("'factors' is required when not using sample_sheet_mode\n")
+ q(status = 1)
}
verbose <- is.null(opt$quiet)
@@ -101,17 +110,17 @@
source_local("get_deseq_dataset.R")
suppressPackageStartupMessages({
- library("DESeq2")
- library("RColorBrewer")
- library("gplots")
+ library("DESeq2")
+ library("RColorBrewer")
+ library("gplots")
})
if (opt$cores > 1) {
- library("BiocParallel")
- register(MulticoreParam(opt$cores))
- parallel <- TRUE
+ library("BiocParallel")
+ register(MulticoreParam(opt$cores))
+ parallel <- TRUE
} else {
- parallel <- FALSE
+ parallel <- FALSE
}
# build or read sample table
@@ -120,97 +129,341 @@
# switch on if 'factors' was provided:
library("rjson")
-parser <- newJSONParser()
-parser$addData(opt$factors)
-factor_list <- parser$getObject()
-filenames_to_labels <- fromJSON(opt$files_to_labels)
+
+if (!is.null(opt$sample_sheet_mode)) {
+ # Sample sheet mode: build factor_list from sample sheet
+ filenames_to_labels <- fromJSON(opt$files_to_labels)
+
+ # Read sample sheet
+ sample_sheet <- read.table(opt$sample_sheet, sep = "\t", header = TRUE, stringsAsFactors = FALSE)
+
+ # Parse collection files
+ collection_files <- strsplit(opt$collection_files, ",")[[1]]
+
+ # First column of sample sheet should contain sample identifiers
+ sample_id_col <- colnames(sample_sheet)[1]
+
+ if (!is.null(opt$custom_design_formula)) {
+ # Custom design formula mode
+ # Extract variable names from formula (remove ~, +, *, :, whitespace)
+ formula_str <- gsub("^~\\s*", "", opt$design_formula)
+ formula_vars <- unique(trimws(unlist(strsplit(formula_str, "[+*:]"))))
+
+ # Validate all variables exist in sample sheet
+ missing_vars <- setdiff(formula_vars, colnames(sample_sheet))
+ if (length(missing_vars) > 0) {
+ cat(paste0("Error: Variables not found in sample sheet: ", paste(missing_vars, collapse = ", "), "\n"))
+ cat(paste0("Available columns: ", paste(colnames(sample_sheet), collapse = ", "), "\n"))
+ q(status = 1)
+ }
+
+ # Use all formula variables as factor columns
+ factor_col_names <- formula_vars
+ } else {
+ # Automatic mode: Parse factor columns (comma-separated column numbers, 1-indexed)
+ factor_col_nums <- as.integer(strsplit(opt$factor_columns, ",")[[1]])
+ factor_col_names <- colnames(sample_sheet)[factor_col_nums]
+ }
+
+ # Validate sample sheet matches collection before building factor_list
+ # Get element identifiers from collection
+ collection_element_ids <- character(length(collection_files))
+ for (idx in seq_along(collection_files)) {
+ file <- collection_files[idx]
+ file_basename <- basename(file)
+ if (file_basename %in% names(filenames_to_labels)) {
+ collection_element_ids[idx] <- filenames_to_labels[[file_basename]]
+ } else {
+ cat("Error: Sample sheet validation failed\n")
+ cat(paste0("Collection file '", file_basename, "' does not have a corresponding element identifier.\n"))
+ cat("This is an internal error - please report this issue.\n")
+ q(status = 1)
+ }
+ }
+
+ # Get sample identifiers from sample sheet
+ sample_sheet_ids <- sample_sheet[[sample_id_col]]
+
+ # Check for mismatches
+ collection_not_in_sheet <- setdiff(collection_element_ids, sample_sheet_ids)
+ sheet_not_in_collection <- setdiff(sample_sheet_ids, collection_element_ids)
+
+ if (length(collection_not_in_sheet) > 0 || length(sheet_not_in_collection) > 0) {
+ cat("Error: Sample sheet does not match the input collection\n\n")
+
+ if (length(collection_not_in_sheet) > 0) {
+ cat("The following samples are in the collection but NOT in the sample sheet:\n")
+ for (id in collection_not_in_sheet) {
+ cat(paste0(" - ", id, "\n"))
+ }
+ cat("\n")
+ }
+
+ if (length(sheet_not_in_collection) > 0) {
+ cat("The following samples are in the sample sheet but NOT in the collection:\n")
+ for (id in sheet_not_in_collection) {
+ cat(paste0(" - ", id, "\n"))
+ }
+ cat("\n")
+ }
+
+ cat("Please ensure that:\n")
+ cat(paste0("1. The first column ('", sample_id_col, "') of the sample sheet contains sample identifiers\n"))
+ cat("2. These identifiers exactly match the element identifiers in your collection\n")
+ cat("3. All samples in the collection are listed in the sample sheet\n")
+ cat("4. All samples in the sample sheet exist in the collection\n")
+ q(status = 1)
+ }
+
+ # Determine which factor will be the primary factor for contrasts
+ # In custom mode, the primary factor is the last one in the formula (rightmost term)
+ # In automatic mode, the first selected factor is primary (formula gets reversed)
+ if (!is.null(opt$custom_design_formula)) {
+ primary_factor_index <- length(factor_col_names)
+ } else {
+ primary_factor_index <- 1
+ }
+
+ # Build factor_list structure
+ factor_list <- list()
+ for (i in seq_along(factor_col_names)) {
+ factor_name <- factor_col_names[i]
+
+ # Group files by factor level, preserving the order of first appearance
+ level_to_files <- list()
+ level_order <- character(0) # Track order of first appearance
+ for (file in collection_files) {
+ element_id <- filenames_to_labels[[basename(file)]]
+ # Find matching row in sample sheet
+ matching_row <- which(sample_sheet[[sample_id_col]] == element_id)
+ if (length(matching_row) > 0) {
+ level <- sample_sheet[[factor_name]][matching_row[1]]
+ if (!(level %in% names(level_to_files))) {
+ level_to_files[[level]] <- character(0)
+ level_order <- c(level_order, level) # Record order of first appearance
+ }
+ level_to_files[[level]] <- c(level_to_files[[level]], file)
+ }
+ }
+
+ # Get all levels in order of first appearance (not alphabetically sorted)
+ all_levels <- level_order
+
+ # Handle reference and target levels for the primary contrast factor
+ # In custom mode: this is the LAST factor in the formula
+ # In automatic mode: this is the FIRST factor selected (formula will be reversed)
+ if (i == primary_factor_index) {
+ # This is the primary factor for contrasts: handle reference/target
+ # Determine reference level
+ if (!is.null(opt$reference_level) && nchar(opt$reference_level) > 0) {
+ ref_level <- trim(opt$reference_level)
+ # Validate that reference level exists
+ if (!(ref_level %in% all_levels)) {
+ cat(paste0("Error: Reference level '", ref_level, "' not found in factor '", factor_name, "'. Available levels: ", paste(all_levels, collapse = ", "), "\n"))
+ q(status = 1)
+ }
+ } else {
+ # Default: use first level encountered in sample sheet as reference
+ ref_level <- all_levels[1]
+ }
+
+ # Build factor levels with reference first, then others
+ # This applies to both automatic and custom modes
+ if (!is.null(opt$target_level) && nchar(opt$target_level) > 0) {
+ target_level <- trim(opt$target_level)
+ # Validate that target level exists
+ if (!(target_level %in% all_levels)) {
+ cat(paste0("Error: Target level '", target_level, "' not found in factor '", factor_name, "'. Available levels: ", paste(all_levels, collapse = ", "), "\n"))
+ q(status = 1)
+ }
+ # Explicit target: reference first, target second
+ levels_to_use <- c(ref_level, target_level)
+ } else {
+ # Reference first, then others in order of appearance
+ other_levels <- all_levels[all_levels != ref_level]
+ levels_to_use <- c(ref_level, other_levels)
+ # Target is the first non-reference level
+ target_level <- other_levels[1]
+ }
+
+ # Store reference and target levels for later contrast specification
+ # This ensures we use the intended levels, not whatever order files happened to be in
+ primary_ref_level <- ref_level
+ primary_target_level <- target_level
+ } else {
+ # For secondary factors, use all levels in order of appearance
+ levels_to_use <- all_levels
+ }
+
+ # Build factor structure in the order specified by levels_to_use
+ # Following standard DESeq2 convention: reference level is first
+ factor_levels <- list()
+ for (level in levels_to_use) {
+ if (level %in% names(level_to_files)) {
+ level_entry <- list()
+ level_entry[[level]] <- level_to_files[[level]]
+ factor_levels[[length(factor_levels) + 1]] <- level_entry
+ }
+ }
+
+ factor_list[[length(factor_list) + 1]] <- list(factor_name, factor_levels)
+ }
+
+ # Set primary_factor for sample sheet mode
+ # This is the factor that will be used for contrasts
+ if (!is.null(opt$custom_design_formula)) {
+ # Custom mode: primary factor is the last one in the formula (rightmost term)
+ primary_factor <- factor_col_names[length(factor_col_names)]
+ } else {
+ # Automatic mode: primary factor is the first selected factor (formula will be reversed)
+ primary_factor <- factor_col_names[1]
+ }
+
+ # Parse contrast_definition if in custom mode
+ if (!is.null(opt$custom_design_formula) && !is.null(opt$contrast_definition) && nchar(opt$contrast_definition) > 0) {
+ contrast_parts <- strsplit(opt$contrast_definition, ",")[[1]]
+ if (length(contrast_parts) != 3) {
+ cat("Error: contrast_definition must be in format 'factor,target,reference'\n")
+ q(status = 1)
+ }
+ custom_contrast <- list(
+ factor = trim(contrast_parts[1]),
+ target = trim(contrast_parts[2]),
+ reference = trim(contrast_parts[3])
+ )
+ # Validate the contrast
+ if (!(custom_contrast$factor %in% factor_col_names)) {
+ cat(paste0("Error: Contrast factor '", custom_contrast$factor, "' not found in design formula.\n"))
+ cat(paste0("Available factors: ", paste(factor_col_names, collapse = ", "), "\n"))
+ q(status = 1)
+ }
+ }
+} else {
+ # Original mode: factors provided directly
+ parser <- newJSONParser()
+ parser$addData(opt$factors)
+ factor_list <- parser$getObject()
+ filenames_to_labels <- fromJSON(opt$files_to_labels)
+
+ # For original mode, extract reference and target levels from the first factor
+ # In original mode: ref=level1 (denominator), target=level2 (numerator) -> log2(level2/level1)
+ # So we swap the naming to match the contrast direction used below
+ primary_factor_data <- factor_list[[1]]
+ primary_levels <- sapply(primary_factor_data[[2]], function(x) names(x))
+ primary_target_level <- primary_levels[1] # First level becomes "target" for the swap below
+ primary_ref_level <- if (length(primary_levels) >= 2) primary_levels[2] else primary_levels[1] # Second level becomes "ref"
+}
+
factors <- sapply(factor_list, function(x) x[[1]])
-primary_factor <- factors[1]
+# For original mode (not sample_sheet_mode), set primary_factor to first factor
+# In sample_sheet_mode, it's already set correctly above
+if (is.null(opt$sample_sheet_mode)) {
+ primary_factor <- factors[1]
+}
filenames_in <- unname(unlist(factor_list[[1]][[2]]))
labs <- unname(unlist(filenames_to_labels[basename(filenames_in)]))
sample_table <- data.frame(
- sample = basename(filenames_in),
- filename = filenames_in,
- row.names = filenames_in,
- stringsAsFactors = FALSE
+ sample = basename(filenames_in),
+ filename = filenames_in,
+ row.names = filenames_in,
+ stringsAsFactors = FALSE
)
for (factor in factor_list) {
- factor_name <- trim(factor[[1]])
- sample_table[[factor_name]] <- character(nrow(sample_table))
- lvls <- sapply(factor[[2]], function(x) names(x))
- for (i in seq_along(factor[[2]])) {
- files <- factor[[2]][[i]][[1]]
- sample_table[files, factor_name] <- trim(lvls[i])
- }
- sample_table[[factor_name]] <- factor(sample_table[[factor_name]], levels = lvls)
+ factor_name <- trim(factor[[1]])
+ sample_table[[factor_name]] <- character(nrow(sample_table))
+ lvls <- sapply(factor[[2]], function(x) names(x))
+ for (i in seq_along(factor[[2]])) {
+ files <- factor[[2]][[i]][[1]]
+ sample_table[files, factor_name] <- trim(lvls[i])
+ }
+ sample_table[[factor_name]] <- factor(sample_table[[factor_name]], levels = lvls)
+ # Explicitly set the reference level using relevel() for the primary factor in sample_sheet_mode
+ # This ensures DESeq2 knows the reference regardless of factor level order
+ # In original mode, we don't relevel because the order from factor_list is already correct
+ # Note: primary_factor is already set correctly in sample_sheet_mode before we get here
+ if (exists("primary_factor") && factor_name == primary_factor && exists("primary_ref_level") && !is.null(opt$sample_sheet_mode)) {
+ sample_table[[factor_name]] <- relevel(sample_table[[factor_name]], ref = primary_ref_level)
+ }
}
rownames(sample_table) <- labs
-design_formula <- as.formula(paste("~", paste(rev(factors), collapse = " + ")))
+# Build design formula
+if (!is.null(opt$custom_design_formula)) {
+ # Custom mode: use user-provided formula
+ design_formula <- as.formula(opt$design_formula)
+ # In original mode (not sample_sheet), set primary factor to last variable in formula
+ # In sample_sheet_mode, it's already set correctly above
+ if (is.null(opt$sample_sheet_mode)) {
+ primary_factor <- factors[length(factors)]
+ }
+} else {
+ # Automatic mode: build from selected factors
+ design_formula <- as.formula(paste("~", paste(rev(factors), collapse = " + ")))
+}
# these are plots which are made once for each analysis
generate_generic_plots <- function(dds, factors) {
- library("ggplot2")
- library("ggrepel")
- library("pheatmap")
+ library("ggplot2")
+ library("ggrepel")
+ library("pheatmap")
- rld <- rlog(dds)
- p <- plotPCA(rld, intgroup = rev(factors))
- print(p + geom_text_repel(aes_string(x = "PC1", y = "PC2", label = factor(colnames(dds))), size = 3) + geom_point())
- dat <- assay(rld)
- dists_rl <- dist(t(dat))
- mat <- as.matrix(dists_rl)
- colors <- colorRampPalette(rev(brewer.pal(9, "Blues")))(255)
- pheatmap(
- mat,
- clustering_distance_rows = dists_rl,
- clustering_distance_cols = dists_rl,
- col = colors,
- main = "Sample-to-sample distances"
- )
- plotDispEsts(dds, main = "Dispersion estimates")
+ rld <- rlog(dds)
+ p <- plotPCA(rld, intgroup = rev(factors))
+ print(p + geom_text_repel(aes_string(x = "PC1", y = "PC2", label = factor(colnames(dds))), size = 3) + geom_point())
+ dat <- assay(rld)
+ dists_rl <- dist(t(dat))
+ mat <- as.matrix(dists_rl)
+ colors <- colorRampPalette(rev(brewer.pal(9, "Blues")))(255)
+ pheatmap(
+ mat,
+ clustering_distance_rows = dists_rl,
+ clustering_distance_cols = dists_rl,
+ col = colors,
+ main = "Sample-to-sample distances"
+ )
+ plotDispEsts(dds, main = "Dispersion estimates")
}
# these are plots which can be made for each comparison, e.g.
# once for C vs A and once for B vs A
generate_specific_plots <- function(res, threshold, title_suffix) {
- use <- res$baseMean > threshold
- if (sum(!use) == 0) {
- h <- hist(res$pvalue, breaks = 0:50 / 50, plot = FALSE)
- barplot(
- height = h$counts,
- col = "powderblue",
- space = 0,
- xlab = "p-values",
- ylab = "frequency",
- main = paste("Histogram of p-values for", title_suffix)
- )
- text(x = c(0, length(h$counts)), y = 0, label = paste(c(0, 1)), adj = c(0.5, 1.7), xpd = NA)
- } else {
- h1 <- hist(res$pvalue[!use], breaks = 0:50 / 50, plot = FALSE)
- h2 <- hist(res$pvalue[use], breaks = 0:50 / 50, plot = FALSE)
- colori <- c("filtered (low count)" = "khaki", "not filtered" = "powderblue")
- barplot(
- height = rbind(h1$counts, h2$counts),
- beside = FALSE,
- col = colori,
- space = 0,
- xlab = "p-values",
- ylab = "frequency",
- main = paste("Histogram of p-values for", title_suffix)
- )
- text(x = c(0, length(h1$counts)), y = 0, label = paste(c(0, 1)), adj = c(0.5, 1.7), xpd = NA)
- legend("topright", fill = rev(colori), legend = rev(names(colori)), bg = "white")
- }
+ use <- res$baseMean > threshold
+ if (sum(!use) == 0) {
+ h <- hist(res$pvalue, breaks = 0:50 / 50, plot = FALSE)
+ barplot(
+ height = h$counts,
+ col = "powderblue",
+ space = 0,
+ xlab = "p-values",
+ ylab = "frequency",
+ main = paste("Histogram of p-values for", title_suffix)
+ )
+ text(x = c(0, length(h$counts)), y = 0, label = paste(c(0, 1)), adj = c(0.5, 1.7), xpd = NA)
+ } else {
+ h1 <- hist(res$pvalue[!use], breaks = 0:50 / 50, plot = FALSE)
+ h2 <- hist(res$pvalue[use], breaks = 0:50 / 50, plot = FALSE)
+ colori <- c("filtered (low count)" = "khaki", "not filtered" = "powderblue")
+ barplot(
+ height = rbind(h1$counts, h2$counts),
+ beside = FALSE,
+ col = colori,
+ space = 0,
+ xlab = "p-values",
+ ylab = "frequency",
+ main = paste("Histogram of p-values for", title_suffix)
+ )
+ text(x = c(0, length(h1$counts)), y = 0, label = paste(c(0, 1)), adj = c(0.5, 1.7), xpd = NA)
+ legend("topright", fill = rev(colori), legend = rev(names(colori)), bg = "white")
+ }
plotMA(res, main = paste("MA-plot for", title_suffix), ylim = range(res$log2FoldChange, na.rm = TRUE), alpha = opt$alpha_ma)
}
if (verbose) {
- cat(paste("primary factor:", primary_factor, "\n"))
- if (length(factors) > 1) {
- cat(paste("other factors in design:", paste(factors[-length(factors)], collapse = ","), "\n"))
- }
- cat("\n---------------------\n")
+ cat(paste("primary factor:", primary_factor, "\n"))
+ if (length(factors) > 1) {
+ cat(paste("other factors in design:", paste(factors[-length(factors)], collapse = ","), "\n"))
+ }
+ cat("\n---------------------\n")
}
dds <- get_deseq_dataset(sample_table, header = opt$header, design_formula = design_formula, tximport = opt$tximport, txtype = opt$txtype, tx2gene = opt$tx2gene)
@@ -259,41 +512,49 @@
}
apply_batch_factors <- function(dds, batch_factors) {
- rownames(batch_factors) <- batch_factors$identifier
- batch_factors <- subset(batch_factors, select = -c(identifier, condition))
- dds_samples <- colnames(dds)
- batch_samples <- rownames(batch_factors)
- if (!setequal(batch_samples, dds_samples)) {
- stop("Batch factor names don't correspond to input sample names, check input files")
- }
- dds_data <- colData(dds)
- # Merge dds_data with batch_factors using indexes, which are sample names
- # Set sort to False, which maintains the order in dds_data
- reordered_batch <- merge(dds_data, batch_factors, by.x = 0, by.y = 0, sort = FALSE)
- batch_factors <- reordered_batch[, ncol(dds_data):ncol(reordered_batch)]
- for (factor in colnames(batch_factors)) {
- dds[[factor]] <- batch_factors[[factor]]
- }
- colnames(dds) <- reordered_batch[, 1]
- return(dds)
+ rownames(batch_factors) <- batch_factors$identifier
+ batch_factors <- subset(batch_factors, select = -c(identifier, condition))
+ dds_samples <- colnames(dds)
+ batch_samples <- rownames(batch_factors)
+ if (!setequal(batch_samples, dds_samples)) {
+ stop("Batch factor names don't correspond to input sample names, check input files")
+ }
+ dds_data <- colData(dds)
+ # Merge dds_data with batch_factors using indexes, which are sample names
+ # Set sort to False, which maintains the order in dds_data
+ reordered_batch <- merge(dds_data, batch_factors, by.x = 0, by.y = 0, sort = FALSE)
+ batch_factors <- reordered_batch[, ncol(dds_data):ncol(reordered_batch)]
+ for (factor in colnames(batch_factors)) {
+ dds[[factor]] <- batch_factors[[factor]]
+ }
+ colnames(dds) <- reordered_batch[, 1]
+ return(dds)
}
if (!is.null(opt$batch_factors)) {
- batch_factors <- read.table(opt$batch_factors, sep = "\t", header = TRUE)
- dds <- apply_batch_factors(dds = dds, batch_factors = batch_factors)
- batch_design <- colnames(batch_factors)[-c(1, 2)]
- design_formula <- as.formula(paste("~", paste(c(batch_design, rev(factors)), collapse = " + ")))
- design(dds) <- design_formula
+ batch_factors <- read.table(opt$batch_factors, sep = "\t", header = TRUE)
+ dds <- apply_batch_factors(dds = dds, batch_factors = batch_factors)
+ batch_design <- colnames(batch_factors)[-c(1, 2)]
+
+ if (!is.null(opt$custom_design_formula)) {
+ # Custom mode: prepend batch factors to user's formula
+ user_formula_rhs <- gsub("^~\\s*", "", opt$design_formula)
+ design_formula <- as.formula(paste("~", paste(c(batch_design, user_formula_rhs), collapse = " + ")))
+ } else {
+ # Automatic mode: prepend batch factors to generated formula
+ design_formula <- as.formula(paste("~", paste(c(batch_design, rev(factors)), collapse = " + ")))
+ }
+ design(dds) <- design_formula
}
if (verbose) {
- cat("DESeq2 run information\n\n")
- cat("sample table:\n")
- print(sample_table[, -c(1:2), drop = FALSE])
- cat("\ndesign formula:\n")
- print(design_formula)
- cat("\n\n")
- cat(paste(ncol(dds), "samples with counts over", nrow(dds), "genes\n"))
+ cat("DESeq2 run information\n\n")
+ cat("sample table:\n")
+ print(sample_table[, -c(1:2), drop = FALSE])
+ cat("\ndesign formula:\n")
+ print(design_formula)
+ cat("\n\n")
+ cat(paste(ncol(dds), "samples with counts over", nrow(dds), "genes\n"))
}
# minimal pre-filtering
@@ -304,42 +565,43 @@
# optional outlier behavior
if (is.null(opt$outlier_replace_off)) {
- min_rep <- 7
+ min_rep <- 7
} else {
- min_rep <- Inf
- if (verbose) cat("outlier replacement off\n")
+ min_rep <- Inf
+ if (verbose) cat("outlier replacement off\n")
}
if (is.null(opt$outlier_filter_off)) {
- cooks_cutoff <- TRUE
+ cooks_cutoff <- TRUE
} else {
- cooks_cutoff <- FALSE
- if (verbose) cat("outlier filtering off\n")
+ cooks_cutoff <- FALSE
+ if (verbose) cat("outlier filtering off\n")
}
# optional automatic mean filtering
if (is.null(opt$auto_mean_filter_off)) {
- independent_filtering <- TRUE
+ independent_filtering <- TRUE
} else {
- independent_filtering <- FALSE
- if (verbose) cat("automatic filtering on the mean off\n")
+ independent_filtering <- FALSE
+ if (verbose) cat("automatic filtering on the mean off\n")
}
# shrinkage of LFCs
if (is.null(opt$use_beta_priors)) {
- beta_prior <- FALSE
- if (verbose)
- cat("Applied default - beta prior off\n")
+ beta_prior <- FALSE
+ if (verbose) {
+ cat("Applied default - beta prior off\n")
+ }
} else {
- beta_prior <- opt$use_beta_priors
+ beta_prior <- opt$use_beta_priors
}
sprintf("use_beta_prior is set to %s", beta_prior)
# dispersion fit type
if (is.null(opt$fit_type)) {
- fit_type <- "parametric"
+ fit_type <- "parametric"
} else {
- fit_type <- c("parametric", "local", "mean")[opt$fit_type]
+ fit_type <- c("parametric", "local", "mean")[opt$fit_type]
}
if (verbose) cat(paste("using disperion fit type:", fit_type, "\n"))
@@ -349,9 +611,9 @@
# create the generic plots and leave the device open
if (!is.null(opt$plots)) {
- if (verbose) cat("creating plots\n")
- pdf(opt$plots)
- generate_generic_plots(dds, factors)
+ if (verbose) cat("creating plots\n")
+ pdf(opt$plots)
+ generate_generic_plots(dds, factors)
}
n <- nlevels(colData(dds)[[primary_factor]])
@@ -376,72 +638,105 @@
if (is.null(opt$many_contrasts)) {
- # only contrast the first and second level of the primary factor
- ref <- all_levels[1]
- lvl <- all_levels[2]
- res <- results(
- dds,
- contrast = c(primary_factor, lvl, ref),
- cooksCutoff = cooks_cutoff,
- independentFiltering = independent_filtering
- )
- if (verbose) {
- cat("summary of results\n")
- cat(paste0(primary_factor, ": ", lvl, " vs ", ref, "\n"))
- print(summary(res))
- }
- res_sorted <- res[order(res$padj), ]
- out_df <- as.data.frame(res_sorted)
- out_df$geneID <- rownames(out_df) # nolint
- out_df <- out_df[, c("geneID", "baseMean", "log2FoldChange", "lfcSE", "stat", "pvalue", "padj")]
- filename <- opt$outfile
- write.table(out_df, file = filename, sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE)
- if (independent_filtering) {
- threshold <- unname(attr(res, "filterThreshold"))
- } else {
- threshold <- 0
- }
- title_suffix <- paste0(primary_factor, ": ", lvl, " vs ", ref)
- if (!is.null(opt$plots)) {
- generate_specific_plots(res, threshold, title_suffix)
- }
-} else {
- # rotate through the possible contrasts of the primary factor
- # write out a sorted table of results with the contrast as a suffix
- # add contrast specific plots to the device
- for (i in seq_len(n - 1)) {
- ref <- all_levels[i]
- contrast_levels <- all_levels[(i + 1):n]
- for (lvl in contrast_levels) {
- res <- results(
+ # Single contrast mode
+ if (!is.null(opt$custom_design_formula) && exists("custom_contrast")) {
+ # Use explicit custom contrast
+ ref <- custom_contrast$reference
+ lvl <- custom_contrast$target
+ contrast_factor <- custom_contrast$factor
+ } else {
+ # Default: contrast using stored reference and target levels
+ # This ensures we use the intended reference, not whatever order files happened to be in
+
+ # Check if we have more than 2 levels without explicit target
+ if (length(all_levels) > 2 && is.null(opt$target_level)) {
+ cat("Error: Multiple factor levels detected without explicit target_level specification.\n")
+ cat(paste0("Factor '", primary_factor, "' has ", length(all_levels), " levels: ", paste(all_levels, collapse = ", "), "\n"))
+ cat("Please either:\n")
+ cat(" 1. Specify target_level to indicate which level to compare against the reference, or\n")
+ cat(" 2. Use many_contrasts mode to compare all levels pairwise\n")
+ q(status = 1)
+ }
+
+ # Use stored reference and target levels
+ # contrast = c(factor, level1, level2) computes log2(level1 / level2)
+ # In original mode: variable names were swapped for backward compatibility
+ # In sample_sheet_mode: use them directly (ref=denominator, target=numerator)
+ if (!is.null(opt$sample_sheet_mode)) {
+ # Sample sheet mode: use natural order
+ ref <- primary_ref_level # Reference is denominator
+ lvl <- primary_target_level # Target is numerator
+ } else {
+ # Original mode: swap for backward compatibility
+ ref <- primary_target_level # This becomes the denominator
+ lvl <- primary_ref_level # This becomes the numerator
+ }
+ contrast_factor <- primary_factor
+ }
+
+ res <- results(
dds,
- contrast = c(primary_factor, lvl, ref),
+ contrast = c(contrast_factor, lvl, ref),
cooksCutoff = cooks_cutoff,
independentFiltering = independent_filtering
- )
- res_sorted <- res[order(res$padj), ]
- out_df <- as.data.frame(res_sorted)
- out_df$geneID <- rownames(out_df) # nolint
- out_df <- out_df[, c("geneID", "baseMean", "log2FoldChange", "lfcSE", "stat", "pvalue", "padj")]
- filename <- paste0(primary_factor, "_", lvl, "_vs_", ref)
- write.table(out_df, file = filename, sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE)
- if (independent_filtering) {
+ )
+ if (verbose) {
+ cat("summary of results\n")
+ cat(paste0(contrast_factor, ": ", lvl, " vs ", ref, "\n"))
+ print(summary(res))
+ }
+ res_sorted <- res[order(res$padj), ]
+ out_df <- as.data.frame(res_sorted)
+ out_df$geneID <- rownames(out_df) # nolint
+ out_df <- out_df[, c("geneID", "baseMean", "log2FoldChange", "lfcSE", "stat", "pvalue", "padj")]
+ filename <- opt$outfile
+ write.table(out_df, file = filename, sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE)
+ if (independent_filtering) {
threshold <- unname(attr(res, "filterThreshold"))
- } else {
+ } else {
threshold <- 0
- }
- title_suffix <- paste0(primary_factor, ": ", lvl, " vs ", ref)
- if (!is.null(opt$plots)) {
+ }
+ title_suffix <- paste0(primary_factor, ": ", lvl, " vs ", ref)
+ if (!is.null(opt$plots)) {
generate_specific_plots(res, threshold, title_suffix)
- }
}
- }
+} else {
+ # rotate through the possible contrasts of the primary factor
+ # write out a sorted table of results with the contrast as a suffix
+ # add contrast specific plots to the device
+ for (i in seq_len(n - 1)) {
+ ref <- all_levels[i]
+ contrast_levels <- all_levels[(i + 1):n]
+ for (lvl in contrast_levels) {
+ res <- results(
+ dds,
+ contrast = c(primary_factor, lvl, ref),
+ cooksCutoff = cooks_cutoff,
+ independentFiltering = independent_filtering
+ )
+ res_sorted <- res[order(res$padj), ]
+ out_df <- as.data.frame(res_sorted)
+ out_df$geneID <- rownames(out_df) # nolint
+ out_df <- out_df[, c("geneID", "baseMean", "log2FoldChange", "lfcSE", "stat", "pvalue", "padj")]
+ filename <- paste0(primary_factor, "_", lvl, "_vs_", ref)
+ write.table(out_df, file = filename, sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE)
+ if (independent_filtering) {
+ threshold <- unname(attr(res, "filterThreshold"))
+ } else {
+ threshold <- 0
+ }
+ title_suffix <- paste0(primary_factor, ": ", lvl, " vs ", ref)
+ if (!is.null(opt$plots)) {
+ generate_specific_plots(res, threshold, title_suffix)
+ }
+ }
+ }
}
# close the plot device
if (!is.null(opt$plots)) {
- cat("closing plot device\n")
- dev.off()
+ cat("closing plot device\n")
+ dev.off()
}
cat("Session information:\n\n")
diff -r 621ddf5967d2 -r 1cb33de18af5 deseq2.xml
--- a/deseq2.xml Tue Jul 18 14:58:28 2023 +0000
+++ b/deseq2.xml Tue Dec 09 17:43:58 2025 +0000
@@ -7,6 +7,7 @@
+
+
@@ -136,6 +164,39 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ^~\s*[a-zA-Z][a-zA-Z0-9_.]*(\s*[+*:]\s*[a-zA-Z][a-zA-Z0-9_.]*)*$
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
@@ -209,9 +270,9 @@
label="Turn off independent filtering"
help=" DESeq2 performs independent filtering by default using the mean of normalized counts as a filter statistic"/>
-
@@ -265,7 +326,7 @@
-
+
@@ -309,7 +370,7 @@
-
+
@@ -361,7 +422,7 @@
-
+
@@ -380,7 +441,7 @@
+
-
+
@@ -423,7 +484,7 @@
+
@@ -434,7 +495,7 @@
-
+
@@ -452,7 +513,7 @@
+
@@ -463,7 +524,7 @@
-
+
@@ -481,7 +542,7 @@
+
@@ -492,7 +553,7 @@
-
+
@@ -521,7 +582,7 @@
+
@@ -532,7 +593,7 @@
-
+
@@ -561,7 +622,7 @@
+
@@ -574,7 +635,7 @@
-
+
@@ -593,7 +654,7 @@
+
@@ -632,7 +693,7 @@
-
+
@@ -666,7 +727,7 @@
-
+
@@ -697,7 +758,7 @@
-
+
@@ -730,6 +791,199 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
2.11.40.8
1.40.2
- 0
+ 1
22.01
diff -r 621ddf5967d2 -r 1cb33de18af5 test-data/sample_sheet.tab
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/sample_sheet.tab Tue Dec 09 17:43:58 2025 +0000
@@ -0,0 +1,8 @@
+sample treatment sequencing
+GSM461179_treat_single.counts Treated single
+GSM461180_treat_paired.counts Treated paired
+GSM461181_treat_paired.counts Treated paired
+GSM461176_untreat_single.counts Untreated single
+GSM461177_untreat_paired.counts Untreated paired
+GSM461178_untreat_paired.counts Untreated paired
+GSM461182_untreat_single.counts Untreated single