Mercurial > repos > iuc > deseq2

--- a/deseq2.R	Tue Jul 18 14:58:28 2023 +0000
+++ b/deseq2.R	Tue Dec 09 17:43:58 2025 +0000
@@ -32,8 +32,8 @@

 # setup R error handling to go to stderr
 options(show.error.messages = FALSE, error = function() {
-  cat(geterrmessage(), file = stderr())
-  q("no", 1, FALSE)
+    cat(geterrmessage(), file = stderr())
+    q("no", 1, FALSE)
 })

 library("getopt")
@@ -44,50 +44,59 @@
 # get options, using the spec as defined by the enclosed list.
 # we read the options from the default: commandArgs(TRUE).
 spec <- matrix(c(
-  "quiet", "q", 0, "logical",
-  "help", "h", 0, "logical",
-  "cores", "s", 0, "integer",
-  "batch_factors", "w", 1, "character",
-  "outfile", "o", 1, "character",
-  "countsfile", "n", 1, "character",
-  "sizefactorsfile", "F", 1, "character",
-  "rlogfile", "r", 1, "character",
-  "vstfile", "v", 1, "character",
-  "header", "H", 0, "logical",
-  "factors", "f", 1, "character",
-  "files_to_labels", "l", 1, "character",
-  "plots", "p", 1, "character",
-  "tximport", "i", 0, "logical",
-  "txtype", "y", 1, "character",
-  "tx2gene", "x", 1, "character", # a space-sep tx-to-gene map or GTF/GFF3 file
-  "esf", "e", 1, "character",
-  "fit_type", "t", 1, "integer",
-  "many_contrasts", "m", 0, "logical",
-  "outlier_replace_off", "a", 0, "logical",
-  "outlier_filter_off", "b", 0, "logical",
-  "auto_mean_filter_off", "c", 0, "logical",
-  "use_beta_priors", "d", 0, "logical",
-  "alpha_ma", "A", 1, "numeric",
-  "prefilter", "P", 0, "logical",
-  "prefilter_value", "V", 1, "numeric"
+    "quiet", "q", 0, "logical",
+    "help", "h", 0, "logical",
+    "cores", "s", 0, "integer",
+    "batch_factors", "w", 1, "character",
+    "outfile", "o", 1, "character",
+    "countsfile", "n", 1, "character",
+    "sizefactorsfile", "F", 1, "character",
+    "rlogfile", "r", 1, "character",
+    "vstfile", "v", 1, "character",
+    "header", "H", 0, "logical",
+    "factors", "f", 1, "character",
+    "files_to_labels", "l", 1, "character",
+    "plots", "p", 1, "character",
+    "tximport", "i", 0, "logical",
+    "txtype", "y", 1, "character",
+    "tx2gene", "x", 1, "character", # a space-sep tx-to-gene map or GTF/GFF3 file
+    "esf", "e", 1, "character",
+    "fit_type", "t", 1, "integer",
+    "many_contrasts", "m", 0, "logical",
+    "outlier_replace_off", "a", 0, "logical",
+    "outlier_filter_off", "b", 0, "logical",
+    "auto_mean_filter_off", "c", 0, "logical",
+    "use_beta_priors", "d", 0, "logical",
+    "alpha_ma", "A", 1, "numeric",
+    "prefilter", "P", 0, "logical",
+    "prefilter_value", "V", 1, "numeric",
+    "sample_sheet_mode", "S", 0, "logical",
+    "sample_sheet", "g", 1, "character",
+    "factor_columns", "j", 1, "character",
+    "reference_level", "R", 1, "character",
+    "target_level", "T", 1, "character",
+    "collection_files", "C", 1, "character",
+    "custom_design_formula", "D", 0, "logical",
+    "design_formula", "G", 1, "character",
+    "contrast_definition", "K", 1, "character"
 ), byrow = TRUE, ncol = 4)
 opt <- getopt(spec)

 # if help was asked for print a friendly message
 # and exit with a non-zero error code
 if (!is.null(opt$help)) {
-  cat(getopt(spec, usage = TRUE))
-  q(status = 1)
+    cat(getopt(spec, usage = TRUE))
+    q(status = 1)
 }

 # enforce the following required arguments
 if (is.null(opt$outfile)) {
-  cat("'outfile' is required\n")
-  q(status = 1)
+    cat("'outfile' is required\n")
+    q(status = 1)
 }
-if (is.null(opt$factors)) {
-  cat("'factors' is required\n")
-  q(status = 1)
+if (is.null(opt$factors) && is.null(opt$sample_sheet_mode)) {
+    cat("'factors' is required when not using sample_sheet_mode\n")
+    q(status = 1)
 }

 verbose <- is.null(opt$quiet)
@@ -101,17 +110,17 @@
 source_local("get_deseq_dataset.R")

 suppressPackageStartupMessages({
-  library("DESeq2")
-  library("RColorBrewer")
-  library("gplots")
+    library("DESeq2")
+    library("RColorBrewer")
+    library("gplots")
 })

 if (opt$cores > 1) {
-  library("BiocParallel")
-  register(MulticoreParam(opt$cores))
-  parallel <- TRUE
+    library("BiocParallel")
+    register(MulticoreParam(opt$cores))
+    parallel <- TRUE
 } else {
-  parallel <- FALSE
+    parallel <- FALSE
 }

 # build or read sample table
@@ -120,97 +129,341 @@

 # switch on if 'factors' was provided:
 library("rjson")
-parser <- newJSONParser()
-parser$addData(opt$factors)
-factor_list <- parser$getObject()
-filenames_to_labels <- fromJSON(opt$files_to_labels)
+
+if (!is.null(opt$sample_sheet_mode)) {
+    # Sample sheet mode: build factor_list from sample sheet
+    filenames_to_labels <- fromJSON(opt$files_to_labels)
+
+    # Read sample sheet
+    sample_sheet <- read.table(opt$sample_sheet, sep = "\t", header = TRUE, stringsAsFactors = FALSE)
+
+    # Parse collection files
+    collection_files <- strsplit(opt$collection_files, ",")[[1]]
+
+    # First column of sample sheet should contain sample identifiers
+    sample_id_col <- colnames(sample_sheet)[1]
+
+    if (!is.null(opt$custom_design_formula)) {
+        # Custom design formula mode
+        # Extract variable names from formula (remove ~, +, *, :, whitespace)
+        formula_str <- gsub("^~\\s*", "", opt$design_formula)
+        formula_vars <- unique(trimws(unlist(strsplit(formula_str, "[+*:]"))))
+
+        # Validate all variables exist in sample sheet
+        missing_vars <- setdiff(formula_vars, colnames(sample_sheet))
+        if (length(missing_vars) > 0) {
+            cat(paste0("Error: Variables not found in sample sheet: ", paste(missing_vars, collapse = ", "), "\n"))
+            cat(paste0("Available columns: ", paste(colnames(sample_sheet), collapse = ", "), "\n"))
+            q(status = 1)
+        }
+
+        # Use all formula variables as factor columns
+        factor_col_names <- formula_vars
+    } else {
+        # Automatic mode: Parse factor columns (comma-separated column numbers, 1-indexed)
+        factor_col_nums <- as.integer(strsplit(opt$factor_columns, ",")[[1]])
+        factor_col_names <- colnames(sample_sheet)[factor_col_nums]
+    }
+
+    # Validate sample sheet matches collection before building factor_list
+    # Get element identifiers from collection
+    collection_element_ids <- character(length(collection_files))
+    for (idx in seq_along(collection_files)) {
+        file <- collection_files[idx]
+        file_basename <- basename(file)
+        if (file_basename %in% names(filenames_to_labels)) {
+            collection_element_ids[idx] <- filenames_to_labels[[file_basename]]
+        } else {
+            cat("Error: Sample sheet validation failed\n")
+            cat(paste0("Collection file '", file_basename, "' does not have a corresponding element identifier.\n"))
+            cat("This is an internal error - please report this issue.\n")
+            q(status = 1)
+        }
+    }
+
+    # Get sample identifiers from sample sheet
+    sample_sheet_ids <- sample_sheet[[sample_id_col]]
+
+    # Check for mismatches
+    collection_not_in_sheet <- setdiff(collection_element_ids, sample_sheet_ids)
+    sheet_not_in_collection <- setdiff(sample_sheet_ids, collection_element_ids)
+
+    if (length(collection_not_in_sheet) > 0 || length(sheet_not_in_collection) > 0) {
+        cat("Error: Sample sheet does not match the input collection\n\n")
+
+        if (length(collection_not_in_sheet) > 0) {
+            cat("The following samples are in the collection but NOT in the sample sheet:\n")
+            for (id in collection_not_in_sheet) {
+                cat(paste0("  - ", id, "\n"))
+            }
+            cat("\n")
+        }
+
+        if (length(sheet_not_in_collection) > 0) {
+            cat("The following samples are in the sample sheet but NOT in the collection:\n")
+            for (id in sheet_not_in_collection) {
+                cat(paste0("  - ", id, "\n"))
+            }
+            cat("\n")
+        }
+
+        cat("Please ensure that:\n")
+        cat(paste0("1. The first column ('", sample_id_col, "') of the sample sheet contains sample identifiers\n"))
+        cat("2. These identifiers exactly match the element identifiers in your collection\n")
+        cat("3. All samples in the collection are listed in the sample sheet\n")
+        cat("4. All samples in the sample sheet exist in the collection\n")
+        q(status = 1)
+    }
+
+    # Determine which factor will be the primary factor for contrasts
+    # In custom mode, the primary factor is the last one in the formula (rightmost term)
+    # In automatic mode, the first selected factor is primary (formula gets reversed)
+    if (!is.null(opt$custom_design_formula)) {
+        primary_factor_index <- length(factor_col_names)
+    } else {
+        primary_factor_index <- 1
+    }
+
+    # Build factor_list structure
+    factor_list <- list()
+    for (i in seq_along(factor_col_names)) {
+        factor_name <- factor_col_names[i]
+
+        # Group files by factor level, preserving the order of first appearance
+        level_to_files <- list()
+        level_order <- character(0) # Track order of first appearance
+        for (file in collection_files) {
+            element_id <- filenames_to_labels[[basename(file)]]
+            # Find matching row in sample sheet
+            matching_row <- which(sample_sheet[[sample_id_col]] == element_id)
+            if (length(matching_row) > 0) {
+                level <- sample_sheet[[factor_name]][matching_row[1]]
+                if (!(level %in% names(level_to_files))) {
+                    level_to_files[[level]] <- character(0)
+                    level_order <- c(level_order, level) # Record order of first appearance
+                }
+                level_to_files[[level]] <- c(level_to_files[[level]], file)
+            }
+        }
+
+        # Get all levels in order of first appearance (not alphabetically sorted)
+        all_levels <- level_order
+
+        # Handle reference and target levels for the primary contrast factor
+        # In custom mode: this is the LAST factor in the formula
+        # In automatic mode: this is the FIRST factor selected (formula will be reversed)
+        if (i == primary_factor_index) {
+            # This is the primary factor for contrasts: handle reference/target
+            # Determine reference level
+            if (!is.null(opt$reference_level) && nchar(opt$reference_level) > 0) {
+                ref_level <- trim(opt$reference_level)
+                # Validate that reference level exists
+                if (!(ref_level %in% all_levels)) {
+                    cat(paste0("Error: Reference level '", ref_level, "' not found in factor '", factor_name, "'. Available levels: ", paste(all_levels, collapse = ", "), "\n"))
+                    q(status = 1)
+                }
+            } else {
+                # Default: use first level encountered in sample sheet as reference
+                ref_level <- all_levels[1]
+            }
+
+            # Build factor levels with reference first, then others
+            # This applies to both automatic and custom modes
+            if (!is.null(opt$target_level) && nchar(opt$target_level) > 0) {
+                target_level <- trim(opt$target_level)
+                # Validate that target level exists
+                if (!(target_level %in% all_levels)) {
+                    cat(paste0("Error: Target level '", target_level, "' not found in factor '", factor_name, "'. Available levels: ", paste(all_levels, collapse = ", "), "\n"))
+                    q(status = 1)
+                }
+                # Explicit target: reference first, target second
+                levels_to_use <- c(ref_level, target_level)
+            } else {
+                # Reference first, then others in order of appearance
+                other_levels <- all_levels[all_levels != ref_level]
+                levels_to_use <- c(ref_level, other_levels)
+                # Target is the first non-reference level
+                target_level <- other_levels[1]
+            }
+
+            # Store reference and target levels for later contrast specification
+            # This ensures we use the intended levels, not whatever order files happened to be in
+            primary_ref_level <- ref_level
+            primary_target_level <- target_level
+        } else {
+            # For secondary factors, use all levels in order of appearance
+            levels_to_use <- all_levels
+        }
+
+        # Build factor structure in the order specified by levels_to_use
+        # Following standard DESeq2 convention: reference level is first
+        factor_levels <- list()
+        for (level in levels_to_use) {
+            if (level %in% names(level_to_files)) {
+                level_entry <- list()
+                level_entry[[level]] <- level_to_files[[level]]
+                factor_levels[[length(factor_levels) + 1]] <- level_entry
+            }
+        }
+
+        factor_list[[length(factor_list) + 1]] <- list(factor_name, factor_levels)
+    }
+
+    # Set primary_factor for sample sheet mode
+    # This is the factor that will be used for contrasts
+    if (!is.null(opt$custom_design_formula)) {
+        # Custom mode: primary factor is the last one in the formula (rightmost term)
+        primary_factor <- factor_col_names[length(factor_col_names)]
+    } else {
+        # Automatic mode: primary factor is the first selected factor (formula will be reversed)
+        primary_factor <- factor_col_names[1]
+    }
+
+    # Parse contrast_definition if in custom mode
+    if (!is.null(opt$custom_design_formula) && !is.null(opt$contrast_definition) && nchar(opt$contrast_definition) > 0) {
+        contrast_parts <- strsplit(opt$contrast_definition, ",")[[1]]
+        if (length(contrast_parts) != 3) {
+            cat("Error: contrast_definition must be in format 'factor,target,reference'\n")
+            q(status = 1)
+        }
+        custom_contrast <- list(
+            factor = trim(contrast_parts[1]),
+            target = trim(contrast_parts[2]),
+            reference = trim(contrast_parts[3])
+        )
+        # Validate the contrast
+        if (!(custom_contrast$factor %in% factor_col_names)) {
+            cat(paste0("Error: Contrast factor '", custom_contrast$factor, "' not found in design formula.\n"))
+            cat(paste0("Available factors: ", paste(factor_col_names, collapse = ", "), "\n"))
+            q(status = 1)
+        }
+    }
+} else {
+    # Original mode: factors provided directly
+    parser <- newJSONParser()
+    parser$addData(opt$factors)
+    factor_list <- parser$getObject()
+    filenames_to_labels <- fromJSON(opt$files_to_labels)
+
+    # For original mode, extract reference and target levels from the first factor
+    # In original mode: ref=level1 (denominator), target=level2 (numerator) -> log2(level2/level1)
+    # So we swap the naming to match the contrast direction used below
+    primary_factor_data <- factor_list[[1]]
+    primary_levels <- sapply(primary_factor_data[[2]], function(x) names(x))
+    primary_target_level <- primary_levels[1] # First level becomes "target" for the swap below
+    primary_ref_level <- if (length(primary_levels) >= 2) primary_levels[2] else primary_levels[1] # Second level becomes "ref"
+}
+
 factors <- sapply(factor_list, function(x) x[[1]])
-primary_factor <- factors[1]
+# For original mode (not sample_sheet_mode), set primary_factor to first factor
+# In sample_sheet_mode, it's already set correctly above
+if (is.null(opt$sample_sheet_mode)) {
+    primary_factor <- factors[1]
+}
 filenames_in <- unname(unlist(factor_list[[1]][[2]]))
 labs <- unname(unlist(filenames_to_labels[basename(filenames_in)]))
 sample_table <- data.frame(
-  sample = basename(filenames_in),
-  filename = filenames_in,
-  row.names = filenames_in,
-  stringsAsFactors = FALSE
+    sample = basename(filenames_in),
+    filename = filenames_in,
+    row.names = filenames_in,
+    stringsAsFactors = FALSE
 )
 for (factor in factor_list) {
-  factor_name <- trim(factor[[1]])
-  sample_table[[factor_name]] <- character(nrow(sample_table))
-  lvls <- sapply(factor[[2]], function(x) names(x))
-  for (i in seq_along(factor[[2]])) {
-    files <- factor[[2]][[i]][[1]]
-    sample_table[files, factor_name] <- trim(lvls[i])
-  }
-  sample_table[[factor_name]] <- factor(sample_table[[factor_name]], levels = lvls)
+    factor_name <- trim(factor[[1]])
+    sample_table[[factor_name]] <- character(nrow(sample_table))
+    lvls <- sapply(factor[[2]], function(x) names(x))
+    for (i in seq_along(factor[[2]])) {
+        files <- factor[[2]][[i]][[1]]
+        sample_table[files, factor_name] <- trim(lvls[i])
+    }
+    sample_table[[factor_name]] <- factor(sample_table[[factor_name]], levels = lvls)
+    # Explicitly set the reference level using relevel() for the primary factor in sample_sheet_mode
+    # This ensures DESeq2 knows the reference regardless of factor level order
+    # In original mode, we don't relevel because the order from factor_list is already correct
+    # Note: primary_factor is already set correctly in sample_sheet_mode before we get here
+    if (exists("primary_factor") && factor_name == primary_factor && exists("primary_ref_level") && !is.null(opt$sample_sheet_mode)) {
+        sample_table[[factor_name]] <- relevel(sample_table[[factor_name]], ref = primary_ref_level)
+    }
 }
 rownames(sample_table) <- labs

-design_formula <- as.formula(paste("~", paste(rev(factors), collapse = " + ")))
+# Build design formula
+if (!is.null(opt$custom_design_formula)) {
+    # Custom mode: use user-provided formula
+    design_formula <- as.formula(opt$design_formula)
+    # In original mode (not sample_sheet), set primary factor to last variable in formula
+    # In sample_sheet_mode, it's already set correctly above
+    if (is.null(opt$sample_sheet_mode)) {
+        primary_factor <- factors[length(factors)]
+    }
+} else {
+    # Automatic mode: build from selected factors
+    design_formula <- as.formula(paste("~", paste(rev(factors), collapse = " + ")))
+}

 # these are plots which are made once for each analysis
 generate_generic_plots <- function(dds, factors) {
-  library("ggplot2")
-  library("ggrepel")
-  library("pheatmap")
+    library("ggplot2")
+    library("ggrepel")
+    library("pheatmap")

-  rld <- rlog(dds)
-  p <- plotPCA(rld, intgroup = rev(factors))
-  print(p + geom_text_repel(aes_string(x = "PC1", y = "PC2", label = factor(colnames(dds))), size = 3)  + geom_point())
-  dat <- assay(rld)
-  dists_rl <- dist(t(dat))
-  mat <- as.matrix(dists_rl)
-  colors <- colorRampPalette(rev(brewer.pal(9, "Blues")))(255)
-  pheatmap(
-    mat,
-    clustering_distance_rows = dists_rl,
-    clustering_distance_cols = dists_rl,
-    col = colors,
-    main = "Sample-to-sample distances"
-  )
-  plotDispEsts(dds, main = "Dispersion estimates")
+    rld <- rlog(dds)
+    p <- plotPCA(rld, intgroup = rev(factors))
+    print(p + geom_text_repel(aes_string(x = "PC1", y = "PC2", label = factor(colnames(dds))), size = 3) + geom_point())
+    dat <- assay(rld)
+    dists_rl <- dist(t(dat))
+    mat <- as.matrix(dists_rl)
+    colors <- colorRampPalette(rev(brewer.pal(9, "Blues")))(255)
+    pheatmap(
+        mat,
+        clustering_distance_rows = dists_rl,
+        clustering_distance_cols = dists_rl,
+        col = colors,
+        main = "Sample-to-sample distances"
+    )
+    plotDispEsts(dds, main = "Dispersion estimates")
 }

 # these are plots which can be made for each comparison, e.g.
 # once for C vs A and once for B vs A
 generate_specific_plots <- function(res, threshold, title_suffix) {
-  use <- res$baseMean > threshold
-  if (sum(!use) == 0) {
-    h <- hist(res$pvalue, breaks = 0:50 / 50, plot = FALSE)
-    barplot(
-      height = h$counts,
-      col = "powderblue",
-      space = 0,
-      xlab = "p-values",
-      ylab = "frequency",
-      main = paste("Histogram of p-values for", title_suffix)
-    )
-    text(x = c(0, length(h$counts)), y = 0, label = paste(c(0, 1)), adj = c(0.5, 1.7), xpd = NA)
-  } else {
-    h1 <- hist(res$pvalue[!use], breaks = 0:50 / 50, plot = FALSE)
-    h2 <- hist(res$pvalue[use], breaks = 0:50 / 50, plot = FALSE)
-    colori <- c("filtered (low count)" = "khaki", "not filtered" = "powderblue")
-    barplot(
-      height = rbind(h1$counts, h2$counts),
-      beside = FALSE,
-      col = colori,
-      space = 0,
-      xlab = "p-values",
-      ylab = "frequency",
-      main = paste("Histogram of p-values for", title_suffix)
-    )
-    text(x = c(0, length(h1$counts)), y = 0, label = paste(c(0, 1)), adj = c(0.5, 1.7), xpd = NA)
-    legend("topright", fill = rev(colori), legend = rev(names(colori)), bg = "white")
-  }
+    use <- res$baseMean > threshold
+    if (sum(!use) == 0) {
+        h <- hist(res$pvalue, breaks = 0:50 / 50, plot = FALSE)
+        barplot(
+            height = h$counts,
+            col = "powderblue",
+            space = 0,
+            xlab = "p-values",
+            ylab = "frequency",
+            main = paste("Histogram of p-values for", title_suffix)
+        )
+        text(x = c(0, length(h$counts)), y = 0, label = paste(c(0, 1)), adj = c(0.5, 1.7), xpd = NA)
+    } else {
+        h1 <- hist(res$pvalue[!use], breaks = 0:50 / 50, plot = FALSE)
+        h2 <- hist(res$pvalue[use], breaks = 0:50 / 50, plot = FALSE)
+        colori <- c("filtered (low count)" = "khaki", "not filtered" = "powderblue")
+        barplot(
+            height = rbind(h1$counts, h2$counts),
+            beside = FALSE,
+            col = colori,
+            space = 0,
+            xlab = "p-values",
+            ylab = "frequency",
+            main = paste("Histogram of p-values for", title_suffix)
+        )
+        text(x = c(0, length(h1$counts)), y = 0, label = paste(c(0, 1)), adj = c(0.5, 1.7), xpd = NA)
+        legend("topright", fill = rev(colori), legend = rev(names(colori)), bg = "white")
+    }
     plotMA(res, main = paste("MA-plot for", title_suffix), ylim = range(res$log2FoldChange, na.rm = TRUE), alpha = opt$alpha_ma)
 }

 if (verbose) {
-  cat(paste("primary factor:", primary_factor, "\n"))
-  if (length(factors) > 1) {
-    cat(paste("other factors in design:", paste(factors[-length(factors)], collapse = ","), "\n"))
-  }
-  cat("\n---------------------\n")
+    cat(paste("primary factor:", primary_factor, "\n"))
+    if (length(factors) > 1) {
+        cat(paste("other factors in design:", paste(factors[-length(factors)], collapse = ","), "\n"))
+    }
+    cat("\n---------------------\n")
 }

 dds <- get_deseq_dataset(sample_table, header = opt$header, design_formula = design_formula, tximport = opt$tximport, txtype = opt$txtype, tx2gene = opt$tx2gene)
@@ -259,41 +512,49 @@
 }

 apply_batch_factors <- function(dds, batch_factors) {
-  rownames(batch_factors) <- batch_factors$identifier
-  batch_factors <- subset(batch_factors, select = -c(identifier, condition))
-  dds_samples <- colnames(dds)
-  batch_samples <- rownames(batch_factors)
-  if (!setequal(batch_samples, dds_samples)) {
-    stop("Batch factor names don't correspond to input sample names, check input files")
-  }
-  dds_data <- colData(dds)
-  # Merge dds_data with batch_factors using indexes, which are sample names
-  # Set sort to False, which maintains the order in dds_data
-  reordered_batch <- merge(dds_data, batch_factors, by.x = 0, by.y = 0, sort = FALSE)
-  batch_factors <- reordered_batch[, ncol(dds_data):ncol(reordered_batch)]
-  for (factor in colnames(batch_factors)) {
-    dds[[factor]] <- batch_factors[[factor]]
-  }
-  colnames(dds) <- reordered_batch[, 1]
-  return(dds)
+    rownames(batch_factors) <- batch_factors$identifier
+    batch_factors <- subset(batch_factors, select = -c(identifier, condition))
+    dds_samples <- colnames(dds)
+    batch_samples <- rownames(batch_factors)
+    if (!setequal(batch_samples, dds_samples)) {
+        stop("Batch factor names don't correspond to input sample names, check input files")
+    }
+    dds_data <- colData(dds)
+    # Merge dds_data with batch_factors using indexes, which are sample names
+    # Set sort to False, which maintains the order in dds_data
+    reordered_batch <- merge(dds_data, batch_factors, by.x = 0, by.y = 0, sort = FALSE)
+    batch_factors <- reordered_batch[, ncol(dds_data):ncol(reordered_batch)]
+    for (factor in colnames(batch_factors)) {
+        dds[[factor]] <- batch_factors[[factor]]
+    }
+    colnames(dds) <- reordered_batch[, 1]
+    return(dds)
 }

 if (!is.null(opt$batch_factors)) {
-  batch_factors <- read.table(opt$batch_factors, sep = "\t", header = TRUE)
-  dds <- apply_batch_factors(dds = dds, batch_factors = batch_factors)
-  batch_design <- colnames(batch_factors)[-c(1, 2)]
-  design_formula <- as.formula(paste("~", paste(c(batch_design, rev(factors)), collapse = " + ")))
-  design(dds) <- design_formula
+    batch_factors <- read.table(opt$batch_factors, sep = "\t", header = TRUE)
+    dds <- apply_batch_factors(dds = dds, batch_factors = batch_factors)
+    batch_design <- colnames(batch_factors)[-c(1, 2)]
+
+    if (!is.null(opt$custom_design_formula)) {
+        # Custom mode: prepend batch factors to user's formula
+        user_formula_rhs <- gsub("^~\\s*", "", opt$design_formula)
+        design_formula <- as.formula(paste("~", paste(c(batch_design, user_formula_rhs), collapse = " + ")))
+    } else {
+        # Automatic mode: prepend batch factors to generated formula
+        design_formula <- as.formula(paste("~", paste(c(batch_design, rev(factors)), collapse = " + ")))
+    }
+    design(dds) <- design_formula
 }

 if (verbose) {
-  cat("DESeq2 run information\n\n")
-  cat("sample table:\n")
-  print(sample_table[, -c(1:2), drop = FALSE])
-  cat("\ndesign formula:\n")
-  print(design_formula)
-  cat("\n\n")
-  cat(paste(ncol(dds), "samples with counts over", nrow(dds), "genes\n"))
+    cat("DESeq2 run information\n\n")
+    cat("sample table:\n")
+    print(sample_table[, -c(1:2), drop = FALSE])
+    cat("\ndesign formula:\n")
+    print(design_formula)
+    cat("\n\n")
+    cat(paste(ncol(dds), "samples with counts over", nrow(dds), "genes\n"))
 }

 # minimal pre-filtering
@@ -304,42 +565,43 @@

 # optional outlier behavior
 if (is.null(opt$outlier_replace_off)) {
-  min_rep <- 7
+    min_rep <- 7
 } else {
-  min_rep <- Inf
-  if (verbose) cat("outlier replacement off\n")
+    min_rep <- Inf
+    if (verbose) cat("outlier replacement off\n")
 }
 if (is.null(opt$outlier_filter_off)) {
-  cooks_cutoff <- TRUE
+    cooks_cutoff <- TRUE
 } else {
-  cooks_cutoff <- FALSE
-  if (verbose) cat("outlier filtering off\n")
+    cooks_cutoff <- FALSE
+    if (verbose) cat("outlier filtering off\n")
 }

 # optional automatic mean filtering
 if (is.null(opt$auto_mean_filter_off)) {
-  independent_filtering <- TRUE
+    independent_filtering <- TRUE
 } else {
-  independent_filtering <- FALSE
-  if (verbose) cat("automatic filtering on the mean off\n")
+    independent_filtering <- FALSE
+    if (verbose) cat("automatic filtering on the mean off\n")
 }

 # shrinkage of LFCs
 if (is.null(opt$use_beta_priors)) {
-  beta_prior <- FALSE
-  if (verbose)
-    cat("Applied default - beta prior off\n")
+    beta_prior <- FALSE
+    if (verbose) {
+        cat("Applied default - beta prior off\n")
+    }
 } else {
-  beta_prior <- opt$use_beta_priors
+    beta_prior <- opt$use_beta_priors
 }
 sprintf("use_beta_prior is set to %s", beta_prior)


 # dispersion fit type
 if (is.null(opt$fit_type)) {
-  fit_type <- "parametric"
+    fit_type <- "parametric"
 } else {
-  fit_type <- c("parametric", "local", "mean")[opt$fit_type]
+    fit_type <- c("parametric", "local", "mean")[opt$fit_type]
 }

 if (verbose) cat(paste("using disperion fit type:", fit_type, "\n"))
@@ -349,9 +611,9 @@

 # create the generic plots and leave the device open
 if (!is.null(opt$plots)) {
-  if (verbose) cat("creating plots\n")
-  pdf(opt$plots)
-  generate_generic_plots(dds, factors)
+    if (verbose) cat("creating plots\n")
+    pdf(opt$plots)
+    generate_generic_plots(dds, factors)
 }

 n <- nlevels(colData(dds)[[primary_factor]])
@@ -376,72 +638,105 @@


 if (is.null(opt$many_contrasts)) {
-  # only contrast the first and second level of the primary factor
-  ref <- all_levels[1]
-  lvl <- all_levels[2]
-  res <- results(
-    dds,
-    contrast = c(primary_factor, lvl, ref),
-    cooksCutoff = cooks_cutoff,
-    independentFiltering = independent_filtering
-  )
-  if (verbose) {
-    cat("summary of results\n")
-    cat(paste0(primary_factor, ": ", lvl, " vs ", ref, "\n"))
-    print(summary(res))
-  }
-  res_sorted <- res[order(res$padj), ]
-  out_df <- as.data.frame(res_sorted)
-  out_df$geneID <- rownames(out_df)  # nolint
-  out_df <- out_df[, c("geneID", "baseMean", "log2FoldChange", "lfcSE", "stat", "pvalue", "padj")]
-  filename <- opt$outfile
-  write.table(out_df, file = filename, sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE)
-  if (independent_filtering) {
-    threshold <- unname(attr(res, "filterThreshold"))
-  } else {
-    threshold <- 0
-  }
-  title_suffix <- paste0(primary_factor, ": ", lvl, " vs ", ref)
-  if (!is.null(opt$plots)) {
-    generate_specific_plots(res, threshold, title_suffix)
-  }
-} else {
-  # rotate through the possible contrasts of the primary factor
-  # write out a sorted table of results with the contrast as a suffix
-  # add contrast specific plots to the device
-  for (i in seq_len(n - 1)) {
-    ref <- all_levels[i]
-    contrast_levels <- all_levels[(i + 1):n]
-    for (lvl in contrast_levels) {
-      res <- results(
+    # Single contrast mode
+    if (!is.null(opt$custom_design_formula) && exists("custom_contrast")) {
+        # Use explicit custom contrast
+        ref <- custom_contrast$reference
+        lvl <- custom_contrast$target
+        contrast_factor <- custom_contrast$factor
+    } else {
+        # Default: contrast using stored reference and target levels
+        # This ensures we use the intended reference, not whatever order files happened to be in
+
+        # Check if we have more than 2 levels without explicit target
+        if (length(all_levels) > 2 && is.null(opt$target_level)) {
+            cat("Error: Multiple factor levels detected without explicit target_level specification.\n")
+            cat(paste0("Factor '", primary_factor, "' has ", length(all_levels), " levels: ", paste(all_levels, collapse = ", "), "\n"))
+            cat("Please either:\n")
+            cat("  1. Specify target_level to indicate which level to compare against the reference, or\n")
+            cat("  2. Use many_contrasts mode to compare all levels pairwise\n")
+            q(status = 1)
+        }
+
+        # Use stored reference and target levels
+        # contrast = c(factor, level1, level2) computes log2(level1 / level2)
+        # In original mode: variable names were swapped for backward compatibility
+        # In sample_sheet_mode: use them directly (ref=denominator, target=numerator)
+        if (!is.null(opt$sample_sheet_mode)) {
+            # Sample sheet mode: use natural order
+            ref <- primary_ref_level # Reference is denominator
+            lvl <- primary_target_level # Target is numerator
+        } else {
+            # Original mode: swap for backward compatibility
+            ref <- primary_target_level # This becomes the denominator
+            lvl <- primary_ref_level # This becomes the numerator
+        }
+        contrast_factor <- primary_factor
+    }
+
+    res <- results(
         dds,
-        contrast = c(primary_factor, lvl, ref),
+        contrast = c(contrast_factor, lvl, ref),
         cooksCutoff = cooks_cutoff,
         independentFiltering = independent_filtering
-      )
-      res_sorted <- res[order(res$padj), ]
-      out_df <- as.data.frame(res_sorted)
-      out_df$geneID <- rownames(out_df)  # nolint
-      out_df <- out_df[, c("geneID", "baseMean", "log2FoldChange", "lfcSE", "stat", "pvalue", "padj")]
-      filename <- paste0(primary_factor, "_", lvl, "_vs_", ref)
-      write.table(out_df, file = filename, sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE)
-      if (independent_filtering) {
+    )
+    if (verbose) {
+        cat("summary of results\n")
+        cat(paste0(contrast_factor, ": ", lvl, " vs ", ref, "\n"))
+        print(summary(res))
+    }
+    res_sorted <- res[order(res$padj), ]
+    out_df <- as.data.frame(res_sorted)
+    out_df$geneID <- rownames(out_df) # nolint
+    out_df <- out_df[, c("geneID", "baseMean", "log2FoldChange", "lfcSE", "stat", "pvalue", "padj")]
+    filename <- opt$outfile
+    write.table(out_df, file = filename, sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE)
+    if (independent_filtering) {
         threshold <- unname(attr(res, "filterThreshold"))
-      } else {
+    } else {
         threshold <- 0
-      }
-      title_suffix <- paste0(primary_factor, ": ", lvl, " vs ", ref)
-      if (!is.null(opt$plots)) {
+    }
+    title_suffix <- paste0(primary_factor, ": ", lvl, " vs ", ref)
+    if (!is.null(opt$plots)) {
         generate_specific_plots(res, threshold, title_suffix)
-      }
     }
-  }
+} else {
+    # rotate through the possible contrasts of the primary factor
+    # write out a sorted table of results with the contrast as a suffix
+    # add contrast specific plots to the device
+    for (i in seq_len(n - 1)) {
+        ref <- all_levels[i]
+        contrast_levels <- all_levels[(i + 1):n]
+        for (lvl in contrast_levels) {
+            res <- results(
+                dds,
+                contrast = c(primary_factor, lvl, ref),
+                cooksCutoff = cooks_cutoff,
+                independentFiltering = independent_filtering
+            )
+            res_sorted <- res[order(res$padj), ]
+            out_df <- as.data.frame(res_sorted)
+            out_df$geneID <- rownames(out_df) # nolint
+            out_df <- out_df[, c("geneID", "baseMean", "log2FoldChange", "lfcSE", "stat", "pvalue", "padj")]
+            filename <- paste0(primary_factor, "_", lvl, "_vs_", ref)
+            write.table(out_df, file = filename, sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE)
+            if (independent_filtering) {
+                threshold <- unname(attr(res, "filterThreshold"))
+            } else {
+                threshold <- 0
+            }
+            title_suffix <- paste0(primary_factor, ": ", lvl, " vs ", ref)
+            if (!is.null(opt$plots)) {
+                generate_specific_plots(res, threshold, title_suffix)
+            }
+        }
+    }
 }

 # close the plot device
 if (!is.null(opt$plots)) {
-  cat("closing plot device\n")
-  dev.off()
+    cat("closing plot device\n")
+    dev.off()
 }

 cat("Session information:\n\n")
--- a/deseq2.xml	Tue Jul 18 14:58:28 2023 +0000
+++ b/deseq2.xml	Tue Dec 09 17:43:58 2025 +0000
@@ -7,6 +7,7 @@
     <expand macro='xrefs'/>
     <expand macro='requirements'/>
     <stdio>
+        <exit_code range="1:" level="fatal" />
         <regex match="Execution halted"
            source="both"
            level="fatal"
@@ -58,32 +59,58 @@
     #end if
     #set $filename_to_element_identifiers = {}
     #set $temp_factor_names = list()
-    #for $factor in $select_data.rep_factorName:
-        #set $temp_factor = list()
-        #for $level in $factor.rep_factorLevel:
-            #set $count_files = list()
-            #if $select_data.how == 'group_tags':
-                #for $group in $level.groups.value:
-                    #for $file in $select_data.countsFile.get_datasets_for_group($group):
+    #if $select_data.how == 'sample_sheet_contrasts':
+        ## Pass collection file paths and element identifiers for sample sheet mode
+        #set $count_files = list()
+        #for $file in $select_data.countsFile:
+            $count_files.append(str($file))
+            $filename_to_element_identifiers.__setitem__(os.path.basename(str($file)), $file.element_identifier)
+        #end for
+        --sample_sheet_mode
+        --sample_sheet '$select_data.sample_sheet'
+        --collection_files '#echo ",".join($count_files)#'
+        #if $select_data.design_formula_mode.mode == 'custom':
+            --custom_design_formula
+            --design_formula '$select_data.design_formula_mode.design_formula'
+        #else:
+            --factor_columns '$select_data.design_formula_mode.factor'
+        #end if
+        #if str($select_data.design_formula_mode.reference_level).strip():
+            --reference_level '${select_data.design_formula_mode.reference_level.strip()}'
+        #end if
+        #if str($select_data.design_formula_mode.target_level).strip():
+            --target_level '${select_data.design_formula_mode.target_level.strip()}'
+        #end if
+    #else:
+        #for $factor in $select_data.rep_factorName:
+            #set $temp_factor = list()
+            #for $level in $factor.rep_factorLevel:
+                #set $count_files = list()
+                #if $select_data.how == 'group_tags':
+                    #for $group in $level.groups.value:
+                        #for $file in $select_data.countsFile.get_datasets_for_group($group):
+                            $count_files.append(str($file))
+                            $filename_to_element_identifiers.__setitem__(os.path.basename(str($file)),  $file.element_identifier)
+                        #end for
+                    #end for
+                #else:
+                    #for $file in $level.countsFile:
                         $count_files.append(str($file))
                         $filename_to_element_identifiers.__setitem__(os.path.basename(str($file)),  $file.element_identifier)
                     #end for
-                #end for
-            #else:
-                #for $file in $level.countsFile:
-                    $count_files.append(str($file))
-                    $filename_to_element_identifiers.__setitem__(os.path.basename(str($file)),  $file.element_identifier)
-                #end for
-            #end if
-            $temp_factor.append( {str($level.factorLevel): $count_files} )
+                #end if
+                $temp_factor.append( {str($level.factorLevel): $count_files} )
+            #end for
+            $temp_factor.reverse()
+            $temp_factor_names.append([str($factor.factorName), $temp_factor])
         #end for
-        $temp_factor.reverse()
-        $temp_factor_names.append([str($factor.factorName), $temp_factor])
-    #end for
+    #end if

     $header

-    -f '#echo json.dumps(temp_factor_names)#'
+    #if $select_data.how != 'sample_sheet_contrasts':
+        -f '#echo json.dumps(temp_factor_names)#'
+    #end if
     -l '#echo json.dumps(filename_to_element_identifiers)#'
     #if $advanced_options.esf_cond.esf:
         #if $advanced_options.esf_cond.esf == "user":
@@ -100,7 +127,7 @@
         $advanced_options.prefilter_conditional.prefilter
         -V $advanced_options.prefilter_conditional.prefilter_value
     #end if
-
+
     $advanced_options.outlier_replace_off
     $advanced_options.outlier_filter_off
     $advanced_options.auto_mean_filter_off
@@ -124,6 +151,7 @@
             <param name="how" type="select">
                 <option value="datasets_per_level">Select datasets per level</option>
                 <option value="group_tags">Select group tags corresponding to levels</option>
+                <option value="sample_sheet_contrasts">Select contrasts from sample sheet columns</option>
             </param>
             <when value="group_tags">
                 <param name="countsFile" type="data_collection" format="tabular" label="Count file(s) collection" multiple="true"/>
@@ -136,6 +164,39 @@
                     <param name="countsFile" type="data" format="tabular" multiple="true" label="Counts file(s)"/>
                 </expand>
             </when>
+            <when value="sample_sheet_contrasts">
+                <param name="countsFile" type="data_collection" format="tabular" label="Count file(s) collection" multiple="true"/>
+                <param name="sample_sheet" type="data" format="tabular" label="Sample sheet file" help="A tabular file containing at least two columns: one with the element identifiers (matching those in the collection) and one or more columns with factor levels."/>
+                <conditional name="design_formula_mode">
+                    <param name="mode" type="select" label="Design formula specification">
+                        <option value="automatic" selected="true">Automatic (select factors from columns)</option>
+                        <option value="custom">Custom design formula</option>
+                    </param>
+                    <when value="automatic">
+                        <param name="factor" type="data_column" use_header_names="true" data_ref="sample_sheet" multiple="true" optional="false" label="Factor levels column(s)" help="Select one or more columns from the sample sheet to define factor levels for DESeq2 analysis. First selected factor is the primary factor"/>
+                        <param name="reference_level" type="text" label="Reference level for primary factor" help="Specify the reference level for the primary factor. If left blank, the first encountered level of the primary factor will be used as reference." optional="true"/>
+                        <param name="target_level" type="text" label="Target level for primary factor" help="Specify the target level for the primary factor. If left blank, all levels will be compared against the primary factor." optional="true"/>
+                     </when>
+                    <when value="custom">
+                        <param name="design_formula" type="text" label="Design formula" help="Specify the design formula using R formula syntax (e.g., '~ condition' or '~ batch + condition'). Column names must match those in the sample sheet. The last factor in the formula is used as the primary factor for contrasts.">
+                            <validator type="regex" message="Design formula must start with ~ and contain valid R variable names">^~\s*[a-zA-Z][a-zA-Z0-9_.]*(\s*[+*:]\s*[a-zA-Z][a-zA-Z0-9_.]*)*$</validator>
+                            <sanitizer>
+                                <valid initial="string.letters,string.digits">
+                                    <add value="~"/>
+                                    <add value=":"/>
+                                    <add value="+"/>
+                                    <add value="*"/>
+                                    <add value="_"/>
+                                    <add value="."/>
+                                    <add value=" "/>
+                                </valid>
+                            </sanitizer>
+                        </param>
+                        <param name="reference_level" type="text" label="Reference level for primary factor" help="Specify the reference level for the primary factor. If left blank, the first encountered level of the primary factor will be used as reference." optional="true"/>
+                        <param name="target_level" type="text" label="Target level for primary factor" help="Specify the target level for the primary factor. If left blank, all levels will be compared against the primary factor." optional="true"/>
+                    </when>
+                </conditional>
+            </when>
         </conditional>

         <param name="batch_factors" type="data" format="tabular" optional="true" label="(Optional) provide a tabular file with additional batch factors to include in the model." help="You can produce this file using RUVSeq or svaseq."/>
@@ -209,9 +270,9 @@
                 label="Turn off independent filtering"
                 help=" DESeq2 performs independent filtering by default using the mean of normalized counts as a filter statistic"/>
             <conditional name="prefilter_conditional">
-                <param name="prefilter" type="select" label="Perform pre-filtering" help="While it is not necessary to pre-filter
-                    low count genes before running the DESeq2 functions, there are two reasons which make pre-filtering useful:
-                    by removing rows in which there are very few reads, we reduce the required memory, and we increase the speed.
+                <param name="prefilter" type="select" label="Perform pre-filtering" help="While it is not necessary to pre-filter
+                    low count genes before running the DESeq2 functions, there are two reasons which make pre-filtering useful:
+                    by removing rows in which there are very few reads, we reduce the required memory, and we increase the speed.
                     It can also improve visualizations, as features with no information for differential expression are not plotted.">
                     <option value="-P">Enabled</option>
                     <option value="" selected="true">Disabled</option>
@@ -265,7 +326,7 @@
         </data>
     </outputs>
     <tests>
-        <!--Ensure counts files with header works -->
+        <!-- Ensure counts files with header works -->
         <test expect_num_outputs="4">
             <repeat name="rep_factorName">
                 <param name="factorName" value="Treatment"/>
@@ -309,7 +370,7 @@
                 </assert_contents>
             </output>
         </test>
-        <!--Ensure additional batch factor correction works -->
+        <!-- Ensure additional batch factor correction works -->
         <test expect_num_outputs="2">
             <repeat name="rep_factorName">
                 <param name="factorName" value="Treatment"/>
@@ -361,7 +422,7 @@
                 </assert_contents>
             </output>
         </test>
-        <!--Ensure counts files without header works -->
+        <!-- Ensure counts files without header works -->
         <test expect_num_outputs="4">
             <repeat name="rep_factorName">
                 <param name="factorName" value="Treatment"/>
@@ -380,7 +441,7 @@
             </section>
             <section name="output_options">
                 <param name="output_selector" value="normCounts,normRLog,normVST"/>
-            </section>
+            </section>
             <output name="counts_out">
                 <assert_contents>
                     <has_text_matching expression="GSM461176_untreat_single.counts.noheader\tGSM461177_untreat_paired.counts.noheader\tGSM461178_untreat_paired.counts.noheader\tGSM461182_untreat_single.counts.noheader\tGSM461179_treat_single.counts.noheader\tGSM461180_treat_paired.counts.noheader\tGSM461181_treat_paired.counts.noheader"/>
@@ -405,7 +466,7 @@
                 </assert_contents>
             </output>
         </test>
-        <!--Ensure Sailfish/Salmon input with tx2gene table works-->
+        <!-- Ensure Sailfish/Salmon input with tx2gene table works -->
         <test expect_num_outputs="1">
             <repeat name="rep_factorName">
                 <param name="factorName" value="Treatment"/>
@@ -423,7 +484,7 @@
             </section>
             <section name="output_options">
                 <param name="output_selector" value=""/>
-            </section>
+            </section>
             <param name="tximport_selector" value="tximport"/>
             <param name="txtype" value="sailfish"/>
             <param name="mapping_format_selector" value="tabular"/>
@@ -434,7 +495,7 @@
                 </assert_contents>
             </output>
         </test>
-        <!--Ensure Sailfish/Salmon input with GFF3 annotation from NCBI works-->
+        <!-- Ensure Sailfish/Salmon input with GFF3 annotation from NCBI works -->
         <test expect_num_outputs="1">
             <repeat name="rep_factorName">
                 <param name="factorName" value="Treatment"/>
@@ -452,7 +513,7 @@
             </section>
             <section name="output_options">
                 <param name="output_selector" value=""/>
-            </section>
+            </section>
             <param name="tximport_selector" value="tximport"/>
             <param name="txtype" value="sailfish"/>
             <param name="mapping_format_selector" value="gtf"/>
@@ -463,7 +524,7 @@
                 </assert_contents>
             </output>
         </test>
-        <!--Ensure Sailfish/Salmon input with GTF annotation from Ensembl works-->
+        <!-- Ensure Sailfish/Salmon input with GTF annotation from Ensembl works -->
         <test expect_num_outputs="1">
             <repeat name="rep_factorName">
                 <param name="factorName" value="Treatment"/>
@@ -481,7 +542,7 @@
             </section>
             <section name="output_options">
                 <param name="output_selector" value=""/>
-            </section>
+            </section>
             <param name="tximport_selector" value="tximport"/>
             <param name="txtype" value="sailfish"/>
             <param name="mapping_format_selector" value="gtf"/>
@@ -492,7 +553,7 @@
                 </assert_contents>
             </output>
         </test>
-        <!--Ensure group tags can be used to select factor levels -->
+        <!-- Ensure group tags can be used to select factor levels -->
         <test expect_num_outputs="1">
             <param name="select_data|how" value="group_tags"/>
             <param name="select_data|countsFile">
@@ -521,7 +582,7 @@
             </section>
             <section name="output_options">
                 <param name="output_selector" value=""/>
-            </section>
+            </section>
             <param name="tximport_selector" value="tximport"/>
             <param name="txtype" value="sailfish"/>
             <param name="mapping_format_selector" value="tabular"/>
@@ -532,7 +593,7 @@
                 </assert_contents>
             </output>
         </test>
-        <!--Ensure many_contrasts produces output collection -->
+        <!-- Ensure many_contrasts produces output collection -->
         <test expect_num_outputs="1">
             <param name="select_data|how" value="group_tags"/>
             <param name="select_data|countsFile">
@@ -561,7 +622,7 @@
             </section>
             <section name="output_options">
                 <param name="output_selector" value="many_contrasts"/>
-            </section>
+            </section>
             <param name="tximport_selector" value="tximport"/>
             <param name="txtype" value="sailfish"/>
             <param name="mapping_format_selector" value="tabular"/>
@@ -574,7 +635,7 @@
                 </element>
             </output_collection>
         </test>
-        <!--Test alpha_ma option-->
+        <!-- Test alpha_ma option -->
         <test expect_num_outputs="1">
             <repeat name="rep_factorName">
                 <param name="factorName" value="Treatment"/>
@@ -593,7 +654,7 @@
             <section name="output_options">
                 <param name="output_selector" value=""/>
                 <param name="alpha_ma" value="0.05"/>
-            </section>
+            </section>
             <param name="tximport_selector" value="tximport"/>
             <param name="txtype" value="sailfish"/>
             <param name="mapping_format_selector" value="gtf"/>
@@ -632,7 +693,7 @@
                 </assert_contents>
             </output>
         </test>
-        <!--Test alpha_ma option, but with user-provided size factors -->
+        <!-- Test alpha_ma option, but with user-provided size factors -->
         <test expect_num_outputs="1">
             <repeat name="rep_factorName">
                 <param name="factorName" value="Treatment"/>
@@ -666,7 +727,7 @@
                 </assert_contents>
             </output>
         </test>
-        <!-- Same as above alpha_ma size factor test, but with a non-default estimator-->
+        <!-- Same as above alpha_ma size factor test, but with a non-default estimator -->
         <test expect_num_outputs="2">
             <repeat name="rep_factorName">
                 <param name="factorName" value="Treatment"/>
@@ -697,7 +758,7 @@
                 </assert_contents>
             </output>
         </test>
-        <!--Test prefilter parameter -->
+        <!-- Test prefilter parameter -->
         <test expect_num_outputs="2">
             <repeat name="rep_factorName">
                 <param name="factorName" value="Treatment"/>
@@ -730,6 +791,199 @@
                 </assert_contents>
             </output>
         </test>
+
+        <!--Test sample_sheet_contrasts mode -->
+        <test expect_num_outputs="2">
+            <param name="select_data|how" value="sample_sheet_contrasts"/>
+            <param name="select_data|countsFile">
+                <collection type="list">
+                    <element name="GSM461179_treat_single.counts" value="GSM461179_treat_single.counts"/>
+                    <element name="GSM461180_treat_paired.counts" value="GSM461180_treat_paired.counts"/>
+                    <element name="GSM461181_treat_paired.counts" value="GSM461181_treat_paired.counts"/>
+                    <element name="GSM461176_untreat_single.counts" value="GSM461176_untreat_single.counts"/>
+                    <element name="GSM461177_untreat_paired.counts" value="GSM461177_untreat_paired.counts"/>
+                    <element name="GSM461178_untreat_paired.counts" value="GSM461178_untreat_paired.counts"/>
+                    <element name="GSM461182_untreat_single.counts" value="GSM461182_untreat_single.counts"/>
+                </collection>
+            </param>
+            <param name="select_data|sample_sheet" value="sample_sheet.tab"/>
+            <param name="select_data|design_formula_mode|mode" value="automatic"/>
+            <param name="select_data|design_formula_mode|factor" value="2"/>
+            <section name="advanced_options">
+                <param name="use_beta_priors" value="1"/>
+            </section>
+            <section name="output_options">
+                <param name="output_selector" value="normCounts"/>
+            </section>
+            <output name="counts_out">
+                <assert_contents>
+                    <has_text_matching expression="GSM461179_treat_single.counts\tGSM461180_treat_paired.counts\tGSM461181_treat_paired.counts\tGSM461176_untreat_single.counts\tGSM461177_untreat_paired.counts\tGSM461178_untreat_paired.counts\tGSM461182_untreat_single.counts"/>
+                    <has_text_matching expression="FBgn0000003\t0\t0\t0\t0\t0\t0\t0"/>
+                </assert_contents>
+            </output>
+            <output name="deseq_out">
+                <assert_contents>
+                    <!-- Contrast direction: first level (Treated) is reference, comparing Untreated vs Treated -->
+                    <has_text_matching expression="FBgn0003360\t1933\.9504.*\t2\.8399.*\t0\.1309.*\t21\.68.*\t.*e-104\t.*e-101"/>
+                    <has_n_lines n="3999"/>
+                </assert_contents>
+            </output>
+        </test>
+        <!--Test sample_sheet_contrasts mode with explicit reference_level -->
+        <test expect_num_outputs="2">
+            <param name="select_data|how" value="sample_sheet_contrasts"/>
+            <param name="select_data|countsFile">
+                <collection type="list">
+                    <element name="GSM461179_treat_single.counts" value="GSM461179_treat_single.counts"/>
+                    <element name="GSM461180_treat_paired.counts" value="GSM461180_treat_paired.counts"/>
+                    <element name="GSM461181_treat_paired.counts" value="GSM461181_treat_paired.counts"/>
+                    <element name="GSM461176_untreat_single.counts" value="GSM461176_untreat_single.counts"/>
+                    <element name="GSM461177_untreat_paired.counts" value="GSM461177_untreat_paired.counts"/>
+                    <element name="GSM461178_untreat_paired.counts" value="GSM461178_untreat_paired.counts"/>
+                    <element name="GSM461182_untreat_single.counts" value="GSM461182_untreat_single.counts"/>
+                </collection>
+            </param>
+            <param name="select_data|sample_sheet" value="sample_sheet.tab"/>
+            <param name="select_data|design_formula_mode|mode" value="automatic"/>
+            <param name="select_data|design_formula_mode|factor" value="2"/>
+            <param name="select_data|design_formula_mode|reference_level" value="Untreated"/>
+            <section name="advanced_options">
+                <param name="use_beta_priors" value="1"/>
+            </section>
+            <section name="output_options">
+                <param name="output_selector" value="normCounts"/>
+            </section>
+            <output name="counts_out">
+                <assert_contents>
+                    <has_text_matching expression="GSM461176_untreat_single.counts\tGSM461177_untreat_paired.counts\tGSM461178_untreat_paired.counts\tGSM461182_untreat_single.counts\tGSM461179_treat_single.counts\tGSM461180_treat_paired.counts\tGSM461181_treat_paired.counts"/>
+                    <has_text_matching expression="FBgn0000003\t0\t0\t0\t0\t0\t0\t0"/>
+                </assert_contents>
+            </output>
+            <output name="deseq_out">
+                <assert_contents>
+                    <!-- With reference_level=Untreated, compares Treated vs Untreated = log2(Treated/Untreated) -->
+                    <has_text_matching expression="FBgn0003360\t1933\.9504.*\t-2\.8399.*\t0\.1309.*\t-21\.68.*\t.*e-104\t.*e-101"/>
+                    <has_n_lines n="3999"/>
+                </assert_contents>
+            </output>
+        </test>
+        <!--Test sample_sheet_contrasts mode with reference_level and target_level -->
+        <test expect_num_outputs="2">
+            <param name="select_data|how" value="sample_sheet_contrasts"/>
+            <param name="select_data|countsFile">
+                <collection type="list">
+                    <element name="GSM461179_treat_single.counts" value="GSM461179_treat_single.counts"/>
+                    <element name="GSM461180_treat_paired.counts" value="GSM461180_treat_paired.counts"/>
+                    <element name="GSM461181_treat_paired.counts" value="GSM461181_treat_paired.counts"/>
+                    <element name="GSM461176_untreat_single.counts" value="GSM461176_untreat_single.counts"/>
+                    <element name="GSM461177_untreat_paired.counts" value="GSM461177_untreat_paired.counts"/>
+                    <element name="GSM461178_untreat_paired.counts" value="GSM461178_untreat_paired.counts"/>
+                    <element name="GSM461182_untreat_single.counts" value="GSM461182_untreat_single.counts"/>
+                </collection>
+            </param>
+            <param name="select_data|sample_sheet" value="sample_sheet.tab"/>
+            <param name="select_data|design_formula_mode|mode" value="automatic"/>
+            <param name="select_data|design_formula_mode|factor" value="2"/>
+            <param name="select_data|design_formula_mode|reference_level" value="Untreated"/>
+            <param name="select_data|design_formula_mode|target_level" value="Treated"/>
+            <section name="advanced_options">
+                <param name="use_beta_priors" value="1"/>
+            </section>
+            <section name="output_options">
+                <param name="output_selector" value="normCounts"/>
+            </section>
+            <output name="counts_out">
+                <assert_contents>
+                    <has_text_matching expression="GSM461176_untreat_single.counts\tGSM461177_untreat_paired.counts\tGSM461178_untreat_paired.counts\tGSM461182_untreat_single.counts\tGSM461179_treat_single.counts\tGSM461180_treat_paired.counts\tGSM461181_treat_paired.counts"/>
+                    <has_text_matching expression="FBgn0000003\t0\t0\t0\t0\t0\t0\t0"/>
+                </assert_contents>
+            </output>
+            <output name="deseq_out">
+                <assert_contents>
+                    <!-- With reference=Untreated and target=Treated, compares Treated vs Untreated -->
+                    <has_text_matching expression="FBgn0003360\t1933\.9504.*\t-2\.8399.*\t0\.1309.*\t-21\.68.*\t.*e-104\t.*e-101"/>
+                    <has_n_lines n="3999"/>
+                </assert_contents>
+            </output>
+        </test>
+        <!--Test sample_sheet_contrasts with custom design formula -->
+        <test expect_num_outputs="2">
+            <param name="select_data|how" value="sample_sheet_contrasts"/>
+            <param name="select_data|countsFile">
+                <collection type="list">
+                    <element name="GSM461179_treat_single.counts" value="GSM461179_treat_single.counts"/>
+                    <element name="GSM461180_treat_paired.counts" value="GSM461180_treat_paired.counts"/>
+                    <element name="GSM461181_treat_paired.counts" value="GSM461181_treat_paired.counts"/>
+                    <element name="GSM461176_untreat_single.counts" value="GSM461176_untreat_single.counts"/>
+                    <element name="GSM461177_untreat_paired.counts" value="GSM461177_untreat_paired.counts"/>
+                    <element name="GSM461178_untreat_paired.counts" value="GSM461178_untreat_paired.counts"/>
+                    <element name="GSM461182_untreat_single.counts" value="GSM461182_untreat_single.counts"/>
+                </collection>
+            </param>
+            <param name="select_data|sample_sheet" value="sample_sheet.tab"/>
+            <param name="select_data|design_formula_mode|mode" value="custom"/>
+            <param name="select_data|design_formula_mode|design_formula" value="~ treatment"/>
+            <param name="select_data|design_formula_mode|reference_level" value="Untreated"/>
+            <section name="advanced_options">
+                <param name="use_beta_priors" value="1"/>
+            </section>
+            <section name="output_options">
+                <param name="output_selector" value="normCounts"/>
+            </section>
+            <output name="counts_out">
+                <assert_contents>
+                    <has_text_matching expression="GSM461176_untreat_single.counts\tGSM461177_untreat_paired.counts\tGSM461178_untreat_paired.counts\tGSM461182_untreat_single.counts\tGSM461179_treat_single.counts\tGSM461180_treat_paired.counts\tGSM461181_treat_paired.counts"/>
+                    <has_text_matching expression="FBgn0000003\t0\t0\t0\t0\t0\t0\t0"/>
+                </assert_contents>
+            </output>
+            <output name="deseq_out">
+                <assert_contents>
+                    <!-- Custom formula ~treatment with reference=Untreated, compares Treated vs Untreated -->
+                    <has_text_matching expression="FBgn0003360\t1933\.9504.*\t-2\.8399.*\t0\.1309.*\t-21\.68.*\t.*e-104\t.*e-101"/>
+                    <has_n_lines n="3999"/>
+                </assert_contents>
+            </output>
+        </test>
+        <!--Test sample_sheet_contrasts with multi-factor custom design formula -->
+        <test expect_num_outputs="2">
+            <param name="select_data|how" value="sample_sheet_contrasts"/>
+            <param name="select_data|countsFile">
+                <collection type="list">
+                    <element name="GSM461179_treat_single.counts" value="GSM461179_treat_single.counts"/>
+                    <element name="GSM461180_treat_paired.counts" value="GSM461180_treat_paired.counts"/>
+                    <element name="GSM461181_treat_paired.counts" value="GSM461181_treat_paired.counts"/>
+                    <element name="GSM461176_untreat_single.counts" value="GSM461176_untreat_single.counts"/>
+                    <element name="GSM461177_untreat_paired.counts" value="GSM461177_untreat_paired.counts"/>
+                    <element name="GSM461178_untreat_paired.counts" value="GSM461178_untreat_paired.counts"/>
+                    <element name="GSM461182_untreat_single.counts" value="GSM461182_untreat_single.counts"/>
+                </collection>
+            </param>
+            <param name="select_data|sample_sheet" value="sample_sheet.tab"/>
+            <param name="select_data|design_formula_mode|mode" value="custom"/>
+            <param name="select_data|design_formula_mode|design_formula" value="~ sequencing + treatment"/>
+            <param name="select_data|design_formula_mode|reference_level" value="Untreated"/>
+            <section name="advanced_options">
+                <param name="use_beta_priors" value="1"/>
+            </section>
+            <section name="output_options">
+                <param name="output_selector" value="normCounts"/>
+            </section>
+            <output name="counts_out">
+                <assert_contents>
+                    <!-- Verify all 7 samples present with correct values (column order may vary) -->
+                    <has_text_matching expression="FBgn0000003\t0\t0\t0\t0\t0\t0\t0"/>
+                    <has_n_columns n="8"/>
+                </assert_contents>
+            </output>
+            <output name="deseq_out">
+                <assert_contents>
+                    <!-- Multi-factor design ~sequencing + treatment controls for sequencing type -->
+                    <!-- Effect estimates differ from single-factor design due to covariate adjustment -->
+                    <has_text_matching expression="FBgn0003360\t1933\.950.*\t-2\.88.*\t0\.111.*\t-25\.8.*\t.*e-14[0-9]\t.*e-14[0-9]"/>
+                    <has_n_lines n="3999"/>
+                </assert_contents>
+            </output>
+        </test>
     </tests>
     <help><![CDATA[
 .. class:: infomark
--- a/get_deseq_dataset.R	Tue Jul 18 14:58:28 2023 +0000
+++ b/get_deseq_dataset.R	Tue Dec 09 17:43:58 2025 +0000
@@ -1,80 +1,77 @@
 get_deseq_dataset <- function(sample_table, header, design_formula, tximport, txtype, tx2gene) {
-
-  dir <- ""
+    dir <- ""

-  has_header <- !is.null(header)
-  use_txi <- !is.null(tximport)
-  if (use_txi) {
-    if (is.null(tx2gene)) stop("A transcript-to-gene map or a GTF/GFF3 file is required for tximport")
-    if (tolower(file_ext(tx2gene)) == "gff") {
-      gff_file <- tx2gene
-    } else {
-      gff_file <- NULL
-      tx2gene <- read.table(tx2gene, header = has_header)
+    has_header <- !is.null(header)
+    use_txi <- !is.null(tximport)
+    if (use_txi) {
+        if (is.null(tx2gene)) stop("A transcript-to-gene map or a GTF/GFF3 file is required for tximport")
+        if (tolower(file_ext(tx2gene)) == "gff") {
+            gff_file <- tx2gene
+        } else {
+            gff_file <- NULL
+            tx2gene <- read.table(tx2gene, header = has_header)
+        }
     }
-  }

-  if (!use_txi && has_header) {
-      countfiles <- lapply(as.character(sample_table$filename), read.delim, row.names = 1)
-      tbl <- do.call("cbind", countfiles)
-      colnames(tbl) <- rownames(sample_table) # take sample ids from header
+    if (!use_txi && has_header) {
+        countfiles <- lapply(as.character(sample_table$filename), read.delim, row.names = 1)
+        tbl <- do.call("cbind", countfiles)
+        colnames(tbl) <- rownames(sample_table) # take sample ids from header

-      # check for htseq report lines (from DESeqDataSetFromHTSeqCount function)
-      old_special_names <- c(
-        "no_feature",
-        "ambiguous",
-        "too_low_aQual",
-        "not_aligned",
-        "alignment_not_unique"
-      )
-      special_rows <- (substr(rownames(tbl), 1, 1) == "_") | rownames(tbl) %in% old_special_names
-      tbl <- tbl[!special_rows, , drop = FALSE]
-
-      dds <- DESeqDataSetFromMatrix(
-        countData = tbl,
-        colData = subset(sample_table, select = -filename),
-        design = design_formula
-      )
-  } else if (!use_txi && !has_header) {
+        # check for htseq report lines (from DESeqDataSetFromHTSeqCount function)
+        old_special_names <- c(
+            "no_feature",
+            "ambiguous",
+            "too_low_aQual",
+            "not_aligned",
+            "alignment_not_unique"
+        )
+        special_rows <- (substr(rownames(tbl), 1, 1) == "_") | rownames(tbl) %in% old_special_names
+        tbl <- tbl[!special_rows, , drop = FALSE]

-    # construct the object from HTSeq files
-    dds <- DESeqDataSetFromHTSeqCount(
-      sampleTable = sample_table,
-      directory = dir,
-      design = design_formula
-    )
-    colnames(dds) <- row.names(sample_table)
-
-  } else {
-      # construct the object using tximport
-      library("tximport")
-      txi_files <- as.character(sample_table$filename)
-      labs <- row.names(sample_table)
-      names(txi_files) <- labs
-      if (!is.null(gff_file)) {
-        # first need to make the tx2gene table
-        # this takes ~2-3 minutes using Bioconductor functions
-        suppressPackageStartupMessages({
-          library("GenomicFeatures")
-        })
-        txdb <- makeTxDbFromGFF(gff_file)
-        k <- keys(txdb, keytype = "TXNAME")
-        tx2gene <- select(txdb, keys = k, columns = "GENEID", keytype = "TXNAME")
-        # Remove 'transcript:' from transcript IDs (when gff_file is a GFF3 from Ensembl and the transcript does not have a Name)
-        tx2gene$TXNAME <- sub("^transcript:", "", tx2gene$TXNAME)  # nolint
-      }
-      try(txi <- tximport(txi_files, type = txtype, tx2gene = tx2gene))
-      if (!exists("txi")) {
-        # Remove version from transcript IDs in tx2gene...
-        tx2gene$TXNAME <- sub("\\.[0-9]+$", "", tx2gene$TXNAME)  # nolint
-        # ...and in txi_files
-        txi <- tximport(txi_files, type = txtype, tx2gene = tx2gene, ignoreTxVersion = TRUE)
-      }
-      dds <- DESeqDataSetFromTximport(
-        txi,
-        subset(sample_table, select = -c(filename)),
-        design_formula
-      )
-  }
-  return(dds)
+        dds <- DESeqDataSetFromMatrix(
+            countData = tbl,
+            colData = subset(sample_table, select = -filename),
+            design = design_formula
+        )
+    } else if (!use_txi && !has_header) {
+        # construct the object from HTSeq files
+        dds <- DESeqDataSetFromHTSeqCount(
+            sampleTable = sample_table,
+            directory = dir,
+            design = design_formula
+        )
+        colnames(dds) <- row.names(sample_table)
+    } else {
+        # construct the object using tximport
+        library("tximport")
+        txi_files <- as.character(sample_table$filename)
+        labs <- row.names(sample_table)
+        names(txi_files) <- labs
+        if (!is.null(gff_file)) {
+            # first need to make the tx2gene table
+            # this takes ~2-3 minutes using Bioconductor functions
+            suppressPackageStartupMessages({
+                library("GenomicFeatures")
+            })
+            txdb <- makeTxDbFromGFF(gff_file)
+            k <- keys(txdb, keytype = "TXNAME")
+            tx2gene <- select(txdb, keys = k, columns = "GENEID", keytype = "TXNAME")
+            # Remove 'transcript:' from transcript IDs (when gff_file is a GFF3 from Ensembl and the transcript does not have a Name)
+            tx2gene$TXNAME <- sub("^transcript:", "", tx2gene$TXNAME) # nolint
+        }
+        try(txi <- tximport(txi_files, type = txtype, tx2gene = tx2gene))
+        if (!exists("txi")) {
+            # Remove version from transcript IDs in tx2gene...
+            tx2gene$TXNAME <- sub("\\.[0-9]+$", "", tx2gene$TXNAME) # nolint
+            # ...and in txi_files
+            txi <- tximport(txi_files, type = txtype, tx2gene = tx2gene, ignoreTxVersion = TRUE)
+        }
+        dds <- DESeqDataSetFromTximport(
+            txi,
+            subset(sample_table, select = -c(filename)),
+            design_formula
+        )
+    }
+    return(dds)
 }
--- a/macros.xml	Tue Jul 18 14:58:28 2023 +0000
+++ b/macros.xml	Tue Dec 09 17:43:58 2025 +0000
@@ -34,7 +34,7 @@
     </xml>
     <token name="@TOOL_VERSION@">2.11.40.8</token>
     <token name="@DESEQ2_VERSION@">1.40.2</token>
-    <token name="@VERSION_SUFFIX@">0</token>
+    <token name="@VERSION_SUFFIX@">1</token>
     <token name="@PROFILE@">22.01</token>
     <xml name="edam_ontology">
         <edam_topics>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/sample_sheet.tab	Tue Dec 09 17:43:58 2025 +0000
@@ -0,0 +1,8 @@
+sample	treatment	sequencing
+GSM461179_treat_single.counts	Treated	single
+GSM461180_treat_paired.counts	Treated	paired
+GSM461181_treat_paired.counts	Treated	paired
+GSM461176_untreat_single.counts	Untreated	single
+GSM461177_untreat_paired.counts	Untreated	paired
+GSM461178_untreat_paired.counts	Untreated	paired
+GSM461182_untreat_single.counts	Untreated	single