Mercurial > repos > bgruening > deseq2

--- a/deseq2.R	Mon Sep 02 10:09:37 2013 -0400
+++ b/deseq2.R	Wed Sep 04 11:58:20 2013 -0400
@@ -1,3 +1,8 @@
+## Setup R error handling to go to stderr
+options( show.error.messages=F, error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } )
+# we need that to not crash galaxy with an UTF8 error on German LC settings.
+Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")
+
 library('getopt');
 options(stringAsfactors = FALSE, useFancyQuotes = FALSE)
 args <- commandArgs(trailingOnly = TRUE)
@@ -8,9 +13,10 @@
   'verbose', 'v', 2, "integer",
   'help' , 'h', 0, "logical",
   'outputfile' , 'o', 1, "character",
-  'plots' , 'p', 1, "character",
+  'plots' , 'p', 2, "character",
   'input' , 'i', 1, "character",
-  'samples', 's', 1, "character"
+  'samples', 's', 1, "character",
+  'factors', 'f', 2, "character"
 ), byrow=TRUE, ncol=4);
 opt = getopt(spec);

@@ -23,99 +29,131 @@
 }

 trim <- function (x) gsub("^\\s+|\\s+$", "", x)
-opt$samples<-trim(opt$samples)
-
+opt$samples <- trim(opt$samples)
+opt$factors <- trim(opt$factors)

 htseqCountTable = read.table(opt$input, sep="\t", comment="", as.is=T)

-l<-unique(c(htseqCountTable[1,-1]))
-#print(l)
-
-colnames(htseqCountTable) <- htseqCountTable[1,]
+colnames(htseqCountTable) <- htseqCountTable[2,]
 names(htseqCountTable)<-colnames(htseqCountTable)
+conditions <- htseqCountTable[1,]
+conditions <- unlist(conditions[,-1])
 tagnames <- htseqCountTable[-(1:2), 1]
 htseqCountTable <- htseqCountTable[-(1:2),-1]
 htseqCountTable[,1] <- gsub(",", ".", htseqCountTable[,1])

+l <- unique(c(conditions))

 library( "DESeq2" )

-opt$samples
-
-print('----------------')
-
-pdf(opt$plots)
-
+if ( !is.null(opt$plots) ) {
+    pdf(opt$plots)
+}
 if (opt$samples=="all_vs_all"){
-
+  # all versus all
   for (i in seq(1, length(l), by=1)) {
     k=i+1
     if(k<=length(l)){
       for (j in seq(k, length(l), by=1)) {
-
-        colData<-names(htseqCountTable[, which(gsub("[.][0-9]+","",names(htseqCountTable)) %in% c(l[[i]], l[[j]]))])
-        currentPairTable<-htseqCountTable[ , which(gsub("[.][0-9]+","",names(htseqCountTable)) %in% c(l[[i]], l[[j]]))]
-        #      rownames(currentPairTable)<-tagnames
-
-        write.table(currentPairTable, file=paste(l[[i]],"_", l[[j]],".txt"))
-        currentPairTable<-read.table(paste(l[[i]],"_", l[[j]],".txt"),row.names=1,header=T)
-
-        currentPairTable <- as.data.frame(currentPairTable,stringsAsFactors=F)
-        #print(currentPairTable)
-        pdata = data.frame(condition=factor(c(rep(l[[i]], 2), rep(l[[j]], 2))),row.names=colnames(currentPairTable))
+        currentColmuns <- which(conditions %in% c(l[[i]], l[[j]]))
+            sampleNames <- names(htseqCountTable[currentColmuns])
+
+        currentPairTable <- htseqCountTable[currentColmuns]
+        ncondition1cols <- length(which(conditions %in% c(l[[i]])))
+        ncondition2cols <- length(which(conditions %in% c(l[[i]])))
+
+        ntabcols <- length(currentPairTable)
+            currentPairTable <- as.integer(unlist(currentPairTable))
+        currentPairTable <- matrix(unlist(currentPairTable), ncol = ntabcols, byrow = FALSE)
+            colnames(currentPairTable) <- sampleNames
+
+        pdata = data.frame(condition=factor(conditions[currentColmuns]),row.names=colnames(currentPairTable))
+
         dds = DESeqDataSetFromMatrix(countData = currentPairTable,  colData = pdata, design = ~ condition)
-
-        #      colData(dds)$condition
+
         dds <- DESeq(dds)
         sizeFactors(dds)
-        plotDispEsts(dds)
-        plotMA(dds)
-
+        if ( !is.null(opt$plots) ) {
+            plotDispEsts(dds)
+            plotMA(dds)
+        }
         res <- results(dds)
-        rownames(res)<-tagnames
-        resSorted <- res[order(res$padj),];
-        write.csv( as.data.frame(resSorted), file = opt$outputfile )
+        resCols <- colnames(res)
+
+        cond <- c(rep(paste(unique(conditions[currentColmuns]),collapse="_"), nrow(res)))
+        res[, "condition"] <- cond
+        res[, "geneIds"] <- tagnames
+
+        resSorted <- res[order(res$padj),]
+        write.table(as.data.frame(resSorted[,c("condition", "geneIds", resCols)]), file = opt$outputfile, sep="\t", quote = FALSE, append=TRUE, row.names = FALSE, col.names = FALSE)
       }
     }
   }
 }else{

-  sampleSets<-unlist(strsplit(opt$samples, " "))
-  sampleSets
-  samplesControl <- {}
-  samplesExperiment <- {}
-  sampleNames <- {}
+    sampleSets <- unlist(strsplit(opt$samples, " "))
+    samplesControl <- {}
+    samplesExperiment <- {}
+    samplesControl <- unlist(strsplit(sampleSets[1], ","))
+    samplesExperiment <- unlist(strsplit(sampleSets[2], ","))

-  samplesControl<-unlist(strsplit(sampleSets[1], ","))
-  samplesExperiment<-unlist(strsplit(sampleSets[2], ","))
+    # the minus one is needed because we get column indizes including the first column
+    sampleColumns <- c()
+    for (i in samplesControl) sampleColumns[[length(sampleColumns) + 1]] <- as.integer(i) - 1
+    for (i in samplesExperiment) sampleColumns[[length(sampleColumns) + 1]] <- as.integer(i) - 1
+    htseqSubCountTable <- htseqCountTable[,sampleColumns]

-  print(samplesControl)
-  print(samplesExperiment)
+    ntabcols <- length(htseqSubCountTable)
+    htseqSubCountTable <- as.integer(unlist(htseqSubCountTable))
+    htseqSubCountTable <- matrix(unlist(htseqSubCountTable), ncol = ntabcols, byrow = FALSE)
+    colnames(htseqSubCountTable) <- names(htseqCountTable)[sampleColumns]
+
+    pdata = data.frame(condition=factor(conditions[sampleColumns]), row.names=colnames(htseqSubCountTable))

-  # the minus one is needed because we get column indizes including the first column
-  samplecolumns <- c()
-  for (i in samplesControl) samplecolumns[[length(samplecolumns) + 1]] <- as.integer(i) - 1
-  for (i in samplesExperiment) samplecolumns[[length(samplecolumns) + 1]] <- as.integer(i) - 1
-
-print(samplecolumns)
-
-  htseqSubCountTable <- htseqCountTable[,samplecolumns]
-
-  #TODO: pavan hack :)
-  #write.table(htseqSubCountTable, file=paste(gsub(" ","_",opt$samples),".txt"))
-  #htseqSubCountTable<-read.table(paste(gsub(" ","_",opt$samples),".txt"),row.names=1,header=T)
+    if(!is.null(opt$factors)){
+        factorSets <- unlist(strsplit(opt$factors, " "))
+        for (factorSet in factorSets) {
+            currentFactor <- unlist(strsplit(factorSet, ":"))
+            for (factor in currentFactor[2]) {
+                factoredCols <- unlist(strsplit(factor, ","))
+                factoredCols <- as.numeric(factoredCols)
+                factors <- c()
+                for (i in sampleColumns){
+                    j=i+1
+                    if(j %in% factoredCols)
+                        factors[[length(factors) + 1]] <- "yes"
+                    else
+                        factors[[length(factors) + 1]] <- "no"
+                }
+                # only add factor if factor is applied to the selected samples
+                if((length(unique(factors))) >= 2){
+                        pdata[[currentFactor[1]]]<-factor(factors)
+                }else{
+                    write("Input-Error 01: You can only apply factors to selected samples.", stderr())
+                }
+            }
+        }
+    }

-  pdata = data.frame(condition=factor(c(names( htseqSubCountTable )[ samplecolumns ])),row.names=colnames(htseqSubCountTable))
-  dds = DESeqDataSetFromMatrix(countData = bla,  colData = pdata, design = ~ condition)
-
-  dds <- DESeq(dds)
-  sizeFactors(dds)
-  plotDispEsts(dds)
-  plotMA(dds)
-
-  res <- results(dds)
-  rownames(res)<-tagnames
-  resSorted <- res[order(res$padj),];
-  write.csv(as.data.frame(resSorted), file = opt$outputfile)
+    form <- as.formula(paste("", paste(names(pdata), collapse=" + "), sep=" ~ "))
+    dds = DESeqDataSetFromMatrix(countData = htseqSubCountTable,  colData = pdata, design = form)
+
+    dds <- DESeq(dds)
+    sizeFactors(dds)
+    if ( !is.null(opt$plots) ) {
+        plotDispEsts(dds)
+        plotMA(dds)
+    }
+
+    res <- results(dds)
+    resCols <- colnames(res)
+
+    cond <- c(rep(paste(unique(conditions[sampleColumns]),collapse="_"), nrow(res)))
+    res[, "condition"] <- cond
+    res[, "geneIds"] <- tagnames
+
+    resSorted <- res[order(res$padj),]
+    write.table(as.data.frame(resSorted[,c("condition", "geneIds", resCols)]), file = opt$outputfile, sep="\t", quote = FALSE, row.names = FALSE, col.names = FALSE)
 }
 dev.off()
+
--- a/deseq2.xml	Mon Sep 02 10:09:37 2013 -0400
+++ b/deseq2.xml	Wed Sep 04 11:58:20 2013 -0400
@@ -9,17 +9,42 @@
     <command interpreter="Rscript">
         deseq2.R
             -o "$deseq_out"
-            -p "$plots"
+
+            #if $pdf:
+                -p "$plots"
+            #end if
+
             -i "$input_matrix"

         #if $filter_sel.filter_sel_opts == 'all_vs_all':
             -s 'all_vs_all'
         #else:
-            -s ## build a string like '1:2 5:6'
+            -s ## build a string like '1,2 5,6'
             "${filter_sel.control_cols} ${filter_sel.experiement_cols}"
+
+            #set $temp_factor_list = list()
+            #set $is_multi_factor_analysis = False
+            #for $factor in $filter_sel.factor:
+                #set $is_multi_factor_analysis = True
+                $temp_factor_list.append( '%s:%s' % ($factor.factor_name.replace(' ','_'), $factor.factor_index) )
+            #end for
+
+            #if $is_multi_factor_analysis:
+                -f "#echo ' '.join( $temp_factor_list )#"
+            #end if
         #end if

     </command>
+    <stdio>
+        <regex match="Execution halted"
+           source="both"
+           level="fatal"
+           description="Execution halted" />
+        <regex match="Input-Error 01"
+           source="both"
+           level="fatal"
+           description="Error in your input parameters: Make sure you only apply factors to selected samples." />
+    </stdio>
     <inputs>
         <param format="tabular" name="input_matrix" type="data" label="Countmatrix" help="You can create a count matrix with the tool ..."/>

@@ -40,16 +65,29 @@
                     <validator type="no_options" message="Please select at least one column."/>
                 </param>

+                <repeat name="factor" title="Include multi-factor analysis">
+                    <param name="factor_name" type="text" value="Factor Name" label="Specify a factor name" help=""/>
+
+                    <param name="factor_index" label="Select columns that are associated with a factor." type="data_column" data_ref="input_matrix"
+                        numerical="True" multiple="true" use_header_names="true" size="120" display="checkboxes">
+                        <validator type="no_options" message="Please select at least one column."/>
+                    </param>
+                </repeat>
+
             </when>
             <when value="all_v_all" />
         </conditional>

+        <param name="pdf" type="boolean" truevalue="" falsevalue="" checked="true" label="Visualising the analysis results"
+            help="output an additional PDF file" />
+
     </inputs>

   <outputs>
-    <data format="txt" name="deseq_out" label="DESeq2 result file"/>
-    <data format="pdf" name="plots" label="Plot collection"/>
-    <data format="txt" name="log" label="DESeq2 log file"/>
+    <data format="tabular" name="deseq_out" label="DESeq2 result file on ${on_string}"/>
+    <data format="pdf" name="plots" label="DESeq2 plots on ${on_string}">
+        <filter>pdf == True</filter>
+    </data>
   </outputs>

   <help>
@@ -58,47 +96,49 @@

 **What it does**

-`DESeq` is a tool for differential expression testing of RNA-Seq data.
+Estimate variance-mean dependence in count data from high-throughput sequencing assays and test for differential expression based on a model using the negative binomial distribution


 **Inputs**

-`DESeq` requires three input files to run:
+DESeq2_ requires one count matrix as input file. You can use the tool

-1. Annotation file in GFF3, containing the necessary information about the transcripts that are to be quantified.
-2. The BAM alignment files grouped into replicate groups, each containing several replicates. BAM files store the read alignments in a compressed format. They can be generated using the `SAM-to-BAM` tool in the NGS: SAM Tools section. (The script will also work with only two groups containing only a single replicate each. However, this analysis has less statistical power and is  therefor not recommended.)
+

 **Output**

-`DESeq` generates a text file containing the gene name and the p-value.
+DESeq2_ generates a tabular file containing the different columns and optional visualized results as PDF.
+
+====== ==========================================================
+Column Description
+------ ----------------------------------------------------------
+     1 Sample ID (corresponds to the header in your count matrix)
+     2 Gene Identifiers
+     3 mean normalised counts, averaged over all samples from both conditions
+     4 the logarithm (to basis 2) of the fold change
+     5 standard error estimate for the log2 fold change estimate
+     6 p value for the statistical significance of this change
+     7 p value adjusted for multiple testing with the Benjamini-Hochberg procedure
+       which controls false discovery rate (FDR)
+====== ==========================================================
+

 ------

-**Licenses**
+**References**
+
+DESeq2_ Authors: Michael Love (MPIMG Berlin), Simon Anders, Wolfgang Huber (EMBL Heidelberg)

-If **DESeq** is used to obtain results for scientific publications it
-should be cited as [1]_.
+If _DESeq2_ is used to obtain results for scientific publications it
+should be cited as [1]_. A paper describing DESeq2_ is in preparation.

-**References**
+

 .. [1] Anders, S and Huber, W (2010): `Differential expression analysis for sequence count data`_.

 .. _Differential expression analysis for sequence count data: http://dx.doi.org/10.1186/gb-2010-11-10-r106
+.. _DESeq2: http://master.bioconductor.org/packages/release/bioc/html/DESeq2.html

-------
-
-For more information see http://www.sequenceontology.org/gff3.shtml

-**SAM/BAM format** The Sequence Alignment/Map (SAM) format is a
-tab-limited text format that stores large nucleotide sequence
-alignments. BAM is the binary version of a SAM file that allows for
-fast and intensive data processing. The format specification and the
-description of SAMtools can be found on
-http://samtools.sourceforge.net/.
-
-------
-
-DESeq-hts Wrapper Version 0.3 (Feb 2012)
-
-</help>
+  </help>
 </tool>