changeset 22:aad8927093ac draft

Uploaded
author bgruening
date Sat, 15 Nov 2014 05:42:24 -0500
parents d32de046ba31
children f8a59b35c397
files deseq2.R deseq2.xml deseq_helper.py foldchanges_heatmap.r foldchanges_heatmap.xml helper.py test-data/countmatrix.txt tool_dependencies.xml
diffstat 8 files changed, 222 insertions(+), 493 deletions(-) [+]
line wrap: on
line diff
--- a/deseq2.R	Wed Feb 19 12:43:03 2014 -0500
+++ b/deseq2.R	Sat Nov 15 05:42:24 2014 -0500
@@ -3,186 +3,142 @@
 # we need that to not crash galaxy with an UTF8 error on German LC settings.
 Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")
 
-library('getopt');
+library('getopt')
+library('rjson')
+library('DESeq2')
 options(stringAsfactors = FALSE, useFancyQuotes = FALSE)
 args <- commandArgs(trailingOnly = TRUE)
 
 #get options, using the spec as defined by the enclosed list.
 #we read the options from the default: commandArgs(TRUE).
 spec = matrix(c(
-	'verbose', 'v', 2, "integer",
-	'help' , 'h', 0, "logical",
-	'outfile' , 'o', 1, "character",
-	'outfilefiltered' , 'f', 1, "character",
-	'plots' , 'p', 2, "character",
-	'input' , 'i', 1, "character",
-	'factors', 'm', 2, "character",
-	'fittype', 't', 2, "character",
-	'threshold', 'c', 2, "double"
-#	'organism', 'g', 2, "character"
+    'verbose', 'v', 2, "integer",
+    'help' , 'h', 0, "logical",
+    'outfile' , 'o', 1, "character",
+    'outfilefiltered' , 'f', 1, "character",
+    'plots' , 'p', 2, "character",
+    'factors', 'm', 2, "character",
+    'filtermode', 'l', 2, "character",
+    'threshold', 'c', 2, "double",
+    'organism', 'g', 2, "character"
 ), byrow=TRUE, ncol=4);
 opt = getopt(spec);
 
+
 # if help was asked for print a friendly message
 # and exit with a non-zero error code
 if ( !is.null(opt$help) ) {
-	cat(getopt(spec, usage=TRUE));
-	q(status=1);
+    cat(getopt(spec, usage=TRUE));
+    q(status=1);
 }
 
-if( is.null(opt$fittype))
-	opt$fittype = "parametric"
+if( is.null(opt$filtermode))
+    opt$filtermode = "absolute"
 
 trim <- function (x) gsub("^\\s+|\\s+$", "", x)
 opt$samples <- trim(opt$samples)
 opt$factors <- trim(opt$factors)
 
-htseqCountTable = read.table(opt$input, sep="\t", comment="", as.is=T)
 
-colnames(htseqCountTable) <- htseqCountTable[2,]
-names(htseqCountTable)<-colnames(htseqCountTable)
-conditions <- htseqCountTable[1,]
-conditions <- unlist(conditions[,-1])
-featurenames <- htseqCountTable[-(1:2), 1]
-htseqCountTable <- htseqCountTable[-(1:2),-1]
-htseqCountTable[,1] <- gsub(",", ".", htseqCountTable[,1])
-
-l <- unique(c(conditions))
-
-library('rjson')
-library('DESeq2')
-#library('biomaRt')
-
-if ( !is.null(opt$plots) ) {
-	pdf(opt$plots)
-}
+generatePlots <-function(dds, title_prefix){
+    plotDispEsts(dds, main= paste(title_prefix, "Dispersion estimate plot"))
+    plotMA(dds, main= paste(title_prefix, "MA-plot"))
 
-## The following function takes deseq data object, computes, writes and plots the results.
-computeAndWriteResults <- function(dds, sampleCols, outputcsv, featurenames_filtered) {
-	dds <- DESeq(dds, fitType= opt$fittype)
-	sizeFactors(dds)
-	res <- results(dds)
-	resCols <- colnames(res)
-	cond <- c(rep(paste(unique(conditions[sampleCols]),collapse="_"), nrow(res)))
-	if(missing(featurenames_filtered)){
-		res[, "geneIds"] <- featurenames
-#		rownames(res) <- featurenames
-		title_prefix = "Complete: "
-	}else{
-		res[, "geneIds"] <- featurenames_filtered
-#		rownames(res) <- featurenames
-		title_prefix = "Filtered: "
-	}
-	print(sum(res$padj < .1, na.rm=TRUE))
-	print(opt$organism)
-#	if(opt$organism != "other"){
-#		dataset = ""
-#		if(opt$organism == "mouse")
-#			dataset = "mmusculus_gene_ensembl"
-#		if(opt$organism == "human")
-#			dataset = "hsapiens_gene_ensembl"
-#		if(opt$organism == "fly")
-#			dataset = "dmelanogaster_gene_ensembl"
-#		ensembldb = useMart("ensembl",dataset=dataset)
-#
-#		annot <- getBM(attributes = c("ensembl_gene_id", "external_gene_id","description"),
-#					filters = "ensembl_gene_id",
-#					values=res[, "geneIds"],
-#					mart=ensembldb)
+    library("RColorBrewer")
+    library("gplots")
+    select <- order(rowMeans(counts(dds,normalized=TRUE)),decreasing=TRUE)[1:30]
+    hmcol <- colorRampPalette(brewer.pal(9, "GnBu"))(100)
 
-#		res <- merge(res, annot,
-#					by.x = "geneIds",
-#					by.y = "ensembl_gene_id",
-#					all.x=TRUE)
-		
-#		resCols <- colnames(res)
-#		resSorted <- res[order(res$padj),]
-#		write.table(as.data.frame(resSorted[,c(resCols)]), file = outputcsv, sep="\t", quote = FALSE, append=TRUE, row.names = FALSE, col.names = FALSE)					
-#	}else{
-		resSorted <- res[order(res$padj),]
-		write.table(as.data.frame(resSorted[,c("geneIds", resCols)]), file = outputcsv, sep="\t", quote = FALSE, append=TRUE, row.names = FALSE, col.names = FALSE)
-#	}
-	
-	if ( !is.null(opt$plots) ) {
-		plotDispEsts(dds, main= paste(title_prefix, "Dispersion estimate plot"))
-		plotMA(dds, main= paste(title_prefix, "MA-plot"))
-
-		library("RColorBrewer")
-		library("gplots")
-		select <- order(rowMeans(counts(dds,normalized=TRUE)),decreasing=TRUE)[1:30]
-		hmcol <- colorRampPalette(brewer.pal(9, "GnBu"))(100)
-
-		rld <- rlogTransformation(dds, blind=TRUE)
-		vsd <- varianceStabilizingTransformation(dds, blind=TRUE)
-		distsRL <- dist(t(assay(rld)))
-		mat <- as.matrix(distsRL)
-		heatmap.2(mat, trace="none", col = rev(hmcol), main = paste(title_prefix, "Sample-to-sample distances"))
-	}
-	return(res)
-}
-
-## This functions filters out the genes with low counts and plots p-value distribution
-independentFilter <- function(deseqRes, countTable, sampleColumns, designFormula){
-#	use = (deseqRes$baseMean > quantile(deseqRes$baseMean, probs = opt$threshold) & (!is.na(deseqRes$pvalue)))
-	use = (deseqRes$baseMean > opt$threshold & !is.na(deseqRes$pvalue))
-	ddsFilt <- DESeqDataSetFromMatrix(countData = countTable[use, ], colData = sampleTable, design = designFormula)
-	computeAndWriteResults(ddsFilt, sampleColumns, opt$outfilefiltered, featurenames[use])
-	
-	## p-value distribution
-	h1 <- hist(deseqRes$pvalue[!use], breaks=50, plot=FALSE)
-	h2 <- hist(deseqRes$pvalue[use], breaks=50, plot=FALSE)
-	colori <- c(`filtered out`="khaki", `complete`="powderblue")
-	barplot(height = rbind(h1$counts, h2$counts), beside = FALSE,
-			col = colori, space = 0, main = "Distribution of p-values", ylab="frequency")
-	text(x = c(0, length(h1$counts)), y = 0, label = paste(c(0,1)), adj = c(0.5,1.7), xpd=NA)
-	legend("topright", fill=rev(colori), legend=rev(names(colori)))	
+    rld <- rlogTransformation(dds, blind=TRUE)
+    vsd <- varianceStabilizingTransformation(dds, blind=TRUE)
+    distsRL <- dist(t(assay(rld)))
+    mat <- as.matrix(distsRL)
+    heatmap.2(mat, trace="none", col = rev(hmcol), main = paste(title_prefix, "Sample-to-sample distances"), margin=c(13,13))
 }
 
 parser <- newJSONParser()
 parser$addData( opt$factors )
 factorsList <- parser$getObject()
 
-sampleColumns <- c()
-
-for (j in 2:length(factorsList[[1]])){
-    for (k in 1:length(names(factorsList[[1]][[j]]))){
-        for (l in 1:length(factorsList[[1]][[j]][[k]])){
-            sampleColumns[[length(sampleColumns) + 1]] <- as.integer(factorsList[[1]][[j]][[k]][l]) - 1
+sampleTable<-data.frame()
+factorNames<-c()
+for(factor in factorsList){
+    factorName<-factor[[1]]
+    factorNames<-append(factorNames, factorName)
+    factorValuesMapList<-factor[[2]]
+    c = length(factorValuesMapList)
+    for (factorValuesMap in factorValuesMapList){
+        for(files in factorValuesMap){
+            for(file in files){
+                sampleTable[basename(file),"sampleName"]<-file
+                sampleTable[basename(file),"fileName"]<-file
+                sampleTable[basename(file),factorName]<-paste(c,names(factorValuesMap),sep="_")
+            }
         }
+        c = c-1
     }
 }
 
-sampleColumns<-sort(unique(sampleColumns))
-htseqSubCountTable <- htseqCountTable[,sampleColumns]
-ntabcols <- length(htseqSubCountTable)
-htseqSubCountTable <- as.integer(unlist(htseqSubCountTable))
-htseqSubCountTable <- matrix(unlist(htseqSubCountTable), ncol = ntabcols, byrow = FALSE)
-colnames(htseqSubCountTable) <- names(htseqCountTable)[sampleColumns]
-sampleTable = data.frame(row.names=colnames(htseqSubCountTable))
- 
-for(i in 1:length(factorsList)){
-    factorName<-factorsList[[i]][[1]]
-    for (j in 2:length(factorsList[[i]])){
-        effected_cols <- c()
-        for (k in 1:length(names(factorsList[[i]][[j]]))){
-            for (l in 1:length(factorsList[[i]][[j]][[k]])){
-                if(colnames(htseqCountTable)[factorsList[[i]][[j]][[k]][l]-1] %in% rownames(sampleTable)){
-                    sampleTable[colnames(htseqCountTable)[factorsList[[i]][[j]][[k]][l]-1],factorName] <- names(factorsList[[i]][[j]])[k]
-                }
-            }
-        }
-    }
+sampleTable
+factorNames<-rev(factorNames)
+factorNames
+designFormula <- as.formula(paste("", paste(factorNames, collapse=" + "), sep=" ~ "))
+
+designFormula
+
+ddsHTSeq = DESeqDataSetFromHTSeqCount(sampleTable = sampleTable,
+                                      directory = "",
+                                      design =  designFormula)
+
+ddsHTSeq <- DESeq(ddsHTSeq)
+ddsHTSeq <- nbinomWaldTest(ddsHTSeq, cooksCutoff=FALSE)
+ddsHTSeq
+deseqRes <- results(ddsHTSeq)
+mcols(deseqRes)$description
+
+resSorted <- deseqRes[order(deseqRes$padj),]
+head(resSorted)
+write.table(as.data.frame(resSorted), file = opt$outfile, sep="\t", quote = FALSE, append=TRUE, col.names = FALSE)
+
+
+if(opt$filtermode == "absolute"){
+    use = (deseqRes$baseMean > opt$threshold & !is.na(deseqRes$pvalue))
+} else if(opt$filtermode == "quantile"){
+    opt$threshold = opt$threshold/100
+    print(paste("Threshold:", opt$threshold))
+    print(quantile(unique(deseqRes$baseMean), probs = opt$threshold))
+    use = (deseqRes$baseMean > quantile(unique(deseqRes$baseMean), probs = opt$threshold) & (!is.na(deseqRes$pvalue)))
+} else{
+    print(paste("Unknown filtermode:", opt$filtermode))
+    q("no", 1, FALSE)
 }
-sampleTable[is.na(sampleTable)] <- "undefined"
-sampleTable
+
+ddsHTSeqFilt <- ddsHTSeq[use, ]
+ddsHTSeqFilt <- DESeq(ddsHTSeqFilt)
+ddsHTSeqFilt <- nbinomWaldTest(ddsHTSeqFilt, cooksCutoff=FALSE)
+ddsHTSeqFilt
+deseqResFilt <- results(ddsHTSeqFilt)
+mcols(deseqResFilt)$description
+
+resFiltSorted <- deseqResFilt[order(deseqResFilt$padj),]
+head(resFiltSorted)
+write.table(as.data.frame(resFiltSorted), file = opt$outfilefiltered, sep="\t", quote = FALSE, append=TRUE, col.names = FALSE)
 
-factorNames <- c(names(sampleTable)[-1], names(sampleTable)[1])
-designFormula <- as.formula(paste("", paste(factorNames, collapse=" + "), sep=" ~ "))
-dds = DESeqDataSetFromMatrix(countData = htseqSubCountTable,  colData = sampleTable, design = designFormula)
-deseqres <- computeAndWriteResults(dds, sampleColumns, opt$outfile) 
+if(!is.null(opt$plots)){
+    pdf(opt$plots)
+    generatePlots(ddsHTSeq, "Complete: ")
+    generatePlots(ddsHTSeqFilt, "Filtered: ")
+    ## p-value distribution
+    h1 <- hist(deseqRes$pvalue[!use], breaks=50, plot=FALSE)
+    h2 <- hist(deseqRes$pvalue[use], breaks=50, plot=FALSE)
+    colori <- c(`filtered out`="khaki", `complete`="powderblue")
+    barplot(height = rbind(h1$counts, h2$counts), beside = FALSE,
+            col = colori, space = 0, main = "Distribution of p-values", ylab="frequency")
+    text(x = c(0, length(h1$counts)), y = 0, label = paste(c(0,1)), adj = c(0.5,1.7), xpd=NA)
+    legend("topright", fill=rev(colori), legend=rev(names(colori)))
+    dev.off()
+}
 
-## independent filtering
-independentFilter(deseqres, htseqSubCountTable, sampleColumns, designFormula)
+sessionInfo()
 
-dev.off()
-sessionInfo()
+
--- a/deseq2.xml	Wed Feb 19 12:43:03 2014 -0500
+++ b/deseq2.xml	Sat Nov 15 05:42:24 2014 -0500
@@ -1,12 +1,11 @@
-<tool id="deseq2" name="DESeq2" version="2.0.1">
-    <description>Determines differentially expressed features from count data</description>
+<tool id="iuc_deseq2" name="DESeq2" version="2.1.6.0">
+    <description>Determines differentially expressed features from count tables</description>
     <requirements>
         <requirement type="binary">Rscript</requirement>
         <requirement type="R-module">DESeq2</requirement>
-        <requirement type="package" version="3.0.2">R_3_0_2</requirement>
-        <requirement type="package" version="1.2.10">deseq2</requirement>
-        <!--<requirement type="set_environment">DESEQ2_SCRIPT_PATH</requirement>-->
+        <requirement type="package" version="1.6.1">deseq2</requirement>
     </requirements>
+    <code file="helper.py" />
     <command interpreter="Rscript">
         #import json
         deseq2.R
@@ -16,99 +15,79 @@
             #if $pdf:
                 -p "$plots" 
             #end if
-
-            -i "$input_matrix"
-
-            #set $temp_factor_name = list()
+            
+            #set $temp_factor_names = list()
             #for $factor in $rep_factorName:
-                #set $temp_factor = dict()
+
+                #set $temp_factor = list()
                 #for $level in $factor.rep_factorLevel:
-                    ##$temp_factor_list.append( '%s::%s:%s' % ($factor.factorName.replace(' ','_'), $level.factorLevel, $level.factorIndex) )
-                    $temp_factor.update({str($level.factorLevel): map(int, str($level.factorIndex).split(','))})
+                    #set $count_files = list()
+                    #for $file in $level.rep_countsFile:
+                        $count_files.append(str($file.countsFile))
+                    #end for
+                    $temp_factor.append( {str($level.factorLevel): $count_files} )
                 #end for
-                $temp_factor_name.append([str($factor.factorName), $temp_factor])
+                $temp_factor_names.append([str($factor.factorName), $temp_factor])
 
             #end for
 
-                ##-m "#echo ' '.join( $temp_factor_list )#"
-                -m '#echo json.dumps(temp_factor_name)#'
-            ##--organism "$organism"
-            ##-t "$fittype"
-            -c $countthreshold
+            -m '#echo json.dumps(temp_factor_names)#'
+            #if str($filtermode.mode) == "absolute":
+                -c $filtermode.count_abs
+            #elif str($filtermode.mode) == "quantile":
+                -c $filtermode.count_quant
+            #end if
+            --filtermode $filtermode.mode
     </command>
     <stdio>
-        <regex match="Execution halted" 
-           source="both" 
-           level="fatal" 
-           description="Execution halted." />
-        <regex match="Input-Error 01" 
-           source="both" 
-           level="fatal" 
-           description="Error in your input parameters: Make sure you only apply factors to selected samples." />
-        <regex match="Error in" 
-           source="both" 
-           level="fatal" 
-           description="An undefined error occured, please check your intput carefully and contact your administrator." />
+        <exit_code range="1:" />
+        <exit_code range=":-1" />
+        <regex match="Error:" />
+        <regex match="Exception:" />
     </stdio>
     <inputs>
-        <param format="tabular" name="input_matrix" type="data" label="Countmatrix" help="You can create a count matrix with the tool 
-            'Count reads in features with htseq-count'"/>
-
-        <repeat name="rep_factorName" title="Factor/Condition" min="1">
-            <param name="factorName" type="text" value="FactorName" label="Specify a factor name" help="" />
-            <repeat name="rep_factorLevel" title="Factor level" min="1">
-                <param name="factorLevel" type="text" value="FactorLevel" label="Specify a factor level" help="" />
-
-                <param name="factorIndex" label="Select columns that are associated with this factor level" type="data_column" data_ref="input_matrix"
-                    numerical="True" multiple="true" use_header_names="true" size="120" display="checkboxes">
-                    <validator type="no_options" message="Please select at least one column." />
+        <repeat name="rep_factorName" title="Factor" min="1">
+            <param name="factorName" type="text" value="FactorName" label="Specify a factor name" 
+                help="Only letters, numbers and underscores will be retained in this field">
+                <sanitizer>
+                    <valid initial="string.letters,string.digits"><add value="_" /></valid>
+                </sanitizer>
+            </param>
+            <repeat name="rep_factorLevel" title="Factor level" min="2" max="2" default="2">
+                <param name="factorLevel" type="text" value="FactorLevel" label="Specify a factor level"
+                    help="Only letters, numbers and underscores will be retained in this field">
+                    <sanitizer>
+                        <valid initial="string.letters,string.digits"><add value="_" /></valid>
+                    </sanitizer>
                 </param>
+                <repeat name="rep_countsFile" title="Biological replicate" min="1">
+                    <param name="countsFile" type="data" label="Counts file"/>
+                </repeat>
             </repeat>
         </repeat>
-        <!--
-        <param name="control_cols" type="select" display="checkboxes" multiple="true" optional="True" label="Select columns containing first condition" 
-            dynamic_options="get_matrix_header( input_dataset=input_matrix )" help="insert useful info here">
-            <validator type="no_options" message="Please select at least one column."/>
-        </param>            
-        <param name="experiement_cols" type="select" display="checkboxes" multiple="true" optional="True" label="Select columns containing second condition" 
-            dynamic_options="get_matrix_header( input_dataset=input_matrix )" help="insert useful info here">
-            <validator type="no_options" message="Please select at least one column."/>
-        </param>             
-                 
-        <repeat name="factor" title="factor">
-            <param name="factor_name" type="text" value="Factor Name" label="Specify a factor name" 
-                help="Order of factors you add effects the design formual and hence effects whole analysis. Design formual will be created as follows: ~factor1+factor2+factor3+...+factorN+condition"/>
-            <param name="factor_index" type="select" display="checkboxes" multiple="true" optional="True" label="Choose sample to compare with" 
-                    dynamic_options="get_matrix_header( input_dataset=input_matrix )" help="Select columns that are associated with a factor">
-                <validator type="no_options" message="Please select at least one column."/>
+        <conditional name="filtermode">
+            <param name="mode" type="select" label="Filter out features with following criteria">
+                <option value="absolute">Filter based on absolute normalized mean counts</option>
+                <option value="quantile">Filter based on quantiles</option>
             </param>
-        </repeat>
-        -->
-        <!--param name="organism" size="10" type="select">
-            <option value="human">human</option>
-            <option value="mouse">mouse</option>
-            <option value="fly">fly</option>
-            <option value="other">other</option>
-        </param-->
-        <param name="countthreshold" size="10" type="float" value="10.0" label="Filter out features with mean normalized counts lower than this value"/>
-        <param name="fittype" type="select" label="Type of fitting of dispersions to the mean intensity">
-            <option value="parametric">parametric</option>
-            <option value="local">local</option>
-            <option value="mean">mean</option>
-        </param>
-        <param name="pdf" type="boolean" truevalue="" falsevalue="" checked="true" 
-            label="Visualising the analysis results"
+            <when value="absolute">
+                <param name="count_abs" size="10" type="float" value="10.0" label="Filter out features with mean normalized counts lower than this value"/>
+            </when>
+            <when value="quantile">
+                <param name="count_quant" size="10" type="float" value="10.0" min="0" max="100" label="Filter out features with mean normalized counts lower than this % of values" />
+            </when>
+        </conditional>
+        <param name="pdf" type="boolean" truevalue="" falsevalue="" checked="true" label="Visualising the analysis results"
             help="output an additional PDF files" />
     </inputs>
 
     <outputs>
-        <data format="tabular" name="deseq_out" label="DESeq2 result file on ${on_string}" />
-        <data format="tabular" name="deseq_out_filtered" label="Independent filtering result file on ${on_string}" />
+        <data format="tabular" name="deseq_out" label="DESeq2 result file on ${on_string}"/>
+        <data format="tabular" name="deseq_out_filtered" label="Independent filtering result file on ${on_string}"/>
         <data format="pdf" name="plots" label="DESeq2 plots on ${on_string}">
             <filter>pdf == True</filter>
         </data>
     </outputs>
-    <code file="deseq_helper.py" />
 
     <help>
 
@@ -121,9 +100,25 @@
 
 **Inputs**
 
-DESeq2_ requires one count matrix as input file. You can use the tool
+DESeq2_ takes count tables that generated from the htseq-count as input. Count tables must be generated for each sample individually. DESeq2 is capable of handling multiple factors that effect your experiment. The first factor you input is considered as the primary factor that affects gene expressions. You also input several secondary factors that might influence your experiment. But the final output will be changes in genes due to primary factor in presence of secondary factors. Each factor has two levels/states. You need to select appropriate count table from your history for each factor level.
+
+The following table gives some examples of factors and their levels:
 
+========= ============== ===============
+Factor    Factor level 1 Factor level 2 
+--------- -------------- ---------------
+Treatment Treated        Untreated
+--------- -------------- ---------------
+Condition Knockdown      Wildtype
+--------- -------------- ---------------
+TimePoint Day4           Day1
+--------- -------------- ---------------
+SeqType   SingleEnd      PairedEnd
+--------- -------------- ---------------
+Gender    Female         Male
+========= ============== ===============
 
+*Note*: Output log2 fold changes are based on primary factor level 1 vs. factor level2. Here the order of factor levels is important. For example, for the factor 'Treatment' given in above table, DESeq2 computes fold changes of 'Treated' samples against 'Untreated', i.e. the values correspond to up or down regulations of genes in Treated samples.
 
 **Output**
 
@@ -134,7 +129,7 @@
 ------ ----------------------------------------------------------
      1 Gene Identifiers
      2 mean normalised counts, averaged over all samples from both conditions
-     3 the logarithm (to basis 2) of the fold change
+     3 the logarithm (to basis 2) of the fold change (See the note in inputs section)
      4 standard error estimate for the log2 fold change estimate
      5 p value for the statistical significance of this change
      6 p value adjusted for multiple testing with the Benjamini-Hochberg procedure
@@ -142,22 +137,10 @@
 ====== ==========================================================
 
 
-------
-
-**References** 
-
-DESeq2_ Authors: Michael Love (MPIMG Berlin), Simon Anders, Wolfgang Huber (EMBL Heidelberg)
-
-If DESeq2_ is used to obtain results for scientific publications it
-should be cited as [1]_. A paper describing DESeq2_ is in preparation.
-
-
-
-.. [1] Anders, S and Huber, W (2010): `Differential expression analysis for sequence count data`_. 
-
-.. _Differential expression analysis for sequence count data: http://dx.doi.org/10.1186/gb-2010-11-10-r106
 .. _DESeq2: http://master.bioconductor.org/packages/release/bioc/html/DESeq2.html
 
-
     </help>
+    <citations>
+        <citation type="doi">10.1101/002832</citation>
+    </citations>
 </tool>
--- a/deseq_helper.py	Wed Feb 19 12:43:03 2014 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,98 +0,0 @@
-
-from galaxy.tools.parameters import DataToolParameter
-
-def get_matrix_header( input_dataset ):
-    """
-        Not used currently, because the reload of the ckeckboxes did not work.
-    """
-    input_handle = open( input_dataset.file_name )
-    first_header = input_handle.readline()
-    second_header = input_handle.readline()
-    return [('%s::%s' % (cname2,cname1), str(int(col_num) + 1), False) for col_num, (cname2, cname1) in enumerate(zip(second_header.split()[1:],first_header.split()[1:])) ]
-
-
-
-def _construct_error_map( error_map, rep_dict, rep_parent, child, error_value ):
-    """
-        Its no so easy to create a propper error_map for repetitions in Galaxy.
-        This is a helper function.
-    """
-
-    error_map[ rep_parent ] = [ dict() for t in rep_dict ]
-    for i in range( len( rep_dict ) ):
-        error_map[ rep_parent ][i][ child ] = error_value
-
-
-
-def validate_input( trans, error_map, param_values, page_param_map ):
-    """
-        Validates the user input, before execution.
-    """
-    factors = param_values['rep_factorName']
-    factor_name_list = []
-    factor_duplication = False
-    level_duplication = False
-    overlapping_selection = False
-
-    first_condition = True
-    factor_indieces = list()
-
-    for factor in factors:
-        # factor names should be unique
-        fn = factor['factorName']
-        if fn in factor_name_list:
-            factor_duplication = True
-            break
-        factor_name_list.append( fn )
-
-        level_name_list = list()
-        factor_index_list = list()
-
-        if first_condition and len( factor['rep_factorLevel'] ) < 2:
-            # first condition needs to have at least 2 levels
-            _construct_error_map( error_map, factors, 'rep_factorName', 'rep_factorLevel', [ {'factorLevel': 'The first condition should have at least 2 factor'} for t in factor['rep_factorLevel'] ] )
-
-        for level in factor['rep_factorLevel']:
-            # level names under one factor should be unique
-            fl = level['factorLevel']
-            if fl in level_name_list:
-                level_duplication = True
-            level_name_list.append( fl )
-
-            fi = level['factorIndex']
-            if fi:
-                # the checkboxes should not have an overlap
-                for check in fi:
-                    if check in factor_index_list:
-                        overlapping_selection = True
-                    factor_index_list.append( check )
-
-            print set(factor_index_list)
-            print factor_indieces
-            if set(factor_index_list) in factor_indieces:
-                _construct_error_map( error_map, factors, 'rep_factorName', 'rep_factorLevel', [ {'factorLevel': 'It is not allowed to have two identical factors, that means two factors with the same toggeled checked boxes. '} for t in factor['rep_factorLevel'] ] )
-            else:
-                factor_indieces.append( set(factor_index_list) )
-
-
-
-        if level_duplication:
-            error_map['rep_factorName'] = [ dict() for t in factors ]
-            for i in range( len( factors ) ):
-                error_map['rep_factorName'][i]['rep_factorLevel'] = [ {'factorLevel': 'Factor levels for each factor need to be unique'} for t in factor['rep_factorLevel'] ]
-            break
-        if overlapping_selection:
-            error_map['rep_factorName'] = [ dict() for t in factors ]
-            for i in range( len( factors ) ):
-                error_map['rep_factorName'][i]['rep_factorLevel'] = [ {'factorIndex': 'The samples from different factors are not allowed to overlap'} for t in factor['rep_factorLevel'] ]
-            break
-
-        first_condition = False
-
-    if factor_duplication:
-        _construct_error_map( error_map, factors, 'rep_factorName', 'factorName', 'Factor names need to be unique' )
-        """
-        error_map['rep_factorName'] = [ dict() for t in factors ]
-        for i in range( len( factors ) ):
-            error_map['rep_factorName'][i]['factorName'] = 'Factor names need to be unique'
-        """
--- a/foldchanges_heatmap.r	Wed Feb 19 12:43:03 2014 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,44 +0,0 @@
-## Setup R error handling to go to stderr
-options( show.error.messages=F, error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } )
-# we need that to not crash galaxy with an UTF8 error on German LC settings.
-Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")
-
-args <- commandArgs(trailingOnly = TRUE)
-
-library("gplots")
-library("RColorBrewer")
-
-tables <- {}
-labels <- {}
-
-labels <- unlist(strsplit(args[1], ","))
-tables <- unlist(strsplit(args[2], ","))
-ids_file=args[3]
-output_pdf_file=args[4]
-
-reds <- colorRampPalette(brewer.pal(9,"Reds"))(100)
-reds <- rev(reds)
-greens <- colorRampPalette(brewer.pal(9,"Greens"))(100)
-cols <- c(reds, greens)
-
-ids <- read.table(ids_file)
-
-mat <- c()
-
-for(i in 1:length(tables)) {
- # get data frame
-    curr_table <- read.table(tables[[i]], sep="\t", header=FALSE)
-    log2FCvect <- c()
-    for(j in 1:length(ids$V1)) {
-        log2FCvect <- c(log2FCvect, curr_table$V3[which(curr_table$V1 %in% ids$V1[j])])
-    }
-    # build foldChange data frame for heatmap
-    mat <- cbind(mat, log2FCvect)
-}
-pdf(output_pdf_file) 
-hm <- heatmap.2(mat, col = cols, trace="none", labCol=labels, labRow=NULL,scale="none",symm=F,symkey=T,symbreaks=T, cexCol=0.8, cexRow=0.8)
-dev.off()  
-
-
-
-
--- a/foldchanges_heatmap.xml	Wed Feb 19 12:43:03 2014 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,87 +0,0 @@
-<tool id="foldchange_heatmap" name="fold change heatmap" version="1.0">
-    <description>Plot heat map of fold changes from multiple DESeq2 outputs</description>
-    <requirements>
-        <requirement type="binary">Rscript</requirement>
-        <requirement type="package" version="3.0.2">R_3_0_2</requirement>
-    </requirements>
-    <command interpreter="Rscript">
-            #set $label_list = list()
-            #set $table_list = list()
-            #for $input in $deseqout:
-                $label_list.append('%s' %($input.label))
-            #end for
-            #for $input in $deseqout:
-                $table_list.append('%s' %($input.table))
-            #end for
-            foldchanges_heatmap.r "#echo ','.join( $label_list )#" "#echo ','.join( $table_list )#" $gene_ids_file $plots
-            ## TODO: instead of throwing away stderr, try Bjoern's fix
-            2> /dev/null
-    </command>
-    <stdio>
-        <regex match="Execution halted" 
-           source="both" 
-           level="fatal" 
-           description="Execution halted." />
-    </stdio>
-    <inputs>
-        <repeat name="deseqout" title="DESeq2 output" min="1">
-            <param name="label" type="text" value="Label" label="Specify a label" help="" />
-            <param name="table" label="Select DESeq2 output" type="data" format="tabular"/>
-        </repeat>
-        <param name="gene_ids_file" label="Select file containing gene ids" type="data" format="tabular"/>
-    </inputs>
-
-    <outputs>
-        <data format="pdf" name="plots" label="Heatmap plot on ${on_string}"/>
-    </outputs>
-    <help>
-
-.. class:: infomark
-
-**What it does** 
-
-Estimate variance-mean dependence in count data from high-throughput sequencing assays and test for differential expression based on a model using the negative binomial distribution
-
-
-**Inputs**
-
-DESeq2_ requires one count matrix as input file. You can use the tool
-
-
-
-**Output**
-
-DESeq2_ generates a tabular file containing the different columns and optional visualized results as PDF.
-
-====== ==========================================================
-Column Description
------- ----------------------------------------------------------
-     1 Gene Identifiers
-     2 mean normalised counts, averaged over all samples from both conditions
-     3 the logarithm (to basis 2) of the fold change
-     4 standard error estimate for the log2 fold change estimate
-     5 p value for the statistical significance of this change
-     6 p value adjusted for multiple testing with the Benjamini-Hochberg procedure
-       which controls false discovery rate (FDR)
-====== ==========================================================
-
-
-------
-
-**References** 
-
-DESeq2_ Authors: Michael Love (MPIMG Berlin), Simon Anders, Wolfgang Huber (EMBL Heidelberg)
-
-If DESeq2_ is used to obtain results for scientific publications it
-should be cited as [1]_. A paper describing DESeq2_ is in preparation.
-
-
-
-.. [1] Anders, S and Huber, W (2010): `Differential expression analysis for sequence count data`_. 
-
-.. _Differential expression analysis for sequence count data: http://dx.doi.org/10.1186/gb-2010-11-10-r106
-.. _DESeq2: http://master.bioconductor.org/packages/release/bioc/html/DESeq2.html
-
-
-    </help>
-</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/helper.py	Sat Nov 15 05:42:24 2014 -0500
@@ -0,0 +1,41 @@
+
+from galaxy.tools.parameters import DataToolParameter
+
+def validate_input( trans, error_map, param_values, page_param_map ):
+    """
+        Validates the user input, before execution.
+    """
+    factors = param_values['rep_factorName']
+    factor_name_list = []
+    factor_duplication = False
+    level_duplication = False
+
+    for factor in factors:
+        # factor names should be unique
+        fn = factor['factorName']
+        if fn in factor_name_list:
+            factor_duplication = True
+            break
+        factor_name_list.append( fn )
+
+        level_name_list = list()
+        factor_index_list = list()
+
+        for level in factor['rep_factorLevel']:
+            # level names under one factor should be unique
+            fl = level['factorLevel']
+            if fl in level_name_list:
+                level_duplication = True
+            level_name_list.append( fl )
+
+        if level_duplication:
+            error_map['rep_factorName'] = [ dict() for t in factors ]
+            for i in range( len( factors ) ):
+                error_map['rep_factorName'][i]['rep_factorLevel'] = [ {'factorLevel': 'Factor levels for each factor need to be unique'} for t in factor['rep_factorLevel'] ]
+            break
+
+    if factor_duplication:
+        error_map['rep_factorName'] = [ dict() for t in factors ]
+        for i in range( len( factors ) ):
+            error_map['rep_factorName'][i]['factorName'] = 'Factor names need to be unique.'
+
--- a/test-data/countmatrix.txt	Wed Feb 19 12:43:03 2014 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,19 +0,0 @@
-#	G1:Mut1	G1:Mut1	G2:Mut2	G2:Mut2	G3:WT	G3:WT
-#Feature	Spl1	Spl2	Spl3	Spl4	Spl5	Spl6
-NM_001001130	97	43	61	34	73	26
-NM_001001144	25	8	9	3	5	5
-NM_001001152	72	45	29	20	31	13
-NM_001001160	0	1	1	1	0	0
-NM_001001177	0	1	0	4	3	3
-NM_001001178	0	2	1	0	4	0
-NM_001001179	0	0	0	0	0	2
-NM_001001180	0	0	0	0	0	2
-NM_001001181	415	319	462	185	391	155
-NM_001001182	1293	945	987	297	938	496
-NM_001001183	5	4	11	7	11	2
-NM_001001184	135	198	178	110	205	64
-NM_001001185	186	1	0	1	1	0
-NM_001001186	75	90	91	34	63	54
-NM_001001187	267	236	170	165	202	51
-NM_001001295	5	2	6	1	7	0
-NM_001001309	1	0	0	1	2	1
--- a/tool_dependencies.xml	Wed Feb 19 12:43:03 2014 -0500
+++ b/tool_dependencies.xml	Sat Nov 15 05:42:24 2014 -0500
@@ -1,9 +1,6 @@
 <?xml version="1.0"?>
 <tool_dependency>
-    <package name="R_3_0_2" version="3.0.2">
-        <repository changeset_revision="9aac33c71188" name="package_r_3_0_2" owner="iuc" toolshed="http://testtoolshed.g2.bx.psu.edu" />
-    </package>
-    <package name="deseq2" version="1.2.10">
-        <repository changeset_revision="43ec8420b239" name="package_deseq2_1_2_10" owner="iuc" toolshed="http://testtoolshed.g2.bx.psu.edu" />
+    <package name="deseq2" version="1.6.1">
+       <repository changeset_revision="feb1e4cb43f3" name="package_deseq2_1_6_1" owner="iuc" toolshed="https://testtoolshed.g2.bx.psu.edu" />
     </package>
 </tool_dependency>