changeset 17:a05999fd6e26 draft

Uploaded
author bgruening
date Mon, 30 Sep 2013 10:51:41 -0400
parents 1d2a02bc2208
children 72613128bd65
files deseq2.R deseq2.py deseq2.xml matrix_helper.py tool_dependencies.xml
diffstat 5 files changed, 276 insertions(+), 212 deletions(-) [+]
line wrap: on
line diff
--- a/deseq2.R	Mon Sep 09 07:09:38 2013 -0400
+++ b/deseq2.R	Mon Sep 30 10:51:41 2013 -0400
@@ -10,23 +10,24 @@
 #get options, using the spec as defined by the enclosed list.
 #we read the options from the default: commandArgs(TRUE).
 spec = matrix(c(
-  'verbose', 'v', 2, "integer",
-  'help' , 'h', 0, "logical",
-  'outputfile' , 'o', 1, "character",
-  'plots' , 'p', 2, "character",
-  'input' , 'i', 1, "character",
-  'samples', 's', 1, "character",
-  'factors', 'f', 2, "character",
-  'fittype', 't', 2, "character"
+	'verbose', 'v', 2, "integer",
+	'help' , 'h', 0, "logical",
+	'outfile' , 'o', 1, "character",
+	'outfilefiltered' , 'f', 1, "character",
+	'plots' , 'p', 2, "character",
+	'input' , 'i', 1, "character",
+	'factors', 'm', 2, "character",
+	'fittype', 't', 2, "character",
+	'threshold', 'c', 2, "double"
+#	'organism', 'g', 2, "character"
 ), byrow=TRUE, ncol=4);
 opt = getopt(spec);
 
-
 # if help was asked for print a friendly message
 # and exit with a non-zero error code
 if ( !is.null(opt$help) ) {
-  cat(getopt(spec, usage=TRUE));
-  q(status=1);
+	cat(getopt(spec, usage=TRUE));
+	q(status=1);
 }
 
 if( is.null(opt$fittype))
@@ -42,148 +43,146 @@
 names(htseqCountTable)<-colnames(htseqCountTable)
 conditions <- htseqCountTable[1,]
 conditions <- unlist(conditions[,-1])
-tagnames <- htseqCountTable[-(1:2), 1]
+featurenames <- htseqCountTable[-(1:2), 1]
 htseqCountTable <- htseqCountTable[-(1:2),-1]
 htseqCountTable[,1] <- gsub(",", ".", htseqCountTable[,1])
 
 l <- unique(c(conditions))
 
-library( "DESeq2" )
+library('rjson')
+library('DESeq2')
+#library('biomaRt')
 
 if ( !is.null(opt$plots) ) {
-    pdf(opt$plots)
+	pdf(opt$plots)
 }
-if (opt$samples=="all_vs_all"){
-  # all versus all
-  for (i in seq(1, length(l), by=1)) {
-    k=i+1
-    if(k<=length(l)){
-      for (j in seq(k, length(l), by=1)) {
-        currentColmuns <- which(conditions %in% c(l[[i]], l[[j]]))
-            sampleNames <- names(htseqCountTable[currentColmuns])
-
-        currentPairTable <- htseqCountTable[currentColmuns]
-        ncondition1cols <- length(which(conditions %in% c(l[[i]])))
-        ncondition2cols <- length(which(conditions %in% c(l[[i]])))
-
-        ntabcols <- length(currentPairTable)
-            currentPairTable <- as.integer(unlist(currentPairTable))
-        currentPairTable <- matrix(unlist(currentPairTable), ncol = ntabcols, byrow = FALSE)
-            colnames(currentPairTable) <- sampleNames
-
-        pdata = data.frame(condition=factor(conditions[currentColmuns]),row.names=colnames(currentPairTable))
-        print(pdata)
-
-        dds = DESeqDataSetFromMatrix(countData = currentPairTable,  colData = pdata, design = ~ condition)
-
-        dds <- DESeq(dds, fitType= opt$fittype)
-        sizeFactors(dds)
-        res <- results(dds)
-        resCols <- colnames(res)
-
-        cond <- c(rep(paste(unique(conditions[currentColmuns]),collapse="_"), nrow(res)))
-        res[, "condition"] <- cond
-        res[, "geneIds"] <- tagnames
-        
-        resSorted <- res[order(res$padj),]
-        write.table(as.data.frame(resSorted[,c("condition", "geneIds", resCols)]), file = opt$outputfile, sep="\t", quote = FALSE, append=TRUE, row.names = FALSE, col.names = FALSE)
-        
-        if ( !is.null(opt$plots) ) {
-            plotDispEsts(dds)
-            plotMA(dds)
-            
-			library("RColorBrewer")
-			library("gplots")
-			select <- order(rowMeans(counts(dds,normalized=TRUE)),decreasing=TRUE)[1:30]
-			hmcol <- colorRampPalette(brewer.pal(9, "GnBu"))(100)
-			
-			rld <- rlogTransformation(dds, blind=TRUE)
-			vsd <- varianceStabilizingTransformation(dds, blind=TRUE)
-			distsRL <- dist(t(assay(rld)))
-			mat <- as.matrix(distsRL)
-			heatmap.2(mat, trace="none", col = rev(hmcol), main = "Sample-to-sample distances")            
-        }        
-      }
-    }
-  }
-}else{
 
-    sampleSets <- unlist(strsplit(opt$samples, " "))
-    samplesControl <- {}
-    samplesExperiment <- {}
-    samplesControl <- unlist(strsplit(sampleSets[1], ","))
-    samplesExperiment <- unlist(strsplit(sampleSets[2], ","))
-
-    # the minus one is needed because we get column indizes including the first column
-    sampleColumns <- c()
-    for (i in samplesControl) sampleColumns[[length(sampleColumns) + 1]] <- as.integer(i) - 1
-    for (i in samplesExperiment) sampleColumns[[length(sampleColumns) + 1]] <- as.integer(i) - 1
-    htseqSubCountTable <- htseqCountTable[,sampleColumns]
-
-    ntabcols <- length(htseqSubCountTable)
-    htseqSubCountTable <- as.integer(unlist(htseqSubCountTable))
-    htseqSubCountTable <- matrix(unlist(htseqSubCountTable), ncol = ntabcols, byrow = FALSE)
-    colnames(htseqSubCountTable) <- names(htseqCountTable)[sampleColumns]
-
-    pdata = data.frame(row.names=colnames(htseqSubCountTable))
+## The following function takes deseq data object, computes, writes and plots the results.
+computeAndWriteResults <- function(dds, sampleCols, outputcsv, featurenames_filtered) {
+	dds <- DESeq(dds, fitType= opt$fittype)
+	sizeFactors(dds)
+	res <- results(dds)
+	resCols <- colnames(res)
+	cond <- c(rep(paste(unique(conditions[sampleCols]),collapse="_"), nrow(res)))
+	if(missing(featurenames_filtered)){
+		res[, "geneIds"] <- featurenames
+#		rownames(res) <- featurenames
+		title_prefix = "Complete: "
+	}else{
+		res[, "geneIds"] <- featurenames_filtered
+#		rownames(res) <- featurenames
+		title_prefix = "Filtered: "
+	}
+	print(sum(res$padj < .1, na.rm=TRUE))
+	print(opt$organism)
+#	if(opt$organism != "other"){
+#		dataset = ""
+#		if(opt$organism == "mouse")
+#			dataset = "mmusculus_gene_ensembl"
+#		if(opt$organism == "human")
+#			dataset = "hsapiens_gene_ensembl"
+#		if(opt$organism == "fly")
+#			dataset = "dmelanogaster_gene_ensembl"
+#		ensembldb = useMart("ensembl",dataset=dataset)
+#
+#		annot <- getBM(attributes = c("ensembl_gene_id", "external_gene_id","description"),
+#					filters = "ensembl_gene_id",
+#					values=res[, "geneIds"],
+#					mart=ensembldb)
 
-    if(!is.null(opt$factors)){
-        factorSets <- unlist(strsplit(opt$factors, " "))
-        for (factorSet in factorSets) {
-            currentFactor <- unlist(strsplit(factorSet, ":"))
-            for (factor in currentFactor[2]) {
-                factoredCols <- unlist(strsplit(factor, ","))
-                factoredCols <- as.numeric(factoredCols)
-                factors <- c()
-                for (i in sampleColumns){
-                    j=i+1
-                    if(j %in% factoredCols)
-                        factors[[length(factors) + 1]] <- "yes"
-                    else
-                        factors[[length(factors) + 1]] <- "no"
-                }
-                # only add factor if factor is applied to the selected samples
-                if((length(unique(factors))) >= 2){
-                        pdata[[currentFactor[1]]]<-factor(factors)
-                }else{
-                    write("Input-Error 01: You can only apply factors to selected samples.", stderr())
-                }
-            }
-        }
-    }
-    pdata$condition<-factor(conditions[sampleColumns])
-    print(pdata)
-
-    designFormula <- as.formula(paste("", paste(names(pdata), collapse=" + "), sep=" ~ "))
-    dds = DESeqDataSetFromMatrix(countData = htseqSubCountTable,  colData = pdata, design = designFormula)
-
-    dds <- DESeq(dds, fitType= opt$fittype)
-    sizeFactors(dds)
-    res <- results(dds)
-    resCols <- colnames(res)
-
-    cond <- c(rep(paste(unique(conditions[sampleColumns]),collapse="_"), nrow(res)))
-    res[, "condition"] <- cond
-    res[, "geneIds"] <- tagnames
-
-    resSorted <- res[order(res$padj),]
-    write.table(as.data.frame(resSorted[,c("condition", "geneIds", resCols)]), file = opt$outputfile, sep="\t", quote = FALSE, row.names = FALSE, col.names = FALSE)
-
-    if ( !is.null(opt$plots) ) {
-        plotDispEsts(dds)
-        plotMA(dds)
+#		res <- merge(res, annot,
+#					by.x = "geneIds",
+#					by.y = "ensembl_gene_id",
+#					all.x=TRUE)
+		
+#		resCols <- colnames(res)
+#		resSorted <- res[order(res$padj),]
+#		write.table(as.data.frame(resSorted[,c(resCols)]), file = outputcsv, sep="\t", quote = FALSE, append=TRUE, row.names = FALSE, col.names = FALSE)					
+#	}else{
+		resSorted <- res[order(res$padj),]
+		write.table(as.data.frame(resSorted[,c("geneIds", resCols)]), file = outputcsv, sep="\t", quote = FALSE, append=TRUE, row.names = FALSE, col.names = FALSE)
+#	}
+	
+	if ( !is.null(opt$plots) ) {
+		plotDispEsts(dds, main= paste(title_prefix, "Dispersion estimate plot"))
+		plotMA(dds, main= paste(title_prefix, "MA-plot"))
 
 		library("RColorBrewer")
 		library("gplots")
 		select <- order(rowMeans(counts(dds,normalized=TRUE)),decreasing=TRUE)[1:30]
 		hmcol <- colorRampPalette(brewer.pal(9, "GnBu"))(100)
-        
+
 		rld <- rlogTransformation(dds, blind=TRUE)
 		vsd <- varianceStabilizingTransformation(dds, blind=TRUE)
 		distsRL <- dist(t(assay(rld)))
 		mat <- as.matrix(distsRL)
-		heatmap.2(mat, trace="none", col = rev(hmcol), main = "Sample-to-sample distances")
-    }    
+		heatmap.2(mat, trace="none", col = rev(hmcol), main = paste(title_prefix, "Sample-to-sample distances"))
+	}
+	return(res)
+}
+
+## This functions filters out the genes with low counts and plots p-value distribution
+independentFilter <- function(deseqRes, countTable, sampleColumns, designFormula){
+#	use = (deseqRes$baseMean > quantile(deseqRes$baseMean, probs = opt$threshold) & (!is.na(deseqRes$pvalue)))
+	use = (deseqRes$baseMean > opt$threshold & !is.na(deseqRes$pvalue))
+	ddsFilt <- DESeqDataSetFromMatrix(countData = countTable[use, ], colData = sampleTable, design = designFormula)
+	computeAndWriteResults(ddsFilt, sampleColumns, opt$outfilefiltered, featurenames[use])
+	
+	## p-value distribution
+	h1 <- hist(deseqRes$pvalue[!use], breaks=50, plot=FALSE)
+	h2 <- hist(deseqRes$pvalue[use], breaks=50, plot=FALSE)
+	colori <- c(`filtered out`="khaki", `complete`="powderblue")
+	barplot(height = rbind(h1$counts, h2$counts), beside = FALSE,
+			col = colori, space = 0, main = "Distribution of p-values", ylab="frequency")
+	text(x = c(0, length(h1$counts)), y = 0, label = paste(c(0,1)), adj = c(0.5,1.7), xpd=NA)
+	legend("topright", fill=rev(colori), legend=rev(names(colori)))	
+}
+
+parser <- newJSONParser()
+parser$addData( opt$factors )
+factorsList <- parser$getObject()
+
+sampleColumns <- c()
+
+for (j in 2:length(factorsList[[1]])){
+    for (k in 1:length(names(factorsList[[1]][[j]]))){
+        for (l in 1:length(factorsList[[1]][[j]][[k]])){
+            sampleColumns[[length(sampleColumns) + 1]] <- as.integer(factorsList[[1]][[j]][[k]][l]) - 1
+        }
+    }
 }
+
+sampleColumns<-sort(unique(sampleColumns))
+htseqSubCountTable <- htseqCountTable[,sampleColumns]
+ntabcols <- length(htseqSubCountTable)
+htseqSubCountTable <- as.integer(unlist(htseqSubCountTable))
+htseqSubCountTable <- matrix(unlist(htseqSubCountTable), ncol = ntabcols, byrow = FALSE)
+colnames(htseqSubCountTable) <- names(htseqCountTable)[sampleColumns]
+sampleTable = data.frame(row.names=colnames(htseqSubCountTable))
+ 
+for(i in 1:length(factorsList)){
+    factorName<-factorsList[[i]][[1]]
+    for (j in 2:length(factorsList[[i]])){
+        effected_cols <- c()
+        for (k in 1:length(names(factorsList[[i]][[j]]))){
+            for (l in 1:length(factorsList[[i]][[j]][[k]])){
+                if(colnames(htseqCountTable)[factorsList[[i]][[j]][[k]][l]-1] %in% rownames(sampleTable)){
+                    sampleTable[colnames(htseqCountTable)[factorsList[[i]][[j]][[k]][l]-1],factorName] <- names(factorsList[[i]][[j]])[k]
+                }
+            }
+        }
+    }
+}
+sampleTable[is.na(sampleTable)] <- "undefined"
+sampleTable
+
+factorNames <- c(names(sampleTable)[-1], names(sampleTable)[1])
+designFormula <- as.formula(paste("", paste(factorNames, collapse=" + "), sep=" ~ "))
+dds = DESeqDataSetFromMatrix(countData = htseqSubCountTable,  colData = sampleTable, design = designFormula)
+deseqres <- computeAndWriteResults(dds, sampleColumns, opt$outfile) 
+
+## independent filtering
+independentFilter(deseqres, htseqSubCountTable, sampleColumns, designFormula)
+
 dev.off()
-
+sessionInfo()
--- a/deseq2.py	Mon Sep 09 07:09:38 2013 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,8 +0,0 @@
-#!/usr/bin/env python
-import os, sys
-import rpy2.robjects as robjects
-from rpy2.robjects.packages import importr
-deseq2 = importr("DESeq2")
-
-sys.stdout.write('Works!')
-
--- a/deseq2.xml	Mon Sep 09 07:09:38 2013 -0400
+++ b/deseq2.xml	Mon Sep 30 10:51:41 2013 -0400
@@ -1,5 +1,5 @@
 <tool id="deseq2" name="DESeq2" version="2.0.1">
-  <description>Determines differentially expressed transcripts from read alignments</description>
+    <description>Determines differentially expressed features from count data</description>
     <requirements>
         <requirement type="binary">Rscript</requirement>
         <requirement type="R-module">DESeq2</requirement>
@@ -7,33 +7,33 @@
         <requirement type="set_environment">DESEQ2_SCRIPT_PATH</requirement>
     </requirements>
     <command interpreter="Rscript">
+        #import simplejson
         deseq2.R
             -o "$deseq_out"
+            --outfilefiltered "$deseq_out_filtered"
 
             #if $pdf:
-                -p "$plots"
+                -p "$plots" 
             #end if
-
+            
             -i "$input_matrix"
 
-        #if $filter_sel.filter_sel_opts == 'all_vs_all':
-            -s 'all_vs_all'
-        #else:
-            -s ## build a string like '1,2 5,6'
-            "${filter_sel.control_cols} ${filter_sel.experiement_cols}"
+            #set $temp_factor_name = list()
+            #for $factor in $rep_factorName:
+                #set $temp_factor = dict()                
+                #for $level in $factor.rep_factorLevel:
+                    ##$temp_factor_list.append( '%s::%s:%s' % ($factor.factorName.replace(' ','_'), $level.factorLevel, $level.factorIndex) )
+                    $temp_factor.update({str($level.factorLevel): map(int, str($level.factorIndex).split(','))})
+                #end for
+                $temp_factor_name.append([str($factor.factorName), $temp_factor])
 
-            #set $temp_factor_list = list()
-            #set $is_multi_factor_analysis = False
-            #for $factor in $filter_sel.factor:
-                #set $is_multi_factor_analysis = True
-                $temp_factor_list.append( '%s:%s' % ($factor.factor_name.replace(' ','_'), $factor.factor_index) )
             #end for
 
-            #if $is_multi_factor_analysis:
-                -f "#echo ' '.join( $temp_factor_list )#"
-            #end if
-        #end if
-				-t $fittype
+                ##-m "#echo ' '.join( $temp_factor_list )#"
+                -m '#echo simplejson.dumps(temp_factor_name)#'
+            ##--organism "$organism"
+            ##-t "$fittype"
+            -c $countthreshold
     </command>
     <stdio>
         <regex match="Execution halted" 
@@ -51,56 +51,64 @@
     </stdio>
     <inputs>
         <param format="tabular" name="input_matrix" type="data" label="Countmatrix" help="You can create a count matrix with the tool 
-			'Count reads in features with htseq-count'"/>
+            'Count reads in features with htseq-count'"/>
+        
+        <repeat name="rep_factorName" title="Factor/Condition" min="1">
+            <param name="factorName" type="text" value="FactorName" label="Specify a factor name" help=""/>
+            <repeat name="rep_factorLevel" title="Factor level" min="1">
+                <param name="factorLevel" type="text" value="FactorLevel" label="Specify a factor level" help=""/>
 
-         <conditional name="filter_sel">
-            <param name="filter_sel_opts" type="select" label="Select mode of analysis" help="All against all: analyses all possible pairs of conditions,
-				&lt;br&gt; Specific: select your samples of interest">
-              <option value="all_vs_all">All against all</option>
-              <option value="specific">specific</option>
-            </param>
-            <when value="specific">
-
-                <param name="control_cols" label="Select columns containing first condition" type="data_column" data_ref="input_matrix"
-                    numerical="True" multiple="true" use_header_names="true" size="120" display="checkboxes">
-                    <validator type="no_options" message="Please select at least one column."/>
-                </param>
-
-                <param name="experiement_cols" label="Select columns containing second condition" type="data_column" data_ref="input_matrix"
+                <param name="factorIndex" label="Select columns that are associated with this factor level" type="data_column" data_ref="input_matrix"
                     numerical="True" multiple="true" use_header_names="true" size="120" display="checkboxes">
                     <validator type="no_options" message="Please select at least one column."/>
                 </param>
-
-                <repeat name="factor" title="Include factor">
-                    <param name="factor_name" type="text" value="Factor Name" label="Specify a factor name" help=""/>
-
-                    <param name="factor_index" label="Select columns that are associated with a factor." type="data_column" data_ref="input_matrix"
-                        numerical="True" multiple="true" use_header_names="true" size="120" display="checkboxes">
-                        <validator type="no_options" message="Please select at least one column."/>
-                    </param>
-                </repeat>
-
-            </when>
-            <when value="all_vs_all" />
-        </conditional>
-
+            </repeat>
+        </repeat>
+        <!--
+        <param name="control_cols" type="select" display="checkboxes" multiple="true" optional="True" label="Select columns containing first condition" 
+            dynamic_options="get_matrix_header( input_dataset=input_matrix )" help="insert useful info here">
+            <validator type="no_options" message="Please select at least one column."/>
+        </param>            
+        <param name="experiement_cols" type="select" display="checkboxes" multiple="true" optional="True" label="Select columns containing second condition" 
+            dynamic_options="get_matrix_header( input_dataset=input_matrix )" help="insert useful info here">
+            <validator type="no_options" message="Please select at least one column."/>
+        </param>             
+                 
+        <repeat name="factor" title="factor">
+            <param name="factor_name" type="text" value="Factor Name" label="Specify a factor name" 
+                help="Order of factors you add effects the design formual and hence effects whole analysis. Design formual will be created as follows: ~factor1+factor2+factor3+...+factorN+condition"/>
+            <param name="factor_index" type="select" display="checkboxes" multiple="true" optional="True" label="Choose sample to compare with" 
+                    dynamic_options="get_matrix_header( input_dataset=input_matrix )" help="Select columns that are associated with a factor">
+                <validator type="no_options" message="Please select at least one column."/>
+            </param>
+        </repeat>
+        -->
+        <!--param name="organism" size="10" type="select">
+            <option value="human">human</option>
+            <option value="mouse">mouse</option>
+            <option value="fly">fly</option>
+            <option value="other">other</option>
+        </param-->
+        <param name="countthreshold" size="10" type="float" value="10" label="Filter out features with mean normalized counts lower than this value"/>
         <param name="fittype" type="select" label="Type of fitting of dispersions to the mean intensity">
-			<option value="parametric">parametric</option>
-			<option value="local">local</option>
-			<option value="mean">mean</option>
-		</param>	
-		<param name="pdf" type="boolean" truevalue="" falsevalue="" checked="true" label="Visualising the analysis results"
-            help="output an additional PDF file" />
+            <option value="parametric">parametric</option>
+            <option value="local">local</option>
+            <option value="mean">mean</option>
+        </param> 
+        <param name="pdf" type="boolean" truevalue="" falsevalue="" checked="true" label="Visualising the analysis results"
+            help="output an additional PDF files" />            
     </inputs>
 
-  <outputs>
-    <data format="tabular" name="deseq_out" label="DESeq2 result file on ${on_string}"/>
-    <data format="pdf" name="plots" label="DESeq2 plots on ${on_string}">
-        <filter>pdf == True</filter>
-    </data>
-  </outputs>
+    <outputs>
+        <data format="tabular" name="deseq_out" label="DESeq2 result file on ${on_string}"/>
+        <data format="tabular" name="deseq_out_filtered" label="Independent filtering result file on ${on_string}"/>
+        <data format="pdf" name="plots" label="DESeq2 plots on ${on_string}">
+            <filter>pdf == True</filter>
+        </data>
+    </outputs>
+    <code file="matrix_helper.py" />
 
-  <help>
+    <help>
 
 .. class:: infomark
 
@@ -122,13 +130,12 @@
 ====== ==========================================================
 Column Description
 ------ ----------------------------------------------------------
-     1 Condition tested (corresponds to the conitions from the first line of your count matrix)
-     2 Gene Identifiers
-     3 mean normalised counts, averaged over all samples from both conditions
-     4 the logarithm (to basis 2) of the fold change
-     5 standard error estimate for the log2 fold change estimate
-     6 p value for the statistical significance of this change
-     7 p value adjusted for multiple testing with the Benjamini-Hochberg procedure
+     1 Gene Identifiers
+     2 mean normalised counts, averaged over all samples from both conditions
+     3 the logarithm (to basis 2) of the fold change
+     4 standard error estimate for the log2 fold change estimate
+     5 p value for the statistical significance of this change
+     6 p value adjusted for multiple testing with the Benjamini-Hochberg procedure
        which controls false discovery rate (FDR)
 ====== ==========================================================
 
@@ -150,5 +157,5 @@
 .. _DESeq2: http://master.bioconductor.org/packages/release/bioc/html/DESeq2.html
 
 
-  </help>
+    </help>
 </tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/matrix_helper.py	Mon Sep 30 10:51:41 2013 -0400
@@ -0,0 +1,66 @@
+
+from galaxy.tools.parameters import DataToolParameter
+
+def get_matrix_header( input_dataset ):
+    """
+        Not used currently, because the reload of the ckeckboxes did not work.
+    """
+    input_handle = open( input_dataset.file_name )
+    first_header = input_handle.readline()
+    second_header = input_handle.readline()
+    return [('%s::%s' % (cname2,cname1), str(int(col_num) + 1), False) for col_num, (cname2, cname1) in enumerate(zip(second_header.split()[1:],first_header.split()[1:])) ]
+
+
+def validate_input( trans, error_map, param_values, page_param_map ):
+    """
+        Validates the user input, before execution.
+    """
+    factors = param_values['rep_factorName']
+    factor_name_list = []
+    factor_duplication = False
+    level_duplication = False
+    overlapping_selection = False
+
+
+    for factor in factors:
+        # factor names should be unique
+        fn = factor['factorName']
+        if fn in factor_name_list:
+            factor_duplication = True
+            break
+        factor_name_list.append( fn )
+
+        level_name_list = list()
+        factor_index_list = list()
+
+        for level in factor['rep_factorLevel']:
+            # level names under one factor should be unique
+            fl = level['factorLevel']
+            if fl in level_name_list:
+                level_duplication = True
+            level_name_list.append( fl )
+
+            fi = level['factorIndex']
+            if fi:
+                # the checkboxes should not have an overlap
+                for check in fi:
+                    if check in factor_index_list:
+                        overlapping_selection = True
+                    factor_index_list.append( check )
+
+        if level_duplication:
+            error_map['rep_factorName'] = [ dict() for t in factors ]
+            for i in range( len( factors ) ):
+                error_map['rep_factorName'][i]['rep_factorLevel'] = [ {'factorLevel': 'Factor levels for each factor need to be unique'} for t in factor['rep_factorLevel'] ]
+            break
+        if overlapping_selection:
+            error_map['rep_factorName'] = [ dict() for t in factors ]
+            for i in range( len( factors ) ):
+                error_map['rep_factorName'][i]['rep_factorLevel'] = [ {'factorIndex': 'The samples from different factors are not allowed to overlap'} for t in factor['rep_factorLevel'] ]
+            break
+
+    if factor_duplication:
+        error_map['rep_factorName'] = [ dict() for t in factors ]
+        for i in range( len( factors ) ):
+            error_map['rep_factorName'][i]['factorName'] = 'Factor names need to be unique'
+
--- a/tool_dependencies.xml	Mon Sep 09 07:09:38 2013 -0400
+++ b/tool_dependencies.xml	Mon Sep 30 10:51:41 2013 -0400
@@ -4,7 +4,7 @@
         <repository changeset_revision="bae5c9880b71" name="package_r_3_0_1" owner="bgruening" toolshed="http://testtoolshed.g2.bx.psu.edu" />
     </package>
     <package name="deseq2" version="1.0.17">
-        <repository changeset_revision="eab36a9a70b1" name="package_deseq2_1_0_17" owner="bgruening" toolshed="http://testtoolshed.g2.bx.psu.edu" />
+        <repository changeset_revision="3e3d1a50a347" name="package_deseq2_1_0_17" owner="bgruening" toolshed="http://testtoolshed.g2.bx.psu.edu" />
     </package>
     <set_environment version="1.0">
         <environment_variable action="set_to" name="DESEQ2_SCRIPT_PATH">$REPOSITORY_INSTALL_DIR/scripts</environment_variable>