# HG changeset patch
# User bgruening
# Date 1380552701 14400
# Node ID a05999fd6e268bc20fb05441f25e8520bdc2d831
# Parent 1d2a02bc220843dbe04bcdbead96f6008d8e7f58
Uploaded
diff -r 1d2a02bc2208 -r a05999fd6e26 deseq2.R
--- a/deseq2.R Mon Sep 09 07:09:38 2013 -0400
+++ b/deseq2.R Mon Sep 30 10:51:41 2013 -0400
@@ -10,23 +10,24 @@
#get options, using the spec as defined by the enclosed list.
#we read the options from the default: commandArgs(TRUE).
spec = matrix(c(
- 'verbose', 'v', 2, "integer",
- 'help' , 'h', 0, "logical",
- 'outputfile' , 'o', 1, "character",
- 'plots' , 'p', 2, "character",
- 'input' , 'i', 1, "character",
- 'samples', 's', 1, "character",
- 'factors', 'f', 2, "character",
- 'fittype', 't', 2, "character"
+ 'verbose', 'v', 2, "integer",
+ 'help' , 'h', 0, "logical",
+ 'outfile' , 'o', 1, "character",
+ 'outfilefiltered' , 'f', 1, "character",
+ 'plots' , 'p', 2, "character",
+ 'input' , 'i', 1, "character",
+ 'factors', 'm', 2, "character",
+ 'fittype', 't', 2, "character",
+ 'threshold', 'c', 2, "double"
+# 'organism', 'g', 2, "character"
), byrow=TRUE, ncol=4);
opt = getopt(spec);
-
# if help was asked for print a friendly message
# and exit with a non-zero error code
if ( !is.null(opt$help) ) {
- cat(getopt(spec, usage=TRUE));
- q(status=1);
+ cat(getopt(spec, usage=TRUE));
+ q(status=1);
}
if( is.null(opt$fittype))
@@ -42,148 +43,146 @@
names(htseqCountTable)<-colnames(htseqCountTable)
conditions <- htseqCountTable[1,]
conditions <- unlist(conditions[,-1])
-tagnames <- htseqCountTable[-(1:2), 1]
+featurenames <- htseqCountTable[-(1:2), 1]
htseqCountTable <- htseqCountTable[-(1:2),-1]
htseqCountTable[,1] <- gsub(",", ".", htseqCountTable[,1])
l <- unique(c(conditions))
-library( "DESeq2" )
+library('rjson')
+library('DESeq2')
+#library('biomaRt')
if ( !is.null(opt$plots) ) {
- pdf(opt$plots)
+ pdf(opt$plots)
}
-if (opt$samples=="all_vs_all"){
- # all versus all
- for (i in seq(1, length(l), by=1)) {
- k=i+1
- if(k<=length(l)){
- for (j in seq(k, length(l), by=1)) {
- currentColmuns <- which(conditions %in% c(l[[i]], l[[j]]))
- sampleNames <- names(htseqCountTable[currentColmuns])
-
- currentPairTable <- htseqCountTable[currentColmuns]
- ncondition1cols <- length(which(conditions %in% c(l[[i]])))
- ncondition2cols <- length(which(conditions %in% c(l[[i]])))
-
- ntabcols <- length(currentPairTable)
- currentPairTable <- as.integer(unlist(currentPairTable))
- currentPairTable <- matrix(unlist(currentPairTable), ncol = ntabcols, byrow = FALSE)
- colnames(currentPairTable) <- sampleNames
-
- pdata = data.frame(condition=factor(conditions[currentColmuns]),row.names=colnames(currentPairTable))
- print(pdata)
-
- dds = DESeqDataSetFromMatrix(countData = currentPairTable, colData = pdata, design = ~ condition)
-
- dds <- DESeq(dds, fitType= opt$fittype)
- sizeFactors(dds)
- res <- results(dds)
- resCols <- colnames(res)
-
- cond <- c(rep(paste(unique(conditions[currentColmuns]),collapse="_"), nrow(res)))
- res[, "condition"] <- cond
- res[, "geneIds"] <- tagnames
-
- resSorted <- res[order(res$padj),]
- write.table(as.data.frame(resSorted[,c("condition", "geneIds", resCols)]), file = opt$outputfile, sep="\t", quote = FALSE, append=TRUE, row.names = FALSE, col.names = FALSE)
-
- if ( !is.null(opt$plots) ) {
- plotDispEsts(dds)
- plotMA(dds)
-
- library("RColorBrewer")
- library("gplots")
- select <- order(rowMeans(counts(dds,normalized=TRUE)),decreasing=TRUE)[1:30]
- hmcol <- colorRampPalette(brewer.pal(9, "GnBu"))(100)
-
- rld <- rlogTransformation(dds, blind=TRUE)
- vsd <- varianceStabilizingTransformation(dds, blind=TRUE)
- distsRL <- dist(t(assay(rld)))
- mat <- as.matrix(distsRL)
- heatmap.2(mat, trace="none", col = rev(hmcol), main = "Sample-to-sample distances")
- }
- }
- }
- }
-}else{
- sampleSets <- unlist(strsplit(opt$samples, " "))
- samplesControl <- {}
- samplesExperiment <- {}
- samplesControl <- unlist(strsplit(sampleSets[1], ","))
- samplesExperiment <- unlist(strsplit(sampleSets[2], ","))
-
- # the minus one is needed because we get column indizes including the first column
- sampleColumns <- c()
- for (i in samplesControl) sampleColumns[[length(sampleColumns) + 1]] <- as.integer(i) - 1
- for (i in samplesExperiment) sampleColumns[[length(sampleColumns) + 1]] <- as.integer(i) - 1
- htseqSubCountTable <- htseqCountTable[,sampleColumns]
-
- ntabcols <- length(htseqSubCountTable)
- htseqSubCountTable <- as.integer(unlist(htseqSubCountTable))
- htseqSubCountTable <- matrix(unlist(htseqSubCountTable), ncol = ntabcols, byrow = FALSE)
- colnames(htseqSubCountTable) <- names(htseqCountTable)[sampleColumns]
-
- pdata = data.frame(row.names=colnames(htseqSubCountTable))
+## The following function takes deseq data object, computes, writes and plots the results.
+computeAndWriteResults <- function(dds, sampleCols, outputcsv, featurenames_filtered) {
+ dds <- DESeq(dds, fitType= opt$fittype)
+ sizeFactors(dds)
+ res <- results(dds)
+ resCols <- colnames(res)
+ cond <- c(rep(paste(unique(conditions[sampleCols]),collapse="_"), nrow(res)))
+ if(missing(featurenames_filtered)){
+ res[, "geneIds"] <- featurenames
+# rownames(res) <- featurenames
+ title_prefix = "Complete: "
+ }else{
+ res[, "geneIds"] <- featurenames_filtered
+# rownames(res) <- featurenames
+ title_prefix = "Filtered: "
+ }
+ print(sum(res$padj < .1, na.rm=TRUE))
+ print(opt$organism)
+# if(opt$organism != "other"){
+# dataset = ""
+# if(opt$organism == "mouse")
+# dataset = "mmusculus_gene_ensembl"
+# if(opt$organism == "human")
+# dataset = "hsapiens_gene_ensembl"
+# if(opt$organism == "fly")
+# dataset = "dmelanogaster_gene_ensembl"
+# ensembldb = useMart("ensembl",dataset=dataset)
+#
+# annot <- getBM(attributes = c("ensembl_gene_id", "external_gene_id","description"),
+# filters = "ensembl_gene_id",
+# values=res[, "geneIds"],
+# mart=ensembldb)
- if(!is.null(opt$factors)){
- factorSets <- unlist(strsplit(opt$factors, " "))
- for (factorSet in factorSets) {
- currentFactor <- unlist(strsplit(factorSet, ":"))
- for (factor in currentFactor[2]) {
- factoredCols <- unlist(strsplit(factor, ","))
- factoredCols <- as.numeric(factoredCols)
- factors <- c()
- for (i in sampleColumns){
- j=i+1
- if(j %in% factoredCols)
- factors[[length(factors) + 1]] <- "yes"
- else
- factors[[length(factors) + 1]] <- "no"
- }
- # only add factor if factor is applied to the selected samples
- if((length(unique(factors))) >= 2){
- pdata[[currentFactor[1]]]<-factor(factors)
- }else{
- write("Input-Error 01: You can only apply factors to selected samples.", stderr())
- }
- }
- }
- }
- pdata$condition<-factor(conditions[sampleColumns])
- print(pdata)
-
- designFormula <- as.formula(paste("", paste(names(pdata), collapse=" + "), sep=" ~ "))
- dds = DESeqDataSetFromMatrix(countData = htseqSubCountTable, colData = pdata, design = designFormula)
-
- dds <- DESeq(dds, fitType= opt$fittype)
- sizeFactors(dds)
- res <- results(dds)
- resCols <- colnames(res)
-
- cond <- c(rep(paste(unique(conditions[sampleColumns]),collapse="_"), nrow(res)))
- res[, "condition"] <- cond
- res[, "geneIds"] <- tagnames
-
- resSorted <- res[order(res$padj),]
- write.table(as.data.frame(resSorted[,c("condition", "geneIds", resCols)]), file = opt$outputfile, sep="\t", quote = FALSE, row.names = FALSE, col.names = FALSE)
-
- if ( !is.null(opt$plots) ) {
- plotDispEsts(dds)
- plotMA(dds)
+# res <- merge(res, annot,
+# by.x = "geneIds",
+# by.y = "ensembl_gene_id",
+# all.x=TRUE)
+
+# resCols <- colnames(res)
+# resSorted <- res[order(res$padj),]
+# write.table(as.data.frame(resSorted[,c(resCols)]), file = outputcsv, sep="\t", quote = FALSE, append=TRUE, row.names = FALSE, col.names = FALSE)
+# }else{
+ resSorted <- res[order(res$padj),]
+ write.table(as.data.frame(resSorted[,c("geneIds", resCols)]), file = outputcsv, sep="\t", quote = FALSE, append=TRUE, row.names = FALSE, col.names = FALSE)
+# }
+
+ if ( !is.null(opt$plots) ) {
+ plotDispEsts(dds, main= paste(title_prefix, "Dispersion estimate plot"))
+ plotMA(dds, main= paste(title_prefix, "MA-plot"))
library("RColorBrewer")
library("gplots")
select <- order(rowMeans(counts(dds,normalized=TRUE)),decreasing=TRUE)[1:30]
hmcol <- colorRampPalette(brewer.pal(9, "GnBu"))(100)
-
+
rld <- rlogTransformation(dds, blind=TRUE)
vsd <- varianceStabilizingTransformation(dds, blind=TRUE)
distsRL <- dist(t(assay(rld)))
mat <- as.matrix(distsRL)
- heatmap.2(mat, trace="none", col = rev(hmcol), main = "Sample-to-sample distances")
- }
+ heatmap.2(mat, trace="none", col = rev(hmcol), main = paste(title_prefix, "Sample-to-sample distances"))
+ }
+ return(res)
+}
+
+## This functions filters out the genes with low counts and plots p-value distribution
+independentFilter <- function(deseqRes, countTable, sampleColumns, designFormula){
+# use = (deseqRes$baseMean > quantile(deseqRes$baseMean, probs = opt$threshold) & (!is.na(deseqRes$pvalue)))
+ use = (deseqRes$baseMean > opt$threshold & !is.na(deseqRes$pvalue))
+ ddsFilt <- DESeqDataSetFromMatrix(countData = countTable[use, ], colData = sampleTable, design = designFormula)
+ computeAndWriteResults(ddsFilt, sampleColumns, opt$outfilefiltered, featurenames[use])
+
+ ## p-value distribution
+ h1 <- hist(deseqRes$pvalue[!use], breaks=50, plot=FALSE)
+ h2 <- hist(deseqRes$pvalue[use], breaks=50, plot=FALSE)
+ colori <- c(`filtered out`="khaki", `complete`="powderblue")
+ barplot(height = rbind(h1$counts, h2$counts), beside = FALSE,
+ col = colori, space = 0, main = "Distribution of p-values", ylab="frequency")
+ text(x = c(0, length(h1$counts)), y = 0, label = paste(c(0,1)), adj = c(0.5,1.7), xpd=NA)
+ legend("topright", fill=rev(colori), legend=rev(names(colori)))
+}
+
+parser <- newJSONParser()
+parser$addData( opt$factors )
+factorsList <- parser$getObject()
+
+sampleColumns <- c()
+
+for (j in 2:length(factorsList[[1]])){
+ for (k in 1:length(names(factorsList[[1]][[j]]))){
+ for (l in 1:length(factorsList[[1]][[j]][[k]])){
+ sampleColumns[[length(sampleColumns) + 1]] <- as.integer(factorsList[[1]][[j]][[k]][l]) - 1
+ }
+ }
}
+
+sampleColumns<-sort(unique(sampleColumns))
+htseqSubCountTable <- htseqCountTable[,sampleColumns]
+ntabcols <- length(htseqSubCountTable)
+htseqSubCountTable <- as.integer(unlist(htseqSubCountTable))
+htseqSubCountTable <- matrix(unlist(htseqSubCountTable), ncol = ntabcols, byrow = FALSE)
+colnames(htseqSubCountTable) <- names(htseqCountTable)[sampleColumns]
+sampleTable = data.frame(row.names=colnames(htseqSubCountTable))
+
+for(i in 1:length(factorsList)){
+ factorName<-factorsList[[i]][[1]]
+ for (j in 2:length(factorsList[[i]])){
+ effected_cols <- c()
+ for (k in 1:length(names(factorsList[[i]][[j]]))){
+ for (l in 1:length(factorsList[[i]][[j]][[k]])){
+ if(colnames(htseqCountTable)[factorsList[[i]][[j]][[k]][l]-1] %in% rownames(sampleTable)){
+ sampleTable[colnames(htseqCountTable)[factorsList[[i]][[j]][[k]][l]-1],factorName] <- names(factorsList[[i]][[j]])[k]
+ }
+ }
+ }
+ }
+}
+sampleTable[is.na(sampleTable)] <- "undefined"
+sampleTable
+
+factorNames <- c(names(sampleTable)[-1], names(sampleTable)[1])
+designFormula <- as.formula(paste("", paste(factorNames, collapse=" + "), sep=" ~ "))
+dds = DESeqDataSetFromMatrix(countData = htseqSubCountTable, colData = sampleTable, design = designFormula)
+deseqres <- computeAndWriteResults(dds, sampleColumns, opt$outfile)
+
+## independent filtering
+independentFilter(deseqres, htseqSubCountTable, sampleColumns, designFormula)
+
dev.off()
-
+sessionInfo()
diff -r 1d2a02bc2208 -r a05999fd6e26 deseq2.py
--- a/deseq2.py Mon Sep 09 07:09:38 2013 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,8 +0,0 @@
-#!/usr/bin/env python
-import os, sys
-import rpy2.robjects as robjects
-from rpy2.robjects.packages import importr
-deseq2 = importr("DESeq2")
-
-sys.stdout.write('Works!')
-
diff -r 1d2a02bc2208 -r a05999fd6e26 deseq2.xml
--- a/deseq2.xml Mon Sep 09 07:09:38 2013 -0400
+++ b/deseq2.xml Mon Sep 30 10:51:41 2013 -0400
@@ -1,5 +1,5 @@
- Determines differentially expressed transcripts from read alignments
+ Determines differentially expressed features from count data
Rscript
DESeq2
@@ -7,33 +7,33 @@
DESEQ2_SCRIPT_PATH
+ #import simplejson
deseq2.R
-o "$deseq_out"
+ --outfilefiltered "$deseq_out_filtered"
#if $pdf:
- -p "$plots"
+ -p "$plots"
#end if
-
+
-i "$input_matrix"
- #if $filter_sel.filter_sel_opts == 'all_vs_all':
- -s 'all_vs_all'
- #else:
- -s ## build a string like '1,2 5,6'
- "${filter_sel.control_cols} ${filter_sel.experiement_cols}"
+ #set $temp_factor_name = list()
+ #for $factor in $rep_factorName:
+ #set $temp_factor = dict()
+ #for $level in $factor.rep_factorLevel:
+ ##$temp_factor_list.append( '%s::%s:%s' % ($factor.factorName.replace(' ','_'), $level.factorLevel, $level.factorIndex) )
+ $temp_factor.update({str($level.factorLevel): map(int, str($level.factorIndex).split(','))})
+ #end for
+ $temp_factor_name.append([str($factor.factorName), $temp_factor])
- #set $temp_factor_list = list()
- #set $is_multi_factor_analysis = False
- #for $factor in $filter_sel.factor:
- #set $is_multi_factor_analysis = True
- $temp_factor_list.append( '%s:%s' % ($factor.factor_name.replace(' ','_'), $factor.factor_index) )
#end for
- #if $is_multi_factor_analysis:
- -f "#echo ' '.join( $temp_factor_list )#"
- #end if
- #end if
- -t $fittype
+ ##-m "#echo ' '.join( $temp_factor_list )#"
+ -m '#echo simplejson.dumps(temp_factor_name)#'
+ ##--organism "$organism"
+ ##-t "$fittype"
+ -c $countthreshold
+ 'Count reads in features with htseq-count'"/>
+
+
+
+
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
-
-
-
-
-
+
+
+
+
+
-
-
-
- pdf == True
-
-
+
+
+
+
+ pdf == True
+
+
+
-
+
.. class:: infomark
@@ -122,13 +130,12 @@
====== ==========================================================
Column Description
------ ----------------------------------------------------------
- 1 Condition tested (corresponds to the conitions from the first line of your count matrix)
- 2 Gene Identifiers
- 3 mean normalised counts, averaged over all samples from both conditions
- 4 the logarithm (to basis 2) of the fold change
- 5 standard error estimate for the log2 fold change estimate
- 6 p value for the statistical significance of this change
- 7 p value adjusted for multiple testing with the Benjamini-Hochberg procedure
+ 1 Gene Identifiers
+ 2 mean normalised counts, averaged over all samples from both conditions
+ 3 the logarithm (to basis 2) of the fold change
+ 4 standard error estimate for the log2 fold change estimate
+ 5 p value for the statistical significance of this change
+ 6 p value adjusted for multiple testing with the Benjamini-Hochberg procedure
which controls false discovery rate (FDR)
====== ==========================================================
@@ -150,5 +157,5 @@
.. _DESeq2: http://master.bioconductor.org/packages/release/bioc/html/DESeq2.html
-
+
diff -r 1d2a02bc2208 -r a05999fd6e26 matrix_helper.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/matrix_helper.py Mon Sep 30 10:51:41 2013 -0400
@@ -0,0 +1,66 @@
+
+from galaxy.tools.parameters import DataToolParameter
+
+def get_matrix_header( input_dataset ):
+ """
+ Not used currently, because the reload of the ckeckboxes did not work.
+ """
+ input_handle = open( input_dataset.file_name )
+ first_header = input_handle.readline()
+ second_header = input_handle.readline()
+ return [('%s::%s' % (cname2,cname1), str(int(col_num) + 1), False) for col_num, (cname2, cname1) in enumerate(zip(second_header.split()[1:],first_header.split()[1:])) ]
+
+
+def validate_input( trans, error_map, param_values, page_param_map ):
+ """
+ Validates the user input, before execution.
+ """
+ factors = param_values['rep_factorName']
+ factor_name_list = []
+ factor_duplication = False
+ level_duplication = False
+ overlapping_selection = False
+
+
+ for factor in factors:
+ # factor names should be unique
+ fn = factor['factorName']
+ if fn in factor_name_list:
+ factor_duplication = True
+ break
+ factor_name_list.append( fn )
+
+ level_name_list = list()
+ factor_index_list = list()
+
+ for level in factor['rep_factorLevel']:
+ # level names under one factor should be unique
+ fl = level['factorLevel']
+ if fl in level_name_list:
+ level_duplication = True
+ level_name_list.append( fl )
+
+ fi = level['factorIndex']
+ if fi:
+ # the checkboxes should not have an overlap
+ for check in fi:
+ if check in factor_index_list:
+ overlapping_selection = True
+ factor_index_list.append( check )
+
+ if level_duplication:
+ error_map['rep_factorName'] = [ dict() for t in factors ]
+ for i in range( len( factors ) ):
+ error_map['rep_factorName'][i]['rep_factorLevel'] = [ {'factorLevel': 'Factor levels for each factor need to be unique'} for t in factor['rep_factorLevel'] ]
+ break
+ if overlapping_selection:
+ error_map['rep_factorName'] = [ dict() for t in factors ]
+ for i in range( len( factors ) ):
+ error_map['rep_factorName'][i]['rep_factorLevel'] = [ {'factorIndex': 'The samples from different factors are not allowed to overlap'} for t in factor['rep_factorLevel'] ]
+ break
+
+ if factor_duplication:
+ error_map['rep_factorName'] = [ dict() for t in factors ]
+ for i in range( len( factors ) ):
+ error_map['rep_factorName'][i]['factorName'] = 'Factor names need to be unique'
+
diff -r 1d2a02bc2208 -r a05999fd6e26 tool_dependencies.xml
--- a/tool_dependencies.xml Mon Sep 09 07:09:38 2013 -0400
+++ b/tool_dependencies.xml Mon Sep 30 10:51:41 2013 -0400
@@ -4,7 +4,7 @@
-
+
$REPOSITORY_INSTALL_DIR/scripts