# HG changeset patch
# User bgruening
# Date 1378310300 14400
# Node ID bb5c80d15e0a68b80dd72955b5fb94d4f40f4210
# Parent 6d17a7d6fe9c388997323fc31c536d97bd3c2645
Uploaded
diff -r 6d17a7d6fe9c -r bb5c80d15e0a deseq2.R
--- a/deseq2.R Mon Sep 02 10:09:37 2013 -0400
+++ b/deseq2.R Wed Sep 04 11:58:20 2013 -0400
@@ -1,3 +1,8 @@
+## Setup R error handling to go to stderr
+options( show.error.messages=F, error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } )
+# we need that to not crash galaxy with an UTF8 error on German LC settings.
+Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")
+
library('getopt');
options(stringAsfactors = FALSE, useFancyQuotes = FALSE)
args <- commandArgs(trailingOnly = TRUE)
@@ -8,9 +13,10 @@
'verbose', 'v', 2, "integer",
'help' , 'h', 0, "logical",
'outputfile' , 'o', 1, "character",
- 'plots' , 'p', 1, "character",
+ 'plots' , 'p', 2, "character",
'input' , 'i', 1, "character",
- 'samples', 's', 1, "character"
+ 'samples', 's', 1, "character",
+ 'factors', 'f', 2, "character"
), byrow=TRUE, ncol=4);
opt = getopt(spec);
@@ -23,99 +29,131 @@
}
trim <- function (x) gsub("^\\s+|\\s+$", "", x)
-opt$samples<-trim(opt$samples)
-
+opt$samples <- trim(opt$samples)
+opt$factors <- trim(opt$factors)
htseqCountTable = read.table(opt$input, sep="\t", comment="", as.is=T)
-l<-unique(c(htseqCountTable[1,-1]))
-#print(l)
-
-colnames(htseqCountTable) <- htseqCountTable[1,]
+colnames(htseqCountTable) <- htseqCountTable[2,]
names(htseqCountTable)<-colnames(htseqCountTable)
+conditions <- htseqCountTable[1,]
+conditions <- unlist(conditions[,-1])
tagnames <- htseqCountTable[-(1:2), 1]
htseqCountTable <- htseqCountTable[-(1:2),-1]
htseqCountTable[,1] <- gsub(",", ".", htseqCountTable[,1])
+l <- unique(c(conditions))
library( "DESeq2" )
-opt$samples
-
-print('----------------')
-
-pdf(opt$plots)
-
+if ( !is.null(opt$plots) ) {
+ pdf(opt$plots)
+}
if (opt$samples=="all_vs_all"){
-
+ # all versus all
for (i in seq(1, length(l), by=1)) {
k=i+1
if(k<=length(l)){
for (j in seq(k, length(l), by=1)) {
-
- colData<-names(htseqCountTable[, which(gsub("[.][0-9]+","",names(htseqCountTable)) %in% c(l[[i]], l[[j]]))])
- currentPairTable<-htseqCountTable[ , which(gsub("[.][0-9]+","",names(htseqCountTable)) %in% c(l[[i]], l[[j]]))]
- # rownames(currentPairTable)<-tagnames
-
- write.table(currentPairTable, file=paste(l[[i]],"_", l[[j]],".txt"))
- currentPairTable<-read.table(paste(l[[i]],"_", l[[j]],".txt"),row.names=1,header=T)
-
- currentPairTable <- as.data.frame(currentPairTable,stringsAsFactors=F)
- #print(currentPairTable)
- pdata = data.frame(condition=factor(c(rep(l[[i]], 2), rep(l[[j]], 2))),row.names=colnames(currentPairTable))
+ currentColmuns <- which(conditions %in% c(l[[i]], l[[j]]))
+ sampleNames <- names(htseqCountTable[currentColmuns])
+
+ currentPairTable <- htseqCountTable[currentColmuns]
+ ncondition1cols <- length(which(conditions %in% c(l[[i]])))
+ ncondition2cols <- length(which(conditions %in% c(l[[i]])))
+
+ ntabcols <- length(currentPairTable)
+ currentPairTable <- as.integer(unlist(currentPairTable))
+ currentPairTable <- matrix(unlist(currentPairTable), ncol = ntabcols, byrow = FALSE)
+ colnames(currentPairTable) <- sampleNames
+
+ pdata = data.frame(condition=factor(conditions[currentColmuns]),row.names=colnames(currentPairTable))
+
dds = DESeqDataSetFromMatrix(countData = currentPairTable, colData = pdata, design = ~ condition)
-
- # colData(dds)$condition
+
dds <- DESeq(dds)
sizeFactors(dds)
- plotDispEsts(dds)
- plotMA(dds)
-
+ if ( !is.null(opt$plots) ) {
+ plotDispEsts(dds)
+ plotMA(dds)
+ }
res <- results(dds)
- rownames(res)<-tagnames
- resSorted <- res[order(res$padj),];
- write.csv( as.data.frame(resSorted), file = opt$outputfile )
+ resCols <- colnames(res)
+
+ cond <- c(rep(paste(unique(conditions[currentColmuns]),collapse="_"), nrow(res)))
+ res[, "condition"] <- cond
+ res[, "geneIds"] <- tagnames
+
+ resSorted <- res[order(res$padj),]
+ write.table(as.data.frame(resSorted[,c("condition", "geneIds", resCols)]), file = opt$outputfile, sep="\t", quote = FALSE, append=TRUE, row.names = FALSE, col.names = FALSE)
}
}
}
}else{
- sampleSets<-unlist(strsplit(opt$samples, " "))
- sampleSets
- samplesControl <- {}
- samplesExperiment <- {}
- sampleNames <- {}
+ sampleSets <- unlist(strsplit(opt$samples, " "))
+ samplesControl <- {}
+ samplesExperiment <- {}
+ samplesControl <- unlist(strsplit(sampleSets[1], ","))
+ samplesExperiment <- unlist(strsplit(sampleSets[2], ","))
- samplesControl<-unlist(strsplit(sampleSets[1], ","))
- samplesExperiment<-unlist(strsplit(sampleSets[2], ","))
+ # the minus one is needed because we get column indizes including the first column
+ sampleColumns <- c()
+ for (i in samplesControl) sampleColumns[[length(sampleColumns) + 1]] <- as.integer(i) - 1
+ for (i in samplesExperiment) sampleColumns[[length(sampleColumns) + 1]] <- as.integer(i) - 1
+ htseqSubCountTable <- htseqCountTable[,sampleColumns]
- print(samplesControl)
- print(samplesExperiment)
+ ntabcols <- length(htseqSubCountTable)
+ htseqSubCountTable <- as.integer(unlist(htseqSubCountTable))
+ htseqSubCountTable <- matrix(unlist(htseqSubCountTable), ncol = ntabcols, byrow = FALSE)
+ colnames(htseqSubCountTable) <- names(htseqCountTable)[sampleColumns]
+
+ pdata = data.frame(condition=factor(conditions[sampleColumns]), row.names=colnames(htseqSubCountTable))
- # the minus one is needed because we get column indizes including the first column
- samplecolumns <- c()
- for (i in samplesControl) samplecolumns[[length(samplecolumns) + 1]] <- as.integer(i) - 1
- for (i in samplesExperiment) samplecolumns[[length(samplecolumns) + 1]] <- as.integer(i) - 1
-
-print(samplecolumns)
-
- htseqSubCountTable <- htseqCountTable[,samplecolumns]
-
- #TODO: pavan hack :)
- #write.table(htseqSubCountTable, file=paste(gsub(" ","_",opt$samples),".txt"))
- #htseqSubCountTable<-read.table(paste(gsub(" ","_",opt$samples),".txt"),row.names=1,header=T)
+ if(!is.null(opt$factors)){
+ factorSets <- unlist(strsplit(opt$factors, " "))
+ for (factorSet in factorSets) {
+ currentFactor <- unlist(strsplit(factorSet, ":"))
+ for (factor in currentFactor[2]) {
+ factoredCols <- unlist(strsplit(factor, ","))
+ factoredCols <- as.numeric(factoredCols)
+ factors <- c()
+ for (i in sampleColumns){
+ j=i+1
+ if(j %in% factoredCols)
+ factors[[length(factors) + 1]] <- "yes"
+ else
+ factors[[length(factors) + 1]] <- "no"
+ }
+ # only add factor if factor is applied to the selected samples
+ if((length(unique(factors))) >= 2){
+ pdata[[currentFactor[1]]]<-factor(factors)
+ }else{
+ write("Input-Error 01: You can only apply factors to selected samples.", stderr())
+ }
+ }
+ }
+ }
- pdata = data.frame(condition=factor(c(names( htseqSubCountTable )[ samplecolumns ])),row.names=colnames(htseqSubCountTable))
- dds = DESeqDataSetFromMatrix(countData = bla, colData = pdata, design = ~ condition)
-
- dds <- DESeq(dds)
- sizeFactors(dds)
- plotDispEsts(dds)
- plotMA(dds)
-
- res <- results(dds)
- rownames(res)<-tagnames
- resSorted <- res[order(res$padj),];
- write.csv(as.data.frame(resSorted), file = opt$outputfile)
+ form <- as.formula(paste("", paste(names(pdata), collapse=" + "), sep=" ~ "))
+ dds = DESeqDataSetFromMatrix(countData = htseqSubCountTable, colData = pdata, design = form)
+
+ dds <- DESeq(dds)
+ sizeFactors(dds)
+ if ( !is.null(opt$plots) ) {
+ plotDispEsts(dds)
+ plotMA(dds)
+ }
+
+ res <- results(dds)
+ resCols <- colnames(res)
+
+ cond <- c(rep(paste(unique(conditions[sampleColumns]),collapse="_"), nrow(res)))
+ res[, "condition"] <- cond
+ res[, "geneIds"] <- tagnames
+
+ resSorted <- res[order(res$padj),]
+ write.table(as.data.frame(resSorted[,c("condition", "geneIds", resCols)]), file = opt$outputfile, sep="\t", quote = FALSE, row.names = FALSE, col.names = FALSE)
}
dev.off()
+
diff -r 6d17a7d6fe9c -r bb5c80d15e0a deseq2.xml
--- a/deseq2.xml Mon Sep 02 10:09:37 2013 -0400
+++ b/deseq2.xml Wed Sep 04 11:58:20 2013 -0400
@@ -9,17 +9,42 @@
deseq2.R
-o "$deseq_out"
- -p "$plots"
+
+ #if $pdf:
+ -p "$plots"
+ #end if
+
-i "$input_matrix"
#if $filter_sel.filter_sel_opts == 'all_vs_all':
-s 'all_vs_all'
#else:
- -s ## build a string like '1:2 5:6'
+ -s ## build a string like '1,2 5,6'
"${filter_sel.control_cols} ${filter_sel.experiement_cols}"
+
+ #set $temp_factor_list = list()
+ #set $is_multi_factor_analysis = False
+ #for $factor in $filter_sel.factor:
+ #set $is_multi_factor_analysis = True
+ $temp_factor_list.append( '%s:%s' % ($factor.factor_name.replace(' ','_'), $factor.factor_index) )
+ #end for
+
+ #if $is_multi_factor_analysis:
+ -f "#echo ' '.join( $temp_factor_list )#"
+ #end if
#end if
+
+
+
+
@@ -40,16 +65,29 @@
+
+
+
+
+
+
+
+
+
+
-
-
-
+
+
+ pdf == True
+
@@ -58,47 +96,49 @@
**What it does**
-`DESeq` is a tool for differential expression testing of RNA-Seq data.
+Estimate variance-mean dependence in count data from high-throughput sequencing assays and test for differential expression based on a model using the negative binomial distribution
**Inputs**
-`DESeq` requires three input files to run:
+DESeq2_ requires one count matrix as input file. You can use the tool
-1. Annotation file in GFF3, containing the necessary information about the transcripts that are to be quantified.
-2. The BAM alignment files grouped into replicate groups, each containing several replicates. BAM files store the read alignments in a compressed format. They can be generated using the `SAM-to-BAM` tool in the NGS: SAM Tools section. (The script will also work with only two groups containing only a single replicate each. However, this analysis has less statistical power and is therefor not recommended.)
+
**Output**
-`DESeq` generates a text file containing the gene name and the p-value.
+DESeq2_ generates a tabular file containing the different columns and optional visualized results as PDF.
+
+====== ==========================================================
+Column Description
+------ ----------------------------------------------------------
+ 1 Sample ID (corresponds to the header in your count matrix)
+ 2 Gene Identifiers
+ 3 mean normalised counts, averaged over all samples from both conditions
+ 4 the logarithm (to basis 2) of the fold change
+ 5 standard error estimate for the log2 fold change estimate
+ 6 p value for the statistical significance of this change
+ 7 p value adjusted for multiple testing with the Benjamini-Hochberg procedure
+ which controls false discovery rate (FDR)
+====== ==========================================================
+
------
-**Licenses**
+**References**
+
+DESeq2_ Authors: Michael Love (MPIMG Berlin), Simon Anders, Wolfgang Huber (EMBL Heidelberg)
-If **DESeq** is used to obtain results for scientific publications it
-should be cited as [1]_.
+If _DESeq2_ is used to obtain results for scientific publications it
+should be cited as [1]_. A paper describing DESeq2_ is in preparation.
-**References**
+
.. [1] Anders, S and Huber, W (2010): `Differential expression analysis for sequence count data`_.
.. _Differential expression analysis for sequence count data: http://dx.doi.org/10.1186/gb-2010-11-10-r106
+.. _DESeq2: http://master.bioconductor.org/packages/release/bioc/html/DESeq2.html
-------
-
-For more information see http://www.sequenceontology.org/gff3.shtml
-**SAM/BAM format** The Sequence Alignment/Map (SAM) format is a
-tab-limited text format that stores large nucleotide sequence
-alignments. BAM is the binary version of a SAM file that allows for
-fast and intensive data processing. The format specification and the
-description of SAMtools can be found on
-http://samtools.sourceforge.net/.
-
-------
-
-DESeq-hts Wrapper Version 0.3 (Feb 2012)
-
-
+