# HG changeset patch # User bgruening # Date 1378310300 14400 # Node ID bb5c80d15e0a68b80dd72955b5fb94d4f40f4210 # Parent 6d17a7d6fe9c388997323fc31c536d97bd3c2645 Uploaded diff -r 6d17a7d6fe9c -r bb5c80d15e0a deseq2.R --- a/deseq2.R Mon Sep 02 10:09:37 2013 -0400 +++ b/deseq2.R Wed Sep 04 11:58:20 2013 -0400 @@ -1,3 +1,8 @@ +## Setup R error handling to go to stderr +options( show.error.messages=F, error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) +# we need that to not crash galaxy with an UTF8 error on German LC settings. +Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") + library('getopt'); options(stringAsfactors = FALSE, useFancyQuotes = FALSE) args <- commandArgs(trailingOnly = TRUE) @@ -8,9 +13,10 @@ 'verbose', 'v', 2, "integer", 'help' , 'h', 0, "logical", 'outputfile' , 'o', 1, "character", - 'plots' , 'p', 1, "character", + 'plots' , 'p', 2, "character", 'input' , 'i', 1, "character", - 'samples', 's', 1, "character" + 'samples', 's', 1, "character", + 'factors', 'f', 2, "character" ), byrow=TRUE, ncol=4); opt = getopt(spec); @@ -23,99 +29,131 @@ } trim <- function (x) gsub("^\\s+|\\s+$", "", x) -opt$samples<-trim(opt$samples) - +opt$samples <- trim(opt$samples) +opt$factors <- trim(opt$factors) htseqCountTable = read.table(opt$input, sep="\t", comment="", as.is=T) -l<-unique(c(htseqCountTable[1,-1])) -#print(l) - -colnames(htseqCountTable) <- htseqCountTable[1,] +colnames(htseqCountTable) <- htseqCountTable[2,] names(htseqCountTable)<-colnames(htseqCountTable) +conditions <- htseqCountTable[1,] +conditions <- unlist(conditions[,-1]) tagnames <- htseqCountTable[-(1:2), 1] htseqCountTable <- htseqCountTable[-(1:2),-1] htseqCountTable[,1] <- gsub(",", ".", htseqCountTable[,1]) +l <- unique(c(conditions)) library( "DESeq2" ) -opt$samples - -print('----------------') - -pdf(opt$plots) - +if ( !is.null(opt$plots) ) { + pdf(opt$plots) +} if (opt$samples=="all_vs_all"){ - + # all versus all for (i in seq(1, length(l), by=1)) { k=i+1 if(k<=length(l)){ for (j in seq(k, length(l), by=1)) { - - colData<-names(htseqCountTable[, which(gsub("[.][0-9]+","",names(htseqCountTable)) %in% c(l[[i]], l[[j]]))]) - currentPairTable<-htseqCountTable[ , which(gsub("[.][0-9]+","",names(htseqCountTable)) %in% c(l[[i]], l[[j]]))] - # rownames(currentPairTable)<-tagnames - - write.table(currentPairTable, file=paste(l[[i]],"_", l[[j]],".txt")) - currentPairTable<-read.table(paste(l[[i]],"_", l[[j]],".txt"),row.names=1,header=T) - - currentPairTable <- as.data.frame(currentPairTable,stringsAsFactors=F) - #print(currentPairTable) - pdata = data.frame(condition=factor(c(rep(l[[i]], 2), rep(l[[j]], 2))),row.names=colnames(currentPairTable)) + currentColmuns <- which(conditions %in% c(l[[i]], l[[j]])) + sampleNames <- names(htseqCountTable[currentColmuns]) + + currentPairTable <- htseqCountTable[currentColmuns] + ncondition1cols <- length(which(conditions %in% c(l[[i]]))) + ncondition2cols <- length(which(conditions %in% c(l[[i]]))) + + ntabcols <- length(currentPairTable) + currentPairTable <- as.integer(unlist(currentPairTable)) + currentPairTable <- matrix(unlist(currentPairTable), ncol = ntabcols, byrow = FALSE) + colnames(currentPairTable) <- sampleNames + + pdata = data.frame(condition=factor(conditions[currentColmuns]),row.names=colnames(currentPairTable)) + dds = DESeqDataSetFromMatrix(countData = currentPairTable, colData = pdata, design = ~ condition) - - # colData(dds)$condition + dds <- DESeq(dds) sizeFactors(dds) - plotDispEsts(dds) - plotMA(dds) - + if ( !is.null(opt$plots) ) { + plotDispEsts(dds) + plotMA(dds) + } res <- results(dds) - rownames(res)<-tagnames - resSorted <- res[order(res$padj),]; - write.csv( as.data.frame(resSorted), file = opt$outputfile ) + resCols <- colnames(res) + + cond <- c(rep(paste(unique(conditions[currentColmuns]),collapse="_"), nrow(res))) + res[, "condition"] <- cond + res[, "geneIds"] <- tagnames + + resSorted <- res[order(res$padj),] + write.table(as.data.frame(resSorted[,c("condition", "geneIds", resCols)]), file = opt$outputfile, sep="\t", quote = FALSE, append=TRUE, row.names = FALSE, col.names = FALSE) } } } }else{ - sampleSets<-unlist(strsplit(opt$samples, " ")) - sampleSets - samplesControl <- {} - samplesExperiment <- {} - sampleNames <- {} + sampleSets <- unlist(strsplit(opt$samples, " ")) + samplesControl <- {} + samplesExperiment <- {} + samplesControl <- unlist(strsplit(sampleSets[1], ",")) + samplesExperiment <- unlist(strsplit(sampleSets[2], ",")) - samplesControl<-unlist(strsplit(sampleSets[1], ",")) - samplesExperiment<-unlist(strsplit(sampleSets[2], ",")) + # the minus one is needed because we get column indizes including the first column + sampleColumns <- c() + for (i in samplesControl) sampleColumns[[length(sampleColumns) + 1]] <- as.integer(i) - 1 + for (i in samplesExperiment) sampleColumns[[length(sampleColumns) + 1]] <- as.integer(i) - 1 + htseqSubCountTable <- htseqCountTable[,sampleColumns] - print(samplesControl) - print(samplesExperiment) + ntabcols <- length(htseqSubCountTable) + htseqSubCountTable <- as.integer(unlist(htseqSubCountTable)) + htseqSubCountTable <- matrix(unlist(htseqSubCountTable), ncol = ntabcols, byrow = FALSE) + colnames(htseqSubCountTable) <- names(htseqCountTable)[sampleColumns] + + pdata = data.frame(condition=factor(conditions[sampleColumns]), row.names=colnames(htseqSubCountTable)) - # the minus one is needed because we get column indizes including the first column - samplecolumns <- c() - for (i in samplesControl) samplecolumns[[length(samplecolumns) + 1]] <- as.integer(i) - 1 - for (i in samplesExperiment) samplecolumns[[length(samplecolumns) + 1]] <- as.integer(i) - 1 - -print(samplecolumns) - - htseqSubCountTable <- htseqCountTable[,samplecolumns] - - #TODO: pavan hack :) - #write.table(htseqSubCountTable, file=paste(gsub(" ","_",opt$samples),".txt")) - #htseqSubCountTable<-read.table(paste(gsub(" ","_",opt$samples),".txt"),row.names=1,header=T) + if(!is.null(opt$factors)){ + factorSets <- unlist(strsplit(opt$factors, " ")) + for (factorSet in factorSets) { + currentFactor <- unlist(strsplit(factorSet, ":")) + for (factor in currentFactor[2]) { + factoredCols <- unlist(strsplit(factor, ",")) + factoredCols <- as.numeric(factoredCols) + factors <- c() + for (i in sampleColumns){ + j=i+1 + if(j %in% factoredCols) + factors[[length(factors) + 1]] <- "yes" + else + factors[[length(factors) + 1]] <- "no" + } + # only add factor if factor is applied to the selected samples + if((length(unique(factors))) >= 2){ + pdata[[currentFactor[1]]]<-factor(factors) + }else{ + write("Input-Error 01: You can only apply factors to selected samples.", stderr()) + } + } + } + } - pdata = data.frame(condition=factor(c(names( htseqSubCountTable )[ samplecolumns ])),row.names=colnames(htseqSubCountTable)) - dds = DESeqDataSetFromMatrix(countData = bla, colData = pdata, design = ~ condition) - - dds <- DESeq(dds) - sizeFactors(dds) - plotDispEsts(dds) - plotMA(dds) - - res <- results(dds) - rownames(res)<-tagnames - resSorted <- res[order(res$padj),]; - write.csv(as.data.frame(resSorted), file = opt$outputfile) + form <- as.formula(paste("", paste(names(pdata), collapse=" + "), sep=" ~ ")) + dds = DESeqDataSetFromMatrix(countData = htseqSubCountTable, colData = pdata, design = form) + + dds <- DESeq(dds) + sizeFactors(dds) + if ( !is.null(opt$plots) ) { + plotDispEsts(dds) + plotMA(dds) + } + + res <- results(dds) + resCols <- colnames(res) + + cond <- c(rep(paste(unique(conditions[sampleColumns]),collapse="_"), nrow(res))) + res[, "condition"] <- cond + res[, "geneIds"] <- tagnames + + resSorted <- res[order(res$padj),] + write.table(as.data.frame(resSorted[,c("condition", "geneIds", resCols)]), file = opt$outputfile, sep="\t", quote = FALSE, row.names = FALSE, col.names = FALSE) } dev.off() + diff -r 6d17a7d6fe9c -r bb5c80d15e0a deseq2.xml --- a/deseq2.xml Mon Sep 02 10:09:37 2013 -0400 +++ b/deseq2.xml Wed Sep 04 11:58:20 2013 -0400 @@ -9,17 +9,42 @@ deseq2.R -o "$deseq_out" - -p "$plots" + + #if $pdf: + -p "$plots" + #end if + -i "$input_matrix" #if $filter_sel.filter_sel_opts == 'all_vs_all': -s 'all_vs_all' #else: - -s ## build a string like '1:2 5:6' + -s ## build a string like '1,2 5,6' "${filter_sel.control_cols} ${filter_sel.experiement_cols}" + + #set $temp_factor_list = list() + #set $is_multi_factor_analysis = False + #for $factor in $filter_sel.factor: + #set $is_multi_factor_analysis = True + $temp_factor_list.append( '%s:%s' % ($factor.factor_name.replace(' ','_'), $factor.factor_index) ) + #end for + + #if $is_multi_factor_analysis: + -f "#echo ' '.join( $temp_factor_list )#" + #end if #end if + + + + @@ -40,16 +65,29 @@ + + + + + + + + + + - - - + + + pdf == True + @@ -58,47 +96,49 @@ **What it does** -`DESeq` is a tool for differential expression testing of RNA-Seq data. +Estimate variance-mean dependence in count data from high-throughput sequencing assays and test for differential expression based on a model using the negative binomial distribution **Inputs** -`DESeq` requires three input files to run: +DESeq2_ requires one count matrix as input file. You can use the tool -1. Annotation file in GFF3, containing the necessary information about the transcripts that are to be quantified. -2. The BAM alignment files grouped into replicate groups, each containing several replicates. BAM files store the read alignments in a compressed format. They can be generated using the `SAM-to-BAM` tool in the NGS: SAM Tools section. (The script will also work with only two groups containing only a single replicate each. However, this analysis has less statistical power and is therefor not recommended.) + **Output** -`DESeq` generates a text file containing the gene name and the p-value. +DESeq2_ generates a tabular file containing the different columns and optional visualized results as PDF. + +====== ========================================================== +Column Description +------ ---------------------------------------------------------- + 1 Sample ID (corresponds to the header in your count matrix) + 2 Gene Identifiers + 3 mean normalised counts, averaged over all samples from both conditions + 4 the logarithm (to basis 2) of the fold change + 5 standard error estimate for the log2 fold change estimate + 6 p value for the statistical significance of this change + 7 p value adjusted for multiple testing with the Benjamini-Hochberg procedure + which controls false discovery rate (FDR) +====== ========================================================== + ------ -**Licenses** +**References** + +DESeq2_ Authors: Michael Love (MPIMG Berlin), Simon Anders, Wolfgang Huber (EMBL Heidelberg) -If **DESeq** is used to obtain results for scientific publications it -should be cited as [1]_. +If _DESeq2_ is used to obtain results for scientific publications it +should be cited as [1]_. A paper describing DESeq2_ is in preparation. -**References** + .. [1] Anders, S and Huber, W (2010): `Differential expression analysis for sequence count data`_. .. _Differential expression analysis for sequence count data: http://dx.doi.org/10.1186/gb-2010-11-10-r106 +.. _DESeq2: http://master.bioconductor.org/packages/release/bioc/html/DESeq2.html ------- - -For more information see http://www.sequenceontology.org/gff3.shtml -**SAM/BAM format** The Sequence Alignment/Map (SAM) format is a -tab-limited text format that stores large nucleotide sequence -alignments. BAM is the binary version of a SAM file that allows for -fast and intensive data processing. The format specification and the -description of SAMtools can be found on -http://samtools.sourceforge.net/. - ------- - -DESeq-hts Wrapper Version 0.3 (Feb 2012) - - +