# HG changeset patch # User bgruening # Date 1378130977 14400 # Node ID 6d17a7d6fe9c388997323fc31c536d97bd3c2645 # Parent 01114c547afbaf9f7395e55e3b9a16400e33eba9 Uploaded diff -r 01114c547afb -r 6d17a7d6fe9c deseq2.R --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/deseq2.R Mon Sep 02 10:09:37 2013 -0400 @@ -0,0 +1,121 @@ +library('getopt'); +options(stringAsfactors = FALSE, useFancyQuotes = FALSE) +args <- commandArgs(trailingOnly = TRUE) + +#get options, using the spec as defined by the enclosed list. +#we read the options from the default: commandArgs(TRUE). +spec = matrix(c( + 'verbose', 'v', 2, "integer", + 'help' , 'h', 0, "logical", + 'outputfile' , 'o', 1, "character", + 'plots' , 'p', 1, "character", + 'input' , 'i', 1, "character", + 'samples', 's', 1, "character" +), byrow=TRUE, ncol=4); +opt = getopt(spec); + + +# if help was asked for print a friendly message +# and exit with a non-zero error code +if ( !is.null(opt$help) ) { + cat(getopt(spec, usage=TRUE)); + q(status=1); +} + +trim <- function (x) gsub("^\\s+|\\s+$", "", x) +opt$samples<-trim(opt$samples) + + +htseqCountTable = read.table(opt$input, sep="\t", comment="", as.is=T) + +l<-unique(c(htseqCountTable[1,-1])) +#print(l) + +colnames(htseqCountTable) <- htseqCountTable[1,] +names(htseqCountTable)<-colnames(htseqCountTable) +tagnames <- htseqCountTable[-(1:2), 1] +htseqCountTable <- htseqCountTable[-(1:2),-1] +htseqCountTable[,1] <- gsub(",", ".", htseqCountTable[,1]) + + +library( "DESeq2" ) + +opt$samples + +print('----------------') + +pdf(opt$plots) + +if (opt$samples=="all_vs_all"){ + + for (i in seq(1, length(l), by=1)) { + k=i+1 + if(k<=length(l)){ + for (j in seq(k, length(l), by=1)) { + + colData<-names(htseqCountTable[, which(gsub("[.][0-9]+","",names(htseqCountTable)) %in% c(l[[i]], l[[j]]))]) + currentPairTable<-htseqCountTable[ , which(gsub("[.][0-9]+","",names(htseqCountTable)) %in% c(l[[i]], l[[j]]))] + # rownames(currentPairTable)<-tagnames + + write.table(currentPairTable, file=paste(l[[i]],"_", l[[j]],".txt")) + currentPairTable<-read.table(paste(l[[i]],"_", l[[j]],".txt"),row.names=1,header=T) + + currentPairTable <- as.data.frame(currentPairTable,stringsAsFactors=F) + #print(currentPairTable) + pdata = data.frame(condition=factor(c(rep(l[[i]], 2), rep(l[[j]], 2))),row.names=colnames(currentPairTable)) + dds = DESeqDataSetFromMatrix(countData = currentPairTable, colData = pdata, design = ~ condition) + + # colData(dds)$condition + dds <- DESeq(dds) + sizeFactors(dds) + plotDispEsts(dds) + plotMA(dds) + + res <- results(dds) + rownames(res)<-tagnames + resSorted <- res[order(res$padj),]; + write.csv( as.data.frame(resSorted), file = opt$outputfile ) + } + } + } +}else{ + + sampleSets<-unlist(strsplit(opt$samples, " ")) + sampleSets + samplesControl <- {} + samplesExperiment <- {} + sampleNames <- {} + + samplesControl<-unlist(strsplit(sampleSets[1], ",")) + samplesExperiment<-unlist(strsplit(sampleSets[2], ",")) + + print(samplesControl) + print(samplesExperiment) + + # the minus one is needed because we get column indizes including the first column + samplecolumns <- c() + for (i in samplesControl) samplecolumns[[length(samplecolumns) + 1]] <- as.integer(i) - 1 + for (i in samplesExperiment) samplecolumns[[length(samplecolumns) + 1]] <- as.integer(i) - 1 + +print(samplecolumns) + + htseqSubCountTable <- htseqCountTable[,samplecolumns] + + #TODO: pavan hack :) + #write.table(htseqSubCountTable, file=paste(gsub(" ","_",opt$samples),".txt")) + #htseqSubCountTable<-read.table(paste(gsub(" ","_",opt$samples),".txt"),row.names=1,header=T) + + pdata = data.frame(condition=factor(c(names( htseqSubCountTable )[ samplecolumns ])),row.names=colnames(htseqSubCountTable)) + dds = DESeqDataSetFromMatrix(countData = bla, colData = pdata, design = ~ condition) + + dds <- DESeq(dds) + sizeFactors(dds) + plotDispEsts(dds) + plotMA(dds) + + res <- results(dds) + rownames(res)<-tagnames + resSorted <- res[order(res$padj),]; + write.csv(as.data.frame(resSorted), file = opt$outputfile) +} +dev.off() diff -r 01114c547afb -r 6d17a7d6fe9c deseq2.xml --- a/deseq2.xml Fri Aug 02 13:16:18 2013 -0400 +++ b/deseq2.xml Mon Sep 02 10:09:37 2013 -0400 @@ -1,27 +1,104 @@ - - - deseq + + Determines differentially expressed transcripts from read alignments + Rscript + DESeq2 R_3_0_1 - rpy2_2_3_6 - deseq2 + DESEQ2_SCRIPT_PATH - - deseq2.py > $outfile + + deseq2.R + -o "$deseq_out" + -p "$plots" + -i "$input_matrix" + + #if $filter_sel.filter_sel_opts == 'all_vs_all': + -s 'all_vs_all' + #else: + -s ## build a string like '1:2 5:6' + "${filter_sel.control_cols} ${filter_sel.experiement_cols}" + #end if + + + + + + + + + + + + + + + + + + + + + + - - - + + + + + - - + + +.. class:: infomark - +**What it does** + +`DESeq` is a tool for differential expression testing of RNA-Seq data. +**Inputs** - +`DESeq` requires three input files to run: + +1. Annotation file in GFF3, containing the necessary information about the transcripts that are to be quantified. +2. The BAM alignment files grouped into replicate groups, each containing several replicates. BAM files store the read alignments in a compressed format. They can be generated using the `SAM-to-BAM` tool in the NGS: SAM Tools section. (The script will also work with only two groups containing only a single replicate each. However, this analysis has less statistical power and is therefor not recommended.) + +**Output** + +`DESeq` generates a text file containing the gene name and the p-value. + +------ + +**Licenses** + +If **DESeq** is used to obtain results for scientific publications it +should be cited as [1]_. + +**References** + +.. [1] Anders, S and Huber, W (2010): `Differential expression analysis for sequence count data`_. + +.. _Differential expression analysis for sequence count data: http://dx.doi.org/10.1186/gb-2010-11-10-r106 + +------ + +For more information see http://www.sequenceontology.org/gff3.shtml + +**SAM/BAM format** The Sequence Alignment/Map (SAM) format is a +tab-limited text format that stores large nucleotide sequence +alignments. BAM is the binary version of a SAM file that allows for +fast and intensive data processing. The format specification and the +description of SAMtools can be found on +http://samtools.sourceforge.net/. + +------ + +DESeq-hts Wrapper Version 0.3 (Feb 2012) + + diff -r 01114c547afb -r 6d17a7d6fe9c test-data/countmatrix.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/countmatrix.txt Mon Sep 02 10:09:37 2013 -0400 @@ -0,0 +1,19 @@ +# G1:Mut1 G1:Mut1 G2:Mut2 G2:Mut2 G3:WT G3:WT +#Feature Spl1 Spl2 Spl3 Spl4 Spl5 Spl6 +NM_001001130 97 43 61 34 73 26 +NM_001001144 25 8 9 3 5 5 +NM_001001152 72 45 29 20 31 13 +NM_001001160 0 1 1 1 0 0 +NM_001001177 0 1 0 4 3 3 +NM_001001178 0 2 1 0 4 0 +NM_001001179 0 0 0 0 0 2 +NM_001001180 0 0 0 0 0 2 +NM_001001181 415 319 462 185 391 155 +NM_001001182 1293 945 987 297 938 496 +NM_001001183 5 4 11 7 11 2 +NM_001001184 135 198 178 110 205 64 +NM_001001185 186 1 0 1 1 0 +NM_001001186 75 90 91 34 63 54 +NM_001001187 267 236 170 165 202 51 +NM_001001295 5 2 6 1 7 0 +NM_001001309 1 0 0 1 2 1 diff -r 01114c547afb -r 6d17a7d6fe9c tool_dependencies.xml --- a/tool_dependencies.xml Fri Aug 02 13:16:18 2013 -0400 +++ b/tool_dependencies.xml Mon Sep 02 10:09:37 2013 -0400 @@ -3,10 +3,10 @@ - - - + + $REPOSITORY_INSTALL_DIR/scripts +