Mercurial > repos > bgruening > deseq2
changeset 13:6d17a7d6fe9c draft
Uploaded
author | bgruening |
---|---|
date | Mon, 02 Sep 2013 10:09:37 -0400 |
parents | 01114c547afb |
children | bb5c80d15e0a |
files | deseq2.R deseq2.xml test-data/countmatrix.txt tool_dependencies.xml |
diffstat | 4 files changed, 234 insertions(+), 17 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/deseq2.R Mon Sep 02 10:09:37 2013 -0400 @@ -0,0 +1,121 @@ +library('getopt'); +options(stringAsfactors = FALSE, useFancyQuotes = FALSE) +args <- commandArgs(trailingOnly = TRUE) + +#get options, using the spec as defined by the enclosed list. +#we read the options from the default: commandArgs(TRUE). +spec = matrix(c( + 'verbose', 'v', 2, "integer", + 'help' , 'h', 0, "logical", + 'outputfile' , 'o', 1, "character", + 'plots' , 'p', 1, "character", + 'input' , 'i', 1, "character", + 'samples', 's', 1, "character" +), byrow=TRUE, ncol=4); +opt = getopt(spec); + + +# if help was asked for print a friendly message +# and exit with a non-zero error code +if ( !is.null(opt$help) ) { + cat(getopt(spec, usage=TRUE)); + q(status=1); +} + +trim <- function (x) gsub("^\\s+|\\s+$", "", x) +opt$samples<-trim(opt$samples) + + +htseqCountTable = read.table(opt$input, sep="\t", comment="", as.is=T) + +l<-unique(c(htseqCountTable[1,-1])) +#print(l) + +colnames(htseqCountTable) <- htseqCountTable[1,] +names(htseqCountTable)<-colnames(htseqCountTable) +tagnames <- htseqCountTable[-(1:2), 1] +htseqCountTable <- htseqCountTable[-(1:2),-1] +htseqCountTable[,1] <- gsub(",", ".", htseqCountTable[,1]) + + +library( "DESeq2" ) + +opt$samples + +print('----------------') + +pdf(opt$plots) + +if (opt$samples=="all_vs_all"){ + + for (i in seq(1, length(l), by=1)) { + k=i+1 + if(k<=length(l)){ + for (j in seq(k, length(l), by=1)) { + + colData<-names(htseqCountTable[, which(gsub("[.][0-9]+","",names(htseqCountTable)) %in% c(l[[i]], l[[j]]))]) + currentPairTable<-htseqCountTable[ , which(gsub("[.][0-9]+","",names(htseqCountTable)) %in% c(l[[i]], l[[j]]))] + # rownames(currentPairTable)<-tagnames + + write.table(currentPairTable, file=paste(l[[i]],"_", l[[j]],".txt")) + currentPairTable<-read.table(paste(l[[i]],"_", l[[j]],".txt"),row.names=1,header=T) + + currentPairTable <- as.data.frame(currentPairTable,stringsAsFactors=F) + #print(currentPairTable) + pdata = data.frame(condition=factor(c(rep(l[[i]], 2), rep(l[[j]], 2))),row.names=colnames(currentPairTable)) + dds = DESeqDataSetFromMatrix(countData = currentPairTable, colData = pdata, design = ~ condition) + + # colData(dds)$condition + dds <- DESeq(dds) + sizeFactors(dds) + plotDispEsts(dds) + plotMA(dds) + + res <- results(dds) + rownames(res)<-tagnames + resSorted <- res[order(res$padj),]; + write.csv( as.data.frame(resSorted), file = opt$outputfile ) + } + } + } +}else{ + + sampleSets<-unlist(strsplit(opt$samples, " ")) + sampleSets + samplesControl <- {} + samplesExperiment <- {} + sampleNames <- {} + + samplesControl<-unlist(strsplit(sampleSets[1], ",")) + samplesExperiment<-unlist(strsplit(sampleSets[2], ",")) + + print(samplesControl) + print(samplesExperiment) + + # the minus one is needed because we get column indizes including the first column + samplecolumns <- c() + for (i in samplesControl) samplecolumns[[length(samplecolumns) + 1]] <- as.integer(i) - 1 + for (i in samplesExperiment) samplecolumns[[length(samplecolumns) + 1]] <- as.integer(i) - 1 + +print(samplecolumns) + + htseqSubCountTable <- htseqCountTable[,samplecolumns] + + #TODO: pavan hack :) + #write.table(htseqSubCountTable, file=paste(gsub(" ","_",opt$samples),".txt")) + #htseqSubCountTable<-read.table(paste(gsub(" ","_",opt$samples),".txt"),row.names=1,header=T) + + pdata = data.frame(condition=factor(c(names( htseqSubCountTable )[ samplecolumns ])),row.names=colnames(htseqSubCountTable)) + dds = DESeqDataSetFromMatrix(countData = bla, colData = pdata, design = ~ condition) + + dds <- DESeq(dds) + sizeFactors(dds) + plotDispEsts(dds) + plotMA(dds) + + res <- results(dds) + rownames(res)<-tagnames + resSorted <- res[order(res$padj),]; + write.csv(as.data.frame(resSorted), file = opt$outputfile) +} +dev.off()
--- a/deseq2.xml Fri Aug 02 13:16:18 2013 -0400 +++ b/deseq2.xml Mon Sep 02 10:09:37 2013 -0400 @@ -1,27 +1,104 @@ -<tool id="deseq2" name="DESeq2" version="0.1"> - - <description>deseq</description> +<tool id="deseq2" name="DESeq2" version="2.0"> + <description>Determines differentially expressed transcripts from read alignments</description> <requirements> + <requirement type="binary">Rscript</requirement> + <requirement type="R-module">DESeq2</requirement> <requirement type="package" version="3.0.1">R_3_0_1</requirement> - <requirement type="package" version="2.3.6">rpy2_2_3_6</requirement> - <requirement type="package" version="1.0.17">deseq2</requirement> + <requirement type="set_environment">DESEQ2_SCRIPT_PATH</requirement> </requirements> - <command interpreter="python"> - deseq2.py > $outfile + <command interpreter="Rscript"> + deseq2.R + -o "$deseq_out" + -p "$plots" + -i "$input_matrix" + + #if $filter_sel.filter_sel_opts == 'all_vs_all': + -s 'all_vs_all' + #else: + -s ## build a string like '1:2 5:6' + "${filter_sel.control_cols} ${filter_sel.experiement_cols}" + #end if + </command> <inputs> + <param format="tabular" name="input_matrix" type="data" label="Countmatrix" help="You can create a count matrix with the tool ..."/> + + <conditional name="filter_sel"> + <param name="filter_sel_opts" type="select" label="Select properties to filter"> + <option value="all_vs_all">All against all</option> + <option value="specific">specific</option> + </param> + <when value="specific"> + + <param name="control_cols" label="Select columns containing treatment." type="data_column" data_ref="input_matrix" + numerical="True" multiple="true" use_header_names="true" size="120" display="checkboxes"> + <validator type="no_options" message="Please select at least one column."/> + </param> + + <param name="experiement_cols" label="Select columns containing treatment." type="data_column" data_ref="input_matrix" + numerical="True" multiple="true" use_header_names="true" size="120" display="checkboxes"> + <validator type="no_options" message="Please select at least one column."/> + </param> + + </when> + <when value="all_v_all" /> + </conditional> + </inputs> - <outputs> - <data format="txt" name="outfile" label="${tool.name} on ${on_string}: out"/> - </outputs> + <outputs> + <data format="txt" name="deseq_out" label="DESeq2 result file"/> + <data format="pdf" name="plots" label="Plot collection"/> + <data format="txt" name="log" label="DESeq2 log file"/> + </outputs> - <tests> - </tests> + <help> + +.. class:: infomark - <help> +**What it does** + +`DESeq` is a tool for differential expression testing of RNA-Seq data. +**Inputs** - </help> +`DESeq` requires three input files to run: + +1. Annotation file in GFF3, containing the necessary information about the transcripts that are to be quantified. +2. The BAM alignment files grouped into replicate groups, each containing several replicates. BAM files store the read alignments in a compressed format. They can be generated using the `SAM-to-BAM` tool in the NGS: SAM Tools section. (The script will also work with only two groups containing only a single replicate each. However, this analysis has less statistical power and is therefor not recommended.) + +**Output** + +`DESeq` generates a text file containing the gene name and the p-value. + +------ + +**Licenses** + +If **DESeq** is used to obtain results for scientific publications it +should be cited as [1]_. + +**References** + +.. [1] Anders, S and Huber, W (2010): `Differential expression analysis for sequence count data`_. + +.. _Differential expression analysis for sequence count data: http://dx.doi.org/10.1186/gb-2010-11-10-r106 + +------ + +For more information see http://www.sequenceontology.org/gff3.shtml + +**SAM/BAM format** The Sequence Alignment/Map (SAM) format is a +tab-limited text format that stores large nucleotide sequence +alignments. BAM is the binary version of a SAM file that allows for +fast and intensive data processing. The format specification and the +description of SAMtools can be found on +http://samtools.sourceforge.net/. + +------ + +DESeq-hts Wrapper Version 0.3 (Feb 2012) + +</help> </tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/countmatrix.txt Mon Sep 02 10:09:37 2013 -0400 @@ -0,0 +1,19 @@ +# G1:Mut1 G1:Mut1 G2:Mut2 G2:Mut2 G3:WT G3:WT +#Feature Spl1 Spl2 Spl3 Spl4 Spl5 Spl6 +NM_001001130 97 43 61 34 73 26 +NM_001001144 25 8 9 3 5 5 +NM_001001152 72 45 29 20 31 13 +NM_001001160 0 1 1 1 0 0 +NM_001001177 0 1 0 4 3 3 +NM_001001178 0 2 1 0 4 0 +NM_001001179 0 0 0 0 0 2 +NM_001001180 0 0 0 0 0 2 +NM_001001181 415 319 462 185 391 155 +NM_001001182 1293 945 987 297 938 496 +NM_001001183 5 4 11 7 11 2 +NM_001001184 135 198 178 110 205 64 +NM_001001185 186 1 0 1 1 0 +NM_001001186 75 90 91 34 63 54 +NM_001001187 267 236 170 165 202 51 +NM_001001295 5 2 6 1 7 0 +NM_001001309 1 0 0 1 2 1
--- a/tool_dependencies.xml Fri Aug 02 13:16:18 2013 -0400 +++ b/tool_dependencies.xml Mon Sep 02 10:09:37 2013 -0400 @@ -3,10 +3,10 @@ <package name="R_3_0_1" version="3.0.1"> <repository changeset_revision="bae5c9880b71" name="package_r_3_0_1" owner="bgruening" toolshed="http://testtoolshed.g2.bx.psu.edu" /> </package> - <package name="rpy2_2_3_6" version="2.3.6"> - <repository changeset_revision="738e7d34477d" name="package_rpy2_2_3_6" owner="bgruening" toolshed="http://testtoolshed.g2.bx.psu.edu" /> - </package> <package name="deseq2" version="1.0.17"> <repository changeset_revision="eab36a9a70b1" name="package_deseq2_1_0_17" owner="bgruening" toolshed="http://testtoolshed.g2.bx.psu.edu" /> </package> + <set_environment version="1.0"> + <environment_variable action="set_to" name="DESEQ2_SCRIPT_PATH">$REPOSITORY_INSTALL_DIR/scripts</environment_variable> + </set_environment> </tool_dependency>