changeset 13:6d17a7d6fe9c draft

Uploaded
author bgruening
date Mon, 02 Sep 2013 10:09:37 -0400
parents 01114c547afb
children bb5c80d15e0a
files deseq2.R deseq2.xml test-data/countmatrix.txt tool_dependencies.xml
diffstat 4 files changed, 234 insertions(+), 17 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/deseq2.R	Mon Sep 02 10:09:37 2013 -0400
@@ -0,0 +1,121 @@
+library('getopt');
+options(stringAsfactors = FALSE, useFancyQuotes = FALSE)
+args <- commandArgs(trailingOnly = TRUE)
+
+#get options, using the spec as defined by the enclosed list.
+#we read the options from the default: commandArgs(TRUE).
+spec = matrix(c(
+  'verbose', 'v', 2, "integer",
+  'help' , 'h', 0, "logical",
+  'outputfile' , 'o', 1, "character",
+  'plots' , 'p', 1, "character",
+  'input' , 'i', 1, "character",
+  'samples', 's', 1, "character"
+), byrow=TRUE, ncol=4);
+opt = getopt(spec);
+
+
+# if help was asked for print a friendly message
+# and exit with a non-zero error code
+if ( !is.null(opt$help) ) {
+  cat(getopt(spec, usage=TRUE));
+  q(status=1);
+}
+
+trim <- function (x) gsub("^\\s+|\\s+$", "", x)
+opt$samples<-trim(opt$samples)
+
+
+htseqCountTable = read.table(opt$input, sep="\t", comment="", as.is=T)
+
+l<-unique(c(htseqCountTable[1,-1]))
+#print(l)
+
+colnames(htseqCountTable) <- htseqCountTable[1,]
+names(htseqCountTable)<-colnames(htseqCountTable)
+tagnames <- htseqCountTable[-(1:2), 1]
+htseqCountTable <- htseqCountTable[-(1:2),-1]
+htseqCountTable[,1] <- gsub(",", ".", htseqCountTable[,1])
+
+
+library( "DESeq2" )
+
+opt$samples
+
+print('----------------')
+
+pdf(opt$plots)
+
+if (opt$samples=="all_vs_all"){
+
+  for (i in seq(1, length(l), by=1)) {
+    k=i+1
+    if(k<=length(l)){
+      for (j in seq(k, length(l), by=1)) {
+        
+        colData<-names(htseqCountTable[, which(gsub("[.][0-9]+","",names(htseqCountTable)) %in% c(l[[i]], l[[j]]))])
+        currentPairTable<-htseqCountTable[ , which(gsub("[.][0-9]+","",names(htseqCountTable)) %in% c(l[[i]], l[[j]]))]
+        #      rownames(currentPairTable)<-tagnames
+        
+        write.table(currentPairTable, file=paste(l[[i]],"_", l[[j]],".txt"))
+        currentPairTable<-read.table(paste(l[[i]],"_", l[[j]],".txt"),row.names=1,header=T)
+        
+        currentPairTable <- as.data.frame(currentPairTable,stringsAsFactors=F)
+        #print(currentPairTable)
+        pdata = data.frame(condition=factor(c(rep(l[[i]], 2), rep(l[[j]], 2))),row.names=colnames(currentPairTable))
+        dds = DESeqDataSetFromMatrix(countData = currentPairTable,  colData = pdata, design = ~ condition)
+        
+        #      colData(dds)$condition
+        dds <- DESeq(dds)
+        sizeFactors(dds)
+        plotDispEsts(dds)
+        plotMA(dds)
+        
+        res <- results(dds)
+        rownames(res)<-tagnames
+        resSorted <- res[order(res$padj),];
+        write.csv( as.data.frame(resSorted), file = opt$outputfile )
+      }
+    }
+  }
+}else{
+
+  sampleSets<-unlist(strsplit(opt$samples, " "))
+  sampleSets
+  samplesControl <- {}
+  samplesExperiment <- {}
+  sampleNames <- {}
+
+  samplesControl<-unlist(strsplit(sampleSets[1], ","))
+  samplesExperiment<-unlist(strsplit(sampleSets[2], ","))
+
+  print(samplesControl)
+  print(samplesExperiment)
+
+  # the minus one is needed because we get column indizes including the first column
+  samplecolumns <- c()
+  for (i in samplesControl) samplecolumns[[length(samplecolumns) + 1]] <- as.integer(i) - 1
+  for (i in samplesExperiment) samplecolumns[[length(samplecolumns) + 1]] <- as.integer(i) - 1
+  
+print(samplecolumns)
+  
+  htseqSubCountTable <- htseqCountTable[,samplecolumns]
+
+  #TODO: pavan hack :)
+  #write.table(htseqSubCountTable, file=paste(gsub(" ","_",opt$samples),".txt"))
+  #htseqSubCountTable<-read.table(paste(gsub(" ","_",opt$samples),".txt"),row.names=1,header=T)
+
+  pdata = data.frame(condition=factor(c(names( htseqSubCountTable )[ samplecolumns ])),row.names=colnames(htseqSubCountTable))
+  dds = DESeqDataSetFromMatrix(countData = bla,  colData = pdata, design = ~ condition)
+  
+  dds <- DESeq(dds)
+  sizeFactors(dds)
+  plotDispEsts(dds)
+  plotMA(dds)
+  
+  res <- results(dds)
+  rownames(res)<-tagnames
+  resSorted <- res[order(res$padj),];
+  write.csv(as.data.frame(resSorted), file = opt$outputfile)  
+}
+dev.off()
--- a/deseq2.xml	Fri Aug 02 13:16:18 2013 -0400
+++ b/deseq2.xml	Mon Sep 02 10:09:37 2013 -0400
@@ -1,27 +1,104 @@
-<tool id="deseq2" name="DESeq2" version="0.1">
-
-    <description>deseq</description>
+<tool id="deseq2" name="DESeq2" version="2.0">
+  <description>Determines differentially expressed transcripts from read alignments</description>
     <requirements>
+        <requirement type="binary">Rscript</requirement>
+        <requirement type="R-module">DESeq2</requirement>
         <requirement type="package" version="3.0.1">R_3_0_1</requirement>
-        <requirement type="package" version="2.3.6">rpy2_2_3_6</requirement>
-        <requirement type="package" version="1.0.17">deseq2</requirement>
+        <requirement type="set_environment">DESEQ2_SCRIPT_PATH</requirement>
     </requirements>
-    <command interpreter="python">
-    deseq2.py > $outfile
+    <command interpreter="Rscript">
+        deseq2.R
+            -o "$deseq_out"
+            -p "$plots" 
+            -i "$input_matrix"
+
+        #if $filter_sel.filter_sel_opts == 'all_vs_all':
+            -s 'all_vs_all'
+        #else:
+            -s ## build a string like '1:2 5:6'
+            "${filter_sel.control_cols} ${filter_sel.experiement_cols}"
+        #end if
+
     </command>
     <inputs>
+        <param format="tabular" name="input_matrix" type="data" label="Countmatrix" help="You can create a count matrix with the tool ..."/>
+
+         <conditional name="filter_sel">
+            <param name="filter_sel_opts" type="select" label="Select properties to filter">
+              <option value="all_vs_all">All against all</option>
+              <option value="specific">specific</option>
+            </param>
+            <when value="specific">
+
+                <param name="control_cols" label="Select columns containing treatment." type="data_column" data_ref="input_matrix"
+                    numerical="True" multiple="true" use_header_names="true" size="120" display="checkboxes">
+                    <validator type="no_options" message="Please select at least one column."/>
+                </param>
+
+                <param name="experiement_cols" label="Select columns containing treatment." type="data_column" data_ref="input_matrix"
+                    numerical="True" multiple="true" use_header_names="true" size="120" display="checkboxes">
+                    <validator type="no_options" message="Please select at least one column."/>
+                </param>
+
+            </when>
+            <when value="all_v_all" />
+        </conditional>
+
     </inputs>
 
-    <outputs>
-        <data format="txt" name="outfile" label="${tool.name} on ${on_string}: out"/>
-    </outputs>
+  <outputs>
+    <data format="txt" name="deseq_out" label="DESeq2 result file"/>
+    <data format="pdf" name="plots" label="Plot collection"/>
+    <data format="txt" name="log" label="DESeq2 log file"/>
+  </outputs>
 
-    <tests>
-    </tests>
+  <help>
+
+.. class:: infomark
 
-    <help>
+**What it does** 
+
+`DESeq` is a tool for differential expression testing of RNA-Seq data.
 
 
+**Inputs**
 
-  </help>
+`DESeq` requires three input files to run:
+
+1. Annotation file in GFF3, containing the necessary information about the transcripts that are to be quantified.
+2. The BAM alignment files grouped into replicate groups, each containing several replicates. BAM files store the read alignments in a compressed format. They can be generated using the `SAM-to-BAM` tool in the NGS: SAM Tools section. (The script will also work with only two groups containing only a single replicate each. However, this analysis has less statistical power and is  therefor not recommended.)
+
+**Output**
+
+`DESeq` generates a text file containing the gene name and the p-value.
+
+------
+
+**Licenses**
+
+If **DESeq** is used to obtain results for scientific publications it
+should be cited as [1]_.
+
+**References** 
+
+.. [1] Anders, S and Huber, W (2010): `Differential expression analysis for sequence count data`_. 
+
+.. _Differential expression analysis for sequence count data: http://dx.doi.org/10.1186/gb-2010-11-10-r106
+
+------
+
+For more information see http://www.sequenceontology.org/gff3.shtml
+
+**SAM/BAM format** The Sequence Alignment/Map (SAM) format is a
+tab-limited text format that stores large nucleotide sequence
+alignments. BAM is the binary version of a SAM file that allows for
+fast and intensive data processing. The format specification and the
+description of SAMtools can be found on
+http://samtools.sourceforge.net/.
+
+------
+
+DESeq-hts Wrapper Version 0.3 (Feb 2012)
+
+</help>
 </tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/countmatrix.txt	Mon Sep 02 10:09:37 2013 -0400
@@ -0,0 +1,19 @@
+#	G1:Mut1	G1:Mut1	G2:Mut2	G2:Mut2	G3:WT	G3:WT
+#Feature	Spl1	Spl2	Spl3	Spl4	Spl5	Spl6
+NM_001001130	97	43	61	34	73	26
+NM_001001144	25	8	9	3	5	5
+NM_001001152	72	45	29	20	31	13
+NM_001001160	0	1	1	1	0	0
+NM_001001177	0	1	0	4	3	3
+NM_001001178	0	2	1	0	4	0
+NM_001001179	0	0	0	0	0	2
+NM_001001180	0	0	0	0	0	2
+NM_001001181	415	319	462	185	391	155
+NM_001001182	1293	945	987	297	938	496
+NM_001001183	5	4	11	7	11	2
+NM_001001184	135	198	178	110	205	64
+NM_001001185	186	1	0	1	1	0
+NM_001001186	75	90	91	34	63	54
+NM_001001187	267	236	170	165	202	51
+NM_001001295	5	2	6	1	7	0
+NM_001001309	1	0	0	1	2	1
--- a/tool_dependencies.xml	Fri Aug 02 13:16:18 2013 -0400
+++ b/tool_dependencies.xml	Mon Sep 02 10:09:37 2013 -0400
@@ -3,10 +3,10 @@
     <package name="R_3_0_1" version="3.0.1">
         <repository changeset_revision="bae5c9880b71" name="package_r_3_0_1" owner="bgruening" toolshed="http://testtoolshed.g2.bx.psu.edu" />
     </package>
-    <package name="rpy2_2_3_6" version="2.3.6">
-        <repository changeset_revision="738e7d34477d" name="package_rpy2_2_3_6" owner="bgruening" toolshed="http://testtoolshed.g2.bx.psu.edu" />
-    </package>
     <package name="deseq2" version="1.0.17">
         <repository changeset_revision="eab36a9a70b1" name="package_deseq2_1_0_17" owner="bgruening" toolshed="http://testtoolshed.g2.bx.psu.edu" />
     </package>
+    <set_environment version="1.0">
+        <environment_variable action="set_to" name="DESEQ2_SCRIPT_PATH">$REPOSITORY_INSTALL_DIR/scripts</environment_variable>
+    </set_environment>
 </tool_dependency>