changeset 34:d917cde0f94b draft

Uploaded
author fubar
date Thu, 08 Aug 2013 07:41:46 -0400
parents 3906fe90d881
children e427310cc7af
files htseq_bams_to_count_matrix/generatetest.sh htseq_bams_to_count_matrix/htseqsams2mx.xml htseq_bams_to_count_matrix/htseqsams2mx.xml~ htseq_bams_to_count_matrix/tool_dependencies.xml
diffstat 4 files changed, 148 insertions(+), 9 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/htseq_bams_to_count_matrix/generatetest.sh	Thu Aug 08 07:41:46 2013 -0400
@@ -0,0 +1,1 @@
+python ../htseqsams2mx.py -g rn4_chr20_100k.gtf -o test.xls --samf "'rn4chr20test1.bam','col1'" --samf "'rn4chr20test2.bam','col2'"
--- a/htseq_bams_to_count_matrix/htseqsams2mx.xml	Wed Aug 07 21:54:01 2013 -0400
+++ b/htseq_bams_to_count_matrix/htseqsams2mx.xml	Thu Aug 08 07:41:46 2013 -0400
@@ -1,4 +1,4 @@
-<tool id="htseqsams2mx" name="SAM/BAM to count matrix" version="0.3">
+<tool id="htseqsams2mx" name="SAM/BAM to count matrix" version="0.2">
  <description>using HTSeq code</description>
   <stdio>
      <exit_code range="666"   level="warning"   
@@ -6,7 +6,6 @@
   </stdio>
   <requirements>
       <requirement type="package" version="1.7.1">numpy</requirement>
-      <requirement type="package" version="0.7.5">pysam</requirement>
       <requirement type="package" version="2.4.11">freetype</requirement>
       <requirement type="package" version="1.2.1">matplotliblite</requirement> 
       <requirement type="package" version="0.5.4p3">htseq</requirement>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/htseq_bams_to_count_matrix/htseqsams2mx.xml~	Thu Aug 08 07:41:46 2013 -0400
@@ -0,0 +1,145 @@
+<tool id="htseqsams2mx" name="SAM/BAM to count matrix" version="0.2">
+ <description>using HTSeq code</description>
+  <stdio>
+     <exit_code range="666"   level="warning"   
+       description="Exit code 666 encountered" />
+  </stdio>
+  <requirements>
+      <requirement type="package" version="1.7.1">numpy</requirement>
+      <requirement type="package" version="2.4.11">freetype</requirement>
+      <requirement type="package" version="1.2.1">matplotliblite</requirement> 
+      <requirement type="package" version="0.5.4p3">htseq</requirement>
+  </requirements>
+  <command interpreter="python">
+    htseqsams2mx.py -g "$gfffile" -o "$outfile" -m "$model" --id_attribute "$id_attr" --feature_type "$feature_type"
+    --samf "'$firstsamf','${firstsamf.name}'"
+    #if $secondsamf:
+    --samf "'$secondsamf','${secondsamf.name}'"
+    #end if
+    #if $thirdsamf:
+    --samf "'$thirdsamf','${thirdsamf.name}'"
+    #end if
+    #if $fourthsamf:
+    --samf "'$fourthsamf','${fourthsamf.name}'"
+    #end if
+    #for $s in $samfiles:
+      #if $s.samf:
+        --samf "'${s.samf}','${s.samf.name}'" 
+      #end if
+    #end for
+  </command>
+  <inputs>
+    <param format="gtf" name="gfffile" type="data" label="Gene model (GFF) file to count reads over from your current history" size="100" />
+    <param name="mapqMin" label="Filter reads with mapq below than this value" 
+    help="0 to count any mapping quality read. Otherwise only reads at or above specified mapq will be counted" 
+    type="integer" value="5"/>
+    <param name="title" label="Name for this job's output file" type="text" size="80" value="bams to DGE count matrix"/>
+    <param name="stranded" value="false" type="boolean" label="Reads are stranded - use strand in counting" display="checkbox"
+      truevalue="yes" falsevalue="no" checked="no" help="Check this ONLY if you know your sequences are strand specific" />
+    <param name="model"  type="select" label="Model for counting reads over the supplied gene model- see HTSeq docs"
+        help="If in doubt, union is a reasonable default but intersection-strict avoids double counting over overlapping exons">
+        <option value="union" selected="true">union</option>
+        <option value="intersection-strict">intersection-strict</option>
+        <option value="intersection-nonempty">intersection-nonempty</option>
+    </param>   
+    <param name="id_attr" type="select" label="GTF attribute to output as the name for each contig - see HTSeq docs"
+        help="If in doubt, use gene name or if you need the id in your GTF, gene id">
+        <option value="gene_name" selected="true">gene name</option>
+        <option value="gene_id">gene id</option>
+        <option value="transcript_id">transcript id</option>
+        <option value="transcript_name">transcript name</option>
+    </param>   
+    <param name="feature_type" type="select" label="GTF feature type for counting reads over the supplied gene model- see HTSeq docs"
+        help="GTF feature type to count over - exon is a good choice with gene name as the contig to count over">
+        <option value="exon" selected="true">exon</option>
+        <option value="CDS">CDS</option>
+        <option value="UTR">UTR</option>
+        <option value="transcript">transcript</option>
+    </param>   
+    <param name="firstsamf" type="data" label="First bam/sam file from your history to count reads overlapping gene model regions"
+          format="sam,bam" optional="false"/>
+    <param name="secondsamf" type="data" label="Another bam/sam file from your history to count reads overlapping gene model regions"
+          format="sam,bam" optional="false"/>
+    <param name="thirdsamf" type="data" label="Another bam/sam file from your history to count reads overlapping gene model regions"
+          format="sam,bam" optional="true"/>
+    <param name="fourthsamf" type="data" label="Another bam/sam file from your history to count reads overlapping gene model regions"
+          format="sam,bam" optional="true"/>
+    <repeat name="samfiles" title="Use this to add all needed additional bam/sam files from your history to count reads overlapping gene model regions">
+        <param name="samf" type="data" label="Additional bam/sam file from your history" format="sam,bam" size="100" optional="true"/>
+    </repeat>
+  </inputs>
+  <outputs>
+    <data format="tabular" name="outfile" label="${title}_htseqsams2mx.xls" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="feature_type" value="exon" />
+      <param name="gfffile" value="rn4_chr20_100k.gtf" />
+      <param name="firstsamf" value="rn4chr20test1.bam" label="rn4chr20test1.bam"/>
+      <param name="secondsamf" value="rn4chr20test2.bam" label="rn4chr20test2.bam" />
+      <param name="id_attr" value="gene_name" />
+      <param name="model" value="union" />
+      <param name="stranded" value="no" />
+      <param name="title" value="htseqtest" />
+      <param name="mapqMin" value="0" />
+
+      <output name="outfile" file="htseqsams2mx_test1_out.xls" lines_diff="1"/>
+    </test>
+  </tests>
+  <help>
+
+**What this tool does**
+
+Counts reads in multiple sam/bam format mapped files and generates a matrix ideal for edgeR and other count based tools
+It uses HTSeq to count your sam reads over a gene model supplied as a GTF file
+The output is a tabular text (columnar - spreadsheet) file containing the 
+count matrix for downstream processing. Each row contains the counts from each sample for each
+of the non-emtpy GTF input file contigs matching the GTF attribute choice above. 
+You probably want to use gene level GTF output attribute and count reads that overlap 
+GTF exons for RNA-seq. Or you can count over exons by using transcript level output names or ids. Etc.
+
+----
+
+**Author's plea on replicates**
+
+If you want to interpret the downstream p values in terms of rejecting or accepting the null hypothesis 
+under random sampling with replacement from the universe of possible biological/experimental replicates from which your data was derived,
+which is what published p values are often assumed to do, then you need biological 
+(or for cell culture material experimental) replicates. 
+
+Using technical or no replicates means the downstream p values are not interpretable the way most people would assume 
+they are - ie as the probability of obtaining a result as or more extreme as your experimental data
+in millions of experiments conducted using the same methods under the null hypothesis.
+
+There is no way around this and it is scientific fraud to ignore this issue and publish bogus p values derived from 
+technical or no replicates without making the lack of biological or experimental error in the p value calculations 
+clear to your readers so they can adjust their expectations. However, the buck stops here at higher level inference.
+If you have no replicates, you must not use this tool as the p values are uninterpretable. So there.
+
+See your stats 101 notes on the central limit theorem and test statistics for a refresher or talk to a 
+statistician if this makes no sense please.
+
+**Attribution**
+
+This Galaxy tool relies on HTSeq_ from http://www-huber.embl.de/users/anders/HTSeq/doc/index.html 
+for the tricky work of counting. That code includes the following attribution:
+
+## Written by Simon Anders (sanders@fs.tum.de), European Molecular Biology
+## Laboratory (EMBL). (c) 2010. Released under the terms of the GNU General
+## Public License v3. Part of the 'HTSeq' framework, version HTSeq-0.5.4p3
+
+It will be automatically installed if you use the toolshed as in general, you probably should.
+HTSeq_ must be installed with this tool if you install manually.
+
+Otherwise, all code and documentation comprising this tool including the requirement
+for more than one sample bam
+was written by Ross Lazarus and is 
+licensed to you under the LGPL_ like other rgenetics artefacts
+
+Sorry, don't use so can't be buggered with read groups - contributions welcome - send code
+
+.. _LGPL: http://www.gnu.org/copyleft/lesser.html
+.. _HTSeq: http://www-huber.embl.de/users/anders/HTSeq/doc/index.html
+  </help>
+
+</tool>
--- a/htseq_bams_to_count_matrix/tool_dependencies.xml	Wed Aug 07 21:54:01 2013 -0400
+++ b/htseq_bams_to_count_matrix/tool_dependencies.xml	Thu Aug 08 07:41:46 2013 -0400
@@ -3,9 +3,6 @@
     <package name="numpy" version="1.7.1">
         <repository changeset_revision="af9633757cf0" name="package_numpy_1_7" owner="blankenberg" prior_installation_required="True" toolshed="http://testtoolshed.g2.bx.psu.edu" />
     </package>
-    <package name="pysam" version="0.7.5">
-        <repository changeset_revision="a4e35f23093f" name="package_pysam_0_7_5" owner="fubar" prior_installation_required="True" toolshed="http://testtoolshed.g2.bx.psu.edu" />
-    </package>
     <package name="freetype" version="2.4.11">
         <repository changeset_revision="4e54e357ac25" name="package_freetype_2_4" owner="bgruening" prior_installation_required="True" toolshed="http://testtoolshed.g2.bx.psu.edu/" />
     </package>
@@ -22,9 +19,6 @@
                         <repository changeset_revision="af9633757cf0" name="package_numpy_1_7" owner="blankenberg" prior_installation_required="True" toolshed="http://testtoolshed.g2.bx.psu.edu">
                             <package name="numpy" version="1.7.1" />
                         </repository>
-                        <repository changeset_revision="a4e35f23093f" name="package_pysam_0_7_5" owner="fubar" prior_installation_required="True" toolshed="http://testtoolshed.g2.bx.psu.edu">
-                            <package name="pysam" version="0.7.5" />
-                        </repository>
                         <repository changeset_revision="4e54e357ac25" name="package_freetype_2_4" owner="bgruening" prior_installation_required="True" toolshed="http://testtoolshed.g2.bx.psu.edu/">
                             <package name="freetype" version="2.4.11" />
                         </repository>
@@ -40,7 +34,7 @@
             </actions>
         </install>
         <readme>
-            Installation of HTSeq requires Python 2.5+ (does not yet work with Python 3), pysam and the Numpy Python package. 
+            Installation of HTSeq requires Python 2.5+ (does not yet work with Python 3), and the Numpy Python package. 
             Note this uses the matplotlib lite version dependent on the lite version of numpy - no atlas compilation 
         </readme>
     </package>