changeset 17:13f82e0a397c draft

Should now pass functional test. Painful business
author fubar
date Fri, 07 Jun 2013 07:34:25 -0400
parents 63eaa64904be
children 8c5184649b23
files htseq_bams_to_count_matrix/generatetest.sh htseq_bams_to_count_matrix/htseqsams2mx.py htseq_bams_to_count_matrix/htseqsams2mx.xml htseq_bams_to_count_matrix/test-data/generatetest.sh
diffstat 4 files changed, 25 insertions(+), 17 deletions(-) [+]
line wrap: on
line diff
--- a/htseq_bams_to_count_matrix/generatetest.sh	Fri Jun 07 02:59:49 2013 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,2 +0,0 @@
-#python ../htseqsams2mx.py -g rn4_chr20_100k.gtf -o test.xls --samf "'rn4chr20test1.bam','col1'" --samf "'rn4chr20test2.bam','col2'"
-python ../htseqsams2mx.py -g rn4_chr20_100k.gtf -o htseqsams2mx_test1_out.xls --samf "'rn4chr20test1.bam',''" --samf "'rn4chr20test2.bam',''"
--- a/htseq_bams_to_count_matrix/htseqsams2mx.py	Fri Jun 07 02:59:49 2013 -0400
+++ b/htseq_bams_to_count_matrix/htseqsams2mx.py	Fri Jun 07 07:34:25 2013 -0400
@@ -121,7 +121,7 @@
           sys.stdout.write( "Warning: No features of type '%s' found.\n" % opts.feature_type )
        for sami,sam_filename in enumerate(sam_filenames):
            colname = colnames[sami]
-           isbam = sam_filename.endswith('.bam')
+           isbam = colname.endswith('.bam')
            try:
               if isbam:
                   read_seq = HTSeq.BAM_Reader( sam_filename )
--- a/htseq_bams_to_count_matrix/htseqsams2mx.xml	Fri Jun 07 02:59:49 2013 -0400
+++ b/htseq_bams_to_count_matrix/htseqsams2mx.xml	Fri Jun 07 07:34:25 2013 -0400
@@ -4,12 +4,14 @@
      <exit_code range="666"   level="warning"   
        description="Exit code 666 encountered" />
   </stdio>
+  <!--
   <requirements>
       <requirement type="package" version="0.5.4p3">htseq</requirement>
       <requirement type="package" version="2.4.11">freetype</requirement>
       <requirement type="package" version="1.7.1">numpy</requirement>
       <requirement type="package" version="1.2.1">matplotlib</requirement> 
   </requirements>
+  -->
   <command interpreter="python">
     htseqsams2mx.py -g "$gfffile" -o "$outfile" -m "$model" --id_attribute "$id_attr" --feature_type "$feature_type"
     --samf "'$firstsamf','${firstsamf.name}'"
@@ -23,7 +25,7 @@
     --samf "'$fourthsamf','${fourthsamf.name}'"
     #end if
     #for $s in $samfiles:
-      #if $s.samf != None:
+      #if $s.samf:
         --samf "'${s.samf}','${s.samf.name}'" 
       #end if
     #end for
@@ -33,7 +35,7 @@
     <param name="mapqMin" label="Filter reads with mapq below than this value" 
     help="0 to count any mapping quality read. Otherwise only reads at or above specified mapq will be counted" 
     type="integer" value="5"/>
-    <param name="title" label="Name for this job's output file" type="text" size="80" value="bams to DGE matrix"/>
+    <param name="title" label="Name for this job's output file" type="text" size="80" value="bams to DGE count matrix"/>
     <param name="stranded" value="false" type="boolean" label="Reads are stranded - use strand in counting" display="checkbox"
       truevalue="yes" falsevalue="no" checked="no" help="Check this ONLY if you know your sequences are strand specific" />
     <param name="model"  type="select" label="Model for counting reads over the supplied gene model- see HTSeq docs"
@@ -57,9 +59,9 @@
         <option value="transcript">transcript</option>
     </param>   
     <param name="firstsamf" type="data" label="First bam/sam file from your history to count reads overlapping gene model regions"
-          format="sam,bam" optional="true"/>
+          format="sam,bam" optional="false"/>
     <param name="secondsamf" type="data" label="Another bam/sam file from your history to count reads overlapping gene model regions"
-          format="sam,bam" optional="true"/>
+          format="sam,bam" optional="false"/>
     <param name="thirdsamf" type="data" label="Another bam/sam file from your history to count reads overlapping gene model regions"
           format="sam,bam" optional="true"/>
     <param name="fourthsamf" type="data" label="Another bam/sam file from your history to count reads overlapping gene model regions"
@@ -75,14 +77,15 @@
     <test>
       <param name="feature_type" value="exon" />
       <param name="gfffile" value="rn4_chr20_100k.gtf" />
-      <param name="firstsamf" dbkey='rn4' ftype="bam" value="rn4chr20test1.bam" />
-      <param name="id_attr" value="gene_id" />
+      <param name="firstsamf" value="rn4chr20test1.bam" label="rn4chr20test1.bam"/>
+      <param name="secondsamf" value="rn4chr20test2.bam" label="rn4chr20test2.bam" />
+      <param name="id_attr" value="gene_name" />
       <param name="model" value="union" />
       <param name="stranded" value="no" />
       <param name="title" value="htseqtest" />
       <param name="mapqMin" value="0" />
-      <param name="secondsamf" value="rn4chr20test2.bam" ftype="bam" dbkey="rn4" />
-      <output name="outfile" file="htseqsams2mx_test1_out.xls" ftype="tabular" lines_diff="1"/>
+
+      <output name="outfile" file="htseqsams2mx_test1_out.xls" lines_diff="1"/>
     </test>
   </tests>
   <help>
@@ -99,20 +102,21 @@
 
 ----
 
-**Tool author's plea on the importance of replicates**
+**Author's plea on replicates**
 
-If you want the downstream p values to inform you about your data in terms of rejecting or accepting the null hypothesis 
-under random sampling from the universe of possible biological/experimental replicates from which your data was drawn,
+If you want to interpret the downstream p values in terms of rejecting or accepting the null hypothesis 
+under random sampling with replacement from the universe of possible biological/experimental replicates from which your data was derived,
 which is what published p values are often assumed to do, then you need biological 
 (or for cell culture material experimental) replicates. 
 
 Using technical or no replicates means the downstream p values are not interpretable the way most people would assume 
 they are - ie as the probability of obtaining a result as or more extreme as your experimental data
-in millions of experiments conducted under the null hypothesis.
+in millions of experiments conducted using the same methods under the null hypothesis.
 
 There is no way around this and it is scientific fraud to ignore this issue and publish bogus p values derived from 
 technical or no replicates without making the lack of biological or experimental error in the p value calculations 
-clear to your readers.
+clear to your readers so they can adjust their expectations. However, the buck stops here at higher level inference.
+If you have no replicates, you must not use this tool as the p values are uninterpretable. So there.
 
 See your stats 101 notes on the central limit theorem and test statistics for a refresher or talk to a 
 statistician if this makes no sense please.
@@ -129,9 +133,13 @@
 It will be automatically installed if you use the toolshed as in general, you probably should.
 HTSeq_ must be installed with this tool if you install manually.
 
-Otherwise, all code and documentation comprising this tool was written by Ross Lazarus and is 
+Otherwise, all code and documentation comprising this tool including the requirement
+for more than one sample bam
+was written by Ross Lazarus and is 
 licensed to you under the LGPL_ like other rgenetics artefacts
 
+Sorry, don't use so can't be buggered with read groups - contributions welcome - send code
+
 .. _LGPL: http://www.gnu.org/copyleft/lesser.html
 .. _HTSeq: http://www-huber.embl.de/users/anders/HTSeq/doc/index.html
   </help>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/htseq_bams_to_count_matrix/test-data/generatetest.sh	Fri Jun 07 07:34:25 2013 -0400
@@ -0,0 +1,2 @@
+#python ../htseqsams2mx.py -g rn4_chr20_100k.gtf -o test.xls --samf "'rn4chr20test1.bam','col1'" --samf "'rn4chr20test2.bam','col2'"
+python ../htseqsams2mx.py -g rn4_chr20_100k.gtf -o htseqsams2mx_test1_out.xls --samf "'rn4chr20test1.bam',''" --samf "'rn4chr20test2.bam',''"