Mercurial > repos > fubar > htseq_bams_to_count_matrix

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/htseq_bams_to_count_matrix/generatetest.sh	Fri Jun 07 01:50:53 2013 -0400
@@ -0,0 +1,1 @@
+python ../htseqsams2mx.py -g rn4_chr20_100k.gtf -o test.xls --samf "'rn4chr20test1.bam','col1'" --samf "'rn4chr20test2.bam','col2'"
--- a/htseq_bams_to_count_matrix/htseqsams2mx.py	Thu Jun 06 22:14:16 2013 -0400
+++ b/htseq_bams_to_count_matrix/htseqsams2mx.py	Fri Jun 07 01:50:53 2013 -0400
@@ -60,52 +60,17 @@
         self.msg = msg


-def keynat(s=None):
-     '''
-     borrowed from http://code.activestate.com/recipes/285264-natural-string-sorting/
-     A natural sort helper function for sort() and sorted()
-     without using regular expressions or exceptions.
-     >>> items = ('Z', 'a', '10th', '1st', '9') sorted(items)
-     ['10th', '1st', '9', 'Z', 'a']
-     >>> sorted(items, key=keynat)
-     ['1st', '9', '10th', 'a', 'Z']
-     '''
-     if type(s) == type([]) or type(s) == type(()) :
-         s = s[0]
-     it = type(1)
-     r = []
-     for c in s:
-         if c.isdigit():
-             d = int(c)
-             if r and type( r[-1] ) == it:
-                 r[-1] = r[-1] * 10 + d
-             else:
-                 r.append(d)
-         else:
-             r.append(c.lower())
-     return r
-
-
-def sort_table(table, cols):
-    """ sort a table by multiple columns
-        table: a list of lists (or tuple of tuples) where each inner list
-               represents a row
-        cols:  a list (or tuple) specifying the column numbers to sort by
-               e.g. (1,0) would sort by column 1, then by column 0
+def htseqMX(gff_filename,sam_filenames,colnames,opts):
     """
-    for col in reversed(cols):
-        table = sorted(table, key=operator.itemgetter(col))
-    return table
-
-
-
-def htseqMX(gff_filename,sam_filenames,colnames,opts):
-
+    Code taken from count.py in Simon Anders HTSeq distribution
+    Wrapped in a loop to accept multiple bam/sam files and their names from galaxy to
+    produce a matrix of contig counts by sample for downstream use in edgeR and DESeq tools
+    """
     class UnknownChrom( Exception ):
        pass

     def my_showwarning( message, category, filename, lineno = None, line = None ):
-       sys.stderr.write( "Warning: %s\n" % message )
+       sys.stdout.write( "Warning: %s\n" % message )

     def invert_strand( iv ):
        iv2 = iv.copy()
@@ -156,12 +121,19 @@
           sys.stdout.write( "Warning: No features of type '%s' found.\n" % opts.feature_type )
        for sami,sam_filename in enumerate(sam_filenames):
            colname = colnames[sami]
+           isbam = sam_filename.endswith('.bam')
            try:
-              read_seq = HTSeq.SAM_Reader( sam_filename )
+              if isbam:
+                  read_seq = HTSeq.BAM_Reader( sam_filename )
+              else:
+                  read_seq = HTSeq.SAM_Reader( sam_filename )
               first_read = iter(read_seq).next()
               pe_mode = first_read.paired_end
            except:
-              sys.stderr.write( "Error occured when reading first line of sam file %s\n" % sam_filename )
+              if isbam:
+                  sys.stderr.write( "Error occured when reading first line of bam file %s colname=%s \n" % (sam_filename,colname) )
+              else:
+                  sys.stderr.write( "Error occured when reading first line of sam file %s colname=%s \n" % (sam_filename,colname ))
               raise

            try:
@@ -262,7 +234,7 @@
               raise

            if not opts.quiet:
-              sys.stdout.write( "%d sam %s processed.\n" % ( sami, "lines " if not pe_mode else "line pairs" ) )
+              sys.stdout.write( "%d sam %s processed.\n" % ( seqi, "lines " if not pe_mode else "line pairs" ) )
        return counts,empty,ambiguous,lowqual,notaligned,nonunique

     warnings.showwarning = my_showwarning
@@ -280,7 +252,7 @@


 def usage():
-        print >> sys.stderr, """Usage: python htseqsams2mx.py -w <halfwindowsize> -g <gfffile.gff> -o <outfilename> [-i] [-c] --samf "<sam1.sam>,<sam1.column_header>" --samf "...<samN.column_header>" """
+        print >> sys.stdout, """Usage: python htseqsams2mx.py -w <halfwindowsize> -g <gfffile.gff> -o <outfilename> [-i] [-c] --samf "<sam1.sam>,<sam1.column_header>" --samf "...<samN.column_header>" """
         sys.exit(1)

 if __name__ == "__main__":
@@ -345,15 +317,23 @@
     contigs = counts.keys()
     contigs.sort()
     totalc = 0
+    emptycontigs = 0
     for contig in contigs:
-        totalc += sum(counts[contig])
-        crow = [contig,] + ['%d' % x for x in counts[contig]]
-        res.append('\t'.join(crow))
+        thisc = sum(counts[contig])
+        if thisc > 0: # no output for empty contigs
+            totalc += thisc
+            crow = [contig,] + ['%d' % x for x in counts[contig]]
+            res.append('\t'.join(crow))
+        else:
+            emptycontigs += 1
     outf = open(opts.outfname,'w')
     outf.write('\n'.join(res))
     outf.write('\n')
     outf.close()
     walltime = int(time.time() - starttime)
-    accumulatornames = ('walltimeseconds','contigs','emptyread','ambiguous','lowqual','notaligned','nonunique')
-    notes = ['%s=%d' % (accumulatornames[i],x) for i,x in enumerate((len(contigs),empty,ambiguous,lowqual,notaligned,nonunique))]
+    accumulatornames = ('walltimeseconds','totreadscounted','ncontigs','emptyreads','ambiguousreads','lowqualreads',
+           'notalignedreads','nonuniquereads','emptycontigs')
+    accums = (walltime,totalc,len(contigs),empty,ambiguous,lowqual,notaligned,nonunique,emptycontigs)
+    notes = ['%s=%d' % (accumulatornames[i],x) for i,x in enumerate(accums)]
     print >> sys.stdout, ','.join(notes)
+    sys.exit(0)
--- a/htseq_bams_to_count_matrix/htseqsams2mx.xml	Thu Jun 06 22:14:16 2013 -0400
+++ b/htseq_bams_to_count_matrix/htseqsams2mx.xml	Fri Jun 07 01:50:53 2013 -0400
@@ -1,26 +1,39 @@
-<tool id="htseqsams2mx" name="Multiple SAMs to count matrix" version="0.2">
-    <description>for DGE</description>
-    <requirements>
+<tool id="htseqsams2mx" name="SAM/BAM to count matrix" version="0.2">
+ <description>using HTSeq code</description>
+  <stdio>
+     <exit_code range="666"   level="warning"
+       description="Exit code 666 encountered" />
+  </stdio>
+  <requirements>
       <requirement type="package" version="0.5.4p3">htseq</requirement>
       <requirement type="package" version="2.4.11">freetype</requirement>
       <requirement type="package" version="1.7.1">numpy</requirement>
       <requirement type="package" version="1.2.1">matplotlib</requirement>
-    </requirements>
-    <command interpreter="python">
+  </requirements>
+  <command interpreter="python">
     htseqsams2mx.py -g "$gfffile" -o "$outfile" -m "$model" --id_attribute "$id_attr" --feature_type "$feature_type"
     --samf "'$firstsamf','${firstsamf.name}'"
+    #if secondsamfile:
+    --samf "'$secondsamf','${secondsamf.name}'"
+    #end if
+    #if thirdsamfile:
+    --samf "'$thirdsamf','${thirdsamf.name}'"
+    #end if
+    #if fourthsamfile:
+    --samf "'$fourthsamf','${fourthsamf.name}'"
+    #end if
     #for $s in $samfiles:
     --samf "'${s.samf}','${s.samf.name}'"
     #end for
-    </command>
+  </command>
   <inputs>
     <param format="gtf" name="gfffile" type="data" label="Gene model (GFF) file to count reads over from your current history" size="100" />
     <param name="mapqMin" label="Filter reads with mapq below than this value"
     help="0 to count any mapping quality read. Otherwise only reads at or above specified mapq will be counted"
     type="integer" value="5"/>
     <param name="title" label="Name for this job's output file" type="text" size="80" value="bams to DGE matrix"/>
-    <param name="forceName" value="false" type="boolean" label="Force replacement to chr:start-offset as the name for each contig in the output"
-      truevalue="true" falsevalue="false" checked="no" help="Leave as false to use the contig names as supplied in your bed file" />
+    <param name="stranded" value="false" type="boolean" label="Reads are stranded - use strand in counting" display="checkbox"
+      truevalue="yes" falsevalue="no" checked="no" help="Check this ONLY if you know your sequences are strand specific" />
     <param name="model"  type="select" label="Model for counting reads over the supplied gene model- see HTSeq docs"
         help="If in doubt, union is a reasonable default but intersection-strict avoids double counting over overlapping exons">
         <option value="union" selected="true">union</option>
@@ -28,65 +41,98 @@
         <option value="intersection-nonempty">intersection-nonempty</option>
     </param>
     <param name="id_attr" type="select" label="GTF attribute to output as the name for each contig - see HTSeq docs"
-        help="If in doubt, gene name is the only option right now">
+        help="If in doubt, use gene name or if you need the id in your GTF, gene id">
         <option value="gene_name" selected="true">gene name</option>
         <option value="gene_id">gene id</option>
         <option value="transcript_id">transcript id</option>
         <option value="transcript_name">transcript name</option>
     </param>
     <param name="feature_type" type="select" label="GTF feature type for counting reads over the supplied gene model- see HTSeq docs"
-        help="exon is all">
+        help="GTF feature type to count over - exon is a good choice with gene name as the contig to count over">
         <option value="exon" selected="true">exon</option>
         <option value="CDS">CDS</option>
         <option value="UTR">UTR</option>
         <option value="transcript">transcript</option>
     </param>
-    <param name="firstsamf" type="data" label="SAM file from your history to count reads overlapping gene model regions" format="sam" />
-    <repeat name="samfiles" title="Additional SAM files from your history to count reads overlapping gene model regions">
-        <param name="samf" type="data" label="Additional SAM file from your history" format="sam" size="100"/>
+    <param name="firstsamf" type="data" label="First bam/sam file from your history to count reads overlapping gene model regions"
+          format="sam,bam" optional="true"/>
+    <param name="secondsamf" type="data" label="Another bam/sam file from your history to count reads overlapping gene model regions"
+          format="sam,bam" optional="true"/>
+    <param name="thirdsamf" type="data" label="Another bam/sam file from your history to count reads overlapping gene model regions"
+          format="sam,bam" optional="true"/>
+    <param name="fourthsamf" type="data" label="Another bam/sam file from your history to count reads overlapping gene model regions"
+          format="sam,bam" optional="true"/>
+    <repeat name="samfiles" title="Use this to add all needed additional bam/sam files from your history to count reads overlapping gene model regions">
+        <param name="samf" type="data" label="Additional bam/sam file from your history" format="sam,bam" size="100" optional="true"/>
     </repeat>
   </inputs>
   <outputs>
     <data format="tabular" name="outfile" label="${title}_htseqsams2mx.xls" />
   </outputs>
+  <tests>
+    <test>
+      <param name="feature_type" value="exon" />
+      <param name="firstsamf" dbkey='rn4' ftype="bam" value="rn4chr20test1.bam" />
+      <param name="firstsamf.name" value="rn4chr20test1.bam" />
+      <param name="id_attr" value="gene_id" />
+      <param name="model" value="union" />
+      <param name="stranded" value="no" />
+      <param name="title" value="htseq test" />
+      <param name="mapqmin" value="0" />
+      <param name="secondsamfile" value="rn4chr20test2.bam" ftype="bam" dbkey="rn4" />
+      <param name="secondsamfile.name" value="rn4chr20test2.bam" />
+      <output name="outfile" file="htseqsams2mx_test1_out.xls" ftype="tabular" lines_diff="30"/>
+    </test>
+  </tests>
   <help>

-**Warning**
-
-This code will count reads overlapping contigs supplied in the gff file.
-
-
-**Note**
-
-htseqsams2mx is an experimental tool currently under test
-
-There is much discussion about whether to count optical/pcr duplicates. If you set the ignore flag to True, any reads in the input BAM files marked as
-duplicates by upstream processing such as the Picard MarkDuplicates tool will NOT be counted. The 'right' setting depends on your data and coverage. For extremely deep
-coverage, true duplicate reads are inevitable and ignoring them may be throwing away useful real data. In most cases, counting them is probably a reasonable
-choice - any induced bias is likely to be non-differential between samples, whereas it's not at all clear whether that's the case if they are ignored.
-
-----
-
 **What this tool does**

-Counts reads in multiple sample aligned sam format files using HTSeq counting over a gene model supplied as a GFF file
-
-The output is a tabular file containing the count matrix and suitable for downstream processing.
+Counts reads in multiple sam/bam format mapped files and generates a matrix ideal for edgeR and other count-based tools.
+It uses HTSeq to count your sam reads over a gene model supplied as a GTF file.
+The output is a tabular text (columnar - spreadsheet) file containing the
+count matrix for downstream processing. Each row contains the counts from each sample for each
+of the non-empty GTF input file contigs matching the GTF attribute choice above.
+You probably want to use gene level GTF output attribute and count reads that overlap
+GTF exons for RNA-seq. Or you can count over exons by using transcript level output names or ids. Etc.

 ----

+**Tool author's plea on the importance of replicates**
+
+If you want the downstream p values to inform you about your data in terms of rejecting or accepting the null hypothesis
+under random sampling from the universe of possible biological/experimental replicates from which your data was drawn,
+which is what published p values are often assumed to do, then you need biological
+(or, for cell culture material, experimental) replicates.
+
+Using technical or no replicates means the downstream p values are not interpretable the way most people would assume
+they are - i.e. as the probability of obtaining a result as extreme as, or more extreme than, your experimental data
+in millions of experiments conducted under the null hypothesis.
+
+There is no way around this and it is scientific fraud to ignore this issue and publish bogus p values derived from
+technical or no replicates without making the lack of biological or experimental error in the p value calculations
+clear to your readers.
+
+See your stats 101 notes on the central limit theorem and test statistics for a refresher, or please talk to a
+statistician if this makes no sense.
+
 **Attribution**

+This Galaxy tool relies on HTSeq_ from http://www-huber.embl.de/users/anders/HTSeq/doc/index.html
+for the tricky work of counting. That code includes the following attribution:

-This Galaxy wrapper was written for a revised version by Ross Lazarus and is licensed under the LGPL_ like other rgenetics artefacts
+## Written by Simon Anders (sanders@fs.tum.de), European Molecular Biology
+## Laboratory (EMBL). (c) 2010. Released under the terms of the GNU General
+## Public License v3. Part of the 'HTSeq' framework, version HTSeq-0.5.4p3
+
+It will be installed automatically if you use the toolshed, as in general you probably should.
+HTSeq_ must be installed with this tool if you install manually.
+
+Otherwise, all code and documentation comprising this tool was written by Ross Lazarus and is
+licensed to you under the LGPL_ like other rgenetics artefacts

 .. _LGPL: http://www.gnu.org/copyleft/lesser.html
-
+.. _HTSeq: http://www-huber.embl.de/users/anders/HTSeq/doc/index.html
   </help>

 </tool>
-
-
-
-
-
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/htseq_bams_to_count_matrix/test-data/htseqsams2mx_test1_out.xls	Fri Jun 07 01:50:53 2013 -0400
@@ -0,0 +1,4 @@
+Contig	col1	col2
+Clic2	494	944
+F1M7K0_RAT	3	2
+Tmlhe	164	172
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/htseq_bams_to_count_matrix/test-data/rn4_chr20_100k.gtf	Fri Jun 07 01:50:53 2013 -0400
@@ -0,0 +1,62 @@
+chr20	protein_coding	CDS	801	1238	.	+	0	exon_number "1"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000730"; gene_name "F1M7K0_RAT"; p_id "P14715"; protein_id "ENSRNOP00000000957"; transcript_id "ENSRNOT00000000957"; transcript_name "F1M7K0_RAT"; tss_id "TSS11562";
+chr20	protein_coding	exon	801	1238	.	+	.	exon_number "1"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000730"; gene_name "F1M7K0_RAT"; p_id "P14715"; transcript_id "ENSRNOT00000000957"; transcript_name "F1M7K0_RAT"; tss_id "TSS11562";
+chr20	protein_coding	CDS	1742	1976	.	+	0	exon_number "2"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000730"; gene_name "F1M7K0_RAT"; p_id "P14715"; protein_id "ENSRNOP00000000957"; transcript_id "ENSRNOT00000000957"; transcript_name "F1M7K0_RAT"; tss_id "TSS11562";
+chr20	protein_coding	exon	1742	1976	.	+	.	exon_number "2"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000730"; gene_name "F1M7K0_RAT"; p_id "P14715"; transcript_id "ENSRNOT00000000957"; transcript_name "F1M7K0_RAT"; tss_id "TSS11562";
+chr20	protein_coding	CDS	2016	2177	.	+	2	exon_number "3"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000730"; gene_name "F1M7K0_RAT"; p_id "P14715"; protein_id "ENSRNOP00000000957"; transcript_id "ENSRNOT00000000957"; transcript_name "F1M7K0_RAT"; tss_id "TSS11562";
+chr20	protein_coding	exon	2016	2177	.	+	.	exon_number "3"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000730"; gene_name "F1M7K0_RAT"; p_id "P14715"; transcript_id "ENSRNOT00000000957"; transcript_name "F1M7K0_RAT"; tss_id "TSS11562";
+chr20	protein_coding	CDS	2263	2342	.	+	2	exon_number "4"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000730"; gene_name "F1M7K0_RAT"; p_id "P14715"; protein_id "ENSRNOP00000000957"; transcript_id "ENSRNOT00000000957"; transcript_name "F1M7K0_RAT"; tss_id "TSS11562";
+chr20	protein_coding	exon	2263	2342	.	+	.	exon_number "4"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000730"; gene_name "F1M7K0_RAT"; p_id "P14715"; transcript_id "ENSRNOT00000000957"; transcript_name "F1M7K0_RAT"; tss_id "TSS11562";
+chr20	protein_coding	CDS	2345	2533	.	+	0	exon_number "5"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000730"; gene_name "F1M7K0_RAT"; p_id "P14715"; protein_id "ENSRNOP00000000957"; transcript_id "ENSRNOT00000000957"; transcript_name "F1M7K0_RAT"; tss_id "TSS11562";
+chr20	protein_coding	exon	2345	2533	.	+	.	exon_number "5"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000730"; gene_name "F1M7K0_RAT"; p_id "P14715"; transcript_id "ENSRNOT00000000957"; transcript_name "F1M7K0_RAT"; tss_id "TSS11562";
+chr20	protein_coding	CDS	19528	19708	.	+	0	exon_number "1"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P13601"; protein_id "ENSRNOP00000044070"; transcript_id "ENSRNOT00000049573"; transcript_name "Tmlhe"; tss_id "TSS451";
+chr20	protein_coding	CDS	19528	19708	.	+	0	exon_number "1"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P3227"; protein_id "ENSRNOP00000000956"; transcript_id "ENSRNOT00000000956"; transcript_name "TMLH_RAT"; tss_id "TSS451";
+chr20	protein_coding	exon	19528	19708	.	+	.	exon_number "1"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P13601"; transcript_id "ENSRNOT00000049573"; transcript_name "Tmlhe"; tss_id "TSS451";
+chr20	protein_coding	exon	19528	19708	.	+	.	exon_number "1"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P3227"; transcript_id "ENSRNOT00000000956"; transcript_name "TMLH_RAT"; tss_id "TSS451";
+chr20	protein_coding	start_codon	19528	19530	.	+	0	exon_number "1"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P13601"; transcript_id "ENSRNOT00000049573"; transcript_name "Tmlhe"; tss_id "TSS451";
+chr20	protein_coding	start_codon	19528	19530	.	+	0	exon_number "1"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P3227"; transcript_id "ENSRNOT00000000956"; transcript_name "TMLH_RAT"; tss_id "TSS451";
+chr20	protein_coding	CDS	21979	22014	.	+	2	exon_number "2"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P13601"; protein_id "ENSRNOP00000044070"; transcript_id "ENSRNOT00000049573"; transcript_name "Tmlhe"; tss_id "TSS451";
+chr20	protein_coding	exon	21979	22014	.	+	.	exon_number "2"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P13601"; transcript_id "ENSRNOT00000049573"; transcript_name "Tmlhe"; tss_id "TSS451";
+chr20	protein_coding	CDS	25349	25525	.	+	2	exon_number "3"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P13601"; protein_id "ENSRNOP00000044070"; transcript_id "ENSRNOT00000049573"; transcript_name "Tmlhe"; tss_id "TSS451";
+chr20	protein_coding	CDS	25349	25525	.	+	2	exon_number "2"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P3227"; protein_id "ENSRNOP00000000956"; transcript_id "ENSRNOT00000000956"; transcript_name "TMLH_RAT"; tss_id "TSS451";
+chr20	protein_coding	exon	25349	25525	.	+	.	exon_number "3"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P13601"; transcript_id "ENSRNOT00000049573"; transcript_name "Tmlhe"; tss_id "TSS451";
+chr20	protein_coding	exon	25349	25525	.	+	.	exon_number "2"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P3227"; transcript_id "ENSRNOT00000000956"; transcript_name "TMLH_RAT"; tss_id "TSS451";
+chr20	protein_coding	CDS	35197	35476	.	+	2	exon_number "4"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P13601"; protein_id "ENSRNOP00000044070"; transcript_id "ENSRNOT00000049573"; transcript_name "Tmlhe"; tss_id "TSS451";
+chr20	protein_coding	CDS	35197	35476	.	+	2	exon_number "3"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P3227"; protein_id "ENSRNOP00000000956"; transcript_id "ENSRNOT00000000956"; transcript_name "TMLH_RAT"; tss_id "TSS451";
+chr20	protein_coding	exon	35197	35476	.	+	.	exon_number "4"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P13601"; transcript_id "ENSRNOT00000049573"; transcript_name "Tmlhe"; tss_id "TSS451";
+chr20	protein_coding	exon	35197	35476	.	+	.	exon_number "3"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P3227"; transcript_id "ENSRNOT00000000956"; transcript_name "TMLH_RAT"; tss_id "TSS451";
+chr20	protein_coding	CDS	36764	36883	.	+	1	exon_number "5"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P13601"; protein_id "ENSRNOP00000044070"; transcript_id "ENSRNOT00000049573"; transcript_name "Tmlhe"; tss_id "TSS451";
+chr20	protein_coding	CDS	36764	36883	.	+	1	exon_number "4"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P3227"; protein_id "ENSRNOP00000000956"; transcript_id "ENSRNOT00000000956"; transcript_name "TMLH_RAT"; tss_id "TSS451";
+chr20	protein_coding	exon	36764	36883	.	+	.	exon_number "5"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P13601"; transcript_id "ENSRNOT00000049573"; transcript_name "Tmlhe"; tss_id "TSS451";
+chr20	protein_coding	exon	36764	36883	.	+	.	exon_number "4"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P3227"; transcript_id "ENSRNOT00000000956"; transcript_name "TMLH_RAT"; tss_id "TSS451";
+chr20	protein_coding	CDS	49040	49276	.	+	1	exon_number "6"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P13601"; protein_id "ENSRNOP00000044070"; transcript_id "ENSRNOT00000049573"; transcript_name "Tmlhe"; tss_id "TSS451";
+chr20	protein_coding	CDS	49040	49276	.	+	1	exon_number "5"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P3227"; protein_id "ENSRNOP00000000956"; transcript_id "ENSRNOT00000000956"; transcript_name "TMLH_RAT"; tss_id "TSS451";
+chr20	protein_coding	exon	49040	49276	.	+	.	exon_number "6"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P13601"; transcript_id "ENSRNOT00000049573"; transcript_name "Tmlhe"; tss_id "TSS451";
+chr20	protein_coding	exon	49040	49276	.	+	.	exon_number "5"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P3227"; transcript_id "ENSRNOT00000000956"; transcript_name "TMLH_RAT"; tss_id "TSS451";
+chr20	protein_coding	CDS	55193	55331	.	+	1	exon_number "7"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P13601"; protein_id "ENSRNOP00000044070"; transcript_id "ENSRNOT00000049573"; transcript_name "Tmlhe"; tss_id "TSS451";
+chr20	protein_coding	CDS	55193	55331	.	+	1	exon_number "6"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P3227"; protein_id "ENSRNOP00000000956"; transcript_id "ENSRNOT00000000956"; transcript_name "TMLH_RAT"; tss_id "TSS451";
+chr20	protein_coding	exon	55193	55331	.	+	.	exon_number "7"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P13601"; transcript_id "ENSRNOT00000049573"; transcript_name "Tmlhe"; tss_id "TSS451";
+chr20	protein_coding	exon	55193	55331	.	+	.	exon_number "6"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P3227"; transcript_id "ENSRNOT00000000956"; transcript_name "TMLH_RAT"; tss_id "TSS451";
+chr20	protein_coding	CDS	55883	56011	.	+	0	exon_number "8"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P13601"; protein_id "ENSRNOP00000044070"; transcript_id "ENSRNOT00000049573"; transcript_name "Tmlhe"; tss_id "TSS451";
+chr20	protein_coding	CDS	55883	56011	.	+	0	exon_number "7"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P3227"; protein_id "ENSRNOP00000000956"; transcript_id "ENSRNOT00000000956"; transcript_name "TMLH_RAT"; tss_id "TSS451";
+chr20	protein_coding	exon	55883	56124	.	+	.	exon_number "8"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P13601"; transcript_id "ENSRNOT00000049573"; transcript_name "Tmlhe"; tss_id "TSS451";
+chr20	protein_coding	exon	55883	56124	.	+	.	exon_number "7"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P3227"; transcript_id "ENSRNOT00000000956"; transcript_name "TMLH_RAT"; tss_id "TSS451";
+chr20	protein_coding	stop_codon	56012	56014	.	+	0	exon_number "8"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P13601"; transcript_id "ENSRNOT00000049573"; transcript_name "Tmlhe"; tss_id "TSS451";
+chr20	protein_coding	stop_codon	56012	56014	.	+	0	exon_number "7"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P3227"; transcript_id "ENSRNOT00000000956"; transcript_name "TMLH_RAT"; tss_id "TSS451";
+chr20	protein_coding	exon	66518	66785	.	+	.	exon_number "1"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000728"; gene_name "Clic2"; p_id "P19357"; transcript_id "ENSRNOT00000000955"; transcript_name "Clic2"; tss_id "TSS24592";
+chr20	protein_coding	CDS	66729	66785	.	+	0	exon_number "1"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000728"; gene_name "Clic2"; p_id "P19357"; protein_id "ENSRNOP00000000955"; transcript_id "ENSRNOT00000000955"; transcript_name "Clic2"; tss_id "TSS24592";
+chr20	protein_coding	start_codon	66729	66731	.	+	0	exon_number "1"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000728"; gene_name "Clic2"; p_id "P19357"; transcript_id "ENSRNOT00000000955"; transcript_name "Clic2"; tss_id "TSS24592";
+chr20	protein_coding	CDS	75931	76040	.	+	0	exon_number "2"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000728"; gene_name "Clic2"; p_id "P19357"; protein_id "ENSRNOP00000000955"; transcript_id "ENSRNOT00000000955"; transcript_name "Clic2"; tss_id "TSS24592";
+chr20	protein_coding	exon	75931	76040	.	+	.	exon_number "2"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000728"; gene_name "Clic2"; p_id "P19357"; transcript_id "ENSRNOT00000000955"; transcript_name "Clic2"; tss_id "TSS24592";
+chr20	protein_coding	CDS	76165	76290	.	+	1	exon_number "3"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000728"; gene_name "Clic2"; p_id "P19357"; protein_id "ENSRNOP00000000955"; transcript_id "ENSRNOT00000000955"; transcript_name "Clic2"; tss_id "TSS24592";
+chr20	protein_coding	exon	76165	76290	.	+	.	exon_number "3"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000728"; gene_name "Clic2"; p_id "P19357"; transcript_id "ENSRNOT00000000955"; transcript_name "Clic2"; tss_id "TSS24592";
+chr20	protein_coding	CDS	79941	80047	.	+	1	exon_number "4"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000728"; gene_name "Clic2"; p_id "P19357"; protein_id "ENSRNOP00000000955"; transcript_id "ENSRNOT00000000955"; transcript_name "Clic2"; tss_id "TSS24592";
+chr20	protein_coding	exon	79941	80047	.	+	.	exon_number "4"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000728"; gene_name "Clic2"; p_id "P19357"; transcript_id "ENSRNOT00000000955"; transcript_name "Clic2"; tss_id "TSS24592";
+chr20	protein_coding	CDS	80692	80873	.	+	2	exon_number "5"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000728"; gene_name "Clic2"; p_id "P19357"; protein_id "ENSRNOP00000000955"; transcript_id "ENSRNOT00000000955"; transcript_name "Clic2"; tss_id "TSS24592";
+chr20	protein_coding	exon	80692	80873	.	+	.	exon_number "5"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000728"; gene_name "Clic2"; p_id "P19357"; transcript_id "ENSRNOT00000000955"; transcript_name "Clic2"; tss_id "TSS24592";
+chr20	protein_coding	CDS	81142	81294	.	+	0	exon_number "6"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000728"; gene_name "Clic2"; p_id "P19357"; protein_id "ENSRNOP00000000955"; transcript_id "ENSRNOT00000000955"; transcript_name "Clic2"; tss_id "TSS24592";
+chr20	protein_coding	exon	81142	81536	.	+	.	exon_number "6"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000728"; gene_name "Clic2"; p_id "P19357"; transcript_id "ENSRNOT00000000955"; transcript_name "Clic2"; tss_id "TSS24592";
+chr20	protein_coding	stop_codon	81295	81297	.	+	0	exon_number "6"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000728"; gene_name "Clic2"; p_id "P19357"; transcript_id "ENSRNOT00000000955"; transcript_name "Clic2"; tss_id "TSS24592";
+chr20	protein_coding	exon	92810	93748	.	-	.	exon_number "1"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000029622"; gene_name "Olr1668"; p_id "P5423"; transcript_id "ENSRNOT00000047483"; transcript_name "Olr1668"; tss_id "TSS17091";
+chr20	protein_coding	stop_codon	92810	92812	.	-	0	exon_number "1"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000029622"; gene_name "Olr1668"; p_id "P5423"; transcript_id "ENSRNOT00000047483"; transcript_name "Olr1668"; tss_id "TSS17091";
+chr20	protein_coding	CDS	92813	93748	.	-	0	exon_number "1"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000029622"; gene_name "Olr1668"; p_id "P5423"; protein_id "ENSRNOP00000042115"; transcript_id "ENSRNOT00000047483"; transcript_name "Olr1668"; tss_id "TSS17091";
+chr20	protein_coding	start_codon	93746	93748	.	-	0	exon_number "1"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000029622"; gene_name "Olr1668"; p_id "P5423"; transcript_id "ENSRNOT00000047483"; transcript_name "Olr1668"; tss_id "TSS17091";
Binary file htseq_bams_to_count_matrix/test-data/rn4chr20test1.bam has changed
Binary file htseq_bams_to_count_matrix/test-data/rn4chr20test2.bam has changed
--- a/htseq_bams_to_count_matrix/tool_dependencies.xml.notworkingcomplex	Thu Jun 06 22:14:16 2013 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,6 +0,0 @@
-<?xml version="1.0"?>
-<tool_dependency>
-    <package name="package_htseq_0_5_4" version="0.5.4">
-       <repository name="package_htseq_0_5_4" owner="fubar" toolshed="http://testtoolshed.g2.bx.psu.edu" changeset_revision="7574983a942a"/>
-    </package>
-</tool_dependency>