Mercurial > repos > fubar > htseq_bams_to_count_matrix
changeset 43:390cb852aae7 draft
Uploaded
author | fubar |
---|---|
date | Thu, 21 Nov 2013 17:39:01 -0500 (2013-11-21) |
parents | 5c783df4f31c |
children | b71da02aa36a |
files | generatetest.sh htseq_bams_to_count_matrix/generatetest.sh htseq_bams_to_count_matrix/htseqsams2mx.py htseq_bams_to_count_matrix/htseqsams2mx.xml htseq_bams_to_count_matrix/test-data/generatetest.sh htseq_bams_to_count_matrix/test-data/htseqsams2mx_test1_out.xls htseq_bams_to_count_matrix/test-data/rn4_chr20_100k.gtf htseq_bams_to_count_matrix/test-data/rn4chr20test1.bam htseq_bams_to_count_matrix/test-data/rn4chr20test2.bam htseq_bams_to_count_matrix/tool_dependencies.xml htseqsams2mx.py htseqsams2mx.xml test-data/generatetest.sh test-data/htseqsams2mx_test1_out.xls test-data/rn4_chr20_100k.gtf test-data/rn4chr20test1.bam test-data/rn4chr20test2.bam tool_dependencies.xml |
diffstat | 18 files changed, 649 insertions(+), 655 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/generatetest.sh Thu Nov 21 17:39:01 2013 -0500 @@ -0,0 +1,1 @@ +python ../htseqsams2mx.py -g rn4_chr20_100k.gtf -o test.xls --samf "'rn4chr20test1.bam','col1'" --samf "'rn4chr20test2.bam','col2'"
--- a/htseq_bams_to_count_matrix/generatetest.sh Wed Nov 20 23:21:38 2013 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1 +0,0 @@ -python ../htseqsams2mx.py -g rn4_chr20_100k.gtf -o test.xls --samf "'rn4chr20test1.bam','col1'" --samf "'rn4chr20test2.bam','col2'"
--- a/htseq_bams_to_count_matrix/htseqsams2mx.py Wed Nov 20 23:21:38 2013 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,384 +0,0 @@ -# May 2013 -# Change to htseq as the counting engine - wrap so arbitrary number of columns created -# borged Simon Anders' "count.py" since we need a vector of counts rather than a new sam file as output -# note attribution for htseq and count.py : -## Written by Simon Anders (sanders@fs.tum.de), European Molecular Biology -## Laboratory (EMBL). (c) 2010. Released under the terms of the GNU General -## Public License v3. Part of the 'HTSeq' framework, version HTSeq-0.5.4p3 -# updated ross lazarus august 2011 to NOT include region and to finesse the name as the region for bed3 format inputs -# also now sums all duplicate named regions and provides a summary of any collapsing as the info -# updated ross lazarus july 26 to respect the is_duplicate flag rather than try to second guess -# note Heng Li argues that removing dupes is a bad idea for RNA seq -# updated ross lazarus july 22 to count reads OUTSIDE each bed region during the processing of each bam -# added better sorting with decoration of a dict key later sorted and undecorated. -# code cleaned up and galaxified ross lazarus july 18 et seq. 
-# bams2mx.py -turns a series of bam and a bed file into a matrix of counts Usage bams2mx.py <halfwindow> <bedfile.bed> <bam1.bam> -# <bam2.bam> -# uses pysam to read and count bam reads over each bed interval for each sample for speed -# still not so fast -# TODO options -shift -unique -# -""" -how this gets run: - -(vgalaxy)galaxy@iaas1-int:~$ cat database/job_working_directory/027/27014/galaxy_27014.sh -#!/bin/sh -GALAXY_LIB="/data/extended/galaxy/lib" -if [ "$GALAXY_LIB" != "None" ]; then - if [ -n "$PYTHONPATH" ]; then - PYTHONPATH="$GALAXY_LIB:$PYTHONPATH" - else - PYTHONPATH="$GALAXY_LIB" - fi - export PYTHONPATH -fi - -cd /data/extended/galaxy/database/job_working_directory/027/27014 -python /data/extended/galaxy/tools/rgenetics/htseqsams2mx.py -g "/data/extended/galaxy/database/files/034/dataset_34115.dat" -o "/data/extended/galaxy/database/files/034/dataset_34124.dat" -m "union" --id_attribute "gene_id" --feature_type "exon" --samf "'/data/extended/galaxy/database/files/033/dataset_33980.dat','T5A_C1PPHACXX_AGTTCC_L003_R1.fastq_bwa.sam'" --samf "'/data/extended/galaxy/database/files/033/dataset_33975.dat','T5A_C1PPHACXX_AGTTCC_L002_R1.fastq_bwa.sam'"; cd /data/extended/galaxy; /data/extended/galaxy/set_metadata.sh ./database/files /data/extended/galaxy/database/job_working_directory/027/27014 . 
/data/extended/galaxy/universe_wsgi.ini /data/tmp/tmpmwsElH /data/extended/galaxy/database/job_working_directory/027/27014/galaxy.json /data/extended/galaxy/database/job_working_directory/027/27014/metadata_in_HistoryDatasetAssociation_45202_sfOMGa,/data/extended/galaxy/database/job_working_directory/027/27014/metadata_kwds_HistoryDatasetAssociation_45202_gaMnxa,/data/extended/galaxy/database/job_working_directory/027/27014/metadata_out_HistoryDatasetAssociation_45202_kZPsZO,/data/extended/galaxy/database/job_working_directory/027/27014/metadata_results_HistoryDatasetAssociation_45202_bXU7IU,,/data/extended/galaxy/database/job_working_directory/027/27014/metadata_override_HistoryDatasetAssociation_45202_hyLAvh -echo $? > /data/extended/galaxy/database/job_working_directory/027/27014/galaxy_27014.ec - -""" - -import os -import re -import sys -import HTSeq.scripts.count as htcount -import optparse -import tempfile -import shutil -import operator -import subprocess -import itertools -import warnings -import traceback -import HTSeq -import time - - -class Xcpt(Exception): - def __init__(self, msg): - self.msg = msg - - -def htseqMX(gff_filename,sam_filenames,colnames,sam_exts,sam_bais,opts): - """ - Code taken from count.py in Simon Anders HTSeq distribution - Wrapped in a loop to accept multiple bam/sam files and their names from galaxy to - produce a matrix of contig counts by sample for downstream use in edgeR and DESeq tools - """ - class UnknownChrom( Exception ): - pass - - def my_showwarning( message, category, filename, lineno = None, line = None ): - sys.stdout.write( "Warning: %s\n" % message ) - - def invert_strand( iv ): - iv2 = iv.copy() - if iv2.strand == "+": - iv2.strand = "-" - elif iv2.strand == "-": - iv2.strand = "+" - else: - raise ValueError, "Illegal strand" - return iv2 - - def count_reads_in_features( sam_filenames, colnames, gff_filename, opts ): - """ Hacked version of htseq count.py - """ - if opts.quiet: - warnings.filterwarnings( 
action="ignore", module="HTSeq" ) - features = HTSeq.GenomicArrayOfSets( "auto", opts.stranded != "no" ) - mapqMin = int(opts.mapqMin) - counts = {} - nreads = 0 - empty = 0 - ambiguous = 0 - notaligned = 0 - lowqual = 0 - nonunique = 0 - filtered = 0 # new filter_extras - need a better way to do this - independent filter tool? - gff = HTSeq.GFF_Reader( gff_filename ) - try: - for i,f in enumerate(gff): - if f.type == opts.feature_type: - try: - feature_id = f.attr[ opts.id_attribute ] - except KeyError: - try: - feature_id = f.attr[ 'gene_id' ] - except KeyError: - sys.exit( "Feature at row %d %s does not contain a '%s' attribute OR a gene_id attribute - faulty GFF?" % - ( (i+1), f.name, opts.id_attribute ) ) - if opts.stranded != "no" and f.iv.strand == ".": - sys.exit( "Feature %s at %s does not have strand information but you are " - "running htseq-count in stranded mode. Use '--stranded=no'." % - ( f.name, f.iv ) ) - features[ f.iv ] += feature_id - counts[ feature_id ] = [0 for x in colnames] # we use sami as an index here to bump counts later - except: - sys.stderr.write( "Error occured in %s.\n" % gff.get_line_number_string() ) - raise - - if not opts.quiet: - sys.stdout.write( "%d GFF lines processed.\n" % i ) - - if len( counts ) == 0 and not opts.quiet: - sys.stdout.write( "Warning: No features of type '%s' found.\n" % opts.feature_type ) - for sami,sam_filename in enumerate(sam_filenames): - colname = colnames[sami] - isbam = sam_exts[sami] == 'bam' - hasbai = sam_bais[sami] > '' - if hasbai: - tempname = os.path.splitext(os.path.basename(sam_filename))[0] - tempbam = '%s.bam' % tempname - tempbai = '%s.bai' % tempname - os.link(sam_filename,tempbam) - os.link(sam_bais[sami],tempbai) - try: - if isbam: - if hasbai: - read_seq = HTSeq.BAM_Reader ( tempbam ) - else: - read_seq = HTSeq.BAM_Reader( sam_filename ) - else: - read_seq = HTSeq.SAM_Reader( sam_filename ) - first_read = iter(read_seq).next() - pe_mode = first_read.paired_end - except: - if isbam: 
- print >> sys.stderr, "Error occured when reading first line of bam file %s colname=%s \n" % (sam_filename,colname ) - else: - print >> sys.stderr, "Error occured when reading first line of sam file %s colname=%s \n" % (sam_filename,colname ) - raise - - try: - if pe_mode: - read_seq_pe_file = read_seq - read_seq = HTSeq.pair_SAM_alignments( read_seq ) - for seqi,r in enumerate(read_seq): - nreads += 1 - if not pe_mode: - if not r.aligned: - notaligned += 1 - continue - try: - if len(opts.filter_extras) > 0: - for extra in opts.filter_extras: - if r.optional_field(extra): - filtered += 1 - continue - if r.optional_field( "NH" ) > 1: - nonunique += 1 - continue - except KeyError: - pass - if r.aQual < mapqMin: - lowqual += 1 - continue - if opts.stranded != "reverse": - iv_seq = ( co.ref_iv for co in r.cigar if co.type == "M" and co.size > 0 ) - else: - iv_seq = ( invert_strand( co.ref_iv ) for co in r.cigar if co.type == "M" and co.size > 0 ) - else: - if r[0] is not None and r[0].aligned: - if opts.stranded != "reverse": - iv_seq = ( co.ref_iv for co in r[0].cigar if co.type == "M" and co.size > 0 ) - else: - iv_seq = ( invert_strand( co.ref_iv ) for co in r[0].cigar if co.type == "M" and co.size > 0 ) - else: - iv_seq = tuple() - if r[1] is not None and r[1].aligned: - if opts.stranded != "reverse": - iv_seq = itertools.chain( iv_seq, - ( invert_strand( co.ref_iv ) for co in r[1].cigar if co.type == "M" and co.size > 0 ) ) - else: - iv_seq = itertools.chain( iv_seq, - ( co.ref_iv for co in r[1].cigar if co.type == "M" and co.size > 0 ) ) - else: - if ( r[0] is None ) or not ( r[0].aligned ): - notaligned += 1 - continue - try: - if ( r[0] is not None and r[0].optional_field( "NH" ) > 1 ) or \ - ( r[1] is not None and r[1].optional_field( "NH" ) > 1 ): - nonunique += 1 - continue - except KeyError: - pass - if ( r[0] and r[0].aQual < mapqMin ) or ( r[1] and r[1].aQual < mapqMin ): - lowqual += 1 - continue - - try: - if opts.mode == "union": - fs = set() - for iv 
in iv_seq: - if iv.chrom not in features.chrom_vectors: - raise UnknownChrom - for iv2, fs2 in features[ iv ].steps(): - fs = fs.union( fs2 ) - elif opts.mode == "intersection-strict" or opts.mode == "intersection-nonempty": - fs = None - for iv in iv_seq: - if iv.chrom not in features.chrom_vectors: - raise UnknownChrom - for iv2, fs2 in features[ iv ].steps(): - if len(fs2) > 0 or opts.mode == "intersection-strict": - if fs is None: - fs = fs2.copy() - else: - fs = fs.intersection( fs2 ) - else: - sys.exit( "Illegal overlap mode %s" % opts.mode ) - if fs is None or len( fs ) == 0: - empty += 1 - elif len( fs ) > 1: - ambiguous += 1 - else: - ck = list(fs)[0] - counts[ck][sami] += 1 # end up with counts for each sample as a list - except UnknownChrom: - if not pe_mode: - rr = r - else: - rr = r[0] if r[0] is not None else r[1] - empty += 1 - if not opts.quiet: - sys.stdout.write( ( "Warning: Skipping read '%s', because chromosome " + - "'%s', to which it has been aligned, did not appear in the GFF file.\n" ) % - ( rr.read.name, iv.chrom ) ) - except: - if not pe_mode: - sys.stderr.write( "Error occured in %s.\n" % read_seq.get_line_number_string() ) - else: - sys.stderr.write( "Error occured in %s.\n" % read_seq_pe_file.get_line_number_string() ) - raise - - if not opts.quiet: - sys.stdout.write( "%d sam %s processed for %s.\n" % ( seqi, "lines " if not pe_mode else "line pairs", colname ) ) - return counts,empty,ambiguous,lowqual,notaligned,nonunique,filtered,nreads - - warnings.showwarning = my_showwarning - assert os.path.isfile(gff_filename),'## unable to open supplied gff file %s' % gff_filename - try: - counts,empty,ambiguous,lowqual,notaligned,nonunique,filtered,nreads = count_reads_in_features( sam_filenames, colnames, gff_filename,opts) - except: - sys.stderr.write( "Error: %s\n" % str( sys.exc_info()[1] ) ) - sys.stderr.write( "[Exception type: %s, raised in %s:%d]\n" % - ( sys.exc_info()[1].__class__.__name__, - os.path.basename(traceback.extract_tb( 
sys.exc_info()[2] )[-1][0]), - traceback.extract_tb( sys.exc_info()[2] )[-1][1] ) ) - sys.exit( 1 ) - return counts,empty,ambiguous,lowqual,notaligned,nonunique,filtered,nreads - - -def usage(): - print >> sys.stdout, """Usage: python htseqsams2mx.py -w <halfwindowsize> -g <gfffile.gff> -o <outfilename> [-i] [-c] --samf "<sam1.sam>,<sam1.column_header>" --samf "...<samN.column_header>" """ - sys.exit(1) - -if __name__ == "__main__": - """ - <command interpreter="python"> - htseqsams2mx.py -w "$halfwin" -g "$gfffile" -o "$outfile" -m "union" - #for $s in $samfiles: - --samf "'${s.samf}','${s.samf.name}'" - #end for - </command> - """ - if len(sys.argv) < 2: - usage() - sys.exit(1) - starttime = time.time() - op = optparse.OptionParser() - # All tools - op.add_option('-w', '--halfwindow', default="0") - op.add_option('-m', '--mode', default="union") - op.add_option('-s', '--stranded', default="no") - op.add_option('-y', '--feature_type', default="exon") - op.add_option('-g', '--gff_file', default=None) - op.add_option('-o', '--outfname', default=None) - op.add_option('-f','--forceName', default="false") - op.add_option('--samf', default=[], action="append") - op.add_option('--filter_extras', default=[], action="append") - op.add_option('--mapqMin', default='0') - op.add_option( "-t", "--type", type="string", dest="featuretype", - default = "exon", help = "feature type (3rd column in GFF file) to be used, " + - "all features of other type are ignored (default, suitable for Ensembl " + - "GTF files: exon)" ) - - op.add_option( "-i", "--id_attribute", type="string", dest="id_attribute", - default = "gene_name", help = "GTF attribute to be used as feature ID (default, " + - "suitable for Ensembl GTF files: gene_id)" ) - - op.add_option( "-q", "--quiet", action="store_true", dest="quiet", default = False, - help = "suppress progress report and warnings" ) - opts, args = op.parse_args() - halfwindow = int(opts.halfwindow) - gff_file = opts.gff_file - assert 
os.path.isfile(gff_file),'##ERROR htseqsams2mx: Supplied input GFF file "%s" not found' % gff_file - outfname = opts.outfname - sam_filenames = [] - colnames = [] - samf = opts.samf - samfsplit = [x.split(',') for x in samf] # one per samf set - samsets = [] - for samfs in samfsplit: - samset = [x.replace("'","") for x in samfs] - samset = [x.replace('"','') for x in samset] - samsets.append(samset) - samsets = [x for x in samsets if x[0].lower() != 'none'] - # just cannot stop getting these on cl! wtf in cheetah for a repeat group? - samfnames = [x[0] for x in samsets] - if len(set(samfnames)) != len(samfnames): - samnames = [] - delme = [] - for i,s in enumerate(samfnames): - if s in samnames: - delme.append(i) - print sys.stdout,'## WARNING htseqsams2mx: Duplicate input sam file %s in %s - ignoring dupe in 0 based position %s' %\ - (s,','.join(samfnames), str(delme)) - else: - samnames.append(s) # first time - samsets = [x for i,x in enumerate(samsets) if not (i in delme)] - samfnames = [x[0] for x in samsets] - scolnames = [x[1]for x in samsets] - assert len(samfnames) == len(scolnames), '##ERROR sams2mx: Count of sam/cname not consistent - %d/%d' % (len(samfnames),len(scolnames)) - sam_exts = [x[2] for x in samsets] - assert len(samfnames) == len(sam_exts), '##ERROR sams2mx: Count of extensions not consistent - %d/%d' % (len(samfnames),len(sam_exts)) - sam_bais = [x[3] for x in samsets] # these only exist for bams and need to be finessed with a symlink so pysam will just work - for i,b in enumerate(samfnames): - assert os.path.isfile(b),'## Supplied input sam file "%s" not found' % b - sam_filenames.append(b) - sampName = scolnames[i] # better be unique - sampName = sampName.replace('#','') # for R - sampName = sampName.replace('(','') # for R - sampName = sampName.replace(')','') # for R - sampName = sampName.replace(' ','_') # for R - colnames.append(sampName) - counts,empty,ambiguous,lowqual,notaligned,nonunique,filtered,nreads = htseqMX(gff_file, 
sam_filenames,colnames,sam_exts,sam_bais,opts) - heads = '\t'.join(['Contig',] + colnames) - res = [heads,] - contigs = counts.keys() - contigs.sort() - totalc = 0 - emptycontigs = 0 - for contig in contigs: - thisc = sum(counts[contig]) - if thisc > 0: # no output for empty contigs - totalc += thisc - crow = [contig,] + ['%d' % x for x in counts[contig]] - res.append('\t'.join(crow)) - else: - emptycontigs += 1 - outf = open(opts.outfname,'w') - outf.write('\n'.join(res)) - outf.write('\n') - outf.close() - walltime = int(time.time() - starttime) - accumulatornames = ('walltime (seconds)','total reads read','total reads counted','number of contigs','total empty reads','total ambiguous reads','total low quality reads', - 'total not aligned reads','total not unique mapping reads','extra filtered reads','empty contigs') - accums = (walltime,nreads,totalc,len(contigs),empty,ambiguous,lowqual,notaligned,nonunique,filtered,emptycontigs) - fracs = (1.0,1.0,float(totalc)/nreads,1.0,float(empty)/nreads,float(ambiguous)/nreads,float(lowqual)/nreads,float(notaligned)/nreads,float(nonunique)/nreads,float(filtered)/nreads,float(emptycontigs)/len(contigs)) - notes = ['%s = %d (%2.3f)' % (accumulatornames[i],x,100.0*fracs[i]) for i,x in enumerate(accums)] - print >> sys.stdout, '\n'.join(notes) - sys.exit(0)
--- a/htseq_bams_to_count_matrix/htseqsams2mx.xml Wed Nov 20 23:21:38 2013 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,149 +0,0 @@ -<tool id="htseqsams2mxlocal" name="SAM/BAM to count matrix" version="0.4"> - <description>using HTSeq code</description> - <stdio> - <regex match=".*" source="both" level="warning" description="chatter from HTSeq:"/> - </stdio> - <requirements> - <requirement type="package" version="1.7">numpy</requirement> - <requirement type="package" version="0.7.6">pysam</requirement> - <requirement type="package" version="2.4">freetype</requirement> - <requirement type="package" version="1.2">matplotlib</requirement> - <requirement type="package" version="0.5.4p3">htseq</requirement> - </requirements> - <command interpreter="python"> - htseqsams2mx.py -g "$gfffile" -o "$outfile" -m "$model" --id_attribute "$id_attr" --feature_type "$feature_type" - --mapqMin $mapqMin --samf "'${firstsamf}','${firstsamf.name}','${firstsamf.ext}','${firstsamf.metadata.bam_index}'" - #if $secondsamf.ext != 'data': - --samf "'${secondsamf}','${secondsamf.name}','${secondsamf.ext}','${secondsamf.metadata.bam_index}'" - #end if - #for $s in $samfiles: - #if $s.samf.ext != 'data': - --samf "'${s.samf}','${s.samf.name}','${s.samf.ext}','${s.samf.metadata.bam_index}'" - #end if - #end for - #if $filter_extras: - --filter_extras "$filter_extras" - #end if - </command> - <inputs> - <param format="gtf" name="gfffile" type="data" label="Gene model (GFF) file to count reads over from your current history" size="100" /> - <param name="mapqMin" label="Filter reads with mapq below than this value" - help="0 to count any mapping quality read. 
Otherwise only reads at or above specified mapq will be counted" - type="integer" value="5"/> - <param name="title" label="Name for this job's output file" type="text" size="80" value="bams to DGE count matrix"/> - <param name="stranded" value="false" type="boolean" label="Reads are stranded - use strand in counting" display="checkbox" - truevalue="yes" falsevalue="no" checked="no" help="Check this ONLY if you know your sequences are strand specific" /> - <param name="model" type="select" label="Model for counting reads over the supplied gene model- see HTSeq docs" - help="If in doubt, union is a reasonable default but intersection-strict avoids double counting over overlapping exons"> - <option value="union" selected="true">union</option> - <option value="intersection-strict">intersection-strict</option> - <option value="intersection-nonempty">intersection-nonempty</option> - </param> - <param name="id_attr" type="select" label="GTF attribute to output as the name for each contig - see HTSeq docs" - help="If in doubt, use gene name or if you need the id in your GTF, gene id"> - <option value="gene_name" selected="true">gene name</option> - <option value="gene_id">gene id</option> - <option value="transcript_id">transcript id</option> - <option value="transcript_name">transcript name</option> - </param> - <param name="feature_type" type="select" label="GTF feature type for counting reads over the supplied gene model- see HTSeq docs" - help="GTF feature type to count over - exon is a good choice with gene name as the contig to count over"> - <option value="exon" selected="true">exon</option> - <option value="CDS">CDS</option> - <option value="UTR">UTR</option> - <option value="transcript">transcript</option> - </param> - <param name="filter_extras" type="select" label="Filter any read with one or more flags" - help="eg the XS tag created by bowtie for multiple reads" optional="true" mutliple="true"> - <option value="">None</option> - <option value="XS">XS:i > 0 - 
More than one mapping position Bowtie</option> - <option value="XS:A">Might be useful for tophat</option> - </param> - - <param name="firstsamf" type="data" label="bam/sam file from your history" format="sam,bam" size="100" - help="Each sam/bam contributes a column of read counts overlapping the specified gene model contigs" - optional="false"/> - <param name="secondsamf" type="data" label="Additional bam/sam file from your history" format="sam,bam" size="100" - help="Each sam/bam contributes a column of read counts overlapping the specified gene model contigs" - optional="false"/> - <repeat name="samfiles" min="16" - title="Specify additional bam/sam file inputs" help="Each sam/bam contributes a column of read counts overlapping the specified gene model contigs"> - <param name="samf" type="data" label="Additional bam/sam file from your history" format="sam,bam" size="100" - optional="true"/> - </repeat> - </inputs> - <outputs> - <data format="tabular" name="outfile" label="${title}_htseqsams2mx.xls" /> - </outputs> - <tests> - <test> - <param name="feature_type" value="exon" /> - <param name="gfffile" value="rn4_chr20_100k.gtf" /> - <param name="firstsamf" value="rn4chr20test1.bam" ftype="bam"/> - <param name="secondsamf" value="rn4chr20test2.bam" ftype="bam"/> - <param name="id_attr" value="gene_name" /> - <param name="model" value="union" /> - <param name="stranded" value="no" /> - <param name="title" value="htseqtest" /> - <param name="mapqMin" value="0" /> - - <output name="outfile" file="htseqsams2mx_test1_out.xls" lines_diff="1"/> - </test> - </tests> - <help> - -**What this tool does** - -Counts reads in multiple sam/bam format mapped files and generates a matrix ideal for edgeR and other count based tools -It uses HTSeq to count your sam reads over a gene model supplied as a GTF file -The output is a tabular text (columnar - spreadsheet) file containing the -count matrix for downstream processing. 
Each row contains the counts from each sample for each -of the non-empty GTF input file contigs matching the GTF attribute choice above. -You probably want to use gene level GTF output attribute and count reads that overlap -GTF exons for RNA-seq. Or you can count over exons by using transcript level output names or ids. Etc. - ----- - -**Author's plea on replicates** - -If you want to interpret the downstream p values in terms of rejecting or accepting the null hypothesis -under random sampling with replacement from the universe of possible biological/experimental replicates from which your data was derived, -which is what published p values are often assumed to do, then you need biological -(or for cell culture material experimental) replicates. - -Using technical or no replicates means the downstream p values are not interpretable the way most people would assume -they are - i.e. as the probability of obtaining a result as extreme as, or more extreme than, your experimental data -in millions of experiments conducted using the same methods under the null hypothesis. - -There is no way around this and it is scientific fraud to ignore this issue and publish bogus p values derived from -technical or no replicates without making the lack of biological or experimental error in the p value calculations -clear to your readers so they can adjust their expectations. However, the buck stops here at higher level inference. -If you have no replicates, you must not use this tool as the p values are uninterpretable. So there. - -See your stats 101 notes on the central limit theorem and test statistics for a refresher or talk to a -statistician if this makes no sense please. - -**Attribution** - -This Galaxy tool relies on HTSeq_ from http://www-huber.embl.de/users/anders/HTSeq/doc/index.html -for the tricky work of counting. That code includes the following attribution: - -## Written by Simon Anders (sanders@fs.tum.de), European Molecular Biology -## Laboratory (EMBL). (c) 2010.
Released under the terms of the GNU General -## Public License v3. Part of the 'HTSeq' framework, version HTSeq-0.5.4p3 - -It will be automatically installed if you use the toolshed as in general, you probably should. -HTSeq_ must be installed with this tool if you install manually. - -Otherwise, all code and documentation comprising this tool including the requirement -for more than one sample bam -was written by Ross Lazarus and is -licensed to you under the LGPL_ like other rgenetics artefacts - -Sorry, I don't use readgroups so had no reason to code read groups. Contributions welcome. Send code - -.. _LGPL: http://www.gnu.org/copyleft/lesser.html -.. _HTSeq: http://www-huber.embl.de/users/anders/HTSeq/doc/index.html - </help> - -</tool>
--- a/htseq_bams_to_count_matrix/test-data/generatetest.sh Wed Nov 20 23:21:38 2013 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,2 +0,0 @@ -#python ../htseqsams2mx.py -g rn4_chr20_100k.gtf -o test.xls --samf "'rn4chr20test1.bam','col1'" --samf "'rn4chr20test2.bam','col2'" -python ../htseqsams2mx.py -g rn4_chr20_100k.gtf -o htseqsams2mx_test1_out.xls --samf "'rn4chr20test1.bam',''" --samf "'rn4chr20test2.bam',''"
--- a/htseq_bams_to_count_matrix/test-data/htseqsams2mx_test1_out.xls Wed Nov 20 23:21:38 2013 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,4 +0,0 @@ -Contig rn4chr20test1.bam rn4chr20test2.bam -Clic2 494 944 -F1M7K0_RAT 3 2 -Tmlhe 164 172
--- a/htseq_bams_to_count_matrix/test-data/rn4_chr20_100k.gtf Wed Nov 20 23:21:38 2013 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,62 +0,0 @@ -chr20 protein_coding CDS 801 1238 . + 0 exon_number "1"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000730"; gene_name "F1M7K0_RAT"; p_id "P14715"; protein_id "ENSRNOP00000000957"; transcript_id "ENSRNOT00000000957"; transcript_name "F1M7K0_RAT"; tss_id "TSS11562"; -chr20 protein_coding exon 801 1238 . + . exon_number "1"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000730"; gene_name "F1M7K0_RAT"; p_id "P14715"; transcript_id "ENSRNOT00000000957"; transcript_name "F1M7K0_RAT"; tss_id "TSS11562"; -chr20 protein_coding CDS 1742 1976 . + 0 exon_number "2"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000730"; gene_name "F1M7K0_RAT"; p_id "P14715"; protein_id "ENSRNOP00000000957"; transcript_id "ENSRNOT00000000957"; transcript_name "F1M7K0_RAT"; tss_id "TSS11562"; -chr20 protein_coding exon 1742 1976 . + . exon_number "2"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000730"; gene_name "F1M7K0_RAT"; p_id "P14715"; transcript_id "ENSRNOT00000000957"; transcript_name "F1M7K0_RAT"; tss_id "TSS11562"; -chr20 protein_coding CDS 2016 2177 . + 2 exon_number "3"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000730"; gene_name "F1M7K0_RAT"; p_id "P14715"; protein_id "ENSRNOP00000000957"; transcript_id "ENSRNOT00000000957"; transcript_name "F1M7K0_RAT"; tss_id "TSS11562"; -chr20 protein_coding exon 2016 2177 . + . exon_number "3"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000730"; gene_name "F1M7K0_RAT"; p_id "P14715"; transcript_id "ENSRNOT00000000957"; transcript_name "F1M7K0_RAT"; tss_id "TSS11562"; -chr20 protein_coding CDS 2263 2342 . 
+ 2 exon_number "4"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000730"; gene_name "F1M7K0_RAT"; p_id "P14715"; protein_id "ENSRNOP00000000957"; transcript_id "ENSRNOT00000000957"; transcript_name "F1M7K0_RAT"; tss_id "TSS11562"; -chr20 protein_coding exon 2263 2342 . + . exon_number "4"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000730"; gene_name "F1M7K0_RAT"; p_id "P14715"; transcript_id "ENSRNOT00000000957"; transcript_name "F1M7K0_RAT"; tss_id "TSS11562"; -chr20 protein_coding CDS 2345 2533 . + 0 exon_number "5"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000730"; gene_name "F1M7K0_RAT"; p_id "P14715"; protein_id "ENSRNOP00000000957"; transcript_id "ENSRNOT00000000957"; transcript_name "F1M7K0_RAT"; tss_id "TSS11562"; -chr20 protein_coding exon 2345 2533 . + . exon_number "5"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000730"; gene_name "F1M7K0_RAT"; p_id "P14715"; transcript_id "ENSRNOT00000000957"; transcript_name "F1M7K0_RAT"; tss_id "TSS11562"; -chr20 protein_coding CDS 19528 19708 . + 0 exon_number "1"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P13601"; protein_id "ENSRNOP00000044070"; transcript_id "ENSRNOT00000049573"; transcript_name "Tmlhe"; tss_id "TSS451"; -chr20 protein_coding CDS 19528 19708 . + 0 exon_number "1"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P3227"; protein_id "ENSRNOP00000000956"; transcript_id "ENSRNOT00000000956"; transcript_name "TMLH_RAT"; tss_id "TSS451"; -chr20 protein_coding exon 19528 19708 . + . exon_number "1"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P13601"; transcript_id "ENSRNOT00000049573"; transcript_name "Tmlhe"; tss_id "TSS451"; -chr20 protein_coding exon 19528 19708 . + . 
exon_number "1"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P3227"; transcript_id "ENSRNOT00000000956"; transcript_name "TMLH_RAT"; tss_id "TSS451"; -chr20 protein_coding start_codon 19528 19530 . + 0 exon_number "1"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P13601"; transcript_id "ENSRNOT00000049573"; transcript_name "Tmlhe"; tss_id "TSS451"; -chr20 protein_coding start_codon 19528 19530 . + 0 exon_number "1"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P3227"; transcript_id "ENSRNOT00000000956"; transcript_name "TMLH_RAT"; tss_id "TSS451"; -chr20 protein_coding CDS 21979 22014 . + 2 exon_number "2"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P13601"; protein_id "ENSRNOP00000044070"; transcript_id "ENSRNOT00000049573"; transcript_name "Tmlhe"; tss_id "TSS451"; -chr20 protein_coding exon 21979 22014 . + . exon_number "2"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P13601"; transcript_id "ENSRNOT00000049573"; transcript_name "Tmlhe"; tss_id "TSS451"; -chr20 protein_coding CDS 25349 25525 . + 2 exon_number "3"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P13601"; protein_id "ENSRNOP00000044070"; transcript_id "ENSRNOT00000049573"; transcript_name "Tmlhe"; tss_id "TSS451"; -chr20 protein_coding CDS 25349 25525 . + 2 exon_number "2"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P3227"; protein_id "ENSRNOP00000000956"; transcript_id "ENSRNOT00000000956"; transcript_name "TMLH_RAT"; tss_id "TSS451"; -chr20 protein_coding exon 25349 25525 . + . exon_number "3"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P13601"; transcript_id "ENSRNOT00000049573"; transcript_name "Tmlhe"; tss_id "TSS451"; -chr20 protein_coding exon 25349 25525 . + . 
exon_number "2"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P3227"; transcript_id "ENSRNOT00000000956"; transcript_name "TMLH_RAT"; tss_id "TSS451"; -chr20 protein_coding CDS 35197 35476 . + 2 exon_number "4"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P13601"; protein_id "ENSRNOP00000044070"; transcript_id "ENSRNOT00000049573"; transcript_name "Tmlhe"; tss_id "TSS451"; -chr20 protein_coding CDS 35197 35476 . + 2 exon_number "3"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P3227"; protein_id "ENSRNOP00000000956"; transcript_id "ENSRNOT00000000956"; transcript_name "TMLH_RAT"; tss_id "TSS451"; -chr20 protein_coding exon 35197 35476 . + . exon_number "4"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P13601"; transcript_id "ENSRNOT00000049573"; transcript_name "Tmlhe"; tss_id "TSS451"; -chr20 protein_coding exon 35197 35476 . + . exon_number "3"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P3227"; transcript_id "ENSRNOT00000000956"; transcript_name "TMLH_RAT"; tss_id "TSS451"; -chr20 protein_coding CDS 36764 36883 . + 1 exon_number "5"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P13601"; protein_id "ENSRNOP00000044070"; transcript_id "ENSRNOT00000049573"; transcript_name "Tmlhe"; tss_id "TSS451"; -chr20 protein_coding CDS 36764 36883 . + 1 exon_number "4"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P3227"; protein_id "ENSRNOP00000000956"; transcript_id "ENSRNOT00000000956"; transcript_name "TMLH_RAT"; tss_id "TSS451"; -chr20 protein_coding exon 36764 36883 . + . 
exon_number "5"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P13601"; transcript_id "ENSRNOT00000049573"; transcript_name "Tmlhe"; tss_id "TSS451"; -chr20 protein_coding exon 36764 36883 . + . exon_number "4"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P3227"; transcript_id "ENSRNOT00000000956"; transcript_name "TMLH_RAT"; tss_id "TSS451"; -chr20 protein_coding CDS 49040 49276 . + 1 exon_number "6"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P13601"; protein_id "ENSRNOP00000044070"; transcript_id "ENSRNOT00000049573"; transcript_name "Tmlhe"; tss_id "TSS451"; -chr20 protein_coding CDS 49040 49276 . + 1 exon_number "5"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P3227"; protein_id "ENSRNOP00000000956"; transcript_id "ENSRNOT00000000956"; transcript_name "TMLH_RAT"; tss_id "TSS451"; -chr20 protein_coding exon 49040 49276 . + . exon_number "6"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P13601"; transcript_id "ENSRNOT00000049573"; transcript_name "Tmlhe"; tss_id "TSS451"; -chr20 protein_coding exon 49040 49276 . + . exon_number "5"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P3227"; transcript_id "ENSRNOT00000000956"; transcript_name "TMLH_RAT"; tss_id "TSS451"; -chr20 protein_coding CDS 55193 55331 . + 1 exon_number "7"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P13601"; protein_id "ENSRNOP00000044070"; transcript_id "ENSRNOT00000049573"; transcript_name "Tmlhe"; tss_id "TSS451"; -chr20 protein_coding CDS 55193 55331 . 
+ 1 exon_number "6"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P3227"; protein_id "ENSRNOP00000000956"; transcript_id "ENSRNOT00000000956"; transcript_name "TMLH_RAT"; tss_id "TSS451"; -chr20 protein_coding exon 55193 55331 . + . exon_number "7"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P13601"; transcript_id "ENSRNOT00000049573"; transcript_name "Tmlhe"; tss_id "TSS451"; -chr20 protein_coding exon 55193 55331 . + . exon_number "6"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P3227"; transcript_id "ENSRNOT00000000956"; transcript_name "TMLH_RAT"; tss_id "TSS451"; -chr20 protein_coding CDS 55883 56011 . + 0 exon_number "8"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P13601"; protein_id "ENSRNOP00000044070"; transcript_id "ENSRNOT00000049573"; transcript_name "Tmlhe"; tss_id "TSS451"; -chr20 protein_coding CDS 55883 56011 . + 0 exon_number "7"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P3227"; protein_id "ENSRNOP00000000956"; transcript_id "ENSRNOT00000000956"; transcript_name "TMLH_RAT"; tss_id "TSS451"; -chr20 protein_coding exon 55883 56124 . + . exon_number "8"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P13601"; transcript_id "ENSRNOT00000049573"; transcript_name "Tmlhe"; tss_id "TSS451"; -chr20 protein_coding exon 55883 56124 . + . exon_number "7"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P3227"; transcript_id "ENSRNOT00000000956"; transcript_name "TMLH_RAT"; tss_id "TSS451"; -chr20 protein_coding stop_codon 56012 56014 . + 0 exon_number "8"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P13601"; transcript_id "ENSRNOT00000049573"; transcript_name "Tmlhe"; tss_id "TSS451"; -chr20 protein_coding stop_codon 56012 56014 . 
+ 0 exon_number "7"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P3227"; transcript_id "ENSRNOT00000000956"; transcript_name "TMLH_RAT"; tss_id "TSS451"; -chr20 protein_coding exon 66518 66785 . + . exon_number "1"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000728"; gene_name "Clic2"; p_id "P19357"; transcript_id "ENSRNOT00000000955"; transcript_name "Clic2"; tss_id "TSS24592"; -chr20 protein_coding CDS 66729 66785 . + 0 exon_number "1"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000728"; gene_name "Clic2"; p_id "P19357"; protein_id "ENSRNOP00000000955"; transcript_id "ENSRNOT00000000955"; transcript_name "Clic2"; tss_id "TSS24592"; -chr20 protein_coding start_codon 66729 66731 . + 0 exon_number "1"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000728"; gene_name "Clic2"; p_id "P19357"; transcript_id "ENSRNOT00000000955"; transcript_name "Clic2"; tss_id "TSS24592"; -chr20 protein_coding CDS 75931 76040 . + 0 exon_number "2"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000728"; gene_name "Clic2"; p_id "P19357"; protein_id "ENSRNOP00000000955"; transcript_id "ENSRNOT00000000955"; transcript_name "Clic2"; tss_id "TSS24592"; -chr20 protein_coding exon 75931 76040 . + . exon_number "2"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000728"; gene_name "Clic2"; p_id "P19357"; transcript_id "ENSRNOT00000000955"; transcript_name "Clic2"; tss_id "TSS24592"; -chr20 protein_coding CDS 76165 76290 . + 1 exon_number "3"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000728"; gene_name "Clic2"; p_id "P19357"; protein_id "ENSRNOP00000000955"; transcript_id "ENSRNOT00000000955"; transcript_name "Clic2"; tss_id "TSS24592"; -chr20 protein_coding exon 76165 76290 . + . exon_number "3"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000728"; gene_name "Clic2"; p_id "P19357"; transcript_id "ENSRNOT00000000955"; transcript_name "Clic2"; tss_id "TSS24592"; -chr20 protein_coding CDS 79941 80047 . 
+ 1 exon_number "4"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000728"; gene_name "Clic2"; p_id "P19357"; protein_id "ENSRNOP00000000955"; transcript_id "ENSRNOT00000000955"; transcript_name "Clic2"; tss_id "TSS24592"; -chr20 protein_coding exon 79941 80047 . + . exon_number "4"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000728"; gene_name "Clic2"; p_id "P19357"; transcript_id "ENSRNOT00000000955"; transcript_name "Clic2"; tss_id "TSS24592"; -chr20 protein_coding CDS 80692 80873 . + 2 exon_number "5"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000728"; gene_name "Clic2"; p_id "P19357"; protein_id "ENSRNOP00000000955"; transcript_id "ENSRNOT00000000955"; transcript_name "Clic2"; tss_id "TSS24592"; -chr20 protein_coding exon 80692 80873 . + . exon_number "5"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000728"; gene_name "Clic2"; p_id "P19357"; transcript_id "ENSRNOT00000000955"; transcript_name "Clic2"; tss_id "TSS24592"; -chr20 protein_coding CDS 81142 81294 . + 0 exon_number "6"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000728"; gene_name "Clic2"; p_id "P19357"; protein_id "ENSRNOP00000000955"; transcript_id "ENSRNOT00000000955"; transcript_name "Clic2"; tss_id "TSS24592"; -chr20 protein_coding exon 81142 81536 . + . exon_number "6"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000728"; gene_name "Clic2"; p_id "P19357"; transcript_id "ENSRNOT00000000955"; transcript_name "Clic2"; tss_id "TSS24592"; -chr20 protein_coding stop_codon 81295 81297 . + 0 exon_number "6"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000728"; gene_name "Clic2"; p_id "P19357"; transcript_id "ENSRNOT00000000955"; transcript_name "Clic2"; tss_id "TSS24592"; -chr20 protein_coding exon 92810 93748 . - . 
exon_number "1"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000029622"; gene_name "Olr1668"; p_id "P5423"; transcript_id "ENSRNOT00000047483"; transcript_name "Olr1668"; tss_id "TSS17091"; -chr20 protein_coding stop_codon 92810 92812 . - 0 exon_number "1"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000029622"; gene_name "Olr1668"; p_id "P5423"; transcript_id "ENSRNOT00000047483"; transcript_name "Olr1668"; tss_id "TSS17091"; -chr20 protein_coding CDS 92813 93748 . - 0 exon_number "1"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000029622"; gene_name "Olr1668"; p_id "P5423"; protein_id "ENSRNOP00000042115"; transcript_id "ENSRNOT00000047483"; transcript_name "Olr1668"; tss_id "TSS17091"; -chr20 protein_coding start_codon 93746 93748 . - 0 exon_number "1"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000029622"; gene_name "Olr1668"; p_id "P5423"; transcript_id "ENSRNOT00000047483"; transcript_name "Olr1668"; tss_id "TSS17091";
--- a/htseq_bams_to_count_matrix/tool_dependencies.xml Wed Nov 20 23:21:38 2013 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,53 +0,0 @@ -<?xml version="1.0"?> -<tool_dependency> - <package name="numpy" version="1.7"> - <repository changeset_revision="84125ffacb90" name="package_numpy_1_7" owner="iuc" prior_installation_required="True" toolshed="http://testtoolshed.g2.bx.psu.edu" /> - </package> - <package name="pysam" version="0.7.6"> - <repository changeset_revision="247e5e5bee87" name="package_pysam_0_7_6" owner="iuc" prior_installation_required="True" toolshed="http://testtoolshed.g2.bx.psu.edu" /> - </package> - <package name="freetype" version="2.4"> - <repository changeset_revision="fe5cfaf931ff" name="package_freetype_2_4" owner="iuc" prior_installation_required="True" toolshed="http://testtoolshed.g2.bx.psu.edu/" /> - </package> - <package name="matplotlib" version="1.2"> - <repository changeset_revision="966f29c955b9" name="package_matplotlib_1_2" owner="iuc" prior_installation_required="True" toolshed="http://testtoolshed.g2.bx.psu.edu/" /> - </package> - <package name="htseq" version="0.5.4p3"> - <install version="1.0"> - <actions> - <action type="download_by_url">https://pypi.python.org/packages/source/H/HTSeq/HTSeq-0.5.4p3.tar.gz</action> - <action type="make_directory">$INSTALL_DIR/lib/python</action> <!-- Not sure why these must be made apriori, but install fails otherwise --> - <action type="make_directory">$INSTALL_DIR/lib64/python</action> <!-- Not sure why these must be made apriori, but install fails otherwise --> - <action type="set_environment_for_install"> - <repository changeset_revision="84125ffacb90" name="package_numpy_1_7" owner="iuc" toolshed="http://testtoolshed.g2.bx.psu.edu"> - <package name="numpy" version="1.7" /> - </repository> - </action> - <action type="set_environment_for_install"> - <repository changeset_revision="247e5e5bee87" name="package_pysam_0_7_6" owner="iuc" toolshed="http://testtoolshed.g2.bx.psu.edu"> - 
<package name="pysam" version="0.7.6" /> - </repository> - </action> - <action type="set_environment_for_install"> - <repository changeset_revision="fe5cfaf931ff" name="package_freetype_2_4" owner="iuc" toolshed="http://testtoolshed.g2.bx.psu.edu/"> - <package name="freetype" version="2.4" /> - </repository> - </action> - <action type="set_environment_for_install"> - <repository changeset_revision="966f29c955b9" name="package_matplotlib_1_2" owner="iuc" toolshed="http://testtoolshed.g2.bx.psu.edu/"> - <package name="matplotlib" version="1.2" /> - </repository> - </action> - <action type="shell_command">export PYTHONPATH=$PYTHONPATH:$INSTALL_DIR/lib/python:$INSTALL_DIR/lib64/python && python setup.py install --home $INSTALL_DIR --install-scripts $INSTALL_DIR/bin</action> - <action type="set_environment"> - <environment_variable action="append_to" name="PYTHONPATH">$INSTALL_DIR/lib/python:$INSTALL_DIR/lib64/python</environment_variable> - <environment_variable action="prepend_to" name="PATH">$INSTALL_DIR/bin</environment_variable> - </action> - </actions> - </install> - <readme> - Installation of HTSeq requires Python 2.5+ (does not yet work with Python 3), pysam and the Numpy Python package. - Note this uses the matplotlib lite version dependent on the lite version of numpy - no atlas compilation - </readme> - </package> -</tool_dependency>
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/htseqsams2mx.py Thu Nov 21 17:39:01 2013 -0500 @@ -0,0 +1,384 @@
# May 2013
# Change to htseq as the counting engine - wrap so arbitrary number of columns created
# borged Simon Anders' "count.py" since we need a vector of counts rather than a new sam file as output
# note attribution for htseq and count.py :
## Written by Simon Anders (sanders@fs.tum.de), European Molecular Biology
## Laboratory (EMBL). (c) 2010. Released under the terms of the GNU General
## Public License v3. Part of the 'HTSeq' framework, version HTSeq-0.5.4p3
# updated ross lazarus august 2011 to NOT include region and to finesse the name as the region for bed3 format inputs
# also now sums all duplicate named regions and provides a summary of any collapsing as the info
# updated ross lazarus july 26 to respect the is_duplicate flag rather than try to second guess
# note Heng Li argues that removing dupes is a bad idea for RNA seq
# updated ross lazarus july 22 to count reads OUTSIDE each bed region during the processing of each bam
# added better sorting with decoration of a dict key later sorted and undecorated.
# code cleaned up and galaxified ross lazarus july 18 et seq.
# bams2mx.py -turns a series of bam and a bed file into a matrix of counts Usage bams2mx.py <halfwindow> <bedfile.bed> <bam1.bam>
# <bam2.bam>
# uses pysam to read and count bam reads over each bed interval for each sample for speed
# still not so fast
# TODO options -shift -unique
#
"""
how this gets run:

(vgalaxy)galaxy@iaas1-int:~$ cat database/job_working_directory/027/27014/galaxy_27014.sh
#!/bin/sh
GALAXY_LIB="/data/extended/galaxy/lib"
if [ "$GALAXY_LIB" != "None" ]; then
    if [ -n "$PYTHONPATH" ]; then
        PYTHONPATH="$GALAXY_LIB:$PYTHONPATH"
    else
        PYTHONPATH="$GALAXY_LIB"
    fi
    export PYTHONPATH
fi

cd /data/extended/galaxy/database/job_working_directory/027/27014
python /data/extended/galaxy/tools/rgenetics/htseqsams2mx.py -g "/data/extended/galaxy/database/files/034/dataset_34115.dat" -o "/data/extended/galaxy/database/files/034/dataset_34124.dat" -m "union" --id_attribute "gene_id" --feature_type "exon" --samf "'/data/extended/galaxy/database/files/033/dataset_33980.dat','T5A_C1PPHACXX_AGTTCC_L003_R1.fastq_bwa.sam'" --samf "'/data/extended/galaxy/database/files/033/dataset_33975.dat','T5A_C1PPHACXX_AGTTCC_L002_R1.fastq_bwa.sam'"; cd /data/extended/galaxy; /data/extended/galaxy/set_metadata.sh ./database/files /data/extended/galaxy/database/job_working_directory/027/27014 . /data/extended/galaxy/universe_wsgi.ini /data/tmp/tmpmwsElH /data/extended/galaxy/database/job_working_directory/027/27014/galaxy.json /data/extended/galaxy/database/job_working_directory/027/27014/metadata_in_HistoryDatasetAssociation_45202_sfOMGa,/data/extended/galaxy/database/job_working_directory/027/27014/metadata_kwds_HistoryDatasetAssociation_45202_gaMnxa,/data/extended/galaxy/database/job_working_directory/027/27014/metadata_out_HistoryDatasetAssociation_45202_kZPsZO,/data/extended/galaxy/database/job_working_directory/027/27014/metadata_results_HistoryDatasetAssociation_45202_bXU7IU,,/data/extended/galaxy/database/job_working_directory/027/27014/metadata_override_HistoryDatasetAssociation_45202_hyLAvh
echo $? > /data/extended/galaxy/database/job_working_directory/027/27014/galaxy_27014.ec

"""

import os
import re
import sys
import HTSeq.scripts.count as htcount
import optparse
import tempfile
import shutil
import operator
import subprocess
import itertools
import warnings
import traceback
import HTSeq
import time


class Xcpt(Exception):
    """Exception that keeps its message in the ``msg`` attribute."""

    def __init__(self, msg):
        # Initialise the base class too so str(exc) shows the message.
        Exception.__init__(self, msg)
        self.msg = msg


def _has_filter_tag(aln, tags):
    """Return True if *aln* has a truthy optional field for any tag in *tags*.

    ``optional_field`` raises KeyError for a tag the alignment does not carry;
    such a tag simply does not match.
    """
    for tag in tags:
        try:
            if aln.optional_field(tag):
                return True
        except KeyError:
            pass
    return False


def _fraction(numerator, denominator):
    """Return numerator/denominator as a float, or 0.0 for a zero denominator."""
    if not denominator:
        return 0.0
    return float(numerator) / denominator


def htseqMX(gff_filename,sam_filenames,colnames,sam_exts,sam_bais,opts):
    """
    Code taken from count.py in Simon Anders HTSeq distribution
    Wrapped in a loop to accept multiple bam/sam files and their names from galaxy to
    produce a matrix of contig counts by sample for downstream use in edgeR and DESeq tools

    gff_filename  -- path of the GFF/GTF gene model file
    sam_filenames -- list of sam/bam paths, one per output column
    colnames      -- list of column names, parallel to sam_filenames
    sam_exts      -- list of extensions, parallel to sam_filenames; 'bam' selects
                     the BAM reader, anything else the SAM reader
    sam_bais      -- list of bam index paths ('' when there is none), parallel
                     to sam_filenames
    opts          -- options object; reads quiet, stranded, mapqMin, feature_type,
                     id_attribute, filter_extras and mode

    Returns the tuple (counts, empty, ambiguous, lowqual, notaligned, nonunique,
    filtered, nreads) where counts maps each feature id to a list of per-sample
    read counts. Any failure is reported on stderr and ends in sys.exit(1).
    """
    class UnknownChrom( Exception ):
        pass

    def my_showwarning( message, category, filename, lineno = None, line = None ):
        sys.stdout.write( "Warning: %s\n" % message )

    def invert_strand( iv ):
        """Return a copy of interval *iv* on the opposite strand."""
        iv2 = iv.copy()
        if iv2.strand == "+":
            iv2.strand = "-"
        elif iv2.strand == "-":
            iv2.strand = "+"
        else:
            raise ValueError( "Illegal strand" )
        return iv2

    def count_reads_in_features( sam_filenames, colnames, gff_filename, opts ):
        """ Hacked version of htseq count.py
        """
        if opts.quiet:
            warnings.filterwarnings( action="ignore", module="HTSeq" )
        features = HTSeq.GenomicArrayOfSets( "auto", opts.stranded != "no" )
        mapqMin = int(opts.mapqMin)
        # A single --filter_extras value may hold several comma separated tags.
        extras = []
        for value in opts.filter_extras:
            extras.extend( [t.strip() for t in value.split(',') if t.strip()] )
        counts = {}
        nreads = 0
        empty = 0
        ambiguous = 0
        notaligned = 0
        lowqual = 0
        nonunique = 0
        filtered = 0 # new filter_extras - need a better way to do this - independent filter tool?
        gff = HTSeq.GFF_Reader( gff_filename )
        ngff = 0
        try:
            for i,f in enumerate(gff):
                ngff = i + 1
                if f.type == opts.feature_type:
                    try:
                        feature_id = f.attr[ opts.id_attribute ]
                    except KeyError:
                        try:
                            feature_id = f.attr[ 'gene_id' ]
                        except KeyError:
                            sys.exit( "Feature at row %d %s does not contain a '%s' attribute OR a gene_id attribute - faulty GFF?" %
                               ( (i+1), f.name, opts.id_attribute ) )
                    if opts.stranded != "no" and f.iv.strand == ".":
                        sys.exit( "Feature %s at %s does not have strand information but you are "
                           "running htseq-count in stranded mode. Use '--stranded=no'." %
                           ( f.name, f.iv ) )
                    features[ f.iv ] += feature_id
                    counts[ feature_id ] = [0] * len(colnames) # we use sami as an index here to bump counts later
        except:
            sys.stderr.write( "Error occured in %s.\n" % gff.get_line_number_string() )
            raise

        if not opts.quiet:
            # ngff is a count, so an empty GFF reports 0 instead of raising NameError
            sys.stdout.write( "%d GFF lines processed.\n" % ngff )

        if len( counts ) == 0 and not opts.quiet:
            sys.stdout.write( "Warning: No features of type '%s' found.\n" % opts.feature_type )
        for sami,sam_filename in enumerate(sam_filenames):
            colname = colnames[sami]
            isbam = sam_exts[sami] == 'bam'
            hasbai = bool(sam_bais[sami])
            if hasbai:
                # link the bam and its index side by side under a common stem
                tempname = os.path.splitext(os.path.basename(sam_filename))[0]
                tempbam = '%s.bam' % tempname
                tempbai = '%s.bai' % tempname
                os.link(sam_filename,tempbam)
                os.link(sam_bais[sami],tempbai)
            try:
                if isbam:
                    if hasbai:
                        read_seq = HTSeq.BAM_Reader ( tempbam )
                    else:
                        read_seq = HTSeq.BAM_Reader( sam_filename )
                else:
                    read_seq = HTSeq.SAM_Reader( sam_filename )
                first_read = next(iter(read_seq))
                pe_mode = first_read.paired_end
            except:
                if isbam:
                    msg = "Error occured when reading first line of bam file %s colname=%s \n" % (sam_filename,colname )
                else:
                    msg = "Error occured when reading first line of sam file %s colname=%s \n" % (sam_filename,colname )
                sys.stderr.write( msg + "\n" )
                raise

            nseq = 0
            try:
                if pe_mode:
                    read_seq_pe_file = read_seq
                    read_seq = HTSeq.pair_SAM_alignments( read_seq )
                for seqi,r in enumerate(read_seq):
                    nseq = seqi + 1
                    nreads += 1
                    if not pe_mode:
                        if not r.aligned:
                            notaligned += 1
                            continue
                        # Skip the whole read once if any extra tag matches; a missing
                        # tag must not disable the NH uniqueness test below.
                        if extras and _has_filter_tag( r, extras ):
                            filtered += 1
                            continue
                        try:
                            if r.optional_field( "NH" ) > 1:
                                nonunique += 1
                                continue
                        except KeyError:
                            pass
                        if r.aQual < mapqMin:
                            lowqual += 1
                            continue
                        if opts.stranded != "reverse":
                            iv_seq = ( co.ref_iv for co in r.cigar if co.type == "M" and co.size > 0 )
                        else:
                            iv_seq = ( invert_strand( co.ref_iv ) for co in r.cigar if co.type == "M" and co.size > 0 )
                    else:
                        # NOTE: filter_extras is only applied to single end reads
                        if r[0] is not None and r[0].aligned:
                            if opts.stranded != "reverse":
                                iv_seq = ( co.ref_iv for co in r[0].cigar if co.type == "M" and co.size > 0 )
                            else:
                                iv_seq = ( invert_strand( co.ref_iv ) for co in r[0].cigar if co.type == "M" and co.size > 0 )
                        else:
                            iv_seq = tuple()
                        if r[1] is not None and r[1].aligned:
                            if opts.stranded != "reverse":
                                iv_seq = itertools.chain( iv_seq,
                                    ( invert_strand( co.ref_iv ) for co in r[1].cigar if co.type == "M" and co.size > 0 ) )
                            else:
                                iv_seq = itertools.chain( iv_seq,
                                    ( co.ref_iv for co in r[1].cigar if co.type == "M" and co.size > 0 ) )
                        else:
                            if ( r[0] is None ) or not ( r[0].aligned ):
                                notaligned += 1
                                continue
                        try:
                            if ( r[0] is not None and r[0].optional_field( "NH" ) > 1 ) or \
                                  ( r[1] is not None and r[1].optional_field( "NH" ) > 1 ):
                                nonunique += 1
                                continue
                        except KeyError:
                            pass
                        if ( r[0] and r[0].aQual < mapqMin ) or ( r[1] and r[1].aQual < mapqMin ):
                            lowqual += 1
                            continue

                    try:
                        if opts.mode == "union":
                            fs = set()
                            for iv in iv_seq:
                                if iv.chrom not in features.chrom_vectors:
                                    raise UnknownChrom
                                for iv2, fs2 in features[ iv ].steps():
                                    fs = fs.union( fs2 )
                        elif opts.mode == "intersection-strict" or opts.mode == "intersection-nonempty":
                            fs = None
                            for iv in iv_seq:
                                if iv.chrom not in features.chrom_vectors:
                                    raise UnknownChrom
                                for iv2, fs2 in features[ iv ].steps():
                                    if len(fs2) > 0 or opts.mode == "intersection-strict":
                                        if fs is None:
                                            fs = fs2.copy()
                                        else:
                                            fs = fs.intersection( fs2 )
                        else:
                            sys.exit( "Illegal overlap mode %s" % opts.mode )
                        if fs is None or len( fs ) == 0:
                            empty += 1
                        elif len( fs ) > 1:
                            ambiguous += 1
                        else:
                            ck = list(fs)[0]
                            counts[ck][sami] += 1 # end up with counts for each sample as a list
                    except UnknownChrom:
                        if not pe_mode:
                            rr = r
                        else:
                            rr = r[0] if r[0] is not None else r[1]
                        empty += 1
                        if not opts.quiet:
                            sys.stdout.write( ( "Warning: Skipping read '%s', because chromosome " +
                                "'%s', to which it has been aligned, did not appear in the GFF file.\n" ) %
                                ( rr.read.name, iv.chrom ) )
            except:
                if not pe_mode:
                    sys.stderr.write( "Error occured in %s.\n" % read_seq.get_line_number_string() )
                else:
                    sys.stderr.write( "Error occured in %s.\n" % read_seq_pe_file.get_line_number_string() )
                raise

            if not opts.quiet:
                sys.stdout.write( "%d sam %s processed for %s.\n" % ( nseq, "lines " if not pe_mode else "line pairs", colname ) )
        return counts,empty,ambiguous,lowqual,notaligned,nonunique,filtered,nreads

    warnings.showwarning = my_showwarning
    assert os.path.isfile(gff_filename),'## unable to open supplied gff file %s' % gff_filename
    try:
        return count_reads_in_features( sam_filenames, colnames, gff_filename,opts)
    except:
        # report anything, including sys.exit() raised while counting, and stop
        exc_value, exc_tb = sys.exc_info()[1:3]
        last_frame = traceback.extract_tb( exc_tb )[-1]
        sys.stderr.write( "Error: %s\n" % str( exc_value ) )
        sys.stderr.write( "[Exception type: %s, raised in %s:%d]\n" %
            ( exc_value.__class__.__name__,
              os.path.basename( last_frame[0] ),
              last_frame[1] ) )
        sys.exit( 1 )


def usage():
    """Write the command line synopsis to stdout and exit with status 1."""
    sys.stdout.write("""Usage: python htseqsams2mx.py -w <halfwindowsize> -g <gfffile.gff> -o <outfilename> [-i] [-c] --samf "<sam1.sam>,<sam1.column_header>" --samf "...<samN.column_header>" """ + "\n")
    sys.exit(1)


def _parse_samf_specs(samf_values):
    """Turn raw --samf values into [path, column name, extension, bam index] lists.

    Each value is a comma separated list whose items may be wrapped in single or
    double quotes. Entries whose path is 'None' are dropped. The extension and
    bam index are optional: a missing extension is taken from the file name and
    a missing (or 'None') index becomes ''.
    """
    samsets = []
    for raw in samf_values:
        fields = [x.replace("'","").replace('"','') for x in raw.split(',')]
        # just cannot stop getting these on cl! wtf in cheetah for a repeat group?
        if fields[0].lower() == 'none':
            continue
        if len(fields) < 2:
            sys.exit('##ERROR htseqsams2mx: --samf value "%s" must supply at least "<file>,<column name>"' % raw)
        if len(fields) < 3:
            # two field form shown by usage(): infer the type from the file name
            fields.append(os.path.splitext(fields[0])[1].lstrip('.').lower())
        if len(fields) < 4:
            fields.append('')
        if fields[3].lower() == 'none':
            fields[3] = ''
        samsets.append(fields)
    return samsets


def _drop_duplicate_inputs(samsets):
    """Return *samsets* keeping only the first entry for each input path.

    A warning naming every dropped duplicate is written to stdout.
    """
    allnames = ','.join([x[0] for x in samsets])
    seen = set()
    kept = []
    delme = []
    for i,samset in enumerate(samsets):
        if samset[0] in seen:
            delme.append(i)
            sys.stdout.write('## WARNING htseqsams2mx: Duplicate input sam file %s in %s - ignoring dupe in 0 based position %s\n' %
                (samset[0],allnames,str(delme)))
        else:
            seen.add(samset[0]) # first time
            kept.append(samset)
    return kept


def main():
    """Parse the command line, count reads and write the count matrix.

    Galaxy command template this was written for:

    <command interpreter="python">
    htseqsams2mx.py -w "$halfwin" -g "$gfffile" -o "$outfile" -m "union"
    #for $s in $samfiles:
    --samf "'${s.samf}','${s.samf.name}'"
    #end for
    </command>
    """
    if len(sys.argv) < 2:
        usage()
    starttime = time.time()
    op = optparse.OptionParser()
    # All tools
    op.add_option('-w', '--halfwindow', default="0") # accepted for compatibility, not used
    op.add_option('-m', '--mode', default="union")
    op.add_option('-s', '--stranded', default="no")
    op.add_option('-y', '--feature_type', default="exon")
    op.add_option('-g', '--gff_file', default=None)
    op.add_option('-o', '--outfname', default=None)
    op.add_option('-f','--forceName', default="false")
    op.add_option('--samf', default=[], action="append")
    op.add_option('--filter_extras', default=[], action="append")
    op.add_option('--mapqMin', default='0')
    # -t stores into feature_type too, so it is no longer silently ignored
    op.add_option( "-t", "--type", type="string", dest="feature_type",
       default = "exon", help = "feature type (3rd column in GFF file) to be used, " +
          "all features of other type are ignored (default, suitable for Ensembl " +
          "GTF files: exon)" )

    op.add_option( "-i", "--id_attribute", type="string", dest="id_attribute",
       default = "gene_name", help = "GTF attribute to be used as feature ID (default: " +
          "gene_name; gene_id is used for features lacking the chosen attribute)" )

    op.add_option( "-q", "--quiet", action="store_true", dest="quiet", default = False,
       help = "suppress progress report and warnings" )
    opts, args = op.parse_args()
    gff_file = opts.gff_file
    if gff_file is None or not os.path.isfile(gff_file):
        sys.exit('##ERROR htseqsams2mx: Supplied input GFF file "%s" not found' % gff_file)
    if opts.outfname is None:
        sys.exit('##ERROR htseqsams2mx: No output file name supplied with -o')
    samsets = _drop_duplicate_inputs(_parse_samf_specs(opts.samf))
    if not samsets:
        sys.exit('##ERROR htseqsams2mx: No input sam/bam files supplied with --samf')
    sam_filenames = []
    colnames = []
    sam_exts = [x[2] for x in samsets]
    sam_bais = [x[3] for x in samsets] # these only exist for bams and need to be finessed with a symlink so pysam will just work
    for samset in samsets:
        if not os.path.isfile(samset[0]):
            sys.exit('## Supplied input sam file "%s" not found' % samset[0])
        sam_filenames.append(samset[0])
        sampName = samset[1] # better be unique
        sampName = sampName.replace('#','') # for R
        sampName = sampName.replace('(','') # for R
        sampName = sampName.replace(')','') # for R
        sampName = sampName.replace(' ','_') # for R
        colnames.append(sampName)
    counts,empty,ambiguous,lowqual,notaligned,nonunique,filtered,nreads = htseqMX(gff_file, sam_filenames,colnames,sam_exts,sam_bais,opts)
    heads = '\t'.join(['Contig',] + colnames)
    res = [heads,]
    contigs = sorted(counts.keys())
    totalc = 0
    emptycontigs = 0
    for contig in contigs:
        thisc = sum(counts[contig])
        if thisc > 0: # no output for empty contigs
            totalc += thisc
            crow = [contig,] + ['%d' % x for x in counts[contig]]
            res.append('\t'.join(crow))
        else:
            emptycontigs += 1
    with open(opts.outfname,'w') as outf:
        outf.write('\n'.join(res))
        outf.write('\n')
    walltime = int(time.time() - starttime)
    accumulatornames = ('walltime (seconds)','total reads read','total reads counted','number of contigs','total empty reads','total ambiguous reads','total low quality reads',
           'total not aligned reads','total not unique mapping reads','extra filtered reads','empty contigs')
    accums = (walltime,nreads,totalc,len(contigs),empty,ambiguous,lowqual,notaligned,nonunique,filtered,emptycontigs)
    # _fraction guards the case of no reads or no matching features
    fracs = (1.0,1.0,_fraction(totalc,nreads),1.0,_fraction(empty,nreads),_fraction(ambiguous,nreads),_fraction(lowqual,nreads),
           _fraction(notaligned,nreads),_fraction(nonunique,nreads),_fraction(filtered,nreads),_fraction(emptycontigs,len(contigs)))
    notes = ['%s = %d (%2.3f)' % (accumulatornames[i],x,100.0*fracs[i]) for i,x in enumerate(accums)]
    sys.stdout.write('\n'.join(notes) + '\n')
    sys.exit(0)


if __name__ == "__main__":
    main()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/htseqsams2mx.xml Thu Nov 21 17:39:01 2013 -0500 @@ -0,0 +1,149 @@ +<tool id="htseqsams2mxlocal" name="SAM/BAM to count matrix" version="0.4"> + <description>using HTSeq code</description> + <stdio> + <regex match=".*" source="both" level="warning" description="chatter from HTSeq:"/> + </stdio> + <requirements> + <requirement type="package" version="1.7">numpy</requirement> + <requirement type="package" version="0.7.6">pysam</requirement> + <requirement type="package" version="2.4">freetype</requirement> + <requirement type="package" version="1.2">matplotlib</requirement> + <requirement type="package" version="0.5.4p3">htseq</requirement> + </requirements> + <command interpreter="python"> + htseqsams2mx.py -g "$gfffile" -o "$outfile" -m "$model" --id_attribute "$id_attr" --feature_type "$feature_type" + --mapqMin $mapqMin --samf "'${firstsamf}','${firstsamf.name}','${firstsamf.ext}','${firstsamf.metadata.bam_index}'" + #if $secondsamf.ext != 'data': + --samf "'${secondsamf}','${secondsamf.name}','${secondsamf.ext}','${secondsamf.metadata.bam_index}'" + #end if + #for $s in $samfiles: + #if $s.samf.ext != 'data': + --samf "'${s.samf}','${s.samf.name}','${s.samf.ext}','${s.samf.metadata.bam_index}'" + #end if + #end for + #if $filter_extras: + --filter_extras "$filter_extras" + #end if + </command> + <inputs> + <param format="gtf" name="gfffile" type="data" label="Gene model (GFF) file to count reads over from your current history" size="100" /> + <param name="mapqMin" label="Filter reads with mapq below than this value" + help="0 to count any mapping quality read. 
Otherwise only reads at or above specified mapq will be counted" + type="integer" value="5"/> + <param name="title" label="Name for this job's output file" type="text" size="80" value="bams to DGE count matrix"/> + <param name="stranded" value="false" type="boolean" label="Reads are stranded - use strand in counting" display="checkbox" + truevalue="yes" falsevalue="no" checked="no" help="Check this ONLY if you know your sequences are strand specific" /> + <param name="model" type="select" label="Model for counting reads over the supplied gene model- see HTSeq docs" + help="If in doubt, union is a reasonable default but intersection-strict avoids double counting over overlapping exons"> + <option value="union" selected="true">union</option> + <option value="intersection-strict">intersection-strict</option> + <option value="intersection-nonempty">intersection-nonempty</option> + </param> + <param name="id_attr" type="select" label="GTF attribute to output as the name for each contig - see HTSeq docs" + help="If in doubt, use gene name or if you need the id in your GTF, gene id"> + <option value="gene_name" selected="true">gene name</option> + <option value="gene_id">gene id</option> + <option value="transcript_id">transcript id</option> + <option value="transcript_name">transcript name</option> + </param> + <param name="feature_type" type="select" label="GTF feature type for counting reads over the supplied gene model- see HTSeq docs" + help="GTF feature type to count over - exon is a good choice with gene name as the contig to count over"> + <option value="exon" selected="true">exon</option> + <option value="CDS">CDS</option> + <option value="UTR">UTR</option> + <option value="transcript">transcript</option> + </param> + <param name="filter_extras" type="select" label="Filter any read with one or more flags" + help="eg the XS tag created by bowtie for multiple reads" optional="true" mutliple="true"> + <option value="">None</option> + <option value="XS">XS:i > 0 - 
More than one mapping position Bowtie</option> + <option value="XS:A">Might be useful for tophat</option> + </param> + + <param name="firstsamf" type="data" label="bam/sam file from your history" format="sam,bam" size="100" + help="Each sam/bam contributes a column of read counts overlapping the specified gene model contigs" + optional="false"/> + <param name="secondsamf" type="data" label="Additional bam/sam file from your history" format="sam,bam" size="100" + help="Each sam/bam contributes a column of read counts overlapping the specified gene model contigs" + optional="false"/> + <repeat name="samfiles" min="16" + title="Specify additional bam/sam file inputs" help="Each sam/bam contributes a column of read counts overlapping the specified gene model contigs"> + <param name="samf" type="data" label="Additional bam/sam file from your history" format="sam,bam" size="100" + optional="true"/> + </repeat> + </inputs> + <outputs> + <data format="tabular" name="outfile" label="${title}_htseqsams2mx.xls" /> + </outputs> + <tests> + <test> + <param name="feature_type" value="exon" /> + <param name="gfffile" value="rn4_chr20_100k.gtf" /> + <param name="firstsamf" value="rn4chr20test1.bam" ftype="bam"/> + <param name="secondsamf" value="rn4chr20test2.bam" ftype="bam"/> + <param name="id_attr" value="gene_name" /> + <param name="model" value="union" /> + <param name="stranded" value="no" /> + <param name="title" value="htseqtest" /> + <param name="mapqMin" value="0" /> + + <output name="outfile" file="htseqsams2mx_test1_out.xls" lines_diff="1"/> + </test> + </tests> + <help> + +**What this tool does** + +Counts reads in multiple sam/bam format mapped files and generates a matrix ideal for edgeR and other count based tools +It uses HTSeq to count your sam reads over a gene model supplied as a GTF file +The output is a tabular text (columnar - spreadsheet) file containing the +count matrix for downstream processing. 
Each row contains the counts from each sample for each +of the non-empty GTF input file contigs matching the GTF attribute choice above. +You probably want to use gene level GTF output attribute and count reads that overlap +GTF exons for RNA-seq. Or you can count over exons by using transcript level output names or ids. Etc. + +---- + +**Author's plea on replicates** + +If you want to interpret the downstream p values in terms of rejecting or accepting the null hypothesis +under random sampling with replacement from the universe of possible biological/experimental replicates from which your data was derived, +which is what published p values are often assumed to do, then you need biological +(or for cell culture material experimental) replicates. + +Using technical or no replicates means the downstream p values are not interpretable the way most people would assume +they are - ie as the probability of obtaining a result as extreme as, or more extreme than, your experimental data +in millions of experiments conducted using the same methods under the null hypothesis. + +There is no way around this and it is scientific fraud to ignore this issue and publish bogus p values derived from +technical or no replicates without making the lack of biological or experimental error in the p value calculations +clear to your readers so they can adjust their expectations. However, the buck stops here at higher level inference. +If you have no replicates, you must not use this tool as the p values are uninterpretable. So there. + +See your stats 101 notes on the central limit theorem and test statistics for a refresher or talk to a +statistician if this makes no sense please. + +**Attribution** + +This Galaxy tool relies on HTSeq_ from http://www-huber.embl.de/users/anders/HTSeq/doc/index.html +for the tricky work of counting. That code includes the following attribution: + +## Written by Simon Anders (sanders@fs.tum.de), European Molecular Biology +## Laboratory (EMBL). (c) 2010. 
Released under the terms of the GNU General +## Public License v3. Part of the 'HTSeq' framework, version HTSeq-0.5.4p3 + +It will be automatically installed if you use the toolshed as in general, you probably should. +HTSeq_ must be installed with this tool if you install manually. + +Otherwise, all code and documentation comprising this tool including the requirement +for more than one sample bam +was written by Ross Lazarus and is +licensed to you under the LGPL_ like other rgenetics artefacts + +Sorry, I don't use readgroups so had no reason to code read groups. Contributions welcome. Send code + +.. _LGPL: http://www.gnu.org/copyleft/lesser.html +.. _HTSeq: http://www-huber.embl.de/users/anders/HTSeq/doc/index.html + </help> + +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/generatetest.sh Thu Nov 21 17:39:01 2013 -0500 @@ -0,0 +1,2 @@ +#python ../htseqsams2mx.py -g rn4_chr20_100k.gtf -o test.xls --samf "'rn4chr20test1.bam','col1'" --samf "'rn4chr20test2.bam','col2'" +python ../htseqsams2mx.py -g rn4_chr20_100k.gtf -o htseqsams2mx_test1_out.xls --samf "'rn4chr20test1.bam',''" --samf "'rn4chr20test2.bam',''"
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/htseqsams2mx_test1_out.xls Thu Nov 21 17:39:01 2013 -0500 @@ -0,0 +1,4 @@ +Contig rn4chr20test1.bam rn4chr20test2.bam +Clic2 494 944 +F1M7K0_RAT 3 2 +Tmlhe 164 172
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/rn4_chr20_100k.gtf Thu Nov 21 17:39:01 2013 -0500 @@ -0,0 +1,62 @@ +chr20 protein_coding CDS 801 1238 . + 0 exon_number "1"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000730"; gene_name "F1M7K0_RAT"; p_id "P14715"; protein_id "ENSRNOP00000000957"; transcript_id "ENSRNOT00000000957"; transcript_name "F1M7K0_RAT"; tss_id "TSS11562"; +chr20 protein_coding exon 801 1238 . + . exon_number "1"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000730"; gene_name "F1M7K0_RAT"; p_id "P14715"; transcript_id "ENSRNOT00000000957"; transcript_name "F1M7K0_RAT"; tss_id "TSS11562"; +chr20 protein_coding CDS 1742 1976 . + 0 exon_number "2"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000730"; gene_name "F1M7K0_RAT"; p_id "P14715"; protein_id "ENSRNOP00000000957"; transcript_id "ENSRNOT00000000957"; transcript_name "F1M7K0_RAT"; tss_id "TSS11562"; +chr20 protein_coding exon 1742 1976 . + . exon_number "2"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000730"; gene_name "F1M7K0_RAT"; p_id "P14715"; transcript_id "ENSRNOT00000000957"; transcript_name "F1M7K0_RAT"; tss_id "TSS11562"; +chr20 protein_coding CDS 2016 2177 . + 2 exon_number "3"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000730"; gene_name "F1M7K0_RAT"; p_id "P14715"; protein_id "ENSRNOP00000000957"; transcript_id "ENSRNOT00000000957"; transcript_name "F1M7K0_RAT"; tss_id "TSS11562"; +chr20 protein_coding exon 2016 2177 . + . exon_number "3"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000730"; gene_name "F1M7K0_RAT"; p_id "P14715"; transcript_id "ENSRNOT00000000957"; transcript_name "F1M7K0_RAT"; tss_id "TSS11562"; +chr20 protein_coding CDS 2263 2342 . 
+ 2 exon_number "4"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000730"; gene_name "F1M7K0_RAT"; p_id "P14715"; protein_id "ENSRNOP00000000957"; transcript_id "ENSRNOT00000000957"; transcript_name "F1M7K0_RAT"; tss_id "TSS11562"; +chr20 protein_coding exon 2263 2342 . + . exon_number "4"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000730"; gene_name "F1M7K0_RAT"; p_id "P14715"; transcript_id "ENSRNOT00000000957"; transcript_name "F1M7K0_RAT"; tss_id "TSS11562"; +chr20 protein_coding CDS 2345 2533 . + 0 exon_number "5"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000730"; gene_name "F1M7K0_RAT"; p_id "P14715"; protein_id "ENSRNOP00000000957"; transcript_id "ENSRNOT00000000957"; transcript_name "F1M7K0_RAT"; tss_id "TSS11562"; +chr20 protein_coding exon 2345 2533 . + . exon_number "5"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000730"; gene_name "F1M7K0_RAT"; p_id "P14715"; transcript_id "ENSRNOT00000000957"; transcript_name "F1M7K0_RAT"; tss_id "TSS11562"; +chr20 protein_coding CDS 19528 19708 . + 0 exon_number "1"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P13601"; protein_id "ENSRNOP00000044070"; transcript_id "ENSRNOT00000049573"; transcript_name "Tmlhe"; tss_id "TSS451"; +chr20 protein_coding CDS 19528 19708 . + 0 exon_number "1"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P3227"; protein_id "ENSRNOP00000000956"; transcript_id "ENSRNOT00000000956"; transcript_name "TMLH_RAT"; tss_id "TSS451"; +chr20 protein_coding exon 19528 19708 . + . exon_number "1"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P13601"; transcript_id "ENSRNOT00000049573"; transcript_name "Tmlhe"; tss_id "TSS451"; +chr20 protein_coding exon 19528 19708 . + . 
exon_number "1"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P3227"; transcript_id "ENSRNOT00000000956"; transcript_name "TMLH_RAT"; tss_id "TSS451"; +chr20 protein_coding start_codon 19528 19530 . + 0 exon_number "1"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P13601"; transcript_id "ENSRNOT00000049573"; transcript_name "Tmlhe"; tss_id "TSS451"; +chr20 protein_coding start_codon 19528 19530 . + 0 exon_number "1"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P3227"; transcript_id "ENSRNOT00000000956"; transcript_name "TMLH_RAT"; tss_id "TSS451"; +chr20 protein_coding CDS 21979 22014 . + 2 exon_number "2"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P13601"; protein_id "ENSRNOP00000044070"; transcript_id "ENSRNOT00000049573"; transcript_name "Tmlhe"; tss_id "TSS451"; +chr20 protein_coding exon 21979 22014 . + . exon_number "2"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P13601"; transcript_id "ENSRNOT00000049573"; transcript_name "Tmlhe"; tss_id "TSS451"; +chr20 protein_coding CDS 25349 25525 . + 2 exon_number "3"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P13601"; protein_id "ENSRNOP00000044070"; transcript_id "ENSRNOT00000049573"; transcript_name "Tmlhe"; tss_id "TSS451"; +chr20 protein_coding CDS 25349 25525 . + 2 exon_number "2"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P3227"; protein_id "ENSRNOP00000000956"; transcript_id "ENSRNOT00000000956"; transcript_name "TMLH_RAT"; tss_id "TSS451"; +chr20 protein_coding exon 25349 25525 . + . exon_number "3"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P13601"; transcript_id "ENSRNOT00000049573"; transcript_name "Tmlhe"; tss_id "TSS451"; +chr20 protein_coding exon 25349 25525 . + . 
exon_number "2"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P3227"; transcript_id "ENSRNOT00000000956"; transcript_name "TMLH_RAT"; tss_id "TSS451"; +chr20 protein_coding CDS 35197 35476 . + 2 exon_number "4"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P13601"; protein_id "ENSRNOP00000044070"; transcript_id "ENSRNOT00000049573"; transcript_name "Tmlhe"; tss_id "TSS451"; +chr20 protein_coding CDS 35197 35476 . + 2 exon_number "3"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P3227"; protein_id "ENSRNOP00000000956"; transcript_id "ENSRNOT00000000956"; transcript_name "TMLH_RAT"; tss_id "TSS451"; +chr20 protein_coding exon 35197 35476 . + . exon_number "4"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P13601"; transcript_id "ENSRNOT00000049573"; transcript_name "Tmlhe"; tss_id "TSS451"; +chr20 protein_coding exon 35197 35476 . + . exon_number "3"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P3227"; transcript_id "ENSRNOT00000000956"; transcript_name "TMLH_RAT"; tss_id "TSS451"; +chr20 protein_coding CDS 36764 36883 . + 1 exon_number "5"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P13601"; protein_id "ENSRNOP00000044070"; transcript_id "ENSRNOT00000049573"; transcript_name "Tmlhe"; tss_id "TSS451"; +chr20 protein_coding CDS 36764 36883 . + 1 exon_number "4"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P3227"; protein_id "ENSRNOP00000000956"; transcript_id "ENSRNOT00000000956"; transcript_name "TMLH_RAT"; tss_id "TSS451"; +chr20 protein_coding exon 36764 36883 . + . 
exon_number "5"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P13601"; transcript_id "ENSRNOT00000049573"; transcript_name "Tmlhe"; tss_id "TSS451"; +chr20 protein_coding exon 36764 36883 . + . exon_number "4"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P3227"; transcript_id "ENSRNOT00000000956"; transcript_name "TMLH_RAT"; tss_id "TSS451"; +chr20 protein_coding CDS 49040 49276 . + 1 exon_number "6"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P13601"; protein_id "ENSRNOP00000044070"; transcript_id "ENSRNOT00000049573"; transcript_name "Tmlhe"; tss_id "TSS451"; +chr20 protein_coding CDS 49040 49276 . + 1 exon_number "5"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P3227"; protein_id "ENSRNOP00000000956"; transcript_id "ENSRNOT00000000956"; transcript_name "TMLH_RAT"; tss_id "TSS451"; +chr20 protein_coding exon 49040 49276 . + . exon_number "6"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P13601"; transcript_id "ENSRNOT00000049573"; transcript_name "Tmlhe"; tss_id "TSS451"; +chr20 protein_coding exon 49040 49276 . + . exon_number "5"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P3227"; transcript_id "ENSRNOT00000000956"; transcript_name "TMLH_RAT"; tss_id "TSS451"; +chr20 protein_coding CDS 55193 55331 . + 1 exon_number "7"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P13601"; protein_id "ENSRNOP00000044070"; transcript_id "ENSRNOT00000049573"; transcript_name "Tmlhe"; tss_id "TSS451"; +chr20 protein_coding CDS 55193 55331 . 
+ 1 exon_number "6"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P3227"; protein_id "ENSRNOP00000000956"; transcript_id "ENSRNOT00000000956"; transcript_name "TMLH_RAT"; tss_id "TSS451"; +chr20 protein_coding exon 55193 55331 . + . exon_number "7"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P13601"; transcript_id "ENSRNOT00000049573"; transcript_name "Tmlhe"; tss_id "TSS451"; +chr20 protein_coding exon 55193 55331 . + . exon_number "6"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P3227"; transcript_id "ENSRNOT00000000956"; transcript_name "TMLH_RAT"; tss_id "TSS451"; +chr20 protein_coding CDS 55883 56011 . + 0 exon_number "8"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P13601"; protein_id "ENSRNOP00000044070"; transcript_id "ENSRNOT00000049573"; transcript_name "Tmlhe"; tss_id "TSS451"; +chr20 protein_coding CDS 55883 56011 . + 0 exon_number "7"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P3227"; protein_id "ENSRNOP00000000956"; transcript_id "ENSRNOT00000000956"; transcript_name "TMLH_RAT"; tss_id "TSS451"; +chr20 protein_coding exon 55883 56124 . + . exon_number "8"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P13601"; transcript_id "ENSRNOT00000049573"; transcript_name "Tmlhe"; tss_id "TSS451"; +chr20 protein_coding exon 55883 56124 . + . exon_number "7"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P3227"; transcript_id "ENSRNOT00000000956"; transcript_name "TMLH_RAT"; tss_id "TSS451"; +chr20 protein_coding stop_codon 56012 56014 . + 0 exon_number "8"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P13601"; transcript_id "ENSRNOT00000049573"; transcript_name "Tmlhe"; tss_id "TSS451"; +chr20 protein_coding stop_codon 56012 56014 . 
+ 0 exon_number "7"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P3227"; transcript_id "ENSRNOT00000000956"; transcript_name "TMLH_RAT"; tss_id "TSS451"; +chr20 protein_coding exon 66518 66785 . + . exon_number "1"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000728"; gene_name "Clic2"; p_id "P19357"; transcript_id "ENSRNOT00000000955"; transcript_name "Clic2"; tss_id "TSS24592"; +chr20 protein_coding CDS 66729 66785 . + 0 exon_number "1"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000728"; gene_name "Clic2"; p_id "P19357"; protein_id "ENSRNOP00000000955"; transcript_id "ENSRNOT00000000955"; transcript_name "Clic2"; tss_id "TSS24592"; +chr20 protein_coding start_codon 66729 66731 . + 0 exon_number "1"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000728"; gene_name "Clic2"; p_id "P19357"; transcript_id "ENSRNOT00000000955"; transcript_name "Clic2"; tss_id "TSS24592"; +chr20 protein_coding CDS 75931 76040 . + 0 exon_number "2"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000728"; gene_name "Clic2"; p_id "P19357"; protein_id "ENSRNOP00000000955"; transcript_id "ENSRNOT00000000955"; transcript_name "Clic2"; tss_id "TSS24592"; +chr20 protein_coding exon 75931 76040 . + . exon_number "2"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000728"; gene_name "Clic2"; p_id "P19357"; transcript_id "ENSRNOT00000000955"; transcript_name "Clic2"; tss_id "TSS24592"; +chr20 protein_coding CDS 76165 76290 . + 1 exon_number "3"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000728"; gene_name "Clic2"; p_id "P19357"; protein_id "ENSRNOP00000000955"; transcript_id "ENSRNOT00000000955"; transcript_name "Clic2"; tss_id "TSS24592"; +chr20 protein_coding exon 76165 76290 . + . exon_number "3"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000728"; gene_name "Clic2"; p_id "P19357"; transcript_id "ENSRNOT00000000955"; transcript_name "Clic2"; tss_id "TSS24592"; +chr20 protein_coding CDS 79941 80047 . 
+ 1 exon_number "4"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000728"; gene_name "Clic2"; p_id "P19357"; protein_id "ENSRNOP00000000955"; transcript_id "ENSRNOT00000000955"; transcript_name "Clic2"; tss_id "TSS24592"; +chr20 protein_coding exon 79941 80047 . + . exon_number "4"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000728"; gene_name "Clic2"; p_id "P19357"; transcript_id "ENSRNOT00000000955"; transcript_name "Clic2"; tss_id "TSS24592"; +chr20 protein_coding CDS 80692 80873 . + 2 exon_number "5"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000728"; gene_name "Clic2"; p_id "P19357"; protein_id "ENSRNOP00000000955"; transcript_id "ENSRNOT00000000955"; transcript_name "Clic2"; tss_id "TSS24592"; +chr20 protein_coding exon 80692 80873 . + . exon_number "5"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000728"; gene_name "Clic2"; p_id "P19357"; transcript_id "ENSRNOT00000000955"; transcript_name "Clic2"; tss_id "TSS24592"; +chr20 protein_coding CDS 81142 81294 . + 0 exon_number "6"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000728"; gene_name "Clic2"; p_id "P19357"; protein_id "ENSRNOP00000000955"; transcript_id "ENSRNOT00000000955"; transcript_name "Clic2"; tss_id "TSS24592"; +chr20 protein_coding exon 81142 81536 . + . exon_number "6"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000728"; gene_name "Clic2"; p_id "P19357"; transcript_id "ENSRNOT00000000955"; transcript_name "Clic2"; tss_id "TSS24592"; +chr20 protein_coding stop_codon 81295 81297 . + 0 exon_number "6"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000728"; gene_name "Clic2"; p_id "P19357"; transcript_id "ENSRNOT00000000955"; transcript_name "Clic2"; tss_id "TSS24592"; +chr20 protein_coding exon 92810 93748 . - . 
exon_number "1"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000029622"; gene_name "Olr1668"; p_id "P5423"; transcript_id "ENSRNOT00000047483"; transcript_name "Olr1668"; tss_id "TSS17091"; +chr20 protein_coding stop_codon 92810 92812 . - 0 exon_number "1"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000029622"; gene_name "Olr1668"; p_id "P5423"; transcript_id "ENSRNOT00000047483"; transcript_name "Olr1668"; tss_id "TSS17091"; +chr20 protein_coding CDS 92813 93748 . - 0 exon_number "1"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000029622"; gene_name "Olr1668"; p_id "P5423"; protein_id "ENSRNOP00000042115"; transcript_id "ENSRNOT00000047483"; transcript_name "Olr1668"; tss_id "TSS17091"; +chr20 protein_coding start_codon 93746 93748 . - 0 exon_number "1"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000029622"; gene_name "Olr1668"; p_id "P5423"; transcript_id "ENSRNOT00000047483"; transcript_name "Olr1668"; tss_id "TSS17091";
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_dependencies.xml Thu Nov 21 17:39:01 2013 -0500 @@ -0,0 +1,47 @@ +<?xml version="1.0"?> +<tool_dependency> + <package name="numpy" version="1.7"> + <repository changeset_revision="84125ffacb90" name="package_numpy_1_7" owner="iuc" prior_installation_required="True" toolshed="http://testtoolshed.g2.bx.psu.edu" /> + </package> + <package name="pysam" version="0.7.6"> + <repository changeset_revision="247e5e5bee87" name="package_pysam_0_7_6" owner="iuc" prior_installation_required="True" toolshed="http://testtoolshed.g2.bx.psu.edu" /> + </package> + <package name="freetype" version="2.4"> + <repository changeset_revision="fe5cfaf931ff" name="package_freetype_2_4" owner="iuc" prior_installation_required="True" toolshed="http://testtoolshed.g2.bx.psu.edu/" /> + </package> + <package name="matplotlib" version="1.2"> + <repository changeset_revision="966f29c955b9" name="package_matplotlib_1_2" owner="iuc" prior_installation_required="True" toolshed="http://testtoolshed.g2.bx.psu.edu/" /> + </package> + <package name="htseq" version="0.5.4p3"> + <install version="1.0"> + <actions> + <action type="set_environment_for_install"> + <repository changeset_revision="84125ffacb90" name="package_numpy_1_7" owner="iuc" toolshed="http://testtoolshed.g2.bx.psu.edu"> + <package name="numpy" version="1.7" /> + </repository> + <repository changeset_revision="247e5e5bee87" name="package_pysam_0_7_6" owner="iuc" toolshed="http://testtoolshed.g2.bx.psu.edu"> + <package name="pysam" version="0.7.6" /> + </repository> + <repository changeset_revision="fe5cfaf931ff" name="package_freetype_2_4" owner="iuc" toolshed="http://testtoolshed.g2.bx.psu.edu/"> + <package name="freetype" version="2.4" /> + </repository> + <repository changeset_revision="966f29c955b9" name="package_matplotlib_1_2" owner="iuc" toolshed="http://testtoolshed.g2.bx.psu.edu/"> + <package name="matplotlib" version="1.2" /> + </repository> + </action> + <action 
type="download_by_url">https://pypi.python.org/packages/source/H/HTSeq/HTSeq-0.5.4p3.tar.gz</action> + <action type="make_directory">$INSTALL_DIR/lib/python</action> <!-- Not sure why these must be made apriori, but install fails otherwise --> + <action type="make_directory">$INSTALL_DIR/lib64/python</action> <!-- Not sure why these must be made apriori, but install fails otherwise --> + <action type="shell_command">export PYTHONPATH=$PYTHONPATH:$INSTALL_DIR/lib/python:$INSTALL_DIR/lib64/python && python setup.py install --home $INSTALL_DIR --install-scripts $INSTALL_DIR/bin</action> + <action type="set_environment"> + <environment_variable action="append_to" name="PYTHONPATH">$INSTALL_DIR/lib/python:$INSTALL_DIR/lib64/python</environment_variable> + <environment_variable action="prepend_to" name="PATH">$INSTALL_DIR/bin</environment_variable> + </action> + </actions> + </install> + <readme> + Installation of HTSeq requires Python 2.5+ (does not yet work with Python 3), pysam and the Numpy Python package. + Note this uses the matplotlib lite version dependent on the lite version of numpy - no atlas compilation + </readme> + </package> +</tool_dependency>