Mercurial > repos > fubar > htseq_bams_to_count_matrix
changeset 56:9b59cd40f20d draft
Uploaded
author | iuc |
---|---|
date | Tue, 28 Apr 2015 22:56:39 -0400 |
parents | bf016b884c68 |
children | 05ba058b0d28 |
files | htseqsams2mx.py htseqsams2mx.xml test-data/generatetest.sh test-data/htseqsams2mx_test1_out.xls test-data/rn4_chr20_100k.gtf test-data/rn4chr20test1.bam test-data/rn4chr20test1.bam.bai test-data/rn4chr20test2.bam test-data/rn4chr20test2.bam.bai tool_dependencies.xml |
diffstat | 10 files changed, 619 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/htseqsams2mx.py Tue Apr 28 22:56:39 2015 -0400 @@ -0,0 +1,384 @@ +# May 2013 +# Change to htseq as the counting engine - wrap so arbitrary number of columns created +# borged Simon Anders' "count.py" since we need a vector of counts rather than a new sam file as output +# note attribution for htseq and count.py : +## Written by Simon Anders (sanders@fs.tum.de), European Molecular Biology +## Laboratory (EMBL). (c) 2010. Released under the terms of the GNU General +## Public License v3. Part of the 'HTSeq' framework, version HTSeq-0.5.4p3 +# updated ross lazarus august 2011 to NOT include region and to finesse the name as the region for bed3 format inputs +# also now sums all duplicate named regions and provides a summary of any collapsing as the info +# updated ross lazarus july 26 to respect the is_duplicate flag rather than try to second guess +# note Heng Li argues that removing dupes is a bad idea for RNA seq +# updated ross lazarus july 22 to count reads OUTSIDE each bed region during the processing of each bam +# added better sorting with decoration of a dict key later sorted and undecorated. +# code cleaned up and galaxified ross lazarus july 18 et seq. 
+# bams2mx.py -turns a series of bam and a bed file into a matrix of counts Usage bams2mx.py <halfwindow> <bedfile.bed> <bam1.bam> +# <bam2.bam> +# uses pysam to read and count bam reads over each bed interval for each sample for speed +# still not so fast +# TODO options -shift -unique +# +""" +how this gets run: + +(vgalaxy)galaxy@iaas1-int:~$ cat database/job_working_directory/027/27014/galaxy_27014.sh +#!/bin/sh +GALAXY_LIB="/data/extended/galaxy/lib" +if [ "$GALAXY_LIB" != "None" ]; then + if [ -n "$PYTHONPATH" ]; then + PYTHONPATH="$GALAXY_LIB:$PYTHONPATH" + else + PYTHONPATH="$GALAXY_LIB" + fi + export PYTHONPATH +fi + +cd /data/extended/galaxy/database/job_working_directory/027/27014 +python /data/extended/galaxy/tools/rgenetics/htseqsams2mx.py -g "/data/extended/galaxy/database/files/034/dataset_34115.dat" -o "/data/extended/galaxy/database/files/034/dataset_34124.dat" -m "union" --id_attribute "gene_id" --feature_type "exon" --samf "'/data/extended/galaxy/database/files/033/dataset_33980.dat','T5A_C1PPHACXX_AGTTCC_L003_R1.fastq_bwa.sam'" --samf "'/data/extended/galaxy/database/files/033/dataset_33975.dat','T5A_C1PPHACXX_AGTTCC_L002_R1.fastq_bwa.sam'"; cd /data/extended/galaxy; /data/extended/galaxy/set_metadata.sh ./database/files /data/extended/galaxy/database/job_working_directory/027/27014 . 
/data/extended/galaxy/universe_wsgi.ini /data/tmp/tmpmwsElH /data/extended/galaxy/database/job_working_directory/027/27014/galaxy.json /data/extended/galaxy/database/job_working_directory/027/27014/metadata_in_HistoryDatasetAssociation_45202_sfOMGa,/data/extended/galaxy/database/job_working_directory/027/27014/metadata_kwds_HistoryDatasetAssociation_45202_gaMnxa,/data/extended/galaxy/database/job_working_directory/027/27014/metadata_out_HistoryDatasetAssociation_45202_kZPsZO,/data/extended/galaxy/database/job_working_directory/027/27014/metadata_results_HistoryDatasetAssociation_45202_bXU7IU,,/data/extended/galaxy/database/job_working_directory/027/27014/metadata_override_HistoryDatasetAssociation_45202_hyLAvh +echo $? > /data/extended/galaxy/database/job_working_directory/027/27014/galaxy_27014.ec + +""" + +import os +import re +import sys +import HTSeq.scripts.count as htcount +import optparse +import tempfile +import shutil +import operator +import subprocess +import itertools +import warnings +import traceback +import HTSeq +import time + + +class Xcpt(Exception): + def __init__(self, msg): + self.msg = msg + + +def htseqMX(gff_filename,sam_filenames,colnames,sam_exts,sam_bais,opts): + """ + Code taken from count.py in Simon Anders HTSeq distribution + Wrapped in a loop to accept multiple bam/sam files and their names from galaxy to + produce a matrix of contig counts by sample for downstream use in edgeR and DESeq tools + """ + class UnknownChrom( Exception ): + pass + + def my_showwarning( message, category, filename, lineno = None, line = None ): + sys.stdout.write( "Warning: %s\n" % message ) + + def invert_strand( iv ): + iv2 = iv.copy() + if iv2.strand == "+": + iv2.strand = "-" + elif iv2.strand == "-": + iv2.strand = "+" + else: + raise ValueError, "Illegal strand" + return iv2 + + def count_reads_in_features( sam_filenames, colnames, gff_filename, opts ): + """ Hacked version of htseq count.py + """ + if opts.quiet: + warnings.filterwarnings( 
action="ignore", module="HTSeq" ) + features = HTSeq.GenomicArrayOfSets( "auto", opts.stranded != "no" ) + mapqMin = int(opts.mapqMin) + counts = {} + nreads = 0 + empty = 0 + ambiguous = 0 + notaligned = 0 + lowqual = 0 + nonunique = 0 + filtered = 0 # new filter_extras - need a better way to do this - independent filter tool? + gff = HTSeq.GFF_Reader( gff_filename ) + try: + for i,f in enumerate(gff): + if f.type == opts.feature_type: + try: + feature_id = f.attr[ opts.id_attribute ] + except KeyError: + try: + feature_id = f.attr[ 'gene_id' ] + except KeyError: + sys.exit( "Feature at row %d %s does not contain a '%s' attribute OR a gene_id attribute - faulty GFF?" % + ( (i+1), f.name, opts.id_attribute ) ) + if opts.stranded != "no" and f.iv.strand == ".": + sys.exit( "Feature %s at %s does not have strand information but you are " + "running htseq-count in stranded mode. Use '--stranded=no'." % + ( f.name, f.iv ) ) + features[ f.iv ] += feature_id + counts[ feature_id ] = [0 for x in colnames] # we use sami as an index here to bump counts later + except: + sys.stderr.write( "Error occured in %s.\n" % gff.get_line_number_string() ) + raise + + if not opts.quiet: + sys.stdout.write( "%d GFF lines processed.\n" % i ) + + if len( counts ) == 0 and not opts.quiet: + sys.stdout.write( "Warning: No features of type '%s' found.\n" % opts.feature_type ) + for sami,sam_filename in enumerate(sam_filenames): + colname = colnames[sami] + isbam = sam_exts[sami] == 'bam' + hasbai = sam_bais[sami] > '' + if hasbai: + tempname = os.path.splitext(os.path.basename(sam_filename))[0] + tempbam = '%s_TEMP.bam' % tempname + tempbai = '%s_TEMP.bai' % tempname + os.link(sam_filename,tempbam) + os.link(sam_bais[sami],tempbai) + try: + if isbam: + if hasbai: + read_seq = HTSeq.BAM_Reader ( tempbam ) + else: + read_seq = HTSeq.BAM_Reader( sam_filename ) + else: + read_seq = HTSeq.SAM_Reader( sam_filename ) + first_read = iter(read_seq).next() + pe_mode = first_read.paired_end + except: + 
if isbam: + print >> sys.stderr, "Error occured when reading first line of bam file %s colname=%s \n" % (sam_filename,colname ) + else: + print >> sys.stderr, "Error occured when reading first line of sam file %s colname=%s \n" % (sam_filename,colname ) + raise + + try: + if pe_mode: + read_seq_pe_file = read_seq + read_seq = HTSeq.pair_SAM_alignments( read_seq ) + for seqi,r in enumerate(read_seq): + nreads += 1 + if not pe_mode: + if not r.aligned: + notaligned += 1 + continue + try: + if len(opts.filter_extras) > 0: + for extra in opts.filter_extras: + if r.optional_field(extra): + filtered += 1 + continue + if r.optional_field( "NH" ) > 1: + nonunique += 1 + continue + except KeyError: + pass + if r.aQual < mapqMin: + lowqual += 1 + continue + if opts.stranded != "reverse": + iv_seq = ( co.ref_iv for co in r.cigar if co.type == "M" and co.size > 0 ) + else: + iv_seq = ( invert_strand( co.ref_iv ) for co in r.cigar if co.type == "M" and co.size > 0 ) + else: + if r[0] is not None and r[0].aligned: + if opts.stranded != "reverse": + iv_seq = ( co.ref_iv for co in r[0].cigar if co.type == "M" and co.size > 0 ) + else: + iv_seq = ( invert_strand( co.ref_iv ) for co in r[0].cigar if co.type == "M" and co.size > 0 ) + else: + iv_seq = tuple() + if r[1] is not None and r[1].aligned: + if opts.stranded != "reverse": + iv_seq = itertools.chain( iv_seq, + ( invert_strand( co.ref_iv ) for co in r[1].cigar if co.type == "M" and co.size > 0 ) ) + else: + iv_seq = itertools.chain( iv_seq, + ( co.ref_iv for co in r[1].cigar if co.type == "M" and co.size > 0 ) ) + else: + if ( r[0] is None ) or not ( r[0].aligned ): + notaligned += 1 + continue + try: + if ( r[0] is not None and r[0].optional_field( "NH" ) > 1 ) or \ + ( r[1] is not None and r[1].optional_field( "NH" ) > 1 ): + nonunique += 1 + continue + except KeyError: + pass + if ( r[0] and r[0].aQual < mapqMin ) or ( r[1] and r[1].aQual < mapqMin ): + lowqual += 1 + continue + + try: + if opts.mode == "union": + fs = 
set() + for iv in iv_seq: + if iv.chrom not in features.chrom_vectors: + raise UnknownChrom + for iv2, fs2 in features[ iv ].steps(): + fs = fs.union( fs2 ) + elif opts.mode == "intersection-strict" or opts.mode == "intersection-nonempty": + fs = None + for iv in iv_seq: + if iv.chrom not in features.chrom_vectors: + raise UnknownChrom + for iv2, fs2 in features[ iv ].steps(): + if len(fs2) > 0 or opts.mode == "intersection-strict": + if fs is None: + fs = fs2.copy() + else: + fs = fs.intersection( fs2 ) + else: + sys.exit( "Illegal overlap mode %s" % opts.mode ) + if fs is None or len( fs ) == 0: + empty += 1 + elif len( fs ) > 1: + ambiguous += 1 + else: + ck = list(fs)[0] + counts[ck][sami] += 1 # end up with counts for each sample as a list + except UnknownChrom: + if not pe_mode: + rr = r + else: + rr = r[0] if r[0] is not None else r[1] + empty += 1 + if not opts.quiet: + sys.stdout.write( ( "Warning: Skipping read '%s', because chromosome " + + "'%s', to which it has been aligned, did not appear in the GFF file.\n" ) % + ( rr.read.name, iv.chrom ) ) + except: + if not pe_mode: + sys.stderr.write( "Error occured in %s.\n" % read_seq.get_line_number_string() ) + else: + sys.stderr.write( "Error occured in %s.\n" % read_seq_pe_file.get_line_number_string() ) + raise + + if not opts.quiet: + sys.stdout.write( "%d sam %s processed for %s.\n" % ( seqi, "lines " if not pe_mode else "line pairs", colname ) ) + return counts,empty,ambiguous,lowqual,notaligned,nonunique,filtered,nreads + + warnings.showwarning = my_showwarning + assert os.path.isfile(gff_filename),'## unable to open supplied gff file %s' % gff_filename + try: + counts,empty,ambiguous,lowqual,notaligned,nonunique,filtered,nreads = count_reads_in_features( sam_filenames, colnames, gff_filename,opts) + except: + sys.stderr.write( "Error: %s\n" % str( sys.exc_info()[1] ) ) + sys.stderr.write( "[Exception type: %s, raised in %s:%d]\n" % + ( sys.exc_info()[1].__class__.__name__, + 
os.path.basename(traceback.extract_tb( sys.exc_info()[2] )[-1][0]), + traceback.extract_tb( sys.exc_info()[2] )[-1][1] ) ) + sys.exit( 1 ) + return counts,empty,ambiguous,lowqual,notaligned,nonunique,filtered,nreads + + +def usage(): + print >> sys.stdout, """Usage: python htseqsams2mx.py -w <halfwindowsize> -g <gfffile.gff> -o <outfilename> [-i] [-c] --samf "<sam1.sam>,<sam1.column_header>" --samf "...<samN.column_header>" """ + sys.exit(1) + +if __name__ == "__main__": + """ + <command interpreter="python"> + htseqsams2mx.py -w "$halfwin" -g "$gfffile" -o "$outfile" -m "union" + #for $s in $samfiles: + --samf "'${s.samf}','${s.samf.name}'" + #end for + </command> + """ + if len(sys.argv) < 2: + usage() + sys.exit(1) + starttime = time.time() + op = optparse.OptionParser() + # All tools + op.add_option('-w', '--halfwindow', default="0") + op.add_option('-m', '--mode', default="union") + op.add_option('-s', '--stranded', default="no") + op.add_option('-y', '--feature_type', default="exon") + op.add_option('-g', '--gff_file', default=None) + op.add_option('-o', '--outfname', default=None) + op.add_option('-f','--forceName', default="false") + op.add_option('--samf', default=[], action="append") + op.add_option('--filter_extras', default=[], action="append") + op.add_option('--mapqMin', default='0') + op.add_option( "-t", "--type", type="string", dest="featuretype", + default = "exon", help = "feature type (3rd column in GFF file) to be used, " + + "all features of other type are ignored (default, suitable for Ensembl " + + "GTF files: exon)" ) + + op.add_option( "-i", "--id_attribute", type="string", dest="id_attribute", + default = "gene_name", help = "GTF attribute to be used as feature ID (default, " + + "suitable for Ensembl GTF files: gene_id)" ) + + op.add_option( "-q", "--quiet", action="store_true", dest="quiet", default = False, + help = "suppress progress report and warnings" ) + opts, args = op.parse_args() + halfwindow = int(opts.halfwindow) + gff_file = 
opts.gff_file + assert os.path.isfile(gff_file),'##ERROR htseqsams2mx: Supplied input GFF file "%s" not found' % gff_file + outfname = opts.outfname + sam_filenames = [] + colnames = [] + samf = opts.samf + samfsplit = [x.split(',') for x in samf] # one per samf set + samsets = [] + for samfs in samfsplit: + samset = [x.replace("'","") for x in samfs] + samset = [x.replace('"','') for x in samset] + samsets.append(samset) + samsets = [x for x in samsets if x[0].lower() != 'none'] + # just cannot stop getting these on cl! wtf in cheetah for a repeat group? + samfnames = [x[0] for x in samsets] + if len(set(samfnames)) != len(samfnames): + samnames = [] + delme = [] + for i,s in enumerate(samfnames): + if s in samnames: + delme.append(i) + print sys.stdout,'## WARNING htseqsams2mx: Duplicate input sam file %s in %s - ignoring dupe in 0 based position %s' %\ + (s,','.join(samfnames), str(delme)) + else: + samnames.append(s) # first time + samsets = [x for i,x in enumerate(samsets) if not (i in delme)] + samfnames = [x[0] for x in samsets] + scolnames = [x[1]for x in samsets] + assert len(samfnames) == len(scolnames), '##ERROR sams2mx: Count of sam/cname not consistent - %d/%d' % (len(samfnames),len(scolnames)) + sam_exts = [x[2] for x in samsets] + assert len(samfnames) == len(sam_exts), '##ERROR sams2mx: Count of extensions not consistent - %d/%d' % (len(samfnames),len(sam_exts)) + sam_bais = [x[3] for x in samsets] # these only exist for bams and need to be finessed with a symlink so pysam will just work + for i,b in enumerate(samfnames): + assert os.path.isfile(b),'## Supplied input sam file "%s" not found' % b + sam_filenames.append(b) + sampName = scolnames[i] # better be unique + sampName = sampName.replace('#','') # for R + sampName = sampName.replace('(','') # for R + sampName = sampName.replace(')','') # for R + sampName = sampName.replace(' ','_') # for R + colnames.append(sampName) + counts,empty,ambiguous,lowqual,notaligned,nonunique,filtered,nreads = 
htseqMX(gff_file, sam_filenames,colnames,sam_exts,sam_bais,opts) + heads = '\t'.join(['Contig',] + colnames) + res = [heads,] + contigs = counts.keys() + contigs.sort() + totalc = 0 + emptycontigs = 0 + for contig in contigs: + thisc = sum(counts[contig]) + if thisc > 0: # no output for empty contigs + totalc += thisc + crow = [contig,] + ['%d' % x for x in counts[contig]] + res.append('\t'.join(crow)) + else: + emptycontigs += 1 + outf = open(opts.outfname,'w') + outf.write('\n'.join(res)) + outf.write('\n') + outf.close() + walltime = int(time.time() - starttime) + accumulatornames = ('walltime (seconds)','total reads read','total reads counted','number of contigs','total empty reads','total ambiguous reads','total low quality reads', + 'total not aligned reads','total not unique mapping reads','extra filtered reads','empty contigs') + accums = (walltime,nreads,totalc,len(contigs),empty,ambiguous,lowqual,notaligned,nonunique,filtered,emptycontigs) + fracs = (1.0,1.0,float(totalc)/nreads,1.0,float(empty)/nreads,float(ambiguous)/nreads,float(lowqual)/nreads,float(notaligned)/nreads,float(nonunique)/nreads,float(filtered)/nreads,float(emptycontigs)/len(contigs)) + notes = ['%s = %d (%2.3f)' % (accumulatornames[i],x,100.0*fracs[i]) for i,x in enumerate(accums)] + print >> sys.stdout, '\n'.join(notes) + sys.exit(0)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/htseqsams2mx.xml Tue Apr 28 22:56:39 2015 -0400 @@ -0,0 +1,133 @@ +<tool id="htseqsams2mxlocal" name="SAM/BAM to count matrix" version="0.5"> + <description>using HTSeq code</description> + <stdio> + <regex match=".*" source="both" level="warning" description="chatter from HTSeq:"/> + </stdio> + <requirements> + <requirement type="package" version="0.7.6">pysam</requirement> + <requirement type="package" version="1.2.1">matplotlib</requirement> + <requirement type="package" version="0.5.4p3">htseq</requirement> + </requirements> + <command interpreter="python"> + htseqsams2mx.py -g "$gfffile" -o "$outfile" -m "$model" --id_attribute "$id_attr" --feature_type "$feature_type" + --mapqMin $mapqMin + #for $s in $samfiles: + #if $s.ext != 'data': + --samf "'${s}','${s.name}','${s.ext}','${s.metadata.bam_index}'" + #end if + #end for + #if $filter_extras: + --filter_extras "$filter_extras" + #end if + </command> + <inputs> + <param format="gtf" name="gfffile" type="data" label="Gene model (GFF) file to count reads over from your current history" size="100" /> + <param name="mapqMin" label="Filter reads with mapq below than this value" + help="0 to count any mapping quality read. 
Otherwise only reads at or above specified mapq will be counted" + type="integer" value="5"/> + <param name="title" label="Name for this job's output file" type="text" size="80" value="bams to DGE count matrix"/> + <param name="stranded" value="false" type="boolean" label="Reads are stranded - use strand in counting" display="checkbox" + truevalue="yes" falsevalue="no" checked="no" help="Check this ONLY if you know your sequences are strand specific" /> + <param name="model" type="select" label="Model for counting reads over the supplied gene model- see HTSeq docs" + help="If in doubt, union is a reasonable default but intersection-strict avoids double counting over overlapping exons"> + <option value="union" selected="true">union</option> + <option value="intersection-strict">intersection-strict</option> + <option value="intersection-nonempty">intersection-nonempty</option> + </param> + <param name="id_attr" type="select" label="GTF attribute to output as the name for each contig - see HTSeq docs" + help="If in doubt, use gene name or if you need the id in your GTF, gene id"> + <option value="gene_name" selected="true">gene name</option> + <option value="gene_id">gene id</option> + <option value="transcript_id">transcript id</option> + <option value="transcript_name">transcript name</option> + </param> + <param name="feature_type" type="select" label="GTF feature type for counting reads over the supplied gene model- see HTSeq docs" + help="GTF feature type to count over - exon is a good choice with gene name as the contig to count over"> + <option value="exon" selected="true">exon</option> + <option value="CDS">CDS</option> + <option value="UTR">UTR</option> + <option value="transcript">transcript</option> + </param> + <param name="filter_extras" type="select" label="Filter any read with one or more flags" + help="eg the XS tag created by bowtie for multiple reads" optional="true" mutliple="true"> + <option value="">None</option> + <option value="XS">XS:i > 0 - 
More than one mapping position Bowtie</option> + <option value="XS:A">Might be useful for tophat</option> + </param> + + <param name="samfiles" type="data" label="bam/sam file from your history" format="sam,bam" size="100" multiple="true"/> + </inputs> + <outputs> + <data format="tabular" name="outfile" label="${title}_htseqsams2mx.xls" /> + </outputs> + <tests> + <test> + <param name="feature_type" value="exon" /> + <param name="gfffile" value="rn4_chr20_100k.gtf" /> + <param name="samfiles" value="rn4chr20test1.bam,rn4chr20test2.bam" ftype="bam"/> + <param name="id_attr" value="gene_name" /> + <param name="model" value="union" /> + <param name="stranded" value="no" /> + <param name="title" value="htseqtest" /> + <param name="mapqMin" value="0" /> + + <output name="outfile" file="htseqsams2mx_test1_out.xls" lines_diff="1"/> + </test> + </tests> + <help> + +**What this tool does** + +Counts reads in multiple sam/bam format mapped files and generates a matrix ideal for edgeR and other count-based tools. +It uses HTSeq to count your sam reads over a gene model supplied as a GTF file. +The output is a tabular text (columnar - spreadsheet) file containing the +count matrix for downstream processing. Each row contains the counts from each sample for each +of the non-empty GTF input file contigs matching the GTF attribute choice above. +You probably want to use gene level GTF output attribute and count reads that overlap +GTF exons for RNA-seq. Or you can count over exons by using transcript level output names or ids. Etc. + +---- + +**Author's plea on replicates** + +If you want to interpret the downstream p values in terms of rejecting or accepting the null hypothesis +under random sampling with replacement from the universe of possible biological/experimental replicates from which your data was derived, +which is what published p values are often assumed to do, then you need biological +(or, for cell culture material, experimental) replicates.
+ +Using technical or no replicates means the downstream p values are not interpretable the way most people would assume +they are - i.e. as the probability of obtaining a result as extreme as, or more extreme than, your experimental data +in millions of experiments conducted using the same methods under the null hypothesis. + +There is no way around this and it is scientific fraud to ignore this issue and publish bogus p values derived from +technical or no replicates without making the lack of biological or experimental error in the p value calculations +clear to your readers so they can adjust their expectations. However, the buck stops here at higher level inference. +If you have no replicates, you must not use this tool as the p values are uninterpretable. So there. + +See your stats 101 notes on the central limit theorem and test statistics for a refresher, or talk to a +statistician if this makes no sense. + +**Attribution** + +This Galaxy tool relies on HTSeq_ from http://www-huber.embl.de/users/anders/HTSeq/doc/index.html +for the tricky work of counting. That code includes the following attribution: + +## Written by Simon Anders (sanders@fs.tum.de), European Molecular Biology +## Laboratory (EMBL). (c) 2010. Released under the terms of the GNU General +## Public License v3. Part of the 'HTSeq' framework, version HTSeq-0.5.4p3 + +It will be automatically installed if you use the toolshed as, in general, you probably should. +HTSeq_ must be installed with this tool if you install manually. + +Otherwise, all code and documentation comprising this tool, including the requirement +for more than one sample bam, +was written by Ross Lazarus and is +licensed to you under the LGPL_ like other rgenetics artefacts. + +Sorry, I don't use readgroups so had no reason to code read groups. Contributions welcome. Send code. + +.. _LGPL: http://www.gnu.org/copyleft/lesser.html +.. _HTSeq: http://www-huber.embl.de/users/anders/HTSeq/doc/index.html + </help> + +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/generatetest.sh Tue Apr 28 22:56:39 2015 -0400 @@ -0,0 +1,1 @@ +python ../htseqsams2mx.py -g rn4_chr20_100k.gtf -o test.xls --samf "'rn4chr20test1.bam','rn4chr20test1.bam','bam','rn4chr20test1.bam.bai'" --samf "'rn4chr20test2.bam','rn4chr20test2.bam','bam','rn4chr20test2.bam.bai'"
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/htseqsams2mx_test1_out.xls Tue Apr 28 22:56:39 2015 -0400 @@ -0,0 +1,4 @@ +Contig rn4chr20test1.bam rn4chr20test2.bam +Clic2 494 944 +F1M7K0_RAT 3 2 +Tmlhe 164 172
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/rn4_chr20_100k.gtf Tue Apr 28 22:56:39 2015 -0400 @@ -0,0 +1,62 @@ +chr20 protein_coding CDS 801 1238 . + 0 exon_number "1"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000730"; gene_name "F1M7K0_RAT"; p_id "P14715"; protein_id "ENSRNOP00000000957"; transcript_id "ENSRNOT00000000957"; transcript_name "F1M7K0_RAT"; tss_id "TSS11562"; +chr20 protein_coding exon 801 1238 . + . exon_number "1"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000730"; gene_name "F1M7K0_RAT"; p_id "P14715"; transcript_id "ENSRNOT00000000957"; transcript_name "F1M7K0_RAT"; tss_id "TSS11562"; +chr20 protein_coding CDS 1742 1976 . + 0 exon_number "2"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000730"; gene_name "F1M7K0_RAT"; p_id "P14715"; protein_id "ENSRNOP00000000957"; transcript_id "ENSRNOT00000000957"; transcript_name "F1M7K0_RAT"; tss_id "TSS11562"; +chr20 protein_coding exon 1742 1976 . + . exon_number "2"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000730"; gene_name "F1M7K0_RAT"; p_id "P14715"; transcript_id "ENSRNOT00000000957"; transcript_name "F1M7K0_RAT"; tss_id "TSS11562"; +chr20 protein_coding CDS 2016 2177 . + 2 exon_number "3"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000730"; gene_name "F1M7K0_RAT"; p_id "P14715"; protein_id "ENSRNOP00000000957"; transcript_id "ENSRNOT00000000957"; transcript_name "F1M7K0_RAT"; tss_id "TSS11562"; +chr20 protein_coding exon 2016 2177 . + . exon_number "3"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000730"; gene_name "F1M7K0_RAT"; p_id "P14715"; transcript_id "ENSRNOT00000000957"; transcript_name "F1M7K0_RAT"; tss_id "TSS11562"; +chr20 protein_coding CDS 2263 2342 . 
+ 2 exon_number "4"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000730"; gene_name "F1M7K0_RAT"; p_id "P14715"; protein_id "ENSRNOP00000000957"; transcript_id "ENSRNOT00000000957"; transcript_name "F1M7K0_RAT"; tss_id "TSS11562"; +chr20 protein_coding exon 2263 2342 . + . exon_number "4"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000730"; gene_name "F1M7K0_RAT"; p_id "P14715"; transcript_id "ENSRNOT00000000957"; transcript_name "F1M7K0_RAT"; tss_id "TSS11562"; +chr20 protein_coding CDS 2345 2533 . + 0 exon_number "5"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000730"; gene_name "F1M7K0_RAT"; p_id "P14715"; protein_id "ENSRNOP00000000957"; transcript_id "ENSRNOT00000000957"; transcript_name "F1M7K0_RAT"; tss_id "TSS11562"; +chr20 protein_coding exon 2345 2533 . + . exon_number "5"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000730"; gene_name "F1M7K0_RAT"; p_id "P14715"; transcript_id "ENSRNOT00000000957"; transcript_name "F1M7K0_RAT"; tss_id "TSS11562"; +chr20 protein_coding CDS 19528 19708 . + 0 exon_number "1"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P13601"; protein_id "ENSRNOP00000044070"; transcript_id "ENSRNOT00000049573"; transcript_name "Tmlhe"; tss_id "TSS451"; +chr20 protein_coding CDS 19528 19708 . + 0 exon_number "1"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P3227"; protein_id "ENSRNOP00000000956"; transcript_id "ENSRNOT00000000956"; transcript_name "TMLH_RAT"; tss_id "TSS451"; +chr20 protein_coding exon 19528 19708 . + . exon_number "1"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P13601"; transcript_id "ENSRNOT00000049573"; transcript_name "Tmlhe"; tss_id "TSS451"; +chr20 protein_coding exon 19528 19708 . + . 
exon_number "1"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P3227"; transcript_id "ENSRNOT00000000956"; transcript_name "TMLH_RAT"; tss_id "TSS451"; +chr20 protein_coding start_codon 19528 19530 . + 0 exon_number "1"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P13601"; transcript_id "ENSRNOT00000049573"; transcript_name "Tmlhe"; tss_id "TSS451"; +chr20 protein_coding start_codon 19528 19530 . + 0 exon_number "1"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P3227"; transcript_id "ENSRNOT00000000956"; transcript_name "TMLH_RAT"; tss_id "TSS451"; +chr20 protein_coding CDS 21979 22014 . + 2 exon_number "2"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P13601"; protein_id "ENSRNOP00000044070"; transcript_id "ENSRNOT00000049573"; transcript_name "Tmlhe"; tss_id "TSS451"; +chr20 protein_coding exon 21979 22014 . + . exon_number "2"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P13601"; transcript_id "ENSRNOT00000049573"; transcript_name "Tmlhe"; tss_id "TSS451"; +chr20 protein_coding CDS 25349 25525 . + 2 exon_number "3"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P13601"; protein_id "ENSRNOP00000044070"; transcript_id "ENSRNOT00000049573"; transcript_name "Tmlhe"; tss_id "TSS451"; +chr20 protein_coding CDS 25349 25525 . + 2 exon_number "2"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P3227"; protein_id "ENSRNOP00000000956"; transcript_id "ENSRNOT00000000956"; transcript_name "TMLH_RAT"; tss_id "TSS451"; +chr20 protein_coding exon 25349 25525 . + . exon_number "3"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P13601"; transcript_id "ENSRNOT00000049573"; transcript_name "Tmlhe"; tss_id "TSS451"; +chr20 protein_coding exon 25349 25525 . + . 
exon_number "2"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P3227"; transcript_id "ENSRNOT00000000956"; transcript_name "TMLH_RAT"; tss_id "TSS451"; +chr20 protein_coding CDS 35197 35476 . + 2 exon_number "4"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P13601"; protein_id "ENSRNOP00000044070"; transcript_id "ENSRNOT00000049573"; transcript_name "Tmlhe"; tss_id "TSS451"; +chr20 protein_coding CDS 35197 35476 . + 2 exon_number "3"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P3227"; protein_id "ENSRNOP00000000956"; transcript_id "ENSRNOT00000000956"; transcript_name "TMLH_RAT"; tss_id "TSS451"; +chr20 protein_coding exon 35197 35476 . + . exon_number "4"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P13601"; transcript_id "ENSRNOT00000049573"; transcript_name "Tmlhe"; tss_id "TSS451"; +chr20 protein_coding exon 35197 35476 . + . exon_number "3"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P3227"; transcript_id "ENSRNOT00000000956"; transcript_name "TMLH_RAT"; tss_id "TSS451"; +chr20 protein_coding CDS 36764 36883 . + 1 exon_number "5"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P13601"; protein_id "ENSRNOP00000044070"; transcript_id "ENSRNOT00000049573"; transcript_name "Tmlhe"; tss_id "TSS451"; +chr20 protein_coding CDS 36764 36883 . + 1 exon_number "4"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P3227"; protein_id "ENSRNOP00000000956"; transcript_id "ENSRNOT00000000956"; transcript_name "TMLH_RAT"; tss_id "TSS451"; +chr20 protein_coding exon 36764 36883 . + . 
exon_number "5"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P13601"; transcript_id "ENSRNOT00000049573"; transcript_name "Tmlhe"; tss_id "TSS451"; +chr20 protein_coding exon 36764 36883 . + . exon_number "4"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P3227"; transcript_id "ENSRNOT00000000956"; transcript_name "TMLH_RAT"; tss_id "TSS451"; +chr20 protein_coding CDS 49040 49276 . + 1 exon_number "6"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P13601"; protein_id "ENSRNOP00000044070"; transcript_id "ENSRNOT00000049573"; transcript_name "Tmlhe"; tss_id "TSS451"; +chr20 protein_coding CDS 49040 49276 . + 1 exon_number "5"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P3227"; protein_id "ENSRNOP00000000956"; transcript_id "ENSRNOT00000000956"; transcript_name "TMLH_RAT"; tss_id "TSS451"; +chr20 protein_coding exon 49040 49276 . + . exon_number "6"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P13601"; transcript_id "ENSRNOT00000049573"; transcript_name "Tmlhe"; tss_id "TSS451"; +chr20 protein_coding exon 49040 49276 . + . exon_number "5"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P3227"; transcript_id "ENSRNOT00000000956"; transcript_name "TMLH_RAT"; tss_id "TSS451"; +chr20 protein_coding CDS 55193 55331 . + 1 exon_number "7"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P13601"; protein_id "ENSRNOP00000044070"; transcript_id "ENSRNOT00000049573"; transcript_name "Tmlhe"; tss_id "TSS451"; +chr20 protein_coding CDS 55193 55331 . 
+ 1 exon_number "6"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P3227"; protein_id "ENSRNOP00000000956"; transcript_id "ENSRNOT00000000956"; transcript_name "TMLH_RAT"; tss_id "TSS451"; +chr20 protein_coding exon 55193 55331 . + . exon_number "7"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P13601"; transcript_id "ENSRNOT00000049573"; transcript_name "Tmlhe"; tss_id "TSS451"; +chr20 protein_coding exon 55193 55331 . + . exon_number "6"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P3227"; transcript_id "ENSRNOT00000000956"; transcript_name "TMLH_RAT"; tss_id "TSS451"; +chr20 protein_coding CDS 55883 56011 . + 0 exon_number "8"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P13601"; protein_id "ENSRNOP00000044070"; transcript_id "ENSRNOT00000049573"; transcript_name "Tmlhe"; tss_id "TSS451"; +chr20 protein_coding CDS 55883 56011 . + 0 exon_number "7"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P3227"; protein_id "ENSRNOP00000000956"; transcript_id "ENSRNOT00000000956"; transcript_name "TMLH_RAT"; tss_id "TSS451"; +chr20 protein_coding exon 55883 56124 . + . exon_number "8"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P13601"; transcript_id "ENSRNOT00000049573"; transcript_name "Tmlhe"; tss_id "TSS451"; +chr20 protein_coding exon 55883 56124 . + . exon_number "7"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P3227"; transcript_id "ENSRNOT00000000956"; transcript_name "TMLH_RAT"; tss_id "TSS451"; +chr20 protein_coding stop_codon 56012 56014 . + 0 exon_number "8"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P13601"; transcript_id "ENSRNOT00000049573"; transcript_name "Tmlhe"; tss_id "TSS451"; +chr20 protein_coding stop_codon 56012 56014 . 
+ 0 exon_number "7"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000729"; gene_name "Tmlhe"; p_id "P3227"; transcript_id "ENSRNOT00000000956"; transcript_name "TMLH_RAT"; tss_id "TSS451"; +chr20 protein_coding exon 66518 66785 . + . exon_number "1"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000728"; gene_name "Clic2"; p_id "P19357"; transcript_id "ENSRNOT00000000955"; transcript_name "Clic2"; tss_id "TSS24592"; +chr20 protein_coding CDS 66729 66785 . + 0 exon_number "1"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000728"; gene_name "Clic2"; p_id "P19357"; protein_id "ENSRNOP00000000955"; transcript_id "ENSRNOT00000000955"; transcript_name "Clic2"; tss_id "TSS24592"; +chr20 protein_coding start_codon 66729 66731 . + 0 exon_number "1"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000728"; gene_name "Clic2"; p_id "P19357"; transcript_id "ENSRNOT00000000955"; transcript_name "Clic2"; tss_id "TSS24592"; +chr20 protein_coding CDS 75931 76040 . + 0 exon_number "2"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000728"; gene_name "Clic2"; p_id "P19357"; protein_id "ENSRNOP00000000955"; transcript_id "ENSRNOT00000000955"; transcript_name "Clic2"; tss_id "TSS24592"; +chr20 protein_coding exon 75931 76040 . + . exon_number "2"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000728"; gene_name "Clic2"; p_id "P19357"; transcript_id "ENSRNOT00000000955"; transcript_name "Clic2"; tss_id "TSS24592"; +chr20 protein_coding CDS 76165 76290 . + 1 exon_number "3"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000728"; gene_name "Clic2"; p_id "P19357"; protein_id "ENSRNOP00000000955"; transcript_id "ENSRNOT00000000955"; transcript_name "Clic2"; tss_id "TSS24592"; +chr20 protein_coding exon 76165 76290 . + . exon_number "3"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000728"; gene_name "Clic2"; p_id "P19357"; transcript_id "ENSRNOT00000000955"; transcript_name "Clic2"; tss_id "TSS24592"; +chr20 protein_coding CDS 79941 80047 . 
+ 1 exon_number "4"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000728"; gene_name "Clic2"; p_id "P19357"; protein_id "ENSRNOP00000000955"; transcript_id "ENSRNOT00000000955"; transcript_name "Clic2"; tss_id "TSS24592"; +chr20 protein_coding exon 79941 80047 . + . exon_number "4"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000728"; gene_name "Clic2"; p_id "P19357"; transcript_id "ENSRNOT00000000955"; transcript_name "Clic2"; tss_id "TSS24592"; +chr20 protein_coding CDS 80692 80873 . + 2 exon_number "5"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000728"; gene_name "Clic2"; p_id "P19357"; protein_id "ENSRNOP00000000955"; transcript_id "ENSRNOT00000000955"; transcript_name "Clic2"; tss_id "TSS24592"; +chr20 protein_coding exon 80692 80873 . + . exon_number "5"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000728"; gene_name "Clic2"; p_id "P19357"; transcript_id "ENSRNOT00000000955"; transcript_name "Clic2"; tss_id "TSS24592"; +chr20 protein_coding CDS 81142 81294 . + 0 exon_number "6"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000728"; gene_name "Clic2"; p_id "P19357"; protein_id "ENSRNOP00000000955"; transcript_id "ENSRNOT00000000955"; transcript_name "Clic2"; tss_id "TSS24592"; +chr20 protein_coding exon 81142 81536 . + . exon_number "6"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000728"; gene_name "Clic2"; p_id "P19357"; transcript_id "ENSRNOT00000000955"; transcript_name "Clic2"; tss_id "TSS24592"; +chr20 protein_coding stop_codon 81295 81297 . + 0 exon_number "6"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000000728"; gene_name "Clic2"; p_id "P19357"; transcript_id "ENSRNOT00000000955"; transcript_name "Clic2"; tss_id "TSS24592"; +chr20 protein_coding exon 92810 93748 . - . 
exon_number "1"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000029622"; gene_name "Olr1668"; p_id "P5423"; transcript_id "ENSRNOT00000047483"; transcript_name "Olr1668"; tss_id "TSS17091"; +chr20 protein_coding stop_codon 92810 92812 . - 0 exon_number "1"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000029622"; gene_name "Olr1668"; p_id "P5423"; transcript_id "ENSRNOT00000047483"; transcript_name "Olr1668"; tss_id "TSS17091"; +chr20 protein_coding CDS 92813 93748 . - 0 exon_number "1"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000029622"; gene_name "Olr1668"; p_id "P5423"; protein_id "ENSRNOP00000042115"; transcript_id "ENSRNOT00000047483"; transcript_name "Olr1668"; tss_id "TSS17091"; +chr20 protein_coding start_codon 93746 93748 . - 0 exon_number "1"; gene_biotype "protein_coding"; gene_id "ENSRNOG00000029622"; gene_name "Olr1668"; p_id "P5423"; transcript_id "ENSRNOT00000047483"; transcript_name "Olr1668"; tss_id "TSS17091";
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_dependencies.xml Tue Apr 28 22:56:39 2015 -0400 @@ -0,0 +1,35 @@ +<?xml version="1.0"?> +<tool_dependency> + <package name="pysam" version="0.7.6"> + <repository changeset_revision="247e5e5bee87" name="package_pysam_0_7_6" owner="iuc" prior_installation_required="True" toolshed="https://testtoolshed.g2.bx.psu.edu" /> + </package> + <package name="matplotlib" version="1.2.1"> + <repository changeset_revision="9f3e58477115" name="package_matplotlib_1_2" owner="iuc" prior_installation_required="True" toolshed="https://testtoolshed.g2.bx.psu.edu" /> + </package> + <package name="htseq" version="0.5.4p3"> + <install version="1.0"> + <actions> + <action type="download_by_url">https://pypi.python.org/packages/source/H/HTSeq/HTSeq-0.6.1.tar.gz</action> <!-- NOTE(review): this package is declared as version 0.5.4p3 but the URL fetches HTSeq-0.6.1; confirm which version is intended --> + <action type="set_environment_for_install"> + <repository changeset_revision="247e5e5bee87" name="package_pysam_0_7_6" owner="iuc" toolshed="https://testtoolshed.g2.bx.psu.edu"> + <package name="pysam" version="0.7.6" /> + </repository> + <repository changeset_revision="9f3e58477115" name="package_matplotlib_1_2" owner="iuc" toolshed="https://testtoolshed.g2.bx.psu.edu"> + <package name="matplotlib" version="1.2.1" /> + </repository> + </action> + <action type="make_directory">$INSTALL_DIR/lib/python</action> <!-- Not sure why these must be made a priori, but install fails otherwise --> + <action type="make_directory">$INSTALL_DIR/lib64/python</action> <!-- Not sure why these must be made a priori, but install fails otherwise --> + <action type="shell_command">export PYTHONPATH=$PYTHONPATH:$INSTALL_DIR/lib/python:$INSTALL_DIR/lib64/python && python setup.py install --home $INSTALL_DIR --install-scripts $INSTALL_DIR/bin</action> + <action type="set_environment"> + <environment_variable action="append_to" name="PYTHONPATH">$INSTALL_DIR/lib/python:$INSTALL_DIR/lib64/python</environment_variable> + <environment_variable action="prepend_to"
name="PATH">$INSTALL_DIR/bin</environment_variable> + </action> + </actions> + </install> + <readme> + Installation of HTSeq requires Python 2.5+ (does not yet work with Python 3), pysam and the Numpy Python package. + Note this uses the matplotlib lite version dependent on the lite version of numpy - no atlas compilation + </readme> + </package> +</tool_dependency>