Mercurial > repos > fubar > htseq_bams_to_count_matrix
changeset 0:f2310e26012f draft
Uploaded
author | fubar |
---|---|
date | Thu, 06 Jun 2013 08:18:03 -0400 |
parents | |
children | 6aa7ffe331c0 |
files | htseq_bams_to_count_matrix/htseqsams2mx.py htseq_bams_to_count_matrix/htseqsams2mx.xml htseq_bams_to_count_matrix/tool_dependencies.xml |
diffstat | 3 files changed, 454 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
# htseq_bams_to_count_matrix/htseqsams2mx.py
# May 2013
# Change to htseq as the counting engine - wrap so arbitrary number of columns created
# borged Simon Anders' "count.py" since we need a vector of counts rather than a new sam file as output
# note attribution for htseq and count.py :
## Written by Simon Anders (sanders@fs.tum.de), European Molecular Biology
## Laboratory (EMBL). (c) 2010. Released under the terms of the GNU General
## Public License v3. Part of the 'HTSeq' framework, version HTSeq-0.5.4p3
# updated ross lazarus august 2011 to NOT include region and to finesse the name as the region for bed3 format inputs
# also now sums all duplicate named regions and provides a summary of any collapsing as the info
# updated ross lazarus july 26 to respect the is_duplicate flag rather than try to second guess
# note Heng Li argues that removing dupes is a bad idea for RNA seq
# updated ross lazarus july 22 to count reads OUTSIDE each bed region during the processing of each bam
# added better sorting with decoration of a dict key later sorted and undecorated.
# code cleaned up and galaxified ross lazarus july 18 et seq.
# bams2mx.py -turns a series of bam and a bed file into a matrix of counts Usage bams2mx.py <halfwindow> <bedfile.bed> <bam1.bam>
# <bam2.bam>
# uses pysam to read and count bam reads over each bed interval for each sample for speed
# still not so fast
# TODO options -shift -unique
#
"""
how this gets run:

(vgalaxy)galaxy@iaas1-int:~$ cat database/job_working_directory/027/27014/galaxy_27014.sh
#!/bin/sh
GALAXY_LIB="/data/extended/galaxy/lib"
if [ "$GALAXY_LIB" != "None" ]; then
    if [ -n "$PYTHONPATH" ]; then
        PYTHONPATH="$GALAXY_LIB:$PYTHONPATH"
    else
        PYTHONPATH="$GALAXY_LIB"
    fi
    export PYTHONPATH
fi

cd /data/extended/galaxy/database/job_working_directory/027/27014
python /data/extended/galaxy/tools/rgenetics/htseqsams2mx.py -g "/data/extended/galaxy/database/files/034/dataset_34115.dat" -o "/data/extended/galaxy/database/files/034/dataset_34124.dat" -m "union" --id_attribute "gene_id" --feature_type "exon" --samf "'/data/extended/galaxy/database/files/033/dataset_33980.dat','T5A_C1PPHACXX_AGTTCC_L003_R1.fastq_bwa.sam'" --samf "'/data/extended/galaxy/database/files/033/dataset_33975.dat','T5A_C1PPHACXX_AGTTCC_L002_R1.fastq_bwa.sam'"; cd /data/extended/galaxy; /data/extended/galaxy/set_metadata.sh ./database/files /data/extended/galaxy/database/job_working_directory/027/27014 . /data/extended/galaxy/universe_wsgi.ini /data/tmp/tmpmwsElH /data/extended/galaxy/database/job_working_directory/027/27014/galaxy.json /data/extended/galaxy/database/job_working_directory/027/27014/metadata_in_HistoryDatasetAssociation_45202_sfOMGa,/data/extended/galaxy/database/job_working_directory/027/27014/metadata_kwds_HistoryDatasetAssociation_45202_gaMnxa,/data/extended/galaxy/database/job_working_directory/027/27014/metadata_out_HistoryDatasetAssociation_45202_kZPsZO,/data/extended/galaxy/database/job_working_directory/027/27014/metadata_results_HistoryDatasetAssociation_45202_bXU7IU,,/data/extended/galaxy/database/job_working_directory/027/27014/metadata_override_HistoryDatasetAssociation_45202_hyLAvh
echo $? > /data/extended/galaxy/database/job_working_directory/027/27014/galaxy_27014.ec

"""

import os
import re
import sys
import HTSeq.scripts.count as htcount
import optparse
import tempfile
import shutil
import operator
import subprocess
import itertools
import warnings
import traceback
import HTSeq
import time


class Xcpt(Exception):
    """Simple exception that keeps its message in ``msg``."""

    def __init__(self, msg):
        # Hand msg to Exception as well so str(exc) and tracebacks show it.
        Exception.__init__(self, msg)
        self.msg = msg


def keynat(s=None):
    '''
    borrowed from http://code.activestate.com/recipes/285264-natural-string-sorting/
    A natural sort helper function for sort() and sorted()
    without using regular expressions or exceptions.

    Returns a list in which every run of digits in ``s`` is collapsed into one
    int and every other character is lower-cased.  If ``s`` is a list or tuple
    only its first element is used.

    >>> keynat('10th')
    [10, 't', 'h']
    >>> keynat(('a1B', 'ignored'))
    ['a', 1, 'b']

    Under Python 2, sorted(('Z', 'a', '10th', '1st', '9'), key=keynat) gives
    ['1st', '9', '10th', 'a', 'Z'].  The keys mix ints and strings, which
    Python 3 refuses to compare, so that usage is Python 2 only.
    '''
    if isinstance(s, (list, tuple)):
        s = s[0]
    r = []
    for c in s:
        if c.isdigit():
            d = int(c)
            if r and isinstance(r[-1], int):
                # extend the number currently being accumulated
                r[-1] = r[-1] * 10 + d
            else:
                r.append(d)
        else:
            r.append(c.lower())
    return r


def sort_table(table, cols):
    """ sort a table by multiple columns
        table: a list of lists (or tuple of tuples) where each inner list
               represents a row
        cols:  a list (or tuple) specifying the column numbers to sort by
               e.g. (1,0) would sort by column 1, then by column 0

        Returns a new sorted list; the input table is not modified.
    """
    # sorted() is stable, so sorting on the least significant column first
    # and the most significant last yields the requested multi-column order.
    for col in reversed(cols):
        table = sorted(table, key=operator.itemgetter(col))
    return table


def htseqMX(gff_filename, sam_filenames, colnames, opts):
    """Count reads from several SAM files over the features of one GFF file.

    gff_filename:  path of the GFF/GTF gene model
    sam_filenames: list of SAM file paths, one per output column
    colnames:      list of column labels, parallel to sam_filenames
    opts:          parsed options; the attributes read here are quiet,
                   stranded, mapqMin, feature_type, id_attribute and mode

    Returns (counts, empty, ambiguous, lowqual, notaligned, nonunique) where
    counts maps each feature id to a list with one count per SAM file and the
    remaining values are totals accumulated over all SAM files.

    Raises AssertionError if gff_filename is not an existing file.  Any
    failure while counting is reported on stderr and ends the process with
    exit status 1.
    """

    class UnknownChrom(Exception):
        pass

    def my_showwarning(message, category, filename, lineno=None, line=None):
        sys.stderr.write("Warning: %s\n" % message)

    def invert_strand(iv):
        iv2 = iv.copy()
        if iv2.strand == "+":
            iv2.strand = "-"
        elif iv2.strand == "-":
            iv2.strand = "+"
        else:
            raise ValueError("Illegal strand")
        return iv2

    def count_reads_in_features(sam_filenames, colnames, gff_filename, opts):
        """ Hacked version of htseq count.py
        """
        if opts.quiet:
            warnings.filterwarnings(action="ignore", module="HTSeq")
        features = HTSeq.GenomicArrayOfSets("auto", opts.stranded != "no")
        mapqMin = int(opts.mapqMin)
        counts = {}
        empty = 0
        ambiguous = 0
        notaligned = 0
        lowqual = 0
        nonunique = 0
        gff = HTSeq.GFF_Reader(gff_filename)
        gff_lines = 0  # stays 0 for an empty GFF instead of leaving the loop variable unbound
        # The bare excepts in this function are deliberate: they also see the
        # SystemExit raised by sys.exit() below, add the input line number to
        # stderr and always re-raise.
        try:
            for i, f in enumerate(gff):
                gff_lines = i + 1
                if f.type == opts.feature_type:
                    try:
                        feature_id = f.attr[opts.id_attribute]
                    except KeyError:
                        sys.exit("Feature at row %d %s does not contain a '%s' attribute" %
                                 ((i + 1), f.name, opts.id_attribute))
                    if opts.stranded != "no" and f.iv.strand == ".":
                        sys.exit("Feature %s at %s does not have strand information but you are "
                                 "running htseq-count in stranded mode. Use '--stranded=no'." %
                                 (f.name, f.iv))
                    features[f.iv] += feature_id
                    # one slot per sam file; the sam file index (sami) bumps the right slot later
                    counts[feature_id] = [0] * len(colnames)
        except:
            sys.stderr.write("Error occured in %s.\n" % gff.get_line_number_string())
            raise

        if not opts.quiet:
            sys.stdout.write("%d GFF lines processed.\n" % gff_lines)

        if len(counts) == 0 and not opts.quiet:
            sys.stdout.write("Warning: No features of type '%s' found.\n" % opts.feature_type)
        for sami, sam_filename in enumerate(sam_filenames):
            try:
                read_seq = HTSeq.SAM_Reader(sam_filename)
                # peek at the first record only to learn whether the file is paired end
                first_read = next(iter(read_seq))
                pe_mode = first_read.paired_end
            except:
                sys.stderr.write("Error occured when reading first line of sam file %s\n" % sam_filename)
                raise

            reads_seen = 0
            try:
                if pe_mode:
                    read_seq_pe_file = read_seq
                    read_seq = HTSeq.pair_SAM_alignments(read_seq)
                for seqi, r in enumerate(read_seq):
                    reads_seen = seqi + 1
                    if not pe_mode:
                        if not r.aligned:
                            notaligned += 1
                            continue
                        try:
                            if r.optional_field("NH") > 1:
                                nonunique += 1
                                continue
                        except KeyError:
                            # no NH tag on this record: treat it as uniquely aligned
                            pass
                        if r.aQual < mapqMin:
                            lowqual += 1
                            continue
                        if opts.stranded != "reverse":
                            iv_seq = (co.ref_iv for co in r.cigar if co.type == "M" and co.size > 0)
                        else:
                            iv_seq = (invert_strand(co.ref_iv) for co in r.cigar if co.type == "M" and co.size > 0)
                    else:
                        if r[0] is not None and r[0].aligned:
                            if opts.stranded != "reverse":
                                iv_seq = (co.ref_iv for co in r[0].cigar if co.type == "M" and co.size > 0)
                            else:
                                iv_seq = (invert_strand(co.ref_iv) for co in r[0].cigar if co.type == "M" and co.size > 0)
                        else:
                            iv_seq = tuple()
                        if r[1] is not None and r[1].aligned:
                            # the second mate is strand-inverted relative to the first
                            if opts.stranded != "reverse":
                                iv_seq = itertools.chain(
                                    iv_seq,
                                    (invert_strand(co.ref_iv) for co in r[1].cigar if co.type == "M" and co.size > 0))
                            else:
                                iv_seq = itertools.chain(
                                    iv_seq,
                                    (co.ref_iv for co in r[1].cigar if co.type == "M" and co.size > 0))
                        else:
                            if (r[0] is None) or not (r[0].aligned):
                                notaligned += 1
                                continue
                        try:
                            if (r[0] is not None and r[0].optional_field("NH") > 1) or \
                               (r[1] is not None and r[1].optional_field("NH") > 1):
                                nonunique += 1
                                continue
                        except KeyError:
                            pass
                        if (r[0] and r[0].aQual < mapqMin) or (r[1] and r[1].aQual < mapqMin):
                            lowqual += 1
                            continue

                    try:
                        if opts.mode == "union":
                            fs = set()
                            for iv in iv_seq:
                                if iv.chrom not in features.chrom_vectors:
                                    raise UnknownChrom
                                for iv2, fs2 in features[iv].steps():
                                    fs = fs.union(fs2)
                        elif opts.mode == "intersection-strict" or opts.mode == "intersection-nonempty":
                            fs = None
                            for iv in iv_seq:
                                if iv.chrom not in features.chrom_vectors:
                                    raise UnknownChrom
                                for iv2, fs2 in features[iv].steps():
                                    if len(fs2) > 0 or opts.mode == "intersection-strict":
                                        if fs is None:
                                            fs = fs2.copy()
                                        else:
                                            fs = fs.intersection(fs2)
                        else:
                            sys.exit("Illegal overlap mode %s" % opts.mode)
                        if fs is None or len(fs) == 0:
                            empty += 1
                        elif len(fs) > 1:
                            ambiguous += 1
                        else:
                            ck = list(fs)[0]
                            counts[ck][sami] += 1  # end up with counts for each sample as a list
                    except UnknownChrom:
                        # read aligned to a chromosome that is absent from the
                        # GFF: counted as empty, no per-read warning is written
                        empty += 1
            except:
                if not pe_mode:
                    sys.stderr.write("Error occured in %s.\n" % read_seq.get_line_number_string())
                else:
                    sys.stderr.write("Error occured in %s.\n" % read_seq_pe_file.get_line_number_string())
                raise

            if not opts.quiet:
                # report the number of records read from this file, not the file index
                sys.stdout.write("%d sam %s processed.\n" % (reads_seen, "lines " if not pe_mode else "line pairs"))
        return counts, empty, ambiguous, lowqual, notaligned, nonunique

    warnings.showwarning = my_showwarning
    if not os.path.isfile(gff_filename):
        # AssertionError is kept for backward compatibility, but raised
        # explicitly so the check still runs under "python -O".
        raise AssertionError('## unable to open supplied gff file %s' % gff_filename)
    try:
        counts, empty, ambiguous, lowqual, notaligned, nonunique = count_reads_in_features(
            sam_filenames, colnames, gff_filename, opts)
    except:
        # Top-level boundary: report what went wrong and where, then exit 1.
        sys.stderr.write("Error: %s\n" % str(sys.exc_info()[1]))
        sys.stderr.write("[Exception type: %s, raised in %s:%d]\n" %
                         (sys.exc_info()[1].__class__.__name__,
                          os.path.basename(traceback.extract_tb(sys.exc_info()[2])[-1][0]),
                          traceback.extract_tb(sys.exc_info()[2])[-1][1]))
        sys.exit(1)
    return counts, empty, ambiguous, lowqual, notaligned, nonunique


def usage():
    """Write the usage message to stderr and exit with status 1."""
    sys.stderr.write("""Usage: python htseqsams2mx.py -w <halfwindowsize> -g <gfffile.gff> -o <outfilename> [-i] [-c] --samf "<sam1.sam>,<sam1.column_header>" --samf "...<samN.column_header>" """ + "\n")
    sys.exit(1)


def _unquote(text):
    """Remove the single and double quotes the wrapper puts around a value."""
    return text.replace("'", '').replace('"', '')


def _parse_samf(samdat):
    """Split each --samf "'<path>','<label>'" value into parallel path and label lists."""
    paths = []
    labels = []
    for item in samdat:
        # Split on the first comma only, so a label that itself contains
        # commas is kept whole.
        path, sep, label = item.partition(',')
        if not sep:
            sys.exit('##ERROR sams2mx: --samf value "%s" is not of the form "<sam file>,<column header>"' % item)
        paths.append(_unquote(path))
        labels.append(_unquote(label))
    return paths, labels


def _clean_colname(name):
    """Make a sample label usable as an R column name: drop '#', '(' and ')', turn spaces into '_'."""
    for unwanted in ('#', '(', ')'):
        name = name.replace(unwanted, '')
    return name.replace(' ', '_')


def main():
    """Command line entry point: parse options, count reads, write the matrix.

    Invoked from the Galaxy tool wrapper roughly as

        <command interpreter="python">
        htseqsams2mx.py -w "$halfwin" -g "$gfffile" -o "$outfile" -m "union"
        #for $s in $samfiles:
        --samf "'${s.samf}','${s.samf.name}'"
        #end for
        </command>

    Writes a tab separated matrix (one row per feature id, one column per sam
    file) to the -o file and a one-line comma separated summary to stdout.
    """
    if len(sys.argv) < 2:
        usage()
    starttime = time.time()
    op = optparse.OptionParser()
    # -w and -f are still accepted so existing command lines keep working,
    # but nothing in this script reads their values.
    op.add_option('-w', '--halfwindow', default="0")
    op.add_option('-m', '--mode', default="union")
    op.add_option('-s', '--stranded', default="no")
    # -t/--type used to be stored in a separate, never-read destination and
    # was silently ignored; it is now an alias of -y/--feature_type.
    op.add_option('-y', '-t', '--feature_type', '--type', type="string", dest="feature_type",
                  default="exon", help="feature type (3rd column in GFF file) to be used, " +
                  "all features of other type are ignored (default, suitable for Ensembl " +
                  "GTF files: exon)")
    op.add_option('-g', '--gff_file', default=None)
    op.add_option('-o', '--outfname', default=None)
    op.add_option('-f', '--forceName', default="false")
    op.add_option('--samf', default=[], action="append")
    op.add_option('--mapqMin', default='0')
    op.add_option("-i", "--id_attribute", type="string", dest="id_attribute",
                  default="gene_name", help="GTF attribute to be used as feature ID " +
                  "(default: gene_name)")
    op.add_option("-q", "--quiet", action="store_true", dest="quiet", default=False,
                  help="suppress progress report and warnings")
    opts, args = op.parse_args()

    # Validate inputs with explicit checks rather than assert, which is
    # stripped under "python -O" and fails with TypeError when -g is missing.
    gff_file = opts.gff_file
    if not gff_file or not os.path.isfile(gff_file):
        sys.exit('##ERROR htseqsams2mx: Supplied input GFF file "%s" not found' % gff_file)
    if not opts.outfname:
        # fail before the expensive counting, not when opening the output afterwards
        sys.exit('##ERROR htseqsams2mx: No output file name supplied (-o)')
    samf, scolnames = _parse_samf(opts.samf)
    if len(set(samf)) != len(samf):
        sys.exit('## ERROR sams2mx: Duplicate input sam file in %s' % ','.join(samf))
    for b in samf:
        if not os.path.isfile(b):
            sys.exit('## Supplied input sam file "%s" not found' % b)
    sam_filenames = list(samf)
    colnames = [_clean_colname(label) for label in scolnames]  # labels had better be unique

    counts, empty, ambiguous, lowqual, notaligned, nonunique = htseqMX(gff_file, sam_filenames, colnames, opts)

    heads = '\t'.join(['Contig', ] + colnames)
    res = [heads, ]
    contigs = sorted(counts)
    for contig in contigs:
        crow = [contig, ] + ['%d' % x for x in counts[contig]]
        res.append('\t'.join(crow))
    with open(opts.outfname, 'w') as outf:
        outf.write('\n'.join(res))
        outf.write('\n')

    walltime = int(time.time() - starttime)
    # Names and values must stay parallel; walltime was previously missing
    # from the values, which shifted every statistic under the wrong name.
    accumulatornames = ('walltimeseconds', 'contigs', 'emptyread', 'ambiguous', 'lowqual', 'notaligned', 'nonunique')
    accumulators = (walltime, len(contigs), empty, ambiguous, lowqual, notaligned, nonunique)
    notes = ['%s=%d' % pair for pair in zip(accumulatornames, accumulators)]
    sys.stdout.write(','.join(notes) + '\n')


if __name__ == "__main__":
    main()
<!-- htseq_bams_to_count_matrix/htseqsams2mx.xml
     Galaxy tool definition wrapping htseqsams2mx.py.
     The command passes every form value the script reads: the gene model,
     the output file, the counting model, the id attribute, the feature type,
     the minimum mapping quality and one samf argument per SAM input.
     The title param is used only for the output label; forceName is shown
     in the form but is not passed to the script. -->
<tool id="htseqsams2mx" name="Multiple SAMs to count matrix" version="0.2">
  <description>for DGE</description>
  <requirements>
      <requirement type="package" version="0.5.4">package_htseq_0_5_4</requirement>
  </requirements>
  <command interpreter="python">
    htseqsams2mx.py -g "$gfffile" -o "$outfile" -m "$model" --id_attribute "$id_attr" --feature_type "$feature_type" --mapqMin "$mapqMin"
    --samf "'$firstsamf','${firstsamf.name}'"
    #for $s in $samfiles:
    --samf "'${s.samf}','${s.samf.name}'"
    #end for
  </command>
  <inputs>
    <param format="gtf" name="gfffile" type="data" label="Gene model (GFF) file to count reads over from your current history" size="100" />
    <!-- mapqMin was previously declared here but never put on the command
         line, so the script always ran with its own default of 0 -->
    <param name="mapqMin" label="Filter reads with mapq below this value"
        help="0 to count any mapping quality read. Otherwise only reads at or above specified mapq will be counted"
        type="integer" value="5"/>
    <param name="title" label="Name for this job's output file" type="text" size="80" value="bams to DGE matrix"/>
    <param name="forceName" value="false" type="boolean" label="Force replacement to chr:start-offset as the name for each contig in the output"
        truevalue="true" falsevalue="false" checked="no" help="Leave as false to use the contig names as supplied in your bed file" />
    <param name="model" type="select" label="Model for counting reads over the supplied gene model- see HTSeq docs"
        help="If in doubt, union is a reasonable default but intersection-strict avoids double counting over overlapping exons">
      <option value="union" selected="true">union</option>
      <option value="intersection-strict">intersection-strict</option>
      <option value="intersection-nonempty">intersection-nonempty</option>
    </param>
    <param name="id_attr" type="select" label="GTF attribute to output as the name for each contig - see HTSeq docs"
        help="If in doubt, gene name is the only option right now">
      <option value="gene_name" selected="true">gene name</option>
      <option value="gene_id">gene id</option>
      <option value="transcript_id">transcript id</option>
      <option value="transcript_name">transcript name</option>
    </param>
    <param name="feature_type" type="select" label="GTF feature type for counting reads over the supplied gene model- see HTSeq docs"
        help="exon is all">
      <option value="exon" selected="true">exon</option>
      <option value="CDS">CDS</option>
      <option value="UTR">UTR</option>
      <option value="transcript">transcript</option>
    </param>
    <param name="firstsamf" type="data" label="SAM file from your history to count reads overlapping gene model regions" format="sam" />
    <repeat name="samfiles" title="Additional SAM files from your history to count reads overlapping gene model regions">
      <param name="samf" type="data" label="Additional SAM file from your history" format="sam" size="100"/>
    </repeat>
  </inputs>
  <outputs>
    <data format="tabular" name="outfile" label="${title}_htseqsams2mx.xls" />
  </outputs>
  <help>

**Warning**

This code will count reads overlapping contigs supplied in the gff file.


**Note**

htseqsams2mx is an experimental tool currently under test

There is much discussion about whether to count optical/pcr duplicates. If you set the ignore flag to True, any reads in the input BAM files marked as
duplicates by upstream processing such as the Picard MarkDuplicates tool will NOT be counted. The 'right' setting depends on your data and coverage. For extremely deep
coverage, true duplicate reads are inevitable and ignoring them may be throwing away useful real data. In most cases, counting them is probably a reasonable
choice - any induced bias is likely to be non-differential between samples, whereas it's not at all clear whether that's the case if they are ignored.

----

**What this tool does**

Counts reads in multiple sample aligned sam format files using HTSeq counting over a gene model supplied as a GFF file

The output is a tabular file containing the count matrix and suitable for downstream processing.

----

**Attribution**


This Galaxy wrapper was written for a revised version by Ross Lazarus and is licensed under the LGPL_ like other rgenetics artefacts

.. _LGPL: http://www.gnu.org/copyleft/lesser.html

  </help>

</tool>
<?xml version="1.0"?>
<!-- htseq_bams_to_count_matrix/tool_dependencies.xml
     Declares the single package dependency of this repository: version 0.5.4
     of package_htseq_0_5_4, taken from the repository of the same name owned
     by fubar on the tool shed given in the toolshed attribute. -->
<tool_dependency>
    <package name="package_htseq_0_5_4" version="0.5.4">
        <repository
            name="package_htseq_0_5_4"
            owner="fubar"
            toolshed="http://testtoolshed.g2.bx.psu.edu"/>
    </package>
</tool_dependency>