Mercurial > repos > fubar > htseq_bams_to_count_matrix
diff htseqsams2mx.py @ 58:dc6df7644fc4 draft
Uploaded
author | iuc |
---|---|
date | Wed, 29 Apr 2015 12:06:59 -0400 |
parents | 9b59cd40f20d |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/htseqsams2mx.py Wed Apr 29 12:06:59 2015 -0400 @@ -0,0 +1,384 @@ +# May 2013 +# Change to htseq as the counting engine - wrap so arbitrary number of columns created +# borged Simon Anders' "count.py" since we need a vector of counts rather than a new sam file as output +# note attribution for htseq and count.py : +## Written by Simon Anders (sanders@fs.tum.de), European Molecular Biology +## Laboratory (EMBL). (c) 2010. Released under the terms of the GNU General +## Public License v3. Part of the 'HTSeq' framework, version HTSeq-0.5.4p3 +# updated ross lazarus august 2011 to NOT include region and to finesse the name as the region for bed3 format inputs +# also now sums all duplicate named regions and provides a summary of any collapsing as the info +# updated ross lazarus july 26 to respect the is_duplicate flag rather than try to second guess +# note Heng Li argues that removing dupes is a bad idea for RNA seq +# updated ross lazarus july 22 to count reads OUTSIDE each bed region during the processing of each bam +# added better sorting with decoration of a dict key later sorted and undecorated. +# code cleaned up and galaxified ross lazarus july 18 et seq. +# bams2mx.py -turns a series of bam and a bed file into a matrix of counts Usage bams2mx.py <halfwindow> <bedfile.bed> <bam1.bam> +# <bam2.bam> +# uses pysam to read and count bam reads over each bed interval for each sample for speed +# still not so fast +# TODO options -shift -unique +# +""" +how this gets run: + +(vgalaxy)galaxy@iaas1-int:~$ cat database/job_working_directory/027/27014/galaxy_27014.sh +#!/bin/sh +GALAXY_LIB="/data/extended/galaxy/lib" +if [ "$GALAXY_LIB" != "None" ]; then + if [ -n "$PYTHONPATH" ]; then + PYTHONPATH="$GALAXY_LIB:$PYTHONPATH" + else + PYTHONPATH="$GALAXY_LIB" + fi + export PYTHONPATH +fi + +cd /data/extended/galaxy/database/job_working_directory/027/27014 +python /data/extended/galaxy/tools/rgenetics/htseqsams2mx.py -g "/data/extended/galaxy/database/files/034/dataset_34115.dat" -o "/data/extended/galaxy/database/files/034/dataset_34124.dat" -m "union" --id_attribute "gene_id" --feature_type "exon" --samf "'/data/extended/galaxy/database/files/033/dataset_33980.dat','T5A_C1PPHACXX_AGTTCC_L003_R1.fastq_bwa.sam'" --samf "'/data/extended/galaxy/database/files/033/dataset_33975.dat','T5A_C1PPHACXX_AGTTCC_L002_R1.fastq_bwa.sam'"; cd /data/extended/galaxy; /data/extended/galaxy/set_metadata.sh ./database/files /data/extended/galaxy/database/job_working_directory/027/27014 . /data/extended/galaxy/universe_wsgi.ini /data/tmp/tmpmwsElH /data/extended/galaxy/database/job_working_directory/027/27014/galaxy.json /data/extended/galaxy/database/job_working_directory/027/27014/metadata_in_HistoryDatasetAssociation_45202_sfOMGa,/data/extended/galaxy/database/job_working_directory/027/27014/metadata_kwds_HistoryDatasetAssociation_45202_gaMnxa,/data/extended/galaxy/database/job_working_directory/027/27014/metadata_out_HistoryDatasetAssociation_45202_kZPsZO,/data/extended/galaxy/database/job_working_directory/027/27014/metadata_results_HistoryDatasetAssociation_45202_bXU7IU,,/data/extended/galaxy/database/job_working_directory/027/27014/metadata_override_HistoryDatasetAssociation_45202_hyLAvh +echo $? > /data/extended/galaxy/database/job_working_directory/027/27014/galaxy_27014.ec + +""" + +import os +import re +import sys +import HTSeq.scripts.count as htcount +import optparse +import tempfile +import shutil +import operator +import subprocess +import itertools +import warnings +import traceback +import HTSeq +import time + + +class Xcpt(Exception): + def __init__(self, msg): + self.msg = msg + + +def htseqMX(gff_filename,sam_filenames,colnames,sam_exts,sam_bais,opts): + """ + Code taken from count.py in Simon Anders HTSeq distribution + Wrapped in a loop to accept multiple bam/sam files and their names from galaxy to + produce a matrix of contig counts by sample for downstream use in edgeR and DESeq tools + """ + class UnknownChrom( Exception ): + pass + + def my_showwarning( message, category, filename, lineno = None, line = None ): + sys.stdout.write( "Warning: %s\n" % message ) + + def invert_strand( iv ): + iv2 = iv.copy() + if iv2.strand == "+": + iv2.strand = "-" + elif iv2.strand == "-": + iv2.strand = "+" + else: + raise ValueError, "Illegal strand" + return iv2 + + def count_reads_in_features( sam_filenames, colnames, gff_filename, opts ): + """ Hacked version of htseq count.py + """ + if opts.quiet: + warnings.filterwarnings( action="ignore", module="HTSeq" ) + features = HTSeq.GenomicArrayOfSets( "auto", opts.stranded != "no" ) + mapqMin = int(opts.mapqMin) + counts = {} + nreads = 0 + empty = 0 + ambiguous = 0 + notaligned = 0 + lowqual = 0 + nonunique = 0 + filtered = 0 # new filter_extras - need a better way to do this - independent filter tool? + gff = HTSeq.GFF_Reader( gff_filename ) + try: + for i,f in enumerate(gff): + if f.type == opts.feature_type: + try: + feature_id = f.attr[ opts.id_attribute ] + except KeyError: + try: + feature_id = f.attr[ 'gene_id' ] + except KeyError: + sys.exit( "Feature at row %d %s does not contain a '%s' attribute OR a gene_id attribute - faulty GFF?" % + ( (i+1), f.name, opts.id_attribute ) ) + if opts.stranded != "no" and f.iv.strand == ".": + sys.exit( "Feature %s at %s does not have strand information but you are " + "running htseq-count in stranded mode. Use '--stranded=no'." % + ( f.name, f.iv ) ) + features[ f.iv ] += feature_id + counts[ feature_id ] = [0 for x in colnames] # we use sami as an index here to bump counts later + except: + sys.stderr.write( "Error occured in %s.\n" % gff.get_line_number_string() ) + raise + + if not opts.quiet: + sys.stdout.write( "%d GFF lines processed.\n" % i ) + + if len( counts ) == 0 and not opts.quiet: + sys.stdout.write( "Warning: No features of type '%s' found.\n" % opts.feature_type ) + for sami,sam_filename in enumerate(sam_filenames): + colname = colnames[sami] + isbam = sam_exts[sami] == 'bam' + hasbai = sam_bais[sami] > '' + if hasbai: + tempname = os.path.splitext(os.path.basename(sam_filename))[0] + tempbam = '%s_TEMP.bam' % tempname + tempbai = '%s_TEMP.bai' % tempname + os.link(sam_filename,tempbam) + os.link(sam_bais[sami],tempbai) + try: + if isbam: + if hasbai: + read_seq = HTSeq.BAM_Reader ( tempbam ) + else: + read_seq = HTSeq.BAM_Reader( sam_filename ) + else: + read_seq = HTSeq.SAM_Reader( sam_filename ) + first_read = iter(read_seq).next() + pe_mode = first_read.paired_end + except: + if isbam: + print >> sys.stderr, "Error occured when reading first line of bam file %s colname=%s \n" % (sam_filename,colname ) + else: + print >> sys.stderr, "Error occured when reading first line of sam file %s colname=%s \n" % (sam_filename,colname ) + raise + + try: + if pe_mode: + read_seq_pe_file = read_seq + read_seq = HTSeq.pair_SAM_alignments( read_seq ) + for seqi,r in enumerate(read_seq): + nreads += 1 + if not pe_mode: + if not r.aligned: + notaligned += 1 + continue + try: + if len(opts.filter_extras) > 0: + for extra in opts.filter_extras: + if r.optional_field(extra): + filtered += 1 + continue + if r.optional_field( "NH" ) > 1: + nonunique += 1 + continue + except KeyError: + pass + if r.aQual < mapqMin: + lowqual += 1 + continue + if opts.stranded != "reverse": + iv_seq = ( co.ref_iv for co in r.cigar if co.type == "M" and co.size > 0 ) + else: + iv_seq = ( invert_strand( co.ref_iv ) for co in r.cigar if co.type == "M" and co.size > 0 ) + else: + if r[0] is not None and r[0].aligned: + if opts.stranded != "reverse": + iv_seq = ( co.ref_iv for co in r[0].cigar if co.type == "M" and co.size > 0 ) + else: + iv_seq = ( invert_strand( co.ref_iv ) for co in r[0].cigar if co.type == "M" and co.size > 0 ) + else: + iv_seq = tuple() + if r[1] is not None and r[1].aligned: + if opts.stranded != "reverse": + iv_seq = itertools.chain( iv_seq, + ( invert_strand( co.ref_iv ) for co in r[1].cigar if co.type == "M" and co.size > 0 ) ) + else: + iv_seq = itertools.chain( iv_seq, + ( co.ref_iv for co in r[1].cigar if co.type == "M" and co.size > 0 ) ) + else: + if ( r[0] is None ) or not ( r[0].aligned ): + notaligned += 1 + continue + try: + if ( r[0] is not None and r[0].optional_field( "NH" ) > 1 ) or \ + ( r[1] is not None and r[1].optional_field( "NH" ) > 1 ): + nonunique += 1 + continue + except KeyError: + pass + if ( r[0] and r[0].aQual < mapqMin ) or ( r[1] and r[1].aQual < mapqMin ): + lowqual += 1 + continue + + try: + if opts.mode == "union": + fs = set() + for iv in iv_seq: + if iv.chrom not in features.chrom_vectors: + raise UnknownChrom + for iv2, fs2 in features[ iv ].steps(): + fs = fs.union( fs2 ) + elif opts.mode == "intersection-strict" or opts.mode == "intersection-nonempty": + fs = None + for iv in iv_seq: + if iv.chrom not in features.chrom_vectors: + raise UnknownChrom + for iv2, fs2 in features[ iv ].steps(): + if len(fs2) > 0 or opts.mode == "intersection-strict": + if fs is None: + fs = fs2.copy() + else: + fs = fs.intersection( fs2 ) + else: + sys.exit( "Illegal overlap mode %s" % opts.mode ) + if fs is None or len( fs ) == 0: + empty += 1 + elif len( fs ) > 1: + ambiguous += 1 + else: + ck = list(fs)[0] + counts[ck][sami] += 1 # end up with counts for each sample as a list + except UnknownChrom: + if not pe_mode: + rr = r + else: + rr = r[0] if r[0] is not None else r[1] + empty += 1 + if not opts.quiet: + sys.stdout.write( ( "Warning: Skipping read '%s', because chromosome " + + "'%s', to which it has been aligned, did not appear in the GFF file.\n" ) % + ( rr.read.name, iv.chrom ) ) + except: + if not pe_mode: + sys.stderr.write( "Error occured in %s.\n" % read_seq.get_line_number_string() ) + else: + sys.stderr.write( "Error occured in %s.\n" % read_seq_pe_file.get_line_number_string() ) + raise + + if not opts.quiet: + sys.stdout.write( "%d sam %s processed for %s.\n" % ( seqi, "lines " if not pe_mode else "line pairs", colname ) ) + return counts,empty,ambiguous,lowqual,notaligned,nonunique,filtered,nreads + + warnings.showwarning = my_showwarning + assert os.path.isfile(gff_filename),'## unable to open supplied gff file %s' % gff_filename + try: + counts,empty,ambiguous,lowqual,notaligned,nonunique,filtered,nreads = count_reads_in_features( sam_filenames, colnames, gff_filename,opts) + except: + sys.stderr.write( "Error: %s\n" % str( sys.exc_info()[1] ) ) + sys.stderr.write( "[Exception type: %s, raised in %s:%d]\n" % + ( sys.exc_info()[1].__class__.__name__, + os.path.basename(traceback.extract_tb( sys.exc_info()[2] )[-1][0]), + traceback.extract_tb( sys.exc_info()[2] )[-1][1] ) ) + sys.exit( 1 ) + return counts,empty,ambiguous,lowqual,notaligned,nonunique,filtered,nreads + + +def usage(): + print >> sys.stdout, """Usage: python htseqsams2mx.py -w <halfwindowsize> -g <gfffile.gff> -o <outfilename> [-i] [-c] --samf "<sam1.sam>,<sam1.column_header>" --samf "...<samN.column_header>" """ + sys.exit(1) + +if __name__ == "__main__": + """ + <command interpreter="python"> + htseqsams2mx.py -w "$halfwin" -g "$gfffile" -o "$outfile" -m "union" + #for $s in $samfiles: + --samf "'${s.samf}','${s.samf.name}'" + #end for + </command> + """ + if len(sys.argv) < 2: + usage() + sys.exit(1) + starttime = time.time() + op = optparse.OptionParser() + # All tools + op.add_option('-w', '--halfwindow', default="0") + op.add_option('-m', '--mode', default="union") + op.add_option('-s', '--stranded', default="no") + op.add_option('-y', '--feature_type', default="exon") + op.add_option('-g', '--gff_file', default=None) + op.add_option('-o', '--outfname', default=None) + op.add_option('-f','--forceName', default="false") + op.add_option('--samf', default=[], action="append") + op.add_option('--filter_extras', default=[], action="append") + op.add_option('--mapqMin', default='0') + op.add_option( "-t", "--type", type="string", dest="featuretype", + default = "exon", help = "feature type (3rd column in GFF file) to be used, " + + "all features of other type are ignored (default, suitable for Ensembl " + + "GTF files: exon)" ) + + op.add_option( "-i", "--id_attribute", type="string", dest="id_attribute", + default = "gene_name", help = "GTF attribute to be used as feature ID (default, " + + "suitable for Ensembl GTF files: gene_id)" ) + + op.add_option( "-q", "--quiet", action="store_true", dest="quiet", default = False, + help = "suppress progress report and warnings" ) + opts, args = op.parse_args() + halfwindow = int(opts.halfwindow) + gff_file = opts.gff_file + assert os.path.isfile(gff_file),'##ERROR htseqsams2mx: Supplied input GFF file "%s" not found' % gff_file + outfname = opts.outfname + sam_filenames = [] + colnames = [] + samf = opts.samf + samfsplit = [x.split(',') for x in samf] # one per samf set + samsets = [] + for samfs in samfsplit: + samset = [x.replace("'","") for x in samfs] + samset = [x.replace('"','') for x in samset] + samsets.append(samset) + samsets = [x for x in samsets if x[0].lower() != 'none'] + # just cannot stop getting these on cl! wtf in cheetah for a repeat group? + samfnames = [x[0] for x in samsets] + if len(set(samfnames)) != len(samfnames): + samnames = [] + delme = [] + for i,s in enumerate(samfnames): + if s in samnames: + delme.append(i) + print sys.stdout,'## WARNING htseqsams2mx: Duplicate input sam file %s in %s - ignoring dupe in 0 based position %s' %\ + (s,','.join(samfnames), str(delme)) + else: + samnames.append(s) # first time + samsets = [x for i,x in enumerate(samsets) if not (i in delme)] + samfnames = [x[0] for x in samsets] + scolnames = [x[1]for x in samsets] + assert len(samfnames) == len(scolnames), '##ERROR sams2mx: Count of sam/cname not consistent - %d/%d' % (len(samfnames),len(scolnames)) + sam_exts = [x[2] for x in samsets] + assert len(samfnames) == len(sam_exts), '##ERROR sams2mx: Count of extensions not consistent - %d/%d' % (len(samfnames),len(sam_exts)) + sam_bais = [x[3] for x in samsets] # these only exist for bams and need to be finessed with a symlink so pysam will just work + for i,b in enumerate(samfnames): + assert os.path.isfile(b),'## Supplied input sam file "%s" not found' % b + sam_filenames.append(b) + sampName = scolnames[i] # better be unique + sampName = sampName.replace('#','') # for R + sampName = sampName.replace('(','') # for R + sampName = sampName.replace(')','') # for R + sampName = sampName.replace(' ','_') # for R + colnames.append(sampName) + counts,empty,ambiguous,lowqual,notaligned,nonunique,filtered,nreads = htseqMX(gff_file, sam_filenames,colnames,sam_exts,sam_bais,opts) + heads = '\t'.join(['Contig',] + colnames) + res = [heads,] + contigs = counts.keys() + contigs.sort() + totalc = 0 + emptycontigs = 0 + for contig in contigs: + thisc = sum(counts[contig]) + if thisc > 0: # no output for empty contigs + totalc += thisc + crow = [contig,] + ['%d' % x for x in counts[contig]] + res.append('\t'.join(crow)) + else: + emptycontigs += 1 + outf = open(opts.outfname,'w') + outf.write('\n'.join(res)) + outf.write('\n') + outf.close() + walltime = int(time.time() - starttime) + accumulatornames = ('walltime (seconds)','total reads read','total reads counted','number of contigs','total empty reads','total ambiguous reads','total low quality reads', + 'total not aligned reads','total not unique mapping reads','extra filtered reads','empty contigs') + accums = (walltime,nreads,totalc,len(contigs),empty,ambiguous,lowqual,notaligned,nonunique,filtered,emptycontigs) + fracs = (1.0,1.0,float(totalc)/nreads,1.0,float(empty)/nreads,float(ambiguous)/nreads,float(lowqual)/nreads,float(notaligned)/nreads,float(nonunique)/nreads,float(filtered)/nreads,float(emptycontigs)/len(contigs)) + notes = ['%s = %d (%2.3f)' % (accumulatornames[i],x,100.0*fracs[i]) for i,x in enumerate(accums)] + print >> sys.stdout, '\n'.join(notes) + sys.exit(0)