comparison htseqsams2mx.py @ 56:9b59cd40f20d draft

Uploaded
author iuc
date Tue, 28 Apr 2015 22:56:39 -0400
parents
children
comparison
equal deleted inserted replaced
55:bf016b884c68 56:9b59cd40f20d
1 # May 2013
2 # Change to htseq as the counting engine - wrap so arbitrary number of columns created
3 # borged Simon Anders' "count.py" since we need a vector of counts rather than a new sam file as output
4 # note attribution for htseq and count.py :
5 ## Written by Simon Anders (sanders@fs.tum.de), European Molecular Biology
6 ## Laboratory (EMBL). (c) 2010. Released under the terms of the GNU General
7 ## Public License v3. Part of the 'HTSeq' framework, version HTSeq-0.5.4p3
8 # updated ross lazarus august 2011 to NOT include region and to finesse the name as the region for bed3 format inputs
9 # also now sums all duplicate named regions and provides a summary of any collapsing as the info
10 # updated ross lazarus july 26 to respect the is_duplicate flag rather than try to second guess
11 # note Heng Li argues that removing dupes is a bad idea for RNA seq
12 # updated ross lazarus july 22 to count reads OUTSIDE each bed region during the processing of each bam
13 # added better sorting with decoration of a dict key later sorted and undecorated.
14 # code cleaned up and galaxified ross lazarus july 18 et seq.
15 # bams2mx.py -turns a series of bam and a bed file into a matrix of counts Usage bams2mx.py <halfwindow> <bedfile.bed> <bam1.bam>
16 # <bam2.bam>
17 # uses pysam to read and count bam reads over each bed interval for each sample for speed
18 # still not so fast
19 # TODO options -shift -unique
20 #
21 """
22 how this gets run:
23
24 (vgalaxy)galaxy@iaas1-int:~$ cat database/job_working_directory/027/27014/galaxy_27014.sh
25 #!/bin/sh
26 GALAXY_LIB="/data/extended/galaxy/lib"
27 if [ "$GALAXY_LIB" != "None" ]; then
28 if [ -n "$PYTHONPATH" ]; then
29 PYTHONPATH="$GALAXY_LIB:$PYTHONPATH"
30 else
31 PYTHONPATH="$GALAXY_LIB"
32 fi
33 export PYTHONPATH
34 fi
35
36 cd /data/extended/galaxy/database/job_working_directory/027/27014
37 python /data/extended/galaxy/tools/rgenetics/htseqsams2mx.py -g "/data/extended/galaxy/database/files/034/dataset_34115.dat" -o "/data/extended/galaxy/database/files/034/dataset_34124.dat" -m "union" --id_attribute "gene_id" --feature_type "exon" --samf "'/data/extended/galaxy/database/files/033/dataset_33980.dat','T5A_C1PPHACXX_AGTTCC_L003_R1.fastq_bwa.sam'" --samf "'/data/extended/galaxy/database/files/033/dataset_33975.dat','T5A_C1PPHACXX_AGTTCC_L002_R1.fastq_bwa.sam'"; cd /data/extended/galaxy; /data/extended/galaxy/set_metadata.sh ./database/files /data/extended/galaxy/database/job_working_directory/027/27014 . /data/extended/galaxy/universe_wsgi.ini /data/tmp/tmpmwsElH /data/extended/galaxy/database/job_working_directory/027/27014/galaxy.json /data/extended/galaxy/database/job_working_directory/027/27014/metadata_in_HistoryDatasetAssociation_45202_sfOMGa,/data/extended/galaxy/database/job_working_directory/027/27014/metadata_kwds_HistoryDatasetAssociation_45202_gaMnxa,/data/extended/galaxy/database/job_working_directory/027/27014/metadata_out_HistoryDatasetAssociation_45202_kZPsZO,/data/extended/galaxy/database/job_working_directory/027/27014/metadata_results_HistoryDatasetAssociation_45202_bXU7IU,,/data/extended/galaxy/database/job_working_directory/027/27014/metadata_override_HistoryDatasetAssociation_45202_hyLAvh
38 echo $? > /data/extended/galaxy/database/job_working_directory/027/27014/galaxy_27014.ec
39
40 """
41
42 import os
43 import re
44 import sys
45 import HTSeq.scripts.count as htcount
46 import optparse
47 import tempfile
48 import shutil
49 import operator
50 import subprocess
51 import itertools
52 import warnings
53 import traceback
54 import HTSeq
55 import time
56
57
class Xcpt(Exception):
    """Tool-specific error carrying a human-readable message.

    The message is exposed via the ``msg`` attribute (kept for backward
    compatibility with any existing ``except Xcpt as e: ... e.msg`` handlers)
    and is also passed to the ``Exception`` base class so ``str(e)`` and
    tracebacks show it instead of an empty string.
    """
    def __init__(self, msg):
        # The original stored msg only and skipped the base initializer,
        # leaving str(e)/args empty - delegate so the message is visible.
        Exception.__init__(self, msg)
        self.msg = msg
62
def htseqMX(gff_filename,sam_filenames,colnames,sam_exts,sam_bais,opts):
    """
    Code taken from count.py in Simon Anders HTSeq distribution
    Wrapped in a loop to accept multiple bam/sam files and their names from galaxy to
    produce a matrix of contig counts by sample for downstream use in edgeR and DESeq tools

    Parameters:
        gff_filename : path to the GFF/GTF feature file (must exist - asserted below)
        sam_filenames : list of sam/bam file paths, one per sample
        colnames : list of sample (column) names, parallel to sam_filenames
        sam_exts : list of extensions ('bam' or otherwise), parallel to sam_filenames
        sam_bais : list of bai index paths ('' when absent), parallel to sam_filenames
        opts : parsed optparse options (mode, stranded, feature_type, id_attribute,
               mapqMin, filter_extras, quiet)

    Returns the tuple
        (counts, empty, ambiguous, lowqual, notaligned, nonunique, filtered, nreads)
    where counts maps feature_id -> list of per-sample integer counts.
    Exits the process (sys.exit(1)) on any exception raised during counting.
    """
    class UnknownChrom( Exception ):
        # raised when a read's chromosome is absent from the GFF-derived features
        pass

    def my_showwarning( message, category, filename, lineno = None, line = None ):
        # route warnings to stdout so they end up in the Galaxy tool log
        sys.stdout.write( "Warning: %s\n" % message )

    def invert_strand( iv ):
        # return a copy of the genomic interval with '+'/'-' strand flipped;
        # used for --stranded=reverse libraries
        iv2 = iv.copy()
        if iv2.strand == "+":
            iv2.strand = "-"
        elif iv2.strand == "-":
            iv2.strand = "+"
        else:
            # NOTE: Python-2-only raise syntax (as is the print >> below)
            raise ValueError, "Illegal strand"
        return iv2

    def count_reads_in_features( sam_filenames, colnames, gff_filename, opts ):
        """ Hacked version of htseq count.py

        Builds a GenomicArrayOfSets from the GFF once, then streams each
        sam/bam file, assigning every aligned read to a feature according to
        opts.mode and bumping that feature's per-sample counter.
        """
        if opts.quiet:
            warnings.filterwarnings( action="ignore", module="HTSeq" )
        # stranded counting only when --stranded is not "no"
        features = HTSeq.GenomicArrayOfSets( "auto", opts.stranded != "no" )
        mapqMin = int(opts.mapqMin)
        counts = {}
        # per-category accumulators reported back to the caller
        nreads = 0
        empty = 0
        ambiguous = 0
        notaligned = 0
        lowqual = 0
        nonunique = 0
        filtered = 0 # new filter_extras - need a better way to do this - independent filter tool?
        gff = HTSeq.GFF_Reader( gff_filename )
        try:
            # first pass: load every feature of the requested type and
            # pre-seed a zero count per sample for each feature id
            for i,f in enumerate(gff):
                if f.type == opts.feature_type:
                    try:
                        feature_id = f.attr[ opts.id_attribute ]
                    except KeyError:
                        # fall back to gene_id before giving up on the row
                        try:
                            feature_id = f.attr[ 'gene_id' ]
                        except KeyError:
                            sys.exit( "Feature at row %d %s does not contain a '%s' attribute OR a gene_id attribute - faulty GFF?" %
                                      ( (i+1), f.name, opts.id_attribute ) )
                    if opts.stranded != "no" and f.iv.strand == ".":
                        sys.exit( "Feature %s at %s does not have strand information but you are "
                                  "running htseq-count in stranded mode. Use '--stranded=no'." %
                                  ( f.name, f.iv ) )
                    features[ f.iv ] += feature_id
                    counts[ feature_id ] = [0 for x in colnames] # we use sami as an index here to bump counts later
        except:
            sys.stderr.write( "Error occured in %s.\n" % gff.get_line_number_string() )
            raise

        if not opts.quiet:
            # NOTE(review): 'i' is the last enumerate index, so this is
            # lines-1 (and raises NameError on an empty GFF) - confirm intent
            sys.stdout.write( "%d GFF lines processed.\n" % i )

        if len( counts ) == 0 and not opts.quiet:
            sys.stdout.write( "Warning: No features of type '%s' found.\n" % opts.feature_type )
        # second pass: one iteration per input sample file
        for sami,sam_filename in enumerate(sam_filenames):
            colname = colnames[sami]
            isbam = sam_exts[sami] == 'bam'
            hasbai = sam_bais[sami] > ''
            if hasbai:
                # hard-link bam + bai side by side under matching names so the
                # reader finds the index.
                # NOTE(review): these _TEMP links are never removed afterwards,
                # and os.link fails if a previous run left them behind - confirm
                tempname = os.path.splitext(os.path.basename(sam_filename))[0]
                tempbam = '%s_TEMP.bam' % tempname
                tempbai = '%s_TEMP.bai' % tempname
                os.link(sam_filename,tempbam)
                os.link(sam_bais[sami],tempbai)
            try:
                if isbam:
                    if hasbai:
                        read_seq = HTSeq.BAM_Reader ( tempbam )
                    else:
                        read_seq = HTSeq.BAM_Reader( sam_filename )
                else:
                    read_seq = HTSeq.SAM_Reader( sam_filename )
                # peek at the first alignment to decide paired-end mode
                # (.next() is Python-2-only)
                first_read = iter(read_seq).next()
                pe_mode = first_read.paired_end
            except:
                if isbam:
                    print >> sys.stderr, "Error occured when reading first line of bam file %s colname=%s \n" % (sam_filename,colname )
                else:
                    print >> sys.stderr, "Error occured when reading first line of sam file %s colname=%s \n" % (sam_filename,colname )
                raise

            try:
                if pe_mode:
                    # keep the raw reader around for error line reporting
                    read_seq_pe_file = read_seq
                    read_seq = HTSeq.pair_SAM_alignments( read_seq )
                for seqi,r in enumerate(read_seq):
                    nreads += 1
                    if not pe_mode:
                        # ---- single-end filtering ----
                        if not r.aligned:
                            notaligned += 1
                            continue
                        try:
                            if len(opts.filter_extras) > 0:
                                for extra in opts.filter_extras:
                                    if r.optional_field(extra):
                                        filtered += 1
                                        # NOTE(review): this 'continue' only advances the
                                        # inner 'for extra' loop, not the read loop -
                                        # the read is still counted below; looks like a
                                        # bug, confirm against upstream count.py
                                        continue
                            if r.optional_field( "NH" ) > 1:
                                nonunique += 1
                                continue
                        except KeyError:
                            # read lacks the optional tag - treat as passing
                            pass
                        if r.aQual < mapqMin:
                            lowqual += 1
                            continue
                        # build the iterator of aligned ("M") intervals,
                        # strand-inverted for reverse-stranded protocols
                        if opts.stranded != "reverse":
                            iv_seq = ( co.ref_iv for co in r.cigar if co.type == "M" and co.size > 0 )
                        else:
                            iv_seq = ( invert_strand( co.ref_iv ) for co in r.cigar if co.type == "M" and co.size > 0 )
                    else:
                        # ---- paired-end: r is a (mate1, mate2) tuple ----
                        if r[0] is not None and r[0].aligned:
                            if opts.stranded != "reverse":
                                iv_seq = ( co.ref_iv for co in r[0].cigar if co.type == "M" and co.size > 0 )
                            else:
                                iv_seq = ( invert_strand( co.ref_iv ) for co in r[0].cigar if co.type == "M" and co.size > 0 )
                        else:
                            iv_seq = tuple()
                        if r[1] is not None and r[1].aligned:
                            # second mate is on the opposite strand, hence the
                            # inverted invert_strand usage relative to mate 1
                            if opts.stranded != "reverse":
                                iv_seq = itertools.chain( iv_seq,
                                    ( invert_strand( co.ref_iv ) for co in r[1].cigar if co.type == "M" and co.size > 0 ) )
                            else:
                                iv_seq = itertools.chain( iv_seq,
                                    ( co.ref_iv for co in r[1].cigar if co.type == "M" and co.size > 0 ) )
                        else:
                            if ( r[0] is None ) or not ( r[0].aligned ):
                                notaligned += 1
                                continue
                        try:
                            if ( r[0] is not None and r[0].optional_field( "NH" ) > 1 ) or \
                               ( r[1] is not None and r[1].optional_field( "NH" ) > 1 ):
                                nonunique += 1
                                continue
                        except KeyError:
                            pass
                        if ( r[0] and r[0].aQual < mapqMin ) or ( r[1] and r[1].aQual < mapqMin ):
                            lowqual += 1
                            continue

                    try:
                        # resolve the read's interval set to a feature set
                        # according to the requested overlap mode
                        if opts.mode == "union":
                            fs = set()
                            for iv in iv_seq:
                                if iv.chrom not in features.chrom_vectors:
                                    raise UnknownChrom
                                for iv2, fs2 in features[ iv ].steps():
                                    fs = fs.union( fs2 )
                        elif opts.mode == "intersection-strict" or opts.mode == "intersection-nonempty":
                            fs = None
                            for iv in iv_seq:
                                if iv.chrom not in features.chrom_vectors:
                                    raise UnknownChrom
                                for iv2, fs2 in features[ iv ].steps():
                                    if len(fs2) > 0 or opts.mode == "intersection-strict":
                                        if fs is None:
                                            fs = fs2.copy()
                                        else:
                                            fs = fs.intersection( fs2 )
                        else:
                            sys.exit( "Illegal overlap mode %s" % opts.mode )
                        # exactly one candidate feature -> count it for this sample
                        if fs is None or len( fs ) == 0:
                            empty += 1
                        elif len( fs ) > 1:
                            ambiguous += 1
                        else:
                            ck = list(fs)[0]
                            counts[ck][sami] += 1 # end up with counts for each sample as a list
                    except UnknownChrom:
                        if not pe_mode:
                            rr = r
                        else:
                            rr = r[0] if r[0] is not None else r[1]
                        empty += 1
                        if not opts.quiet:
                            # NOTE(review): 'iv' here is the generator loop variable
                            # left over from the try block above - confirm it is
                            # always bound when UnknownChrom is raised
                            sys.stdout.write( ( "Warning: Skipping read '%s', because chromosome " +
                                                "'%s', to which it has been aligned, did not appear in the GFF file.\n" ) %
                                              ( rr.read.name, iv.chrom ) )
            except:
                if not pe_mode:
                    sys.stderr.write( "Error occured in %s.\n" % read_seq.get_line_number_string() )
                else:
                    sys.stderr.write( "Error occured in %s.\n" % read_seq_pe_file.get_line_number_string() )
                raise

            if not opts.quiet:
                sys.stdout.write( "%d sam %s processed for %s.\n" % ( seqi, "lines " if not pe_mode else "line pairs", colname ) )
        return counts,empty,ambiguous,lowqual,notaligned,nonunique,filtered,nreads

    # install the stdout-routing warning hook before counting starts
    warnings.showwarning = my_showwarning
    assert os.path.isfile(gff_filename),'## unable to open supplied gff file %s' % gff_filename
    try:
        counts,empty,ambiguous,lowqual,notaligned,nonunique,filtered,nreads = count_reads_in_features( sam_filenames, colnames, gff_filename,opts)
    except:
        # report the exception type and the file/line it was raised from,
        # then terminate the tool with a failure status
        sys.stderr.write( "Error: %s\n" % str( sys.exc_info()[1] ) )
        sys.stderr.write( "[Exception type: %s, raised in %s:%d]\n" %
                          ( sys.exc_info()[1].__class__.__name__,
                            os.path.basename(traceback.extract_tb( sys.exc_info()[2] )[-1][0]),
                            traceback.extract_tb( sys.exc_info()[2] )[-1][1] ) )
        sys.exit( 1 )
    return counts,empty,ambiguous,lowqual,notaligned,nonunique,filtered,nreads
273
274
def usage():
    """Write the command-line usage summary to stdout and exit with status 1.

    Uses sys.stdout.write instead of the Python-2-only ``print >>`` chevron
    syntax; the emitted text (message plus trailing newline) is unchanged and
    the function now also parses under Python 3.
    """
    sys.stdout.write("""Usage: python htseqsams2mx.py -w <halfwindowsize> -g <gfffile.gff> -o <outfilename> [-i] [-c] --samf "<sam1.sam>,<sam1.column_header>" --samf "...<samN.column_header>" """ + '\n')
    sys.exit(1)
278
if __name__ == "__main__":
    """
    <command interpreter="python">
    htseqsams2mx.py -w "$halfwin" -g "$gfffile" -o "$outfile" -m "union"
    #for $s in $samfiles:
    --samf "'${s.samf}','${s.samf.name}'"
    #end for
    </command>
    """
    # require at least one argument beyond the script name
    if len(sys.argv) < 2:
        usage()
        sys.exit(1)
    starttime = time.time()
    op = optparse.OptionParser()
    # All tools
    # NOTE(review): halfwindow is parsed and int()-converted below but never
    # used in this file - confirm whether it is dead or consumed elsewhere
    op.add_option('-w', '--halfwindow', default="0")
    op.add_option('-m', '--mode', default="union")
    op.add_option('-s', '--stranded', default="no")
    op.add_option('-y', '--feature_type', default="exon")
    op.add_option('-g', '--gff_file', default=None)
    op.add_option('-o', '--outfname', default=None)
    op.add_option('-f','--forceName', default="false")
    # --samf is repeatable: each value is "'<path>','<label>',<ext>,<bai>"
    op.add_option('--samf', default=[], action="append")
    op.add_option('--filter_extras', default=[], action="append")
    op.add_option('--mapqMin', default='0')
    op.add_option( "-t", "--type", type="string", dest="featuretype",
        default = "exon", help = "feature type (3rd column in GFF file) to be used, " +
        "all features of other type are ignored (default, suitable for Ensembl " +
        "GTF files: exon)" )
    op.add_option( "-i", "--id_attribute", type="string", dest="id_attribute",
        default = "gene_name", help = "GTF attribute to be used as feature ID (default, " +
        "suitable for Ensembl GTF files: gene_id)" )
    op.add_option( "-q", "--quiet", action="store_true", dest="quiet", default = False,
        help = "suppress progress report and warnings" )
    opts, args = op.parse_args()
    halfwindow = int(opts.halfwindow)
    gff_file = opts.gff_file
    assert os.path.isfile(gff_file),'##ERROR htseqsams2mx: Supplied input GFF file "%s" not found' % gff_file
    outfname = opts.outfname
    sam_filenames = []
    colnames = []
    samf = opts.samf
    # each --samf value becomes [path, label, ext, bai] after quote stripping
    samfsplit = [x.split(',') for x in samf] # one per samf set
    samsets = []
    for samfs in samfsplit:
        samset = [x.replace("'","") for x in samfs]
        samset = [x.replace('"','') for x in samset]
        samsets.append(samset)
    samsets = [x for x in samsets if x[0].lower() != 'none']
    # just cannot stop getting these on cl! wtf in cheetah for a repeat group?
    samfnames = [x[0] for x in samsets]
    if len(set(samfnames)) != len(samfnames):
        # drop duplicate input files, keeping the first occurrence of each
        samnames = []
        delme = []
        for i,s in enumerate(samfnames):
            if s in samnames:
                delme.append(i)
                # NOTE(review): 'print sys.stdout,...' prints the file object
                # repr followed by the message - presumably meant to be
                # 'print >> sys.stdout,' - confirm
                print sys.stdout,'## WARNING htseqsams2mx: Duplicate input sam file %s in %s - ignoring dupe in 0 based position %s' %\
                    (s,','.join(samfnames), str(delme))
            else:
                samnames.append(s) # first time
        samsets = [x for i,x in enumerate(samsets) if not (i in delme)]
    samfnames = [x[0] for x in samsets]
    scolnames = [x[1]for x in samsets]
    assert len(samfnames) == len(scolnames), '##ERROR sams2mx: Count of sam/cname not consistent - %d/%d' % (len(samfnames),len(scolnames))
    sam_exts = [x[2] for x in samsets]
    assert len(samfnames) == len(sam_exts), '##ERROR sams2mx: Count of extensions not consistent - %d/%d' % (len(samfnames),len(sam_exts))
    sam_bais = [x[3] for x in samsets] # these only exist for bams and need to be finessed with a symlink so pysam will just work
    for i,b in enumerate(samfnames):
        assert os.path.isfile(b),'## Supplied input sam file "%s" not found' % b
        sam_filenames.append(b)
        # sanitize sample labels so they are safe as R column names
        sampName = scolnames[i] # better be unique
        sampName = sampName.replace('#','') # for R
        sampName = sampName.replace('(','') # for R
        sampName = sampName.replace(')','') # for R
        sampName = sampName.replace(' ','_') # for R
        colnames.append(sampName)
    # run the counting engine over all samples at once
    counts,empty,ambiguous,lowqual,notaligned,nonunique,filtered,nreads = htseqMX(gff_file, sam_filenames,colnames,sam_exts,sam_bais,opts)
    # assemble the tab-separated matrix: header row then one row per contig
    heads = '\t'.join(['Contig',] + colnames)
    res = [heads,]
    # Python-2 idiom: keys() returns a list that is sorted in place
    contigs = counts.keys()
    contigs.sort()
    totalc = 0
    emptycontigs = 0
    for contig in contigs:
        thisc = sum(counts[contig])
        if thisc > 0: # no output for empty contigs
            totalc += thisc
            crow = [contig,] + ['%d' % x for x in counts[contig]]
            res.append('\t'.join(crow))
        else:
            emptycontigs += 1
    outf = open(opts.outfname,'w')
    outf.write('\n'.join(res))
    outf.write('\n')
    outf.close()
    walltime = int(time.time() - starttime)
    # summary accounting written to stdout for the Galaxy job info box
    accumulatornames = ('walltime (seconds)','total reads read','total reads counted','number of contigs','total empty reads','total ambiguous reads','total low quality reads',
        'total not aligned reads','total not unique mapping reads','extra filtered reads','empty contigs')
    accums = (walltime,nreads,totalc,len(contigs),empty,ambiguous,lowqual,notaligned,nonunique,filtered,emptycontigs)
    # NOTE(review): these divisions raise ZeroDivisionError when nreads == 0
    # (empty input) or when no contigs were loaded - confirm acceptable
    fracs = (1.0,1.0,float(totalc)/nreads,1.0,float(empty)/nreads,float(ambiguous)/nreads,float(lowqual)/nreads,float(notaligned)/nreads,float(nonunique)/nreads,float(filtered)/nreads,float(emptycontigs)/len(contigs))
    notes = ['%s = %d (%2.3f)' % (accumulatornames[i],x,100.0*fracs[i]) for i,x in enumerate(accums)]
    print >> sys.stdout, '\n'.join(notes)
    sys.exit(0)