comparison vmap.py @ 0:f63d639b223c draft

Initial Upload
author mzeidler
date Wed, 25 Sep 2013 11:41:16 -0400
parents
children be5d3889a1b9
comparison
equal deleted inserted replaced
-1:000000000000 0:f63d639b223c
1 #!/usr/bin/env python
2 #from __future__ import print_function
3
4 import cProfile
5
6 import sys
7 import re
8
9 import tempfile
10 import subprocess
11 import shutil
12 import os
13 import os.path
14 import logging
15 import bz2
16 import zlib
17
18 import math
19 import string
20
21 from collections import defaultdict, Counter
22
23 from subprocess import PIPE
24
# Characters to strip from sample identifiers: every byte value whose
# character is neither alphanumeric nor one of the ID-safe chars '_' / '-'.
NON_ID = ''.join(
    c for c in map(chr, range(256))
    if not c.isalnum() and c not in '_-'
)
27
28 try:
29 from Bio.SeqRecord import SeqRecord
30 from Bio import SeqIO
31 from Bio.Seq import Seq
32 except ImportError:
33 message = 'This script requires the BioPython python package\n'
34 sys.stderr.write(message)
35 sys.exit(1)
36
37 try:
38 from plumbum import cli
39 except ImportError:
40 message = 'This script requires the plumbum python package\n'
41 sys.stderr.write(message)
42 sys.exit(1)
43
44
45 try:
46 import HTSeq
47 except ImportError:
48 message = 'This script requires the HTSeq python package\n'
49 sys.stderr.write(message)
50 sys.exit(1)
51
52 KHMER_AVAILABLE = True
53 try:
54 import khmer
55 except ImportError:
56 KHMER_AVAILABLE = False
57
58 #from io import BufferedRandom
59
60 logging.basicConfig(level=logging.INFO, format='%(message)s')
61
62
def profile_this(fn):
    """Decorator that profiles every call of *fn* with cProfile.

    Each invocation dumps its stats to '<fn name>.profile' in the current
    working directory; the wrapped function's return value is passed
    through unchanged.
    """
    def profiled_fn(*args, **kwargs):
        profiler = cProfile.Profile()
        result = profiler.runcall(fn, *args, **kwargs)
        # Dump only after a successful call, matching the decorated name
        profiler.dump_stats(fn.__name__ + ".profile")
        return result
    return profiled_fn
71
72
class CLI(cli.Application):
    """RNA-Seq and DNA-Seq short read analysis by mapping to known reference sequences"""
    PROGNAME = "vmap"
    VERSION = "1.0.0"
    DESCRIPTION = """Virana vmap is an interface to the NCBI and ensembl reference databases that can
generate reference indexes for the short read mappers STAR (RNA-Seq) and
BWA-MEM (DNA-Seq). Short reads can be mapped to arbitrary combinations of
reference databases and the results can be summarized by taxonomic family
as well as stored as SAM file, unsorted BAM file, or as a HIT file that
models multimapping reads between specific reference databases."""
    USAGE = """The program has four modes that can be accessed by `vmap rnaindex`, `vmap dnaindex`, `vmap rnamap`, and `vmap dnamap.`"""

    def main(self, *args):
        """Root entry point: all real work is done by the subcommands.

        Rejects stray positional arguments and prints usage when no
        subcommand was given; returns 1 on both error paths so the shell
        sees a failure.
        """
        # BUG FIX: `print self.USAGE` was a Python 2 print statement while
        # the rest of the file uses print() calls; normalized to the
        # function form, which behaves identically for a single argument.
        if args:
            print("Unknown command %r" % (args[0]))
            print(self.USAGE)
            return 1

        if not self.nested_command:
            print("No command given")
            print(self.USAGE)
            return 1
96
97
@CLI.subcommand("rnaindex")
class RNAIndex(cli.Application):
    """ Creates a STAR index from a FASTA genome reference """

    reference_files = cli.SwitchAttr(
        ['-r', '--reference_file'], str, list=True, mandatory=True,
        help="Sets the reference genome(s) FASTA file." +
        " Multiple occurrences of this parameter are allowed.")
    index_dir = cli.SwitchAttr(['-i', '--index_dir'], str, mandatory=True,
                               help="Sets the index output directory." +
                               " Directory will be generated if not existing." +
                               " Directory will be filled with several index files.")
    threads = cli.SwitchAttr(
        ['-t', '--threads'], cli.Range(1, 512), mandatory=False,
        help="Sets the number of threads to use",
        default=1)
    max_ram = cli.SwitchAttr(
        ['-m'], cli.Range(1, 400000000000), mandatory=False,
        help="Sets the maximum amount of memory (RAM) to use (in bytes)",
        default=400000000000)
    path = cli.SwitchAttr(['-p', '--path'], str, mandatory=False,
                          help="Path to STAR executable",
                          default='')
    sparse = cli.Flag(
        ["-s", "--sparse"], help="If given, a sparse index that requires less " +
        " RAM in the mapping phase will be constructed")

    debug = cli.Flag(["-d", "--debug"], help="Enable debug output")

    def main(self):
        """Build a STAR genome index from the given reference FASTA files."""

        if self.debug:
            logging.getLogger().setLevel(logging.DEBUG)

        # Obtain star executable (explicit path wins over PATH lookup)
        star = [self.path and self.path or 'STAR']

        # Check that every reference file exists before starting
        for reference_file in self.reference_files:
            if not os.path.exists(reference_file):
                # BUG FIX: message said "nor existing" and went to stdout;
                # errors belong on stderr.
                sys.stderr.write(
                    'Reference file %s not existing, exiting\n' % reference_file)
                sys.exit(1)

        # Create the index output directory if needed
        if not os.path.exists(self.index_dir):
            logging.debug(
                'Making output directory for index at %s' % self.index_dir)
            os.makedirs(self.index_dir)

        # Assemble the STAR command line
        cline = star + ['--runMode', 'genomeGenerate',
                        '--genomeDir', self.index_dir,
                        '--limitGenomeGenerateRAM', str(self.max_ram),
                        '--runThreadN', str(self.threads),
                        '--genomeFastaFiles'] + self.reference_files

        # Add parameters for sparse (memory-saving) index generation
        if self.sparse:
            cline += ['--genomeSAsparseD', '2',
                      '--genomeChrBinNbits', '12',
                      '--genomeSAindexNbases', '13']
        else:
            cline += ['--genomeSAsparseD', '1',
                      '--genomeChrBinNbits', '18',
                      '--genomeSAindexNbases', '15']

        if self.debug:
            print(' '.join(cline))

        # BUG FIX: pass the argument vector directly instead of a
        # space-joined shell string, so paths containing spaces or shell
        # metacharacters stay intact and no shell injection is possible.
        star_process = subprocess.Popen(cline, stdout=PIPE, stderr=PIPE)

        # Block until streams are closed by the process
        stdout, stderr = star_process.communicate()

        if stderr:
            sys.stderr.write(stderr)

        if self.debug and stdout:
            print(stdout)
186
187
@CLI.subcommand("dnaindex")
class DNAIndex(cli.Application):
    """ Creates a BWA index from a FASTA reference file """

    reference_file = cli.SwitchAttr(['-r', '--reference_file'], str, mandatory=True,
                                    help="Sets the input reference FASTA file.")
    index_dir = cli.SwitchAttr(['-i', '--index_dir'], str, mandatory=True,
                               help="Sets the index output directory." +
                               " Directory will be generated if not existing." +
                               " Directory will be filled with several index files.")
    path = cli.SwitchAttr(['-p', '--path'], str, mandatory=False,
                          help="Path to BWA executable",
                          default='')
    debug = cli.Flag(["-d", "--debug"], help="Enable debug output")

    def main(self):
        """Build a BWA (bwtsw) index from the reference FASTA file."""

        # BUG FIX: this check used to sit at class-body level, where it ran
        # at class-definition time against the cli.Flag descriptor itself
        # rather than the parsed flag value; it belongs here, at run time.
        if self.debug:
            logging.getLogger().setLevel(logging.DEBUG)

        # Obtain bwa executable (explicit path wins over PATH lookup)
        bwa = [self.path and self.path or 'bwa']

        # Check that the reference file exists
        if not os.path.exists(self.reference_file):
            # BUG FIX: message said "nor existing" and went to stdout;
            # errors belong on stderr.
            sys.stderr.write('Genome file %s not existing, exiting\n' % self.reference_file)
            sys.exit(1)

        # Create the index output directory if needed
        if not os.path.exists(self.index_dir):
            logging.debug('Making output directory %s' % self.index_dir)
            os.makedirs(self.index_dir)

        # Assemble the bwa command line
        cline = bwa + ['index', '-a', 'bwtsw', '-p', os.path.join(self.index_dir, 'index'), self.reference_file]

        if self.debug:
            print(' '.join(cline))

        # BUG FIX: pass the argument vector directly instead of a shell
        # string (shell=True) so paths with spaces or shell metacharacters
        # are handled safely.
        bwa_process = subprocess.Popen(cline, stdout=PIPE, stderr=PIPE)
        stdout, stderr = bwa_process.communicate()

        if stderr:
            sys.stderr.write(stderr)

        if self.debug and stdout:
            print(stdout)
236
237
238
239
240
class SAMHits:
    """ Converts SAM output of mappers into bzipped HIT files.

    Parsed SAM lines are fed in one at a time via count(); consecutive
    lines belonging to the same (possibly multimapped) read are collected
    into a group and flushed as a single FASTA record whose description
    encodes every reference hit of that read.
    """

    def __init__(self, output_file, sample_id, refseq_filter=None, min_mapping_score=None,
                 min_alignment_score=None, max_mismatches=None,
                 max_relative_mismatches=None, min_continiously_matching=None,
                 filter_complexity=False):
        """Open the bzip2 output stream and store all filter settings.

        output_file               -- path of the bzip2-compressed HIT file
        sample_id                 -- sample tag prefixed to read names
                                     (non [0-9a-zA-Z_-] chars stripped)
        refseq_filter             -- keep only groups hitting one of these
                                     refseq groups (None = keep all)
        min_mapping_score         -- discard hits below this mapping score
        min_alignment_score       -- discard hits below this alignment score
        max_mismatches            -- discard hits with more mismatches
        max_relative_mismatches   -- same, relative to read length
        min_continiously_matching -- minimum longest continuous match run
        filter_complexity         -- drop low-complexity reads (zlib heuristic)
        """
        self.output_file = bz2.BZ2File(output_file, 'wb', buffering=100 * 1024 * 1024)
        # Python 2 str.translate(None, deletechars): strip disallowed chars
        self.sample_id = sample_id.translate(None, NON_ID)
        self.refseq_filter = refseq_filter
        self.max_mismatches = max_mismatches
        self.max_relative_mismatches = max_relative_mismatches
        # [read_name, seq, [hit, ...]] of the read currently being collected
        self.current_group = []
        self.min_mapping_score = min_mapping_score
        self.min_alignment_score = min_alignment_score
        self.min_continiously_matching = min_continiously_matching
        self.filter_complexity = filter_complexity

        self.re_matches = re.compile(r'(\d+)M')
        self.re_dels = re.compile(r'(\d+)D')

    def count(self, parsed_line):
        """Register one parsed SAM line (tuple from SAMParser.parse).

        Unmapped reads, filtered hits, and None (header lines) are ignored.
        """
        if parsed_line is None:
            return

        read_key, read_name, flag, ref_name, ref_position, mapping_score,\
            cigar, mate_ref_name, mate_ref_position, insert_size, seq, qual,\
            is_end1, is_end2, number_mismatches, alignment_score,\
            number_hits, is_reverse, is_primary, is_mapped, is_mate_mapped,\
            is_paired, number_matches, read_end_pos, max_match = parsed_line

        if not is_mapped:
            return

        if self.filter_complexity:
            # Low-complexity reads compress well; drop reads that zlib
            # shrinks below half of their original length.
            avg_compression = float(len(zlib.compress(seq))) / len(seq)
            if avg_compression < 0.5:
                return

        pair_id = ''
        if is_end1:
            pair_id = '/1'
        elif is_end2:
            pair_id = '/2'

        read_name = self.sample_id + ';' + read_name + pair_id

        # Initialize new current group
        if len(self.current_group) == 0:
            self.current_group = [read_name, seq, []]

        # Write old current group to file
        if read_name != self.current_group[0]:
            self._write_group()
            self.current_group = [read_name, seq, []]

        try:
            refseq_group, family, organism, identifier = ref_name.split(';')[:4]
        except ValueError:
            sys.stderr.write('Read mapped to malformed reference sequence %s, skipping\n' % ref_name)
            return

        if self.min_continiously_matching:
            if self.min_continiously_matching > max_match:
                return

        if self.max_mismatches\
                and int(number_mismatches) > self.max_mismatches:
            return

        if self.max_relative_mismatches\
                and int(number_mismatches) / float(len(seq))\
                > self.max_relative_mismatches:
            return

        if self.min_mapping_score\
                and self.min_mapping_score > mapping_score:
            return

        if self.min_alignment_score\
                and self.min_alignment_score > alignment_score:
            return

        # SAM positions are zero-based; HIT files use one-based starts
        start = int(ref_position) + 1

        self.current_group[2].append([refseq_group, family, organism, identifier, str(start), str(read_end_pos)])

    def _write_group(self):
        """Flush the collected hit group as one FASTA record, honoring the
        optional refseq group filter."""
        # BUG FIX: calling write() before any read was registered used to
        # raise IndexError on the empty group; bail out instead.
        if not self.current_group:
            return

        passed = True

        if self.refseq_filter:
            passed = False
            for refseq_group, family, organism, identifier, start, end in self.current_group[2]:
                if passed:
                    break
                for f in self.refseq_filter:
                    if refseq_group == f:
                        passed = True
                        break

        if passed:
            description = []
            for identifier in self.current_group[2]:
                description.append(';'.join(identifier))
            description = '|'.join(description)

            record = SeqRecord(Seq(self.current_group[1]))
            record.id = 'Read;' + self.current_group[0]
            record.description = description

            SeqIO.write([record], self.output_file, "fasta")

    def write(self):
        """Flush the final group and close the output stream."""
        self._write_group()
        self.output_file.close()
381
class SAMParser:
    """ Parses single SAM alignment lines into a flat tuple of fields. """

    def parse(self, line):
        """Parse one SAM line via HTSeq.

        Returns None for header lines ('@...'), otherwise a 25-tuple of
        read/alignment fields consumed by SAMHits, SAMQuality, and
        SAMTaxonomy.
        """
        if line[0] == '@':
            return None

        alignment = HTSeq._HTSeq.SAM_Alignment.from_SAM_line(line)
        read_name = alignment.read.name
        seq = alignment.read.seq
        qual = alignment.read.qual
        flag = alignment.flag
        cigar = None

        # Decode the SAM FLAG bit field
        is_paired = (flag & 1)
        is_mapped = not (flag & 4)
        is_mate_mapped = alignment.mate_aligned is not None  # not (flag & 8)
        is_reverse = (flag & 16)
        is_end1 = (flag & 64)
        is_end2 = (flag & 128)
        is_primary = not (flag & 256)

        # All alignment lines of one read end share this key
        read_key = (read_name, is_end1)

        ref_name = None
        ref_position = None
        mapping_score = 0
        mate_ref_name = None
        mate_ref_position = None
        insert_size = None
        alignment_score = 0
        read_end_pos = None

        if is_mate_mapped and alignment.mate_start:
            mate_ref_name = alignment.mate_start.chrom
            mate_ref_position = alignment.mate_start.start

        number_hits = 0
        alignment_score = 0
        number_mismatches = 0

        number_matches = 0
        max_match = 0

        if is_mapped:
            ref_name = alignment.iv.chrom
            ref_position = alignment.iv.start
            read_end_pos = alignment.iv.end
            alignment_score = alignment.aQual
            cigar = alignment.cigar

            if is_mate_mapped:
                insert_size = alignment.inferred_insert_size

            # Total matched bases and the longest continuous match run
            for c in cigar:
                if c.type == 'M':
                    number_matches += c.size
                    max_match = max(max_match, c.size)

            # BUG FIX: NM and NH were swapped. Per the SAM optional-tag
            # spec, NM is the edit distance (mismatches) and NH is the
            # number of reported alignments (hits).
            for tag, value in alignment.optional_fields:
                if tag == 'NM':
                    number_mismatches = value
                elif tag == 'AS':
                    alignment_score = value
                elif tag == 'NH':
                    number_hits = value

        return read_key, read_name, flag, ref_name, ref_position, mapping_score,\
            cigar, mate_ref_name, mate_ref_position, insert_size, seq, qual,\
            is_end1, is_end2, number_mismatches, alignment_score,\
            number_hits, is_reverse, is_primary, is_mapped, is_mate_mapped,\
            is_paired, number_matches, read_end_pos, max_match
455
456
class SAMQuality:
    """ Accumulates per-read and per-segment quality metrics from parsed
    SAM lines and writes a tab-delimited summary report. """

    def __init__(self, file_path):
        """file_path -- output path of the summary written by write()."""

        self.file_path = file_path

        # (item, metric) -> Counter of observed values
        self.stored = defaultdict(Counter)
        # reference name -> number of alignments (all / primary only)
        self.all_references = defaultdict(int)
        self.primary_references = defaultdict(int)
        # Python 2 byte-string table for reverse complementing
        self.complement = string.maketrans('ATCGN', 'TAGCN')

        if KHMER_AVAILABLE:
            # 10-mer counting table for top-kmer reporting
            self.ktable = khmer.new_ktable(10)

    def _get_complement(self, sequence):
        """Return the reverse complement of a nucleotide sequence."""
        return sequence.translate(self.complement)[::-1]

    def _get_summary(self, counter):
        """Return seven summary statistics (count, sum, min, max, mode,
        mean, std) for a value -> frequency counter."""

        maximum = 0
        minimum = sys.maxint  # Python 2
        thesum = 0
        allcount = 0
        mode = [0, None]

        items = 0.0
        mean = 0.0
        m2 = 0.0
        variance = 0.0

        for item in counter:

            count = counter[item]
            if count > mode[0]:
                mode = [count, item]

            allcount += count
            maximum = max(maximum, item)
            minimum = min(minimum, item)
            thesum += (count * item)

            # Welford's online mean/variance, applied once per occurrence
            x = 1
            while x <= count:
                items += 1
                delta = item - mean
                mean = mean + delta / items
                m2 = m2 + delta * (item - mean)
                variance = m2 / items
                x += 1

        std = math.sqrt(variance)

        return allcount, thesum, minimum, maximum, mode[1], mean, std

    def _to_unit(self, item, is_percentage=False):
        """ Convert a numeric to a string with metric units """

        if is_percentage:
            return ('%-.3f' % (item * 100)) + '%'
        converted = None
        try:
            item = float(item)
            if item > 10**12:
                # BUG FIX: this branch divided by 10**9 but labelled the
                # result 'P'; tera-scale values are scaled by 10**12 ('T').
                converted = str(round(item / 10**12, 3)) + 'T'
            elif item > 10**9:
                converted = str(round(item / 10**9, 3)) + 'G'
            elif item > 10**6:
                converted = str(round(item / 10**6, 3)) + 'M'
            elif item > 10**3:
                converted = str(round(item / 10**3, 3)) + 'K'
            else:
                converted = str(round(item, 3))
        except (TypeError, ValueError):
            # Non-numeric values pass through unchanged (was a bare except)
            converted = str(item)

        return converted

    def _str_metrics(self, data):
        """Render every stored (item, metric) counter into report lines."""

        str_metrics = []

        for (item, metric) in sorted(data.keys()):
            counter = data[(item, metric)]
            # Numeric values expose '.real'; non-numerics are listed raw
            if not hasattr(counter.iterkeys().next(), 'real'):
                for element, count in counter.most_common():
                    str_metrics.append(self._str_metric(item, metric, element, count, no_units=True))
            else:
                summary = self._get_summary(counter)
                str_metrics.append(self._str_metric(item, metric, *summary))

        return str_metrics

    def _str_metric(self, item, metric, count, thesum='', minimum='',
                    maximum='', mode='', mean='', std='', no_units=False):
        """Format one report row; values get metric-unit suffixes unless
        no_units is set."""

        counters = [count, thesum, minimum, maximum, mode, mean, std]
        counters = map(str, counters)

        if no_units:
            items = [item, metric] + counters
        else:
            units = map(self._to_unit, counters)
            items = [item, metric] + units

        return '%-15s\t%-60s\t%12s\t%12s\t%12s\t%12s\t%12s\t%12s\t%12s\n' \
            % tuple(items)

    def _count_read(self, metric, data, sample):
        """Accumulate per-read metrics (one sample per logical read)."""

        item = 'read'

        (insert_size, alignment_score, mapping_score, length,
         q20_length, avg_phred_quality, number_hits, is_reverse) = data

        self.stored[(item, metric + ' mappings')][number_hits] += sample
        self.stored[(item, metric + ' insert')][insert_size] += sample

    def _count_segment(self, metric, data, sample):
        """Accumulate per-segment metrics (one sample per SAM record)."""

        item = 'segment'

        (insert_size, alignment_score, mapping_score, length,
         q20_length, avg_phred_quality, number_hits, is_reverse) = data

        self.stored[(item, metric + ' algq')][alignment_score] += sample
        self.stored[(item, metric + ' mapq')][mapping_score] += sample
        self.stored[(item, metric + ' length')][length] += sample
        self.stored[(item, metric + ' q20length')][q20_length] += sample
        self.stored[(item, metric + ' meanbasequal')][avg_phred_quality] += sample
        self.stored[(item, metric + ' reverse')][is_reverse] += sample

    def count(self, parsed_line):
        """Classify one parsed SAM line (tuple from SAMParser.parse) into
        the paired/unpaired, mapped/unmapped metric hierarchy."""

        if parsed_line is None:
            return

        read_key, read_name, flag, ref_name, ref_position, mapping_score,\
            cigar, mate_ref_name, mate_ref_position, insert_size, seq, qual,\
            is_end1, is_end2, number_mismatches, alignment_score,\
            number_hits, is_reverse, is_primary, is_mapped, is_mate_mapped,\
            is_paired, number_matches, read_end_pos, max_match = parsed_line

        # NOTE(review): assumes qual holds ASCII phred+33 values — if the
        # parser already delivers offset-corrected scores this double-shifts.
        phred_quality = [q - 33 for q in qual]
        avg_phred_quality = sum(phred_quality) / float(len(phred_quality))
        length = len(seq)
        mate_reference_id = mate_ref_name
        reference_id = ref_name
        reference = reference_id is not None and reference_id != '*'
        insert_size = insert_size and abs(insert_size) or insert_size
        is_segment1 = not is_paired or (is_paired and is_end1)
        is_unique = is_primary and number_hits == 1
        is_translocation = is_paired and is_mapped and is_mate_mapped\
            and (mate_reference_id != '=' and reference_id != mate_reference_id)
        is_part_of_doublemap = is_paired and is_mapped and is_mate_mapped
        is_part_of_halfmap = is_paired and (is_mapped != is_mate_mapped)
        is_part_of_nomap = is_paired and not is_mapped and not is_mate_mapped

        # Count length until first low quality base call
        q20_length = 0
        for q in phred_quality:
            if q < 20:
                break
            q20_length += 1

        # Count kmers on the forward strand
        if KHMER_AVAILABLE:
            if not is_reverse:
                self.ktable.consume(seq)
            else:
                self.ktable.consume(self._get_complement(seq))

        if reference:

            self.all_references[reference_id] += 1

            if is_primary:
                self.primary_references[reference_id] += 1

        data = (insert_size, alignment_score, mapping_score, length,
                q20_length, avg_phred_quality, number_hits, is_reverse)

        sample = 1

        self._count_segment('sequenced', data, sample)
        if is_mapped:
            self._count_segment('sequenced mapped multi', data, sample)
            if is_primary:
                self._count_segment('sequenced mapped primary', data, sample)
                if number_hits and is_unique:
                    self._count_segment('sequenced mapped primary unique', data, sample)
            if is_segment1:
                self._count_read('sequenced mapped multi', data, sample)
                if is_primary:
                    self._count_read('sequenced mapped primary', data, sample)

        if is_paired:
            self._count_segment('sequenced paired', data, sample)

            if is_part_of_doublemap:
                self._count_segment('sequenced paired doublemap', data, sample)

                if is_primary:
                    self._count_segment('sequenced paired doublemap primary', data, sample)

                if is_segment1:
                    self._count_read('sequenced paired doublemap multi', data, sample)

                    if is_primary:
                        self._count_read('sequenced paired doublemap primary', data, sample)

                        if number_hits and is_unique:
                            self._count_read('sequenced paired doublemap primary unique', data, sample)

                            if is_translocation:
                                self._count_read('sequenced paired doublemap primary unique translocation', data, sample)

            elif is_part_of_halfmap:

                self._count_segment('sequenced paired halfmap', data, sample)

                # The mapped segment
                if is_mapped:

                    self._count_segment('sequenced paired halfmap mapped', data, sample)

                    if is_primary:

                        self._count_read('sequenced paired halfmap mapped primary', data, sample)

                        if number_hits and is_unique:
                            self._count_read('sequenced paired halfmap mapped primary unique', data, sample)

                    elif not is_primary:
                        # NOTE(review): label says 'unpaired' inside the
                        # paired-halfmap branch — looks inconsistent, but
                        # kept as-is since downstream consumers may rely
                        # on the existing metric name.
                        self._count_read('sequenced unpaired mapped multi', data, sample)

                # The unmapped segment
                elif not is_mapped:
                    self._count_segment('sequenced paired halfmap unmapped', data, sample)

            elif is_part_of_nomap:

                self._count_segment('sequenced paired nomap', data, sample)

                if is_segment1:

                    self._count_read('sequenced paired nomap', data, sample)

        elif not is_paired:

            self._count_segment('sequenced unpaired', data, sample)

            if is_mapped:

                self._count_segment('sequenced unpaired mapped', data, sample)

                if is_primary:

                    self._count_read('sequenced unpaired mapped primary', data, sample)

                    if number_hits and is_unique:
                        # BUG FIX: metric was the self-contradictory
                        # 'sequenced paired unpaired mapped primary unique'
                        self._count_read('sequenced unpaired mapped primary unique', data, sample)

                elif not is_primary:

                    self._count_read('sequenced unpaired mapped multi', data, sample)

            elif not is_mapped:

                self._count_segment('sequenced unpaired unmapped', data, sample)

                if is_segment1:
                    self._count_read('sequenced unpaired unmapped', data, sample)

    def write(self):
        """Fold the top references (and kmers, if khmer is available) into
        the metric store and write the full report to self.file_path."""

        with open(self.file_path, 'w') as output_file:

            all_references = sorted([(count, reference) for reference, count
                                     in self.all_references.iteritems()], reverse=True)

            for j, (count, reference) in enumerate(all_references[:30]):
                self.stored[('segment', 'multireference_' + str(j + 1))][reference] = count

            primary_references = sorted([(count, reference) for reference, count
                                         in self.primary_references.iteritems()], reverse=True)

            for j, (count, reference) in enumerate(primary_references[:30]):
                self.stored[('segment', 'primaryreference_' + str(j + 1))][reference] = count

            # Extract top-ranking kmers
            if KHMER_AVAILABLE:
                kmer_frequencies = []
                for i in range(0, self.ktable.n_entries()):
                    n = self.ktable.get(i)
                    if n > 0:
                        kmer_frequencies.append((n, self.ktable.reverse_hash(i)))
                kmer_frequencies = sorted(kmer_frequencies, reverse=True)
                for j, (frequency, kmer) in enumerate(kmer_frequencies[:10]):
                    self.stored[('segment', 'kmer_' + str(j + 1))][kmer] = frequency

            output_file.writelines(self._str_metrics(self.stored))
773
774
class SAMTaxonomy:
    """ Provides taxonomic summary information from a SAM file stream. """

    def __init__(self, file_path):

        # Output path for the tab-delimited summary produced by write()
        self.file_path = file_path

        # organism name -> number of primary alignments seen
        self.count_primaries = Counter()
        # organism name -> [refseq_group, family, set(identifiers),
        #   [mapping_score_sum, n], [seq_length_sum, n],
        #   [number_hits_sum, n], [alignment_score_sum, n],
        #   [mismatch_sum, n], human_primary_count, human_secondary_count]
        self.detailed_information = {}

        # Bookkeeping for the read currently being processed: SAM lines of
        # one read arrive consecutively, so human-hit counts are buffered
        # until the read key changes and are then credited to every
        # organism that read mapped to.
        self._last_read = (None, None)
        self._last_read_human_prim = 0
        self._last_read_human_sec = 0
        self._last_organisms = set()

    def count(self, parsed_line):
        """Accumulate taxonomy statistics for one parsed SAM line (tuple
        from SAMParser.parse); None (header lines) is ignored."""

        if parsed_line is None:
            return

        read_key, read_name, flag, ref_name, ref_position, mapping_score,\
            cigar, mate_ref_name, mate_ref_position, insert_size, seq, qual,\
            is_end1, is_end2, number_mismatches, alignment_score,\
            number_hits, is_reverse, is_primary, is_mapped, is_mate_mapped,\
            is_paired, number_matches, read_end_pos, max_match = parsed_line

        if is_mapped:

            # Reference names encode 'group;family;organism;identifier;...'
            refseq_group, family, organism, gi = ref_name.split(';')[:4]

            if is_primary:
                self.count_primaries[organism] += 1

            if organism not in self.detailed_information:
                # refseq_group. family, gis, avg_mapping_score,
                # avg_seq_length, avg_number_hits, avg_alignment_score, avg_nr_mismatches
                initial = [refseq_group,
                           family,
                           set([gi]),
                           [int(mapping_score), 1],
                           [len(seq), 1],
                           [0, 0],
                           [alignment_score, 1],
                           [number_mismatches, 1],
                           0,
                           0]

                self.detailed_information[organism] = initial

            else:
                # Update running sums and sample counts for the averages
                entry = self.detailed_information[organism]
                entry[2].add(gi)
                entry[3][0] += int(mapping_score)
                entry[3][1] += 1
                entry[4][0] += len(seq)
                entry[4][1] += 1
                entry[6][0] += alignment_score
                entry[6][1] += 1
                entry[7][0] += number_mismatches
                entry[7][1] += 1

            if is_primary:
                # Hit counts are only sampled from primary alignments
                entry = self.detailed_information[organism]
                entry[5][0] += number_hits
                entry[5][1] += 1

            # First line of the stream: initialize the read tracker
            if self._last_read == (None, None):
                self._last_read = read_key

            if self._last_read != read_key:
                # A new read begins: credit the buffered human-hit counts
                # to every organism the previous read mapped to, then reset
                # the per-read buffers.
                for last_organism in self._last_organisms:

                    self.detailed_information[last_organism][8]\
                        += self._last_read_human_prim

                    self.detailed_information[last_organism][9]\
                        += self._last_read_human_sec

                self._last_read = read_key
                self._last_organisms = set()
                self._last_read_human_prim = 0
                self._last_read_human_sec = 0

            self._last_organisms.add(organism)

            if organism == 'Homo_sapiens':
                if is_primary:
                    self._last_read_human_prim += 1
                else:
                    self._last_read_human_sec += 1

    def get_summary(self, top=100):
        """Return report lines (header plus one row per organism) for the
        *top* organisms ranked by primary alignment count."""

        lines = []

        lines.append('%10s\t%20s\t%20s\t%-20s\t%10s\t%10s\t%10s\t%5s\t%5s\t%5s\t%10s\t%10s\n'\
            % ('Count', 'Group', 'Family', 'Organism', 'Targets', 'ReadLen', 'Hits', 'Map', 'Algn', 'Mism', 'HuP', 'HuS'))

        top_organisms = self.count_primaries.most_common(top)

        for organism, count in top_organisms:

            refseq_group, family, identifiers,\
                avg_mapping_score, avg_seq_length, avg_number_hits,\
                avg_alignment_score, avg_nr_mismatches, human_prim, human_sec\
                = self.detailed_information[organism]

            # Each accumulator is a [sum, sample_count] pair
            avg_len = int(avg_seq_length[0] / float(avg_seq_length[1]))
            # number_hits may never have been sampled (no primary hits)
            if avg_number_hits[1] == 0:
                avg_hits = 0
            else:
                avg_hits = int(avg_number_hits[0] / float(avg_number_hits[1]))

            avg_mapping_score = int(avg_mapping_score[0] / float(avg_mapping_score[1]))

            avg_alignment_score = int(avg_alignment_score[0] / float(avg_alignment_score[1]))

            avg_nr_mismatches = int(avg_nr_mismatches[0] / float(avg_nr_mismatches[1]))

            nr_ids = len(identifiers)

            # Abbreviate large counts with an 'M' (million) suffix
            if count > 10**6:
                count = str(round(count / float(10**6), 3)) + 'M'
            if human_prim > 10**6:
                human_prim = str(round(human_prim / float(10**6), 3)) + 'M'
            if human_sec > 10**6:
                human_sec = str(round(human_sec / float(10**6), 3)) + 'M'
            if nr_ids > 10**6:
                nr_ids = str(round(nr_ids / float(10**6), 3)) + 'M'

            lines.append('%10s\t%20s\t%20s\t%-20s\t%10s\t%10i\t%10i\t%5i\t%5i\t%5i\t%10s\t%10s\n'\
                % (str(count), refseq_group[:20], family[:20], organism[:20],\
                   str(nr_ids), avg_len, avg_hits, avg_mapping_score,\
                   avg_alignment_score, avg_nr_mismatches, str(human_prim),\
                   str(human_sec)))

        return lines

    def write(self):
        """Write the taxonomy summary report to self.file_path."""

        with open(self.file_path, 'w') as output_file:
            output_file.writelines(self.get_summary())
922
923 @CLI.subcommand("rnamap")
924 class RNAmap(cli.Application):
925 """ Map input reads against a STAR index """
926
927 index_dir = cli.SwitchAttr(['-i', '--index_dir'], str, mandatory=True,
928 help="Sets the index output directory")
929
930 threads = cli.SwitchAttr(
931 ['-t', '--threads'], cli.Range(1, 512), mandatory=False,
932 help="Sets the number of threads to use",
933 default=1)
934
935 taxonomy = cli.SwitchAttr(
936 ['-x', '--taxonomy'], str, mandatory=False,
937 help="Output path for the taxonomy file; setting this option will also enable regular taxonomy output to stdout during mapping",
938 default='')
939
940 star_path = cli.SwitchAttr(['--star_path'], str, mandatory=False,
941 help="Path to STAR executable",
942 default='')
943
944 samtools_path = cli.SwitchAttr(['--samtools_path'], str, mandatory=False,
945 help="Path to samtools executable",
946 default='')
947
948 temp_path = cli.SwitchAttr(['--temporary_path'], str, mandatory=False,
949 help="Path to temporary directory in which to generate temp files. All temp files with be automatically deleted after execution is complete.",
950 default='')
951
952 min_mapping_score = cli.SwitchAttr(['--min_mapping_score'], cli.Range(1, 255), mandatory=False,
953 help="Mimimum mapping score for saved hits (only applied to -v/--virana_hits)",
954 default=None)
955
956 min_alignment_score = cli.SwitchAttr(['--min_alignment_score'], cli.Range(1, 255), mandatory=False,
957 help="Mimimum alignment score for saved hits (only applied to -v/--virana_hits)",
958 default=None)
959
960 max_mismatches = cli.SwitchAttr(['--max_mismatches'], cli.Range(0, 10000000), mandatory=False,
961 help="Maximum number of mismatches for saved hits (only applied to -v/--virana_hits)",
962 default=None)
963
964 max_relative_mismatches = cli.SwitchAttr(['--max_relative_mismatches'], float, mandatory=False,
965 help="Maximum number of mismatches relative to read length for saved hits (only applied to -v/--virana_hits)",
966 default=None)
967
968 min_continiously_matching = cli.SwitchAttr(['--min_continiously_matching'], cli.Range(0, 10000000), mandatory=False,
969 help="Minimum number of continious matches for saved hits (only applied to -v/--virana_hits)",
970 default=None)
971
972 filter_complexity = cli.Flag(['--filter_complexity'],
973 help="Discard low-complexity reads (only applied to -v/--virana_hits). Adds some extra processing load to the mapping and may discard important information. Applies to all output files, including quality files (!)",
974 default=False)
975
976 bam = cli.SwitchAttr(['-b', '--bam'], str, mandatory=False,
977 help="Path to unsorted, unindexed output BAM file",
978 default='')
979
980 sam = cli.SwitchAttr(['-s', '--sam'], str, mandatory=False,
981 help="Path to output SAM file",
982 default='')
983
984 qual = cli.SwitchAttr(['-q', '--qual'], str, mandatory=False,
985 help="Path to output quality file",
986 default='')
987
988 hits = cli.SwitchAttr(['-v', '--virana_hits'], str, mandatory=False,
989 help="Path to bzip2-compressed tab-delimited output hit file",
990 default='')
991
992 sample_id = cli.SwitchAttr(['--sample_id'], str, mandatory=False,
993 help="Alphanumeric string ([0-9a-zA-Z_-]*) used to designate sample information within the hit file",
994 default='no_sample_id')
995
996 unmapped1 = cli.SwitchAttr(['--unmapped_end_1'], str, mandatory=False,
997 help="Output path to uncompressed fastq file containing unmapped reads, first ends only for paired ends.",
998 default='')
999
1000 unmapped2 = cli.SwitchAttr(['--unmapped_end_2'], str, mandatory=False,
1001 help="Output path to uncompressed fastq file containing unmapped reads, second ends only for paired ends.",
1002 default='')
1003
1004 splice_junctions = cli.SwitchAttr(['--splice_junctions'], str, mandatory=False,
1005 help="Input path to splice junction file (currently not implemented)",
1006 default='')
1007
1008 chimeric_mappings = cli.SwitchAttr(['--chimeric_mappings'], str, mandatory=False,
1009 help="Ouput path to SAM file containing chimeric mappings",
1010 default='')
1011
1012 hit_filter = cli.SwitchAttr(
1013 ['-f', '--virana_hit_filter'], str, list=True, mandatory=False,
1014 help="Only generate hit groups that include at last one read mapping to a reference of this reference group.",
1015 default=[])
1016
1017 debug = cli.Flag(["-d", "--debug"], help="Enable debug information")
1018
1019 reads = cli.SwitchAttr(
1020 ['-r', '--reads'], str, list=True, mandatory=True,
1021 help="Sets the input reads. Add this parameter twice for paired end reads.")
1022
1023 zipped = cli.Flag(["-z", "--zipped"], help="Input reads are zipped")
1024
1025 sensitive = cli.Flag(
1026 ["--sensitive"], help="If given, mapping will process slower and more sensitive")
1027
1028 def main(self):
1029
1030 if self.debug:
1031 logging.getLogger().setLevel(logging.DEBUG)
1032
1033 # Obtain star executable
1034 star = [self.star_path and self.star_path or 'STAR']
1035 samtools = [self.samtools_path and self.samtools_path or 'samtools']
1036
1037 # Check if genome directory is existing
1038 if not os.path.exists(self.index_dir):
1039 sys.stdout.write('Index directory %s not existing, exiting' % self.genome_dir)
1040 sys.exit(1)
1041
1042 if self.temp_path:
1043 temp_path = tempfile.mkdtemp(dir=self.temp_path)
1044 else:
1045 temp_path = tempfile.mkdtemp()
1046
1047 first_ends = []
1048 second_ends = []
1049 single_ends = []
1050
1051 if len(self.reads) == 2:
1052 first, second = self.reads
1053 first_ends.append(first)
1054 second_ends.append(second)
1055
1056 elif len(self.reads) == 1:
1057 single_ends.append(self.reads[0])
1058
1059 else:
1060 sys.stdout.write('Invalid number of fastq files; provide either one (single end) or two (paired end)')
1061 sys.exit(1)
1062
1063 if single_ends and not first_ends and not second_ends:
1064 reads = [','.join(single_ends)]
1065
1066 elif first_ends and second_ends:
1067 reads = [','.join(first_ends), ','.join(second_ends)]
1068
1069
1070 star_cline = star + ['--runMode', 'alignReads',
1071 '--genomeDir', self.index_dir,
1072 '--runThreadN', str(self.threads),
1073 '--readMatesLengthsIn', 'NotEqual',
1074 '--outFileNamePrefix', os.path.join(
1075 temp_path, 'out'),
1076 '--outSAMmode', 'Full',
1077 '--outSAMstrandField', 'None',
1078 '--outSAMattributes', 'Standard',
1079 '--outSAMunmapped', 'Within',
1080 '--outStd', 'SAM',
1081 '--outFilterMultimapNmax', '1000',
1082 '--outSAMprimaryFlag', 'AllBestScore',
1083 '--outSAMorder', 'PairedKeepInputOrder']
1084
1085 if self.unmapped1 or self.unmapped2:
1086 star_cline += ['--outReadsUnmapped', 'Fastx']
1087 else:
1088 star_cline += ['--outReadsUnmapped', 'None']
1089
1090
1091 if self.zipped:
1092 star_cline += ['--readFilesCommand', 'zcat']
1093
1094 if self.sensitive:
1095 star_cline += ['--outFilterMultimapScoreRange', '10',
1096 '--outFilterMismatchNmax', '60',
1097 '--outFilterMismatchNoverLmax', '0.3',
1098 '--outFilterScoreMin', '0',
1099 '--outFilterScoreMinOverLread', '0.3',
1100 '--outFilterMatchNmin', '0',
1101 '--outFilterMatchNminOverLread', '0.66',
1102 '--seedSearchStartLmax', '12',
1103 '--winAnchorMultimapNmax', '50']
1104
1105 star_cline += ['--readFilesIn'] + reads
1106
1107 if self.debug:
1108 print ' '.join(star_cline)
1109
1110 # Try if we can make the relevant files
1111 touch_files = [self.unmapped1, self.unmapped2, self.taxonomy, self.qual, self.hits, self.sam, self.bam]
1112 for file_path in touch_files:
1113 if file_path is None or file_path == '':
1114 continue
1115 try:
1116 with file(file_path, 'a'):
1117 os.utime(file_path, None)
1118 except IOError:
1119 sys.stderr.write('Could not write output file %s\n' % file_path)
1120 sys.exit(1)
1121
1122 star_process = subprocess.Popen(' '.join(
1123 star_cline), shell=True, stdout=PIPE)
1124
1125 parser = SAMParser()
1126
1127 if self.taxonomy:
1128 taxonomy = SAMTaxonomy(self.taxonomy)
1129
1130 if self.qual:
1131 quality = SAMQuality(self.qual)
1132
1133 if self.hits:
1134 hits = SAMHits(self.hits, self.sample_id, self.hit_filter,
1135 self.min_mapping_score,
1136 self.min_alignment_score,
1137 self.max_mismatches,
1138 self.max_relative_mismatches,
1139 self.min_continiously_matching,
1140 self.filter_complexity)
1141
1142 if self.sam:
1143 sam_file = open(self.sam, 'w')
1144
1145 if self.bam:
1146 with open(self.bam, 'wb', buffering=100 * 1024 * 1024) as bam_file:
1147 samtools_cline = samtools + [
1148 'view', '-b', '-1', '-S', '-@', '4', '/dev/stdin']
1149 if self.debug:
1150 print ' '.join(samtools_cline)
1151 samtools_process = subprocess.Popen(' '.join(samtools_cline), shell=True, stdout=bam_file, stdin=PIPE)
1152
1153
1154 do_sam = self.sam
1155 do_bam = self.bam
1156 do_taxonomy = self.taxonomy
1157 do_qual = self.qual
1158 do_hits = self.hits
1159 do_parse = do_taxonomy or do_qual or do_hits
1160
1161 for i, line in enumerate(iter(star_process.stdout.readline, '')):
1162
1163 if do_sam:
1164 sam_file.write(line)
1165
1166 if do_bam:
1167 samtools_process.stdin.write(line)
1168
1169 if line[0] == '@':
1170 continue
1171
1172 if do_parse:
1173 parsed_line = parser.parse(line)
1174
1175 if do_taxonomy:
1176 taxonomy.count(parsed_line)
1177 if i > 0 and (i % 50000) == 0:
1178 print ''.join(taxonomy.get_summary(10))
1179
1180 if do_qual:
1181 quality.count(parsed_line)
1182
1183 if do_hits:
1184 hits.count(parsed_line)
1185
1186 if do_bam:
1187 samtools_process.stdin.close()
1188
1189 if do_sam:
1190 sam_file.close()
1191
1192 if do_hits:
1193 hits.write()
1194
1195 if do_taxonomy:
1196 print ''.join(taxonomy.get_summary(10))
1197 taxonomy.write()
1198
1199 if do_qual:
1200 quality.write()
1201
1202 try:
1203 if self.unmapped1:
1204 shutil.move(os.path.join(temp_path, 'out' + 'Unmapped.out.mate1'),\
1205 self.unmapped1)
1206 except IOError:
1207 pass
1208
1209 try:
1210 if self.unmapped2:
1211 shutil.move(os.path.join(temp_path, 'out' + 'Unmapped.out.mate2'),\
1212 self.unmapped2)
1213 except IOError:
1214 pass
1215
1216 try:
1217 if self.chimeric_mappings:
1218 shutil.move(os.path.join(temp_path, 'out' + 'Chimeric.out.sam'),\
1219 self.chimeric_mappings)
1220 except IOError:
1221 pass
1222
1223 shutil.rmtree(temp_path)
1224
1225 @CLI.subcommand("dnamap")
1226 class DNAmap(cli.Application):
1227 """ Map input reads against a BWA index """
1228
1229 index_dir = cli.SwitchAttr(['-i', '--index_dir'], str, mandatory=True,
1230 help="Sets the index output directory")
1231
1232 threads = cli.SwitchAttr(
1233 ['-t', '--threads'], cli.Range(1, 512), mandatory=False,
1234 help="Sets the number of threads to use",
1235 default=1)
1236
1237 taxonomy = cli.SwitchAttr(
1238 ['-x', '--taxonomy'], str, mandatory=False,
1239 help="Output path for the taxonomy file; setting this option will also enable regular taxonomy output to stdout during mapping",
1240 default='')
1241
1242 samtools_path = cli.SwitchAttr(['--samtools_path'], str, mandatory=False,
1243 help="Path to samtools executable",
1244 default='')
1245
1246 temp_path = cli.SwitchAttr(['--temporary_path'], str, mandatory=False,
1247 help="Path to temporary directory in which to generate temp files. All temp files with be automatically deleted after execution is complete.",
1248 default='')
1249
1250 min_mapping_score = cli.SwitchAttr(['--min_mapping_score'], cli.Range(1, 255), mandatory=False,
1251 help="Mimimum mapping score for saved hits (only applied to -v/--virana_hits)",
1252 default=None)
1253
1254 min_alignment_score = cli.SwitchAttr(['--min_alignment_score'], cli.Range(1, 255), mandatory=False,
1255 help="Mimimum alignment score for saved hits (only applied to -v/--virana_hits)",
1256 default=None)
1257
1258 max_mismatches = cli.SwitchAttr(['--max_mismatches'], cli.Range(0, 10000000), mandatory=False,
1259 help="Maximum number of mismatches for saved hits (only applied to -v/--virana_hits)",
1260 default=None)
1261
1262 max_relative_mismatches = cli.SwitchAttr(['--max_relative_mismatches'], float, mandatory=False,
1263 help="Maximum number of mismatches relative to read length for saved hits (only applied to -v/--virana_hits)",
1264 default=None)
1265
1266 min_continiously_matching = cli.SwitchAttr(['--min_continiously_matching'], cli.Range(0, 10000000), mandatory=False,
1267 help="Minimum number of continious matches for saved hits (only applied to -v/--virana_hits)",
1268 default=None)
1269
1270 filter_complexity = cli.Flag(['--filter_complexity'],
1271 help="Discard low-complexity reads (only applied to -v/--virana_hits). Adds some extra processing load to the mapping and may discard important information. Applies to all output files, including quality files (!)",
1272 default=False)
1273
1274 sample_id = cli.SwitchAttr(['--sample_id'], str, mandatory=False,
1275 help="Alphanumeric string ([0-9a-zA-Z_-]*) used to designate sample information within the hit file",
1276 default='no_sample_id')
1277
1278 bam = cli.SwitchAttr(['-b', '--bam'], str, mandatory=False,
1279 help="Path to unsorted, unindexed output BAM file",
1280 default='')
1281
1282 sam = cli.SwitchAttr(['-s', '--sam'], str, mandatory=False,
1283 help="Path to output SAM file",
1284 default='')
1285
1286 qual = cli.SwitchAttr(['-q', '--qual'], str, mandatory=False,
1287 help="Path to output quality file",
1288 default='')
1289
1290 hits = cli.SwitchAttr(['-v', '--virana_hits'], str, mandatory=False,
1291 help="Path to bzip2-compressed tab-delimited output virana hit file",
1292 default='')
1293
1294 hit_filter = cli.SwitchAttr(
1295 ['-f', '--virana_hit_filter'], str, list=True, mandatory=False,
1296 help="Only generate hit groups that include at last one read mapping to a reference of this reference group.",
1297 default=[])
1298
1299 debug = cli.Flag(["-d", "--debug"], help="Enable debug information")
1300
1301 zipped = cli.Flag(["-z", "--zipped"], help="Input reads are zipped")
1302
1303 sensitive = cli.Flag(
1304 ["--sensitive"], help="If given, mapping will process slower and more sensitive")
1305
1306
1307 bwa_path = cli.SwitchAttr(['--bwa_path'], str, mandatory=False,
1308 help="Path to BWA executable",
1309 default='')
1310
1311 debug = cli.Flag(["-d", "--debug"], help="Enable debug information")
1312
1313 if debug:
1314 logging.getLogger().setLevel(logging.DEBUG)
1315
1316 reads = cli.SwitchAttr(
1317 ['-r', '--reads'], str, list=True, mandatory=True,
1318 help="Sets the input reads. Add this parameter twice for paired end reads.")
1319
1320 def main(self):
1321
1322 if self.debug:
1323 logging.getLogger().setLevel(logging.DEBUG)
1324
1325 # Obtain star executable
1326 bwa = [self.bwa_path and self.bwa_path or 'bwa']
1327 samtools = [self.samtools_path and self.samtools_path or 'samtools']
1328
1329 # Check if genome directory is existing
1330 if not os.path.exists(self.index_dir):
1331 sys.stdout.write('Index directory %s not existing, exiting'\
1332 % self.genome_dir)
1333 sys.exit(1)
1334
1335 if len(self.reads) not in (1, 2):
1336 message = 'Invalid number of FASTQ files; supply either one (single end) or two (paired end)\n'
1337 sys.stderr.write(message)
1338 sys.exit(1)
1339
1340 bwa_cline = bwa + ['mem', '-t', str(self.threads), '-M', os.path.join(self.index_dir, 'index')]
1341
1342 bwa_cline += self.reads
1343
1344 if self.debug:
1345 print ' '.join(bwa_cline)
1346
1347 bwa_process = subprocess.Popen(' '.join(bwa_cline), shell=True, stdout=PIPE)
1348
1349 parser = SAMParser()
1350
1351 if self.taxonomy:
1352 taxonomy = SAMTaxonomy(self.taxonomy)
1353
1354 if self.qual:
1355 quality = SAMQuality(self.qual)
1356
1357 if self.hits:
1358 hits = SAMHits(self.hits, self.sample_id, self.hit_filter,
1359 self.min_mapping_score,
1360 self.min_alignment_score,
1361 self.max_mismatches,
1362 self.max_relative_mismatches,
1363 self.min_continiously_matching,
1364 self.filter_complexity)
1365
1366 if self.sam:
1367 sam_file = open(self.sam, 'w', buffering=100 * 1024 * 1024)
1368
1369 if self.bam:
1370 with open(self.bam, 'wb', buffering=100 * 1024 * 1024) as bam_file:
1371 samtools_cline = samtools + [
1372 'view', '-b', '-1', '-S', '-@', '4', '/dev/stdin']
1373 if self.debug:
1374 print ' '.join(samtools_cline)
1375 samtools_process = subprocess.Popen(' '.join(samtools_cline), shell=True, stdout=bam_file, stdin=PIPE)
1376
1377 do_sam = self.sam
1378 do_bam = self.bam
1379 do_taxonomy = self.taxonomy
1380 do_qual = self.qual
1381 do_hits = self.hits
1382 do_parse = do_taxonomy or do_qual or do_hits
1383
1384 for i, line in enumerate(iter(bwa_process.stdout.readline, '')):
1385
1386 if do_sam:
1387 sam_file.write(line)
1388
1389 if do_bam:
1390 samtools_process.stdin.write(line)
1391
1392 if do_parse:
1393 parsed_line = parser.parse(line)
1394
1395 if do_taxonomy:
1396 taxonomy.count(parsed_line)
1397
1398 if i > 0 and (i % 10000) == 0:
1399 print ''.join(taxonomy.get_summary(10))
1400
1401 if do_qual:
1402 quality.count(parsed_line)
1403
1404 if do_hits:
1405 hits.count(parsed_line)
1406
1407 if do_bam:
1408 samtools_process.stdin.close()
1409
1410 if do_sam:
1411 sam_file.close()
1412
1413 if do_hits:
1414 hits.write()
1415
1416 if do_taxonomy:
1417 print ''.join(taxonomy.get_summary(10))
1418 taxonomy.write()
1419
1420 if do_qual:
1421 quality.write()
1422
1423
if __name__ == "__main__":
    # Entry point: dispatch to the selected vmap subcommand.
    CLI.run()