Mercurial > repos > mzeidler > virana_main
comparison vmap.py @ 0:f63d639b223c draft
Initial Upload
author | mzeidler |
---|---|
date | Wed, 25 Sep 2013 11:41:16 -0400 |
parents | |
children | be5d3889a1b9 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:f63d639b223c |
---|---|
1 #!/usr/bin/env python | |
2 #from __future__ import print_function | |
3 | |
4 import cProfile | |
5 | |
6 import sys | |
7 import re | |
8 | |
9 import tempfile | |
10 import subprocess | |
11 import shutil | |
12 import os | |
13 import os.path | |
14 import logging | |
15 import bz2 | |
16 import zlib | |
17 | |
18 import math | |
19 import string | |
20 | |
21 from collections import defaultdict, Counter | |
22 | |
23 from subprocess import PIPE | |
24 | |
25 NON_ID = ''.join(c for c in map(chr, range(256)) if not c.isalnum()) | |
26 NON_ID = NON_ID.replace('_', '').replace('-', '') | |
27 | |
28 try: | |
29 from Bio.SeqRecord import SeqRecord | |
30 from Bio import SeqIO | |
31 from Bio.Seq import Seq | |
32 except ImportError: | |
33 message = 'This script requires the BioPython python package\n' | |
34 sys.stderr.write(message) | |
35 sys.exit(1) | |
36 | |
37 try: | |
38 from plumbum import cli | |
39 except ImportError: | |
40 message = 'This script requires the plumbum python package\n' | |
41 sys.stderr.write(message) | |
42 sys.exit(1) | |
43 | |
44 | |
45 try: | |
46 import HTSeq | |
47 except ImportError: | |
48 message = 'This script requires the HTSeq python package\n' | |
49 sys.stderr.write(message) | |
50 sys.exit(1) | |
51 | |
52 KHMER_AVAILABLE = True | |
53 try: | |
54 import khmer | |
55 except ImportError: | |
56 KHMER_AVAILABLE = False | |
57 | |
58 #from io import BufferedRandom | |
59 | |
60 logging.basicConfig(level=logging.INFO, format='%(message)s') | |
61 | |
62 | |
def profile_this(fn):
    """Decorator: profile every call of *fn* with cProfile and dump the
    collected statistics to a file named '<function name>.profile' in the
    current working directory. The wrapped function's return value is
    passed through unchanged."""
    def wrapper(*args, **kwargs):
        profiler = cProfile.Profile()
        result = profiler.runcall(fn, *args, **kwargs)
        profiler.dump_stats(fn.__name__ + ".profile")
        return result
    return wrapper
71 | |
72 | |
class CLI(cli.Application):
    """RNA-Seq and DNA-Seq short read analysis by mapping to known reference sequences"""
    PROGNAME = "vmap"
    VERSION = "1.0.0"
    DESCRIPTION = """Virana vmap is an interface to the NCBI and ensembl reference databases that can
                  generate reference indexes for the short read mappers STAR (RNA-Seq) and
                  BWA-MEM (DNA-Seq). Short reads can be mapped to arbitrary combinations of
                  reference databases and the results can be summarized by taxonomic family
                  as well as stored as SAM file, unsorted BAM file, or as a HIT file that
                  models multimapping reads between specific reference databases."""
    USAGE = """The program has four modes that can be accessed by `vmap rnaindex`, `vmap dnaindex`, `vmap rnamap`, and `vmap dnamap.`"""

    def main(self, *args):
        """Top-level dispatcher: rejects unknown commands and requires a
        nested subcommand; returns 1 on usage errors.

        FIX: the usage messages were printed with Python-2-only print
        statements (`print self.USAGE`), inconsistent with the parenthesized
        print calls used in the same method; normalized to call form.
        """
        if args:
            # First positional token did not match any registered subcommand.
            print("Unknown command %r" % (args[0]))
            print(self.USAGE)
            return 1

        if not self.nested_command:
            # Invoked without any subcommand at all.
            print("No command given")
            print(self.USAGE)
            return 1
96 | |
97 | |
@CLI.subcommand("rnaindex")
class RNAIndex(cli.Application):
    """ Creates a STAR index from a FASTA genome reference """

    reference_files = cli.SwitchAttr(
        ['-r', '--reference_file'], str, list=True, mandatory=True,
        help="Sets the reference genome(s) FASTA file." +
        " Multiple occurrences of this parameter are allowed.")
    index_dir = cli.SwitchAttr(['-i', '--index_dir'], str, mandatory=True,
                               help="Sets the index output directory." +
                               " Directory will be generated if not existing." +
                               " Directory will be filled with several index files.")
    threads = cli.SwitchAttr(
        ['-t', '--threads'], cli.Range(1, 512), mandatory=False,
        help="Sets the number of threads to use",
        default=1)
    max_ram = cli.SwitchAttr(
        ['-m'], cli.Range(1, 400000000000), mandatory=False,
        help="Sets the maximum amount of memory (RAM) to use (in bytes)",
        default=400000000000)
    path = cli.SwitchAttr(['-p', '--path'], str, mandatory=False,
                          help="Path to STAR executable",
                          default='')
    sparse = cli.Flag(
        ["-s", "--sparse"], help="If given, a sparse index that requires less " +
        " RAM in the mapping phase will be constructed")

    debug = cli.Flag(["-d", "--debug"], help="Enable debug output")

    def main(self):
        """Builds the STAR genome index from the configured reference files.

        Fixes relative to the previous version:
          - error message typo ('nor existing' -> 'not existing')
          - Python-2-only print statements replaced by print() calls
          - STAR is invoked with an argument list instead of a shell-joined
            string, so paths containing spaces or shell metacharacters are
            passed through safely.
        """
        if self.debug:
            logging.getLogger().setLevel(logging.DEBUG)

        # Obtain STAR executable; an explicit --path wins over PATH lookup.
        star = [self.path or 'STAR']

        # Validate that every reference FASTA exists before doing any work.
        for reference_file in self.reference_files:
            if not os.path.exists(reference_file):
                sys.stdout.write(
                    'Reference file %s not existing, exiting' % reference_file)
                sys.exit(1)

        # Create the index output directory if needed.
        if not os.path.exists(self.index_dir):
            logging.debug(
                'Making output directory for index at %s' % self.index_dir)
            os.makedirs(self.index_dir)

        # Assemble the STAR command line.
        cline = star + ['--runMode', 'genomeGenerate',
                        '--genomeDir', self.index_dir,
                        '--limitGenomeGenerateRAM', str(self.max_ram),
                        '--runThreadN', str(self.threads),
                        '--genomeFastaFiles'] + self.reference_files

        # Add parameters for sparse (memory-saving) index generation.
        if self.sparse:
            cline += ['--genomeSAsparseD', '2',
                      '--genomeChrBinNbits', '12',
                      '--genomeSAindexNbases', '13']
        else:
            cline += ['--genomeSAsparseD', '1',
                      '--genomeChrBinNbits', '18',
                      '--genomeSAindexNbases', '15']

        if self.debug:
            print(' '.join(cline))

        # Run STAR reference generation process (argument list, no shell).
        star_process = subprocess.Popen(cline, stdout=PIPE, stderr=PIPE)

        # Block until streams are closed by the process.
        stdout, stderr = star_process.communicate()

        if stderr:
            sys.stderr.write(stderr)

        if self.debug and stdout:
            print(stdout)
186 | |
187 | |
@CLI.subcommand("dnaindex")
class DNAIndex(cli.Application):
    """ Creates a BWA index from a FASTA reference file """

    reference_file = cli.SwitchAttr(['-r', '--reference_file'], str, mandatory=True,
                                    help="Sets the input reference FASTA file.")
    index_dir = cli.SwitchAttr(['-i', '--index_dir'], str, mandatory=True,
                               help="Sets the index output directory." +
                               " Directory will be generated if not existing." +
                               " Directory will be filled with several index files.")
    path = cli.SwitchAttr(['-p', '--path'], str, mandatory=False,
                          help="Path to BWA executable",
                          default='')
    debug = cli.Flag(["-d", "--debug"], help="Enable debug output")

    def main(self):
        """Builds a BWA (bwtsw) index from the reference FASTA.

        BUGFIX: the debug-level check used to live in the class body, where
        it executed once at import time against the cli.Flag descriptor
        instead of the parsed flag value; it now runs per invocation. Also
        fixes the 'nor existing' message typo, Python-2 print statements,
        and invokes BWA with an argument list instead of a shell string so
        paths with spaces are handled safely.
        """
        if self.debug:
            logging.getLogger().setLevel(logging.DEBUG)

        # Obtain BWA executable; an explicit --path wins over PATH lookup.
        bwa = [self.path or 'bwa']

        # Validate the input reference file.
        if not os.path.exists(self.reference_file):
            sys.stdout.write('Genome file %s not existing, exiting' % self.reference_file)
            sys.exit(1)

        # Create the index output directory if needed.
        if not os.path.exists(self.index_dir):
            logging.debug('Making output directory %s' % self.index_dir)
            os.makedirs(self.index_dir)

        # Assemble the BWA command line.
        cline = bwa + ['index', '-a', 'bwtsw', '-p', os.path.join(self.index_dir, 'index'), self.reference_file]

        if self.debug:
            print(' '.join(cline))

        # Run BWA index generation process (argument list, no shell).
        bwa_process = subprocess.Popen(cline, stdout=PIPE, stderr=PIPE)
        stdout, stderr = bwa_process.communicate()

        if stderr:
            sys.stderr.write(stderr)

        if self.debug and stdout:
            print(stdout)
236 | |
237 | |
238 | |
239 | |
240 | |
class SAMHits:
    """ Converts SAM output of mappers into bzipped HIT files.

    A HIT file here is a bz2-compressed FASTA stream: one record per read,
    where the record description lists every reference location the read
    mapped to as pipe-separated
    'refseq_group;family;organism;identifier;start;end' entries. Records are
    flushed as soon as the SAM stream moves on to the next read name, so the
    input is expected to be grouped by read name (mapper output order).

    NOTE(review): relies on Python 2 semantics (str.translate with a
    deletion argument) — confirm before porting.
    """

    def __init__(self, output_file, sample_id, refseq_filter=None, min_mapping_score=None,\
        min_alignment_score=None, max_mismatches=None,\
        max_relative_mismatches=None, min_continiously_matching=None,\
        filter_complexity=False):
        # output_file: path of the bzipped HIT file to create.
        # sample_id: sample identifier; all characters outside [A-Za-z0-9_-]
        #   are stripped via the NON_ID deletion table.
        # refseq_filter: if set, only read groups with at least one hit in
        #   one of these reference sequence groups are written.
        # min_mapping_score / min_alignment_score: per-hit lower bounds.
        # max_mismatches / max_relative_mismatches: per-hit upper bounds
        #   (absolute count / count divided by read length).
        # min_continiously_matching: minimum longest continuous match run.
        # filter_complexity: drop low-complexity reads (zlib ratio < 0.5).

        # Large buffer keeps bz2 compression efficient for streamed writes.
        self.output_file = bz2.BZ2File(output_file, 'wb', buffering=100 * 1024 * 1024)
        self.sample_id = sample_id.translate(None, NON_ID)
        self.refseq_filter = refseq_filter
        self.max_mismatches = max_mismatches
        self.max_relative_mismatches = max_relative_mismatches
        # Accumulator for the read currently streaming by:
        # [read_name, seq, [hit, hit, ...]]
        self.current_group = []
        self.min_mapping_score = min_mapping_score
        self.min_alignment_score = min_alignment_score
        self.min_continiously_matching = min_continiously_matching
        self.filter_complexity = filter_complexity

        # Precompiled CIGAR helpers (not used by the methods below).
        self.re_matches = re.compile(r'(\d+)M')
        self.re_dels = re.compile(r'(\d+)D')

    def count(self, parsed_line):
        # Consume one parsed SAM line (the tuple produced by SAMParser.parse);
        # accumulate the hit into the current read group and flush the
        # previous group whenever a new read name begins.

        if parsed_line is None:
            return

        read_key, read_name, flag, ref_name, ref_position, mapping_score,\
            cigar, mate_ref_name, mate_ref_position, insert_size, seq, qual,\
            is_end1, is_end2, number_mismatches, alignment_score,\
            number_hits, is_reverse, is_primary, is_mapped, is_mate_mapped,\
            is_paired, number_matches, read_end_pos, max_match = parsed_line

        if not is_mapped:
            return

        if self.filter_complexity:

            # Low-complexity filter: repetitive sequences compress well, so
            # reads whose zlib-compressed size is less than half the raw
            # length are discarded.
            avg_compression = float(len(zlib.compress(seq)))/len(seq)

            if avg_compression < 0.5:
                return

        # '/1' or '/2' suffix preserves the mate end in the stored read name.
        pair_id = ''
        if is_end1:
            pair_id = '/1'

        elif is_end2:
            pair_id = '/2'

        read_name = self.sample_id + ';' + read_name + pair_id

        # Initialize new current group
        if len(self.current_group) == 0:
            self.current_group = [read_name, seq, []]

        # Write old current group to file
        if read_name != self.current_group[0]:
            self._write_group()
            self.current_group = [read_name, seq, []]

        # Reference names are expected to encode at least
        # 'refseq_group;family;organism;identifier'.
        try:
            refseq_group, family, organism, identifier = ref_name.split(';')[:4]
        except ValueError:
            sys.stderr.write('Read mapped to malformed reference sequence %s, skipping\n' % ref_name)
            return

        # Per-hit quality filters; each silently drops the hit when violated.
        if self.min_continiously_matching:

            if self.min_continiously_matching > max_match:
                return

        if self.max_mismatches\
            and int(number_mismatches) > self.max_mismatches:
            return

        if self.max_relative_mismatches\
            and int(number_mismatches) / float(len(seq))\
            > self.max_relative_mismatches:
            return

        if self.min_mapping_score\
            and self.min_mapping_score > mapping_score:
            return

        if self.min_alignment_score\
            and self.min_alignment_score > alignment_score:
            return

        # Convert the parser's start coordinate to 1-based output
        # (presumably HTSeq supplies 0-based starts — TODO confirm).
        start = int(ref_position) + 1

        self.current_group[2].append([refseq_group, family, organism, identifier, str(start), str(read_end_pos)])

    def _write_group(self):
        # Flush the accumulated read group as a single FASTA record. When a
        # refseq filter is configured, the group is only written if at least
        # one of its hits falls into a filtered reference sequence group.
        passed = True

        if self.refseq_filter:
            passed = False
            for refseq_group, family, organism, identifier, start, end in self.current_group[2]:
                if passed:
                    break
                for f in self.refseq_filter:
                    if refseq_group == f:
                        passed = True
                        break
        if passed:
            # Description: all hits of this read, ';'-joined fields,
            # '|'-separated entries.
            description = []
            for identifier in self.current_group[2]:
                description.append(';'.join(identifier))
            description = '|'.join(description)

            record = SeqRecord(Seq(self.current_group[1]))
            record.id = 'Read;' + self.current_group[0]
            record.description = description

            SeqIO.write([record], self.output_file, "fasta")

    def write(self):
        # Flush the final pending read group and close the output stream.
        # NOTE(review): if no mapped read was ever counted, current_group is
        # an empty list and _write_group raises IndexError on
        # current_group[1] — confirm callers always feed at least one read.

        self._write_group()
        self.output_file.close()
381 | |
class SAMParser:
    """Parses raw SAM lines into a flat tuple of alignment attributes (via HTSeq)."""

    def parse(self, line):
        """Parse one SAM line.

        Returns None for header lines (starting with '@'); otherwise a
        25-tuple:
        (read_key, read_name, flag, ref_name, ref_position, mapping_score,
         cigar, mate_ref_name, mate_ref_position, insert_size, seq, qual,
         is_end1, is_end2, number_mismatches, alignment_score, number_hits,
         is_reverse, is_primary, is_mapped, is_mate_mapped, is_paired,
         number_matches, read_end_pos, max_match)

        BUGFIXES relative to the previous version:
          - The SAM optional tags NM and NH were swapped: NM (edit distance,
            i.e. mismatches) was stored as number_hits and NH (number of
            reported alignments, i.e. hits) as number_mismatches. They are
            now assigned per the SAM specification.
          - alignment.aQual (the SAM MAPQ mapping quality) was stored in
            alignment_score, leaving mapping_score permanently 0 and
            disabling min_mapping_score filtering downstream; it now feeds
            mapping_score, while alignment_score comes from the 'AS' tag.
        """

        if line[0] == '@':
            return None

        alignment = HTSeq._HTSeq.SAM_Alignment.from_SAM_line(line)
        read_name = alignment.read.name
        seq = alignment.read.seq
        qual = alignment.read.qual
        flag = alignment.flag
        cigar = None

        # Decode the SAM FLAG bitfield.
        is_paired = (flag & 1)
        is_mapped = not (flag & 4)
        is_mate_mapped = alignment.mate_aligned is not None  # not (flag & 8)
        is_reverse = (flag & 16)
        is_end1 = (flag & 64)
        is_end2 = (flag & 128)
        is_primary = not (flag & 256)

        # One key per sequenced segment (read name + mate end).
        read_key = (read_name, is_end1)

        ref_name = None
        ref_position = None
        mapping_score = 0
        mate_ref_name = None
        mate_ref_position = None
        insert_size = None
        alignment_score = 0
        read_end_pos = None

        if is_mate_mapped and alignment.mate_start:
            mate_ref_name = alignment.mate_start.chrom
            mate_ref_position = alignment.mate_start.start

        number_hits = 0
        number_mismatches = 0

        number_matches = 0
        max_match = 0

        if is_mapped:

            ref_name = alignment.iv.chrom
            ref_position = alignment.iv.start
            read_end_pos = alignment.iv.end
            # aQual is the MAPQ mapping quality (see HTSeq SAM_Alignment).
            mapping_score = alignment.aQual
            cigar = alignment.cigar

            if is_mate_mapped:
                insert_size = alignment.inferred_insert_size

            # Total matched bases and the longest continuous match run.
            for c in cigar:
                if c.type == 'M':
                    number_matches += c.size
                    max_match = max(max_match, c.size)

            # Optional SAM tags: NM = edit distance (mismatches),
            # AS = alignment score, NH = number of reported alignments.
            for tag, value in alignment.optional_fields:
                if tag == 'NM':
                    number_mismatches = value
                elif tag == 'AS':
                    alignment_score = value
                elif tag == 'NH':
                    number_hits = value

        return read_key, read_name, flag, ref_name, ref_position, mapping_score,\
            cigar, mate_ref_name, mate_ref_position, insert_size, seq, qual,\
            is_end1, is_end2, number_mismatches, alignment_score,\
            number_hits, is_reverse, is_primary, is_mapped, is_mate_mapped,\
            is_paired, number_matches, read_end_pos, max_match
455 | |
456 | |
class SAMQuality:
    """Accumulates mapping- and base-quality metrics from a parsed SAM stream
    and writes a tab-separated report.

    'segment' metrics are counted once per SAM record; 'read' metrics are
    counted once per template (first segment only). Also tallies the top
    reference sequences and, when the khmer package is available, the most
    frequent 10-mers.

    NOTE(review): relies on Python 2 (sys.maxint, string.maketrans,
    dict.iterkeys/iteritems) — confirm before porting.
    """

    def __init__(self, file_path):
        # file_path: destination of the metrics report written by write().

        self.file_path = file_path

        # (item, metric) -> Counter mapping observed value -> occurrences.
        self.stored = defaultdict(Counter)
        # Alignment tallies per reference: all alignments vs. primary only.
        self.all_references = defaultdict(int)
        self.primary_references = defaultdict(int)
        # Translation table for reverse complements (Python 2 maketrans).
        self.complement = string.maketrans('ATCGN', 'TAGCN')

        # Optional 10-mer counting when khmer is installed.
        if KHMER_AVAILABLE:
            self.ktable = khmer.new_ktable(10)

    def _get_complement(self, sequence):
        # Reverse complement of a nucleotide string.

        return sequence.translate(self.complement)[::-1]

    def _get_summary(self, counter):
        """Returns seven numbers (count, sum, min, max, mode, mean, and std)
        for a max_frequency counter """

        maximum = 0
        minimum = sys.maxint
        thesum = 0
        allcount = 0
        mode = [0, None]

        items = 0.0
        mean = 0.0
        m2 = 0.0
        variance = 0.0

        for item in counter:

            count = counter[item]
            if count > mode[0]:
                mode = [count, item]

            allcount += count
            maximum = max(maximum, item)
            minimum = min(minimum, item)
            thesum += (count * item)

            # Welford-style online mean/variance update, applied once per
            # occurrence of this item value.
            x = 1
            while x <= count:
                items += 1
                delta = item - mean
                mean = mean + delta / items
                m2 = m2 + delta * (item - mean)
                variance = m2 / items
                x += 1

        std = math.sqrt(variance)

        return allcount, thesum, minimum, maximum, mode[1], mean, std


    def _to_unit(self, item, is_percentage=False):
        """ Convert a numeric to a string with metric units """

        if is_percentage:
            return ('%-.3f' % (item * 100)) + '%'
        converted = None
        try:
            item = float(item)
            if item > 10**12:
                # NOTE(review): the 'P' branch divides by 10**9; this looks
                # like it should be 10**12 — confirm intended output.
                converted = str(round(item / 10**9,3))+'P'
            elif item > 10**9:
                converted = str(round(item / 10**9,3))+'G'
            elif item > 10**6:
                converted = str(round(item / 10**6,3))+'M'
            elif item > 10**3:
                converted = str(round(item / 10**3,3))+'K'
            else:
                converted = str(round(item,3))
        except:
            # Non-numeric items (reference names, kmers, ...) pass through.
            converted = str(item)

        return converted

    def _str_metrics(self, data):
        # Render every stored metric: numeric counters get the statistical
        # summary from _get_summary; non-numeric counters (detected by the
        # first key lacking a 'real' attribute) are listed per element.

        str_metrics = []

        for (item, metric) in sorted(data.keys()):
            counter = data[(item, metric)]
            if not hasattr(counter.iterkeys().next(), 'real'):
                for element, count in counter.most_common():
                    str_metrics.append(self._str_metric(item, metric, element, count, no_units=True))
            else:
                summary = self._get_summary(counter)
                str_metrics.append(self._str_metric(item, metric, *summary))

        return str_metrics

    def _str_metric(self, item, metric, count, thesum='', minimum='',\
        maximum='', mode='', mean='', std='', no_units=False):
        # Format one report row: 9 tab-separated, width-padded columns.

        counters = [count, thesum, minimum, maximum, mode, mean, std]
        counters = map(str, counters)

        if no_units:
            items = [item, metric] + counters
        else:
            units = map(self._to_unit, counters)
            items = [item, metric] + units

        return '%-15s\t%-60s\t%12s\t%12s\t%12s\t%12s\t%12s\t%12s\t%12s\n' \
            % tuple(items)


    def _count_read(self, metric, data, sample):
        # Template-level counters: mapping multiplicity and insert size.

        item = 'read'

        (insert_size, alignment_score, mapping_score, length,\
            q20_length, avg_phred_quality, number_hits, is_reverse) = data

        self.stored[(item, metric + ' mappings')][number_hits] += sample
        self.stored[(item, metric + ' insert')][insert_size] += sample


    def _count_segment(self, metric, data, sample):
        # Segment-level counters: scores, lengths and base-quality summaries.

        item = 'segment'

        (insert_size, alignment_score, mapping_score, length,\
            q20_length, avg_phred_quality, number_hits, is_reverse) = data

        self.stored[(item, metric + ' algq')][alignment_score] += sample
        self.stored[(item, metric + ' mapq')][mapping_score] += sample
        self.stored[(item, metric + ' length')][length] += sample
        self.stored[(item, metric + ' q20length')][q20_length] += sample
        self.stored[(item, metric + ' meanbasequal')][avg_phred_quality] += sample
        self.stored[(item, metric + ' reverse')][is_reverse] += sample

    def count(self, parsed_line):
        # Consume one parsed SAM line (tuple from SAMParser.parse) and update
        # every applicable metric counter.

        if parsed_line is None:
            return

        read_key, read_name, flag, ref_name, ref_position, mapping_score,\
            cigar, mate_ref_name, mate_ref_position, insert_size, seq, qual,\
            is_end1, is_end2, number_mismatches, alignment_score,\
            number_hits, is_reverse, is_primary, is_mapped, is_mate_mapped,\
            is_paired, number_matches, read_end_pos, max_match = parsed_line

        # Phred scores from raw quality values (assumes +33 Sanger offset —
        # TODO confirm the parser supplies raw byte values).
        phred_quality = [q - 33 for q in qual]
        avg_phred_quality = sum(phred_quality) / float(len(phred_quality))
        length = len(seq)
        mate_reference_id = mate_ref_name
        reference_id = ref_name
        reference = reference_id is not None and reference_id != '*'
        insert_size = insert_size and abs(insert_size) or insert_size
        # Segment 1 stands in for the whole template in 'read' metrics.
        is_segment1 = not is_paired or (is_paired and is_end1)
        is_reverse = is_reverse
        is_unique = is_primary and number_hits == 1
        is_translocation = is_paired and is_mapped and is_mate_mapped\
            and (mate_reference_id != '=' and reference_id != mate_reference_id)
        is_part_of_doublemap = is_paired and is_mapped and is_mate_mapped
        is_part_of_halfmap = is_paired and (is_mapped != is_mate_mapped)
        is_part_of_nomap = is_paired and not is_mapped and not is_mate_mapped


        # Count length until first low quality base call
        q20_length = 0
        for q in phred_quality:
            if q < 20:
                break
            q20_length += 1

        # Count kmers (consumed in forward-strand orientation)
        if KHMER_AVAILABLE:
            if not is_reverse:
                self.ktable.consume(seq)
            else:
                self.ktable.consume(self._get_complement(seq))

        if reference:

            self.all_references[reference_id] += 1

            if is_primary:
                self.primary_references[reference_id] += 1


        data = (insert_size, alignment_score, mapping_score, length,\
            q20_length, avg_phred_quality, number_hits, is_reverse)

        sample = 1

        self._count_segment('sequenced', data, sample)
        if is_mapped:
            self._count_segment('sequenced mapped multi', data, sample)
            if is_primary:
                self._count_segment('sequenced mapped primary', data, sample)
                if number_hits and is_unique:
                    self._count_segment('sequenced mapped primary unique', data, sample)

            if is_segment1:
                self._count_read('sequenced mapped multi', data, sample)
                if is_primary:
                    self._count_read('sequenced mapped primary', data, sample)

        if is_paired:
            self._count_segment('sequenced paired', data, sample)

            if is_part_of_doublemap:
                self._count_segment('sequenced paired doublemap', data, sample)

                if is_primary:
                    self._count_segment('sequenced paired doublemap primary', data, sample)

                if is_segment1:
                    self._count_read('sequenced paired doublemap multi', data, sample)

                    if is_primary:
                        self._count_read('sequenced paired doublemap primary', data, sample)

                        if number_hits and is_unique:
                            self._count_read('sequenced paired doublemap primary unique', data, sample)

                            if is_translocation:
                                self._count_read('sequenced paired doublemap primary unique translocation', data, sample)

            elif is_part_of_halfmap:

                self._count_segment('sequenced paired halfmap', data, sample)

                # The mapped segment
                if is_mapped:

                    self._count_segment('sequenced paired halfmap mapped', data, sample)

                    if is_primary:

                        self._count_read('sequenced paired halfmap mapped primary', data, sample)

                        if number_hits and is_unique:
                            self._count_read('sequenced paired halfmap mapped primary unique', data, sample)

                    elif not is_primary:

                        # NOTE(review): 'unpaired' label inside a paired
                        # branch — confirm intended metric name.
                        self._count_read('sequenced unpaired mapped multi', data, sample)

                # The unmapped segment
                elif not is_mapped:
                    self._count_segment('sequenced paired halfmap unmapped', data, sample)

            elif is_part_of_nomap:

                self._count_segment('sequenced paired nomap', data, sample)

                if is_segment1:

                    self._count_read('sequenced paired nomap', data, sample)

        elif not is_paired:

            self._count_segment('sequenced unpaired', data, sample)

            if is_mapped:

                self._count_segment('sequenced unpaired mapped', data, sample)

                if is_primary:

                    self._count_read('sequenced unpaired mapped primary', data, sample)

                    if number_hits and is_unique:

                        # NOTE(review): label mixes 'paired unpaired' — looks
                        # like a typo for 'sequenced unpaired mapped primary
                        # unique'; confirm before changing report output.
                        self._count_read('sequenced paired unpaired mapped primary unique', data, sample)

                elif not is_primary:

                    self._count_read('sequenced unpaired mapped multi', data, sample)


            elif not is_mapped:

                self._count_segment('sequenced unpaired unmapped', data, sample)

                if is_segment1:
                    self._count_read('sequenced unpaired unmapped', data, sample)

    def write(self):
        # Write the final report: top-30 reference tallies (all and primary),
        # top-10 kmers when khmer is available, and all stored counters.

        with open(self.file_path, 'w') as output_file:

            all_references = sorted([(count, reference) for reference, count\
                in self.all_references.iteritems()], reverse=True)

            for j, (count, reference) in enumerate(all_references[:30]):
                self.stored[('segment', 'multireference_' + str(j+1))][reference] = count

            primary_references = sorted([(count, reference) for reference, count\
                in self.primary_references.iteritems()], reverse=True)

            for j, (count, reference) in enumerate(primary_references[:30]):
                self.stored[('segment', 'primaryreference_' + str(j+1))][reference] = count

            # Extract top-ranking kmers
            if KHMER_AVAILABLE:
                kmer_frequencies = []
                for i in range(0, self.ktable.n_entries()):
                    n = self.ktable.get(i)
                    if n > 0:
                        kmer_frequencies.append((n, self.ktable.reverse_hash(i)))
                kmer_frequencies = sorted(kmer_frequencies, reverse=True)
                for j, (frequency, kmer) in enumerate(kmer_frequencies[:10]):
                    self.stored[('segment', 'kmer_' + str(j+1))][kmer] = frequency

            output_file.writelines(self._str_metrics(self.stored))
773 | |
774 | |
class SAMTaxonomy:
    """ Provides taxonomic summary information from a SAM file stream. """

    def __init__(self, file_path):
        # file_path: destination of the tab-separated taxonomy summary.

        self.file_path = file_path

        # Primary alignment count per organism.
        self.count_primaries = Counter()
        # organism -> [refseq_group, family, set(identifiers), then five
        # [sum, n] running-average pairs (mapping score, read length, hits,
        # alignment score, mismatches), then human_prim, human_sec counts].
        self.detailed_information = {}

        # State for attributing human co-hits of the read currently
        # streaming by (flushed when the read key changes).
        self._last_read = (None, None)
        self._last_read_human_prim = 0
        self._last_read_human_sec = 0
        self._last_organisms = set()

    def count(self, parsed_line):
        # Consume one parsed SAM line (tuple from SAMParser.parse).

        if parsed_line is None:
            return

        read_key, read_name, flag, ref_name, ref_position, mapping_score,\
            cigar, mate_ref_name, mate_ref_position, insert_size, seq, qual,\
            is_end1, is_end2, number_mismatches, alignment_score,\
            number_hits, is_reverse, is_primary, is_mapped, is_mate_mapped,\
            is_paired, number_matches, read_end_pos, max_match = parsed_line

        if is_mapped:

            # Reference names encode 'refseq_group;family;organism;gi;...'.
            refseq_group, family, organism, gi = ref_name.split(';')[:4]

            if is_primary:
                self.count_primaries[organism] += 1

            if organism not in self.detailed_information:
                # refseq_group, family, gis, avg_mapping_score,
                # avg_seq_length, avg_number_hits, avg_alignment_score, avg_nr_mismatches
                initial = [refseq_group,
                           family,
                           set([gi]),
                           [int(mapping_score), 1],
                           [len(seq), 1],
                           [0, 0],
                           [alignment_score, 1],
                           [number_mismatches, 1],
                           0,
                           0]

                self.detailed_information[organism] = initial

            else:
                # Fold this alignment into the running [sum, n] averages.
                entry = self.detailed_information[organism]
                entry[2].add(gi)
                entry[3][0] += int(mapping_score)
                entry[3][1] += 1
                entry[4][0] += len(seq)
                entry[4][1] += 1
                entry[6][0] += alignment_score
                entry[6][1] += 1
                entry[7][0] += number_mismatches
                entry[7][1] += 1

            # Hit multiplicity is averaged over primary alignments only.
            if is_primary:
                entry = self.detailed_information[organism]
                entry[5][0] += number_hits
                entry[5][1] += 1

            if self._last_read == (None, None):
                self._last_read = read_key

            # Read change: flush the human-hit counters of the previous read
            # to every organism that read mapped to, then reset the state.
            if self._last_read != read_key:

                for last_organism in self._last_organisms:

                    self.detailed_information[last_organism][8]\
                        += self._last_read_human_prim

                    self.detailed_information[last_organism][9]\
                        += self._last_read_human_sec

                self._last_read = read_key
                self._last_organisms = set()
                self._last_read_human_prim = 0
                self._last_read_human_sec = 0

            self._last_organisms.add(organism)

            if organism == 'Homo_sapiens':
                if is_primary:
                    self._last_read_human_prim += 1
                else:
                    self._last_read_human_sec += 1

    def get_summary(self, top=100):
        # Build report lines for the `top` organisms with the most primary
        # alignments: per-organism averages plus human co-hit counts.

        lines = []

        lines.append('%10s\t%20s\t%20s\t%-20s\t%10s\t%10s\t%10s\t%5s\t%5s\t%5s\t%10s\t%10s\n'\
            % ('Count', 'Group', 'Family', 'Organism', 'Targets', 'ReadLen', 'Hits', 'Map', 'Algn', 'Mism', 'HuP', 'HuS'))

        top_organisms = self.count_primaries.most_common(top)

        for organism, count in top_organisms:

            refseq_group, family, identifiers,\
                avg_mapping_score, avg_seq_length, avg_number_hits,\
                avg_alignment_score, avg_nr_mismatches, human_prim, human_sec\
                = self.detailed_information[organism]

            # Each avg_* entry is a [sum, n] pair; reduce to integer means.
            avg_len = int(avg_seq_length[0] / float(avg_seq_length[1]))
            if avg_number_hits[1] == 0:
                avg_hits = 0
            else:
                avg_hits = int(avg_number_hits[
                    0] / float(avg_number_hits[1]))

            avg_mapping_score = int(avg_mapping_score[
                0] / float(avg_mapping_score[1]))

            avg_alignment_score = int(avg_alignment_score[
                0] / float(avg_alignment_score[1]))

            avg_nr_mismatches = int(avg_nr_mismatches[
                0] / float(avg_nr_mismatches[1]))

            nr_ids = len(identifiers)

            # Abbreviate large counts with an 'M' (million) suffix.
            if count > 10**6:
                count = str(round(count / float(10**6), 3)) + 'M'
            if human_prim > 10**6:
                human_prim = str(round(human_prim / float(10**6), 3)) + 'M'
            if human_sec > 10**6:
                human_sec = str(round(human_sec / float(10**6), 3)) + 'M'
            if nr_ids > 10**6:
                nr_ids = str(round(nr_ids / float(10**6), 3)) + 'M'

            lines.append('%10s\t%20s\t%20s\t%-20s\t%10s\t%10i\t%10i\t%5i\t%5i\t%5i\t%10s\t%10s\n'\
                % (str(count), refseq_group[:20], family[:20], organism[:20],\
                str(nr_ids), avg_len, avg_hits, avg_mapping_score,\
                avg_alignment_score, avg_nr_mismatches, str(human_prim),\
                str(human_sec)))

        return lines

    def write(self):
        # Write the taxonomy summary to file_path.
        # NOTE(review): the human-hit counters of the very last read are
        # never flushed into detailed_information (flushing only happens on
        # read change inside count()) — confirm whether the final read's HuP
        # and HuS counts may be dropped.

        with open(self.file_path, 'w') as output_file:
            output_file.writelines(self.get_summary())
922 | |
923 @CLI.subcommand("rnamap") | |
924 class RNAmap(cli.Application): | |
925 """ Map input reads against a STAR index """ | |
926 | |
927 index_dir = cli.SwitchAttr(['-i', '--index_dir'], str, mandatory=True, | |
928 help="Sets the index output directory") | |
929 | |
930 threads = cli.SwitchAttr( | |
931 ['-t', '--threads'], cli.Range(1, 512), mandatory=False, | |
932 help="Sets the number of threads to use", | |
933 default=1) | |
934 | |
935 taxonomy = cli.SwitchAttr( | |
936 ['-x', '--taxonomy'], str, mandatory=False, | |
937 help="Output path for the taxonomy file; setting this option will also enable regular taxonomy output to stdout during mapping", | |
938 default='') | |
939 | |
940 star_path = cli.SwitchAttr(['--star_path'], str, mandatory=False, | |
941 help="Path to STAR executable", | |
942 default='') | |
943 | |
944 samtools_path = cli.SwitchAttr(['--samtools_path'], str, mandatory=False, | |
945 help="Path to samtools executable", | |
946 default='') | |
947 | |
948 temp_path = cli.SwitchAttr(['--temporary_path'], str, mandatory=False, | |
949 help="Path to temporary directory in which to generate temp files. All temp files with be automatically deleted after execution is complete.", | |
950 default='') | |
951 | |
952 min_mapping_score = cli.SwitchAttr(['--min_mapping_score'], cli.Range(1, 255), mandatory=False, | |
953 help="Mimimum mapping score for saved hits (only applied to -v/--virana_hits)", | |
954 default=None) | |
955 | |
956 min_alignment_score = cli.SwitchAttr(['--min_alignment_score'], cli.Range(1, 255), mandatory=False, | |
957 help="Mimimum alignment score for saved hits (only applied to -v/--virana_hits)", | |
958 default=None) | |
959 | |
960 max_mismatches = cli.SwitchAttr(['--max_mismatches'], cli.Range(0, 10000000), mandatory=False, | |
961 help="Maximum number of mismatches for saved hits (only applied to -v/--virana_hits)", | |
962 default=None) | |
963 | |
964 max_relative_mismatches = cli.SwitchAttr(['--max_relative_mismatches'], float, mandatory=False, | |
965 help="Maximum number of mismatches relative to read length for saved hits (only applied to -v/--virana_hits)", | |
966 default=None) | |
967 | |
968 min_continiously_matching = cli.SwitchAttr(['--min_continiously_matching'], cli.Range(0, 10000000), mandatory=False, | |
969 help="Minimum number of continious matches for saved hits (only applied to -v/--virana_hits)", | |
970 default=None) | |
971 | |
972 filter_complexity = cli.Flag(['--filter_complexity'], | |
973 help="Discard low-complexity reads (only applied to -v/--virana_hits). Adds some extra processing load to the mapping and may discard important information. Applies to all output files, including quality files (!)", | |
974 default=False) | |
975 | |
976 bam = cli.SwitchAttr(['-b', '--bam'], str, mandatory=False, | |
977 help="Path to unsorted, unindexed output BAM file", | |
978 default='') | |
979 | |
980 sam = cli.SwitchAttr(['-s', '--sam'], str, mandatory=False, | |
981 help="Path to output SAM file", | |
982 default='') | |
983 | |
984 qual = cli.SwitchAttr(['-q', '--qual'], str, mandatory=False, | |
985 help="Path to output quality file", | |
986 default='') | |
987 | |
988 hits = cli.SwitchAttr(['-v', '--virana_hits'], str, mandatory=False, | |
989 help="Path to bzip2-compressed tab-delimited output hit file", | |
990 default='') | |
991 | |
992 sample_id = cli.SwitchAttr(['--sample_id'], str, mandatory=False, | |
993 help="Alphanumeric string ([0-9a-zA-Z_-]*) used to designate sample information within the hit file", | |
994 default='no_sample_id') | |
995 | |
996 unmapped1 = cli.SwitchAttr(['--unmapped_end_1'], str, mandatory=False, | |
997 help="Output path to uncompressed fastq file containing unmapped reads, first ends only for paired ends.", | |
998 default='') | |
999 | |
1000 unmapped2 = cli.SwitchAttr(['--unmapped_end_2'], str, mandatory=False, | |
1001 help="Output path to uncompressed fastq file containing unmapped reads, second ends only for paired ends.", | |
1002 default='') | |
1003 | |
1004 splice_junctions = cli.SwitchAttr(['--splice_junctions'], str, mandatory=False, | |
1005 help="Input path to splice junction file (currently not implemented)", | |
1006 default='') | |
1007 | |
1008 chimeric_mappings = cli.SwitchAttr(['--chimeric_mappings'], str, mandatory=False, | |
1009 help="Ouput path to SAM file containing chimeric mappings", | |
1010 default='') | |
1011 | |
1012 hit_filter = cli.SwitchAttr( | |
1013 ['-f', '--virana_hit_filter'], str, list=True, mandatory=False, | |
1014 help="Only generate hit groups that include at last one read mapping to a reference of this reference group.", | |
1015 default=[]) | |
1016 | |
1017 debug = cli.Flag(["-d", "--debug"], help="Enable debug information") | |
1018 | |
1019 reads = cli.SwitchAttr( | |
1020 ['-r', '--reads'], str, list=True, mandatory=True, | |
1021 help="Sets the input reads. Add this parameter twice for paired end reads.") | |
1022 | |
1023 zipped = cli.Flag(["-z", "--zipped"], help="Input reads are zipped") | |
1024 | |
1025 sensitive = cli.Flag( | |
1026 ["--sensitive"], help="If given, mapping will process slower and more sensitive") | |
1027 | |
1028 def main(self): | |
1029 | |
1030 if self.debug: | |
1031 logging.getLogger().setLevel(logging.DEBUG) | |
1032 | |
1033 # Obtain star executable | |
1034 star = [self.star_path and self.star_path or 'STAR'] | |
1035 samtools = [self.samtools_path and self.samtools_path or 'samtools'] | |
1036 | |
1037 # Check if genome directory is existing | |
1038 if not os.path.exists(self.index_dir): | |
1039 sys.stdout.write('Index directory %s not existing, exiting' % self.genome_dir) | |
1040 sys.exit(1) | |
1041 | |
1042 if self.temp_path: | |
1043 temp_path = tempfile.mkdtemp(dir=self.temp_path) | |
1044 else: | |
1045 temp_path = tempfile.mkdtemp() | |
1046 | |
1047 first_ends = [] | |
1048 second_ends = [] | |
1049 single_ends = [] | |
1050 | |
1051 if len(self.reads) == 2: | |
1052 first, second = self.reads | |
1053 first_ends.append(first) | |
1054 second_ends.append(second) | |
1055 | |
1056 elif len(self.reads) == 1: | |
1057 single_ends.append(self.reads[0]) | |
1058 | |
1059 else: | |
1060 sys.stdout.write('Invalid number of fastq files; provide either one (single end) or two (paired end)') | |
1061 sys.exit(1) | |
1062 | |
1063 if single_ends and not first_ends and not second_ends: | |
1064 reads = [','.join(single_ends)] | |
1065 | |
1066 elif first_ends and second_ends: | |
1067 reads = [','.join(first_ends), ','.join(second_ends)] | |
1068 | |
1069 | |
1070 star_cline = star + ['--runMode', 'alignReads', | |
1071 '--genomeDir', self.index_dir, | |
1072 '--runThreadN', str(self.threads), | |
1073 '--readMatesLengthsIn', 'NotEqual', | |
1074 '--outFileNamePrefix', os.path.join( | |
1075 temp_path, 'out'), | |
1076 '--outSAMmode', 'Full', | |
1077 '--outSAMstrandField', 'None', | |
1078 '--outSAMattributes', 'Standard', | |
1079 '--outSAMunmapped', 'Within', | |
1080 '--outStd', 'SAM', | |
1081 '--outFilterMultimapNmax', '1000', | |
1082 '--outSAMprimaryFlag', 'AllBestScore', | |
1083 '--outSAMorder', 'PairedKeepInputOrder'] | |
1084 | |
1085 if self.unmapped1 or self.unmapped2: | |
1086 star_cline += ['--outReadsUnmapped', 'Fastx'] | |
1087 else: | |
1088 star_cline += ['--outReadsUnmapped', 'None'] | |
1089 | |
1090 | |
1091 if self.zipped: | |
1092 star_cline += ['--readFilesCommand', 'zcat'] | |
1093 | |
1094 if self.sensitive: | |
1095 star_cline += ['--outFilterMultimapScoreRange', '10', | |
1096 '--outFilterMismatchNmax', '60', | |
1097 '--outFilterMismatchNoverLmax', '0.3', | |
1098 '--outFilterScoreMin', '0', | |
1099 '--outFilterScoreMinOverLread', '0.3', | |
1100 '--outFilterMatchNmin', '0', | |
1101 '--outFilterMatchNminOverLread', '0.66', | |
1102 '--seedSearchStartLmax', '12', | |
1103 '--winAnchorMultimapNmax', '50'] | |
1104 | |
1105 star_cline += ['--readFilesIn'] + reads | |
1106 | |
1107 if self.debug: | |
1108 print ' '.join(star_cline) | |
1109 | |
1110 # Try if we can make the relevant files | |
1111 touch_files = [self.unmapped1, self.unmapped2, self.taxonomy, self.qual, self.hits, self.sam, self.bam] | |
1112 for file_path in touch_files: | |
1113 if file_path is None or file_path == '': | |
1114 continue | |
1115 try: | |
1116 with file(file_path, 'a'): | |
1117 os.utime(file_path, None) | |
1118 except IOError: | |
1119 sys.stderr.write('Could not write output file %s\n' % file_path) | |
1120 sys.exit(1) | |
1121 | |
1122 star_process = subprocess.Popen(' '.join( | |
1123 star_cline), shell=True, stdout=PIPE) | |
1124 | |
1125 parser = SAMParser() | |
1126 | |
1127 if self.taxonomy: | |
1128 taxonomy = SAMTaxonomy(self.taxonomy) | |
1129 | |
1130 if self.qual: | |
1131 quality = SAMQuality(self.qual) | |
1132 | |
1133 if self.hits: | |
1134 hits = SAMHits(self.hits, self.sample_id, self.hit_filter, | |
1135 self.min_mapping_score, | |
1136 self.min_alignment_score, | |
1137 self.max_mismatches, | |
1138 self.max_relative_mismatches, | |
1139 self.min_continiously_matching, | |
1140 self.filter_complexity) | |
1141 | |
1142 if self.sam: | |
1143 sam_file = open(self.sam, 'w') | |
1144 | |
1145 if self.bam: | |
1146 with open(self.bam, 'wb', buffering=100 * 1024 * 1024) as bam_file: | |
1147 samtools_cline = samtools + [ | |
1148 'view', '-b', '-1', '-S', '-@', '4', '/dev/stdin'] | |
1149 if self.debug: | |
1150 print ' '.join(samtools_cline) | |
1151 samtools_process = subprocess.Popen(' '.join(samtools_cline), shell=True, stdout=bam_file, stdin=PIPE) | |
1152 | |
1153 | |
1154 do_sam = self.sam | |
1155 do_bam = self.bam | |
1156 do_taxonomy = self.taxonomy | |
1157 do_qual = self.qual | |
1158 do_hits = self.hits | |
1159 do_parse = do_taxonomy or do_qual or do_hits | |
1160 | |
1161 for i, line in enumerate(iter(star_process.stdout.readline, '')): | |
1162 | |
1163 if do_sam: | |
1164 sam_file.write(line) | |
1165 | |
1166 if do_bam: | |
1167 samtools_process.stdin.write(line) | |
1168 | |
1169 if line[0] == '@': | |
1170 continue | |
1171 | |
1172 if do_parse: | |
1173 parsed_line = parser.parse(line) | |
1174 | |
1175 if do_taxonomy: | |
1176 taxonomy.count(parsed_line) | |
1177 if i > 0 and (i % 50000) == 0: | |
1178 print ''.join(taxonomy.get_summary(10)) | |
1179 | |
1180 if do_qual: | |
1181 quality.count(parsed_line) | |
1182 | |
1183 if do_hits: | |
1184 hits.count(parsed_line) | |
1185 | |
1186 if do_bam: | |
1187 samtools_process.stdin.close() | |
1188 | |
1189 if do_sam: | |
1190 sam_file.close() | |
1191 | |
1192 if do_hits: | |
1193 hits.write() | |
1194 | |
1195 if do_taxonomy: | |
1196 print ''.join(taxonomy.get_summary(10)) | |
1197 taxonomy.write() | |
1198 | |
1199 if do_qual: | |
1200 quality.write() | |
1201 | |
1202 try: | |
1203 if self.unmapped1: | |
1204 shutil.move(os.path.join(temp_path, 'out' + 'Unmapped.out.mate1'),\ | |
1205 self.unmapped1) | |
1206 except IOError: | |
1207 pass | |
1208 | |
1209 try: | |
1210 if self.unmapped2: | |
1211 shutil.move(os.path.join(temp_path, 'out' + 'Unmapped.out.mate2'),\ | |
1212 self.unmapped2) | |
1213 except IOError: | |
1214 pass | |
1215 | |
1216 try: | |
1217 if self.chimeric_mappings: | |
1218 shutil.move(os.path.join(temp_path, 'out' + 'Chimeric.out.sam'),\ | |
1219 self.chimeric_mappings) | |
1220 except IOError: | |
1221 pass | |
1222 | |
1223 shutil.rmtree(temp_path) | |
1224 | |
1225 @CLI.subcommand("dnamap") | |
1226 class DNAmap(cli.Application): | |
1227 """ Map input reads against a BWA index """ | |
1228 | |
1229 index_dir = cli.SwitchAttr(['-i', '--index_dir'], str, mandatory=True, | |
1230 help="Sets the index output directory") | |
1231 | |
1232 threads = cli.SwitchAttr( | |
1233 ['-t', '--threads'], cli.Range(1, 512), mandatory=False, | |
1234 help="Sets the number of threads to use", | |
1235 default=1) | |
1236 | |
1237 taxonomy = cli.SwitchAttr( | |
1238 ['-x', '--taxonomy'], str, mandatory=False, | |
1239 help="Output path for the taxonomy file; setting this option will also enable regular taxonomy output to stdout during mapping", | |
1240 default='') | |
1241 | |
1242 samtools_path = cli.SwitchAttr(['--samtools_path'], str, mandatory=False, | |
1243 help="Path to samtools executable", | |
1244 default='') | |
1245 | |
1246 temp_path = cli.SwitchAttr(['--temporary_path'], str, mandatory=False, | |
1247 help="Path to temporary directory in which to generate temp files. All temp files with be automatically deleted after execution is complete.", | |
1248 default='') | |
1249 | |
1250 min_mapping_score = cli.SwitchAttr(['--min_mapping_score'], cli.Range(1, 255), mandatory=False, | |
1251 help="Mimimum mapping score for saved hits (only applied to -v/--virana_hits)", | |
1252 default=None) | |
1253 | |
1254 min_alignment_score = cli.SwitchAttr(['--min_alignment_score'], cli.Range(1, 255), mandatory=False, | |
1255 help="Mimimum alignment score for saved hits (only applied to -v/--virana_hits)", | |
1256 default=None) | |
1257 | |
1258 max_mismatches = cli.SwitchAttr(['--max_mismatches'], cli.Range(0, 10000000), mandatory=False, | |
1259 help="Maximum number of mismatches for saved hits (only applied to -v/--virana_hits)", | |
1260 default=None) | |
1261 | |
1262 max_relative_mismatches = cli.SwitchAttr(['--max_relative_mismatches'], float, mandatory=False, | |
1263 help="Maximum number of mismatches relative to read length for saved hits (only applied to -v/--virana_hits)", | |
1264 default=None) | |
1265 | |
1266 min_continiously_matching = cli.SwitchAttr(['--min_continiously_matching'], cli.Range(0, 10000000), mandatory=False, | |
1267 help="Minimum number of continious matches for saved hits (only applied to -v/--virana_hits)", | |
1268 default=None) | |
1269 | |
1270 filter_complexity = cli.Flag(['--filter_complexity'], | |
1271 help="Discard low-complexity reads (only applied to -v/--virana_hits). Adds some extra processing load to the mapping and may discard important information. Applies to all output files, including quality files (!)", | |
1272 default=False) | |
1273 | |
1274 sample_id = cli.SwitchAttr(['--sample_id'], str, mandatory=False, | |
1275 help="Alphanumeric string ([0-9a-zA-Z_-]*) used to designate sample information within the hit file", | |
1276 default='no_sample_id') | |
1277 | |
1278 bam = cli.SwitchAttr(['-b', '--bam'], str, mandatory=False, | |
1279 help="Path to unsorted, unindexed output BAM file", | |
1280 default='') | |
1281 | |
1282 sam = cli.SwitchAttr(['-s', '--sam'], str, mandatory=False, | |
1283 help="Path to output SAM file", | |
1284 default='') | |
1285 | |
1286 qual = cli.SwitchAttr(['-q', '--qual'], str, mandatory=False, | |
1287 help="Path to output quality file", | |
1288 default='') | |
1289 | |
1290 hits = cli.SwitchAttr(['-v', '--virana_hits'], str, mandatory=False, | |
1291 help="Path to bzip2-compressed tab-delimited output virana hit file", | |
1292 default='') | |
1293 | |
1294 hit_filter = cli.SwitchAttr( | |
1295 ['-f', '--virana_hit_filter'], str, list=True, mandatory=False, | |
1296 help="Only generate hit groups that include at last one read mapping to a reference of this reference group.", | |
1297 default=[]) | |
1298 | |
1299 debug = cli.Flag(["-d", "--debug"], help="Enable debug information") | |
1300 | |
1301 zipped = cli.Flag(["-z", "--zipped"], help="Input reads are zipped") | |
1302 | |
1303 sensitive = cli.Flag( | |
1304 ["--sensitive"], help="If given, mapping will process slower and more sensitive") | |
1305 | |
1306 | |
1307 bwa_path = cli.SwitchAttr(['--bwa_path'], str, mandatory=False, | |
1308 help="Path to BWA executable", | |
1309 default='') | |
1310 | |
1311 debug = cli.Flag(["-d", "--debug"], help="Enable debug information") | |
1312 | |
1313 if debug: | |
1314 logging.getLogger().setLevel(logging.DEBUG) | |
1315 | |
1316 reads = cli.SwitchAttr( | |
1317 ['-r', '--reads'], str, list=True, mandatory=True, | |
1318 help="Sets the input reads. Add this parameter twice for paired end reads.") | |
1319 | |
1320 def main(self): | |
1321 | |
1322 if self.debug: | |
1323 logging.getLogger().setLevel(logging.DEBUG) | |
1324 | |
1325 # Obtain star executable | |
1326 bwa = [self.bwa_path and self.bwa_path or 'bwa'] | |
1327 samtools = [self.samtools_path and self.samtools_path or 'samtools'] | |
1328 | |
1329 # Check if genome directory is existing | |
1330 if not os.path.exists(self.index_dir): | |
1331 sys.stdout.write('Index directory %s not existing, exiting'\ | |
1332 % self.genome_dir) | |
1333 sys.exit(1) | |
1334 | |
1335 if len(self.reads) not in (1, 2): | |
1336 message = 'Invalid number of FASTQ files; supply either one (single end) or two (paired end)\n' | |
1337 sys.stderr.write(message) | |
1338 sys.exit(1) | |
1339 | |
1340 bwa_cline = bwa + ['mem', '-t', str(self.threads), '-M', os.path.join(self.index_dir, 'index')] | |
1341 | |
1342 bwa_cline += self.reads | |
1343 | |
1344 if self.debug: | |
1345 print ' '.join(bwa_cline) | |
1346 | |
1347 bwa_process = subprocess.Popen(' '.join(bwa_cline), shell=True, stdout=PIPE) | |
1348 | |
1349 parser = SAMParser() | |
1350 | |
1351 if self.taxonomy: | |
1352 taxonomy = SAMTaxonomy(self.taxonomy) | |
1353 | |
1354 if self.qual: | |
1355 quality = SAMQuality(self.qual) | |
1356 | |
1357 if self.hits: | |
1358 hits = SAMHits(self.hits, self.sample_id, self.hit_filter, | |
1359 self.min_mapping_score, | |
1360 self.min_alignment_score, | |
1361 self.max_mismatches, | |
1362 self.max_relative_mismatches, | |
1363 self.min_continiously_matching, | |
1364 self.filter_complexity) | |
1365 | |
1366 if self.sam: | |
1367 sam_file = open(self.sam, 'w', buffering=100 * 1024 * 1024) | |
1368 | |
1369 if self.bam: | |
1370 with open(self.bam, 'wb', buffering=100 * 1024 * 1024) as bam_file: | |
1371 samtools_cline = samtools + [ | |
1372 'view', '-b', '-1', '-S', '-@', '4', '/dev/stdin'] | |
1373 if self.debug: | |
1374 print ' '.join(samtools_cline) | |
1375 samtools_process = subprocess.Popen(' '.join(samtools_cline), shell=True, stdout=bam_file, stdin=PIPE) | |
1376 | |
1377 do_sam = self.sam | |
1378 do_bam = self.bam | |
1379 do_taxonomy = self.taxonomy | |
1380 do_qual = self.qual | |
1381 do_hits = self.hits | |
1382 do_parse = do_taxonomy or do_qual or do_hits | |
1383 | |
1384 for i, line in enumerate(iter(bwa_process.stdout.readline, '')): | |
1385 | |
1386 if do_sam: | |
1387 sam_file.write(line) | |
1388 | |
1389 if do_bam: | |
1390 samtools_process.stdin.write(line) | |
1391 | |
1392 if do_parse: | |
1393 parsed_line = parser.parse(line) | |
1394 | |
1395 if do_taxonomy: | |
1396 taxonomy.count(parsed_line) | |
1397 | |
1398 if i > 0 and (i % 10000) == 0: | |
1399 print ''.join(taxonomy.get_summary(10)) | |
1400 | |
1401 if do_qual: | |
1402 quality.count(parsed_line) | |
1403 | |
1404 if do_hits: | |
1405 hits.count(parsed_line) | |
1406 | |
1407 if do_bam: | |
1408 samtools_process.stdin.close() | |
1409 | |
1410 if do_sam: | |
1411 sam_file.close() | |
1412 | |
1413 if do_hits: | |
1414 hits.write() | |
1415 | |
1416 if do_taxonomy: | |
1417 print ''.join(taxonomy.get_summary(10)) | |
1418 taxonomy.write() | |
1419 | |
1420 if do_qual: | |
1421 quality.write() | |
1422 | |
1423 | |
1424 if __name__ == "__main__": | |
1425 CLI.run() |