0
|
1 #!/usr/bin/env python
|
|
2 #from __future__ import print_function
|
|
3
|
|
4 import cProfile
|
|
5
|
|
6 import sys
|
|
7 import re
|
|
8
|
|
9 import tempfile
|
|
10 import subprocess
|
|
11 import shutil
|
|
12 import os
|
|
13 import os.path
|
|
14 import logging
|
|
15 import bz2
|
|
16 import zlib
|
|
17
|
|
18 import math
|
|
19 import string
|
|
20
|
|
21 from collections import defaultdict, Counter
|
|
22
|
|
23 from subprocess import PIPE
|
|
24
|
|
25 NON_ID = ''.join(c for c in map(chr, range(256)) if not c.isalnum())
|
|
26 NON_ID = NON_ID.replace('_', '').replace('-', '')
|
|
27
|
|
28 try:
|
|
29 from Bio.SeqRecord import SeqRecord
|
|
30 from Bio import SeqIO
|
|
31 from Bio.Seq import Seq
|
|
32 except ImportError:
|
|
33 message = 'This script requires the BioPython python package\n'
|
|
34 sys.stderr.write(message)
|
|
35 sys.exit(1)
|
|
36
|
|
37 try:
|
|
38 from plumbum import cli
|
|
39 except ImportError:
|
|
40 message = 'This script requires the plumbum python package\n'
|
|
41 sys.stderr.write(message)
|
|
42 sys.exit(1)
|
|
43
|
|
44
|
|
45 try:
|
|
46 import HTSeq
|
|
47 except ImportError:
|
|
48 message = 'This script requires the HTSeq python package\n'
|
|
49 sys.stderr.write(message)
|
|
50 sys.exit(1)
|
|
51
|
|
52 KHMER_AVAILABLE = True
|
|
53 try:
|
|
54 import khmer
|
|
55 except ImportError:
|
|
56 KHMER_AVAILABLE = False
|
|
57
|
|
58 #from io import BufferedRandom
|
|
59
|
|
60 logging.basicConfig(level=logging.INFO, format='%(message)s')
|
|
61
|
|
62
|
|
def profile_this(fn):
    """Decorator that profiles every call of *fn* with cProfile.

    After each call the collected statistics are dumped to
    ``<fn.__name__>.profile`` in the current working directory; the wrapped
    function's return value is passed through unchanged.
    """
    from functools import wraps

    @wraps(fn)  # preserve fn's name/docstring on the wrapper
    def profiled_fn(*args, **kwargs):
        # Profile a single invocation and persist the stats to disk.
        fpath = fn.__name__ + ".profile"
        prof = cProfile.Profile()
        ret = prof.runcall(fn, *args, **kwargs)
        prof.dump_stats(fpath)
        return ret
    return profiled_fn
|
|
71
|
|
72
|
|
class CLI(cli.Application):
    """RNA-Seq and DNA-Seq short read analysis by mapping to known reference sequences"""
    PROGNAME = "vmap"
    VERSION = "1.0.0"
    DESCRIPTION = """Virana vmap is an interface to the NCBI and ensembl reference databases that can
generate reference indexes for the short read mappers STAR (RNA-Seq) and
BWA-MEM (DNA-Seq). Short reads can be mapped to arbitrary combinations of
reference databases and the results can be summarized by taxonomic family
as well as stored as SAM file, unsorted BAM file, or as a HIT file that
models multimapping reads between specific reference databases."""
    USAGE = """The program has four modes that can be accessed by `vmap rnaindex`, `vmap dnaindex`, `vmap rnamap`, and `vmap dnamap.`"""

    def main(self, *args):
        """Top-level dispatcher; real work happens in the nested subcommands.

        Returns 1 (error) when an unknown command is given or no
        subcommand was selected at all.
        """

        if args:
            # Positional arguments that are not a registered subcommand
            print("Unknown command %r" % (args[0]))
            # Normalized from Python-2 statement form for consistency with
            # the parenthesized prints above (same output under Python 2).
            print(self.USAGE)
            return 1

        if not self.nested_command:
            print("No command given")
            print(self.USAGE)
            return 1
|
|
96
|
|
97
|
|
@CLI.subcommand("rnaindex")
class RNAIndex(cli.Application):
    """ Creates a STAR index from a FASTA genome reference """

    reference_files = cli.SwitchAttr(
        ['-r', '--reference_file'], str, list=True, mandatory=True,
        help="Sets the reference genome(s) FASTA file." +
        " Multiple occurrences of this parameter are allowed.")
    index_dir = cli.SwitchAttr(['-i', '--index_dir'], str, mandatory=True,
                               help="Sets the index output directory." +
                               " Directory will be generated if not existing." +
                               " Directory will be filled with several index files.")
    threads = cli.SwitchAttr(
        ['-t', '--threads'], cli.Range(1, 512), mandatory=False,
        help="Sets the number of threads to use",
        default=1)
    max_ram = cli.SwitchAttr(
        ['-m'], cli.Range(1, 400000000000), mandatory=False,
        help="Sets the maximum amount of memory (RAM) to use (in bytes)",
        default=400000000000)
    path = cli.SwitchAttr(['-p', '--path'], str, mandatory=False,
                          help="Path to STAR executable",
                          default='')
    sparse = cli.Flag(
        ["-s", "--sparse"], help="If given, a sparse index that requires less " +
        " RAM in the mapping phase will be constructed")

    debug = cli.Flag(["-d", "--debug"], help="Enable debug output")

    def main(self):
        """Builds a STAR genome index from the configured FASTA files.

        Exits with status 1 if any reference file is missing; creates the
        index directory on demand and runs `STAR --runMode genomeGenerate`.
        """

        if self.debug:
            logging.getLogger().setLevel(logging.DEBUG)

        # Obtain star executable (explicit path wins, else rely on $PATH)
        star = [self.path or 'STAR']

        # Validate all reference FASTA files before doing any work.
        # BUGFIX: message said "nor existing", lacked a newline, and went
        # to stdout; errors now go to stderr.
        for reference_file in self.reference_files:
            if not os.path.exists(reference_file):
                sys.stderr.write(
                    'Reference file %s not existing, exiting\n' % reference_file)
                sys.exit(1)

        # Create the index output directory if it does not exist yet
        if not os.path.exists(self.index_dir):
            logging.debug(
                'Making output directory for index at %s' % self.index_dir)
            os.makedirs(self.index_dir)

        # Assemble the STAR command line for index generation
        cline = star + ['--runMode', 'genomeGenerate',
                        '--genomeDir', self.index_dir,
                        '--limitGenomeGenerateRAM', str(self.max_ram),
                        '--runThreadN', str(self.threads),
                        '--genomeFastaFiles'] + self.reference_files

        # Sparse suffix arrays trade mapping speed for a smaller memory
        # footprint during the mapping phase
        if self.sparse:
            cline += ['--genomeSAsparseD', '2',
                      '--genomeChrBinNbits', '12',
                      '--genomeSAindexNbases', '13']
        else:
            cline += ['--genomeSAsparseD', '1',
                      '--genomeChrBinNbits', '18',
                      '--genomeSAindexNbases', '15']

        if self.debug:
            print(' '.join(cline))

        # Run STAR. Passing the argument list directly (no shell, no
        # ' '.join) keeps paths containing spaces or shell metacharacters
        # intact and avoids shell injection.
        star_process = subprocess.Popen(cline, stdout=PIPE, stderr=PIPE)

        # Block until streams are closed by the process
        stdout, stderr = star_process.communicate()

        if stderr:
            sys.stderr.write(stderr)

        if self.debug and stdout:
            print(stdout)
|
|
186
|
|
187
|
|
@CLI.subcommand("dnaindex")
class DNAIndex(cli.Application):
    """ Creates a BWA index from a FASTA reference file """

    reference_file = cli.SwitchAttr(['-r', '--reference_file'], str, mandatory=True,
                                    help="Sets the input reference FASTA file.")
    index_dir = cli.SwitchAttr(['-i', '--index_dir'], str, mandatory=True,
                               help="Sets the index output directory." +
                               " Directory will be generated if not existing." +
                               " Directory will be filled with several index files.")
    path = cli.SwitchAttr(['-p', '--path'], str, mandatory=False,
                          help="Path to BWA executable",
                          default='')
    debug = cli.Flag(["-d", "--debug"], help="Enable debug output")

    def main(self):
        """Builds a BWA (bwtsw) index from the configured FASTA file."""

        # BUGFIX: this check previously lived in the class body, where
        # `debug` was still the (always truthy) Flag descriptor, so DEBUG
        # logging was enabled unconditionally at import time. It is now
        # evaluated per invocation against the parsed flag value, matching
        # the RNAIndex subcommand.
        if self.debug:
            logging.getLogger().setLevel(logging.DEBUG)

        # Obtain bwa executable (explicit path wins, else rely on $PATH)
        bwa = [self.path or 'bwa']

        # Validate the reference FASTA file.
        # BUGFIX: message said "nor existing", lacked a newline, and went
        # to stdout; errors now go to stderr.
        if not os.path.exists(self.reference_file):
            sys.stderr.write('Genome file %s not existing, exiting\n' % self.reference_file)
            sys.exit(1)

        # Create the index output directory if it does not exist yet
        if not os.path.exists(self.index_dir):
            logging.debug('Making output directory %s' % self.index_dir)
            os.makedirs(self.index_dir)

        # Assemble the BWA command line; 'bwtsw' supports large references
        cline = bwa + ['index', '-a', 'bwtsw', '-p',
                       os.path.join(self.index_dir, 'index'), self.reference_file]

        if self.debug:
            print(' '.join(cline))

        # Run BWA without an intermediate shell so paths with spaces
        # survive and shell injection is impossible
        bwa_process = subprocess.Popen(cline, stdout=PIPE, stderr=PIPE)
        stdout, stderr = bwa_process.communicate()

        if stderr:
            sys.stderr.write(stderr)

        if self.debug and stdout:
            print(stdout)
|
|
236
|
|
237
|
|
238
|
|
239
|
|
240
|
|
class SAMHits:
    """ Converts SAM output of mappers into bzipped HIT files.

    Alignments belonging to the same read (including multimappers) are
    buffered in `current_group` and flushed as a single FASTA record whose
    description field encodes every reference hit that passed the filters.
    Input lines are expected in the tuple format produced by SAMParser and
    grouped by read name (i.e. a name-sorted or mapper-ordered stream).
    """

    def __init__(self, output_file, sample_id, refseq_filter=None, min_mapping_score=None,\
                 min_alignment_score=None, max_mismatches=None,\
                 max_relative_mismatches=None, min_continiously_matching=None,\
                 filter_complexity=False):
        # output_file:   path of the bzip2-compressed HIT file to create
        # sample_id:     identifier prepended to each read name; characters
        #                outside [A-Za-z0-9_-] are stripped (Python 2
        #                str.translate deletion form with NON_ID)
        # refseq_filter: if set, only read groups with at least one hit in
        #                one of these reference groups are written
        # remaining parameters: per-alignment quality filters (None/False
        #                disables the respective filter)

        self.output_file = bz2.BZ2File(output_file, 'wb', buffering=100 * 1024 * 1024)
        self.sample_id = sample_id.translate(None, NON_ID)
        self.refseq_filter = refseq_filter
        self.max_mismatches = max_mismatches
        self.max_relative_mismatches = max_relative_mismatches
        # Buffer for the read currently being collected:
        # [read_name, sequence, list of hit records]
        self.current_group = []
        self.min_mapping_score = min_mapping_score
        self.min_alignment_score = min_alignment_score
        self.min_continiously_matching = min_continiously_matching
        self.filter_complexity = filter_complexity

        # Precompiled CIGAR helpers (match and deletion runs); kept for
        # external/legacy use — not referenced elsewhere in this class
        self.re_matches = re.compile(r'(\d+)M')
        self.re_dels = re.compile(r'(\d+)D')

    def count(self, parsed_line):
        """Register one parsed SAM line, applying all configured filters.

        Flushes the previous read group to the output file whenever the
        read name changes. Unmapped and filtered-out alignments are
        silently dropped.
        """

        if parsed_line is None:
            return

        read_key, read_name, flag, ref_name, ref_position, mapping_score,\
            cigar, mate_ref_name, mate_ref_position, insert_size, seq, qual,\
            is_end1, is_end2, number_mismatches, alignment_score,\
            number_hits, is_reverse, is_primary, is_mapped, is_mate_mapped,\
            is_paired, number_matches, read_end_pos, max_match = parsed_line

        if not is_mapped:
            return

        if self.filter_complexity:

            # Low-complexity reads compress well: discard reads whose
            # zlib-compressed size is under half the raw length
            avg_compression = float(len(zlib.compress(seq)))/len(seq)

            if avg_compression < 0.5:
                return

        # Suffix the read name with the mate number so both ends of a
        # pair stay distinguishable after grouping
        pair_id = ''
        if is_end1:
            pair_id = '/1'

        elif is_end2:
            pair_id = '/2'

        read_name = self.sample_id + ';' + read_name + pair_id

        # Initialize new current group
        if len(self.current_group) == 0:
            self.current_group = [read_name, seq, []]

        # Write old current group to file
        if read_name != self.current_group[0]:
            self._write_group()
            self.current_group = [read_name, seq, []]

        # Reference names are expected to be formatted as
        # 'refseq_group;family;organism;identifier;...'
        try:
            refseq_group, family, organism, identifier = ref_name.split(';')[:4]
        except ValueError:
            sys.stderr.write('Read mapped to malformed reference sequence %s, skipping\n' % ref_name)
            return

        # Per-alignment quality filters; each drops the alignment (not the
        # whole read group) when violated
        if self.min_continiously_matching:

            if self.min_continiously_matching > max_match:
                return

        if self.max_mismatches\
                and int(number_mismatches) > self.max_mismatches:
            return

        if self.max_relative_mismatches\
                and int(number_mismatches) / float(len(seq))\
                > self.max_relative_mismatches:
            return

        if self.min_mapping_score\
                and self.min_mapping_score > mapping_score:
            return

        if self.min_alignment_score\
                and self.min_alignment_score > alignment_score:
            return

        # Convert 0-based SAM parser position to a 1-based start coordinate
        start = int(ref_position) + 1

        self.current_group[2].append([refseq_group, family, organism, identifier, str(start), str(read_end_pos)])

    def _write_group(self):
        """Flush the buffered read group as one FASTA record.

        When a refseq_filter is configured, the group is only written if
        at least one of its hits belongs to a requested reference group.
        """
        passed = True

        if self.refseq_filter:
            passed = False
            for refseq_group, family, organism, identifier, start, end in self.current_group[2]:
                if passed:
                    break
                for f in self.refseq_filter:
                    if refseq_group == f:
                        passed = True
                        break
        if passed:
            # Encode all hits in the description: fields of one hit joined
            # by ';', hits separated by '|'
            description = []
            for identifier in self.current_group[2]:
                description.append(';'.join(identifier))
            description = '|'.join(description)

            record = SeqRecord(Seq(self.current_group[1]))
            record.id = 'Read;' + self.current_group[0]
            record.description = description

            SeqIO.write([record], self.output_file, "fasta")

    def write(self):
        """Flush the final read group and close the output file."""

        self._write_group()
        self.output_file.close()
|
|
381
|
|
class SAMParser:
    """ Parses single SAM alignment lines into a flat field tuple. """

    def parse(self, line):
        """Parse one SAM line via HTSeq.

        Returns None for header lines (starting with '@'); otherwise a
        25-element tuple of read/alignment attributes consumed by the
        SAMHits, SAMQuality, and SAMTaxonomy accumulators.
        """

        # Header lines carry no alignment information
        if line[0] == '@':
            return None

        alignment = HTSeq._HTSeq.SAM_Alignment.from_SAM_line(line)
        read_name = alignment.read.name
        seq = alignment.read.seq
        qual = alignment.read.qual
        flag = alignment.flag
        cigar = None

        # Decode the SAM bitwise FLAG field
        is_paired = (flag & 1)
        is_mapped = not (flag & 4)
        is_mate_mapped = alignment.mate_aligned is not None  # not (flag & 8)
        is_reverse = (flag & 16)
        is_end1 = (flag & 64)
        is_end2 = (flag & 128)
        is_primary = not (flag & 256)

        # Identifies one sequenced segment (read end) across its multimaps
        read_key = (read_name, is_end1)

        ref_name = None
        ref_position = None
        mapping_score = 0
        mate_ref_name = None
        mate_ref_position = None
        insert_size = None
        alignment_score = 0
        read_end_pos = None

        if is_mate_mapped and alignment.mate_start:
            mate_ref_name = alignment.mate_start.chrom
            mate_ref_position = alignment.mate_start.start

        number_hits = 0
        alignment_score = 0
        number_mismatches = 0

        number_matches = 0
        max_match = 0

        if is_mapped:

            ref_name = alignment.iv.chrom
            ref_position = alignment.iv.start
            read_end_pos = alignment.iv.end
            alignment_score = alignment.aQual
            cigar = alignment.cigar

            if is_mate_mapped:
                insert_size = alignment.inferred_insert_size

            # Total matched bases and the longest contiguous 'M' run
            for c in cigar:
                if c.type == 'M':
                    number_matches += c.size
                    max_match = max(max_match, c.size)

            # BUGFIX: NM and NH were swapped here. Per the SAM
            # specification, NM is the edit distance to the reference
            # (mismatches) and NH is the number of reported alignments
            # (hits) for the read; downstream consumers rely on these
            # semantics (e.g. max_mismatches filtering in SAMHits).
            for tag, value in alignment.optional_fields:
                if tag == 'NM':
                    number_mismatches = value
                elif tag == 'AS':
                    alignment_score = value
                elif tag == 'NH':
                    number_hits = value

        return read_key, read_name, flag, ref_name, ref_position, mapping_score,\
            cigar, mate_ref_name, mate_ref_position, insert_size, seq, qual,\
            is_end1, is_end2, number_mismatches, alignment_score,\
            number_hits, is_reverse, is_primary, is_mapped, is_mate_mapped,\
            is_paired, number_matches, read_end_pos, max_match
|
|
455
|
|
456
|
|
class SAMQuality:
    """ Accumulates per-segment and per-read quality statistics from
    parsed SAM lines and writes a tab-separated metrics summary file. """

    def __init__(self, file_path):

        # Destination path of the metrics summary written by write()
        self.file_path = file_path

        # (item, metric) -> Counter of observed values
        self.stored = defaultdict(Counter)
        # reference name -> number of alignments (all / primary only)
        self.all_references = defaultdict(int)
        self.primary_references = defaultdict(int)
        # Translation table for reverse complementing (Python 2 API)
        self.complement = string.maketrans('ATCGN', 'TAGCN')

        # Optional 10-mer counting table when khmer is installed
        if KHMER_AVAILABLE:
            self.ktable = khmer.new_ktable(10)

    def _get_complement(self, sequence):
        """Return the reverse complement of *sequence*."""

        return sequence.translate(self.complement)[::-1]

    def _get_summary(self, counter):
        """Return (count, sum, min, max, mode, mean, std) for a
        value -> frequency counter."""

        maximum = 0
        minimum = sys.maxint
        thesum = 0
        allcount = 0
        mode = [0, None]

        items = 0.0
        mean = 0.0
        m2 = 0.0
        variance = 0.0

        for item in counter:

            count = counter[item]
            if count > mode[0]:
                mode = [count, item]

            allcount += count
            maximum = max(maximum, item)
            minimum = min(minimum, item)
            thesum += (count * item)

            # Welford online mean/variance update, applied once per
            # observation of this item (count repetitions)
            x = 1
            while x <= count:
                items += 1
                delta = item - mean
                mean = mean + delta / items
                m2 = m2 + delta * (item - mean)
                variance = m2 / items
                x += 1

        std = math.sqrt(variance)

        return allcount, thesum, minimum, maximum, mode[1], mean, std

    def _to_unit(self, item, is_percentage=False):
        """ Convert a numeric to a string with metric units """

        if is_percentage:
            return ('%-.3f' % (item * 100)) + '%'
        converted = None
        try:
            item = float(item)
            if item > 10**12:
                # BUGFIX: this branch previously divided by 10**9 but
                # labelled the result 'P', inflating tera-scale values a
                # thousandfold; use the tera divisor and suffix.
                converted = str(round(item / 10**12, 3)) + 'T'
            elif item > 10**9:
                converted = str(round(item / 10**9, 3)) + 'G'
            elif item > 10**6:
                converted = str(round(item / 10**6, 3)) + 'M'
            elif item > 10**3:
                converted = str(round(item / 10**3, 3)) + 'K'
            else:
                converted = str(round(item, 3))
        except (TypeError, ValueError):
            # Non-numeric items (reference names, kmers, ...) pass through
            converted = str(item)

        return converted

    def _str_metrics(self, data):
        """Format all stored metrics as report lines."""

        str_metrics = []

        for (item, metric) in sorted(data.keys()):
            counter = data[(item, metric)]
            # Counters over non-numeric values (no 'real' attribute) are
            # listed per element; numeric counters are condensed into
            # summary statistics
            if not hasattr(counter.iterkeys().next(), 'real'):
                for element, count in counter.most_common():
                    str_metrics.append(self._str_metric(item, metric, element, count, no_units=True))
            else:
                summary = self._get_summary(counter)
                str_metrics.append(self._str_metric(item, metric, *summary))

        return str_metrics

    def _str_metric(self, item, metric, count, thesum='', minimum='',\
                    maximum='', mode='', mean='', std='', no_units=False):
        """Format one metrics row, optionally converting to metric units."""

        counters = [count, thesum, minimum, maximum, mode, mean, std]
        counters = map(str, counters)

        if no_units:
            items = [item, metric] + counters
        else:
            units = map(self._to_unit, counters)
            items = [item, metric] + units

        return '%-15s\t%-60s\t%12s\t%12s\t%12s\t%12s\t%12s\t%12s\t%12s\n' \
            % tuple(items)

    def _count_read(self, metric, data, sample):
        """Accumulate read-level statistics under *metric*."""

        item = 'read'

        (insert_size, alignment_score, mapping_score, length,\
         q20_length, avg_phred_quality, number_hits, is_reverse) = data

        self.stored[(item, metric + ' mappings')][number_hits] += sample
        self.stored[(item, metric + ' insert')][insert_size] += sample

    def _count_segment(self, metric, data, sample):
        """Accumulate segment-level statistics under *metric*."""

        item = 'segment'

        (insert_size, alignment_score, mapping_score, length,\
         q20_length, avg_phred_quality, number_hits, is_reverse) = data

        self.stored[(item, metric + ' algq')][alignment_score] += sample
        self.stored[(item, metric + ' mapq')][mapping_score] += sample
        self.stored[(item, metric + ' length')][length] += sample
        self.stored[(item, metric + ' q20length')][q20_length] += sample
        self.stored[(item, metric + ' meanbasequal')][avg_phred_quality] += sample
        self.stored[(item, metric + ' reverse')][is_reverse] += sample

    def count(self, parsed_line):
        """Update all statistics from one parsed SAM line."""

        if parsed_line is None:
            return

        read_key, read_name, flag, ref_name, ref_position, mapping_score,\
            cigar, mate_ref_name, mate_ref_position, insert_size, seq, qual,\
            is_end1, is_end2, number_mismatches, alignment_score,\
            number_hits, is_reverse, is_primary, is_mapped, is_mate_mapped,\
            is_paired, number_matches, read_end_pos, max_match = parsed_line

        # NOTE(review): assumes qual entries still carry the +33 ASCII
        # offset — confirm against the HTSeq read.qual representation
        phred_quality = [q - 33 for q in qual]
        avg_phred_quality = sum(phred_quality) / float(len(phred_quality))
        length = len(seq)
        mate_reference_id = mate_ref_name
        reference_id = ref_name
        reference = reference_id is not None and reference_id != '*'
        insert_size = insert_size and abs(insert_size) or insert_size
        is_segment1 = not is_paired or (is_paired and is_end1)
        is_reverse = is_reverse
        is_unique = is_primary and number_hits == 1
        # Mates mapped to different references count as translocations
        is_translocation = is_paired and is_mapped and is_mate_mapped\
            and (mate_reference_id != '=' and reference_id != mate_reference_id)
        is_part_of_doublemap = is_paired and is_mapped and is_mate_mapped
        is_part_of_halfmap = is_paired and (is_mapped != is_mate_mapped)
        is_part_of_nomap = is_paired and not is_mapped and not is_mate_mapped

        # Count length until first low quality base call
        q20_length = 0
        for q in phred_quality:
            if q < 20:
                break
            q20_length += 1

        # Count kmers (always in forward orientation)
        if KHMER_AVAILABLE:
            if not is_reverse:
                self.ktable.consume(seq)
            else:
                self.ktable.consume(self._get_complement(seq))

        if reference:

            self.all_references[reference_id] += 1

            if is_primary:
                self.primary_references[reference_id] += 1

        data = (insert_size, alignment_score, mapping_score, length,\
                q20_length, avg_phred_quality, number_hits, is_reverse)

        sample = 1

        # Hierarchical tallies: every segment, then mapped/paired subsets
        self._count_segment('sequenced', data, sample)
        if is_mapped:
            self._count_segment('sequenced mapped multi', data, sample)
            if is_primary:
                self._count_segment('sequenced mapped primary', data, sample)
                if number_hits and is_unique:
                    self._count_segment('sequenced mapped primary unique', data, sample)

            if is_segment1:
                self._count_read('sequenced mapped multi', data, sample)
                if is_primary:
                    self._count_read('sequenced mapped primary', data, sample)

        if is_paired:
            self._count_segment('sequenced paired', data, sample)

            if is_part_of_doublemap:
                self._count_segment('sequenced paired doublemap', data, sample)

                if is_primary:
                    self._count_segment('sequenced paired doublemap primary', data, sample)

                if is_segment1:
                    self._count_read('sequenced paired doublemap multi', data, sample)

                    if is_primary:
                        self._count_read('sequenced paired doublemap primary', data, sample)

                        if number_hits and is_unique:
                            self._count_read('sequenced paired doublemap primary unique', data, sample)

                            if is_translocation:
                                self._count_read('sequenced paired doublemap primary unique translocation', data, sample)

            elif is_part_of_halfmap:

                self._count_segment('sequenced paired halfmap', data, sample)

                # The mapped segment
                if is_mapped:

                    self._count_segment('sequenced paired halfmap mapped', data, sample)

                    if is_primary:

                        self._count_read('sequenced paired halfmap mapped primary', data, sample)

                        if number_hits and is_unique:
                            self._count_read('sequenced paired halfmap mapped primary unique', data, sample)

                    elif not is_primary:

                        self._count_read('sequenced unpaired mapped multi', data, sample)

                # The unmapped segment
                elif not is_mapped:
                    self._count_segment('sequenced paired halfmap unmapped', data, sample)

            elif is_part_of_nomap:

                self._count_segment('sequenced paired nomap', data, sample)

                if is_segment1:

                    self._count_read('sequenced paired nomap', data, sample)

        elif not is_paired:

            self._count_segment('sequenced unpaired', data, sample)

            if is_mapped:

                self._count_segment('sequenced unpaired mapped', data, sample)

                if is_primary:

                    self._count_read('sequenced unpaired mapped primary', data, sample)

                    if number_hits and is_unique:

                        self._count_read('sequenced paired unpaired mapped primary unique', data, sample)

                elif not is_primary:

                    self._count_read('sequenced unpaired mapped multi', data, sample)

            elif not is_mapped:

                self._count_segment('sequenced unpaired unmapped', data, sample)

                if is_segment1:
                    self._count_read('sequenced unpaired unmapped', data, sample)

    def write(self):
        """Write all accumulated metrics to the summary file."""

        with open(self.file_path, 'w') as output_file:

            # Record the 30 most frequently hit references, for all
            # alignments and for primary alignments separately
            all_references = sorted([(count, reference) for reference, count\
                                     in self.all_references.iteritems()], reverse=True)

            for j, (count, reference) in enumerate(all_references[:30]):
                self.stored[('segment', 'multireference_' + str(j+1))][reference] = count

            primary_references = sorted([(count, reference) for reference, count\
                                         in self.primary_references.iteritems()], reverse=True)

            for j, (count, reference) in enumerate(primary_references[:30]):
                self.stored[('segment', 'primaryreference_' + str(j+1))][reference] = count

            # Extract top-ranking kmers
            if KHMER_AVAILABLE:
                kmer_frequencies = []
                for i in range(0, self.ktable.n_entries()):
                    n = self.ktable.get(i)
                    if n > 0:
                        kmer_frequencies.append((n, self.ktable.reverse_hash(i)))
                kmer_frequencies = sorted(kmer_frequencies, reverse=True)
                for j, (frequency, kmer) in enumerate(kmer_frequencies[:10]):
                    self.stored[('segment', 'kmer_' + str(j+1))][kmer] = frequency

            output_file.writelines(self._str_metrics(self.stored))
|
|
773
|
|
774
|
|
class SAMTaxonomy:
    """ Provides taxonomic summary information from a SAM file stream.

    Reference names are expected to encode
    'refseq_group;family;organism;gi;...'; per-organism statistics are
    accumulated across all mapped alignments and summarized by write().
    """

    def __init__(self, file_path):

        # Path of the summary file written by write()
        self.file_path = file_path

        # organism -> number of primary alignments
        self.count_primaries = Counter()
        # organism -> [refseq_group, family, set_of_gis,
        #              avg_mapping_score, avg_seq_length, avg_number_hits,
        #              avg_alignment_score, avg_nr_mismatches,
        #              human_primary_hits, human_secondary_hits]
        # (the avg_* entries are [sum, count] pairs, averaged on output)
        self.detailed_information = {}

        # Per-read state used to attribute co-occurring human hits to the
        # other organisms the same read mapped to
        self._last_read = (None, None)
        self._last_read_human_prim = 0
        self._last_read_human_sec = 0
        self._last_organisms = set()

    def count(self, parsed_line):
        """Accumulate taxonomy statistics from one parsed SAM line."""

        if parsed_line is None:
            return

        read_key, read_name, flag, ref_name, ref_position, mapping_score,\
            cigar, mate_ref_name, mate_ref_position, insert_size, seq, qual,\
            is_end1, is_end2, number_mismatches, alignment_score,\
            number_hits, is_reverse, is_primary, is_mapped, is_mate_mapped,\
            is_paired, number_matches, read_end_pos, max_match = parsed_line

        if is_mapped:

            # Decompose the structured reference name
            refseq_group, family, organism, gi = ref_name.split(';')[:4]

            if is_primary:
                self.count_primaries[organism] += 1

            if organism not in self.detailed_information:
                # refseq_group. family, gis, avg_mapping_score,
                # avg_seq_length, avg_number_hits, avg_alignment_score, avg_nr_mismatches
                initial = [refseq_group,
                           family,
                           set([gi]),
                           [int(mapping_score), 1],
                           [len(seq), 1],
                           [0, 0],
                           [alignment_score, 1],
                           [number_mismatches, 1],
                           0,
                           0]

                self.detailed_information[organism] = initial

            else:
                # Fold this alignment into the running [sum, count] pairs
                entry = self.detailed_information[organism]
                entry[2].add(gi)
                entry[3][0] += int(mapping_score)
                entry[3][1] += 1
                entry[4][0] += len(seq)
                entry[4][1] += 1
                entry[6][0] += alignment_score
                entry[6][1] += 1
                entry[7][0] += number_mismatches
                entry[7][1] += 1

            # Hit counts (NH) are only meaningful on primary alignments
            if is_primary:
                entry = self.detailed_information[organism]
                entry[5][0] += number_hits
                entry[5][1] += 1

            if self._last_read == (None, None):
                self._last_read = read_key

            if self._last_read != read_key:
                # A new read begins: attribute the finished read's human
                # hit counts to every organism that read also mapped to
                for last_organism in self._last_organisms:

                    self.detailed_information[last_organism][8]\
                        += self._last_read_human_prim

                    self.detailed_information[last_organism][9]\
                        += self._last_read_human_sec

                self._last_read = read_key
                self._last_organisms = set()
                self._last_read_human_prim = 0
                self._last_read_human_sec = 0

            self._last_organisms.add(organism)

            if organism == 'Homo_sapiens':
                if is_primary:
                    self._last_read_human_prim += 1
                else:
                    self._last_read_human_sec += 1

    def get_summary(self, top=100):
        """Return formatted report lines for the *top* organisms ranked by
        primary alignment count."""

        lines = []

        # Header row for the taxonomy table
        lines.append('%10s\t%20s\t%20s\t%-20s\t%10s\t%10s\t%10s\t%5s\t%5s\t%5s\t%10s\t%10s\n'\
            % ('Count', 'Group', 'Family', 'Organism', 'Targets', 'ReadLen', 'Hits', 'Map', 'Algn', 'Mism', 'HuP', 'HuS'))

        top_organisms = self.count_primaries.most_common(top)

        for organism, count in top_organisms:

            refseq_group, family, identifiers,\
                avg_mapping_score, avg_seq_length, avg_number_hits,\
                avg_alignment_score, avg_nr_mismatches, human_prim, human_sec\
                = self.detailed_information[organism]

            # Collapse the [sum, count] pairs into integer averages
            avg_len = int(avg_seq_length[0] / float(avg_seq_length[1]))
            if avg_number_hits[1] == 0:
                avg_hits = 0
            else:
                avg_hits = int(avg_number_hits[
                    0] / float(avg_number_hits[1]))

            avg_mapping_score = int(avg_mapping_score[
                0] / float(avg_mapping_score[1]))

            avg_alignment_score = int(avg_alignment_score[
                0] / float(avg_alignment_score[1]))

            avg_nr_mismatches = int(avg_nr_mismatches[
                0] / float(avg_nr_mismatches[1]))

            nr_ids = len(identifiers)

            # Abbreviate very large counts with an 'M' (million) suffix
            if count > 10**6:
                count = str(round(count / float(10**6), 3)) + 'M'
            if human_prim > 10**6:
                human_prim = str(round(human_prim / float(10**6), 3)) + 'M'
            if human_sec > 10**6:
                human_sec = str(round(human_sec / float(10**6), 3)) + 'M'
            if nr_ids > 10**6:
                nr_ids = str(round(nr_ids / float(10**6), 3)) + 'M'

            lines.append('%10s\t%20s\t%20s\t%-20s\t%10s\t%10i\t%10i\t%5i\t%5i\t%5i\t%10s\t%10s\n'\
                % (str(count), refseq_group[:20], family[:20], organism[:20],\
                   str(nr_ids), avg_len, avg_hits, avg_mapping_score,\
                   avg_alignment_score, avg_nr_mismatches, str(human_prim),\
                   str(human_sec)))

        return lines

    def write(self):
        """Write the taxonomy summary to `file_path`."""

        with open(self.file_path, 'w') as output_file:
            output_file.writelines(self.get_summary())
|
|
922
|
|
@CLI.subcommand("rnamap")
class RNAmap(cli.Application):
    """ Map input reads against a STAR index.

    Streams STAR's SAM output line-by-line and fans it out to the requested
    sinks: a SAM file, a BAM file (via a samtools child process), and the
    SAMTaxonomy / SAMQuality / SAMHits accumulators defined elsewhere in
    this module.
    """

    index_dir = cli.SwitchAttr(['-i', '--index_dir'], str, mandatory=True,
                               help="Sets the index output directory")

    threads = cli.SwitchAttr(
        ['-t', '--threads'], cli.Range(1, 512), mandatory=False,
        help="Sets the number of threads to use",
        default=1)

    taxonomy = cli.SwitchAttr(
        ['-x', '--taxonomy'], str, mandatory=False,
        help="Output path for the taxonomy file; setting this option will also enable regular taxonomy output to stdout during mapping",
        default='')

    star_path = cli.SwitchAttr(['--star_path'], str, mandatory=False,
                               help="Path to STAR executable",
                               default='')

    samtools_path = cli.SwitchAttr(['--samtools_path'], str, mandatory=False,
                                   help="Path to samtools executable",
                                   default='')

    temp_path = cli.SwitchAttr(['--temporary_path'], str, mandatory=False,
                               help="Path to temporary directory in which to generate temp files. All temp files with be automatically deleted after execution is complete.",
                               default='')

    min_mapping_score = cli.SwitchAttr(['--min_mapping_score'], cli.Range(1, 255), mandatory=False,
                                       help="Mimimum mapping score for saved hits (only applied to -v/--virana_hits)",
                                       default=None)

    min_alignment_score = cli.SwitchAttr(['--min_alignment_score'], cli.Range(1, 255), mandatory=False,
                                         help="Mimimum alignment score for saved hits (only applied to -v/--virana_hits)",
                                         default=None)

    max_mismatches = cli.SwitchAttr(['--max_mismatches'], cli.Range(0, 10000000), mandatory=False,
                                    help="Maximum number of mismatches for saved hits (only applied to -v/--virana_hits)",
                                    default=None)

    max_relative_mismatches = cli.SwitchAttr(['--max_relative_mismatches'], float, mandatory=False,
                                             help="Maximum number of mismatches relative to read length for saved hits (only applied to -v/--virana_hits)",
                                             default=None)

    min_continiously_matching = cli.SwitchAttr(['--min_continiously_matching'], cli.Range(0, 10000000), mandatory=False,
                                               help="Minimum number of continious matches for saved hits (only applied to -v/--virana_hits)",
                                               default=None)

    filter_complexity = cli.Flag(['--filter_complexity'],
                                 help="Discard low-complexity reads (only applied to -v/--virana_hits). Adds some extra processing load to the mapping and may discard important information. Applies to all output files, including quality files (!)",
                                 default=False)

    bam = cli.SwitchAttr(['-b', '--bam'], str, mandatory=False,
                         help="Path to unsorted, unindexed output BAM file",
                         default='')

    sam = cli.SwitchAttr(['-s', '--sam'], str, mandatory=False,
                         help="Path to output SAM file",
                         default='')

    qual = cli.SwitchAttr(['-q', '--qual'], str, mandatory=False,
                          help="Path to output quality file",
                          default='')

    hits = cli.SwitchAttr(['-v', '--virana_hits'], str, mandatory=False,
                          help="Path to bzip2-compressed tab-delimited output hit file",
                          default='')

    sample_id = cli.SwitchAttr(['--sample_id'], str, mandatory=False,
                               help="Alphanumeric string ([0-9a-zA-Z_-]*) used to designate sample information within the hit file",
                               default='no_sample_id')

    unmapped1 = cli.SwitchAttr(['--unmapped_end_1'], str, mandatory=False,
                               help="Output path to uncompressed fastq file containing unmapped reads, first ends only for paired ends.",
                               default='')

    unmapped2 = cli.SwitchAttr(['--unmapped_end_2'], str, mandatory=False,
                               help="Output path to uncompressed fastq file containing unmapped reads, second ends only for paired ends.",
                               default='')

    splice_junctions = cli.SwitchAttr(['--splice_junctions'], str, mandatory=False,
                                      help="Input path to splice junction file (currently not implemented)",
                                      default='')

    chimeric_mappings = cli.SwitchAttr(['--chimeric_mappings'], str, mandatory=False,
                                       help="Ouput path to SAM file containing chimeric mappings",
                                       default='')

    hit_filter = cli.SwitchAttr(
        ['-f', '--virana_hit_filter'], str, list=True, mandatory=False,
        help="Only generate hit groups that include at last one read mapping to a reference of this reference group.",
        default=[])

    debug = cli.Flag(["-d", "--debug"], help="Enable debug information")

    reads = cli.SwitchAttr(
        ['-r', '--reads'], str, list=True, mandatory=True,
        help="Sets the input reads. Add this parameter twice for paired end reads.")

    zipped = cli.Flag(["-z", "--zipped"], help="Input reads are zipped")

    sensitive = cli.Flag(
        ["--sensitive"], help="If given, mapping will process slower and more sensitive")

    def main(self):
        """Assemble the STAR command line, run it, and stream its SAM output
        into the configured output files and report accumulators."""

        if self.debug:
            logging.getLogger().setLevel(logging.DEBUG)

        # Obtain star/samtools executables; fall back to names on $PATH
        star = [self.star_path and self.star_path or 'STAR']
        samtools = [self.samtools_path and self.samtools_path or 'samtools']

        # Check if genome directory is existing
        if not os.path.exists(self.index_dir):
            # BUG FIX: formerly interpolated nonexistent self.genome_dir
            # (AttributeError) and wrote the error to stdout without newline.
            sys.stderr.write('Index directory %s not existing, exiting\n'
                             % self.index_dir)
            sys.exit(1)

        # All intermediate STAR output lives in a throwaway directory that is
        # removed at the end of main()
        if self.temp_path:
            temp_path = tempfile.mkdtemp(dir=self.temp_path)
        else:
            temp_path = tempfile.mkdtemp()

        first_ends = []
        second_ends = []
        single_ends = []

        if len(self.reads) == 2:
            first, second = self.reads
            first_ends.append(first)
            second_ends.append(second)

        elif len(self.reads) == 1:
            single_ends.append(self.reads[0])

        else:
            sys.stderr.write('Invalid number of fastq files; provide either one (single end) or two (paired end)\n')
            sys.exit(1)

        # STAR takes comma-separated file lists, one list per mate
        if single_ends and not first_ends and not second_ends:
            reads = [','.join(single_ends)]

        elif first_ends and second_ends:
            reads = [','.join(first_ends), ','.join(second_ends)]

        star_cline = star + ['--runMode', 'alignReads',
                             '--genomeDir', self.index_dir,
                             '--runThreadN', str(self.threads),
                             '--readMatesLengthsIn', 'NotEqual',
                             '--outFileNamePrefix', os.path.join(
                                 temp_path, 'out'),
                             '--outSAMmode', 'Full',
                             '--outSAMstrandField', 'None',
                             '--outSAMattributes', 'Standard',
                             '--outSAMunmapped', 'Within',
                             '--outStd', 'SAM',
                             '--outFilterMultimapNmax', '1000']

        if self.unmapped1 or self.unmapped2:
            star_cline += ['--outReadsUnmapped', 'Fastx']
        else:
            star_cline += ['--outReadsUnmapped', 'None']

        if self.zipped:
            star_cline += ['--readFilesCommand', 'zcat']

        # More permissive mapping thresholds at the cost of speed
        if self.sensitive:
            star_cline += ['--outFilterMultimapScoreRange', '10',
                           '--outFilterMismatchNmax', '60',
                           '--outFilterMismatchNoverLmax', '0.3',
                           '--outFilterScoreMin', '0',
                           '--outFilterScoreMinOverLread', '0.3',
                           '--outFilterMatchNmin', '0',
                           '--outFilterMatchNminOverLread', '0.66',
                           '--seedSearchStartLmax', '12',
                           '--winAnchorMultimapNmax', '50']

        star_cline += ['--readFilesIn'] + reads

        if self.debug:
            sys.stdout.write(' '.join(star_cline) + '\n')

        # Try if we can make the relevant files before spending time mapping
        touch_files = [self.unmapped1, self.unmapped2, self.taxonomy,
                       self.qual, self.hits, self.sam, self.bam]
        for file_path in touch_files:
            if file_path is None or file_path == '':
                continue
            try:
                # BUG FIX: used the removed `file` builtin; `open` is the
                # portable equivalent
                with open(file_path, 'a'):
                    os.utime(file_path, None)
            except IOError:
                sys.stderr.write('Could not write output file %s\n' % file_path)
                sys.exit(1)

        star_process = subprocess.Popen(' '.join(
            star_cline), shell=True, stdout=PIPE)

        parser = SAMParser()

        if self.taxonomy:
            taxonomy = SAMTaxonomy(self.taxonomy)

        if self.qual:
            quality = SAMQuality(self.qual)

        if self.hits:
            hits = SAMHits(self.hits, self.sample_id, self.hit_filter,
                           self.min_mapping_score,
                           self.min_alignment_score,
                           self.max_mismatches,
                           self.max_relative_mismatches,
                           self.min_continiously_matching,
                           self.filter_complexity)

        if self.sam:
            sam_file = open(self.sam, 'w')

        if self.bam:
            # samtools inherits bam_file as its stdout; the child keeps its
            # own descriptor, so closing bam_file here is safe
            with open(self.bam, 'wb', buffering=100 * 1024 * 1024) as bam_file:
                samtools_cline = samtools + [
                    'view', '-b', '-1', '-S', '-@', '4', '/dev/stdin']
                if self.debug:
                    sys.stdout.write(' '.join(samtools_cline) + '\n')
                samtools_process = subprocess.Popen(' '.join(samtools_cline), shell=True, stdout=bam_file, stdin=PIPE)

        # Hoist flag lookups out of the per-line loop
        do_sam = self.sam
        do_bam = self.bam
        do_taxonomy = self.taxonomy
        do_qual = self.qual
        do_hits = self.hits
        do_parse = do_taxonomy or do_qual or do_hits

        for i, line in enumerate(iter(star_process.stdout.readline, '')):

            if do_sam:
                sam_file.write(line)

            if do_bam:
                samtools_process.stdin.write(line)

            # SAM header lines are forwarded verbatim but never parsed
            if line[0] == '@':
                continue

            if do_parse:
                parsed_line = parser.parse(line)

                if do_taxonomy:
                    taxonomy.count(parsed_line)
                    # Periodic progress summary on stdout
                    if i > 0 and (i % 50000) == 0:
                        sys.stdout.write(''.join(taxonomy.get_summary(10)) + '\n')

                if do_qual:
                    quality.count(parsed_line)

                if do_hits:
                    hits.count(parsed_line)

        if do_bam:
            samtools_process.stdin.close()

        if do_sam:
            sam_file.close()

        if do_hits:
            hits.write()

        if do_taxonomy:
            sys.stdout.write(''.join(taxonomy.get_summary(10)) + '\n')
            taxonomy.write()

        if do_qual:
            quality.write()

        # Salvage STAR side-channel outputs from the temp dir; best-effort
        # because STAR only writes them for some configurations
        try:
            if self.unmapped1:
                shutil.move(os.path.join(temp_path, 'out' + 'Unmapped.out.mate1'),
                            self.unmapped1)
        except IOError:
            pass

        try:
            if self.unmapped2:
                shutil.move(os.path.join(temp_path, 'out' + 'Unmapped.out.mate2'),
                            self.unmapped2)
        except IOError:
            pass

        try:
            if self.chimeric_mappings:
                shutil.move(os.path.join(temp_path, 'out' + 'Chimeric.out.sam'),
                            self.chimeric_mappings)
        except IOError:
            pass

        shutil.rmtree(temp_path)
|
|
1222
|
|
@CLI.subcommand("dnamap")
class DNAmap(cli.Application):
    """ Map input reads against a BWA index.

    Streams `bwa mem` SAM output line-by-line into the configured sinks:
    a SAM file, a BAM file (via a samtools child process), and the
    SAMTaxonomy / SAMQuality / SAMHits accumulators defined elsewhere in
    this module.
    """

    index_dir = cli.SwitchAttr(['-i', '--index_dir'], str, mandatory=True,
                               help="Sets the index output directory")

    threads = cli.SwitchAttr(
        ['-t', '--threads'], cli.Range(1, 512), mandatory=False,
        help="Sets the number of threads to use",
        default=1)

    taxonomy = cli.SwitchAttr(
        ['-x', '--taxonomy'], str, mandatory=False,
        help="Output path for the taxonomy file; setting this option will also enable regular taxonomy output to stdout during mapping",
        default='')

    samtools_path = cli.SwitchAttr(['--samtools_path'], str, mandatory=False,
                                   help="Path to samtools executable",
                                   default='')

    temp_path = cli.SwitchAttr(['--temporary_path'], str, mandatory=False,
                               help="Path to temporary directory in which to generate temp files. All temp files with be automatically deleted after execution is complete.",
                               default='')

    min_mapping_score = cli.SwitchAttr(['--min_mapping_score'], cli.Range(1, 255), mandatory=False,
                                       help="Mimimum mapping score for saved hits (only applied to -v/--virana_hits)",
                                       default=None)

    min_alignment_score = cli.SwitchAttr(['--min_alignment_score'], cli.Range(1, 255), mandatory=False,
                                         help="Mimimum alignment score for saved hits (only applied to -v/--virana_hits)",
                                         default=None)

    max_mismatches = cli.SwitchAttr(['--max_mismatches'], cli.Range(0, 10000000), mandatory=False,
                                    help="Maximum number of mismatches for saved hits (only applied to -v/--virana_hits)",
                                    default=None)

    max_relative_mismatches = cli.SwitchAttr(['--max_relative_mismatches'], float, mandatory=False,
                                             help="Maximum number of mismatches relative to read length for saved hits (only applied to -v/--virana_hits)",
                                             default=None)

    min_continiously_matching = cli.SwitchAttr(['--min_continiously_matching'], cli.Range(0, 10000000), mandatory=False,
                                               help="Minimum number of continious matches for saved hits (only applied to -v/--virana_hits)",
                                               default=None)

    filter_complexity = cli.Flag(['--filter_complexity'],
                                 help="Discard low-complexity reads (only applied to -v/--virana_hits). Adds some extra processing load to the mapping and may discard important information. Applies to all output files, including quality files (!)",
                                 default=False)

    sample_id = cli.SwitchAttr(['--sample_id'], str, mandatory=False,
                               help="Alphanumeric string ([0-9a-zA-Z_-]*) used to designate sample information within the hit file",
                               default='no_sample_id')

    bam = cli.SwitchAttr(['-b', '--bam'], str, mandatory=False,
                         help="Path to unsorted, unindexed output BAM file",
                         default='')

    sam = cli.SwitchAttr(['-s', '--sam'], str, mandatory=False,
                         help="Path to output SAM file",
                         default='')

    qual = cli.SwitchAttr(['-q', '--qual'], str, mandatory=False,
                          help="Path to output quality file",
                          default='')

    hits = cli.SwitchAttr(['-v', '--virana_hits'], str, mandatory=False,
                          help="Path to bzip2-compressed tab-delimited output virana hit file",
                          default='')

    hit_filter = cli.SwitchAttr(
        ['-f', '--virana_hit_filter'], str, list=True, mandatory=False,
        help="Only generate hit groups that include at last one read mapping to a reference of this reference group.",
        default=[])

    # BUG FIX: `debug` was declared twice and a class-body
    # `if debug: logging.getLogger().setLevel(logging.DEBUG)` ran at class
    # definition time against the always-truthy Flag descriptor, forcing
    # DEBUG logging at import. The duplicate and the class-body check are
    # removed; main() performs the runtime check.
    debug = cli.Flag(["-d", "--debug"], help="Enable debug information")

    zipped = cli.Flag(["-z", "--zipped"], help="Input reads are zipped")

    sensitive = cli.Flag(
        ["--sensitive"], help="If given, mapping will process slower and more sensitive")

    bwa_path = cli.SwitchAttr(['--bwa_path'], str, mandatory=False,
                              help="Path to BWA executable",
                              default='')

    reads = cli.SwitchAttr(
        ['-r', '--reads'], str, list=True, mandatory=True,
        help="Sets the input reads. Add this parameter twice for paired end reads.")

    def main(self):
        """Assemble the `bwa mem` command line, run it, and stream its SAM
        output into the configured output files and report accumulators."""

        if self.debug:
            logging.getLogger().setLevel(logging.DEBUG)

        # Obtain bwa/samtools executables; fall back to names on $PATH
        bwa = [self.bwa_path and self.bwa_path or 'bwa']
        samtools = [self.samtools_path and self.samtools_path or 'samtools']

        # Check if genome directory is existing
        if not os.path.exists(self.index_dir):
            # BUG FIX: formerly interpolated nonexistent self.genome_dir
            # (AttributeError) and wrote the error to stdout without newline.
            sys.stderr.write('Index directory %s not existing, exiting\n'
                             % self.index_dir)
            sys.exit(1)

        if len(self.reads) not in (1, 2):
            message = 'Invalid number of FASTQ files; supply either one (single end) or two (paired end)\n'
            sys.stderr.write(message)
            sys.exit(1)

        # -M marks shorter split hits as secondary; the index files are
        # expected under <index_dir>/index.*
        bwa_cline = bwa + ['mem', '-t', str(self.threads), '-M', os.path.join(self.index_dir, 'index')]

        bwa_cline += self.reads

        if self.debug:
            sys.stdout.write(' '.join(bwa_cline) + '\n')

        bwa_process = subprocess.Popen(' '.join(bwa_cline), shell=True, stdout=PIPE)

        parser = SAMParser()

        if self.taxonomy:
            taxonomy = SAMTaxonomy(self.taxonomy)

        if self.qual:
            quality = SAMQuality(self.qual)

        if self.hits:
            hits = SAMHits(self.hits, self.sample_id, self.hit_filter,
                           self.min_mapping_score,
                           self.min_alignment_score,
                           self.max_mismatches,
                           self.max_relative_mismatches,
                           self.min_continiously_matching,
                           self.filter_complexity)

        if self.sam:
            sam_file = open(self.sam, 'w', buffering=100 * 1024 * 1024)

        if self.bam:
            # samtools inherits bam_file as its stdout; the child keeps its
            # own descriptor, so closing bam_file here is safe
            with open(self.bam, 'wb', buffering=100 * 1024 * 1024) as bam_file:
                samtools_cline = samtools + [
                    'view', '-b', '-1', '-S', '-@', '4', '/dev/stdin']
                if self.debug:
                    sys.stdout.write(' '.join(samtools_cline) + '\n')
                samtools_process = subprocess.Popen(' '.join(samtools_cline), shell=True, stdout=bam_file, stdin=PIPE)

        # Hoist flag lookups out of the per-line loop
        do_sam = self.sam
        do_bam = self.bam
        do_taxonomy = self.taxonomy
        do_qual = self.qual
        do_hits = self.hits
        do_parse = do_taxonomy or do_qual or do_hits

        for i, line in enumerate(iter(bwa_process.stdout.readline, '')):

            if do_sam:
                sam_file.write(line)

            if do_bam:
                samtools_process.stdin.write(line)

            # NOTE(review): unlike RNAmap, SAM header lines ('@...') are not
            # skipped before parsing here — confirm SAMParser tolerates them
            if do_parse:
                parsed_line = parser.parse(line)

                if do_taxonomy:
                    taxonomy.count(parsed_line)
                    # Periodic progress summary on stdout
                    if i > 0 and (i % 10000) == 0:
                        sys.stdout.write(''.join(taxonomy.get_summary(10)) + '\n')

                if do_qual:
                    quality.count(parsed_line)

                if do_hits:
                    hits.count(parsed_line)

        if do_bam:
            samtools_process.stdin.close()

        if do_sam:
            sam_file.close()

        if do_hits:
            hits.write()

        if do_taxonomy:
            sys.stdout.write(''.join(taxonomy.get_summary(10)) + '\n')
            taxonomy.write()

        if do_qual:
            quality.write()
|
|
1420
|
|
1421
|
|
# Script entry point: hand control to the plumbum application dispatcher,
# which selects the requested subcommand (rnamap/dnamap/...).
if __name__ == '__main__':
    CLI.run()
|