Mercurial > repos > mzeidler > virana2
comparison vmap.py @ 0:3ba5983012cf draft
Uploaded
| author | mzeidler |
|---|---|
| date | Tue, 24 Sep 2013 10:19:40 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:3ba5983012cf |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 #from __future__ import print_function | |
| 3 | |
| 4 import cProfile | |
| 5 | |
| 6 import sys | |
| 7 import re | |
| 8 | |
| 9 import tempfile | |
| 10 import subprocess | |
| 11 import shutil | |
| 12 import os | |
| 13 import os.path | |
| 14 import logging | |
| 15 import bz2 | |
| 16 import zlib | |
| 17 | |
| 18 import math | |
| 19 import string | |
| 20 | |
| 21 from collections import defaultdict, Counter | |
| 22 | |
| 23 from subprocess import PIPE | |
| 24 | |
| 25 NON_ID = ''.join(c for c in map(chr, range(256)) if not c.isalnum()) | |
| 26 NON_ID = NON_ID.replace('_', '').replace('-', '') | |
| 27 | |
| 28 try: | |
| 29 from Bio.SeqRecord import SeqRecord | |
| 30 from Bio import SeqIO | |
| 31 from Bio.Seq import Seq | |
| 32 except ImportError: | |
| 33 message = 'This script requires the BioPython python package\n' | |
| 34 sys.stderr.write(message) | |
| 35 sys.exit(1) | |
| 36 | |
| 37 try: | |
| 38 from plumbum import cli | |
| 39 except ImportError: | |
| 40 message = 'This script requires the plumbum python package\n' | |
| 41 sys.stderr.write(message) | |
| 42 sys.exit(1) | |
| 43 | |
| 44 | |
| 45 try: | |
| 46 import HTSeq | |
| 47 except ImportError: | |
| 48 message = 'This script requires the HTSeq python package\n' | |
| 49 sys.stderr.write(message) | |
| 50 sys.exit(1) | |
| 51 | |
| 52 KHMER_AVAILABLE = True | |
| 53 try: | |
| 54 import khmer | |
| 55 except ImportError: | |
| 56 KHMER_AVAILABLE = False | |
| 57 | |
| 58 #from io import BufferedRandom | |
| 59 | |
| 60 logging.basicConfig(level=logging.INFO, format='%(message)s') | |
| 61 | |
| 62 | |
def profile_this(fn):
    """Decorator that profiles every call to *fn* with cProfile.

    Each invocation's statistics are dumped to '<fn.__name__>.profile'
    in the current working directory (overwriting any previous dump);
    the wrapped function's return value is passed through unchanged.
    """
    import functools

    # functools.wraps preserves __name__/__doc__ of the wrapped function,
    # so decorated callables remain introspectable (the original wrapper
    # reported itself as 'profiled_fn').
    @functools.wraps(fn)
    def profiled_fn(*args, **kwargs):
        fpath = fn.__name__ + ".profile"
        prof = cProfile.Profile()
        ret = prof.runcall(fn, *args, **kwargs)
        prof.dump_stats(fpath)
        return ret
    return profiled_fn
| 71 | |
| 72 | |
class CLI(cli.Application):
    """RNA-Seq and DNA-Seq short read analysis by mapping to known reference sequences"""
    PROGNAME = "vmap"
    VERSION = "1.0.0"
    DESCRIPTION = """Virana vmap is an interface to the NCBI and ensembl reference databases that can
                  generate reference indexes for the short read mappers STAR (RNA-Seq) and
                  BWA-MEM (DNA-Seq). Short reads can be mapped to arbitrary combinations of
                  reference databases and the results can be summarized by taxonomic family
                  as well as stored as SAM file, unsorted BAM file, or as a HIT file that
                  models multimapping reads between specific reference databases."""
    USAGE = """The program has four modes that can be accessed by `vmap rnaindex`, `vmap dnaindex`, `vmap rnamap`, and `vmap dnamap.`"""

    def main(self, *args):
        """Top-level entry point: delegate to a nested subcommand.

        Returns 1 (after printing usage) when positional arguments are
        given or no subcommand was selected; otherwise plumbum dispatches
        to the chosen subcommand.
        """
        # Positional arguments are never valid at the top level.
        if args:
            print("Unknown command %r" % (args[0]))
            print(self.USAGE)
            return 1

        # Exactly one of the registered subcommands must be selected.
        if not self.nested_command:
            print("No command given")
            print(self.USAGE)
            return 1
| 96 | |
| 97 | |
@CLI.subcommand("rnaindex")
class RNAIndex(cli.Application):
    """ Creates a STAR index from a FASTA genome reference """

    reference_files = cli.SwitchAttr(
        ['-r', '--reference_file'], str, list=True, mandatory=True,
        help="Sets the reference genome(s) FASTA file." +
        " Multiple occurrences of this parameter are allowed.")
    index_dir = cli.SwitchAttr(['-i', '--index_dir'], str, mandatory=True,
                               help="Sets the index output directory." +
                               " Directory will be generated if not existing." +
                               " Directory will be filled with several index files.")
    threads = cli.SwitchAttr(
        ['-t', '--threads'], cli.Range(1, 512), mandatory=False,
        help="Sets the number of threads to use",
        default=1)
    max_ram = cli.SwitchAttr(
        ['-m'], cli.Range(1, 400000000000), mandatory=False,
        help="Sets the maximum amount of memory (RAM) to use (in bytes)",
        default=400000000000)
    path = cli.SwitchAttr(['-p', '--path'], str, mandatory=False,
                          help="Path to STAR executable",
                          default='')
    sparse = cli.Flag(
        ["-s", "--sparse"], help="If given, a sparse index that requires less " +
        " RAM in the mapping phase will be constructed")

    debug = cli.Flag(["-d", "--debug"], help="Enable debug output")

    def main(self):
        """Build a STAR genome index from the configured reference files.

        Exits with status 1 if any reference FASTA file is missing;
        creates the index directory if necessary and runs STAR in
        genomeGenerate mode, forwarding STAR's stderr to our stderr.
        """

        if self.debug:
            logging.getLogger().setLevel(logging.DEBUG)

        # Obtain star executable (fall back to 'STAR' on the PATH)
        star = [self.path and self.path or 'STAR']

        # Check that every reference FASTA file exists
        for reference_file in self.reference_files:
            if not os.path.exists(reference_file):
                # Fixed message: was 'nor existing' and lacked a newline.
                sys.stdout.write(
                    'Reference file %s not existing, exiting\n' % reference_file)
                sys.exit(1)

        # Create the index output directory if it does not exist yet
        if not os.path.exists(self.index_dir):
            logging.debug(
                'Making output directory for index at %s' % self.index_dir)
            os.makedirs(self.index_dir)

        # Assemble the STAR genomeGenerate command line
        cline = star + ['--runMode', 'genomeGenerate',
                        '--genomeDir', self.index_dir,
                        '--limitGenomeGenerateRAM', str(self.max_ram),
                        '--runThreadN', str(self.threads),
                        '--genomeFastaFiles'] + self.reference_files

        # Add parameters for sparse (memory-saving) index generation
        if self.sparse:
            cline += ['--genomeSAsparseD', '2',
                      '--genomeChrBinNbits', '12',
                      '--genomeSAindexNbases', '13']

        else:
            cline += ['--genomeSAsparseD', '1',
                      '--genomeChrBinNbits', '18',
                      '--genomeSAindexNbases', '15']

        if self.debug:
            print(' '.join(cline))

        # Run STAR with the argument list directly (shell=False): paths
        # containing spaces or shell metacharacters are passed through
        # verbatim instead of being re-parsed by a shell.
        star_process = subprocess.Popen(cline, stdout=PIPE, stderr=PIPE)

        # Block until streams are closed by the process
        stdout, stderr = star_process.communicate()

        if stderr:
            sys.stderr.write(stderr)

        if self.debug and stdout:
            print(stdout)
| 186 | |
| 187 | |
@CLI.subcommand("dnaindex")
class DNAIndex(cli.Application):
    """ Creates a BWA index from a FASTA reference file """

    reference_file = cli.SwitchAttr(['-r', '--reference_file'], str, mandatory=True,
                                    help="Sets the input reference FASTA file.")
    index_dir = cli.SwitchAttr(['-i', '--index_dir'], str, mandatory=True,
                               help="Sets the index output directory." +
                               " Directory will be generated if not existing." +
                               " Directory will be filled with several index files.")
    path = cli.SwitchAttr(['-p', '--path'], str, mandatory=False,
                          help="Path to BWA executable",
                          default='')
    debug = cli.Flag(["-d", "--debug"], help="Enable debug output")

    def main(self):
        """Build a BWA (bwtsw) index from the configured reference file.

        Exits with status 1 if the reference file is missing; creates the
        index directory if necessary and runs 'bwa index'.
        """

        # BUG FIX: this check used to sit at class-definition level, where
        # 'debug' is the (always truthy) Flag descriptor object, so DEBUG
        # logging was unconditionally enabled at import time. It now tests
        # the parsed flag value per invocation, matching RNAIndex.
        if self.debug:
            logging.getLogger().setLevel(logging.DEBUG)

        # Obtain bwa executable (fall back to 'bwa' on the PATH)
        bwa = [self.path and self.path or 'bwa']

        # Check that the reference FASTA file exists
        if not os.path.exists(self.reference_file):
            # Fixed message: was 'nor existing' and lacked a newline.
            sys.stdout.write('Genome file %s not existing, exiting\n' % self.reference_file)
            sys.exit(1)

        # Create the index output directory if it does not exist yet
        if not os.path.exists(self.index_dir):
            logging.debug('Making output directory %s' % self.index_dir)
            os.makedirs(self.index_dir)

        # Assemble the 'bwa index' command line
        cline = bwa + ['index', '-a', 'bwtsw', '-p', os.path.join(self.index_dir, 'index'), self.reference_file]

        if self.debug:
            print(' '.join(cline))

        # Run BWA with the argument list directly (shell=False) so paths
        # with spaces or shell metacharacters are handled safely.
        bwa_process = subprocess.Popen(cline, stdout=PIPE, stderr=PIPE)
        stdout, stderr = bwa_process.communicate()

        if stderr:
            sys.stderr.write(stderr)

        if self.debug and stdout:
            print(stdout)
| 236 | |
| 237 | |
| 238 | |
| 239 | |
| 240 | |
class SAMHits:
    """ Converts SAM output of mappers into bzipped HIT files.

    Consumes parsed SAM lines (tuples as produced by SAMParser.parse),
    groups consecutive alignments belonging to the same read, applies the
    configured quality filters, and writes each passing read as a FASTA
    record whose description lists all of the read's reference hits.
    """

    def __init__(self, output_file, sample_id, refseq_filter=None, min_mapping_score=None,\
                 min_alignment_score=None, max_mismatches=None,\
                 max_relative_mismatches=None, min_continiously_matching=None,\
                 filter_complexity=False):
        # output_file              : path of the bz2-compressed HIT file to write
        # sample_id                : sample identifier; characters outside
        #                            [alnum, '_', '-'] are stripped below
        # refseq_filter            : optional list of refseq group names; only
        #                            reads hitting at least one are written
        # min_mapping_score        : optional minimum mapping quality
        # min_alignment_score      : optional minimum alignment score
        # max_mismatches           : optional maximum absolute mismatch count
        # max_relative_mismatches  : optional maximum mismatches / read length
        # min_continiously_matching: optional minimum length of the longest
        #                            continuous cigar 'M' run
        # filter_complexity        : if True, drop low-complexity reads

        # Large write buffer (100 MiB) to reduce per-write bz2 overhead.
        self.output_file = bz2.BZ2File(output_file, 'wb', buffering=100 * 1024 * 1024)
        # Python 2 str.translate deletion form: drop every character in NON_ID.
        self.sample_id = sample_id.translate(None, NON_ID)
        self.refseq_filter = refseq_filter
        self.max_mismatches = max_mismatches
        self.max_relative_mismatches = max_relative_mismatches
        # Pending [read_name, sequence, list-of-hit-fields] for the read
        # currently being accumulated.
        self.current_group = []
        self.min_mapping_score = min_mapping_score
        self.min_alignment_score = min_alignment_score
        self.min_continiously_matching = min_continiously_matching
        self.filter_complexity = filter_complexity

        # Precompiled cigar helpers (not referenced elsewhere in this class).
        self.re_matches = re.compile(r'(\d+)M')
        self.re_dels = re.compile(r'(\d+)D')

    def count(self, parsed_line):
        """Process one parsed SAM line: append its hit to the current read
        group, flushing the previous group when a new read starts."""

        if parsed_line is None:
            return

        read_key, read_name, flag, ref_name, ref_position, mapping_score,\
            cigar, mate_ref_name, mate_ref_position, insert_size, seq, qual,\
            is_end1, is_end2, number_mismatches, alignment_score,\
            number_hits, is_reverse, is_primary, is_mapped, is_mate_mapped,\
            is_paired, number_matches, read_end_pos, max_match = parsed_line

        # Unmapped segments carry no reference hit to record.
        if not is_mapped:
            return

        if self.filter_complexity:

            # Low-complexity filter: zlib compresses repetitive sequences
            # very well, so a small compressed/raw ratio flags low
            # sequence complexity.
            avg_compression = float(len(zlib.compress(seq)))/len(seq)

            if avg_compression < 0.5:
                return

            # length = len(seq)
            # counts = [seq.count(nuc) for nuc in 'ACGT']
            # min_count = length * 0.10
            # max_count = length * 0.50
            # for count in counts:
            #     if count < min_count or count > max_count:
            #         return None

            # counter = Counter()
            # for i in range(length - 2):
            #     counter[seq[i: i + 3]] += 1
            # maximal = length - 4

            # highest = sum([v for k, v in counter.most_common(2)])
            # if highest > (maximal / 3.0):
            #     return None

            # self.passed.append(avg_compression)

        # Suffix distinguishing the two mates of a pair.
        pair_id = ''
        if is_end1:
            pair_id = '/1'

        elif is_end2:
            pair_id = '/2'

        read_name = self.sample_id + ';' + read_name + pair_id

        # Initialize new current group
        if len(self.current_group) == 0:
            self.current_group = [read_name, seq, []]

        # Write old current group to file
        if read_name != self.current_group[0]:
            self._write_group()
            self.current_group = [read_name, seq, []]

        # Reference names are expected to be formatted as
        # 'refseq_group;family;organism;identifier;...'
        try:
            refseq_group, family, organism, identifier = ref_name.split(';')[:4]
        except ValueError:
            sys.stderr.write('Read mapped to malformed reference sequence %s, skipping\n' % ref_name)
            return

        # Each filter below silently drops the alignment when it fails.
        if self.min_continiously_matching:

            if self.min_continiously_matching > max_match:
                return

        if self.max_mismatches\
                and int(number_mismatches) > self.max_mismatches:
            return

        if self.max_relative_mismatches\
                and int(number_mismatches) / float(len(seq))\
                > self.max_relative_mismatches:
            return

        if self.min_mapping_score\
                and self.min_mapping_score > mapping_score:
            return

        if self.min_alignment_score\
                and self.min_alignment_score > alignment_score:
            return

        # Convert 0-based parser position to 1-based HIT coordinates.
        start = int(ref_position) + 1

        self.current_group[2].append([refseq_group, family, organism, identifier, str(start), str(read_end_pos)])

    def _write_group(self):
        """Write the buffered read group as one FASTA record, provided it
        passes the optional refseq-group filter."""
        passed = True

        if self.refseq_filter:
            # Pass if any hit of this read belongs to a requested group.
            passed = False
            for refseq_group, family, organism, identifier, start, end in self.current_group[2]:
                if passed:
                    break
                for f in self.refseq_filter:
                    if refseq_group == f:
                        passed = True
                        break
        if passed:
            # Join all hits into a '|'-separated description string.
            description = []
            for identifier in self.current_group[2]:
                description.append(';'.join(identifier))
            description = '|'.join(description)

            record = SeqRecord(Seq(self.current_group[1]))
            record.id = 'Read;' + self.current_group[0]
            record.description = description

            SeqIO.write([record], self.output_file, "fasta")

    def write(self):
        """Flush the final read group and close the output file."""

        self._write_group()
        self.output_file.close()
| 381 | |
class SAMParser:
    """ Parses single SAM lines into a flat tuple of alignment attributes
    consumed by SAMHits, SAMQuality, and SAMTaxonomy. """

    def parse(self, line):
        """Parse one SAM line.

        Returns None for header lines ('@...'); otherwise a 25-element
        tuple (read_key, read_name, flag, ref_name, ref_position,
        mapping_score, cigar, mate_ref_name, mate_ref_position,
        insert_size, seq, qual, is_end1, is_end2, number_mismatches,
        alignment_score, number_hits, is_reverse, is_primary, is_mapped,
        is_mate_mapped, is_paired, number_matches, read_end_pos,
        max_match).
        """

        if line[0] == '@':
            return None

        alignment = HTSeq._HTSeq.SAM_Alignment.from_SAM_line(line)
        read_name = alignment.read.name
        seq = alignment.read.seq
        qual = alignment.read.qual
        flag = alignment.flag
        cigar = None

        # Decode the SAM FLAG bits.
        is_paired = (flag & 1)
        is_mapped = not (flag & 4)
        is_mate_mapped = alignment.mate_aligned is not None  # not (flag & 8)
        is_reverse = (flag & 16)
        is_end1 = (flag & 64)
        is_end2 = (flag & 128)
        is_primary = not (flag & 256)

        # Key identifying the read end across its multiple alignments.
        read_key = (read_name, is_end1)

        ref_name = None
        ref_position = None
        # NOTE(review): mapping_score is never populated from the record;
        # alignment.aQual is stored in alignment_score below -- confirm
        # which of the two MAPQ is intended to land in.
        mapping_score = 0
        mate_ref_name = None
        mate_ref_position = None
        insert_size = None
        alignment_score = 0
        read_end_pos = None

        if is_mate_mapped and alignment.mate_start:
            mate_ref_name = alignment.mate_start.chrom
            mate_ref_position = alignment.mate_start.start

        number_hits = 0
        alignment_score = 0
        number_mismatches = 0

        number_matches = 0
        max_match = 0

        if is_mapped:

            ref_name = alignment.iv.chrom
            ref_position = alignment.iv.start
            read_end_pos = alignment.iv.end
            alignment_score = alignment.aQual
            cigar = alignment.cigar

            if is_mate_mapped:
                insert_size = alignment.inferred_insert_size

            # Total matched bases and longest continuous match run.
            for c in cigar:
                if c.type == 'M':
                    number_matches += c.size
                    max_match = max(max_match, c.size)

            for tag, value in alignment.optional_fields:
                if tag == 'NM':
                    # BUG FIX: per the SAM spec, NM is the edit distance to
                    # the reference (mismatch count) and NH is the number of
                    # reported alignments. The original code stored NM into
                    # number_hits and NH into number_mismatches, inverting
                    # every downstream mismatch filter and uniqueness test.
                    number_mismatches = value
                elif tag == 'AS':
                    alignment_score = value
                elif tag == 'NH':
                    number_hits = value

        return read_key, read_name, flag, ref_name, ref_position, mapping_score,\
            cigar, mate_ref_name, mate_ref_position, insert_size, seq, qual,\
            is_end1, is_end2, number_mismatches, alignment_score,\
            number_hits, is_reverse, is_primary, is_mapped, is_mate_mapped,\
            is_paired, number_matches, read_end_pos, max_match
| 455 | |
| 456 | |
class SAMQuality:
    """ Accumulates per-segment and per-read quality metrics from parsed
    SAM lines and writes a tab-separated summary report. """

    def __init__(self, file_path):
        # file_path: path of the tab-separated report written by write().

        self.file_path = file_path

        # (item, metric) -> Counter of observed values.
        self.stored = defaultdict(Counter)
        # reference name -> alignment counts (all vs. primary-only).
        self.all_references = defaultdict(int)
        self.primary_references = defaultdict(int)
        # Translation table for reverse complementing (Python 2
        # string.maketrans on byte strings).
        self.complement = string.maketrans('ATCGN', 'TAGCN')

        if KHMER_AVAILABLE:
            # 10-mer counting table used for the top-kmer report.
            self.ktable = khmer.new_ktable(10)

    def _get_complement(self, sequence):
        # Reverse complement via the precomputed translation table.
        return sequence.translate(self.complement)[::-1]

    def _get_summary(self, counter):
        """Returns seven summary statistics (count, sum, min, max, mode,
        mean, and std) for a numeric frequency counter."""

        maximum = 0
        minimum = sys.maxint
        thesum = 0
        allcount = 0
        # mode is tracked as [frequency, value].
        mode = [0, None]

        items = 0.0
        mean = 0.0
        m2 = 0.0
        variance = 0.0

        for item in counter:

            count = counter[item]
            if count > mode[0]:
                mode = [count, item]

            allcount += count
            maximum = max(maximum, item)
            minimum = min(minimum, item)
            thesum += (count * item)

            # Welford-style online mean/variance, applied once per
            # occurrence of this value (hence the inner while loop).
            x = 1
            while x <= count:
                items += 1
                delta = item - mean
                mean = mean + delta / items
                m2 = m2 + delta * (item - mean)
                variance = m2 / items
                x += 1

        std = math.sqrt(variance)

        return allcount, thesum, minimum, maximum, mode[1], mean, std


    def _to_unit(self, item, is_percentage=False):
        """ Convert a numeric to a string with metric units """

        if is_percentage:
            return ('%-.3f' % (item * 100)) + '%'
        converted = None
        try:
            item = float(item)
            # NOTE(review): the first branch divides by 10**9 but labels
            # the result 'P' (peta); divisor and suffix look inconsistent
            # -- confirm the intended unit.
            if item > 10**12:
                converted = str(round(item / 10**9,3))+'P'
            elif item > 10**9:
                converted = str(round(item / 10**9,3))+'G'
            elif item > 10**6:
                converted = str(round(item / 10**6,3))+'M'
            elif item > 10**3:
                converted = str(round(item / 10**3,3))+'K'
            else:
                converted = str(round(item,3))
        except:
            # Non-numeric items (reference names, kmers) pass through
            # unchanged as strings.
            converted = str(item)

        return converted

    def _str_metrics(self, data):
        """Render every stored (item, metric) counter as report lines."""

        str_metrics = []

        for (item, metric) in sorted(data.keys()):
            counter = data[(item, metric)]
            # Numeric counters (first key has a 'real' attribute) get a
            # statistical summary; non-numeric counters are listed per
            # element with raw counts.
            # NOTE(review): iterkeys().next() raises StopIteration on an
            # empty counter -- confirm counters are always non-empty here.
            if not hasattr(counter.iterkeys().next(), 'real'):
                for element, count in counter.most_common():
                    str_metrics.append(self._str_metric(item, metric, element, count, no_units=True))
            else:
                summary = self._get_summary(counter)
                str_metrics.append(self._str_metric(item, metric, *summary))

        return str_metrics

    def _str_metric(self, item, metric, count, thesum='', minimum='',\
                    maximum='', mode='', mean='', std='', no_units=False):
        """Format one report row; with no_units=True the raw values are
        printed instead of metric-unit abbreviations."""

        counters = [count, thesum, minimum, maximum, mode, mean, std]
        counters = map(str, counters)

        if no_units:
            items = [item, metric] + counters
        else:
            units = map(self._to_unit, counters)
            items = [item, metric] + units

        return '%-15s\t%-60s\t%12s\t%12s\t%12s\t%12s\t%12s\t%12s\t%12s\n' \
            % tuple(items)


    def _count_read(self, metric, data, sample):
        """Record read-level metrics (mappings and insert size)."""

        item = 'read'

        (insert_size, alignment_score, mapping_score, length,\
            q20_length, avg_phred_quality, number_hits, is_reverse) = data

        self.stored[(item, metric + ' mappings')][number_hits] += sample
        self.stored[(item, metric + ' insert')][insert_size] += sample


    def _count_segment(self, metric, data, sample):
        """Record segment-level metrics (scores, lengths, orientation)."""

        item = 'segment'

        (insert_size, alignment_score, mapping_score, length,\
            q20_length, avg_phred_quality, number_hits, is_reverse) = data

        self.stored[(item, metric + ' algq')][alignment_score] += sample
        self.stored[(item, metric + ' mapq')][mapping_score] += sample
        self.stored[(item, metric + ' length')][length] += sample
        self.stored[(item, metric + ' q20length')][q20_length] += sample
        self.stored[(item, metric + ' meanbasequal')][avg_phred_quality] += sample
        self.stored[(item, metric + ' reverse')][is_reverse] += sample

    def count(self, parsed_line):
        """Accumulate quality metrics for one parsed SAM line."""

        if parsed_line is None:
            return

        #print_metric('Item' , 'Metric', 'Count', 'Sum', 'Min', 'Max', 'Mode', 'Mean', 'STD')

        read_key, read_name, flag, ref_name, ref_position, mapping_score,\
            cigar, mate_ref_name, mate_ref_position, insert_size, seq, qual,\
            is_end1, is_end2, number_mismatches, alignment_score,\
            number_hits, is_reverse, is_primary, is_mapped, is_mate_mapped,\
            is_paired, number_matches, read_end_pos, max_match = parsed_line

        # NOTE(review): assumes qual holds raw ASCII values (Sanger +33
        # encoding); HTSeq's .qual may already be decoded phred scores --
        # confirm against the parser's HTSeq version.
        phred_quality = [q - 33 for q in qual]
        avg_phred_quality = sum(phred_quality) / float(len(phred_quality))
        length = len(seq)
        mate_reference_id = mate_ref_name
        reference_id = ref_name
        reference = reference_id is not None and reference_id != '*'
        # Keep None/0 unchanged; otherwise use the absolute insert size.
        insert_size = insert_size and abs(insert_size) or insert_size
        # Read-level metrics are sampled once per read, on segment 1.
        is_segment1 = not is_paired or (is_paired and is_end1)
        is_reverse = is_reverse
        is_unique = is_primary and number_hits == 1
        # Mates mapped to two different references.
        is_translocation = is_paired and is_mapped and is_mate_mapped\
            and (mate_reference_id != '=' and reference_id != mate_reference_id)
        is_part_of_doublemap = is_paired and is_mapped and is_mate_mapped
        is_part_of_halfmap = is_paired and (is_mapped != is_mate_mapped)
        is_part_of_nomap = is_paired and not is_mapped and not is_mate_mapped


        # Count length until first low quality base call
        q20_length = 0
        for q in phred_quality:
            if q < 20:
                break
            q20_length += 1

        # Count kmers
        if KHMER_AVAILABLE:
            if not is_reverse:
                self.ktable.consume(seq)
            else:
                # Count kmers in the read's original orientation.
                self.ktable.consume(self._get_complement(seq))

        if reference:

            self.all_references[reference_id] += 1

            if is_primary:
                self.primary_references[reference_id] += 1


        data = (insert_size, alignment_score, mapping_score, length,\
            q20_length, avg_phred_quality, number_hits, is_reverse)

        sample = 1

        # Unconditional segment/read tallies.
        self._count_segment('sequenced', data, sample)
        if is_mapped:
            self._count_segment('sequenced mapped multi', data, sample)
            if is_primary:
                self._count_segment('sequenced mapped primary', data, sample)
                if number_hits and is_unique:
                    self._count_segment('sequenced mapped primary unique', data, sample)

            if is_segment1:
                self._count_read('sequenced mapped multi', data, sample)
                if is_primary:
                    self._count_read('sequenced mapped primary', data, sample)

        # Paired-end stratification: both mates mapped / one mapped / none.
        if is_paired:
            self._count_segment('sequenced paired', data, sample)

            if is_part_of_doublemap:
                self._count_segment('sequenced paired doublemap', data, sample)

                if is_primary:
                    self._count_segment('sequenced paired doublemap primary', data, sample)

                if is_segment1:
                    self._count_read('sequenced paired doublemap multi', data, sample)

                    if is_primary:
                        self._count_read('sequenced paired doublemap primary', data, sample)

                        if number_hits and is_unique:
                            self._count_read('sequenced paired doublemap primary unique', data, sample)

                            if is_translocation:
                                self._count_read('sequenced paired doublemap primary unique translocation', data, sample)

            elif is_part_of_halfmap:

                self._count_segment('sequenced paired halfmap', data, sample)

                # The mapped segment
                if is_mapped:

                    self._count_segment('sequenced paired halfmap mapped', data, sample)

                    if is_primary:

                        self._count_read('sequenced paired halfmap mapped primary', data, sample)

                        if number_hits and is_unique:
                            self._count_read('sequenced paired halfmap mapped primary unique', data, sample)

                    elif not is_primary:

                        self._count_read('sequenced unpaired mapped multi', data, sample)

                # The unmapped segment
                elif not is_mapped:
                    self._count_segment('sequenced paired halfmap unmapped', data, sample)

            elif is_part_of_nomap:

                self._count_segment('sequenced paired nomap', data, sample)

                if is_segment1:

                    self._count_read('sequenced paired nomap', data, sample)

        elif not is_paired:

            self._count_segment('sequenced unpaired', data, sample)

            if is_mapped:

                self._count_segment('sequenced unpaired mapped', data, sample)

                if is_primary:

                    self._count_read('sequenced unpaired mapped primary', data, sample)

                    if number_hits and is_unique:

                        # NOTE(review): this label reads 'paired unpaired',
                        # which looks like a typo for 'unpaired ...'; it is
                        # runtime output and left unchanged -- confirm.
                        self._count_read('sequenced paired unpaired mapped primary unique', data, sample)

                elif not is_primary:

                    self._count_read('sequenced unpaired mapped multi', data, sample)


            elif not is_mapped:

                self._count_segment('sequenced unpaired unmapped', data, sample)

                if is_segment1:
                    self._count_read('sequenced unpaired unmapped', data, sample)

    def write(self):
        """Finalize top-reference and top-kmer metrics and write the report."""

        with open(self.file_path, 'w') as output_file:

            # Top 30 references by total alignment count.
            all_references = sorted([(count, reference) for reference, count\
                in self.all_references.iteritems()], reverse=True)

            for j, (count, reference) in enumerate(all_references[:30]):
                self.stored[('segment', 'multireference_' + str(j+1))][reference] = count

            # Top 30 references by primary alignment count.
            primary_references = sorted([(count, reference) for reference, count\
                in self.primary_references.iteritems()], reverse=True)

            for j, (count, reference) in enumerate(primary_references[:30]):
                self.stored[('segment', 'primaryreference_' + str(j+1))][reference] = count

            # Extract top-ranking kmers
            if KHMER_AVAILABLE:
                kmer_frequencies = []
                for i in range(0, self.ktable.n_entries()):
                    n = self.ktable.get(i)
                    if n > 0:
                        kmer_frequencies.append((n, self.ktable.reverse_hash(i)))
                kmer_frequencies = sorted(kmer_frequencies, reverse=True)
                for j, (frequency, kmer) in enumerate(kmer_frequencies[:10]):
                    self.stored[('segment', 'kmer_' + str(j+1))][kmer] = frequency

            output_file.writelines(self._str_metrics(self.stored))
| 773 | |
| 774 | |
class SAMTaxonomy:
    """ Provides taxonomic summary information from a SAM file stream. """

    def __init__(self, file_path):
        # file_path: path of the summary report written by write().

        self.file_path = file_path

        # organism -> number of primary alignments.
        self.count_primaries = Counter()
        # organism -> aggregated statistics list; see count() for the layout.
        self.detailed_information = {}

        # Per-read bookkeeping across the consecutive SAM lines that
        # belong to the same read: which organisms it hit and how many
        # human primary/secondary alignments it had.
        self._last_read = (None, None)
        self._last_read_human_prim = 0
        self._last_read_human_sec = 0
        self._last_organisms = set()

    def count(self, parsed_line):
        """Accumulate taxonomy statistics for one parsed SAM line."""

        if parsed_line is None:
            return

        read_key, read_name, flag, ref_name, ref_position, mapping_score,\
            cigar, mate_ref_name, mate_ref_position, insert_size, seq, qual,\
            is_end1, is_end2, number_mismatches, alignment_score,\
            number_hits, is_reverse, is_primary, is_mapped, is_mate_mapped,\
            is_paired, number_matches, read_end_pos, max_match = parsed_line

        if is_mapped:

            # Reference names are expected to be formatted as
            # 'refseq_group;family;organism;gi;...'
            refseq_group, family, organism, gi = ref_name.split(';')[:4]

            if is_primary:
                self.count_primaries[organism] += 1

            if organism not in self.detailed_information:
                # refseq_group. family, gis, avg_mapping_score,
                # avg_seq_length, avg_number_hits, avg_alignment_score, avg_nr_mismatches
                # Entry layout (index): 0 refseq_group, 1 family,
                # 2 set of gis, 3 [sum, n] mapping score, 4 [sum, n] read
                # length, 5 [sum, n] hits (primary only), 6 [sum, n]
                # alignment score, 7 [sum, n] mismatches, 8 human primary
                # count, 9 human secondary count.
                initial = [refseq_group,
                           family,
                           set([gi]),
                           [int(mapping_score), 1],
                           [len(seq), 1],
                           [0, 0],
                           [alignment_score, 1],
                           [number_mismatches, 1],
                           0,
                           0]

                self.detailed_information[organism] = initial

            else:
                entry = self.detailed_information[organism]
                entry[2].add(gi)
                entry[3][0] += int(mapping_score)
                entry[3][1] += 1
                entry[4][0] += len(seq)
                entry[4][1] += 1
                entry[6][0] += alignment_score
                entry[6][1] += 1
                entry[7][0] += number_mismatches
                entry[7][1] += 1

            # Hit counts are averaged over primary alignments only.
            if is_primary:
                entry = self.detailed_information[organism]
                entry[5][0] += number_hits
                entry[5][1] += 1

            if self._last_read == (None, None):
                self._last_read = read_key

            # A new read started: flush the human-hit counts collected for
            # the previous read to every organism that read mapped to.
            # NOTE(review): counts for the very last read in the stream are
            # only flushed when a further read arrives; write() does not
            # flush them -- confirm whether that is intended.
            if self._last_read != read_key:

                for last_organism in self._last_organisms:

                    self.detailed_information[last_organism][8]\
                        += self._last_read_human_prim

                    self.detailed_information[last_organism][9]\
                        += self._last_read_human_sec

                self._last_read = read_key
                self._last_organisms = set()
                self._last_read_human_prim = 0
                self._last_read_human_sec = 0

            self._last_organisms.add(organism)

            if organism == 'Homo_sapiens':
                if is_primary:
                    self._last_read_human_prim += 1
                else:
                    self._last_read_human_sec += 1

    def get_summary(self, top=100):
        """Return formatted report lines for the *top* organisms ranked by
        their number of primary alignments."""

        lines = []

        lines.append('%10s\t%20s\t%20s\t%-20s\t%10s\t%10s\t%10s\t%5s\t%5s\t%5s\t%10s\t%10s\n'\
            % ('Count', 'Group', 'Family', 'Organism', 'Targets', 'ReadLen', 'Hits', 'Map', 'Algn', 'Mism', 'HuP', 'HuS'))

        top_organisms = self.count_primaries.most_common(top)

        for organism, count in top_organisms:

            refseq_group, family, identifiers,\
                avg_mapping_score, avg_seq_length, avg_number_hits,\
                avg_alignment_score, avg_nr_mismatches, human_prim, human_sec\
                = self.detailed_information[organism]

            avg_len = int(avg_seq_length[0] / float(avg_seq_length[1]))
            # Hits are accumulated for primary alignments only; guard
            # against organisms that never had a primary alignment.
            if avg_number_hits[1] == 0:
                avg_hits = 0
            else:
                avg_hits = int(avg_number_hits[
                    0] / float(avg_number_hits[1]))

            avg_mapping_score = int(avg_mapping_score[
                0] / float(avg_mapping_score[1]))

            avg_alignment_score = int(avg_alignment_score[
                0] / float(avg_alignment_score[1]))

            avg_nr_mismatches = int(avg_nr_mismatches[
                0] / float(avg_nr_mismatches[1]))

            nr_ids = len(identifiers)

            # Abbreviate large counts with an 'M' (million) suffix.
            if count > 10**6:
                count = str(round(count / float(10**6), 3)) + 'M'
            if human_prim > 10**6:
                human_prim = str(round(human_prim / float(10**6), 3)) + 'M'
            if human_sec > 10**6:
                human_sec = str(round(human_sec / float(10**6), 3)) + 'M'
            if nr_ids > 10**6:
                nr_ids = str(round(nr_ids / float(10**6), 3)) + 'M'

            lines.append('%10s\t%20s\t%20s\t%-20s\t%10s\t%10i\t%10i\t%5i\t%5i\t%5i\t%10s\t%10s\n'\
                % (str(count), refseq_group[:20], family[:20], organism[:20],\
                str(nr_ids), avg_len, avg_hits, avg_mapping_score,\
                avg_alignment_score, avg_nr_mismatches, str(human_prim),\
                str(human_sec)))

        return lines

    def write(self):
        """Write the taxonomic summary to the configured output path."""

        with open(self.file_path, 'w') as output_file:
            output_file.writelines(self.get_summary())
| 922 | |
| 923 @CLI.subcommand("rnamap") | |
| 924 class RNAmap(cli.Application): | |
| 925 """ Map input reads against a STAR index """ | |
| 926 | |
| 927 index_dir = cli.SwitchAttr(['-i', '--index_dir'], str, mandatory=True, | |
| 928 help="Sets the index output directory") | |
| 929 | |
| 930 threads = cli.SwitchAttr( | |
| 931 ['-t', '--threads'], cli.Range(1, 512), mandatory=False, | |
| 932 help="Sets the number of threads to use", | |
| 933 default=1) | |
| 934 | |
| 935 taxonomy = cli.SwitchAttr( | |
| 936 ['-x', '--taxonomy'], str, mandatory=False, | |
| 937 help="Output path for the taxonomy file; setting this option will also enable regular taxonomy output to stdout during mapping", | |
| 938 default='') | |
| 939 | |
| 940 star_path = cli.SwitchAttr(['--star_path'], str, mandatory=False, | |
| 941 help="Path to STAR executable", | |
| 942 default='') | |
| 943 | |
| 944 samtools_path = cli.SwitchAttr(['--samtools_path'], str, mandatory=False, | |
| 945 help="Path to samtools executable", | |
| 946 default='') | |
| 947 | |
| 948 temp_path = cli.SwitchAttr(['--temporary_path'], str, mandatory=False, | |
| 949 help="Path to temporary directory in which to generate temp files. All temp files with be automatically deleted after execution is complete.", | |
| 950 default='') | |
| 951 | |
| 952 min_mapping_score = cli.SwitchAttr(['--min_mapping_score'], cli.Range(1, 255), mandatory=False, | |
| 953 help="Mimimum mapping score for saved hits (only applied to -v/--virana_hits)", | |
| 954 default=None) | |
| 955 | |
| 956 min_alignment_score = cli.SwitchAttr(['--min_alignment_score'], cli.Range(1, 255), mandatory=False, | |
| 957 help="Mimimum alignment score for saved hits (only applied to -v/--virana_hits)", | |
| 958 default=None) | |
| 959 | |
| 960 max_mismatches = cli.SwitchAttr(['--max_mismatches'], cli.Range(0, 10000000), mandatory=False, | |
| 961 help="Maximum number of mismatches for saved hits (only applied to -v/--virana_hits)", | |
| 962 default=None) | |
| 963 | |
| 964 max_relative_mismatches = cli.SwitchAttr(['--max_relative_mismatches'], float, mandatory=False, | |
| 965 help="Maximum number of mismatches relative to read length for saved hits (only applied to -v/--virana_hits)", | |
| 966 default=None) | |
| 967 | |
| 968 min_continiously_matching = cli.SwitchAttr(['--min_continiously_matching'], cli.Range(0, 10000000), mandatory=False, | |
| 969 help="Minimum number of continious matches for saved hits (only applied to -v/--virana_hits)", | |
| 970 default=None) | |
| 971 | |
| 972 filter_complexity = cli.Flag(['--filter_complexity'], | |
| 973 help="Discard low-complexity reads (only applied to -v/--virana_hits). Adds some extra processing load to the mapping and may discard important information. Applies to all output files, including quality files (!)", | |
| 974 default=False) | |
| 975 | |
| 976 bam = cli.SwitchAttr(['-b', '--bam'], str, mandatory=False, | |
| 977 help="Path to unsorted, unindexed output BAM file", | |
| 978 default='') | |
| 979 | |
| 980 sam = cli.SwitchAttr(['-s', '--sam'], str, mandatory=False, | |
| 981 help="Path to output SAM file", | |
| 982 default='') | |
| 983 | |
| 984 qual = cli.SwitchAttr(['-q', '--qual'], str, mandatory=False, | |
| 985 help="Path to output quality file", | |
| 986 default='') | |
| 987 | |
| 988 hits = cli.SwitchAttr(['-v', '--virana_hits'], str, mandatory=False, | |
| 989 help="Path to bzip2-compressed tab-delimited output hit file", | |
| 990 default='') | |
| 991 | |
| 992 sample_id = cli.SwitchAttr(['--sample_id'], str, mandatory=False, | |
| 993 help="Alphanumeric string ([0-9a-zA-Z_-]*) used to designate sample information within the hit file", | |
| 994 default='no_sample_id') | |
| 995 | |
| 996 unmapped1 = cli.SwitchAttr(['--unmapped_end_1'], str, mandatory=False, | |
| 997 help="Output path to uncompressed fastq file containing unmapped reads, first ends only for paired ends.", | |
| 998 default='') | |
| 999 | |
| 1000 unmapped2 = cli.SwitchAttr(['--unmapped_end_2'], str, mandatory=False, | |
| 1001 help="Output path to uncompressed fastq file containing unmapped reads, second ends only for paired ends.", | |
| 1002 default='') | |
| 1003 | |
| 1004 splice_junctions = cli.SwitchAttr(['--splice_junctions'], str, mandatory=False, | |
| 1005 help="Input path to splice junction file (currently not implemented)", | |
| 1006 default='') | |
| 1007 | |
| 1008 chimeric_mappings = cli.SwitchAttr(['--chimeric_mappings'], str, mandatory=False, | |
| 1009 help="Ouput path to SAM file containing chimeric mappings", | |
| 1010 default='') | |
| 1011 | |
| 1012 hit_filter = cli.SwitchAttr( | |
| 1013 ['-f', '--virana_hit_filter'], str, list=True, mandatory=False, | |
| 1014 help="Only generate hit groups that include at last one read mapping to a reference of this reference group.", | |
| 1015 default=[]) | |
| 1016 | |
| 1017 debug = cli.Flag(["-d", "--debug"], help="Enable debug information") | |
| 1018 | |
| 1019 reads = cli.SwitchAttr( | |
| 1020 ['-r', '--reads'], str, list=True, mandatory=True, | |
| 1021 help="Sets the input reads. Add this parameter twice for paired end reads.") | |
| 1022 | |
| 1023 zipped = cli.Flag(["-z", "--zipped"], help="Input reads are zipped") | |
| 1024 | |
| 1025 sensitive = cli.Flag( | |
| 1026 ["--sensitive"], help="If given, mapping will process slower and more sensitive") | |
| 1027 | |
| 1028 def main(self): | |
| 1029 | |
| 1030 if self.debug: | |
| 1031 logging.getLogger().setLevel(logging.DEBUG) | |
| 1032 | |
| 1033 # Obtain star executable | |
| 1034 star = [self.star_path and self.star_path or 'STAR'] | |
| 1035 samtools = [self.samtools_path and self.samtools_path or 'samtools'] | |
| 1036 | |
| 1037 # Check if genome directory is existing | |
| 1038 if not os.path.exists(self.index_dir): | |
| 1039 sys.stdout.write('Index directory %s not existing, exiting' % self.genome_dir) | |
| 1040 sys.exit(1) | |
| 1041 | |
| 1042 if self.temp_path: | |
| 1043 temp_path = tempfile.mkdtemp(dir=self.temp_path) | |
| 1044 else: | |
| 1045 temp_path = tempfile.mkdtemp() | |
| 1046 | |
| 1047 first_ends = [] | |
| 1048 second_ends = [] | |
| 1049 single_ends = [] | |
| 1050 | |
| 1051 if len(self.reads) == 2: | |
| 1052 first, second = self.reads | |
| 1053 first_ends.append(first) | |
| 1054 second_ends.append(second) | |
| 1055 | |
| 1056 elif len(self.reads) == 1: | |
| 1057 single_ends.append(self.reads[0]) | |
| 1058 | |
| 1059 else: | |
| 1060 sys.stdout.write('Invalid number of fastq files; provide either one (single end) or two (paired end)') | |
| 1061 sys.exit(1) | |
| 1062 | |
| 1063 if single_ends and not first_ends and not second_ends: | |
| 1064 reads = [','.join(single_ends)] | |
| 1065 | |
| 1066 elif first_ends and second_ends: | |
| 1067 reads = [','.join(first_ends), ','.join(second_ends)] | |
| 1068 | |
| 1069 | |
| 1070 star_cline = star + ['--runMode', 'alignReads', | |
| 1071 '--genomeDir', self.index_dir, | |
| 1072 '--runThreadN', str(self.threads), | |
| 1073 '--readMatesLengthsIn', 'NotEqual', | |
| 1074 '--outFileNamePrefix', os.path.join( | |
| 1075 temp_path, 'out'), | |
| 1076 '--outSAMmode', 'Full', | |
| 1077 '--outSAMstrandField', 'None', | |
| 1078 '--outSAMattributes', 'Standard', | |
| 1079 '--outSAMunmapped', 'Within', | |
| 1080 '--outStd', 'SAM', | |
| 1081 '--outFilterMultimapNmax', '1000', | |
| 1082 '--outSAMprimaryFlag', 'AllBestScore', | |
| 1083 '--outSAMorder', 'PairedKeepInputOrder'] | |
| 1084 | |
| 1085 if self.unmapped1 or self.unmapped2: | |
| 1086 star_cline += ['--outReadsUnmapped', 'Fastx'] | |
| 1087 else: | |
| 1088 star_cline += ['--outReadsUnmapped', 'None'] | |
| 1089 | |
| 1090 | |
| 1091 if self.zipped: | |
| 1092 star_cline += ['--readFilesCommand', 'zcat'] | |
| 1093 | |
| 1094 if self.sensitive: | |
| 1095 star_cline += ['--outFilterMultimapScoreRange', '10', | |
| 1096 '--outFilterMismatchNmax', '60', | |
| 1097 '--outFilterMismatchNoverLmax', '0.3', | |
| 1098 '--outFilterScoreMin', '0', | |
| 1099 '--outFilterScoreMinOverLread', '0.3', | |
| 1100 '--outFilterMatchNmin', '0', | |
| 1101 '--outFilterMatchNminOverLread', '0.66', | |
| 1102 '--seedSearchStartLmax', '12', | |
| 1103 '--winAnchorMultimapNmax', '50'] | |
| 1104 | |
| 1105 star_cline += ['--readFilesIn'] + reads | |
| 1106 | |
| 1107 if self.debug: | |
| 1108 print ' '.join(star_cline) | |
| 1109 | |
| 1110 # Try if we can make the relevant files | |
| 1111 touch_files = [self.unmapped1, self.unmapped2, self.taxonomy, self.qual, self.hits, self.sam, self.bam] | |
| 1112 for file_path in touch_files: | |
| 1113 if file_path is None or file_path == '': | |
| 1114 continue | |
| 1115 try: | |
| 1116 with file(file_path, 'a'): | |
| 1117 os.utime(file_path, None) | |
| 1118 except IOError: | |
| 1119 sys.stderr.write('Could not write output file %s\n' % file_path) | |
| 1120 sys.exit(1) | |
| 1121 | |
| 1122 star_process = subprocess.Popen(' '.join( | |
| 1123 star_cline), shell=True, stdout=PIPE) | |
| 1124 | |
| 1125 parser = SAMParser() | |
| 1126 | |
| 1127 if self.taxonomy: | |
| 1128 taxonomy = SAMTaxonomy(self.taxonomy) | |
| 1129 | |
| 1130 if self.qual: | |
| 1131 quality = SAMQuality(self.qual) | |
| 1132 | |
| 1133 if self.hits: | |
| 1134 hits = SAMHits(self.hits, self.sample_id, self.hit_filter, | |
| 1135 self.min_mapping_score, | |
| 1136 self.min_alignment_score, | |
| 1137 self.max_mismatches, | |
| 1138 self.max_relative_mismatches, | |
| 1139 self.min_continiously_matching, | |
| 1140 self.filter_complexity) | |
| 1141 | |
| 1142 if self.sam: | |
| 1143 sam_file = open(self.sam, 'w') | |
| 1144 | |
| 1145 if self.bam: | |
| 1146 with open(self.bam, 'wb', buffering=100 * 1024 * 1024) as bam_file: | |
| 1147 samtools_cline = samtools + [ | |
| 1148 'view', '-b', '-1', '-S', '-@', '4', '/dev/stdin'] | |
| 1149 if self.debug: | |
| 1150 print ' '.join(samtools_cline) | |
| 1151 samtools_process = subprocess.Popen(' '.join(samtools_cline), shell=True, stdout=bam_file, stdin=PIPE) | |
| 1152 | |
| 1153 | |
| 1154 do_sam = self.sam | |
| 1155 do_bam = self.bam | |
| 1156 do_taxonomy = self.taxonomy | |
| 1157 do_qual = self.qual | |
| 1158 do_hits = self.hits | |
| 1159 do_parse = do_taxonomy or do_qual or do_hits | |
| 1160 | |
| 1161 for i, line in enumerate(iter(star_process.stdout.readline, '')): | |
| 1162 | |
| 1163 if do_sam: | |
| 1164 sam_file.write(line) | |
| 1165 | |
| 1166 if do_bam: | |
| 1167 samtools_process.stdin.write(line) | |
| 1168 | |
| 1169 if line[0] == '@': | |
| 1170 continue | |
| 1171 | |
| 1172 if do_parse: | |
| 1173 parsed_line = parser.parse(line) | |
| 1174 | |
| 1175 if do_taxonomy: | |
| 1176 taxonomy.count(parsed_line) | |
| 1177 if i > 0 and (i % 50000) == 0: | |
| 1178 print ''.join(taxonomy.get_summary(10)) | |
| 1179 | |
| 1180 if do_qual: | |
| 1181 quality.count(parsed_line) | |
| 1182 | |
| 1183 if do_hits: | |
| 1184 hits.count(parsed_line) | |
| 1185 | |
| 1186 if do_bam: | |
| 1187 samtools_process.stdin.close() | |
| 1188 | |
| 1189 if do_sam: | |
| 1190 sam_file.close() | |
| 1191 | |
| 1192 if do_hits: | |
| 1193 hits.write() | |
| 1194 | |
| 1195 if do_taxonomy: | |
| 1196 print ''.join(taxonomy.get_summary(10)) | |
| 1197 taxonomy.write() | |
| 1198 | |
| 1199 if do_qual: | |
| 1200 quality.write() | |
| 1201 | |
| 1202 try: | |
| 1203 if self.unmapped1: | |
| 1204 shutil.move(os.path.join(temp_path, 'out' + 'Unmapped.out.mate1'),\ | |
| 1205 self.unmapped1) | |
| 1206 except IOError: | |
| 1207 pass | |
| 1208 | |
| 1209 try: | |
| 1210 if self.unmapped2: | |
| 1211 shutil.move(os.path.join(temp_path, 'out' + 'Unmapped.out.mate2'),\ | |
| 1212 self.unmapped2) | |
| 1213 except IOError: | |
| 1214 pass | |
| 1215 | |
| 1216 try: | |
| 1217 if self.chimeric_mappings: | |
| 1218 shutil.move(os.path.join(temp_path, 'out' + 'Chimeric.out.sam'),\ | |
| 1219 self.chimeric_mappings) | |
| 1220 except IOError: | |
| 1221 pass | |
| 1222 | |
| 1223 shutil.rmtree(temp_path) | |
| 1224 | |
| 1225 @CLI.subcommand("dnamap") | |
| 1226 class DNAmap(cli.Application): | |
| 1227 """ Map input reads against a BWA index """ | |
| 1228 | |
| 1229 index_dir = cli.SwitchAttr(['-i', '--index_dir'], str, mandatory=True, | |
| 1230 help="Sets the index output directory") | |
| 1231 | |
| 1232 threads = cli.SwitchAttr( | |
| 1233 ['-t', '--threads'], cli.Range(1, 512), mandatory=False, | |
| 1234 help="Sets the number of threads to use", | |
| 1235 default=1) | |
| 1236 | |
| 1237 taxonomy = cli.SwitchAttr( | |
| 1238 ['-x', '--taxonomy'], str, mandatory=False, | |
| 1239 help="Output path for the taxonomy file; setting this option will also enable regular taxonomy output to stdout during mapping", | |
| 1240 default='') | |
| 1241 | |
| 1242 samtools_path = cli.SwitchAttr(['--samtools_path'], str, mandatory=False, | |
| 1243 help="Path to samtools executable", | |
| 1244 default='') | |
| 1245 | |
| 1246 temp_path = cli.SwitchAttr(['--temporary_path'], str, mandatory=False, | |
| 1247 help="Path to temporary directory in which to generate temp files. All temp files with be automatically deleted after execution is complete.", | |
| 1248 default='') | |
| 1249 | |
| 1250 min_mapping_score = cli.SwitchAttr(['--min_mapping_score'], cli.Range(1, 255), mandatory=False, | |
| 1251 help="Mimimum mapping score for saved hits (only applied to -v/--virana_hits)", | |
| 1252 default=None) | |
| 1253 | |
| 1254 min_alignment_score = cli.SwitchAttr(['--min_alignment_score'], cli.Range(1, 255), mandatory=False, | |
| 1255 help="Mimimum alignment score for saved hits (only applied to -v/--virana_hits)", | |
| 1256 default=None) | |
| 1257 | |
| 1258 max_mismatches = cli.SwitchAttr(['--max_mismatches'], cli.Range(0, 10000000), mandatory=False, | |
| 1259 help="Maximum number of mismatches for saved hits (only applied to -v/--virana_hits)", | |
| 1260 default=None) | |
| 1261 | |
| 1262 max_relative_mismatches = cli.SwitchAttr(['--max_relative_mismatches'], float, mandatory=False, | |
| 1263 help="Maximum number of mismatches relative to read length for saved hits (only applied to -v/--virana_hits)", | |
| 1264 default=None) | |
| 1265 | |
| 1266 min_continiously_matching = cli.SwitchAttr(['--min_continiously_matching'], cli.Range(0, 10000000), mandatory=False, | |
| 1267 help="Minimum number of continious matches for saved hits (only applied to -v/--virana_hits)", | |
| 1268 default=None) | |
| 1269 | |
| 1270 filter_complexity = cli.Flag(['--filter_complexity'], | |
| 1271 help="Discard low-complexity reads (only applied to -v/--virana_hits). Adds some extra processing load to the mapping and may discard important information. Applies to all output files, including quality files (!)", | |
| 1272 default=False) | |
| 1273 | |
| 1274 sample_id = cli.SwitchAttr(['--sample_id'], str, mandatory=False, | |
| 1275 help="Alphanumeric string ([0-9a-zA-Z_-]*) used to designate sample information within the hit file", | |
| 1276 default='no_sample_id') | |
| 1277 | |
| 1278 bam = cli.SwitchAttr(['-b', '--bam'], str, mandatory=False, | |
| 1279 help="Path to unsorted, unindexed output BAM file", | |
| 1280 default='') | |
| 1281 | |
| 1282 sam = cli.SwitchAttr(['-s', '--sam'], str, mandatory=False, | |
| 1283 help="Path to output SAM file", | |
| 1284 default='') | |
| 1285 | |
| 1286 qual = cli.SwitchAttr(['-q', '--qual'], str, mandatory=False, | |
| 1287 help="Path to output quality file", | |
| 1288 default='') | |
| 1289 | |
| 1290 hits = cli.SwitchAttr(['-v', '--virana_hits'], str, mandatory=False, | |
| 1291 help="Path to bzip2-compressed tab-delimited output virana hit file", | |
| 1292 default='') | |
| 1293 | |
| 1294 hit_filter = cli.SwitchAttr( | |
| 1295 ['-f', '--virana_hit_filter'], str, list=True, mandatory=False, | |
| 1296 help="Only generate hit groups that include at last one read mapping to a reference of this reference group.", | |
| 1297 default=[]) | |
| 1298 | |
| 1299 debug = cli.Flag(["-d", "--debug"], help="Enable debug information") | |
| 1300 | |
| 1301 zipped = cli.Flag(["-z", "--zipped"], help="Input reads are zipped") | |
| 1302 | |
| 1303 sensitive = cli.Flag( | |
| 1304 ["--sensitive"], help="If given, mapping will process slower and more sensitive") | |
| 1305 | |
| 1306 | |
| 1307 bwa_path = cli.SwitchAttr(['--bwa_path'], str, mandatory=False, | |
| 1308 help="Path to BWA executable", | |
| 1309 default='') | |
| 1310 | |
| 1311 debug = cli.Flag(["-d", "--debug"], help="Enable debug information") | |
| 1312 | |
| 1313 if debug: | |
| 1314 logging.getLogger().setLevel(logging.DEBUG) | |
| 1315 | |
| 1316 reads = cli.SwitchAttr( | |
| 1317 ['-r', '--reads'], str, list=True, mandatory=True, | |
| 1318 help="Sets the input reads. Add this parameter twice for paired end reads.") | |
| 1319 | |
| 1320 def main(self): | |
| 1321 | |
| 1322 if self.debug: | |
| 1323 logging.getLogger().setLevel(logging.DEBUG) | |
| 1324 | |
| 1325 # Obtain star executable | |
| 1326 bwa = [self.bwa_path and self.bwa_path or 'bwa'] | |
| 1327 samtools = [self.samtools_path and self.samtools_path or 'samtools'] | |
| 1328 | |
| 1329 # Check if genome directory is existing | |
| 1330 if not os.path.exists(self.index_dir): | |
| 1331 sys.stdout.write('Index directory %s not existing, exiting'\ | |
| 1332 % self.genome_dir) | |
| 1333 sys.exit(1) | |
| 1334 | |
| 1335 if len(self.reads) not in (1, 2): | |
| 1336 message = 'Invalid number of FASTQ files; supply either one (single end) or two (paired end)\n' | |
| 1337 sys.stderr.write(message) | |
| 1338 sys.exit(1) | |
| 1339 | |
| 1340 bwa_cline = bwa + ['mem', '-t', str(self.threads), '-M', os.path.join(self.index_dir, 'index')] | |
| 1341 | |
| 1342 bwa_cline += self.reads | |
| 1343 | |
| 1344 if self.debug: | |
| 1345 print ' '.join(bwa_cline) | |
| 1346 | |
| 1347 bwa_process = subprocess.Popen(' '.join(bwa_cline), shell=True, stdout=PIPE) | |
| 1348 | |
| 1349 parser = SAMParser() | |
| 1350 | |
| 1351 if self.taxonomy: | |
| 1352 taxonomy = SAMTaxonomy(self.taxonomy) | |
| 1353 | |
| 1354 if self.qual: | |
| 1355 quality = SAMQuality(self.qual) | |
| 1356 | |
| 1357 if self.hits: | |
| 1358 hits = SAMHits(self.hits, self.sample_id, self.hit_filter, | |
| 1359 self.min_mapping_score, | |
| 1360 self.min_alignment_score, | |
| 1361 self.max_mismatches, | |
| 1362 self.max_relative_mismatches, | |
| 1363 self.min_continiously_matching, | |
| 1364 self.filter_complexity) | |
| 1365 | |
| 1366 if self.sam: | |
| 1367 sam_file = open(self.sam, 'w', buffering=100 * 1024 * 1024) | |
| 1368 | |
| 1369 if self.bam: | |
| 1370 with open(self.bam, 'wb', buffering=100 * 1024 * 1024) as bam_file: | |
| 1371 samtools_cline = samtools + [ | |
| 1372 'view', '-b', '-1', '-S', '-@', '4', '/dev/stdin'] | |
| 1373 if self.debug: | |
| 1374 print ' '.join(samtools_cline) | |
| 1375 samtools_process = subprocess.Popen(' '.join(samtools_cline), shell=True, stdout=bam_file, stdin=PIPE) | |
| 1376 | |
| 1377 do_sam = self.sam | |
| 1378 do_bam = self.bam | |
| 1379 do_taxonomy = self.taxonomy | |
| 1380 do_qual = self.qual | |
| 1381 do_hits = self.hits | |
| 1382 do_parse = do_taxonomy or do_qual or do_hits | |
| 1383 | |
| 1384 for i, line in enumerate(iter(bwa_process.stdout.readline, '')): | |
| 1385 | |
| 1386 if do_sam: | |
| 1387 sam_file.write(line) | |
| 1388 | |
| 1389 if do_bam: | |
| 1390 samtools_process.stdin.write(line) | |
| 1391 | |
| 1392 if do_parse: | |
| 1393 parsed_line = parser.parse(line) | |
| 1394 | |
| 1395 if do_taxonomy: | |
| 1396 taxonomy.count(parsed_line) | |
| 1397 | |
| 1398 if i > 0 and (i % 10000) == 0: | |
| 1399 print ''.join(taxonomy.get_summary(10)) | |
| 1400 | |
| 1401 if do_qual: | |
| 1402 quality.count(parsed_line) | |
| 1403 | |
| 1404 if do_hits: | |
| 1405 hits.count(parsed_line) | |
| 1406 | |
| 1407 if do_bam: | |
| 1408 samtools_process.stdin.close() | |
| 1409 | |
| 1410 if do_sam: | |
| 1411 sam_file.close() | |
| 1412 | |
| 1413 if do_hits: | |
| 1414 hits.write() | |
| 1415 | |
| 1416 if do_taxonomy: | |
| 1417 print ''.join(taxonomy.get_summary(10)) | |
| 1418 taxonomy.write() | |
| 1419 | |
| 1420 if do_qual: | |
| 1421 quality.write() | |
| 1422 | |
| 1423 | |
| 1424 if __name__ == "__main__": | |
| 1425 CLI.run() |
