#!/usr/bin/env python
#from __future__ import print_function

import cProfile

import sys
import re

import tempfile
import subprocess
import shutil
import os
import os.path
import logging
import bz2
import zlib

import math
import string

from collections import defaultdict, Counter

from subprocess import PIPE

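# Characters to strip from sample identifiers: everything that is not
# alphanumeric, '_', or '-' (used with str.translate below)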
NON_ID = ''.join(c for c in map(chr, range(256)) if not c.isalnum())
NON_ID = NON_ID.replace('_', '').replace('-', '')

try:
    from Bio.SeqRecord import SeqRecord
    from Bio import SeqIO
    from Bio.Seq import Seq
except ImportError:
    message = 'This script requires the BioPython python package\n'
    sys.stderr.write(message)
    sys.exit(1)

try:
    from plumbum import cli
except ImportError:
    message = 'This script requires the plumbum python package\n'
    sys.stderr.write(message)
    sys.exit(1)


try:
    import HTSeq
except ImportError:
    message = 'This script requires the HTSeq python package\n'
    sys.stderr.write(message)
    sys.exit(1)

KHMER_AVAILABLE = True
try:
    import khmer
except ImportError:
    KHMER_AVAILABLE = False

#from io import BufferedRandom

logging.basicConfig(level=logging.INFO, format='%(message)s')


def profile_this(fn):
    def profiled_fn(*args, **kwargs):
        fpath = fn.__name__ + ".profile"
        prof = cProfile.Profile()
        ret = prof.runcall(fn, *args, **kwargs)
        prof.dump_stats(fpath)
        return ret
    return profiled_fn


class CLI(cli.Application):
    """RNA-Seq and DNA-Seq short read analysis by mapping to known reference sequences"""
    PROGNAME = "vmap"
    VERSION = "1.0.0"
    DESCRIPTION = """Virana vmap is an interface to the NCBI and Ensembl reference databases that can
                     generate reference indexes for the short read mappers STAR (RNA-Seq) and
                     BWA-MEM (DNA-Seq). Short reads can be mapped to arbitrary combinations of
                     reference databases and the results can be summarized by taxonomic family
                     as well as stored as SAM file, unsorted BAM file, or as a HIT file that
                     models multimapping reads between specific reference databases."""
    USAGE = """The program has four modes that can be accessed by `vmap rnaindex`, `vmap dnaindex`, `vmap rnamap`, and `vmap dnamap`."""

    def main(self, *args):

        if args:
            print("Unknown command %r" % (args[0]))
            print self.USAGE
            return 1

        if not self.nested_command:
            print("No command given")
            print self.USAGE
            return 1


@CLI.subcommand("rnaindex")
class RNAIndex(cli.Application):
    """ Creates a STAR index from a FASTA genome reference """

    reference_files = cli.SwitchAttr(
        ['-r', '--reference_file'], str, list=True, mandatory=True,
        help="Sets the reference genome(s) FASTA file." +
        " Multiple occurrences of this parameter are allowed.")
    index_dir = cli.SwitchAttr(['-i', '--index_dir'], str, mandatory=True,
                               help="Sets the index output directory." +
                               " The directory will be created if it does not exist" +
                               " and will be filled with several index files.")
    threads = cli.SwitchAttr(
        ['-t', '--threads'], cli.Range(1, 512), mandatory=False,
        help="Sets the number of threads to use",
        default=1)
    max_ram = cli.SwitchAttr(
        ['-m'], cli.Range(1, 400000000000), mandatory=False,
        help="Sets the maximum amount of memory (RAM) to use (in bytes)",
        default=400000000000)
    path = cli.SwitchAttr(['-p', '--path'], str, mandatory=False,
                          help="Path to STAR executable",
                          default='')
    sparse = cli.Flag(
        ["-s", "--sparse"], help="If given, a sparse index that requires less" +
        " RAM in the mapping phase will be constructed")

    debug = cli.Flag(["-d", "--debug"], help="Enable debug output")

    def main(self):

        if self.debug:
            logging.getLogger().setLevel(logging.DEBUG)

        # Obtain STAR executable
        star = [self.path and self.path or 'STAR']

        # Check that all reference files exist
        for reference_file in self.reference_files:
            if not os.path.exists(reference_file):
                sys.stderr.write(
                    'Reference file %s not existing, exiting\n' % reference_file)
                sys.exit(1)

        # Make output directory if it does not exist
        if not os.path.exists(self.index_dir):
            logging.debug(
                'Making output directory for index at %s' % self.index_dir)
            os.makedirs(self.index_dir)

        # # Make named pipe to extract genomes
        # pipe_path = os.path.abspath(os.path.join(self.genome_dir, 'pipe.fa'))
        # if os.path.exists(pipe_path):
        #     os.unlink(pipe_path)
        # os.mkfifo(pipe_path)

        # Make STAR command line
        cline = star + ['--runMode', 'genomeGenerate',
                        '--genomeDir', self.index_dir,
                        '--limitGenomeGenerateRAM', str(self.max_ram),
                        '--runThreadN', str(self.threads),
                        '--genomeFastaFiles'] + self.reference_files

        # Add parameters for sparse (memory-saving) index generation
        if self.sparse:
            cline += ['--genomeSAsparseD', '2',
                      '--genomeChrBinNbits', '12',
                      '--genomeSAindexNbases', '13']

        else:
            cline += ['--genomeSAsparseD', '1',
                      '--genomeChrBinNbits', '18',
                      '--genomeSAindexNbases', '15']

        if self.debug:
            print ' '.join(cline)

        # Run STAR reference generation process
        star_process = subprocess.Popen(' '.join(cline), shell=True, stdout=PIPE, stderr=PIPE)

        # Block until streams are closed by the process
        stdout, stderr = star_process.communicate()

        if stderr:
            sys.stderr.write(stderr)

        if self.debug and stdout:
            print stdout


@CLI.subcommand("dnaindex")
class DNAIndex(cli.Application):
    """ Creates a BWA index from a FASTA reference file """

    reference_file  = cli.SwitchAttr(['-r', '--reference_file'], str, mandatory=True,
                                 help="Sets the input reference FASTA file.")
    index_dir   = cli.SwitchAttr(['-i', '--index_dir'], str, mandatory=True,
                                 help="Sets the index output directory." +
                                 " The directory will be created if it does not exist" +
                                 " and will be filled with several index files.")
    path        = cli.SwitchAttr(['-p', '--path'], str, mandatory=False,
                                 help="Path to BWA executable",
                                 default='')
    debug       = cli.Flag(["-d", "--debug"], help="Enable debug output")

    def main(self):

        if self.debug:
            logging.getLogger().setLevel(logging.DEBUG)

        # Obtain BWA executable
        bwa = [self.path and self.path or 'bwa']

        # Check that the reference file exists
        if not os.path.exists(self.reference_file):
            sys.stderr.write('Reference file %s not existing, exiting\n' % self.reference_file)
            sys.exit(1)

        # Make output directory if it does not exist
        if not os.path.exists(self.index_dir):
            logging.debug('Making output directory %s' % self.index_dir)
            os.makedirs(self.index_dir)

        # Make BWA command line
        cline = bwa + ['index', '-a', 'bwtsw', '-p', os.path.join(self.index_dir, 'index'), self.reference_file]

        if self.debug:
            print ' '.join(cline)

        # Run BWA index generation process
        bwa_process = subprocess.Popen(' '.join(cline), shell=True, stdout=PIPE, stderr=PIPE)
        stdout, stderr = bwa_process.communicate()

        if stderr:
            sys.stderr.write(stderr)

        if self.debug and stdout:
            print stdout


class SAMHits:
    """ Converts SAM output of mappers into bzipped HIT files. """

    def __init__(self, output_file, sample_id, refseq_filter=None, min_mapping_score=None,\
                     min_alignment_score=None, max_mismatches=None,\
                     max_relative_mismatches=None, min_continiously_matching=None,\
                     filter_complexity=False):

        self.output_file                = bz2.BZ2File(output_file, 'wb', buffering=100 * 1024 * 1024)
        self.sample_id                  = sample_id.translate(None, NON_ID)
        self.refseq_filter              = refseq_filter
        self.max_mismatches             = max_mismatches
        self.max_relative_mismatches    = max_relative_mismatches
        self.current_group              = []
        self.min_mapping_score          = min_mapping_score
        self.min_alignment_score        = min_alignment_score
        self.min_continiously_matching  = min_continiously_matching
        self.filter_complexity          = filter_complexity

        self.re_matches = re.compile(r'(\d+)M')
        self.re_dels    = re.compile(r'(\d+)D')

    def count(self, parsed_line):

        if parsed_line is None:
            return

        read_key, read_name, flag, ref_name, ref_position, mapping_score,\
                    cigar, mate_ref_name, mate_ref_position, insert_size, seq, qual,\
                    is_end1, is_end2, number_mismatches, alignment_score,\
                    number_hits, is_reverse, is_primary, is_mapped, is_mate_mapped,\
                    is_paired, number_matches, read_end_pos, max_match = parsed_line

        if not is_mapped:
            return

        if self.filter_complexity:

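            # Low-complexity filter: discard reads whose zlib-compressed length
            # is less than half of the raw sequence length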
            avg_compression = float(len(zlib.compress(seq)))/len(seq)

            if avg_compression < 0.5:
                return

            # length = len(seq)
            # counts = [seq.count(nuc) for nuc in 'ACGT']
            # min_count = length * 0.10
            # max_count = length * 0.50
            # for count in counts:
            #     if count < min_count or count > max_count:
            #         return None

            # counter = Counter()
            # for i in range(length - 2):
            #     counter[seq[i: i + 3]] += 1
            # maximal = length - 4

            # highest = sum([v for k, v in counter.most_common(2)])
            # if highest > (maximal / 3.0):
            #     return None

            # self.passed.append(avg_compression)

        pair_id = ''
        if is_end1:
            pair_id = '/1'

        elif is_end2:
            pair_id = '/2'

        read_name = self.sample_id + ';' + read_name + pair_id

        # Initialize new current group
        if len(self.current_group) == 0:
            self.current_group = [read_name, seq, []]

        # Write old current group to file
        if read_name != self.current_group[0]:
            self._write_group()
            self.current_group = [read_name, seq, []]

        try:
            refseq_group, family, organism, identifier = ref_name.split(';')[:4]
        except ValueError:
            sys.stderr.write('Read mapped to malformed reference sequence %s, skipping\n' % ref_name)
            return

        if self.min_continiously_matching:

            if self.min_continiously_matching > max_match:
                return

        if self.max_mismatches\
            and int(number_mismatches) > self.max_mismatches:
            return

        if self.max_relative_mismatches\
            and int(number_mismatches) / float(len(seq))\
            > self.max_relative_mismatches:
            return

        if self.min_mapping_score\
            and self.min_mapping_score > mapping_score:
            return

        if self.min_alignment_score\
            and self.min_alignment_score > alignment_score:
            return

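        # HTSeq interval starts are 0-based; convert to a 1-based start coordinate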
        start = int(ref_position) + 1

        self.current_group[2].append([refseq_group, family, organism, identifier, str(start), str(read_end_pos)])

    def _write_group(self):
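        """ Writes the current read group as a FASTA record whose description
            lists all hits of the read (refseq_group;family;organism;identifier;start;end
            entries joined by '|'), optionally restricted by the reference group filter. """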
        passed = True

        if self.refseq_filter:
            passed = False
            for refseq_group, family, organism, identifier, start, end in self.current_group[2]:
                if passed:
                    break
                for f in self.refseq_filter:
                    if refseq_group == f:
                        passed = True
                        break
        if passed:
            description = []
            for identifier in self.current_group[2]:
                description.append(';'.join(identifier))
            description = '|'.join(description)

            record = SeqRecord(Seq(self.current_group[1]))
            record.id = 'Read;' + self.current_group[0]
            record.description = description

            SeqIO.write([record], self.output_file, "fasta")

    def write(self):

        self._write_group()
        self.output_file.close()

class SAMParser:

    def parse(self, line):
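        """ Parses a single SAM line into a flat tuple of fields and derived
            flags; header lines (starting with '@') yield None. """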

        if line[0] == '@':
            return None

        alignment   = HTSeq._HTSeq.SAM_Alignment.from_SAM_line(line)
        read_name   = alignment.read.name
        seq         = alignment.read.seq
        qual        = alignment.read.qual
        flag        = alignment.flag
        cigar       = None

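        # Decode the SAM FLAG bitfield: 0x1 paired, 0x4 unmapped, 0x10 reverse
        # strand, 0x40 first in pair, 0x80 second in pair, 0x100 secondary alignment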
        is_paired       = (flag & 1)
        is_mapped       = not (flag & 4)
        is_mate_mapped  = alignment.mate_aligned is not None #not (flag & 8)
        is_reverse      = (flag & 16)
        is_end1         = (flag & 64)
        is_end2         = (flag & 128)
        is_primary      = not (flag & 256)

        read_key = (read_name, is_end1)

        ref_name            = None
        ref_position        = None
        mapping_score       = 0
        mate_ref_name       = None
        mate_ref_position   = None
        insert_size         = None
        alignment_score     = 0
        read_end_pos        = None

        if is_mate_mapped and alignment.mate_start:
            mate_ref_name       = alignment.mate_start.chrom
            mate_ref_position   = alignment.mate_start.start

        number_hits         = 0
        alignment_score     = 0
        number_mismatches   = 0

        number_matches      = 0
        max_match           = 0

        if is_mapped:

            ref_name            = alignment.iv.chrom
            ref_position        = alignment.iv.start
            read_end_pos        = alignment.iv.end
            mapping_score       = alignment.aQual  # aQual holds the SAM MAPQ value
            cigar               = alignment.cigar

            if is_mate_mapped:
                insert_size         = alignment.inferred_insert_size

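            # Sum matched ('M') CIGAR operations and track the longest
            # contiguous stretch of matches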
            for c in cigar:
                if c.type == 'M':
                    number_matches += c.size
                    max_match = max(max_match, c.size)

            # NM is the edit distance (mismatches); NH is the number of reported alignments
            for tag, value in alignment.optional_fields:
                if tag == 'NM':
                    number_mismatches = value
                elif tag == 'AS':
                    alignment_score = value
                elif tag == 'NH':
                    number_hits = value

        return read_key, read_name, flag, ref_name, ref_position, mapping_score,\
            cigar, mate_ref_name, mate_ref_position, insert_size, seq, qual,\
            is_end1, is_end2, number_mismatches, alignment_score,\
            number_hits, is_reverse, is_primary, is_mapped, is_mate_mapped,\
            is_paired, number_matches, read_end_pos, max_match


class SAMQuality:
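    """ Collects per-segment and per-read mapping statistics from a SAM stream
        and writes a tab-delimited summary file. """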

    def __init__(self, file_path):

        self.file_path          = file_path

        self.stored             = defaultdict(Counter)
        self.all_references     = defaultdict(int)
        self.primary_references = defaultdict(int)
        self.complement         = string.maketrans('ATCGN', 'TAGCN')

        if KHMER_AVAILABLE:
            self.ktable = khmer.new_ktable(10)

    def _get_complement(self, sequence):

        return sequence.translate(self.complement)[::-1]

    def _get_summary(self, counter):
        """ Returns summary statistics (count, sum, min, max, mode, mean, and std)
            for a frequency counter """

        maximum  = 0
        minimum  = sys.maxint
        thesum   = 0
        allcount = 0
        mode     = [0, None]

        items       = 0.0
        mean        = 0.0
        m2          = 0.0
        variance    = 0.0

        for item in counter:

            count   = counter[item]
            if count > mode[0]:
                mode = [count, item]

            allcount += count
            maximum = max(maximum, item)
            minimum = min(minimum, item)
            thesum  += (count * item)

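            # Welford-style online update of mean and variance, applied once
            # per occurrence of this item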
            x = 1
            while x <= count:
                items       += 1
                delta       = item - mean
                mean        = mean + delta / items
                m2          = m2 + delta * (item - mean)
                variance    = m2 / items
                x += 1

        std = math.sqrt(variance)

        return allcount, thesum, minimum, maximum, mode[1], mean, std


    def _to_unit(self, item, is_percentage=False):
        """ Convert a numeric value to a string with metric units """

        if is_percentage:
            return ('%-.3f' % (item * 100)) + '%'
        converted = None
        try:
            item = float(item)
            if item > 10**12:
                converted = str(round(item / 10**12,3))+'T'
            elif item > 10**9:
                converted = str(round(item / 10**9,3))+'G'
            elif item > 10**6:
                converted = str(round(item / 10**6,3))+'M'
            elif item > 10**3:
                converted = str(round(item / 10**3,3))+'K'
            else:
                converted = str(round(item,3))
        except (TypeError, ValueError):
            converted = str(item)

        return converted

    def _str_metrics(self, data):

        str_metrics = []

        for (item, metric) in sorted(data.keys()):
            counter = data[(item, metric)]
            if not hasattr(counter.iterkeys().next(), 'real'):
                for element, count in counter.most_common():
                    str_metrics.append(self._str_metric(item, metric, element, count, no_units=True))
            else:
                summary = self._get_summary(counter)
                str_metrics.append(self._str_metric(item, metric, *summary))

        return str_metrics

    def _str_metric(self, item, metric, count, thesum='', minimum='',\
                        maximum='', mode='', mean='', std='', no_units=False):

        counters = [count, thesum, minimum, maximum, mode, mean, std]
        counters = map(str, counters)

        if no_units:
            items = [item, metric] + counters
        else:
            units = map(self._to_unit, counters)
            items = [item, metric] + units

        return '%-15s\t%-60s\t%12s\t%12s\t%12s\t%12s\t%12s\t%12s\t%12s\n' \
                    % tuple(items)


    def _count_read(self, metric, data, sample):

        item = 'read'

        (insert_size, alignment_score, mapping_score, length,\
            q20_length, avg_phred_quality, number_hits, is_reverse) = data

        self.stored[(item, metric + ' mappings')][number_hits] += sample
        self.stored[(item, metric + ' insert')][insert_size] += sample


    def _count_segment(self, metric, data, sample):

        item = 'segment'

        (insert_size, alignment_score, mapping_score, length,\
            q20_length, avg_phred_quality, number_hits, is_reverse) = data

        self.stored[(item, metric + ' algq')][alignment_score] += sample
        self.stored[(item, metric + ' mapq')][mapping_score] += sample
        self.stored[(item, metric + ' length')][length] += sample
        self.stored[(item, metric + ' q20length')][q20_length] += sample
        self.stored[(item, metric + ' meanbasequal')][avg_phred_quality] += sample
        self.stored[(item, metric + ' reverse')][is_reverse] += sample

    def count(self, parsed_line):

        if parsed_line is None:
            return

        #print_metric('Item' , 'Metric', 'Count', 'Sum', 'Min', 'Max', 'Mode', 'Mean', 'STD')

        read_key, read_name, flag, ref_name, ref_position, mapping_score,\
                    cigar, mate_ref_name, mate_ref_position, insert_size, seq, qual,\
                    is_end1, is_end2, number_mismatches, alignment_score,\
                    number_hits, is_reverse, is_primary, is_mapped, is_mate_mapped,\
                    is_paired, number_matches, read_end_pos, max_match = parsed_line

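        # Derive per-segment statistics and pairing/mapping categories used to
        # bin the counts below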
        phred_quality       = [q - 33 for q in qual]
        avg_phred_quality   = sum(phred_quality) / float(len(phred_quality))
        length              = len(seq)
        mate_reference_id   = mate_ref_name
        reference_id        = ref_name
        reference           = reference_id is not None and reference_id != '*'
        insert_size         = insert_size and abs(insert_size) or insert_size
        is_segment1         = not is_paired or (is_paired and is_end1)
        is_reverse          = is_reverse
        is_unique           = is_primary and number_hits == 1
        is_translocation    = is_paired and is_mapped and is_mate_mapped\
                                    and (mate_reference_id != '=' and reference_id != mate_reference_id)
        is_part_of_doublemap    = is_paired and is_mapped and is_mate_mapped
        is_part_of_halfmap      = is_paired and (is_mapped != is_mate_mapped)
        is_part_of_nomap        = is_paired and not is_mapped and not is_mate_mapped


        # Count length until first low quality base call
        q20_length = 0
        for q in phred_quality:
            if q < 20:
                break
            q20_length += 1

        # Count kmers
        if KHMER_AVAILABLE:
            if not is_reverse:
                self.ktable.consume(seq)
            else:
                self.ktable.consume(self._get_complement(seq))

        if reference:

            self.all_references[reference_id] += 1

            if is_primary:
                self.primary_references[reference_id] += 1


        data   = (insert_size, alignment_score, mapping_score, length,\
                        q20_length, avg_phred_quality, number_hits, is_reverse)

        sample = 1

        self._count_segment('sequenced', data, sample)
        if is_mapped:
            self._count_segment('sequenced mapped multi', data, sample)
            if is_primary:
                self._count_segment('sequenced mapped primary', data, sample)
                if number_hits and is_unique:
                    self._count_segment('sequenced mapped primary unique', data, sample)

            if is_segment1:
                self._count_read('sequenced mapped multi', data, sample)
                if is_primary:
                    self._count_read('sequenced mapped primary', data, sample)

        if is_paired:
            self._count_segment('sequenced paired', data, sample)

            if is_part_of_doublemap:
                self._count_segment('sequenced paired doublemap', data, sample)

                if is_primary:
                    self._count_segment('sequenced paired doublemap primary', data, sample)

                if is_segment1:
                    self._count_read('sequenced paired doublemap multi', data, sample)

                    if is_primary:
                        self._count_read('sequenced paired doublemap primary', data, sample)

                        if number_hits and is_unique:
                            self._count_read('sequenced paired doublemap primary unique', data, sample)

                            if is_translocation:
                                self._count_read('sequenced paired doublemap primary unique translocation', data, sample)

            elif is_part_of_halfmap:

                self._count_segment('sequenced paired halfmap', data, sample)

                # The mapped segment
                if is_mapped:

                    self._count_segment('sequenced paired halfmap mapped', data, sample)

                    if is_primary:

                        self._count_read('sequenced paired halfmap mapped primary', data, sample)

                        if number_hits and is_unique:
                            self._count_read('sequenced paired halfmap mapped primary unique', data, sample)

                    elif not is_primary:

                        self._count_read('sequenced unpaired mapped multi', data, sample)

                # The unmapped segment
                elif not is_mapped:
                    self._count_segment('sequenced paired halfmap unmapped', data, sample)

            elif is_part_of_nomap:

                self._count_segment('sequenced paired nomap', data, sample)

                if is_segment1:

                    self._count_read('sequenced paired nomap', data, sample)

        elif not is_paired:

            self._count_segment('sequenced unpaired', data, sample)

            if is_mapped:

                self._count_segment('sequenced unpaired mapped', data, sample)

                if is_primary:

                    self._count_read('sequenced unpaired mapped primary', data, sample)

                    if number_hits and is_unique:

                        self._count_read('sequenced unpaired mapped primary unique', data, sample)

                elif not is_primary:

                    self._count_read('sequenced unpaired mapped multi', data, sample)


            elif not is_mapped:

                self._count_segment('sequenced unpaired unmapped', data, sample)

                if is_segment1:
                    self._count_read('sequenced unpaired unmapped', data, sample)

    def write(self):

        with open(self.file_path, 'w') as output_file:

            all_references = sorted([(count, reference) for reference, count\
                                    in self.all_references.iteritems()], reverse=True)

            for j, (count, reference) in enumerate(all_references[:30]):
                self.stored[('segment', 'multireference_' + str(j+1))][reference] = count

            primary_references = sorted([(count, reference) for reference, count\
                                            in self.primary_references.iteritems()], reverse=True)

            for j, (count, reference) in enumerate(primary_references[:30]):
                self.stored[('segment', 'primaryreference_' + str(j+1))][reference] = count

            # Extract top-ranking kmers
            if KHMER_AVAILABLE:
                kmer_frequencies = []
                for i in range(0, self.ktable.n_entries()):
                    n = self.ktable.get(i)
                    if n > 0:
                        kmer_frequencies.append((n, self.ktable.reverse_hash(i)))
                kmer_frequencies = sorted(kmer_frequencies, reverse=True)
                for j, (frequency, kmer) in enumerate(kmer_frequencies[:10]):
                    self.stored[('segment', 'kmer_' + str(j+1))][kmer] = frequency

            output_file.writelines(self._str_metrics(self.stored))


class SAMTaxonomy:
    """ Provides taxonomic summary information from a SAM file stream. """

    def __init__(self, file_path):

        self.file_path = file_path

        self.count_primaries = Counter()
        self.detailed_information = {}

        self._last_read                 = (None, None)
        self._last_read_human_prim      = 0
        self._last_read_human_sec       = 0
        self._last_organisms            = set()

    def count(self, parsed_line):

        if parsed_line is None:
            return

        read_key, read_name, flag, ref_name, ref_position, mapping_score,\
                    cigar, mate_ref_name, mate_ref_position, insert_size, seq, qual,\
                    is_end1, is_end2, number_mismatches, alignment_score,\
                    number_hits, is_reverse, is_primary, is_mapped, is_mate_mapped,\
                    is_paired, number_matches, read_end_pos, max_match = parsed_line

        if is_mapped:

            refseq_group, family, organism, gi = ref_name.split(';')[:4]

            if is_primary:
                self.count_primaries[organism] += 1

            if organism not in self.detailed_information:
                # refseq_group, family, gis, avg_mapping_score,
                # avg_seq_length, avg_number_hits, avg_alignment_score, avg_nr_mismatches
                initial = [refseq_group,
                           family,
                           set([gi]),
                           [int(mapping_score), 1],
                           [len(seq), 1],
                           [0, 0],
                           [alignment_score, 1],
                           [number_mismatches, 1],
                           0,
                           0]

                self.detailed_information[organism] = initial

            else:
                entry = self.detailed_information[organism]
                entry[2].add(gi)
                entry[3][0] += int(mapping_score)
                entry[3][1] += 1
                entry[4][0] += len(seq)
                entry[4][1] += 1
                entry[6][0] += alignment_score
                entry[6][1] += 1
                entry[7][0] += number_mismatches
                entry[7][1] += 1

            if is_primary:
                entry = self.detailed_information[organism]
                entry[5][0] += number_hits
                entry[5][1] += 1

            if self._last_read == (None, None):
                self._last_read = read_key

            if self._last_read != read_key:

                for last_organism in self._last_organisms:

                    self.detailed_information[last_organism][8]\
                        += self._last_read_human_prim

                    self.detailed_information[last_organism][9]\
                        += self._last_read_human_sec

                self._last_read = read_key
                self._last_organisms = set()
                self._last_read_human_prim  = 0
                self._last_read_human_sec   = 0

            self._last_organisms.add(organism)

            if organism == 'Homo_sapiens':
                if is_primary:
                    self._last_read_human_prim += 1
                else:
                    self._last_read_human_sec += 1

    def get_summary(self, top=100):

        lines = []

        lines.append('%10s\t%20s\t%20s\t%-20s\t%10s\t%10s\t%10s\t%5s\t%5s\t%5s\t%10s\t%10s\n'\
            % ('Count', 'Group', 'Family', 'Organism', 'Targets', 'ReadLen', 'Hits', 'Map', 'Algn', 'Mism', 'HuP', 'HuS'))

        top_organisms = self.count_primaries.most_common(top)

        for organism, count in top_organisms:

            refseq_group, family, identifiers,\
                avg_mapping_score, avg_seq_length, avg_number_hits,\
                avg_alignment_score, avg_nr_mismatches, human_prim, human_sec\
                = self.detailed_information[organism]

            avg_len = int(avg_seq_length[0] / float(avg_seq_length[1]))
            if avg_number_hits[1] == 0:
                avg_hits = 0
            else:
                avg_hits = int(avg_number_hits[0] / float(avg_number_hits[1]))

            avg_mapping_score = int(avg_mapping_score[0] / float(avg_mapping_score[1]))

            avg_alignment_score = int(avg_alignment_score[0] / float(avg_alignment_score[1]))

            avg_nr_mismatches = int(avg_nr_mismatches[0] / float(avg_nr_mismatches[1]))

            nr_ids = len(identifiers)

            if count > 10**6:
                count = str(round(count / float(10**6), 3)) + 'M'
            if human_prim > 10**6:
                human_prim = str(round(human_prim / float(10**6), 3)) + 'M'
            if human_sec > 10**6:
                human_sec = str(round(human_sec / float(10**6), 3)) + 'M'
            if nr_ids > 10**6:
                nr_ids = str(round(nr_ids / float(10**6), 3)) + 'M'

            lines.append('%10s\t%20s\t%20s\t%-20s\t%10s\t%10i\t%10i\t%5i\t%5i\t%5i\t%10s\t%10s\n'\
                % (str(count), refseq_group[:20], family[:20], organism[:20],\
                    str(nr_ids), avg_len, avg_hits, avg_mapping_score,\
                    avg_alignment_score, avg_nr_mismatches, str(human_prim),\
                    str(human_sec)))

        return lines

    def write(self):

        with open(self.file_path, 'w') as output_file:
            output_file.writelines(self.get_summary())

@CLI.subcommand("rnamap")
class RNAmap(cli.Application):
    """ Map input reads against a STAR index """

    index_dir = cli.SwitchAttr(['-i', '--index_dir'], str, mandatory=True,
                               help="Sets the index output directory")

    threads = cli.SwitchAttr(
        ['-t', '--threads'], cli.Range(1, 512), mandatory=False,
        help="Sets the number of threads to use",
        default=1)

    taxonomy = cli.SwitchAttr(
        ['-x', '--taxonomy'], str, mandatory=False,
        help="Output path for the taxonomy file; setting this option will also enable regular taxonomy output to stdout during mapping",
        default='')

    star_path = cli.SwitchAttr(['--star_path'], str, mandatory=False,
                          help="Path to STAR executable",
                          default='')

    samtools_path = cli.SwitchAttr(['--samtools_path'], str, mandatory=False,
                          help="Path to samtools executable",
                          default='')

    temp_path = cli.SwitchAttr(['--temporary_path'], str, mandatory=False,
                          help="Path to temporary directory in which to generate temp files. All temp files will be automatically deleted after execution is complete.",
                          default='')

    min_mapping_score = cli.SwitchAttr(['--min_mapping_score'], cli.Range(1, 255), mandatory=False,
                          help="Minimum mapping score for saved hits (only applied to -v/--virana_hits)",
                          default=None)

    min_alignment_score = cli.SwitchAttr(['--min_alignment_score'], cli.Range(1, 255), mandatory=False,
                          help="Minimum alignment score for saved hits (only applied to -v/--virana_hits)",
                          default=None)

    max_mismatches = cli.SwitchAttr(['--max_mismatches'], cli.Range(0, 10000000), mandatory=False,
                          help="Maximum number of mismatches for saved hits (only applied to -v/--virana_hits)",
                          default=None)

    max_relative_mismatches = cli.SwitchAttr(['--max_relative_mismatches'], float, mandatory=False,
                          help="Maximum number of mismatches relative to read length for saved hits (only applied to -v/--virana_hits)",
                          default=None)

    min_continiously_matching = cli.SwitchAttr(['--min_continiously_matching'], cli.Range(0, 10000000), mandatory=False,
                          help="Minimum number of continuous matches for saved hits (only applied to -v/--virana_hits)",
                          default=None)

    filter_complexity = cli.Flag(['--filter_complexity'],
                          help="Discard low-complexity reads (only applied to -v/--virana_hits). Adds some extra processing load to the mapping and may discard important information. Applies to all output files, including quality files (!)",
                          default=False)

    bam = cli.SwitchAttr(['-b', '--bam'], str, mandatory=False,
                         help="Path to unsorted, unindexed output BAM file",
                         default='')

    sam = cli.SwitchAttr(['-s', '--sam'], str, mandatory=False,
                         help="Path to output SAM file",
                         default='')

    qual = cli.SwitchAttr(['-q', '--qual'], str, mandatory=False,
                         help="Path to output quality file",
                         default='')

    hits = cli.SwitchAttr(['-v', '--virana_hits'], str, mandatory=False,
                          help="Path to bzip2-compressed tab-delimited output hit file",
                          default='')

    sample_id = cli.SwitchAttr(['--sample_id'], str, mandatory=False,
                          help="Alphanumeric string ([0-9a-zA-Z_-]*) used to designate sample information within the hit file",
                          default='no_sample_id')

    unmapped1 = cli.SwitchAttr(['--unmapped_end_1'], str, mandatory=False,
                         help="Output path to uncompressed fastq file containing unmapped reads, first ends only for paired ends.",
                         default='')

    unmapped2 = cli.SwitchAttr(['--unmapped_end_2'], str, mandatory=False,
                         help="Output path to uncompressed fastq file containing unmapped reads, second ends only for paired ends.",
                         default='')

    splice_junctions = cli.SwitchAttr(['--splice_junctions'], str, mandatory=False,
                         help="Input path to splice junction file (currently not implemented)",
                         default='')

    chimeric_mappings = cli.SwitchAttr(['--chimeric_mappings'], str, mandatory=False,
                         help="Output path to SAM file containing chimeric mappings",
                         default='')

    hit_filter = cli.SwitchAttr(
        ['-f', '--virana_hit_filter'], str, list=True, mandatory=False,
        help="Only generate hit groups that include at least one read mapping to a reference of this reference group.",
        default=[])

    debug = cli.Flag(["-d", "--debug"], help="Enable debug information")

    reads = cli.SwitchAttr(
        ['-r', '--reads'], str, list=True, mandatory=True,
        help="Sets the input reads. Add this parameter twice for paired end reads.")

    zipped = cli.Flag(["-z", "--zipped"], help="Input reads are zipped")

    sensitive = cli.Flag(
        ["--sensitive"], help="If given, mapping will be slower but more sensitive")

    def main(self):

        if self.debug:
            logging.getLogger().setLevel(logging.DEBUG)

        # Obtain STAR and samtools executables
        star        = [self.star_path and self.star_path or 'STAR']
        samtools    = [self.samtools_path and self.samtools_path or 'samtools']

        # Check that the index directory exists
        if not os.path.exists(self.index_dir):
            sys.stderr.write('Index directory %s not existing, exiting\n' % self.index_dir)
            sys.exit(1)

        if self.temp_path:
            temp_path = tempfile.mkdtemp(dir=self.temp_path)
        else:
            temp_path = tempfile.mkdtemp()

        first_ends      = []
        second_ends     = []
        single_ends     = []

        if len(self.reads) == 2:
            first, second = self.reads
            first_ends.append(first)
            second_ends.append(second)

        elif len(self.reads) == 1:
            single_ends.append(self.reads[0])

        else:
            sys.stderr.write('Invalid number of fastq files; provide either one (single end) or two (paired end)\n')
            sys.exit(1)

        if single_ends and not first_ends and not second_ends:
            reads = [','.join(single_ends)]

        elif first_ends and second_ends:
            reads = [','.join(first_ends), ','.join(second_ends)]

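        # Build the STAR command line: stream SAM to stdout, keep unmapped reads
        # in the SAM output, report up to 1000 multimapping loci, and flag all
        # best-scoring alignments as primary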
        star_cline = star + ['--runMode', 'alignReads',
                        '--genomeDir', self.index_dir,
                        '--runThreadN', str(self.threads),
                        '--readMatesLengthsIn', 'NotEqual',
                        '--outFileNamePrefix', os.path.join(
                            temp_path, 'out'),
                        '--outSAMmode', 'Full',
                        '--outSAMstrandField', 'None',
                        '--outSAMattributes', 'Standard',
                        '--outSAMunmapped', 'Within',
                        '--outStd', 'SAM',
                        '--outFilterMultimapNmax', '1000',
                        '--outSAMprimaryFlag', 'AllBestScore',
                        '--outSAMorder', 'PairedKeepInputOrder']

        if self.unmapped1 or self.unmapped2:
            star_cline += ['--outReadsUnmapped', 'Fastx']
        else:
            star_cline += ['--outReadsUnmapped', 'None']


        if self.zipped:
            star_cline += ['--readFilesCommand', 'zcat']

        if self.sensitive:
            star_cline += ['--outFilterMultimapScoreRange', '10',
                          '--outFilterMismatchNmax', '60',
                          '--outFilterMismatchNoverLmax', '0.3',
                          '--outFilterScoreMin', '0',
                          '--outFilterScoreMinOverLread', '0.3',
                          '--outFilterMatchNmin', '0',
                          '--outFilterMatchNminOverLread', '0.66',
                          '--seedSearchStartLmax', '12',
                          '--winAnchorMultimapNmax', '50']

        star_cline += ['--readFilesIn'] + reads

        if self.debug:
            print ' '.join(star_cline)

        # Check that we can create the relevant output files
        touch_files = [self.unmapped1, self.unmapped2, self.taxonomy, self.qual, self.hits, self.sam, self.bam]
        for file_path in touch_files:
            if file_path is None or file_path == '':
                continue
            try:
                with open(file_path, 'a'):
                    os.utime(file_path, None)
            except IOError:
                sys.stderr.write('Could not write output file %s\n' % file_path)
                sys.exit(1)

        star_process = subprocess.Popen(' '.join(
                                        star_cline), shell=True, stdout=PIPE)

        parser      = SAMParser()

        if self.taxonomy:
            taxonomy    = SAMTaxonomy(self.taxonomy)

        if self.qual:
            quality     = SAMQuality(self.qual)

        if self.hits:
            hits        = SAMHits(self.hits, self.sample_id, self.hit_filter,
                        self.min_mapping_score,
                        self.min_alignment_score,
                        self.max_mismatches,
                        self.max_relative_mismatches,
                        self.min_continiously_matching,
                        self.filter_complexity)

        if self.sam:
            sam_file = open(self.sam, 'w')

        if self.bam:
            with open(self.bam, 'wb', buffering=100 * 1024 * 1024) as bam_file:
                samtools_cline = samtools + [
                    'view', '-b', '-1', '-S', '-@', '4', '/dev/stdin']
                if self.debug:
                    print ' '.join(samtools_cline)
                samtools_process = subprocess.Popen(' '.join(samtools_cline), shell=True, stdout=bam_file, stdin=PIPE)


        do_sam = self.sam
        do_bam = self.bam
        do_taxonomy = self.taxonomy
        do_qual = self.qual
        do_hits = self.hits
        do_parse = do_taxonomy or do_qual or do_hits

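        # Stream STAR's SAM output line by line, forwarding it to the optional
        # SAM/BAM writers and, for alignment lines, to the enabled parsers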
|  | 1161         for i, line in enumerate(iter(star_process.stdout.readline, '')): | 
|  | 1162 | 
|  | 1163             if do_sam: | 
|  | 1164                 sam_file.write(line) | 
|  | 1165 | 
|  | 1166             if do_bam: | 
|  | 1167                 samtools_process.stdin.write(line) | 
|  | 1168 | 
|  | 1169             if line[0] == '@': | 
|  | 1170                 continue | 
|  | 1171 | 
|  | 1172             if do_parse: | 
|  | 1173                 parsed_line = parser.parse(line) | 
|  | 1174 | 
            if do_taxonomy:
|  | 1176                 taxonomy.count(parsed_line) | 
|  | 1177                 if i > 0 and (i % 50000) == 0: | 
|  | 1178                     print ''.join(taxonomy.get_summary(10)) | 
|  | 1179 | 
|  | 1180             if do_qual: | 
|  | 1181                 quality.count(parsed_line) | 
|  | 1182 | 
|  | 1183             if do_hits: | 
|  | 1184                 hits.count(parsed_line) | 
|  | 1185 | 
|  | 1186         if do_bam: | 
|  | 1187             samtools_process.stdin.close() | 
|  | 1188 | 
|  | 1189         if do_sam: | 
|  | 1190             sam_file.close() | 
|  | 1191 | 
|  | 1192         if do_hits: | 
|  | 1193             hits.write() | 
|  | 1194 | 
|  | 1195         if do_taxonomy: | 
|  | 1196             print ''.join(taxonomy.get_summary(10)) | 
|  | 1197             taxonomy.write() | 
|  | 1198 | 
|  | 1199         if do_qual: | 
|  | 1200             quality.write() | 
|  | 1201 | 
        # Recover STAR's optional side outputs from the temporary directory;
        # depending on the STAR options used, these files may not exist, in
        # which case the move is skipped silently.
        try:
            if self.unmapped1:
                shutil.move(os.path.join(temp_path, 'outUnmapped.out.mate1'),
                            self.unmapped1)
        except IOError:
            pass

        try:
            if self.unmapped2:
                shutil.move(os.path.join(temp_path, 'outUnmapped.out.mate2'),
                            self.unmapped2)
        except IOError:
            pass

        try:
            if self.chimeric_mappings:
                shutil.move(os.path.join(temp_path, 'outChimeric.out.sam'),
                            self.chimeric_mappings)
        except IOError:
            pass
|  | 1222 | 
|  | 1223         shutil.rmtree(temp_path) | 
|  | 1224 | 
|  | 1225 @CLI.subcommand("dnamap") | 
|  | 1226 class DNAmap(cli.Application): | 
|  | 1227     """ Map input reads against a BWA index """ | 
|  | 1228 | 
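    # Example invocation (illustrative; all paths and option values are
    # placeholders):
    #   vmap dnamap -i /path/to/bwa_index_dir -r reads_1.fastq -r reads_2.fastq \
    #       -t 8 -b mapped.bam -x taxonomy.txt
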
|  | 1229     index_dir = cli.SwitchAttr(['-i', '--index_dir'], str, mandatory=True, | 
|  | 1230                                help="Sets the index output directory") | 
|  | 1231 | 
|  | 1232     threads = cli.SwitchAttr( | 
|  | 1233         ['-t', '--threads'], cli.Range(1, 512), mandatory=False, | 
|  | 1234         help="Sets the number of threads to use", | 
|  | 1235         default=1) | 
|  | 1236 | 
|  | 1237     taxonomy = cli.SwitchAttr( | 
|  | 1238         ['-x', '--taxonomy'], str, mandatory=False, | 
|  | 1239         help="Output path for the taxonomy file; setting this option will also enable regular taxonomy output to stdout during mapping", | 
|  | 1240         default='') | 
|  | 1241 | 
|  | 1242     samtools_path = cli.SwitchAttr(['--samtools_path'], str, mandatory=False, | 
|  | 1243                           help="Path to samtools executable", | 
|  | 1244                           default='') | 
|  | 1245 | 
|  | 1246     temp_path = cli.SwitchAttr(['--temporary_path'], str, mandatory=False, | 
                          help="Path to a temporary directory in which to generate temp files. All temp files will be automatically deleted after execution is complete.",
|  | 1248                           default='') | 
|  | 1249 | 
|  | 1250     min_mapping_score = cli.SwitchAttr(['--min_mapping_score'], cli.Range(1, 255), mandatory=False, | 
                          help="Minimum mapping score for saved hits (only applied to -v/--virana_hits)",
|  | 1252                           default=None) | 
|  | 1253 | 
|  | 1254     min_alignment_score = cli.SwitchAttr(['--min_alignment_score'], cli.Range(1, 255), mandatory=False, | 
                          help="Minimum alignment score for saved hits (only applied to -v/--virana_hits)",
|  | 1256                           default=None) | 
|  | 1257 | 
|  | 1258     max_mismatches = cli.SwitchAttr(['--max_mismatches'],  cli.Range(0, 10000000), mandatory=False, | 
|  | 1259                           help="Maximum number of mismatches for saved hits (only applied to -v/--virana_hits)", | 
|  | 1260                           default=None) | 
|  | 1261 | 
|  | 1262     max_relative_mismatches = cli.SwitchAttr(['--max_relative_mismatches'], float, mandatory=False, | 
|  | 1263                           help="Maximum number of mismatches relative to read length for saved hits (only applied to -v/--virana_hits)", | 
|  | 1264                           default=None) | 
|  | 1265 | 
|  | 1266     min_continiously_matching = cli.SwitchAttr(['--min_continiously_matching'], cli.Range(0, 10000000), mandatory=False, | 
                          help="Minimum number of continuous matches for saved hits (only applied to -v/--virana_hits)",
|  | 1268                           default=None) | 
|  | 1269 | 
|  | 1270     filter_complexity = cli.Flag(['--filter_complexity'], | 
|  | 1271                           help="Discard low-complexity reads (only applied to -v/--virana_hits). Adds some extra processing load to the mapping and may discard important information. Applies to all output files, including quality files (!)", | 
|  | 1272                           default=False) | 
|  | 1273 | 
|  | 1274     sample_id = cli.SwitchAttr(['--sample_id'], str, mandatory=False, | 
|  | 1275                           help="Alphanumeric string ([0-9a-zA-Z_-]*) used to designate sample information within the hit file", | 
|  | 1276                           default='no_sample_id') | 
|  | 1277 | 
|  | 1278     bam = cli.SwitchAttr(['-b', '--bam'], str, mandatory=False, | 
|  | 1279                          help="Path to unsorted, unindexed output BAM file", | 
|  | 1280                          default='') | 
|  | 1281 | 
|  | 1282     sam = cli.SwitchAttr(['-s', '--sam'], str, mandatory=False, | 
|  | 1283                          help="Path to output SAM file", | 
|  | 1284                          default='') | 
|  | 1285 | 
|  | 1286     qual = cli.SwitchAttr(['-q', '--qual'], str, mandatory=False, | 
|  | 1287                          help="Path to output quality file", | 
|  | 1288                          default='') | 
|  | 1289 | 
|  | 1290     hits = cli.SwitchAttr(['-v', '--virana_hits'], str, mandatory=False, | 
|  | 1291                           help="Path to bzip2-compressed tab-delimited output virana hit file", | 
|  | 1292                           default='') | 
|  | 1293 | 
|  | 1294     hit_filter = cli.SwitchAttr( | 
|  | 1295         ['-f', '--virana_hit_filter'], str, list=True, mandatory=False, | 
        help="Only generate hit groups that include at least one read mapping to a reference of this reference group.",
|  | 1297         default=[]) | 
|  | 1298 | 
|  | 1299     debug = cli.Flag(["-d", "--debug"], help="Enable debug information") | 
|  | 1300 | 
|  | 1301     zipped = cli.Flag(["-z", "--zipped"], help="Input reads are zipped") | 
|  | 1302 | 
|  | 1303     sensitive = cli.Flag( | 
        ["--sensitive"], help="If given, mapping will be slower but more sensitive")
|  | 1305 | 
|  | 1306 | 
|  | 1307     bwa_path = cli.SwitchAttr(['--bwa_path'], str, mandatory=False, | 
|  | 1308                           help="Path to BWA executable", | 
|  | 1309                           default='') | 
|  | 1310 | 
|  | 1316     reads = cli.SwitchAttr( | 
|  | 1317         ['-r', '--reads'], str, list=True, mandatory=True, | 
|  | 1318         help="Sets the input reads. Add this parameter twice for paired end reads.") | 
|  | 1319 | 
|  | 1320     def main(self): | 
|  | 1321 | 
|  | 1322         if self.debug: | 
|  | 1323             logging.getLogger().setLevel(logging.DEBUG) | 
|  | 1324 | 
        # Resolve the bwa and samtools executables, falling back to the
        # executables on the system PATH if no explicit paths were given
        bwa         = [self.bwa_path or 'bwa']
        samtools    = [self.samtools_path or 'samtools']
|  | 1328 | 
        # Check that the index directory exists
        if not os.path.exists(self.index_dir):
            sys.stderr.write('Index directory %s does not exist, exiting\n'
                             % self.index_dir)
            sys.exit(1)
|  | 1334 | 
|  | 1335         if len(self.reads) not in (1, 2): | 
|  | 1336             message = 'Invalid number of FASTQ files; supply either one (single end) or two (paired end)\n' | 
|  | 1337             sys.stderr.write(message) | 
|  | 1338             sys.exit(1) | 
|  | 1339 | 
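        # Build the BWA-MEM command line; -M marks shorter split hits as
        # secondary alignments for compatibility with downstream tools.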
|  | 1340         bwa_cline = bwa + ['mem', '-t', str(self.threads), '-M', os.path.join(self.index_dir, 'index')] | 
|  | 1341 | 
|  | 1342         bwa_cline += self.reads | 
|  | 1343 | 
|  | 1344         if self.debug: | 
|  | 1345             print ' '.join(bwa_cline) | 
|  | 1346 | 
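        # Launch BWA-MEM and stream its SAM output through a pipe for
        # on-the-fly teeing and parsing, mirroring the STAR mapping mode.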
|  | 1347         bwa_process = subprocess.Popen(' '.join(bwa_cline), shell=True, stdout=PIPE) | 
|  | 1348 | 
|  | 1349         parser      = SAMParser() | 
|  | 1350 | 
|  | 1351         if self.taxonomy: | 
|  | 1352             taxonomy    = SAMTaxonomy(self.taxonomy) | 
|  | 1353 | 
|  | 1354         if self.qual: | 
|  | 1355             quality     = SAMQuality(self.qual) | 
|  | 1356 | 
|  | 1357         if self.hits: | 
|  | 1358             hits        = SAMHits(self.hits, self.sample_id, self.hit_filter, | 
|  | 1359                         self.min_mapping_score, | 
|  | 1360                         self.min_alignment_score, | 
|  | 1361                         self.max_mismatches, | 
|  | 1362                         self.max_relative_mismatches, | 
|  | 1363                         self.min_continiously_matching, | 
|  | 1364                         self.filter_complexity) | 
|  | 1365 | 
|  | 1366         if self.sam: | 
|  | 1367             sam_file = open(self.sam, 'w', buffering=100 * 1024 * 1024) | 
|  | 1368 | 
|  | 1369         if self.bam: | 
|  | 1370             with open(self.bam, 'wb', buffering=100 * 1024 * 1024) as bam_file: | 
|  | 1371                 samtools_cline = samtools + [ | 
|  | 1372                     'view', '-b', '-1', '-S', '-@', '4', '/dev/stdin'] | 
|  | 1373                 if self.debug: | 
|  | 1374                     print ' '.join(samtools_cline) | 
|  | 1375                 samtools_process = subprocess.Popen(' '.join(samtools_cline), shell=True, stdout=bam_file, stdin=PIPE) | 
|  | 1376 | 
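        # Cache the output toggles so the per-line loop below avoids repeated
        # attribute lookups.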
|  | 1377         do_sam = self.sam | 
|  | 1378         do_bam = self.bam | 
|  | 1379         do_taxonomy = self.taxonomy | 
|  | 1380         do_qual = self.qual | 
|  | 1381         do_hits = self.hits | 
|  | 1382         do_parse = do_taxonomy or do_qual or do_hits | 
|  | 1383 | 
|  | 1384         for i, line in enumerate(iter(bwa_process.stdout.readline, '')): | 
|  | 1385 | 
|  | 1386             if do_sam: | 
|  | 1387                 sam_file.write(line) | 
|  | 1388 | 
|  | 1389             if do_bam: | 
|  | 1390                 samtools_process.stdin.write(line) | 
|  | 1391 | 
            # Header lines carry no alignments; skip them after they have been
            # copied to the SAM/BAM outputs above.
            if line[0] == '@':
                continue

            if do_parse:
                parsed_line = parser.parse(line)
|  | 1394 | 
|  | 1395             if do_taxonomy: | 
|  | 1396                 taxonomy.count(parsed_line) | 
|  | 1397 | 
|  | 1398                 if i > 0 and (i % 10000) == 0: | 
|  | 1399                     print ''.join(taxonomy.get_summary(10)) | 
|  | 1400 | 
|  | 1401             if do_qual: | 
|  | 1402                 quality.count(parsed_line) | 
|  | 1403 | 
|  | 1404             if do_hits: | 
|  | 1405                 hits.count(parsed_line) | 
|  | 1406 | 
|  | 1407         if do_bam: | 
|  | 1408             samtools_process.stdin.close() | 
|  | 1409 | 
|  | 1410         if do_sam: | 
|  | 1411             sam_file.close() | 
|  | 1412 | 
|  | 1413         if do_hits: | 
|  | 1414             hits.write() | 
|  | 1415 | 
|  | 1416         if do_taxonomy: | 
|  | 1417             print ''.join(taxonomy.get_summary(10)) | 
|  | 1418             taxonomy.write() | 
|  | 1419 | 
|  | 1420         if do_qual: | 
|  | 1421             quality.write() | 
|  | 1422 | 
|  | 1423 | 
|  | 1424 if __name__ == "__main__": | 
|  | 1425     CLI.run() |