Mercurial > repos > mzeidler > virana2
diff vmap.py @ 0:3ba5983012cf draft
Uploaded
| author | mzeidler | 
|---|---|
| date | Tue, 24 Sep 2013 10:19:40 -0400 | 
| parents | |
| children | 
line wrap: on
 line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/vmap.py Tue Sep 24 10:19:40 2013 -0400 @@ -0,0 +1,1425 @@ +#!/usr/bin/env python +#from __future__ import print_function + +import cProfile + +import sys +import re + +import tempfile +import subprocess +import shutil +import os +import os.path +import logging +import bz2 +import zlib + +import math +import string + +from collections import defaultdict, Counter + +from subprocess import PIPE + +NON_ID = ''.join(c for c in map(chr, range(256)) if not c.isalnum()) +NON_ID = NON_ID.replace('_', '').replace('-', '') + +try: + from Bio.SeqRecord import SeqRecord + from Bio import SeqIO + from Bio.Seq import Seq +except ImportError: + message = 'This script requires the BioPython python package\n' + sys.stderr.write(message) + sys.exit(1) + +try: + from plumbum import cli +except ImportError: + message = 'This script requires the plumbum python package\n' + sys.stderr.write(message) + sys.exit(1) + + +try: + import HTSeq +except ImportError: + message = 'This script requires the HTSeq python package\n' + sys.stderr.write(message) + sys.exit(1) + +KHMER_AVAILABLE = True +try: + import khmer +except ImportError: + KHMER_AVAILABLE = False + +#from io import BufferedRandom + +logging.basicConfig(level=logging.INFO, format='%(message)s') + + +def profile_this(fn): + def profiled_fn(*args, **kwargs): + fpath = fn.__name__ + ".profile" + prof = cProfile.Profile() + ret = prof.runcall(fn, *args, **kwargs) + prof.dump_stats(fpath) + return ret + return profiled_fn + + +class CLI(cli.Application): + """RNA-Seq and DNA-Seq short read analysis by mapping to known reference sequences""" + PROGNAME = "vmap" + VERSION = "1.0.0" + DESCRIPTION = """Virana vmap is an interface to the NCBI and ensembl reference databases that can + generate reference indexes for the short read mappers STAR (RNA-Seq) and + BWA-MEM (DNA-Seq). Short reads can be mapped to arbitrary combinations of + reference databases and the results can be summarized by taxonomic family + as well as stored as SAM file, unsorted BAM file, or as a HIT file that + models multimapping reads between specific reference databases.""" + USAGE = """The program has four modes that can be accessed by `vmap rnaindex`, `vmap dnaindex`, `vmap rnamap`, and `vmap dnamap.`""" + + def main(self, *args): + + if args: + print("Unknown command %r" % (args[0])) + print self.USAGE + return 1 + + if not self.nested_command: + print("No command given") + print self.USAGE + return 1 + + +@CLI.subcommand("rnaindex") +class RNAIndex(cli.Application): + """ Creates a STAR index from a FASTA genome reference """ + + reference_files = cli.SwitchAttr( + ['-r', '--reference_file'], str, list=True, mandatory=True, + help="Sets the reference genome(s) FASTA file." + + " Multiple occurrences of this parameter are allowed.") + index_dir = cli.SwitchAttr(['-i', '--index_dir'], str, mandatory=True, + help="Sets the index output directory." + + " Directory will be generated if not existing." + + " Directory will be filled with several index files.") + threads = cli.SwitchAttr( + ['-t', '--threads'], cli.Range(1, 512), mandatory=False, + help="Sets the number of threads to use", + default=1) + max_ram = cli.SwitchAttr( + ['-m'], cli.Range(1, 400000000000), mandatory=False, + help="Sets the maximum amount of memory (RAM) to use (in bytes)", + default=400000000000) + path = cli.SwitchAttr(['-p', '--path'], str, mandatory=False, + help="Path to STAR executable", + default='') + sparse = cli.Flag( + ["-s", "--sparse"], help="If given, a sparse index that requires less " + + " RAM in the mapping phase will be constructed") + + debug = cli.Flag(["-d", "--debug"], help="Enable debug output") + + def main(self): + + if self.debug: + logging.getLogger().setLevel(logging.DEBUG) + + # Obtain star executable + star = [self.path and self.path or 'STAR'] + + # Check if genome directory is existing + for reference_file in self.reference_files: + if not os.path.exists(reference_file): + sys.stdout.write( + 'Reference file %s nor existing, exiting' % reference_file) + sys.exit(1) + + # Check if output directory is existing + if not os.path.exists(self.index_dir): + logging.debug( + 'Making output directory for index at %s' % self.index_dir) + os.makedirs(self.index_dir) + + # # Make named pipe to extract genomes + # pipe_path = os.path.abspath(os.path.join(self.genome_dir, 'pipe.fa')) + # if os.path.exists(pipe_path): + # os.unlink(pipe_path) + # os.mkfifo(pipe_path) + + # Make star command line + cline = star + ['--runMode', 'genomeGenerate', + '--genomeDir', self.index_dir, + '--limitGenomeGenerateRAM', str(self.max_ram), + '--runThreadN', str(self.threads), + '--genomeFastaFiles'] + self.reference_files + + # Add parameters for sparse (memory-saving) index generation + if self.sparse: + cline += ['--genomeSAsparseD', '2', + '--genomeChrBinNbits', '12', + '--genomeSAindexNbases', '13'] + + else: + cline += ['--genomeSAsparseD', '1', + '--genomeChrBinNbits', '18', + '--genomeSAindexNbases', '15'] + + if self.debug: + print ' '.join(cline) + + # Run STAR reference generation process + star_process = subprocess.Popen(' '.join(cline), shell=True, stdout=PIPE, stderr=PIPE) + + # Block until streams are closed by the process + stdout, stderr = star_process.communicate() + + if stderr: + sys.stderr.write(stderr) + + if self.debug and stdout: + print stdout + + +@CLI.subcommand("dnaindex") +class DNAIndex(cli.Application): + """ Creates a BWA index from a FASTA reference file """ + + reference_file = cli.SwitchAttr(['-r', '--reference_file'], str, mandatory=True, + help="Sets the input reference FASTA file.") + index_dir = cli.SwitchAttr(['-i', '--index_dir'], str, mandatory=True, + help="Sets the index output directory." + + " Directory will be generated if not existing." + + " Directory will be filled with several index files.") + path = cli.SwitchAttr(['-p', '--path'], str, mandatory=False, + help="Path to BWA executable", + default='') + debug = cli.Flag(["-d", "--debug"], help="Enable debug output") + + if debug: + logging.getLogger().setLevel(logging.DEBUG) + + def main(self): + + # Obtain star executable + bwa = [self.path and self.path or 'bwa'] + + # Check if genome directory is existing + if not os.path.exists(self.reference_file): + sys.stdout.write('Genome file %s nor existing, exiting' % self.reference_file) + sys.exit(1) + + # Check if output directory is existing + if not os.path.exists(self.index_dir): + logging.debug('Making output directory %s' % self.index_dir) + os.makedirs(self.index_dir) + + # Make star command line + cline = bwa + ['index', '-a', 'bwtsw', '-p', os.path.join(self.index_dir, 'index'), self.reference_file] + + if self.debug: + print ' '.join(cline) + + # Run BWA index generation process + bwa_process = subprocess.Popen(' '.join(cline), shell=True, stdout=PIPE, stderr=PIPE) + stdout, stderr = bwa_process.communicate() + + if stderr: + sys.stderr.write(stderr) + + if self.debug and stdout: + print stdout + + + + + +class SAMHits: + """ Converts SAM output of mappers into bzipped HIT files. """ + + def __init__(self, output_file, sample_id, refseq_filter=None, min_mapping_score=None,\ + min_alignment_score=None, max_mismatches=None,\ + max_relative_mismatches=None, min_continiously_matching=None,\ + filter_complexity=False): + + self.output_file = bz2.BZ2File(output_file, 'wb', buffering=100 * 1024 * 1024) + self.sample_id = sample_id.translate(None, NON_ID) + self.refseq_filter = refseq_filter + self.max_mismatches = max_mismatches + self.max_relative_mismatches = max_relative_mismatches + self.current_group = [] + self.min_mapping_score = min_mapping_score + self.min_alignment_score = min_alignment_score + self.min_continiously_matching = min_continiously_matching + self.filter_complexity = filter_complexity + + self.re_matches = re.compile(r'(\d+)M') + self.re_dels = re.compile(r'(\d+)D') + + def count(self, parsed_line): + + if parsed_line is None: + return + + read_key, read_name, flag, ref_name, ref_position, mapping_score,\ + cigar, mate_ref_name, mate_ref_position, insert_size, seq, qual,\ + is_end1, is_end2, number_mismatches, alignment_score,\ + number_hits, is_reverse, is_primary, is_mapped, is_mate_mapped,\ + is_paired, number_matches, read_end_pos, max_match = parsed_line + + if not is_mapped: + return + + if self.filter_complexity: + + avg_compression = float(len(zlib.compress(seq)))/len(seq) + + if avg_compression < 0.5: + return + + # length = len(seq) + # counts = [seq.count(nuc) for nuc in 'ACGT'] + # min_count = length * 0.10 + # max_count = length * 0.50 + # for count in counts: + # if count < min_count or count > max_count: + # return None + + # counter = Counter() + # for i in range(length - 2): + # counter[seq[i: i + 3]] += 1 + # maximal = length - 4 + + # highest = sum([v for k, v in counter.most_common(2)]) + # if highest > (maximal / 3.0): + # return None + + # self.passed.append(avg_compression) + + pair_id = '' + if is_end1: + pair_id = '/1' + + elif is_end2: + pair_id = '/2' + + read_name = self.sample_id + ';' + read_name + pair_id + + # Initialize new current group + if len(self.current_group) == 0: + self.current_group = [read_name, seq, []] + + # Write old current group to file + if read_name != self.current_group[0]: + self._write_group() + self.current_group = [read_name, seq, []] + + try: + refseq_group, family, organism, identifier = ref_name.split(';')[:4] + except ValueError: + sys.stderr.write('Read mapped to malformed reference sequence %s, skipping\n' % ref_name) + return + + if self.min_continiously_matching: + + if self.min_continiously_matching > max_match: + return + + if self.max_mismatches\ + and int(number_mismatches) > self.max_mismatches: + return + + if self.max_relative_mismatches\ + and int(number_mismatches) / float(len(seq))\ + > self.max_relative_mismatches: + return + + if self.min_mapping_score\ + and self.min_mapping_score > mapping_score: + return + + if self.min_alignment_score\ + and self.min_alignment_score > alignment_score: + return + + start = int(ref_position) + 1 + + self.current_group[2].append([refseq_group, family, organism, identifier, str(start), str(read_end_pos)]) + + def _write_group(self): + passed = True + + if self.refseq_filter: + passed = False + for refseq_group, family, organism, identifier, start, end in self.current_group[2]: + if passed: + break + for f in self.refseq_filter: + if refseq_group == f: + passed = True + break + if passed: + description = [] + for identifier in self.current_group[2]: + description.append(';'.join(identifier)) + description = '|'.join(description) + + record = SeqRecord(Seq(self.current_group[1])) + record.id = 'Read;' + self.current_group[0] + record.description = description + + SeqIO.write([record], self.output_file, "fasta") + + def write(self): + + self._write_group() + self.output_file.close() + +class SAMParser: + + def parse(self, line): + + if line[0] == '@': + return None + + alignment = HTSeq._HTSeq.SAM_Alignment.from_SAM_line(line) + read_name = alignment.read.name + seq = alignment.read.seq + qual = alignment.read.qual + flag = alignment.flag + cigar = None + + is_paired = (flag & 1) + is_mapped = not (flag & 4) + is_mate_mapped = alignment.mate_aligned is not None #not (flag & 8) + is_reverse = (flag & 16) + is_end1 = (flag & 64) + is_end2 = (flag & 128) + is_primary = not (flag & 256) + + read_key = (read_name, is_end1) + + ref_name = None + ref_position = None + mapping_score = 0 + mate_ref_name = None + mate_ref_position = None + insert_size = None + alignment_score = 0 + read_end_pos = None + + if is_mate_mapped and alignment.mate_start: + mate_ref_name = alignment.mate_start.chrom + mate_ref_position = alignment.mate_start.start + + number_hits = 0 + alignment_score = 0 + number_mismatches = 0 + + number_matches = 0 + max_match = 0 + + if is_mapped: + + ref_name = alignment.iv.chrom + ref_position = alignment.iv.start + read_end_pos = alignment.iv.end + alignment_score = alignment.aQual + cigar = alignment.cigar + + if is_mate_mapped: + insert_size = alignment.inferred_insert_size + + for c in cigar: + if c.type == 'M': + number_matches += c.size + max_match = max(max_match, c.size) + + for tag, value in alignment.optional_fields: + if tag == 'NM': + number_hits = value + elif tag == 'AS': + alignment_score = value + elif tag == 'NH': + number_mismatches = value + + return read_key, read_name, flag, ref_name, ref_position, mapping_score,\ + cigar, mate_ref_name, mate_ref_position, insert_size, seq, qual,\ + is_end1, is_end2, number_mismatches, alignment_score,\ + number_hits, is_reverse, is_primary, is_mapped, is_mate_mapped,\ + is_paired, number_matches, read_end_pos, max_match + + +class SAMQuality: + + def __init__(self, file_path): + + self.file_path = file_path + + self.stored = defaultdict(Counter) + self.all_references = defaultdict(int) + self.primary_references = defaultdict(int) + self.complement = string.maketrans('ATCGN', 'TAGCN') + + if KHMER_AVAILABLE: + self.ktable = khmer.new_ktable(10) + + def _get_complement(self, sequence): + + return sequence.translate(self.complement)[::-1] + + def _get_summary(self, counter): + """"Returns five numbers (sum, extrema, mean, and std) + for a max_frequency counter """ + + maximum = 0 + minimum = sys.maxint + thesum = 0 + allcount = 0 + mode = [0, None] + + items = 0.0 + mean = 0.0 + m2 = 0.0 + variance = 0.0 + + for item in counter: + + count = counter[item] + if count > mode[0]: + mode = [count, item] + + allcount += count + maximum = max(maximum, item) + minimum = min(minimum, item) + thesum += (count * item) + + x = 1 + while x <= count: + items += 1 + delta = item - mean + mean = mean + delta / items + m2 = m2 + delta * (item - mean) + variance = m2 / items + x += 1 + + std = math.sqrt(variance) + + return allcount, thesum, minimum, maximum, mode[1], mean, std + + + def _to_unit(self, item, is_percentage=False): + """ Convert a numeric to a string with metric units """ + + if is_percentage: + return ('%-.3f' % (item * 100)) + '%' + converted = None + try: + item = float(item) + if item > 10**12: + converted = str(round(item / 10**9,3))+'P' + elif item > 10**9: + converted = str(round(item / 10**9,3))+'G' + elif item > 10**6: + converted = str(round(item / 10**6,3))+'M' + elif item > 10**3: + converted = str(round(item / 10**3,3))+'K' + else: + converted = str(round(item,3)) + except: + converted = str(item) + + return converted + + def _str_metrics(self, data): + + str_metrics = [] + + for (item, metric) in sorted(data.keys()): + counter = data[(item, metric)] + if not hasattr(counter.iterkeys().next(), 'real'): + for element, count in counter.most_common(): + str_metrics.append(self._str_metric(item, metric, element, count, no_units=True)) + else: + summary = self._get_summary(counter) + str_metrics.append(self._str_metric(item, metric, *summary)) + + return str_metrics + + def _str_metric(self, item, metric, count, thesum='', minimum='',\ + maximum='', mode='', mean='', std='', no_units=False): + + counters = [count, thesum, minimum, maximum, mode, mean, std] + counters = map(str, counters) + + if no_units: + items = [item, metric] + counters + else: + units = map(self._to_unit, counters) + items = [item, metric] + units + + return '%-15s\t%-60s\t%12s\t%12s\t%12s\t%12s\t%12s\t%12s\t%12s\n' \ + % tuple(items) + + + def _count_read(self, metric, data, sample): + + item = 'read' + + (insert_size, alignment_score, mapping_score, length,\ + q20_length, avg_phred_quality, number_hits, is_reverse) = data + + self.stored[(item, metric + ' mappings')][number_hits] += sample + self.stored[(item, metric + ' insert')][insert_size] += sample + + + def _count_segment(self, metric, data, sample): + + item = 'segment' + + (insert_size, alignment_score, mapping_score, length,\ + q20_length, avg_phred_quality, number_hits, is_reverse) = data + + self.stored[(item, metric + ' algq')][alignment_score] += sample + self.stored[(item, metric + ' mapq')][mapping_score] += sample + self.stored[(item, metric + ' length')][length] += sample + self.stored[(item, metric + ' q20length')][q20_length] += sample + self.stored[(item, metric + ' meanbasequal')][avg_phred_quality] += sample + self.stored[(item, metric + ' reverse')][is_reverse] += sample + + def count(self, parsed_line): + + if parsed_line is None: + return + + #print_metric('Item' , 'Metric', 'Count', 'Sum', 'Min', 'Max', 'Mode', 'Mean', 'STD') + + read_key, read_name, flag, ref_name, ref_position, mapping_score,\ + cigar, mate_ref_name, mate_ref_position, insert_size, seq, qual,\ + is_end1, is_end2, number_mismatches, alignment_score,\ + number_hits, is_reverse, is_primary, is_mapped, is_mate_mapped,\ + is_paired, number_matches, read_end_pos, max_match = parsed_line + + phred_quality = [q - 33 for q in qual] + avg_phred_quality = sum(phred_quality) / float(len(phred_quality)) + length = len(seq) + mate_reference_id = mate_ref_name + reference_id = ref_name + reference = reference_id is not None and reference_id != '*' + insert_size = insert_size and abs(insert_size) or insert_size + is_segment1 = not is_paired or (is_paired and is_end1) + is_reverse = is_reverse + is_unique = is_primary and number_hits == 1 + is_translocation = is_paired and is_mapped and is_mate_mapped\ + and (mate_reference_id != '=' and reference_id != mate_reference_id) + is_part_of_doublemap = is_paired and is_mapped and is_mate_mapped + is_part_of_halfmap = is_paired and (is_mapped != is_mate_mapped) + is_part_of_nomap = is_paired and not is_mapped and not is_mate_mapped + + + # Count length until first low quality base call + q20_length = 0 + for q in phred_quality: + if q < 20: + break + q20_length += 1 + + # Count kmers + if KHMER_AVAILABLE: + if not is_reverse: + self.ktable.consume(seq) + else: + self.ktable.consume(self._get_complement(seq)) + + if reference: + + self.all_references[reference_id] += 1 + + if is_primary: + self.primary_references[reference_id] += 1 + + + data = (insert_size, alignment_score, mapping_score, length,\ + q20_length, avg_phred_quality, number_hits, is_reverse) + + sample = 1 + + self._count_segment('sequenced', data, sample) + if is_mapped: + self._count_segment('sequenced mapped multi', data, sample) + if is_primary: + self._count_segment('sequenced mapped primary', data, sample) + if number_hits and is_unique: + self._count_segment('sequenced mapped primary unique', data, sample) + + if is_segment1: + self._count_read('sequenced mapped multi', data, sample) + if is_primary: + self._count_read('sequenced mapped primary', data, sample) + + if is_paired: + self._count_segment('sequenced paired', data, sample) + + if is_part_of_doublemap: + self._count_segment('sequenced paired doublemap', data, sample) + + if is_primary: + self._count_segment('sequenced paired doublemap primary', data, sample) + + if is_segment1: + self._count_read('sequenced paired doublemap multi', data, sample) + + if is_primary: + self._count_read('sequenced paired doublemap primary', data, sample) + + if number_hits and is_unique: + self._count_read('sequenced paired doublemap primary unique', data, sample) + + if is_translocation: + self._count_read('sequenced paired doublemap primary unique translocation', data, sample) + + elif is_part_of_halfmap: + + self._count_segment('sequenced paired halfmap', data, sample) + + # The mapped segment + if is_mapped: + + self._count_segment('sequenced paired halfmap mapped', data, sample) + + if is_primary: + + self._count_read('sequenced paired halfmap mapped primary', data, sample) + + if number_hits and is_unique: + self._count_read('sequenced paired halfmap mapped primary unique', data, sample) + + elif not is_primary: + + self._count_read('sequenced unpaired mapped multi', data, sample) + + # The unmapped segment + elif not is_mapped: + self._count_segment('sequenced paired halfmap unmapped', data, sample) + + elif is_part_of_nomap: + + self._count_segment('sequenced paired nomap', data, sample) + + if is_segment1: + + self._count_read('sequenced paired nomap', data, sample) + + elif not is_paired: + + self._count_segment('sequenced unpaired', data, sample) + + if is_mapped: + + self._count_segment('sequenced unpaired mapped', data, sample) + + if is_primary: + + self._count_read('sequenced unpaired mapped primary', data, sample) + + if number_hits and is_unique: + + self._count_read('sequenced paired unpaired mapped primary unique', data, sample) + + elif not is_primary: + + self._count_read('sequenced unpaired mapped multi', data, sample) + + + elif not is_mapped: + + self._count_segment('sequenced unpaired unmapped', data, sample) + + if is_segment1: + self._count_read('sequenced unpaired unmapped', data, sample) + + def write(self): + + with open(self.file_path, 'w') as output_file: + + all_references = sorted([(count, reference) for reference, count\ + in self.all_references.iteritems()], reverse=True) + + for j, (count, reference) in enumerate(all_references[:30]): + self.stored[('segment', 'multireference_' + str(j+1))][reference] = count + + primary_references = sorted([(count, reference) for reference, count\ + in self.primary_references.iteritems()], reverse=True) + + for j, (count, reference) in enumerate(primary_references[:30]): + self.stored[('segment', 'primaryreference_' + str(j+1))][reference] = count + + # Extract top-ranking kmers + if KHMER_AVAILABLE: + kmer_frequencies = [] + for i in range(0, self.ktable.n_entries()): + n = self.ktable.get(i) + if n > 0: + kmer_frequencies.append((n, self.ktable.reverse_hash(i))) + kmer_frequencies = sorted(kmer_frequencies, reverse=True) + for j, (frequency, kmer) in enumerate(kmer_frequencies[:10]): + self.stored[('segment', 'kmer_' + str(j+1))][kmer] = frequency + + output_file.writelines(self._str_metrics(self.stored)) + + +class SAMTaxonomy: + """ Provides taxonomic summary information from a SAM file stream. """ + + def __init__(self, file_path): + + self.file_path = file_path + + self.count_primaries = Counter() + self.detailed_information = {} + + self._last_read = (None, None) + self._last_read_human_prim = 0 + self._last_read_human_sec = 0 + self._last_organisms = set() + + def count(self, parsed_line): + + if parsed_line is None: + return + + read_key, read_name, flag, ref_name, ref_position, mapping_score,\ + cigar, mate_ref_name, mate_ref_position, insert_size, seq, qual,\ + is_end1, is_end2, number_mismatches, alignment_score,\ + number_hits, is_reverse, is_primary, is_mapped, is_mate_mapped,\ + is_paired, number_matches, read_end_pos, max_match = parsed_line + + if is_mapped: + + refseq_group, family, organism, gi = ref_name.split(';')[:4] + + if is_primary: + self.count_primaries[organism] += 1 + + if organism not in self.detailed_information: + # refseq_group. family, gis, avg_mapping_score, + # avg_seq_length, avg_number_hits, avg_alignment_score, avg_nr_mismatches + initial = [refseq_group, + family, + set([gi]), + [int(mapping_score), 1], + [len(seq), 1], + [0, 0], + [alignment_score, 1], + [number_mismatches, 1], + 0, + 0] + + self.detailed_information[organism] = initial + + else: + entry = self.detailed_information[organism] + entry[2].add(gi) + entry[3][0] += int(mapping_score) + entry[3][1] += 1 + entry[4][0] += len(seq) + entry[4][1] += 1 + entry[6][0] += alignment_score + entry[6][1] += 1 + entry[7][0] += number_mismatches + entry[7][1] += 1 + + if is_primary: + entry = self.detailed_information[organism] + entry[5][0] += number_hits + entry[5][1] += 1 + + if self._last_read == (None, None): + self._last_read = read_key + + if self._last_read != read_key: + + for last_organism in self._last_organisms: + + self.detailed_information[last_organism][8]\ + += self._last_read_human_prim + + self.detailed_information[last_organism][9]\ + += self._last_read_human_sec + + self._last_read = read_key + self._last_organisms = set() + self._last_read_human_prim = 0 + self._last_read_human_sec = 0 + + self._last_organisms.add(organism) + + if organism == 'Homo_sapiens': + if is_primary: + self._last_read_human_prim += 1 + else: + self._last_read_human_sec += 1 + + def get_summary(self, top=100): + + lines = [] + + lines.append('%10s\t%20s\t%20s\t%-20s\t%10s\t%10s\t%10s\t%5s\t%5s\t%5s\t%10s\t%10s\n'\ + % ('Count', 'Group', 'Family', 'Organism', 'Targets', 'ReadLen', 'Hits', 'Map', 'Algn', 'Mism', 'HuP', 'HuS')) + + top_organisms = self.count_primaries.most_common(top) + + for organism, count in top_organisms: + + refseq_group, family, identifiers,\ + avg_mapping_score, avg_seq_length, avg_number_hits,\ + avg_alignment_score, avg_nr_mismatches, human_prim, human_sec\ + = self.detailed_information[organism] + + avg_len = int(avg_seq_length[0] / float(avg_seq_length[1])) + if avg_number_hits[1] == 0: + avg_hits = 0 + else: + avg_hits = int(avg_number_hits[ + 0] / float(avg_number_hits[1])) + + avg_mapping_score = int(avg_mapping_score[ + 0] / float(avg_mapping_score[1])) + + avg_alignment_score = int(avg_alignment_score[ + 0] / float(avg_alignment_score[1])) + + avg_nr_mismatches = int(avg_nr_mismatches[ + 0] / float(avg_nr_mismatches[1])) + + nr_ids = len(identifiers) + + if count > 10**6: + count = str(round(count / float(10**6), 3)) + 'M' + if human_prim > 10**6: + human_prim = str(round(human_prim / float(10**6), 3)) + 'M' + if human_sec > 10**6: + human_sec = str(round(human_sec / float(10**6), 3)) + 'M' + if nr_ids > 10**6: + nr_ids = str(round(nr_ids / float(10**6), 3)) + 'M' + + lines.append('%10s\t%20s\t%20s\t%-20s\t%10s\t%10i\t%10i\t%5i\t%5i\t%5i\t%10s\t%10s\n'\ + % (str(count), refseq_group[:20], family[:20], organism[:20],\ + str(nr_ids), avg_len, avg_hits, avg_mapping_score,\ + avg_alignment_score, avg_nr_mismatches, str(human_prim),\ + str(human_sec))) + + return lines + + def write(self): + + with open(self.file_path, 'w') as output_file: + output_file.writelines(self.get_summary()) + +@CLI.subcommand("rnamap") +class RNAmap(cli.Application): + """ Map input reads against a STAR index """ + + index_dir = cli.SwitchAttr(['-i', '--index_dir'], str, mandatory=True, + help="Sets the index output directory") + + threads = cli.SwitchAttr( + ['-t', '--threads'], cli.Range(1, 512), mandatory=False, + help="Sets the number of threads to use", + default=1) + + taxonomy = cli.SwitchAttr( + ['-x', '--taxonomy'], str, mandatory=False, + help="Output path for the taxonomy file; setting this option will also enable regular taxonomy output to stdout during mapping", + default='') + + star_path = cli.SwitchAttr(['--star_path'], str, mandatory=False, + help="Path to STAR executable", + default='') + + samtools_path = cli.SwitchAttr(['--samtools_path'], str, mandatory=False, + help="Path to samtools executable", + default='') + + temp_path = cli.SwitchAttr(['--temporary_path'], str, mandatory=False, + help="Path to temporary directory in which to generate temp files. All temp files with be automatically deleted after execution is complete.", + default='') + + min_mapping_score = cli.SwitchAttr(['--min_mapping_score'], cli.Range(1, 255), mandatory=False, + help="Mimimum mapping score for saved hits (only applied to -v/--virana_hits)", + default=None) + + min_alignment_score = cli.SwitchAttr(['--min_alignment_score'], cli.Range(1, 255), mandatory=False, + help="Mimimum alignment score for saved hits (only applied to -v/--virana_hits)", + default=None) + + max_mismatches = cli.SwitchAttr(['--max_mismatches'], cli.Range(0, 10000000), mandatory=False, + help="Maximum number of mismatches for saved hits (only applied to -v/--virana_hits)", + default=None) + + max_relative_mismatches = cli.SwitchAttr(['--max_relative_mismatches'], float, mandatory=False, + help="Maximum number of mismatches relative to read length for saved hits (only applied to -v/--virana_hits)", + default=None) + + min_continiously_matching = cli.SwitchAttr(['--min_continiously_matching'], cli.Range(0, 10000000), mandatory=False, + help="Minimum number of continious matches for saved hits (only applied to -v/--virana_hits)", + default=None) + + filter_complexity = cli.Flag(['--filter_complexity'], + help="Discard low-complexity reads (only applied to -v/--virana_hits). Adds some extra processing load to the mapping and may discard important information. Applies to all output files, including quality files (!)", + default=False) + + bam = cli.SwitchAttr(['-b', '--bam'], str, mandatory=False, + help="Path to unsorted, unindexed output BAM file", + default='') + + sam = cli.SwitchAttr(['-s', '--sam'], str, mandatory=False, + help="Path to output SAM file", + default='') + + qual = cli.SwitchAttr(['-q', '--qual'], str, mandatory=False, + help="Path to output quality file", + default='') + + hits = cli.SwitchAttr(['-v', '--virana_hits'], str, mandatory=False, + help="Path to bzip2-compressed tab-delimited output hit file", + default='') + + sample_id = cli.SwitchAttr(['--sample_id'], str, mandatory=False, + help="Alphanumeric string ([0-9a-zA-Z_-]*) used to designate sample information within the hit file", + default='no_sample_id') + + unmapped1 = cli.SwitchAttr(['--unmapped_end_1'], str, mandatory=False, + help="Output path to uncompressed fastq file containing unmapped reads, first ends only for paired ends.", + default='') + + unmapped2 = cli.SwitchAttr(['--unmapped_end_2'], str, mandatory=False, + help="Output path to uncompressed fastq file containing unmapped reads, second ends only for paired ends.", + default='') + + splice_junctions = cli.SwitchAttr(['--splice_junctions'], str, mandatory=False, + help="Input path to splice junction file (currently not implemented)", + default='') + + chimeric_mappings = cli.SwitchAttr(['--chimeric_mappings'], str, mandatory=False, + help="Ouput path to SAM file containing chimeric mappings", + default='') + + hit_filter = cli.SwitchAttr( + ['-f', '--virana_hit_filter'], str, list=True, mandatory=False, + help="Only generate hit groups that include at last one read mapping to a reference of this reference group.", + default=[]) + + debug = cli.Flag(["-d", "--debug"], help="Enable debug information") + + reads = cli.SwitchAttr( + ['-r', '--reads'], str, list=True, mandatory=True, + help="Sets the input reads. Add this parameter twice for paired end reads.") + + zipped = cli.Flag(["-z", "--zipped"], help="Input reads are zipped") + + sensitive = cli.Flag( + ["--sensitive"], help="If given, mapping will process slower and more sensitive") + + def main(self): + + if self.debug: + logging.getLogger().setLevel(logging.DEBUG) + + # Obtain star executable + star = [self.star_path and self.star_path or 'STAR'] + samtools = [self.samtools_path and self.samtools_path or 'samtools'] + + # Check if genome directory is existing + if not os.path.exists(self.index_dir): + sys.stdout.write('Index directory %s not existing, exiting' % self.genome_dir) + sys.exit(1) + + if self.temp_path: + temp_path = tempfile.mkdtemp(dir=self.temp_path) + else: + temp_path = tempfile.mkdtemp() + + first_ends = [] + second_ends = [] + single_ends = [] + + if len(self.reads) == 2: + first, second = self.reads + first_ends.append(first) + second_ends.append(second) + + elif len(self.reads) == 1: + single_ends.append(self.reads[0]) + + else: + sys.stdout.write('Invalid number of fastq files; provide either one (single end) or two (paired end)') + sys.exit(1) + + if single_ends and not first_ends and not second_ends: + reads = [','.join(single_ends)] + + elif first_ends and second_ends: + reads = [','.join(first_ends), ','.join(second_ends)] + + + star_cline = star + ['--runMode', 'alignReads', + '--genomeDir', self.index_dir, + '--runThreadN', str(self.threads), + '--readMatesLengthsIn', 'NotEqual', + '--outFileNamePrefix', os.path.join( + temp_path, 'out'), + '--outSAMmode', 'Full', + '--outSAMstrandField', 'None', + '--outSAMattributes', 'Standard', + '--outSAMunmapped', 'Within', + '--outStd', 'SAM', + '--outFilterMultimapNmax', '1000', + '--outSAMprimaryFlag', 'AllBestScore', + '--outSAMorder', 'PairedKeepInputOrder'] + + if self.unmapped1 or self.unmapped2: + star_cline += ['--outReadsUnmapped', 'Fastx'] + else: + star_cline += ['--outReadsUnmapped', 'None'] + + + if self.zipped: + star_cline += ['--readFilesCommand', 'zcat'] + + if self.sensitive: + star_cline += ['--outFilterMultimapScoreRange', '10', + '--outFilterMismatchNmax', '60', + '--outFilterMismatchNoverLmax', '0.3', + '--outFilterScoreMin', '0', + '--outFilterScoreMinOverLread', '0.3', + '--outFilterMatchNmin', '0', + '--outFilterMatchNminOverLread', '0.66', + '--seedSearchStartLmax', '12', + '--winAnchorMultimapNmax', '50'] + + star_cline += ['--readFilesIn'] + reads + + if self.debug: + print ' '.join(star_cline) + + # Try if we can make the relevant files + touch_files = [self.unmapped1, self.unmapped2, self.taxonomy, self.qual, self.hits, self.sam, self.bam] + for file_path in touch_files: + if file_path is None or file_path == '': + continue + try: + with file(file_path, 'a'): + os.utime(file_path, None) + except IOError: + sys.stderr.write('Could not write output file %s\n' % file_path) + sys.exit(1) + + star_process = subprocess.Popen(' '.join( + star_cline), shell=True, stdout=PIPE) + + parser = SAMParser() + + if self.taxonomy: + taxonomy = SAMTaxonomy(self.taxonomy) + + if self.qual: + quality = SAMQuality(self.qual) + + if self.hits: + hits = SAMHits(self.hits, self.sample_id, self.hit_filter, + self.min_mapping_score, + self.min_alignment_score, + self.max_mismatches, + self.max_relative_mismatches, + self.min_continiously_matching, + self.filter_complexity) + + if self.sam: + sam_file = open(self.sam, 'w') + + if self.bam: + with open(self.bam, 'wb', buffering=100 * 1024 * 1024) as bam_file: + samtools_cline = samtools + [ + 'view', '-b', '-1', '-S', '-@', '4', '/dev/stdin'] + if self.debug: + print ' '.join(samtools_cline) + samtools_process = subprocess.Popen(' '.join(samtools_cline), shell=True, stdout=bam_file, stdin=PIPE) + + + do_sam = self.sam + do_bam = self.bam + do_taxonomy = self.taxonomy + do_qual = self.qual + do_hits = self.hits + do_parse = do_taxonomy or do_qual or do_hits + + for i, line in enumerate(iter(star_process.stdout.readline, '')): + + if do_sam: + sam_file.write(line) + + if do_bam: + samtools_process.stdin.write(line) + + if line[0] == '@': + continue + + if do_parse: + parsed_line = parser.parse(line) + + if do_taxonomy: + taxonomy.count(parsed_line) + if i > 0 and (i % 50000) == 0: + print ''.join(taxonomy.get_summary(10)) + + if do_qual: + quality.count(parsed_line) + + if do_hits: + hits.count(parsed_line) + + if do_bam: + samtools_process.stdin.close() + + if do_sam: + sam_file.close() + + if do_hits: + hits.write() + + if do_taxonomy: + print ''.join(taxonomy.get_summary(10)) + taxonomy.write() + + if do_qual: + quality.write() + + try: + if self.unmapped1: + shutil.move(os.path.join(temp_path, 'out' + 'Unmapped.out.mate1'),\ + self.unmapped1) + except IOError: + pass + + try: + if self.unmapped2: + shutil.move(os.path.join(temp_path, 'out' + 'Unmapped.out.mate2'),\ + self.unmapped2) + except IOError: + pass + + try: + if self.chimeric_mappings: + shutil.move(os.path.join(temp_path, 'out' + 'Chimeric.out.sam'),\ + self.chimeric_mappings) + except IOError: + pass + + shutil.rmtree(temp_path) + +@CLI.subcommand("dnamap") +class DNAmap(cli.Application): + """ Map input reads against a BWA index """ + + index_dir = cli.SwitchAttr(['-i', '--index_dir'], str, mandatory=True, + help="Sets the index output directory") + + threads = cli.SwitchAttr( + ['-t', '--threads'], cli.Range(1, 512), mandatory=False, + help="Sets the number of threads to use", + default=1) + + taxonomy = cli.SwitchAttr( + ['-x', '--taxonomy'], str, mandatory=False, + help="Output path for the taxonomy file; setting this option will also enable regular taxonomy output to stdout during mapping", + default='') + + samtools_path = cli.SwitchAttr(['--samtools_path'], str, mandatory=False, + help="Path to samtools executable", + default='') + + temp_path = cli.SwitchAttr(['--temporary_path'], str, mandatory=False, + help="Path to temporary directory in which to generate temp files. All temp files with be automatically deleted after execution is complete.", + default='') + + min_mapping_score = cli.SwitchAttr(['--min_mapping_score'], cli.Range(1, 255), mandatory=False, + help="Mimimum mapping score for saved hits (only applied to -v/--virana_hits)", + default=None) + + min_alignment_score = cli.SwitchAttr(['--min_alignment_score'], cli.Range(1, 255), mandatory=False, + help="Mimimum alignment score for saved hits (only applied to -v/--virana_hits)", + default=None) + + max_mismatches = cli.SwitchAttr(['--max_mismatches'], cli.Range(0, 10000000), mandatory=False, + help="Maximum number of mismatches for saved hits (only applied to -v/--virana_hits)", + default=None) + + max_relative_mismatches = cli.SwitchAttr(['--max_relative_mismatches'], float, mandatory=False, + help="Maximum number of mismatches relative to read length for saved hits (only applied to -v/--virana_hits)", + default=None) + + min_continiously_matching = cli.SwitchAttr(['--min_continiously_matching'], cli.Range(0, 10000000), mandatory=False, + help="Minimum number of continious matches for saved hits (only applied to -v/--virana_hits)", + default=None) + + filter_complexity = cli.Flag(['--filter_complexity'], + help="Discard low-complexity reads (only applied to -v/--virana_hits). Adds some extra processing load to the mapping and may discard important information. Applies to all output files, including quality files (!)", + default=False) + + sample_id = cli.SwitchAttr(['--sample_id'], str, mandatory=False, + help="Alphanumeric string ([0-9a-zA-Z_-]*) used to designate sample information within the hit file", + default='no_sample_id') + + bam = cli.SwitchAttr(['-b', '--bam'], str, mandatory=False, + help="Path to unsorted, unindexed output BAM file", + default='') + + sam = cli.SwitchAttr(['-s', '--sam'], str, mandatory=False, + help="Path to output SAM file", + default='') + + qual = cli.SwitchAttr(['-q', '--qual'], str, mandatory=False, + help="Path to output quality file", + default='') + + hits = cli.SwitchAttr(['-v', '--virana_hits'], str, mandatory=False, + help="Path to bzip2-compressed tab-delimited output virana hit file", + default='') + + hit_filter = cli.SwitchAttr( + ['-f', '--virana_hit_filter'], str, list=True, mandatory=False, + help="Only generate hit groups that include at last one read mapping to a reference of this reference group.", + default=[]) + + debug = cli.Flag(["-d", "--debug"], help="Enable debug information") + + zipped = cli.Flag(["-z", "--zipped"], help="Input reads are zipped") + + sensitive = cli.Flag( + ["--sensitive"], help="If given, mapping will process slower and more sensitive") + + + bwa_path = cli.SwitchAttr(['--bwa_path'], str, mandatory=False, + help="Path to BWA executable", + default='') + + debug = cli.Flag(["-d", "--debug"], help="Enable debug information") + + if debug: + logging.getLogger().setLevel(logging.DEBUG) + + reads = cli.SwitchAttr( + ['-r', '--reads'], str, list=True, mandatory=True, + help="Sets the input reads. Add this parameter twice for paired end reads.") + + def main(self): + + if self.debug: + logging.getLogger().setLevel(logging.DEBUG) + + # Obtain star executable + bwa = [self.bwa_path and self.bwa_path or 'bwa'] + samtools = [self.samtools_path and self.samtools_path or 'samtools'] + + # Check if genome directory is existing + if not os.path.exists(self.index_dir): + sys.stdout.write('Index directory %s not existing, exiting'\ + % self.genome_dir) + sys.exit(1) + + if len(self.reads) not in (1, 2): + message = 'Invalid number of FASTQ files; supply either one (single end) or two (paired end)\n' + sys.stderr.write(message) + sys.exit(1) + + bwa_cline = bwa + ['mem', '-t', str(self.threads), '-M', os.path.join(self.index_dir, 'index')] + + bwa_cline += self.reads + + if self.debug: + print ' '.join(bwa_cline) + + bwa_process = subprocess.Popen(' '.join(bwa_cline), shell=True, stdout=PIPE) + + parser = SAMParser() + + if self.taxonomy: + taxonomy = SAMTaxonomy(self.taxonomy) + + if self.qual: + quality = SAMQuality(self.qual) + + if self.hits: + hits = SAMHits(self.hits, self.sample_id, self.hit_filter, + self.min_mapping_score, + self.min_alignment_score, + self.max_mismatches, + self.max_relative_mismatches, + self.min_continiously_matching, + self.filter_complexity) + + if self.sam: + sam_file = open(self.sam, 'w', buffering=100 * 1024 * 1024) + + if self.bam: + with open(self.bam, 'wb', buffering=100 * 1024 * 1024) as bam_file: + samtools_cline = samtools + [ + 'view', '-b', '-1', '-S', '-@', '4', '/dev/stdin'] + if self.debug: + print ' '.join(samtools_cline) + samtools_process = subprocess.Popen(' '.join(samtools_cline), shell=True, stdout=bam_file, stdin=PIPE) + + do_sam = self.sam + do_bam = self.bam + do_taxonomy = self.taxonomy + do_qual = self.qual + do_hits = self.hits + do_parse = do_taxonomy or do_qual or do_hits + + for i, line in enumerate(iter(bwa_process.stdout.readline, '')): + + if do_sam: + sam_file.write(line) + + if do_bam: + samtools_process.stdin.write(line) + + if do_parse: + parsed_line = parser.parse(line) + + if do_taxonomy: + taxonomy.count(parsed_line) + + if i > 0 and (i % 10000) == 0: + print ''.join(taxonomy.get_summary(10)) + + if do_qual: + quality.count(parsed_line) + + if do_hits: + hits.count(parsed_line) + + if do_bam: + samtools_process.stdin.close() + + if do_sam: + sam_file.close() + + if do_hits: + hits.write() + + if do_taxonomy: + print ''.join(taxonomy.get_summary(10)) + taxonomy.write() + + if do_qual: + quality.write() + + +if __name__ == "__main__": + CLI.run()
