Mercurial > repos > mzeidler > virana2
view vmap.py @ 53:ae57473347db draft
Uploaded
| author | mzeidler | 
|---|---|
| date | Thu, 26 Sep 2013 07:06:54 -0400 | 
| parents | 3ba5983012cf | 
| children | 
line wrap: on
 line source
#!/usr/bin/env python #from __future__ import print_function import cProfile import sys import re import tempfile import subprocess import shutil import os import os.path import logging import bz2 import zlib import math import string from collections import defaultdict, Counter from subprocess import PIPE NON_ID = ''.join(c for c in map(chr, range(256)) if not c.isalnum()) NON_ID = NON_ID.replace('_', '').replace('-', '') try: from Bio.SeqRecord import SeqRecord from Bio import SeqIO from Bio.Seq import Seq except ImportError: message = 'This script requires the BioPython python package\n' sys.stderr.write(message) sys.exit(1) try: from plumbum import cli except ImportError: message = 'This script requires the plumbum python package\n' sys.stderr.write(message) sys.exit(1) try: import HTSeq except ImportError: message = 'This script requires the HTSeq python package\n' sys.stderr.write(message) sys.exit(1) KHMER_AVAILABLE = True try: import khmer except ImportError: KHMER_AVAILABLE = False #from io import BufferedRandom logging.basicConfig(level=logging.INFO, format='%(message)s') def profile_this(fn): def profiled_fn(*args, **kwargs): fpath = fn.__name__ + ".profile" prof = cProfile.Profile() ret = prof.runcall(fn, *args, **kwargs) prof.dump_stats(fpath) return ret return profiled_fn class CLI(cli.Application): """RNA-Seq and DNA-Seq short read analysis by mapping to known reference sequences""" PROGNAME = "vmap" VERSION = "1.0.0" DESCRIPTION = """Virana vmap is an interface to the NCBI and ensembl reference databases that can generate reference indexes for the short read mappers STAR (RNA-Seq) and BWA-MEM (DNA-Seq). Short reads can be mapped to arbitrary combinations of reference databases and the results can be summarized by taxonomic family as well as stored as SAM file, unsorted BAM file, or as a HIT file that models multimapping reads between specific reference databases.""" USAGE = """The program has four modes that can be accessed by `vmap rnaindex`, `vmap dnaindex`, `vmap rnamap`, and `vmap dnamap.`""" def main(self, *args): if args: print("Unknown command %r" % (args[0])) print self.USAGE return 1 if not self.nested_command: print("No command given") print self.USAGE return 1 @CLI.subcommand("rnaindex") class RNAIndex(cli.Application): """ Creates a STAR index from a FASTA genome reference """ reference_files = cli.SwitchAttr( ['-r', '--reference_file'], str, list=True, mandatory=True, help="Sets the reference genome(s) FASTA file." + " Multiple occurrences of this parameter are allowed.") index_dir = cli.SwitchAttr(['-i', '--index_dir'], str, mandatory=True, help="Sets the index output directory." + " Directory will be generated if not existing." + " Directory will be filled with several index files.") threads = cli.SwitchAttr( ['-t', '--threads'], cli.Range(1, 512), mandatory=False, help="Sets the number of threads to use", default=1) max_ram = cli.SwitchAttr( ['-m'], cli.Range(1, 400000000000), mandatory=False, help="Sets the maximum amount of memory (RAM) to use (in bytes)", default=400000000000) path = cli.SwitchAttr(['-p', '--path'], str, mandatory=False, help="Path to STAR executable", default='') sparse = cli.Flag( ["-s", "--sparse"], help="If given, a sparse index that requires less " + " RAM in the mapping phase will be constructed") debug = cli.Flag(["-d", "--debug"], help="Enable debug output") def main(self): if self.debug: logging.getLogger().setLevel(logging.DEBUG) # Obtain star executable star = [self.path and self.path or 'STAR'] # Check if genome directory is existing for reference_file in self.reference_files: if not os.path.exists(reference_file): sys.stdout.write( 'Reference file %s nor existing, exiting' % reference_file) sys.exit(1) # Check if output directory is existing if not os.path.exists(self.index_dir): logging.debug( 'Making output directory for index at %s' % self.index_dir) os.makedirs(self.index_dir) # # Make named pipe to extract genomes # pipe_path = os.path.abspath(os.path.join(self.genome_dir, 'pipe.fa')) # if os.path.exists(pipe_path): # os.unlink(pipe_path) # os.mkfifo(pipe_path) # Make star command line cline = star + ['--runMode', 'genomeGenerate', '--genomeDir', self.index_dir, '--limitGenomeGenerateRAM', str(self.max_ram), '--runThreadN', str(self.threads), '--genomeFastaFiles'] + self.reference_files # Add parameters for sparse (memory-saving) index generation if self.sparse: cline += ['--genomeSAsparseD', '2', '--genomeChrBinNbits', '12', '--genomeSAindexNbases', '13'] else: cline += ['--genomeSAsparseD', '1', '--genomeChrBinNbits', '18', '--genomeSAindexNbases', '15'] if self.debug: print ' '.join(cline) # Run STAR reference generation process star_process = subprocess.Popen(' '.join(cline), shell=True, stdout=PIPE, stderr=PIPE) # Block until streams are closed by the process stdout, stderr = star_process.communicate() if stderr: sys.stderr.write(stderr) if self.debug and stdout: print stdout @CLI.subcommand("dnaindex") class DNAIndex(cli.Application): """ Creates a BWA index from a FASTA reference file """ reference_file = cli.SwitchAttr(['-r', '--reference_file'], str, mandatory=True, help="Sets the input reference FASTA file.") index_dir = cli.SwitchAttr(['-i', '--index_dir'], str, mandatory=True, help="Sets the index output directory." + " Directory will be generated if not existing." + " Directory will be filled with several index files.") path = cli.SwitchAttr(['-p', '--path'], str, mandatory=False, help="Path to BWA executable", default='') debug = cli.Flag(["-d", "--debug"], help="Enable debug output") if debug: logging.getLogger().setLevel(logging.DEBUG) def main(self): # Obtain star executable bwa = [self.path and self.path or 'bwa'] # Check if genome directory is existing if not os.path.exists(self.reference_file): sys.stdout.write('Genome file %s nor existing, exiting' % self.reference_file) sys.exit(1) # Check if output directory is existing if not os.path.exists(self.index_dir): logging.debug('Making output directory %s' % self.index_dir) os.makedirs(self.index_dir) # Make star command line cline = bwa + ['index', '-a', 'bwtsw', '-p', os.path.join(self.index_dir, 'index'), self.reference_file] if self.debug: print ' '.join(cline) # Run BWA index generation process bwa_process = subprocess.Popen(' '.join(cline), shell=True, stdout=PIPE, stderr=PIPE) stdout, stderr = bwa_process.communicate() if stderr: sys.stderr.write(stderr) if self.debug and stdout: print stdout class SAMHits: """ Converts SAM output of mappers into bzipped HIT files. """ def __init__(self, output_file, sample_id, refseq_filter=None, min_mapping_score=None,\ min_alignment_score=None, max_mismatches=None,\ max_relative_mismatches=None, min_continiously_matching=None,\ filter_complexity=False): self.output_file = bz2.BZ2File(output_file, 'wb', buffering=100 * 1024 * 1024) self.sample_id = sample_id.translate(None, NON_ID) self.refseq_filter = refseq_filter self.max_mismatches = max_mismatches self.max_relative_mismatches = max_relative_mismatches self.current_group = [] self.min_mapping_score = min_mapping_score self.min_alignment_score = min_alignment_score self.min_continiously_matching = min_continiously_matching self.filter_complexity = filter_complexity self.re_matches = re.compile(r'(\d+)M') self.re_dels = re.compile(r'(\d+)D') def count(self, parsed_line): if parsed_line is None: return read_key, read_name, flag, ref_name, ref_position, mapping_score,\ cigar, mate_ref_name, mate_ref_position, insert_size, seq, qual,\ is_end1, is_end2, number_mismatches, alignment_score,\ number_hits, is_reverse, is_primary, is_mapped, is_mate_mapped,\ is_paired, number_matches, read_end_pos, max_match = parsed_line if not is_mapped: return if self.filter_complexity: avg_compression = float(len(zlib.compress(seq)))/len(seq) if avg_compression < 0.5: return # length = len(seq) # counts = [seq.count(nuc) for nuc in 'ACGT'] # min_count = length * 0.10 # max_count = length * 0.50 # for count in counts: # if count < min_count or count > max_count: # return None # counter = Counter() # for i in range(length - 2): # counter[seq[i: i + 3]] += 1 # maximal = length - 4 # highest = sum([v for k, v in counter.most_common(2)]) # if highest > (maximal / 3.0): # return None # self.passed.append(avg_compression) pair_id = '' if is_end1: pair_id = '/1' elif is_end2: pair_id = '/2' read_name = self.sample_id + ';' + read_name + pair_id # Initialize new current group if len(self.current_group) == 0: self.current_group = [read_name, seq, []] # Write old current group to file if read_name != self.current_group[0]: self._write_group() self.current_group = [read_name, seq, []] try: refseq_group, family, organism, identifier = ref_name.split(';')[:4] except ValueError: sys.stderr.write('Read mapped to malformed reference sequence %s, skipping\n' % ref_name) return if self.min_continiously_matching: if self.min_continiously_matching > max_match: return if self.max_mismatches\ and int(number_mismatches) > self.max_mismatches: return if self.max_relative_mismatches\ and int(number_mismatches) / float(len(seq))\ > self.max_relative_mismatches: return if self.min_mapping_score\ and self.min_mapping_score > mapping_score: return if self.min_alignment_score\ and self.min_alignment_score > alignment_score: return start = int(ref_position) + 1 self.current_group[2].append([refseq_group, family, organism, identifier, str(start), str(read_end_pos)]) def _write_group(self): passed = True if self.refseq_filter: passed = False for refseq_group, family, organism, identifier, start, end in self.current_group[2]: if passed: break for f in self.refseq_filter: if refseq_group == f: passed = True break if passed: description = [] for identifier in self.current_group[2]: description.append(';'.join(identifier)) description = '|'.join(description) record = SeqRecord(Seq(self.current_group[1])) record.id = 'Read;' + self.current_group[0] record.description = description SeqIO.write([record], self.output_file, "fasta") def write(self): self._write_group() self.output_file.close() class SAMParser: def parse(self, line): if line[0] == '@': return None alignment = HTSeq._HTSeq.SAM_Alignment.from_SAM_line(line) read_name = alignment.read.name seq = alignment.read.seq qual = alignment.read.qual flag = alignment.flag cigar = None is_paired = (flag & 1) is_mapped = not (flag & 4) is_mate_mapped = alignment.mate_aligned is not None #not (flag & 8) is_reverse = (flag & 16) is_end1 = (flag & 64) is_end2 = (flag & 128) is_primary = not (flag & 256) read_key = (read_name, is_end1) ref_name = None ref_position = None mapping_score = 0 mate_ref_name = None mate_ref_position = None insert_size = None alignment_score = 0 read_end_pos = None if is_mate_mapped and alignment.mate_start: mate_ref_name = alignment.mate_start.chrom mate_ref_position = alignment.mate_start.start number_hits = 0 alignment_score = 0 number_mismatches = 0 number_matches = 0 max_match = 0 if is_mapped: ref_name = alignment.iv.chrom ref_position = alignment.iv.start read_end_pos = alignment.iv.end alignment_score = alignment.aQual cigar = alignment.cigar if is_mate_mapped: insert_size = alignment.inferred_insert_size for c in cigar: if c.type == 'M': number_matches += c.size max_match = max(max_match, c.size) for tag, value in alignment.optional_fields: if tag == 'NM': number_hits = value elif tag == 'AS': alignment_score = value elif tag == 'NH': number_mismatches = value return read_key, read_name, flag, ref_name, ref_position, mapping_score,\ cigar, mate_ref_name, mate_ref_position, insert_size, seq, qual,\ is_end1, is_end2, number_mismatches, alignment_score,\ number_hits, is_reverse, is_primary, is_mapped, is_mate_mapped,\ is_paired, number_matches, read_end_pos, max_match class SAMQuality: def __init__(self, file_path): self.file_path = file_path self.stored = defaultdict(Counter) self.all_references = defaultdict(int) self.primary_references = defaultdict(int) self.complement = string.maketrans('ATCGN', 'TAGCN') if KHMER_AVAILABLE: self.ktable = khmer.new_ktable(10) def _get_complement(self, sequence): return sequence.translate(self.complement)[::-1] def _get_summary(self, counter): """"Returns five numbers (sum, extrema, mean, and std) for a max_frequency counter """ maximum = 0 minimum = sys.maxint thesum = 0 allcount = 0 mode = [0, None] items = 0.0 mean = 0.0 m2 = 0.0 variance = 0.0 for item in counter: count = counter[item] if count > mode[0]: mode = [count, item] allcount += count maximum = max(maximum, item) minimum = min(minimum, item) thesum += (count * item) x = 1 while x <= count: items += 1 delta = item - mean mean = mean + delta / items m2 = m2 + delta * (item - mean) variance = m2 / items x += 1 std = math.sqrt(variance) return allcount, thesum, minimum, maximum, mode[1], mean, std def _to_unit(self, item, is_percentage=False): """ Convert a numeric to a string with metric units """ if is_percentage: return ('%-.3f' % (item * 100)) + '%' converted = None try: item = float(item) if item > 10**12: converted = str(round(item / 10**9,3))+'P' elif item > 10**9: converted = str(round(item / 10**9,3))+'G' elif item > 10**6: converted = str(round(item / 10**6,3))+'M' elif item > 10**3: converted = str(round(item / 10**3,3))+'K' else: converted = str(round(item,3)) except: converted = str(item) return converted def _str_metrics(self, data): str_metrics = [] for (item, metric) in sorted(data.keys()): counter = data[(item, metric)] if not hasattr(counter.iterkeys().next(), 'real'): for element, count in counter.most_common(): str_metrics.append(self._str_metric(item, metric, element, count, no_units=True)) else: summary = self._get_summary(counter) str_metrics.append(self._str_metric(item, metric, *summary)) return str_metrics def _str_metric(self, item, metric, count, thesum='', minimum='',\ maximum='', mode='', mean='', std='', no_units=False): counters = [count, thesum, minimum, maximum, mode, mean, std] counters = map(str, counters) if no_units: items = [item, metric] + counters else: units = map(self._to_unit, counters) items = [item, metric] + units return '%-15s\t%-60s\t%12s\t%12s\t%12s\t%12s\t%12s\t%12s\t%12s\n' \ % tuple(items) def _count_read(self, metric, data, sample): item = 'read' (insert_size, alignment_score, mapping_score, length,\ q20_length, avg_phred_quality, number_hits, is_reverse) = data self.stored[(item, metric + ' mappings')][number_hits] += sample self.stored[(item, metric + ' insert')][insert_size] += sample def _count_segment(self, metric, data, sample): item = 'segment' (insert_size, alignment_score, mapping_score, length,\ q20_length, avg_phred_quality, number_hits, is_reverse) = data self.stored[(item, metric + ' algq')][alignment_score] += sample self.stored[(item, metric + ' mapq')][mapping_score] += sample self.stored[(item, metric + ' length')][length] += sample self.stored[(item, metric + ' q20length')][q20_length] += sample self.stored[(item, metric + ' meanbasequal')][avg_phred_quality] += sample self.stored[(item, metric + ' reverse')][is_reverse] += sample def count(self, parsed_line): if parsed_line is None: return #print_metric('Item' , 'Metric', 'Count', 'Sum', 'Min', 'Max', 'Mode', 'Mean', 'STD') read_key, read_name, flag, ref_name, ref_position, mapping_score,\ cigar, mate_ref_name, mate_ref_position, insert_size, seq, qual,\ is_end1, is_end2, number_mismatches, alignment_score,\ number_hits, is_reverse, is_primary, is_mapped, is_mate_mapped,\ is_paired, number_matches, read_end_pos, max_match = parsed_line phred_quality = [q - 33 for q in qual] avg_phred_quality = sum(phred_quality) / float(len(phred_quality)) length = len(seq) mate_reference_id = mate_ref_name reference_id = ref_name reference = reference_id is not None and reference_id != '*' insert_size = insert_size and abs(insert_size) or insert_size is_segment1 = not is_paired or (is_paired and is_end1) is_reverse = is_reverse is_unique = is_primary and number_hits == 1 is_translocation = is_paired and is_mapped and is_mate_mapped\ and (mate_reference_id != '=' and reference_id != mate_reference_id) is_part_of_doublemap = is_paired and is_mapped and is_mate_mapped is_part_of_halfmap = is_paired and (is_mapped != is_mate_mapped) is_part_of_nomap = is_paired and not is_mapped and not is_mate_mapped # Count length until first low quality base call q20_length = 0 for q in phred_quality: if q < 20: break q20_length += 1 # Count kmers if KHMER_AVAILABLE: if not is_reverse: self.ktable.consume(seq) else: self.ktable.consume(self._get_complement(seq)) if reference: self.all_references[reference_id] += 1 if is_primary: self.primary_references[reference_id] += 1 data = (insert_size, alignment_score, mapping_score, length,\ q20_length, avg_phred_quality, number_hits, is_reverse) sample = 1 self._count_segment('sequenced', data, sample) if is_mapped: self._count_segment('sequenced mapped multi', data, sample) if is_primary: self._count_segment('sequenced mapped primary', data, sample) if number_hits and is_unique: self._count_segment('sequenced mapped primary unique', data, sample) if is_segment1: self._count_read('sequenced mapped multi', data, sample) if is_primary: self._count_read('sequenced mapped primary', data, sample) if is_paired: self._count_segment('sequenced paired', data, sample) if is_part_of_doublemap: self._count_segment('sequenced paired doublemap', data, sample) if is_primary: self._count_segment('sequenced paired doublemap primary', data, sample) if is_segment1: self._count_read('sequenced paired doublemap multi', data, sample) if is_primary: self._count_read('sequenced paired doublemap primary', data, sample) if number_hits and is_unique: self._count_read('sequenced paired doublemap primary unique', data, sample) if is_translocation: self._count_read('sequenced paired doublemap primary unique translocation', data, sample) elif is_part_of_halfmap: self._count_segment('sequenced paired halfmap', data, sample) # The mapped segment if is_mapped: self._count_segment('sequenced paired halfmap mapped', data, sample) if is_primary: self._count_read('sequenced paired halfmap mapped primary', data, sample) if number_hits and is_unique: self._count_read('sequenced paired halfmap mapped primary unique', data, sample) elif not is_primary: self._count_read('sequenced unpaired mapped multi', data, sample) # The unmapped segment elif not is_mapped: self._count_segment('sequenced paired halfmap unmapped', data, sample) elif is_part_of_nomap: self._count_segment('sequenced paired nomap', data, sample) if is_segment1: self._count_read('sequenced paired nomap', data, sample) elif not is_paired: self._count_segment('sequenced unpaired', data, sample) if is_mapped: self._count_segment('sequenced unpaired mapped', data, sample) if is_primary: self._count_read('sequenced unpaired mapped primary', data, sample) if number_hits and is_unique: self._count_read('sequenced paired unpaired mapped primary unique', data, sample) elif not is_primary: self._count_read('sequenced unpaired mapped multi', data, sample) elif not is_mapped: self._count_segment('sequenced unpaired unmapped', data, sample) if is_segment1: self._count_read('sequenced unpaired unmapped', data, sample) def write(self): with open(self.file_path, 'w') as output_file: all_references = sorted([(count, reference) for reference, count\ in self.all_references.iteritems()], reverse=True) for j, (count, reference) in enumerate(all_references[:30]): self.stored[('segment', 'multireference_' + str(j+1))][reference] = count primary_references = sorted([(count, reference) for reference, count\ in self.primary_references.iteritems()], reverse=True) for j, (count, reference) in enumerate(primary_references[:30]): self.stored[('segment', 'primaryreference_' + str(j+1))][reference] = count # Extract top-ranking kmers if KHMER_AVAILABLE: kmer_frequencies = [] for i in range(0, self.ktable.n_entries()): n = self.ktable.get(i) if n > 0: kmer_frequencies.append((n, self.ktable.reverse_hash(i))) kmer_frequencies = sorted(kmer_frequencies, reverse=True) for j, (frequency, kmer) in enumerate(kmer_frequencies[:10]): self.stored[('segment', 'kmer_' + str(j+1))][kmer] = frequency output_file.writelines(self._str_metrics(self.stored)) class SAMTaxonomy: """ Provides taxonomic summary information from a SAM file stream. """ def __init__(self, file_path): self.file_path = file_path self.count_primaries = Counter() self.detailed_information = {} self._last_read = (None, None) self._last_read_human_prim = 0 self._last_read_human_sec = 0 self._last_organisms = set() def count(self, parsed_line): if parsed_line is None: return read_key, read_name, flag, ref_name, ref_position, mapping_score,\ cigar, mate_ref_name, mate_ref_position, insert_size, seq, qual,\ is_end1, is_end2, number_mismatches, alignment_score,\ number_hits, is_reverse, is_primary, is_mapped, is_mate_mapped,\ is_paired, number_matches, read_end_pos, max_match = parsed_line if is_mapped: refseq_group, family, organism, gi = ref_name.split(';')[:4] if is_primary: self.count_primaries[organism] += 1 if organism not in self.detailed_information: # refseq_group. family, gis, avg_mapping_score, # avg_seq_length, avg_number_hits, avg_alignment_score, avg_nr_mismatches initial = [refseq_group, family, set([gi]), [int(mapping_score), 1], [len(seq), 1], [0, 0], [alignment_score, 1], [number_mismatches, 1], 0, 0] self.detailed_information[organism] = initial else: entry = self.detailed_information[organism] entry[2].add(gi) entry[3][0] += int(mapping_score) entry[3][1] += 1 entry[4][0] += len(seq) entry[4][1] += 1 entry[6][0] += alignment_score entry[6][1] += 1 entry[7][0] += number_mismatches entry[7][1] += 1 if is_primary: entry = self.detailed_information[organism] entry[5][0] += number_hits entry[5][1] += 1 if self._last_read == (None, None): self._last_read = read_key if self._last_read != read_key: for last_organism in self._last_organisms: self.detailed_information[last_organism][8]\ += self._last_read_human_prim self.detailed_information[last_organism][9]\ += self._last_read_human_sec self._last_read = read_key self._last_organisms = set() self._last_read_human_prim = 0 self._last_read_human_sec = 0 self._last_organisms.add(organism) if organism == 'Homo_sapiens': if is_primary: self._last_read_human_prim += 1 else: self._last_read_human_sec += 1 def get_summary(self, top=100): lines = [] lines.append('%10s\t%20s\t%20s\t%-20s\t%10s\t%10s\t%10s\t%5s\t%5s\t%5s\t%10s\t%10s\n'\ % ('Count', 'Group', 'Family', 'Organism', 'Targets', 'ReadLen', 'Hits', 'Map', 'Algn', 'Mism', 'HuP', 'HuS')) top_organisms = self.count_primaries.most_common(top) for organism, count in top_organisms: refseq_group, family, identifiers,\ avg_mapping_score, avg_seq_length, avg_number_hits,\ avg_alignment_score, avg_nr_mismatches, human_prim, human_sec\ = self.detailed_information[organism] avg_len = int(avg_seq_length[0] / float(avg_seq_length[1])) if avg_number_hits[1] == 0: avg_hits = 0 else: avg_hits = int(avg_number_hits[ 0] / float(avg_number_hits[1])) avg_mapping_score = int(avg_mapping_score[ 0] / float(avg_mapping_score[1])) avg_alignment_score = int(avg_alignment_score[ 0] / float(avg_alignment_score[1])) avg_nr_mismatches = int(avg_nr_mismatches[ 0] / float(avg_nr_mismatches[1])) nr_ids = len(identifiers) if count > 10**6: count = str(round(count / float(10**6), 3)) + 'M' if human_prim > 10**6: human_prim = str(round(human_prim / float(10**6), 3)) + 'M' if human_sec > 10**6: human_sec = str(round(human_sec / float(10**6), 3)) + 'M' if nr_ids > 10**6: nr_ids = str(round(nr_ids / float(10**6), 3)) + 'M' lines.append('%10s\t%20s\t%20s\t%-20s\t%10s\t%10i\t%10i\t%5i\t%5i\t%5i\t%10s\t%10s\n'\ % (str(count), refseq_group[:20], family[:20], organism[:20],\ str(nr_ids), avg_len, avg_hits, avg_mapping_score,\ avg_alignment_score, avg_nr_mismatches, str(human_prim),\ str(human_sec))) return lines def write(self): with open(self.file_path, 'w') as output_file: output_file.writelines(self.get_summary()) @CLI.subcommand("rnamap") class RNAmap(cli.Application): """ Map input reads against a STAR index """ index_dir = cli.SwitchAttr(['-i', '--index_dir'], str, mandatory=True, help="Sets the index output directory") threads = cli.SwitchAttr( ['-t', '--threads'], cli.Range(1, 512), mandatory=False, help="Sets the number of threads to use", default=1) taxonomy = cli.SwitchAttr( ['-x', '--taxonomy'], str, mandatory=False, help="Output path for the taxonomy file; setting this option will also enable regular taxonomy output to stdout during mapping", default='') star_path = cli.SwitchAttr(['--star_path'], str, mandatory=False, help="Path to STAR executable", default='') samtools_path = cli.SwitchAttr(['--samtools_path'], str, mandatory=False, help="Path to samtools executable", default='') temp_path = cli.SwitchAttr(['--temporary_path'], str, mandatory=False, help="Path to temporary directory in which to generate temp files. All temp files with be automatically deleted after execution is complete.", default='') min_mapping_score = cli.SwitchAttr(['--min_mapping_score'], cli.Range(1, 255), mandatory=False, help="Mimimum mapping score for saved hits (only applied to -v/--virana_hits)", default=None) min_alignment_score = cli.SwitchAttr(['--min_alignment_score'], cli.Range(1, 255), mandatory=False, help="Mimimum alignment score for saved hits (only applied to -v/--virana_hits)", default=None) max_mismatches = cli.SwitchAttr(['--max_mismatches'], cli.Range(0, 10000000), mandatory=False, help="Maximum number of mismatches for saved hits (only applied to -v/--virana_hits)", default=None) max_relative_mismatches = cli.SwitchAttr(['--max_relative_mismatches'], float, mandatory=False, help="Maximum number of mismatches relative to read length for saved hits (only applied to -v/--virana_hits)", default=None) min_continiously_matching = cli.SwitchAttr(['--min_continiously_matching'], cli.Range(0, 10000000), mandatory=False, help="Minimum number of continious matches for saved hits (only applied to -v/--virana_hits)", default=None) filter_complexity = cli.Flag(['--filter_complexity'], help="Discard low-complexity reads (only applied to -v/--virana_hits). Adds some extra processing load to the mapping and may discard important information. Applies to all output files, including quality files (!)", default=False) bam = cli.SwitchAttr(['-b', '--bam'], str, mandatory=False, help="Path to unsorted, unindexed output BAM file", default='') sam = cli.SwitchAttr(['-s', '--sam'], str, mandatory=False, help="Path to output SAM file", default='') qual = cli.SwitchAttr(['-q', '--qual'], str, mandatory=False, help="Path to output quality file", default='') hits = cli.SwitchAttr(['-v', '--virana_hits'], str, mandatory=False, help="Path to bzip2-compressed tab-delimited output hit file", default='') sample_id = cli.SwitchAttr(['--sample_id'], str, mandatory=False, help="Alphanumeric string ([0-9a-zA-Z_-]*) used to designate sample information within the hit file", default='no_sample_id') unmapped1 = cli.SwitchAttr(['--unmapped_end_1'], str, mandatory=False, help="Output path to uncompressed fastq file containing unmapped reads, first ends only for paired ends.", default='') unmapped2 = cli.SwitchAttr(['--unmapped_end_2'], str, mandatory=False, help="Output path to uncompressed fastq file containing unmapped reads, second ends only for paired ends.", default='') splice_junctions = cli.SwitchAttr(['--splice_junctions'], str, mandatory=False, help="Input path to splice junction file (currently not implemented)", default='') chimeric_mappings = cli.SwitchAttr(['--chimeric_mappings'], str, mandatory=False, help="Ouput path to SAM file containing chimeric mappings", default='') hit_filter = cli.SwitchAttr( ['-f', '--virana_hit_filter'], str, list=True, mandatory=False, help="Only generate hit groups that include at last one read mapping to a reference of this reference group.", default=[]) debug = cli.Flag(["-d", "--debug"], help="Enable debug information") reads = cli.SwitchAttr( ['-r', '--reads'], str, list=True, mandatory=True, help="Sets the input reads. Add this parameter twice for paired end reads.") zipped = cli.Flag(["-z", "--zipped"], help="Input reads are zipped") sensitive = cli.Flag( ["--sensitive"], help="If given, mapping will process slower and more sensitive") def main(self): if self.debug: logging.getLogger().setLevel(logging.DEBUG) # Obtain star executable star = [self.star_path and self.star_path or 'STAR'] samtools = [self.samtools_path and self.samtools_path or 'samtools'] # Check if genome directory is existing if not os.path.exists(self.index_dir): sys.stdout.write('Index directory %s not existing, exiting' % self.genome_dir) sys.exit(1) if self.temp_path: temp_path = tempfile.mkdtemp(dir=self.temp_path) else: temp_path = tempfile.mkdtemp() first_ends = [] second_ends = [] single_ends = [] if len(self.reads) == 2: first, second = self.reads first_ends.append(first) second_ends.append(second) elif len(self.reads) == 1: single_ends.append(self.reads[0]) else: sys.stdout.write('Invalid number of fastq files; provide either one (single end) or two (paired end)') sys.exit(1) if single_ends and not first_ends and not second_ends: reads = [','.join(single_ends)] elif first_ends and second_ends: reads = [','.join(first_ends), ','.join(second_ends)] star_cline = star + ['--runMode', 'alignReads', '--genomeDir', self.index_dir, '--runThreadN', str(self.threads), '--readMatesLengthsIn', 'NotEqual', '--outFileNamePrefix', os.path.join( temp_path, 'out'), '--outSAMmode', 'Full', '--outSAMstrandField', 'None', '--outSAMattributes', 'Standard', '--outSAMunmapped', 'Within', '--outStd', 'SAM', '--outFilterMultimapNmax', '1000', '--outSAMprimaryFlag', 'AllBestScore', '--outSAMorder', 'PairedKeepInputOrder'] if self.unmapped1 or self.unmapped2: star_cline += ['--outReadsUnmapped', 'Fastx'] else: star_cline += ['--outReadsUnmapped', 'None'] if self.zipped: star_cline += ['--readFilesCommand', 'zcat'] if self.sensitive: star_cline += ['--outFilterMultimapScoreRange', '10', '--outFilterMismatchNmax', '60', '--outFilterMismatchNoverLmax', '0.3', '--outFilterScoreMin', '0', '--outFilterScoreMinOverLread', '0.3', '--outFilterMatchNmin', '0', '--outFilterMatchNminOverLread', '0.66', '--seedSearchStartLmax', '12', '--winAnchorMultimapNmax', '50'] star_cline += ['--readFilesIn'] + reads if self.debug: print ' '.join(star_cline) # Try if we can make the relevant files touch_files = [self.unmapped1, self.unmapped2, self.taxonomy, self.qual, self.hits, self.sam, self.bam] for file_path in touch_files: if file_path is None or file_path == '': continue try: with file(file_path, 'a'): os.utime(file_path, None) except IOError: sys.stderr.write('Could not write output file %s\n' % file_path) sys.exit(1) star_process = subprocess.Popen(' '.join( star_cline), shell=True, stdout=PIPE) parser = SAMParser() if self.taxonomy: taxonomy = SAMTaxonomy(self.taxonomy) if self.qual: quality = SAMQuality(self.qual) if self.hits: hits = SAMHits(self.hits, self.sample_id, self.hit_filter, self.min_mapping_score, self.min_alignment_score, self.max_mismatches, self.max_relative_mismatches, self.min_continiously_matching, self.filter_complexity) if self.sam: sam_file = open(self.sam, 'w') if self.bam: with open(self.bam, 'wb', buffering=100 * 1024 * 1024) as bam_file: samtools_cline = samtools + [ 'view', '-b', '-1', '-S', '-@', '4', '/dev/stdin'] if self.debug: print ' '.join(samtools_cline) samtools_process = subprocess.Popen(' '.join(samtools_cline), shell=True, stdout=bam_file, stdin=PIPE) do_sam = self.sam do_bam = self.bam do_taxonomy = self.taxonomy do_qual = self.qual do_hits = self.hits do_parse = do_taxonomy or do_qual or do_hits for i, line in enumerate(iter(star_process.stdout.readline, '')): if do_sam: sam_file.write(line) if do_bam: samtools_process.stdin.write(line) if line[0] == '@': continue if do_parse: parsed_line = parser.parse(line) if do_taxonomy: taxonomy.count(parsed_line) if i > 0 and (i % 50000) == 0: print ''.join(taxonomy.get_summary(10)) if do_qual: quality.count(parsed_line) if do_hits: hits.count(parsed_line) if do_bam: samtools_process.stdin.close() if do_sam: sam_file.close() if do_hits: hits.write() if do_taxonomy: print ''.join(taxonomy.get_summary(10)) taxonomy.write() if do_qual: quality.write() try: if self.unmapped1: shutil.move(os.path.join(temp_path, 'out' + 'Unmapped.out.mate1'),\ self.unmapped1) except IOError: pass try: if self.unmapped2: shutil.move(os.path.join(temp_path, 'out' + 'Unmapped.out.mate2'),\ self.unmapped2) except IOError: pass try: if self.chimeric_mappings: shutil.move(os.path.join(temp_path, 'out' + 'Chimeric.out.sam'),\ self.chimeric_mappings) except IOError: pass shutil.rmtree(temp_path) @CLI.subcommand("dnamap") class DNAmap(cli.Application): """ Map input reads against a BWA index """ index_dir = cli.SwitchAttr(['-i', '--index_dir'], str, mandatory=True, help="Sets the index output directory") threads = cli.SwitchAttr( ['-t', '--threads'], cli.Range(1, 512), mandatory=False, help="Sets the number of threads to use", default=1) taxonomy = cli.SwitchAttr( ['-x', '--taxonomy'], str, mandatory=False, help="Output path for the taxonomy file; setting this option will also enable regular taxonomy output to stdout during mapping", default='') samtools_path = cli.SwitchAttr(['--samtools_path'], str, mandatory=False, help="Path to samtools executable", default='') temp_path = cli.SwitchAttr(['--temporary_path'], str, mandatory=False, help="Path to temporary directory in which to generate temp files. All temp files with be automatically deleted after execution is complete.", default='') min_mapping_score = cli.SwitchAttr(['--min_mapping_score'], cli.Range(1, 255), mandatory=False, help="Mimimum mapping score for saved hits (only applied to -v/--virana_hits)", default=None) min_alignment_score = cli.SwitchAttr(['--min_alignment_score'], cli.Range(1, 255), mandatory=False, help="Mimimum alignment score for saved hits (only applied to -v/--virana_hits)", default=None) max_mismatches = cli.SwitchAttr(['--max_mismatches'], cli.Range(0, 10000000), mandatory=False, help="Maximum number of mismatches for saved hits (only applied to -v/--virana_hits)", default=None) max_relative_mismatches = cli.SwitchAttr(['--max_relative_mismatches'], float, mandatory=False, help="Maximum number of mismatches relative to read length for saved hits (only applied to -v/--virana_hits)", default=None) min_continiously_matching = cli.SwitchAttr(['--min_continiously_matching'], cli.Range(0, 10000000), mandatory=False, help="Minimum number of continious matches for saved hits (only applied to -v/--virana_hits)", default=None) filter_complexity = cli.Flag(['--filter_complexity'], help="Discard low-complexity reads (only applied to -v/--virana_hits). Adds some extra processing load to the mapping and may discard important information. Applies to all output files, including quality files (!)", default=False) sample_id = cli.SwitchAttr(['--sample_id'], str, mandatory=False, help="Alphanumeric string ([0-9a-zA-Z_-]*) used to designate sample information within the hit file", default='no_sample_id') bam = cli.SwitchAttr(['-b', '--bam'], str, mandatory=False, help="Path to unsorted, unindexed output BAM file", default='') sam = cli.SwitchAttr(['-s', '--sam'], str, mandatory=False, help="Path to output SAM file", default='') qual = cli.SwitchAttr(['-q', '--qual'], str, mandatory=False, help="Path to output quality file", default='') hits = cli.SwitchAttr(['-v', '--virana_hits'], str, mandatory=False, help="Path to bzip2-compressed tab-delimited output virana hit file", default='') hit_filter = cli.SwitchAttr( ['-f', '--virana_hit_filter'], str, list=True, mandatory=False, help="Only generate hit groups that include at last one read mapping to a reference of this reference group.", default=[]) debug = cli.Flag(["-d", "--debug"], help="Enable debug information") zipped = cli.Flag(["-z", "--zipped"], help="Input reads are zipped") sensitive = cli.Flag( ["--sensitive"], help="If given, mapping will process slower and more sensitive") bwa_path = cli.SwitchAttr(['--bwa_path'], str, mandatory=False, help="Path to BWA executable", default='') debug = cli.Flag(["-d", "--debug"], help="Enable debug information") if debug: logging.getLogger().setLevel(logging.DEBUG) reads = cli.SwitchAttr( ['-r', '--reads'], str, list=True, mandatory=True, help="Sets the input reads. Add this parameter twice for paired end reads.") def main(self): if self.debug: logging.getLogger().setLevel(logging.DEBUG) # Obtain star executable bwa = [self.bwa_path and self.bwa_path or 'bwa'] samtools = [self.samtools_path and self.samtools_path or 'samtools'] # Check if genome directory is existing if not os.path.exists(self.index_dir): sys.stdout.write('Index directory %s not existing, exiting'\ % self.genome_dir) sys.exit(1) if len(self.reads) not in (1, 2): message = 'Invalid number of FASTQ files; supply either one (single end) or two (paired end)\n' sys.stderr.write(message) sys.exit(1) bwa_cline = bwa + ['mem', '-t', str(self.threads), '-M', os.path.join(self.index_dir, 'index')] bwa_cline += self.reads if self.debug: print ' '.join(bwa_cline) bwa_process = subprocess.Popen(' '.join(bwa_cline), shell=True, stdout=PIPE) parser = SAMParser() if self.taxonomy: taxonomy = SAMTaxonomy(self.taxonomy) if self.qual: quality = SAMQuality(self.qual) if self.hits: hits = SAMHits(self.hits, self.sample_id, self.hit_filter, self.min_mapping_score, self.min_alignment_score, self.max_mismatches, self.max_relative_mismatches, self.min_continiously_matching, self.filter_complexity) if self.sam: sam_file = open(self.sam, 'w', buffering=100 * 1024 * 1024) if self.bam: with open(self.bam, 'wb', buffering=100 * 1024 * 1024) as bam_file: samtools_cline = samtools + [ 'view', '-b', '-1', '-S', '-@', '4', '/dev/stdin'] if self.debug: print ' '.join(samtools_cline) samtools_process = subprocess.Popen(' '.join(samtools_cline), shell=True, stdout=bam_file, stdin=PIPE) do_sam = self.sam do_bam = self.bam do_taxonomy = self.taxonomy do_qual = self.qual do_hits = self.hits do_parse = do_taxonomy or do_qual or do_hits for i, line in enumerate(iter(bwa_process.stdout.readline, '')): if do_sam: sam_file.write(line) if do_bam: samtools_process.stdin.write(line) if do_parse: parsed_line = parser.parse(line) if do_taxonomy: taxonomy.count(parsed_line) if i > 0 and (i % 10000) == 0: print ''.join(taxonomy.get_summary(10)) if do_qual: quality.count(parsed_line) if do_hits: hits.count(parsed_line) if do_bam: samtools_process.stdin.close() if do_sam: sam_file.close() if do_hits: hits.write() if do_taxonomy: print ''.join(taxonomy.get_summary(10)) taxonomy.write() if do_qual: quality.write() if __name__ == "__main__": CLI.run()
