Mercurial > repos > mzeidler > virana_main
comparison vmap.py @ 0:f63d639b223c draft
Initial Upload
author | mzeidler |
---|---|
date | Wed, 25 Sep 2013 11:41:16 -0400 |
parents | |
children | be5d3889a1b9 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:f63d639b223c |
---|---|
1 #!/usr/bin/env python | |
2 #from __future__ import print_function | |
3 | |
4 import cProfile | |
5 | |
6 import sys | |
7 import re | |
8 | |
9 import tempfile | |
10 import subprocess | |
11 import shutil | |
12 import os | |
13 import os.path | |
14 import logging | |
15 import bz2 | |
16 import zlib | |
17 | |
18 import math | |
19 import string | |
20 | |
21 from collections import defaultdict, Counter | |
22 | |
23 from subprocess import PIPE | |
24 | |
25 NON_ID = ''.join(c for c in map(chr, range(256)) if not c.isalnum()) | |
26 NON_ID = NON_ID.replace('_', '').replace('-', '') | |
27 | |
28 try: | |
29 from Bio.SeqRecord import SeqRecord | |
30 from Bio import SeqIO | |
31 from Bio.Seq import Seq | |
32 except ImportError: | |
33 message = 'This script requires the BioPython python package\n' | |
34 sys.stderr.write(message) | |
35 sys.exit(1) | |
36 | |
37 try: | |
38 from plumbum import cli | |
39 except ImportError: | |
40 message = 'This script requires the plumbum python package\n' | |
41 sys.stderr.write(message) | |
42 sys.exit(1) | |
43 | |
44 | |
45 try: | |
46 import HTSeq | |
47 except ImportError: | |
48 message = 'This script requires the HTSeq python package\n' | |
49 sys.stderr.write(message) | |
50 sys.exit(1) | |
51 | |
52 KHMER_AVAILABLE = True | |
53 try: | |
54 import khmer | |
55 except ImportError: | |
56 KHMER_AVAILABLE = False | |
57 | |
58 #from io import BufferedRandom | |
59 | |
60 logging.basicConfig(level=logging.INFO, format='%(message)s') | |
61 | |
62 | |
def profile_this(fn):
    """Decorator: profile every call of *fn* with cProfile and dump the
    collected statistics to a file named '<function name>.profile' in the
    current working directory. The wrapped function's return value is
    passed through unchanged."""
    def wrapper(*args, **kwargs):
        profiler = cProfile.Profile()
        result = profiler.runcall(fn, *args, **kwargs)
        profiler.dump_stats(fn.__name__ + ".profile")
        return result
    return wrapper
71 | |
72 | |
class CLI(cli.Application):
    """RNA-Seq and DNA-Seq short read analysis by mapping to known reference sequences"""
    PROGNAME = "vmap"
    VERSION = "1.0.0"
    DESCRIPTION = """Virana vmap is an interface to the NCBI and ensembl reference databases that can
                  generate reference indexes for the short read mappers STAR (RNA-Seq) and
                  BWA-MEM (DNA-Seq). Short reads can be mapped to arbitrary combinations of
                  reference databases and the results can be summarized by taxonomic family
                  as well as stored as SAM file, unsorted BAM file, or as a HIT file that
                  models multimapping reads between specific reference databases."""
    USAGE = """The program has four modes that can be accessed by `vmap rnaindex`, `vmap dnaindex`, `vmap rnamap`, and `vmap dnamap.`"""

    def main(self, *args):
        """Top-level dispatcher: rejects unknown commands and requires a
        nested subcommand; returns 1 on usage errors.

        FIX: the usage messages were printed with Python-2-only print
        statements (`print self.USAGE`), inconsistent with the parenthesized
        print calls used in the same method; normalized to call form.
        """
        if args:
            # First positional token did not match any registered subcommand.
            print("Unknown command %r" % (args[0]))
            print(self.USAGE)
            return 1

        if not self.nested_command:
            # Invoked without any subcommand at all.
            print("No command given")
            print(self.USAGE)
            return 1
96 | |
97 | |
@CLI.subcommand("rnaindex")
class RNAIndex(cli.Application):
    """ Creates a STAR index from a FASTA genome reference """

    reference_files = cli.SwitchAttr(
        ['-r', '--reference_file'], str, list=True, mandatory=True,
        help="Sets the reference genome(s) FASTA file." +
        " Multiple occurrences of this parameter are allowed.")
    index_dir = cli.SwitchAttr(['-i', '--index_dir'], str, mandatory=True,
                               help="Sets the index output directory." +
                               " Directory will be generated if not existing." +
                               " Directory will be filled with several index files.")
    threads = cli.SwitchAttr(
        ['-t', '--threads'], cli.Range(1, 512), mandatory=False,
        help="Sets the number of threads to use",
        default=1)
    max_ram = cli.SwitchAttr(
        ['-m'], cli.Range(1, 400000000000), mandatory=False,
        help="Sets the maximum amount of memory (RAM) to use (in bytes)",
        default=400000000000)
    path = cli.SwitchAttr(['-p', '--path'], str, mandatory=False,
                          help="Path to STAR executable",
                          default='')
    sparse = cli.Flag(
        ["-s", "--sparse"], help="If given, a sparse index that requires less " +
        " RAM in the mapping phase will be constructed")

    debug = cli.Flag(["-d", "--debug"], help="Enable debug output")

    def main(self):
        """Builds the STAR genome index from the configured reference files.

        Fixes relative to the previous version:
          - error message typo ('nor existing' -> 'not existing')
          - Python-2-only print statements replaced by print() calls
          - STAR is invoked with an argument list instead of a shell-joined
            string, so paths containing spaces or shell metacharacters are
            passed through safely.
        """
        if self.debug:
            logging.getLogger().setLevel(logging.DEBUG)

        # Obtain STAR executable; an explicit --path wins over PATH lookup.
        star = [self.path or 'STAR']

        # Validate that every reference FASTA exists before doing any work.
        for reference_file in self.reference_files:
            if not os.path.exists(reference_file):
                sys.stdout.write(
                    'Reference file %s not existing, exiting' % reference_file)
                sys.exit(1)

        # Create the index output directory if needed.
        if not os.path.exists(self.index_dir):
            logging.debug(
                'Making output directory for index at %s' % self.index_dir)
            os.makedirs(self.index_dir)

        # Assemble the STAR command line.
        cline = star + ['--runMode', 'genomeGenerate',
                        '--genomeDir', self.index_dir,
                        '--limitGenomeGenerateRAM', str(self.max_ram),
                        '--runThreadN', str(self.threads),
                        '--genomeFastaFiles'] + self.reference_files

        # Add parameters for sparse (memory-saving) index generation.
        if self.sparse:
            cline += ['--genomeSAsparseD', '2',
                      '--genomeChrBinNbits', '12',
                      '--genomeSAindexNbases', '13']
        else:
            cline += ['--genomeSAsparseD', '1',
                      '--genomeChrBinNbits', '18',
                      '--genomeSAindexNbases', '15']

        if self.debug:
            print(' '.join(cline))

        # Run STAR reference generation process (argument list, no shell).
        star_process = subprocess.Popen(cline, stdout=PIPE, stderr=PIPE)

        # Block until streams are closed by the process.
        stdout, stderr = star_process.communicate()

        if stderr:
            sys.stderr.write(stderr)

        if self.debug and stdout:
            print(stdout)
186 | |
187 | |
@CLI.subcommand("dnaindex")
class DNAIndex(cli.Application):
    """ Creates a BWA index from a FASTA reference file """

    reference_file = cli.SwitchAttr(['-r', '--reference_file'], str, mandatory=True,
                                    help="Sets the input reference FASTA file.")
    index_dir = cli.SwitchAttr(['-i', '--index_dir'], str, mandatory=True,
                               help="Sets the index output directory." +
                               " Directory will be generated if not existing." +
                               " Directory will be filled with several index files.")
    path = cli.SwitchAttr(['-p', '--path'], str, mandatory=False,
                          help="Path to BWA executable",
                          default='')
    debug = cli.Flag(["-d", "--debug"], help="Enable debug output")

    def main(self):
        """Builds a BWA (bwtsw) index from the reference FASTA.

        BUGFIX: the debug-level check used to live in the class body, where
        it executed once at import time against the cli.Flag descriptor
        instead of the parsed flag value; it now runs per invocation. Also
        fixes the 'nor existing' message typo, Python-2 print statements,
        and invokes BWA with an argument list instead of a shell string so
        paths with spaces are handled safely.
        """
        if self.debug:
            logging.getLogger().setLevel(logging.DEBUG)

        # Obtain BWA executable; an explicit --path wins over PATH lookup.
        bwa = [self.path or 'bwa']

        # Validate the input reference file.
        if not os.path.exists(self.reference_file):
            sys.stdout.write('Genome file %s not existing, exiting' % self.reference_file)
            sys.exit(1)

        # Create the index output directory if needed.
        if not os.path.exists(self.index_dir):
            logging.debug('Making output directory %s' % self.index_dir)
            os.makedirs(self.index_dir)

        # Assemble the BWA command line.
        cline = bwa + ['index', '-a', 'bwtsw', '-p', os.path.join(self.index_dir, 'index'), self.reference_file]

        if self.debug:
            print(' '.join(cline))

        # Run BWA index generation process (argument list, no shell).
        bwa_process = subprocess.Popen(cline, stdout=PIPE, stderr=PIPE)
        stdout, stderr = bwa_process.communicate()

        if stderr:
            sys.stderr.write(stderr)

        if self.debug and stdout:
            print(stdout)
236 | |
237 | |
238 | |
239 | |
240 | |
class SAMHits:
    """ Converts SAM output of mappers into bzipped HIT files.

    A HIT file here is a bz2-compressed FASTA stream: one record per read,
    where the record description lists every reference location the read
    mapped to as pipe-separated
    'refseq_group;family;organism;identifier;start;end' entries. Records are
    flushed as soon as the SAM stream moves on to the next read name, so the
    input is expected to be grouped by read name (mapper output order).

    NOTE(review): relies on Python 2 semantics (str.translate with a
    deletion argument) — confirm before porting.
    """

    def __init__(self, output_file, sample_id, refseq_filter=None, min_mapping_score=None,\
        min_alignment_score=None, max_mismatches=None,\
        max_relative_mismatches=None, min_continiously_matching=None,\
        filter_complexity=False):
        # output_file: path of the bzipped HIT file to create.
        # sample_id: sample identifier; all characters outside [A-Za-z0-9_-]
        #   are stripped via the NON_ID deletion table.
        # refseq_filter: if set, only read groups with at least one hit in
        #   one of these reference sequence groups are written.
        # min_mapping_score / min_alignment_score: per-hit lower bounds.
        # max_mismatches / max_relative_mismatches: per-hit upper bounds
        #   (absolute count / count divided by read length).
        # min_continiously_matching: minimum longest continuous match run.
        # filter_complexity: drop low-complexity reads (zlib ratio < 0.5).

        # Large buffer keeps bz2 compression efficient for streamed writes.
        self.output_file = bz2.BZ2File(output_file, 'wb', buffering=100 * 1024 * 1024)
        self.sample_id = sample_id.translate(None, NON_ID)
        self.refseq_filter = refseq_filter
        self.max_mismatches = max_mismatches
        self.max_relative_mismatches = max_relative_mismatches
        # Accumulator for the read currently streaming by:
        # [read_name, seq, [hit, hit, ...]]
        self.current_group = []
        self.min_mapping_score = min_mapping_score
        self.min_alignment_score = min_alignment_score
        self.min_continiously_matching = min_continiously_matching
        self.filter_complexity = filter_complexity

        # Precompiled CIGAR helpers (not used by the methods below).
        self.re_matches = re.compile(r'(\d+)M')
        self.re_dels = re.compile(r'(\d+)D')

    def count(self, parsed_line):
        # Consume one parsed SAM line (the tuple produced by SAMParser.parse);
        # accumulate the hit into the current read group and flush the
        # previous group whenever a new read name begins.

        if parsed_line is None:
            return

        read_key, read_name, flag, ref_name, ref_position, mapping_score,\
            cigar, mate_ref_name, mate_ref_position, insert_size, seq, qual,\
            is_end1, is_end2, number_mismatches, alignment_score,\
            number_hits, is_reverse, is_primary, is_mapped, is_mate_mapped,\
            is_paired, number_matches, read_end_pos, max_match = parsed_line

        if not is_mapped:
            return

        if self.filter_complexity:

            # Low-complexity filter: repetitive sequences compress well, so
            # reads whose zlib-compressed size is less than half the raw
            # length are discarded.
            avg_compression = float(len(zlib.compress(seq)))/len(seq)

            if avg_compression < 0.5:
                return

        # '/1' or '/2' suffix preserves the mate end in the stored read name.
        pair_id = ''
        if is_end1:
            pair_id = '/1'

        elif is_end2:
            pair_id = '/2'

        read_name = self.sample_id + ';' + read_name + pair_id

        # Initialize new current group
        if len(self.current_group) == 0:
            self.current_group = [read_name, seq, []]

        # Write old current group to file
        if read_name != self.current_group[0]:
            self._write_group()
            self.current_group = [read_name, seq, []]

        # Reference names are expected to encode at least
        # 'refseq_group;family;organism;identifier'.
        try:
            refseq_group, family, organism, identifier = ref_name.split(';')[:4]
        except ValueError:
            sys.stderr.write('Read mapped to malformed reference sequence %s, skipping\n' % ref_name)
            return

        # Per-hit quality filters; each silently drops the hit when violated.
        if self.min_continiously_matching:

            if self.min_continiously_matching > max_match:
                return

        if self.max_mismatches\
            and int(number_mismatches) > self.max_mismatches:
            return

        if self.max_relative_mismatches\
            and int(number_mismatches) / float(len(seq))\
            > self.max_relative_mismatches:
            return

        if self.min_mapping_score\
            and self.min_mapping_score > mapping_score:
            return

        if self.min_alignment_score\
            and self.min_alignment_score > alignment_score:
            return

        # Convert the parser's start coordinate to 1-based output
        # (presumably HTSeq supplies 0-based starts — TODO confirm).
        start = int(ref_position) + 1

        self.current_group[2].append([refseq_group, family, organism, identifier, str(start), str(read_end_pos)])

    def _write_group(self):
        # Flush the accumulated read group as a single FASTA record. When a
        # refseq filter is configured, the group is only written if at least
        # one of its hits falls into a filtered reference sequence group.
        passed = True

        if self.refseq_filter:
            passed = False
            for refseq_group, family, organism, identifier, start, end in self.current_group[2]:
                if passed:
                    break
                for f in self.refseq_filter:
                    if refseq_group == f:
                        passed = True
                        break
        if passed:
            # Description: all hits of this read, ';'-joined fields,
            # '|'-separated entries.
            description = []
            for identifier in self.current_group[2]:
                description.append(';'.join(identifier))
            description = '|'.join(description)

            record = SeqRecord(Seq(self.current_group[1]))
            record.id = 'Read;' + self.current_group[0]
            record.description = description

            SeqIO.write([record], self.output_file, "fasta")

    def write(self):
        # Flush the final pending read group and close the output stream.
        # NOTE(review): if no mapped read was ever counted, current_group is
        # an empty list and _write_group raises IndexError on
        # current_group[1] — confirm callers always feed at least one read.

        self._write_group()
        self.output_file.close()
381 | |
class SAMParser:
    """Parses raw SAM lines into a flat tuple of alignment attributes (via HTSeq)."""

    def parse(self, line):
        """Parse one SAM line.

        Returns None for header lines (starting with '@'); otherwise a
        25-tuple:
        (read_key, read_name, flag, ref_name, ref_position, mapping_score,
         cigar, mate_ref_name, mate_ref_position, insert_size, seq, qual,
         is_end1, is_end2, number_mismatches, alignment_score, number_hits,
         is_reverse, is_primary, is_mapped, is_mate_mapped, is_paired,
         number_matches, read_end_pos, max_match)

        BUGFIXES relative to the previous version:
          - The SAM optional tags NM and NH were swapped: NM (edit distance,
            i.e. mismatches) was stored as number_hits and NH (number of
            reported alignments, i.e. hits) as number_mismatches. They are
            now assigned per the SAM specification.
          - alignment.aQual (the SAM MAPQ mapping quality) was stored in
            alignment_score, leaving mapping_score permanently 0 and
            disabling min_mapping_score filtering downstream; it now feeds
            mapping_score, while alignment_score comes from the 'AS' tag.
        """

        if line[0] == '@':
            return None

        alignment = HTSeq._HTSeq.SAM_Alignment.from_SAM_line(line)
        read_name = alignment.read.name
        seq = alignment.read.seq
        qual = alignment.read.qual
        flag = alignment.flag
        cigar = None

        # Decode the SAM FLAG bitfield.
        is_paired = (flag & 1)
        is_mapped = not (flag & 4)
        is_mate_mapped = alignment.mate_aligned is not None  # not (flag & 8)
        is_reverse = (flag & 16)
        is_end1 = (flag & 64)
        is_end2 = (flag & 128)
        is_primary = not (flag & 256)

        # One key per sequenced segment (read name + mate end).
        read_key = (read_name, is_end1)

        ref_name = None
        ref_position = None
        mapping_score = 0
        mate_ref_name = None
        mate_ref_position = None
        insert_size = None
        alignment_score = 0
        read_end_pos = None

        if is_mate_mapped and alignment.mate_start:
            mate_ref_name = alignment.mate_start.chrom
            mate_ref_position = alignment.mate_start.start

        number_hits = 0
        number_mismatches = 0

        number_matches = 0
        max_match = 0

        if is_mapped:

            ref_name = alignment.iv.chrom
            ref_position = alignment.iv.start
            read_end_pos = alignment.iv.end
            # aQual is the MAPQ mapping quality (see HTSeq SAM_Alignment).
            mapping_score = alignment.aQual
            cigar = alignment.cigar

            if is_mate_mapped:
                insert_size = alignment.inferred_insert_size

            # Total matched bases and the longest continuous match run.
            for c in cigar:
                if c.type == 'M':
                    number_matches += c.size
                    max_match = max(max_match, c.size)

            # Optional SAM tags: NM = edit distance (mismatches),
            # AS = alignment score, NH = number of reported alignments.
            for tag, value in alignment.optional_fields:
                if tag == 'NM':
                    number_mismatches = value
                elif tag == 'AS':
                    alignment_score = value
                elif tag == 'NH':
                    number_hits = value

        return read_key, read_name, flag, ref_name, ref_position, mapping_score,\
            cigar, mate_ref_name, mate_ref_position, insert_size, seq, qual,\
            is_end1, is_end2, number_mismatches, alignment_score,\
            number_hits, is_reverse, is_primary, is_mapped, is_mate_mapped,\
            is_paired, number_matches, read_end_pos, max_match
455 | |
456 | |
class SAMQuality:
    """Accumulates mapping- and base-quality metrics from a parsed SAM stream
    and writes a tab-separated report.

    'segment' metrics are counted once per SAM record; 'read' metrics are
    counted once per template (first segment only). Also tallies the top
    reference sequences and, when the khmer package is available, the most
    frequent 10-mers.

    NOTE(review): relies on Python 2 (sys.maxint, string.maketrans,
    dict.iterkeys/iteritems) — confirm before porting.
    """

    def __init__(self, file_path):
        # file_path: destination of the metrics report written by write().

        self.file_path = file_path

        # (item, metric) -> Counter mapping observed value -> occurrences.
        self.stored = defaultdict(Counter)
        # Alignment tallies per reference: all alignments vs. primary only.
        self.all_references = defaultdict(int)
        self.primary_references = defaultdict(int)
        # Translation table for reverse complements (Python 2 maketrans).
        self.complement = string.maketrans('ATCGN', 'TAGCN')

        # Optional 10-mer counting when khmer is installed.
        if KHMER_AVAILABLE:
            self.ktable = khmer.new_ktable(10)

    def _get_complement(self, sequence):
        # Reverse complement of a nucleotide string.

        return sequence.translate(self.complement)[::-1]

    def _get_summary(self, counter):
        """Returns seven numbers (count, sum, min, max, mode, mean, and std)
        for a max_frequency counter """

        maximum = 0
        minimum = sys.maxint
        thesum = 0
        allcount = 0
        mode = [0, None]

        items = 0.0
        mean = 0.0
        m2 = 0.0
        variance = 0.0

        for item in counter:

            count = counter[item]
            if count > mode[0]:
                mode = [count, item]

            allcount += count
            maximum = max(maximum, item)
            minimum = min(minimum, item)
            thesum += (count * item)

            # Welford-style online mean/variance update, applied once per
            # occurrence of this item value.
            x = 1
            while x <= count:
                items += 1
                delta = item - mean
                mean = mean + delta / items
                m2 = m2 + delta * (item - mean)
                variance = m2 / items
                x += 1

        std = math.sqrt(variance)

        return allcount, thesum, minimum, maximum, mode[1], mean, std


    def _to_unit(self, item, is_percentage=False):
        """ Convert a numeric to a string with metric units """

        if is_percentage:
            return ('%-.3f' % (item * 100)) + '%'
        converted = None
        try:
            item = float(item)
            if item > 10**12:
                # NOTE(review): the 'P' branch divides by 10**9; this looks
                # like it should be 10**12 — confirm intended output.
                converted = str(round(item / 10**9,3))+'P'
            elif item > 10**9:
                converted = str(round(item / 10**9,3))+'G'
            elif item > 10**6:
                converted = str(round(item / 10**6,3))+'M'
            elif item > 10**3:
                converted = str(round(item / 10**3,3))+'K'
            else:
                converted = str(round(item,3))
        except:
            # Non-numeric items (reference names, kmers, ...) pass through.
            converted = str(item)

        return converted

    def _str_metrics(self, data):
        # Render every stored metric: numeric counters get the statistical
        # summary from _get_summary; non-numeric counters (detected by the
        # first key lacking a 'real' attribute) are listed per element.

        str_metrics = []

        for (item, metric) in sorted(data.keys()):
            counter = data[(item, metric)]
            if not hasattr(counter.iterkeys().next(), 'real'):
                for element, count in counter.most_common():
                    str_metrics.append(self._str_metric(item, metric, element, count, no_units=True))
            else:
                summary = self._get_summary(counter)
                str_metrics.append(self._str_metric(item, metric, *summary))

        return str_metrics

    def _str_metric(self, item, metric, count, thesum='', minimum='',\
        maximum='', mode='', mean='', std='', no_units=False):
        # Format one report row: 9 tab-separated, width-padded columns.

        counters = [count, thesum, minimum, maximum, mode, mean, std]
        counters = map(str, counters)

        if no_units:
            items = [item, metric] + counters
        else:
            units = map(self._to_unit, counters)
            items = [item, metric] + units

        return '%-15s\t%-60s\t%12s\t%12s\t%12s\t%12s\t%12s\t%12s\t%12s\n' \
            % tuple(items)


    def _count_read(self, metric, data, sample):
        # Template-level counters: mapping multiplicity and insert size.

        item = 'read'

        (insert_size, alignment_score, mapping_score, length,\
            q20_length, avg_phred_quality, number_hits, is_reverse) = data

        self.stored[(item, metric + ' mappings')][number_hits] += sample
        self.stored[(item, metric + ' insert')][insert_size] += sample


    def _count_segment(self, metric, data, sample):
        # Segment-level counters: scores, lengths and base-quality summaries.

        item = 'segment'

        (insert_size, alignment_score, mapping_score, length,\
            q20_length, avg_phred_quality, number_hits, is_reverse) = data

        self.stored[(item, metric + ' algq')][alignment_score] += sample
        self.stored[(item, metric + ' mapq')][mapping_score] += sample
        self.stored[(item, metric + ' length')][length] += sample
        self.stored[(item, metric + ' q20length')][q20_length] += sample
        self.stored[(item, metric + ' meanbasequal')][avg_phred_quality] += sample
        self.stored[(item, metric + ' reverse')][is_reverse] += sample

    def count(self, parsed_line):
        # Consume one parsed SAM line (tuple from SAMParser.parse) and update
        # every applicable metric counter.

        if parsed_line is None:
            return

        read_key, read_name, flag, ref_name, ref_position, mapping_score,\
            cigar, mate_ref_name, mate_ref_position, insert_size, seq, qual,\
            is_end1, is_end2, number_mismatches, alignment_score,\
            number_hits, is_reverse, is_primary, is_mapped, is_mate_mapped,\
            is_paired, number_matches, read_end_pos, max_match = parsed_line

        # Phred scores from raw quality values (assumes +33 Sanger offset —
        # TODO confirm the parser supplies raw byte values).
        phred_quality = [q - 33 for q in qual]
        avg_phred_quality = sum(phred_quality) / float(len(phred_quality))
        length = len(seq)
        mate_reference_id = mate_ref_name
        reference_id = ref_name
        reference = reference_id is not None and reference_id != '*'
        insert_size = insert_size and abs(insert_size) or insert_size
        # Segment 1 stands in for the whole template in 'read' metrics.
        is_segment1 = not is_paired or (is_paired and is_end1)
        is_reverse = is_reverse
        is_unique = is_primary and number_hits == 1
        is_translocation = is_paired and is_mapped and is_mate_mapped\
            and (mate_reference_id != '=' and reference_id != mate_reference_id)
        is_part_of_doublemap = is_paired and is_mapped and is_mate_mapped
        is_part_of_halfmap = is_paired and (is_mapped != is_mate_mapped)
        is_part_of_nomap = is_paired and not is_mapped and not is_mate_mapped


        # Count length until first low quality base call
        q20_length = 0
        for q in phred_quality:
            if q < 20:
                break
            q20_length += 1

        # Count kmers (consumed in forward-strand orientation)
        if KHMER_AVAILABLE:
            if not is_reverse:
                self.ktable.consume(seq)
            else:
                self.ktable.consume(self._get_complement(seq))

        if reference:

            self.all_references[reference_id] += 1

            if is_primary:
                self.primary_references[reference_id] += 1


        data = (insert_size, alignment_score, mapping_score, length,\
            q20_length, avg_phred_quality, number_hits, is_reverse)

        sample = 1

        self._count_segment('sequenced', data, sample)
        if is_mapped:
            self._count_segment('sequenced mapped multi', data, sample)
            if is_primary:
                self._count_segment('sequenced mapped primary', data, sample)
                if number_hits and is_unique:
                    self._count_segment('sequenced mapped primary unique', data, sample)

            if is_segment1:
                self._count_read('sequenced mapped multi', data, sample)
                if is_primary:
                    self._count_read('sequenced mapped primary', data, sample)

        if is_paired:
            self._count_segment('sequenced paired', data, sample)

            if is_part_of_doublemap:
                self._count_segment('sequenced paired doublemap', data, sample)

                if is_primary:
                    self._count_segment('sequenced paired doublemap primary', data, sample)

                if is_segment1:
                    self._count_read('sequenced paired doublemap multi', data, sample)

                    if is_primary:
                        self._count_read('sequenced paired doublemap primary', data, sample)

                        if number_hits and is_unique:
                            self._count_read('sequenced paired doublemap primary unique', data, sample)

                            if is_translocation:
                                self._count_read('sequenced paired doublemap primary unique translocation', data, sample)

            elif is_part_of_halfmap:

                self._count_segment('sequenced paired halfmap', data, sample)

                # The mapped segment
                if is_mapped:

                    self._count_segment('sequenced paired halfmap mapped', data, sample)

                    if is_primary:

                        self._count_read('sequenced paired halfmap mapped primary', data, sample)

                        if number_hits and is_unique:
                            self._count_read('sequenced paired halfmap mapped primary unique', data, sample)

                    elif not is_primary:

                        # NOTE(review): 'unpaired' label inside a paired
                        # branch — confirm intended metric name.
                        self._count_read('sequenced unpaired mapped multi', data, sample)

                # The unmapped segment
                elif not is_mapped:
                    self._count_segment('sequenced paired halfmap unmapped', data, sample)

            elif is_part_of_nomap:

                self._count_segment('sequenced paired nomap', data, sample)

                if is_segment1:

                    self._count_read('sequenced paired nomap', data, sample)

        elif not is_paired:

            self._count_segment('sequenced unpaired', data, sample)

            if is_mapped:

                self._count_segment('sequenced unpaired mapped', data, sample)

                if is_primary:

                    self._count_read('sequenced unpaired mapped primary', data, sample)

                    if number_hits and is_unique:

                        # NOTE(review): label mixes 'paired unpaired' — looks
                        # like a typo for 'sequenced unpaired mapped primary
                        # unique'; confirm before changing report output.
                        self._count_read('sequenced paired unpaired mapped primary unique', data, sample)

                elif not is_primary:

                    self._count_read('sequenced unpaired mapped multi', data, sample)


            elif not is_mapped:

                self._count_segment('sequenced unpaired unmapped', data, sample)

                if is_segment1:
                    self._count_read('sequenced unpaired unmapped', data, sample)

    def write(self):
        # Write the final report: top-30 reference tallies (all and primary),
        # top-10 kmers when khmer is available, and all stored counters.

        with open(self.file_path, 'w') as output_file:

            all_references = sorted([(count, reference) for reference, count\
                in self.all_references.iteritems()], reverse=True)

            for j, (count, reference) in enumerate(all_references[:30]):
                self.stored[('segment', 'multireference_' + str(j+1))][reference] = count

            primary_references = sorted([(count, reference) for reference, count\
                in self.primary_references.iteritems()], reverse=True)

            for j, (count, reference) in enumerate(primary_references[:30]):
                self.stored[('segment', 'primaryreference_' + str(j+1))][reference] = count

            # Extract top-ranking kmers
            if KHMER_AVAILABLE:
                kmer_frequencies = []
                for i in range(0, self.ktable.n_entries()):
                    n = self.ktable.get(i)
                    if n > 0:
                        kmer_frequencies.append((n, self.ktable.reverse_hash(i)))
                kmer_frequencies = sorted(kmer_frequencies, reverse=True)
                for j, (frequency, kmer) in enumerate(kmer_frequencies[:10]):
                    self.stored[('segment', 'kmer_' + str(j+1))][kmer] = frequency

            output_file.writelines(self._str_metrics(self.stored))
773 | |
774 | |
class SAMTaxonomy:
    """ Provides taxonomic summary information from a SAM file stream. """

    def __init__(self, file_path):
        # file_path: destination of the tab-separated taxonomy summary.

        self.file_path = file_path

        # Primary alignment count per organism.
        self.count_primaries = Counter()
        # organism -> [refseq_group, family, set(identifiers), then five
        # [sum, n] running-average pairs (mapping score, read length, hits,
        # alignment score, mismatches), then human_prim, human_sec counts].
        self.detailed_information = {}

        # State for attributing human co-hits of the read currently
        # streaming by (flushed when the read key changes).
        self._last_read = (None, None)
        self._last_read_human_prim = 0
        self._last_read_human_sec = 0
        self._last_organisms = set()

    def count(self, parsed_line):
        # Consume one parsed SAM line (tuple from SAMParser.parse).

        if parsed_line is None:
            return

        read_key, read_name, flag, ref_name, ref_position, mapping_score,\
            cigar, mate_ref_name, mate_ref_position, insert_size, seq, qual,\
            is_end1, is_end2, number_mismatches, alignment_score,\
            number_hits, is_reverse, is_primary, is_mapped, is_mate_mapped,\
            is_paired, number_matches, read_end_pos, max_match = parsed_line

        if is_mapped:

            # Reference names encode 'refseq_group;family;organism;gi;...'.
            refseq_group, family, organism, gi = ref_name.split(';')[:4]

            if is_primary:
                self.count_primaries[organism] += 1

            if organism not in self.detailed_information:
                # refseq_group, family, gis, avg_mapping_score,
                # avg_seq_length, avg_number_hits, avg_alignment_score, avg_nr_mismatches
                initial = [refseq_group,
                           family,
                           set([gi]),
                           [int(mapping_score), 1],
                           [len(seq), 1],
                           [0, 0],
                           [alignment_score, 1],
                           [number_mismatches, 1],
                           0,
                           0]

                self.detailed_information[organism] = initial

            else:
                # Fold this alignment into the running [sum, n] averages.
                entry = self.detailed_information[organism]
                entry[2].add(gi)
                entry[3][0] += int(mapping_score)
                entry[3][1] += 1
                entry[4][0] += len(seq)
                entry[4][1] += 1
                entry[6][0] += alignment_score
                entry[6][1] += 1
                entry[7][0] += number_mismatches
                entry[7][1] += 1

            # Hit multiplicity is averaged over primary alignments only.
            if is_primary:
                entry = self.detailed_information[organism]
                entry[5][0] += number_hits
                entry[5][1] += 1

            if self._last_read == (None, None):
                self._last_read = read_key

            # Read change: flush the human-hit counters of the previous read
            # to every organism that read mapped to, then reset the state.
            if self._last_read != read_key:

                for last_organism in self._last_organisms:

                    self.detailed_information[last_organism][8]\
                        += self._last_read_human_prim

                    self.detailed_information[last_organism][9]\
                        += self._last_read_human_sec

                self._last_read = read_key
                self._last_organisms = set()
                self._last_read_human_prim = 0
                self._last_read_human_sec = 0

            self._last_organisms.add(organism)

            if organism == 'Homo_sapiens':
                if is_primary:
                    self._last_read_human_prim += 1
                else:
                    self._last_read_human_sec += 1

    def get_summary(self, top=100):
        # Build report lines for the `top` organisms with the most primary
        # alignments: per-organism averages plus human co-hit counts.

        lines = []

        lines.append('%10s\t%20s\t%20s\t%-20s\t%10s\t%10s\t%10s\t%5s\t%5s\t%5s\t%10s\t%10s\n'\
            % ('Count', 'Group', 'Family', 'Organism', 'Targets', 'ReadLen', 'Hits', 'Map', 'Algn', 'Mism', 'HuP', 'HuS'))

        top_organisms = self.count_primaries.most_common(top)

        for organism, count in top_organisms:

            refseq_group, family, identifiers,\
                avg_mapping_score, avg_seq_length, avg_number_hits,\
                avg_alignment_score, avg_nr_mismatches, human_prim, human_sec\
                = self.detailed_information[organism]

            # Each avg_* entry is a [sum, n] pair; reduce to integer means.
            avg_len = int(avg_seq_length[0] / float(avg_seq_length[1]))
            if avg_number_hits[1] == 0:
                avg_hits = 0
            else:
                avg_hits = int(avg_number_hits[
                    0] / float(avg_number_hits[1]))

            avg_mapping_score = int(avg_mapping_score[
                0] / float(avg_mapping_score[1]))

            avg_alignment_score = int(avg_alignment_score[
                0] / float(avg_alignment_score[1]))

            avg_nr_mismatches = int(avg_nr_mismatches[
                0] / float(avg_nr_mismatches[1]))

            nr_ids = len(identifiers)

            # Abbreviate large counts with an 'M' (million) suffix.
            if count > 10**6:
                count = str(round(count / float(10**6), 3)) + 'M'
            if human_prim > 10**6:
                human_prim = str(round(human_prim / float(10**6), 3)) + 'M'
            if human_sec > 10**6:
                human_sec = str(round(human_sec / float(10**6), 3)) + 'M'
            if nr_ids > 10**6:
                nr_ids = str(round(nr_ids / float(10**6), 3)) + 'M'

            lines.append('%10s\t%20s\t%20s\t%-20s\t%10s\t%10i\t%10i\t%5i\t%5i\t%5i\t%10s\t%10s\n'\
                % (str(count), refseq_group[:20], family[:20], organism[:20],\
                str(nr_ids), avg_len, avg_hits, avg_mapping_score,\
                avg_alignment_score, avg_nr_mismatches, str(human_prim),\
                str(human_sec)))

        return lines

    def write(self):
        # Write the taxonomy summary to file_path.
        # NOTE(review): the human-hit counters of the very last read are
        # never flushed into detailed_information (flushing only happens on
        # read change inside count()) — confirm whether the final read's HuP
        # and HuS counts may be dropped.

        with open(self.file_path, 'w') as output_file:
            output_file.writelines(self.get_summary())
922 | |
923 @CLI.subcommand("rnamap") | |
924 class RNAmap(cli.Application): | |
925 """ Map input reads against a STAR index """ | |
926 | |
927 index_dir = cli.SwitchAttr(['-i', '--index_dir'], str, mandatory=True, | |
928 help="Sets the index output directory") | |
929 | |
930 threads = cli.SwitchAttr( | |
931 ['-t', '--threads'], cli.Range(1, 512), mandatory=False, | |
932 help="Sets the number of threads to use", | |
933 default=1) | |
934 | |
935 taxonomy = cli.SwitchAttr( | |
936 ['-x', '--taxonomy'], str, mandatory=False, | |
937 help="Output path for the taxonomy file; setting this option will also enable regular taxonomy output to stdout during mapping", | |
938 default='') | |
939 | |
940 star_path = cli.SwitchAttr(['--star_path'], str, mandatory=False, | |
941 help="Path to STAR executable", | |
942 default='') | |
943 | |
944 samtools_path = cli.SwitchAttr(['--samtools_path'], str, mandatory=False, | |
945 help="Path to samtools executable", | |
946 default='') | |
947 | |
948 temp_path = cli.SwitchAttr(['--temporary_path'], str, mandatory=False, | |
949 help="Path to temporary directory in which to generate temp files. All temp files with be automatically deleted after execution is complete.", | |
950 default='') | |
951 | |
952 min_mapping_score = cli.SwitchAttr(['--min_mapping_score'], cli.Range(1, 255), mandatory=False, | |
953 help="Mimimum mapping score for saved hits (only applied to -v/--virana_hits)", | |
954 default=None) | |
955 | |
956 min_alignment_score = cli.SwitchAttr(['--min_alignment_score'], cli.Range(1, 255), mandatory=False, | |
957 help="Mimimum alignment score for saved hits (only applied to -v/--virana_hits)", | |
958 default=None) | |
959 | |
960 max_mismatches = cli.SwitchAttr(['--max_mismatches'], cli.Range(0, 10000000), mandatory=False, | |
961 help="Maximum number of mismatches for saved hits (only applied to -v/--virana_hits)", | |
962 default=None) | |
963 | |
964 max_relative_mismatches = cli.SwitchAttr(['--max_relative_mismatches'], float, mandatory=False, | |
965 help="Maximum number of mismatches relative to read length for saved hits (only applied to -v/--virana_hits)", | |
966 default=None) | |
967 | |
968 min_continiously_matching = cli.SwitchAttr(['--min_continiously_matching'], cli.Range(0, 10000000), mandatory=False, | |
969 help="Minimum number of continious matches for saved hits (only applied to -v/--virana_hits)", | |
970 default=None) | |
971 | |
972 filter_complexity = cli.Flag(['--filter_complexity'], | |
973 help="Discard low-complexity reads (only applied to -v/--virana_hits). Adds some extra processing load to the mapping and may discard important information. Applies to all output files, including quality files (!)", | |
974 default=False) | |
975 | |
976 bam = cli.SwitchAttr(['-b', '--bam'], str, mandatory=False, | |
977 help="Path to unsorted, unindexed output BAM file", | |
978 default='') | |
979 | |
980 sam = cli.SwitchAttr(['-s', '--sam'], str, mandatory=False, | |
981 help="Path to output SAM file", | |
982 default='') | |
983 | |
984 qual = cli.SwitchAttr(['-q', '--qual'], str, mandatory=False, | |
985 help="Path to output quality file", | |
986 default='') | |
987 | |
988 hits = cli.SwitchAttr(['-v', '--virana_hits'], str, mandatory=False, | |
989 help="Path to bzip2-compressed tab-delimited output hit file", | |
990 default='') | |
991 | |
992 sample_id = cli.SwitchAttr(['--sample_id'], str, mandatory=False, | |
993 help="Alphanumeric string ([0-9a-zA-Z_-]*) used to designate sample information within the hit file", | |
994 default='no_sample_id') | |
995 | |
996 unmapped1 = cli.SwitchAttr(['--unmapped_end_1'], str, mandatory=False, | |
997 help="Output path to uncompressed fastq file containing unmapped reads, first ends only for paired ends.", | |
998 default='') | |
999 | |
1000 unmapped2 = cli.SwitchAttr(['--unmapped_end_2'], str, mandatory=False, | |
1001 help="Output path to uncompressed fastq file containing unmapped reads, second ends only for paired ends.", | |
1002 default='') | |
1003 | |
1004 splice_junctions = cli.SwitchAttr(['--splice_junctions'], str, mandatory=False, | |
1005 help="Input path to splice junction file (currently not implemented)", | |
1006 default='') | |
1007 | |
1008 chimeric_mappings = cli.SwitchAttr(['--chimeric_mappings'], str, mandatory=False, | |
1009 help="Ouput path to SAM file containing chimeric mappings", | |
1010 default='') | |
1011 | |
1012 hit_filter = cli.SwitchAttr( | |
1013 ['-f', '--virana_hit_filter'], str, list=True, mandatory=False, | |
1014 help="Only generate hit groups that include at last one read mapping to a reference of this reference group.", | |
1015 default=[]) | |
1016 | |
1017 debug = cli.Flag(["-d", "--debug"], help="Enable debug information") | |
1018 | |
1019 reads = cli.SwitchAttr( | |
1020 ['-r', '--reads'], str, list=True, mandatory=True, | |
1021 help="Sets the input reads. Add this parameter twice for paired end reads.") | |
1022 | |
1023 zipped = cli.Flag(["-z", "--zipped"], help="Input reads are zipped") | |
1024 | |
1025 sensitive = cli.Flag( | |
1026 ["--sensitive"], help="If given, mapping will process slower and more sensitive") | |
1027 | |
1028 def main(self): | |
1029 | |
1030 if self.debug: | |
1031 logging.getLogger().setLevel(logging.DEBUG) | |
1032 | |
1033 # Obtain star executable | |
1034 star = [self.star_path and self.star_path or 'STAR'] | |
1035 samtools = [self.samtools_path and self.samtools_path or 'samtools'] | |
1036 | |
1037 # Check if genome directory is existing | |
1038 if not os.path.exists(self.index_dir): | |
1039 sys.stdout.write('Index directory %s not existing, exiting' % self.genome_dir) | |
1040 sys.exit(1) | |
1041 | |
1042 if self.temp_path: | |
1043 temp_path = tempfile.mkdtemp(dir=self.temp_path) | |
1044 else: | |
1045 temp_path = tempfile.mkdtemp() | |
1046 | |
1047 first_ends = [] | |
1048 second_ends = [] | |
1049 single_ends = [] | |
1050 | |
1051 if len(self.reads) == 2: | |
1052 first, second = self.reads | |
1053 first_ends.append(first) | |
1054 second_ends.append(second) | |
1055 | |
1056 elif len(self.reads) == 1: | |
1057 single_ends.append(self.reads[0]) | |
1058 | |
1059 else: | |
1060 sys.stdout.write('Invalid number of fastq files; provide either one (single end) or two (paired end)') | |
1061 sys.exit(1) | |
1062 | |
1063 if single_ends and not first_ends and not second_ends: | |
1064 reads = [','.join(single_ends)] | |
1065 | |
1066 elif first_ends and second_ends: | |
1067 reads = [','.join(first_ends), ','.join(second_ends)] | |
1068 | |
1069 | |
1070 star_cline = star + ['--runMode', 'alignReads', | |
1071 '--genomeDir', self.index_dir, | |
1072 '--runThreadN', str(self.threads), | |
1073 '--readMatesLengthsIn', 'NotEqual', | |
1074 '--outFileNamePrefix', os.path.join( | |
1075 temp_path, 'out'), | |
1076 '--outSAMmode', 'Full', | |
1077 '--outSAMstrandField', 'None', | |
1078 '--outSAMattributes', 'Standard', | |
1079 '--outSAMunmapped', 'Within', | |
1080 '--outStd', 'SAM', | |
1081 '--outFilterMultimapNmax', '1000', | |
1082 '--outSAMprimaryFlag', 'AllBestScore', | |
1083 '--outSAMorder', 'PairedKeepInputOrder'] | |
1084 | |
1085 if self.unmapped1 or self.unmapped2: | |
1086 star_cline += ['--outReadsUnmapped', 'Fastx'] | |
1087 else: | |
1088 star_cline += ['--outReadsUnmapped', 'None'] | |
1089 | |
1090 | |
1091 if self.zipped: | |
1092 star_cline += ['--readFilesCommand', 'zcat'] | |
1093 | |
1094 if self.sensitive: | |
1095 star_cline += ['--outFilterMultimapScoreRange', '10', | |
1096 '--outFilterMismatchNmax', '60', | |
1097 '--outFilterMismatchNoverLmax', '0.3', | |
1098 '--outFilterScoreMin', '0', | |
1099 '--outFilterScoreMinOverLread', '0.3', | |
1100 '--outFilterMatchNmin', '0', | |
1101 '--outFilterMatchNminOverLread', '0.66', | |
1102 '--seedSearchStartLmax', '12', | |
1103 '--winAnchorMultimapNmax', '50'] | |
1104 | |
1105 star_cline += ['--readFilesIn'] + reads | |
1106 | |
1107 if self.debug: | |
1108 print ' '.join(star_cline) | |
1109 | |
1110 # Try if we can make the relevant files | |
1111 touch_files = [self.unmapped1, self.unmapped2, self.taxonomy, self.qual, self.hits, self.sam, self.bam] | |
1112 for file_path in touch_files: | |
1113 if file_path is None or file_path == '': | |
1114 continue | |
1115 try: | |
1116 with file(file_path, 'a'): | |
1117 os.utime(file_path, None) | |
1118 except IOError: | |
1119 sys.stderr.write('Could not write output file %s\n' % file_path) | |
1120 sys.exit(1) | |
1121 | |
1122 star_process = subprocess.Popen(' '.join( | |
1123 star_cline), shell=True, stdout=PIPE) | |
1124 | |
1125 parser = SAMParser() | |
1126 | |
1127 if self.taxonomy: | |
1128 taxonomy = SAMTaxonomy(self.taxonomy) | |
1129 | |
1130 if self.qual: | |
1131 quality = SAMQuality(self.qual) | |
1132 | |
1133 if self.hits: | |
1134 hits = SAMHits(self.hits, self.sample_id, self.hit_filter, | |
1135 self.min_mapping_score, | |
1136 self.min_alignment_score, | |
1137 self.max_mismatches, | |
1138 self.max_relative_mismatches, | |
1139 self.min_continiously_matching, | |
1140 self.filter_complexity) | |
1141 | |
1142 if self.sam: | |
1143 sam_file = open(self.sam, 'w') | |
1144 | |
1145 if self.bam: | |
1146 with open(self.bam, 'wb', buffering=100 * 1024 * 1024) as bam_file: | |
1147 samtools_cline = samtools + [ | |
1148 'view', '-b', '-1', '-S', '-@', '4', '/dev/stdin'] | |
1149 if self.debug: | |
1150 print ' '.join(samtools_cline) | |
1151 samtools_process = subprocess.Popen(' '.join(samtools_cline), shell=True, stdout=bam_file, stdin=PIPE) | |
1152 | |
1153 | |
1154 do_sam = self.sam | |
1155 do_bam = self.bam | |
1156 do_taxonomy = self.taxonomy | |
1157 do_qual = self.qual | |
1158 do_hits = self.hits | |
1159 do_parse = do_taxonomy or do_qual or do_hits | |
1160 | |
1161 for i, line in enumerate(iter(star_process.stdout.readline, '')): | |
1162 | |
1163 if do_sam: | |
1164 sam_file.write(line) | |
1165 | |
1166 if do_bam: | |
1167 samtools_process.stdin.write(line) | |
1168 | |
1169 if line[0] == '@': | |
1170 continue | |
1171 | |
1172 if do_parse: | |
1173 parsed_line = parser.parse(line) | |
1174 | |
1175 if do_taxonomy: | |
1176 taxonomy.count(parsed_line) | |
1177 if i > 0 and (i % 50000) == 0: | |
1178 print ''.join(taxonomy.get_summary(10)) | |
1179 | |
1180 if do_qual: | |
1181 quality.count(parsed_line) | |
1182 | |
1183 if do_hits: | |
1184 hits.count(parsed_line) | |
1185 | |
1186 if do_bam: | |
1187 samtools_process.stdin.close() | |
1188 | |
1189 if do_sam: | |
1190 sam_file.close() | |
1191 | |
1192 if do_hits: | |
1193 hits.write() | |
1194 | |
1195 if do_taxonomy: | |
1196 print ''.join(taxonomy.get_summary(10)) | |
1197 taxonomy.write() | |
1198 | |
1199 if do_qual: | |
1200 quality.write() | |
1201 | |
1202 try: | |
1203 if self.unmapped1: | |
1204 shutil.move(os.path.join(temp_path, 'out' + 'Unmapped.out.mate1'),\ | |
1205 self.unmapped1) | |
1206 except IOError: | |
1207 pass | |
1208 | |
1209 try: | |
1210 if self.unmapped2: | |
1211 shutil.move(os.path.join(temp_path, 'out' + 'Unmapped.out.mate2'),\ | |
1212 self.unmapped2) | |
1213 except IOError: | |
1214 pass | |
1215 | |
1216 try: | |
1217 if self.chimeric_mappings: | |
1218 shutil.move(os.path.join(temp_path, 'out' + 'Chimeric.out.sam'),\ | |
1219 self.chimeric_mappings) | |
1220 except IOError: | |
1221 pass | |
1222 | |
1223 shutil.rmtree(temp_path) | |
1224 | |
1225 @CLI.subcommand("dnamap") | |
1226 class DNAmap(cli.Application): | |
1227 """ Map input reads against a BWA index """ | |
1228 | |
1229 index_dir = cli.SwitchAttr(['-i', '--index_dir'], str, mandatory=True, | |
1230 help="Sets the index output directory") | |
1231 | |
1232 threads = cli.SwitchAttr( | |
1233 ['-t', '--threads'], cli.Range(1, 512), mandatory=False, | |
1234 help="Sets the number of threads to use", | |
1235 default=1) | |
1236 | |
1237 taxonomy = cli.SwitchAttr( | |
1238 ['-x', '--taxonomy'], str, mandatory=False, | |
1239 help="Output path for the taxonomy file; setting this option will also enable regular taxonomy output to stdout during mapping", | |
1240 default='') | |
1241 | |
1242 samtools_path = cli.SwitchAttr(['--samtools_path'], str, mandatory=False, | |
1243 help="Path to samtools executable", | |
1244 default='') | |
1245 | |
1246 temp_path = cli.SwitchAttr(['--temporary_path'], str, mandatory=False, | |
1247 help="Path to temporary directory in which to generate temp files. All temp files with be automatically deleted after execution is complete.", | |
1248 default='') | |
1249 | |
1250 min_mapping_score = cli.SwitchAttr(['--min_mapping_score'], cli.Range(1, 255), mandatory=False, | |
1251 help="Mimimum mapping score for saved hits (only applied to -v/--virana_hits)", | |
1252 default=None) | |
1253 | |
1254 min_alignment_score = cli.SwitchAttr(['--min_alignment_score'], cli.Range(1, 255), mandatory=False, | |
1255 help="Mimimum alignment score for saved hits (only applied to -v/--virana_hits)", | |
1256 default=None) | |
1257 | |
1258 max_mismatches = cli.SwitchAttr(['--max_mismatches'], cli.Range(0, 10000000), mandatory=False, | |
1259 help="Maximum number of mismatches for saved hits (only applied to -v/--virana_hits)", | |
1260 default=None) | |
1261 | |
1262 max_relative_mismatches = cli.SwitchAttr(['--max_relative_mismatches'], float, mandatory=False, | |
1263 help="Maximum number of mismatches relative to read length for saved hits (only applied to -v/--virana_hits)", | |
1264 default=None) | |
1265 | |
1266 min_continiously_matching = cli.SwitchAttr(['--min_continiously_matching'], cli.Range(0, 10000000), mandatory=False, | |
1267 help="Minimum number of continious matches for saved hits (only applied to -v/--virana_hits)", | |
1268 default=None) | |
1269 | |
1270 filter_complexity = cli.Flag(['--filter_complexity'], | |
1271 help="Discard low-complexity reads (only applied to -v/--virana_hits). Adds some extra processing load to the mapping and may discard important information. Applies to all output files, including quality files (!)", | |
1272 default=False) | |
1273 | |
1274 sample_id = cli.SwitchAttr(['--sample_id'], str, mandatory=False, | |
1275 help="Alphanumeric string ([0-9a-zA-Z_-]*) used to designate sample information within the hit file", | |
1276 default='no_sample_id') | |
1277 | |
1278 bam = cli.SwitchAttr(['-b', '--bam'], str, mandatory=False, | |
1279 help="Path to unsorted, unindexed output BAM file", | |
1280 default='') | |
1281 | |
1282 sam = cli.SwitchAttr(['-s', '--sam'], str, mandatory=False, | |
1283 help="Path to output SAM file", | |
1284 default='') | |
1285 | |
1286 qual = cli.SwitchAttr(['-q', '--qual'], str, mandatory=False, | |
1287 help="Path to output quality file", | |
1288 default='') | |
1289 | |
1290 hits = cli.SwitchAttr(['-v', '--virana_hits'], str, mandatory=False, | |
1291 help="Path to bzip2-compressed tab-delimited output virana hit file", | |
1292 default='') | |
1293 | |
1294 hit_filter = cli.SwitchAttr( | |
1295 ['-f', '--virana_hit_filter'], str, list=True, mandatory=False, | |
1296 help="Only generate hit groups that include at last one read mapping to a reference of this reference group.", | |
1297 default=[]) | |
1298 | |
1299 debug = cli.Flag(["-d", "--debug"], help="Enable debug information") | |
1300 | |
1301 zipped = cli.Flag(["-z", "--zipped"], help="Input reads are zipped") | |
1302 | |
1303 sensitive = cli.Flag( | |
1304 ["--sensitive"], help="If given, mapping will process slower and more sensitive") | |
1305 | |
1306 | |
1307 bwa_path = cli.SwitchAttr(['--bwa_path'], str, mandatory=False, | |
1308 help="Path to BWA executable", | |
1309 default='') | |
1310 | |
1311 debug = cli.Flag(["-d", "--debug"], help="Enable debug information") | |
1312 | |
1313 if debug: | |
1314 logging.getLogger().setLevel(logging.DEBUG) | |
1315 | |
1316 reads = cli.SwitchAttr( | |
1317 ['-r', '--reads'], str, list=True, mandatory=True, | |
1318 help="Sets the input reads. Add this parameter twice for paired end reads.") | |
1319 | |
1320 def main(self): | |
1321 | |
1322 if self.debug: | |
1323 logging.getLogger().setLevel(logging.DEBUG) | |
1324 | |
1325 # Obtain star executable | |
1326 bwa = [self.bwa_path and self.bwa_path or 'bwa'] | |
1327 samtools = [self.samtools_path and self.samtools_path or 'samtools'] | |
1328 | |
1329 # Check if genome directory is existing | |
1330 if not os.path.exists(self.index_dir): | |
1331 sys.stdout.write('Index directory %s not existing, exiting'\ | |
1332 % self.genome_dir) | |
1333 sys.exit(1) | |
1334 | |
1335 if len(self.reads) not in (1, 2): | |
1336 message = 'Invalid number of FASTQ files; supply either one (single end) or two (paired end)\n' | |
1337 sys.stderr.write(message) | |
1338 sys.exit(1) | |
1339 | |
1340 bwa_cline = bwa + ['mem', '-t', str(self.threads), '-M', os.path.join(self.index_dir, 'index')] | |
1341 | |
1342 bwa_cline += self.reads | |
1343 | |
1344 if self.debug: | |
1345 print ' '.join(bwa_cline) | |
1346 | |
1347 bwa_process = subprocess.Popen(' '.join(bwa_cline), shell=True, stdout=PIPE) | |
1348 | |
1349 parser = SAMParser() | |
1350 | |
1351 if self.taxonomy: | |
1352 taxonomy = SAMTaxonomy(self.taxonomy) | |
1353 | |
1354 if self.qual: | |
1355 quality = SAMQuality(self.qual) | |
1356 | |
1357 if self.hits: | |
1358 hits = SAMHits(self.hits, self.sample_id, self.hit_filter, | |
1359 self.min_mapping_score, | |
1360 self.min_alignment_score, | |
1361 self.max_mismatches, | |
1362 self.max_relative_mismatches, | |
1363 self.min_continiously_matching, | |
1364 self.filter_complexity) | |
1365 | |
1366 if self.sam: | |
1367 sam_file = open(self.sam, 'w', buffering=100 * 1024 * 1024) | |
1368 | |
1369 if self.bam: | |
1370 with open(self.bam, 'wb', buffering=100 * 1024 * 1024) as bam_file: | |
1371 samtools_cline = samtools + [ | |
1372 'view', '-b', '-1', '-S', '-@', '4', '/dev/stdin'] | |
1373 if self.debug: | |
1374 print ' '.join(samtools_cline) | |
1375 samtools_process = subprocess.Popen(' '.join(samtools_cline), shell=True, stdout=bam_file, stdin=PIPE) | |
1376 | |
1377 do_sam = self.sam | |
1378 do_bam = self.bam | |
1379 do_taxonomy = self.taxonomy | |
1380 do_qual = self.qual | |
1381 do_hits = self.hits | |
1382 do_parse = do_taxonomy or do_qual or do_hits | |
1383 | |
1384 for i, line in enumerate(iter(bwa_process.stdout.readline, '')): | |
1385 | |
1386 if do_sam: | |
1387 sam_file.write(line) | |
1388 | |
1389 if do_bam: | |
1390 samtools_process.stdin.write(line) | |
1391 | |
1392 if do_parse: | |
1393 parsed_line = parser.parse(line) | |
1394 | |
1395 if do_taxonomy: | |
1396 taxonomy.count(parsed_line) | |
1397 | |
1398 if i > 0 and (i % 10000) == 0: | |
1399 print ''.join(taxonomy.get_summary(10)) | |
1400 | |
1401 if do_qual: | |
1402 quality.count(parsed_line) | |
1403 | |
1404 if do_hits: | |
1405 hits.count(parsed_line) | |
1406 | |
1407 if do_bam: | |
1408 samtools_process.stdin.close() | |
1409 | |
1410 if do_sam: | |
1411 sam_file.close() | |
1412 | |
1413 if do_hits: | |
1414 hits.write() | |
1415 | |
1416 if do_taxonomy: | |
1417 print ''.join(taxonomy.get_summary(10)) | |
1418 taxonomy.write() | |
1419 | |
1420 if do_qual: | |
1421 quality.write() | |
1422 | |
1423 | |
1424 if __name__ == "__main__": | |
1425 CLI.run() |