diff vhom.py @ 0:3ba5983012cf draft
Uploaded
author:   mzeidler
date:     Tue, 24 Sep 2013 10:19:40 -0400
parents:
children:
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/vhom.py	Tue Sep 24 10:19:40 2013 -0400
@@ -0,0 +1,1315 @@
+#!/usr/bin/env python
+
+import sys
+import os
+import logging
+import tempfile
+import shutil
+
+import subprocess
+import bz2
+import re
+import glob
+
+from shutil import rmtree
+from subprocess import PIPE
+from collections import defaultdict, Counter
+
+try:
+    import pysam
+except ImportError:
+    message = 'This script requires the pysam python package\n'
+    sys.stderr.write(message)
+    sys.exit(1)
+
+try:
+    from plumbum import cli
+except ImportError:
+    message = 'This script requires the plumbum python package\n'
+    sys.stderr.write(message)
+    sys.exit(1)
+
+try:
+    from Bio.SeqRecord import SeqRecord
+    from Bio import SeqIO
+    from Bio.Seq import Seq
+    from Bio.Alphabet import IUPAC, Gapped
+except ImportError:
+    message = 'This script requires the BioPython python package\n'
+    sys.stderr.write(message)
+    sys.exit(1)
+
+try:
+    import HTSeq
+except ImportError:
+    message = 'This script requires the HTSeq python package\n'
+    sys.stderr.write(message)
+    sys.exit(1)
+
+logging.basicConfig(level=logging.INFO, format='%(message)s')
+
+
+class CLI(cli.Application):
+
+    """ Homologous region analysis of hit files """
+    PROGNAME = "vhom"
+    VERSION = "1.0.0"
+    DESCRIPTION = """"""
+    USAGE = """"""
+
+    def main(self, *args):
+
+        if args:
+            print("Unknown command %r" % (args[0]))
+            print(self.USAGE)
+            return 1
+
+        if not self.nested_command:
+            print("No command given")
+            print(self.USAGE)
+            return 1
+
+
+class SequenceProxy:
+
+    def __init__(self, references_path, human_transcripts_path=None):
+
+        self.references_path = references_path
+        self.human_transcripts_path = human_transcripts_path
+
+        self.human_transcript_records = None
+        self.reference_records = None
+        self.hit_records = []
+
+        self.transcript_ranges = None
+
+    def add_hit_file(self, hit_file_path):
+
+        logging.debug('SequenceProxy: adding new hit file %s' % hit_file_path)
+
+        # Probe whether the file is bz2-compressed; fall back to plain fasta
+        hit_handle = bz2.BZ2File(hit_file_path, 'rb')
+        try:
+            hit_handle.read(10)
+            # Reopen so that parsing starts at the beginning of the file
+            hit_handle = bz2.BZ2File(hit_file_path, 'rb')
+        except IOError:
+            hit_handle = open(hit_file_path, 'rb')
+
+        hit_dict = SeqIO.to_dict(SeqIO.parse(hit_handle, "fasta"))
+
+        logging.debug('SequenceProxy: hit file has %i entries' % len(hit_dict))
+
+        self.hit_records.append(hit_dict)
+
+    def get_read_record(self, read_id):
+
+        for record_dict in self.hit_records:
+            if read_id in record_dict:
+                record = record_dict[read_id]
+                return SeqRecord(record.seq, record.id, '', '')
+
+        return None
+
+    def get_reference_record(self, identifier):
+
+        if not self.reference_records:
+            logging.debug('SequenceProxy: indexing reference records')
+            self.reference_records = SeqIO.index(self.references_path, 'fasta')
+
+        try:
+            record = self.reference_records[identifier]
+            return SeqRecord(record.seq, record.id, '', '')
+        except KeyError:
+            return None
+
+    def get_transcript_record(self, identifier):
+
+        if not self.human_transcripts_path:
+            return None
+
+        if not self.human_transcript_records:
+            logging.debug('SequenceProxy: indexing human transcripts')
+            self.human_transcript_records = SeqIO.index(
+                self.human_transcripts_path, 'fasta')
+
+        try:
+            record = self.human_transcript_records[identifier]
+            return SeqRecord(record.seq, record.id, '', '')
+
+        except KeyError:
+            return None
+
+    def get_overlapping_human_transcript_identifiers(self, chromosome, start, end):
+
+        if not self.human_transcripts_path:
+            return set()
+
+        if not self.transcript_ranges:
+            self._set_transcript_ranges()
+
+        interval = HTSeq.GenomicInterval(chromosome, start, end, '.')
+
+        all_transcript_ids = set()
+        for step_interval, transcript_ids in self.transcript_ranges[interval].steps():
+            all_transcript_ids = all_transcript_ids.union(transcript_ids)
+
+        return all_transcript_ids
+
+    def _set_transcript_ranges(self):
+
+        logging.debug(
+            'SequenceProxy: Preparing transcript ranges, please wait...')
+
+        self.transcript_ranges = HTSeq.GenomicArrayOfSets(
+            'auto', stranded=False)
+
+        for record in SeqIO.parse(self.human_transcripts_path, 'fasta'):
+            transcript_id = record.id
+            for exon in record.description.strip().split(' ')[1].split('|'):
+                chromosome, start, end = exon.split(';')
+                start, end = sorted((int(start), int(end)))
+                if start == end:
+                    continue
+                interval = HTSeq.GenomicInterval(chromosome, start, end, ".")
+                self.transcript_ranges[interval] += transcript_id
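+
+# A minimal usage sketch for SequenceProxy (file names are illustrative):
+#
+#     proxy = SequenceProxy('references.fa', 'human_cdna.fa')
+#     proxy.add_hit_file('sample_hits.fa.bz2')
+#
+# Hit files may be bz2-compressed or plain fasta; reference and transcript
+# fasta files are indexed lazily on first access.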
+
+
+class JalviewRunner:
+
+    def __init__(self, jalview_jar_dir, tmp_dir=None):
+
+        self.jalview_jar_dir = jalview_jar_dir
+
+        if tmp_dir:
+            self.tmp_dir = tempfile.mkdtemp(
+                dir=os.path.abspath(os.path.expandvars(tmp_dir)))
+        else:
+            self.tmp_dir = tempfile.mkdtemp()
+
+        self.config = ['#---JalviewX Properties File---',
+                       '#Fri Nov 04 14:23:53 CET 2011',
+                       'FIGURE_AUTOIDWIDTH=true',
+                       'ID_ITALICS=false',
+                       'SORT_ALIGNMENT=No sort',
+                       'SHOW_IDENTITY=true',
+                       'FONT_NAME=SansSerif',
+                       'GAP_SYMBOL=.',
+                       'SHOW_QUALITY=false',
+                       'SHOW_GROUP_CONSERVATION=false',
+                       'FONT_STYLE=plain',
+                       'ANTI_ALIAS=false',
+                       'SORT_BY_TREE=false',
+                       'SHOW_CONSENSUS_HISTOGRAM=false',
+                       'SHOW_OVERVIEW=false',
+                       'DEFAULT_COLOUR=Nucleotide',
+                       'SHOW_CONSENSUS_LOGO=false',
+                       'SHOW_ANNOTATIONS=true',
+                       'SHOW_UNCONSERVED=true',
+                       'AUTO_CALC_CONSENSUS=true',
+                       'PAD_GAPS=true',
+                       'FONT_SIZE=10',
+                       'RIGHT_ALIGN_IDS=false',
+                       'WRAP_ALIGNMENT=false',
+                       'FASTA_JVSUFFIX=false',
+                       'PILEUP_JVSUFFIX=false',
+                       'SHOW_JVSUFFIX=false']
+
+    def _make_configuration(self, sub_tmp_dir):
+
+        config_path = os.path.join(sub_tmp_dir, 'config.txt')
+        logging.debug(
+            'JalviewRunner: writing configuration file %s' % config_path)
+        with open(config_path, 'w') as config_file:
+            for c in self.config:
+                config_file.write(c + '\n')
+
+        return config_path
+
+    def _delete_tmp_dir(self):
+
+        try:
+            rmtree(self.tmp_dir)
+        except OSError:
+            pass
+
+    def _delete_sub_tmp_dir(self, sub_tmp_dir):
+
+        try:
+            rmtree(sub_tmp_dir)
+        except OSError:
+            pass
+
+    def run(self, fasta_path, png_output_path):
+
+        fasta_path = os.path.abspath(os.path.expandvars(fasta_path))
+
+        png_output_path = os.path.abspath(os.path.expandvars(png_output_path))
+
+        sub_tmp_dir = tempfile.mkdtemp(dir=self.tmp_dir)
+        config_path = self._make_configuration(sub_tmp_dir)
+
+        logging.debug(
+            'JalviewRunner: running Jalview on fasta %s with temp dir %s' %
+            (fasta_path, sub_tmp_dir))
+
+        cline = ['java', '-Djava.ext.dirs=%s/lib' % self.jalview_jar_dir,
+                 '-cp %s/jalview.jar' % self.jalview_jar_dir, 'jalview.bin.Jalview',
+                 '-open', fasta_path, '-nodisplay', '-png', png_output_path,
+                 '-colour', 'nucleotide', '-props', config_path]
+
+        # Run Jalview process
+        java_process = subprocess.Popen(
+            ' '.join(cline), shell=True, stdout=PIPE, stderr=PIPE)
+
+        # Block until streams are closed by the process
+        stdout, stderr = java_process.communicate()
+
+        if stderr:
+            sys.stderr.write('JalviewRunner: ' + stderr.replace('\n', ' '))
+
+        self._delete_sub_tmp_dir(sub_tmp_dir)
+
+        return png_output_path
+
+    def __del__(self):
+
+        self._delete_tmp_dir()
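+
+# JalviewRunner shells out to a local Jalview installation; a usage sketch
+# (the directory is illustrative):
+#
+#     jalview = JalviewRunner(jalview_jar_dir='/opt/jalview')
+#     jalview.run('consensus.fa', 'consensus.png')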
+
+
+class LastzRunner:
+
+    def __init__(self, lastz_path=None, word_length=7):
+
+        self.lastz_path = lastz_path
+        self.word_length = word_length
+
+    def align_to_bam_file(self, reference_fasta_path, query_fasta_path, output_bam_path, multiple=False, assert_record=None):
+
+        logging.debug('LastzRunner: running on reference %s and query %s' %
+                      (reference_fasta_path, query_fasta_path))
+        output_sam_path = os.path.abspath(
+            os.path.expandvars(output_bam_path.replace('.bam', '.sam')))
+        output_bam_unsorted_path = os.path.abspath(
+            os.path.expandvars(output_bam_path + '.unsorted'))
+
+        logging.debug(
+            'LastzRunner: aligning with output in temporary sam file %s' %
+            output_sam_path)
+        with open(output_sam_path, 'w') as output_sam_handler:
+            for line in self._align(reference_fasta_path, query_fasta_path, multiple):
+                output_sam_handler.write(line)
+
+        logging.debug(
+            'LastzRunner: transforming sam into unsorted bam file %s' %
+            output_bam_unsorted_path)
+        input_sam_handler = pysam.Samfile(output_sam_path, "r")
+        output_bam_file = pysam.Samfile(
+            output_bam_unsorted_path, "wb", template=input_sam_handler)
+
+        logging.debug(
+            'LastzRunner: copying from sam file to bam file')
+        for s in input_sam_handler:
+            output_bam_file.write(s)
+        output_bam_file.close()
+
+        logging.debug('LastzRunner: sorting and indexing bam file %s' %
+                      output_bam_path)
+        pysam.sort(output_bam_unsorted_path,
+                   output_bam_path.replace('.bam', ''))
+
+        pysam.index(output_bam_path)
+
+    def align_to_samlines(self, reference_fasta_path, query_fasta_path, multiple=False):
+
+        logging.debug('LastzRunner: running on reference %s and query %s' %
+                      (reference_fasta_path, query_fasta_path))
+
+        for line in self._align(reference_fasta_path, query_fasta_path, multiple):
+            yield line
+
+    def _align(self, reference_fasta_path, query_fasta_path, multiple):
+
+        reference_fasta_path = os.path.abspath(
+            os.path.expandvars(reference_fasta_path))
+        query_fasta_path = os.path.abspath(
+            os.path.expandvars(query_fasta_path))
+
+        lastz = [self.lastz_path or 'lastz']
+        if multiple:
+            cline = lastz + [
+                reference_fasta_path +
+                '[multiple]', query_fasta_path + '[nameparse=darkspace]',
+                '--format=sam', '--strand=both', '--gapped', '--chain',
+                '--nogfextend', '--seed=match%i' % self.word_length]
+
+        else:
+            cline = lastz + [
+                reference_fasta_path, query_fasta_path +
+                '[nameparse=darkspace]',
+                '--format=sam', '--strand=both', '--gapped',
+                '--chain', '--nogfextend', '--seed=match%i' % self.word_length]
+
+        # Run lastz process
+        lastz_process = subprocess.Popen(
+            ' '.join(cline), shell=True, stdout=PIPE, stderr=PIPE)
+
+        def matched_bases(cigar):
+            return sum(int(c[:-1]) for c in re.findall('[0-9]*M', cigar))
+
+        # Filter multiple read mappings (one to each strand) by only keeping
+        # the one with the most matched bases in its cigar string
+        stored = None  # (read_name, sam_line, cigar) of the pending mapping
+        for line in iter(lastz_process.stdout.readline, ''):
+            if line.startswith('@'):
+                yield line
+            elif line.lower().startswith('read'):
+                fields = line.split('\t')
+                read_name = fields[0]
+                read_cigar = fields[5]
+                if stored is None:
+                    stored = (read_name, line, read_cigar)
+                elif stored[0] == read_name:
+                    # Same read mapped a second time; keep the better mapping
+                    if matched_bases(read_cigar) > matched_bases(stored[2]):
+                        stored = (read_name, line, read_cigar)
+                else:
+                    yield stored[1]
+                    stored = (read_name, line, read_cigar)
+            else:
+                if stored is not None:
+                    yield stored[1]
+                    stored = None
+                yield line
+
+        # Flush the last pending read mapping
+        if stored is not None:
+            yield stored[1]
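+
+# LastzRunner wraps the lastz executable (found on the PATH unless lastz_path
+# is given); a usage sketch with illustrative paths:
+#
+#     runner = LastzRunner(word_length=7)
+#     runner.align_to_bam_file('reference.fa', 'queries.fa', 'aligned.bam')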
+
+
+class ConsensusRunner:
+
+    def __init__(self, ambiguity_cutoff=0.3):
+
+        self.ambiguity_cutoff = ambiguity_cutoff
+        self.ambiguity = {
+            'GT': 'K', 'AC': 'M', 'AG': 'R', 'CT': 'Y', 'CG': 'S',
+            'AT': 'W', 'CGT': 'B', 'ACG': 'V', 'ACT': 'H', 'AGT': 'D',
+            'ACGT': 'N'}
+
+        self.alphabet = Gapped(IUPAC.unambiguous_dna, "-")
+
+    def run(self, input_bam_path, output_fasta_path):
+
+        samfile = pysam.Samfile(input_bam_path, "rb")
+        references = samfile.references
+
+        logging.debug(
+            'ConsensusRunner: writing consensus of bam file %s' % input_bam_path)
+
+        assert len(references) == 1
+
+        conditions = {}
+
+        base_mapper = {'A': 0, 'C': 1, 'G': 2, 'T': 3, 'N': 4, '-': 5}
+        ambiguity = self.ambiguity
+
+        alphabet = Gapped(IUPAC.ambiguous_dna)
+        frequency_cutoff = self.ambiguity_cutoff
+
+        # Get all conditions of aligned reads
+        for alignedread in samfile.fetch(references[0]):
+
+            name = alignedread.qname
+            length = len(alignedread.seq)
+            condition = None
+
+            if name.lower().startswith('read'):
+                condition = ';'.join(name.split(';')[:2])
+            else:
+                condition = ';'.join(name.split(';')[:-1])
+
+            if condition in conditions:
+                if name not in conditions[condition][0]:
+                    conditions[condition][0].add(name)
+                    conditions[condition][1] += 1
+                    conditions[condition][2] += length
+            else:
+                conditions[condition] = [set([name]), 1, length]
+
+        # Prepare count dictionaries
+        consensus = defaultdict(list)
+
+        for i, pileupcolumn in enumerate(samfile.pileup(references[0], max_depth=100000)):
+
+            if i % 1000 == 0:
+                logging.debug(
+                    'ConsensusRunner: making consensus at position %i' % i)
+
+            counters = {}
+            for condition in conditions:
+                counters[condition] = [0, 0, 0, 0, 0, 0]  # ACGTN-
+
+            for pileupread in pileupcolumn.pileups:
+
+                alignment = pileupread.alignment
+                name = alignment.qname
+
+                if pileupread.is_del:
+                    base = '-'
+                else:
+                    base = alignment.seq[pileupread.qpos].upper()
+
+                if name.lower().startswith('read'):
+                    condition = ';'.join(name.split(';')[:2])
+                else:
+                    condition = ';'.join(name.split(';')[:-1])
+                counters[condition][base_mapper[base]] += 1
+
+            for condition, counts in counters.iteritems():
+                depth = float(sum(counts))
+                bases = []
+                a, c, g, t, n, gap = counts
+
+                if depth > 0:
+                    if (n / depth) >= frequency_cutoff:
+                        consensus[condition].append('N')
+                    else:
+                        if (a / depth) >= frequency_cutoff:
+                            bases.append('A')
+                        if (c / depth) >= frequency_cutoff:
+                            bases.append('C')
+                        if (g / depth) >= frequency_cutoff:
+                            bases.append('G')
+                        if (t / depth) >= frequency_cutoff:
+                            bases.append('T')
+
+                        if len(bases) > 1:
+                            consensus[condition].append(
+                                ambiguity[''.join(sorted(bases))])
+                        elif len(bases) == 1:
+                            consensus[condition].append(bases[0])
+                        else:
+                            consensus[condition].append('-')
+                else:
+                    # No coverage for this condition at this column; append a
+                    # gap so that columns stay aligned across conditions
+                    consensus[condition].append('-')
+
+        # Split consensuses by type
+        reference_consensuses = []
+        read_consensuses = []
+        for condition, sequence_list in consensus.iteritems():
+            if condition.lower().startswith('read'):
+                read_consensuses.append((''.join(sequence_list), condition))
+            else:
+                reference_consensuses.append(
+                    (''.join(sequence_list), condition))
+
+        reference_consensuses.sort(reverse=True)
+        read_consensuses.sort(reverse=True)
+
+        # Get start and end of reads (remove fully gapped positions)
+        start = None
+        end = None
+        for read_consensus, condition in read_consensuses:
+
+            # Forward direction
+            for i, char in enumerate(read_consensus):
+                if char != '-':
+                    if start is None:
+                        start = i
+                    else:
+                        start = min(start, i)
+                    break
+
+            # Reverse direction
+            for i, char in enumerate(read_consensus[::-1]):
+                if char != '-':
+                    if end is None:
+                        end = i
+                    else:
+                        end = min(end, i)
+                    break
+
+        if end == 0 or end is None:
+            end = None
+        else:
+            end = -end
+
+        # Filter all records by start and end of reads
+        reference_consensuses =\
+            [(r[start:end], c) for r, c in reference_consensuses]
+
+        read_consensuses =\
+            [(r[start:end], c) for r, c in read_consensuses]
+
+        # Write consensus records to file
+        with open(output_fasta_path, 'w', buffering=100 * 1024 * 1024) as output_handler:
+            for consensus, condition in read_consensuses + reference_consensuses:
+                stats = ';'.join(map(str, conditions[condition][1:]))
+                record = SeqRecord(
+                    Seq(consensus, alphabet=alphabet), id=condition + ';' + stats, name='', description='')
+                SeqIO.write([record], output_handler, "fasta")
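+
+# Consensus calling, a worked example: at a column with depth 10 and counts
+# A=6, G=4 under ambiguity_cutoff=0.3, both A (0.6) and G (0.4) pass the
+# cutoff, so the consensus records the IUPAC ambiguity code 'R' (A or G).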
+
+
+class Group:
+
+    def __init__(self, family_name, qualified_family_name, sequence_proxy, tmp_dir=None):
+
+        self.family_name = family_name
+        self.qualified_family_name = qualified_family_name
+        self.sequence_proxy = sequence_proxy
+
+        self.regions = []
+        self.fasta_path = None
+
+        if tmp_dir:
+            self.tmp_dir = tempfile.mkdtemp(
+                dir=os.path.abspath(os.path.expandvars(tmp_dir)))
+        else:
+            self.tmp_dir = tempfile.mkdtemp()
+
+    def add_region(self, read_id, pathogen_mapping_locations, human_mapping_locations):
+
+        region = Region(self.sequence_proxy, self.tmp_dir)
+
+        region.add_read(read_id)
+
+        for mapping_location in list(pathogen_mapping_locations) + list(human_mapping_locations):
+            reference = ';'.join(mapping_location[:-2])
+            region.add_reference(
+                reference, int(mapping_location[-2]), int(mapping_location[-1]))
+
+        self.regions.append(region)
+
+    def write_outputs(self, output_dir, lastz_runner, consensus_builder, jalview_runner):
+
+        if not self.regions:
+            logging.debug('Group %s: no regions for family' % self.family_name)
+            return
+
+        made_output_dir = False
+        for i, region in enumerate(self.regions):
+
+            if not made_output_dir:
+                output_dir = os.path.abspath(
+                    os.path.expandvars(os.path.join(output_dir, self.qualified_family_name)))
+                logging.debug('Group %s: writing output files to %s' %
+                              (self.family_name, output_dir))
+                try:
+                    os.makedirs(output_dir)
+                except OSError:
+                    pass
+                made_output_dir = True
+
+            logging.debug('Group %s: writing region number %i files to %s' %
+                          (self.family_name, i + 1, output_dir))
+
+            unaligned_path = os.path.join(
+                output_dir, 'region_%i_unaligned.fa.bzip2' % (i + 1))
+            compressed_unaligned = bz2.BZ2File(unaligned_path, 'wb')
+            with open(region.get_unaligned_fasta_path()) as unaligned_fasta:
+                for line in unaligned_fasta:
+                    compressed_unaligned.write(line)
+            compressed_unaligned.close()
+
+            bam_path = os.path.join(
+                output_dir, 'region_%i_alignment.bam' % (i + 1))
+            temporary_alignment_bam_path = region.get_alignment_bam_path(
+                lastz_runner, assert_record='Read')
+            shutil.copy(temporary_alignment_bam_path, bam_path)
+
+            shutil.copy(
+                temporary_alignment_bam_path + '.bai', bam_path + '.bai')
+
+            consensus_path = os.path.join(
+                output_dir, 'region_%i_consensus.fa' % (i + 1))
+            temporary_consensus_path = region.get_consensus_fasta_path(
+                consensus_builder, temporary_alignment_bam_path)
+            shutil.copy(temporary_consensus_path, consensus_path)
+
+            if jalview_runner:
+
+                jalview_path = os.path.join(
+                    output_dir, 'region_%i_consensus.png' % (i + 1))
+                temporary_jalview_path = region.get_consensus_figure_path(
+                    jalview_runner, temporary_consensus_path)
+                shutil.copy(temporary_jalview_path, jalview_path)
+
+    def _delete_temporary_dir(self):
+
+        self.fasta_path = None
+        try:
+            rmtree(self.tmp_dir)
+        except OSError:
+            pass
+
+    def filter_regions(self, min_region_length=50, min_read_number=5):
+
+        logging.debug('Group %s: filtering %i candidate regions' %
+                      (self.family_name, len(self.regions)))
+
+        filtered = [r for r in self.regions
+                    if len(r) >= min_read_number
+                    and r.get_longest_reference_length() >= min_region_length]
+
+        ordered = sorted(filtered, reverse=True)
+
+        if ordered:
+            length = ordered[0].length
+        else:
+            length = 0
+
+        logging.debug(
+            'Group %s: %i candidate regions remained after filtering, the longest is %i bp long' %
+            (self.family_name, len(ordered), length))
+
+        self.regions = ordered
+
+        return ordered
+
+    def merge_regions(self, max_gap_length):
+        """ Merges candidate regions into homologous regions. """
+
+        logging.debug('Group %s: merging %i candidate regions' %
+                      (self.family_name, len(self.regions)))
+
+        if len(self.regions) > 1:
+
+            potentially_mergable = self.regions
+            not_mergable = []
+
+            while len(potentially_mergable) > 1:
+
+                merged = False
+                current = potentially_mergable[0]
+                compared_to = potentially_mergable[1:]
+
+                for region in compared_to:
+                    if region.overlaps(current, max_gap_length):
+                        region.merge(current)
+                        region.clean_references(max_gap_length)
+                        # logging.debug('Group %s: merged a region. %i potentially mergable candidate regions remaining' % (self.family_name, len(potentially_mergable)))
+                        potentially_mergable = compared_to
+                        merged = True
+                        break
+
+                if not merged:
+                    not_mergable.append(current)
+                    potentially_mergable = compared_to
+                    # logging.debug('Group %s: not merged a region. %i potentially mergable candidate regions remaining' % (self.family_name, len(potentially_mergable)))
+
+            results = not_mergable + potentially_mergable
+
+            logging.debug('Group %s: merged into %i regions' %
+                          (self.family_name, len(results)))
+
+            self.regions = results
+
+        else:
+            logging.debug(
+                'Group %s: found only 1 region, no merging necessary' % self.family_name)
+
+    def add_transcripts_to_regions(self):
+
+        logging.debug('Group %s: enriching %i regions with human transcript information'
+                      % (self.family_name, len(self.regions)))
+
+        for region in self.regions:
+
+            added_transcripts = 0
+
+            for identifier, locations in region.references.iteritems():
+                chromosome = identifier.split(';')[-1]
+
+                for start, end in locations:
+                    transcript_identifiers = self.sequence_proxy.get_overlapping_human_transcript_identifiers(
+                        chromosome, start, end)
+
+                    for transcript_identifier in transcript_identifiers:
+                        region.add_transcript(transcript_identifier)
+                        added_transcripts += 1
+
+            logging.debug('Group %s: added %i transcripts to a region' %
+                          (self.family_name, added_transcripts))
+
+    def __str__(self):
+
+        s = "Group %s with %i regions" % (self.family_name, len(self.regions))
+
+        return s
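+
+# Typical Group lifecycle, mirroring the 'regions' subcommand below:
+#
+#     group.merge_regions(max_gap_length=25)
+#     group.filter_regions(min_region_length=50, min_read_number=5)
+#     group.write_outputs(output_dir, lastz, consensus, jalview)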
+
+
+class GroupGenerator:
+
+    """ Produces Groups from hit files. Reads are assigned to groups based on
+        taxonomic families. Transcripts overlapping human read mappings as well
+        as genome sequences of pathogens the reads map to are added to the
+        Groups.
+    """
+
+    def __init__(self, sequence_proxy, reference_database_filter=None, pathogen_family_filter=None, tmp_dir=None):
+
+        self.sequence_proxy = sequence_proxy
+
+        self.tmp_path = tmp_dir
+        self.reference_database_filter = reference_database_filter
+        self.pathogen_family_filter = pathogen_family_filter
+
+        self.groups = {}
+
+        if tmp_dir:
+            self.tmp_dir = tempfile.mkdtemp(
+                dir=os.path.abspath(os.path.expandvars(tmp_dir)))
+        else:
+            self.tmp_dir = tempfile.mkdtemp()
+
+    def get_groups(self, min_read_number=5):
+
+        logging.debug(
+            'GroupGenerator: generating groups from %i hit files, please wait...' %
+            len(self.sequence_proxy.hit_records))
+
+        for hit_index in self.sequence_proxy.hit_records:
+            for read_id, record in hit_index.iteritems():
+
+                # Extract mapping locations to human DNA, human transcripts,
+                # and pathogen genomes
+                mapping_locations = record.description.strip().split(
+                    ' ')[1].split('|')
+
+                pathogen_families = defaultdict(set)
+                human_mapping_locations = set()
+
+                for mapping_location in mapping_locations:
+                    fields = mapping_location.split(';')
+
+                    # Convert start and end to integers
+                    fields[-2] = int(fields[-2])  # start
+                    fields[-1] = int(fields[-1])  # end
+
+                    # Add mapping locations to human cDNA or human DNA
+                    if fields[2] == 'Homo_sapiens':
+                        human_mapping_locations.add(tuple(fields))
+
+                    # Add mapping locations to non-human references
+                    else:
+                        reference_database, family = fields[:2]
+                        if self.reference_database_filter and reference_database\
+                                not in self.reference_database_filter:
+                            continue
+                        if self.pathogen_family_filter and family\
+                                not in self.pathogen_family_filter:
+                            continue
+                        pathogen_families[
+                            (reference_database, family)].add(tuple(fields))
+
+                if not pathogen_families:
+                    continue
+
+                # Process the hits to different pathogen families
+                for (reference_database, family), pathogen_locations in pathogen_families.iteritems():
+
+                    qualified_family_name = reference_database + '_' + family
+
+                    # Obtain existing group or make new group
+                    if qualified_family_name in self.groups:
+                        group = self.groups[qualified_family_name]
+                    else:
+                        group = Group(family, qualified_family_name,
+                                      self.sequence_proxy, self.tmp_dir)
+                        self.groups[qualified_family_name] = group
+                        logging.debug(
+                            'GroupGenerator: made new group for family %s' % qualified_family_name)
+
+                    group.add_region(
+                        read_id, pathogen_locations, human_mapping_locations)
+
+        # Exclude groups that have collected too few reads
+        for qualified_family_name, group in self.groups.items():
+            if len(group.regions) < min_read_number:
+                del self.groups[qualified_family_name]
+
+        return self.groups
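+
+# Hit records are fasta entries whose description apparently encodes mapping
+# locations as '|'-separated, ';'-joined fields ending in start and end, e.g.
+# (an illustrative sketch, not a verbatim record):
+#
+#     >read_1 db;family;organism;...;start;end|db;family;organism;...;start;end
+#
+# A location with fields[2] == 'Homo_sapiens' counts as a human mapping; all
+# others are treated as pathogen mappings and grouped by (database, family).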
+
+
+class Region:
+
+    def __init__(self, sequence_proxy, tmp_dir=None):
+
+        self.sequence_proxy = sequence_proxy
+
+        self.references = defaultdict(set)
+        self.reads = set()
+        self.transcripts = set()
+        self.length = None
+
+        self.sorted_reference_positions = None
+
+        self.master_tmp = tmp_dir
+        self.tmp_dir = None
+
+        self.unaligned_fasta_path = None
+        self.alignment_sam_path = None
+        self.consensus_fasta_path = None
+
+    def add_reference(self, name, start, end):
+
+        assert start < end
+        # Invalidate cached length information; it is recomputed lazily
+        self.length = None
+        self.longest_reference_id = None
+        self.sorted_reference_positions = None
+        self.references[name].add((start, end))
+
+    def add_read(self, name):
+
+        self.reads.add(name)
+
+    def add_transcript(self, name):
+
+        self.transcripts.add(name)
+
+    def get_tmp_path(self):
+
+        if self.tmp_dir:
+            return self.tmp_dir
+
+        if self.master_tmp:
+            self.tmp_dir = tempfile.mkdtemp(
+                dir=os.path.abspath(os.path.expandvars(self.master_tmp)))
+        else:
+            self.tmp_dir = tempfile.mkdtemp()
+
+        return self.tmp_dir
+
+    def _distance(self, range1, range2):
+
+        first, second = sorted((range1, range2))
+
+        if first[0] <= first[1] < second[0]:
+            return (second[0] - first[1])
+        else:
+            return 0
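+
+    # _distance, a worked example: for ranges (10, 20) and (26, 40) the
+    # ranges do not touch, so the gap is 26 - 20 = 6; overlapping or
+    # adjacent ranges yield 0.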
+
+    def _delete_temporary_dir(self):
+
+        self.unaligned_fasta_path = None
+        if self.tmp_dir:
+            try:
+                rmtree(self.tmp_dir)
+            except OSError:
+                pass
+
+    def _get_unaligned_fasta_path(self):
+
+        output_path = os.path.join(self.get_tmp_path(), 'unaligned_region.fa')
+        logging.debug('Region: writing unaligned fasta to %s' % output_path)
+
+        assert self.reads
+
+        with open(output_path, 'w', buffering=100 * 1024 * 1024) as output_handler:
+
+            # Cut and write references
+            human_positions = []
+            pathogen_positions = []
+            for length, identifier, start, end in self.get_sorted_reference_positions():
+                if ';Homo_sapiens;' in identifier:
+                    human_positions.append((identifier, start, end))
+                else:
+                    pathogen_positions.append((identifier, start, end))
+
+            for identifier, start, end in pathogen_positions + human_positions:
+                record = self.sequence_proxy.get_reference_record(identifier)
+                if record:
+                    record = SeqRecord(record.seq, record.id, '', '')
+                    # record.id += ';%i-%i' % (start, end)
+                    SeqIO.write(
+                        [record[start - 1:end]], output_handler, "fasta")
+                else:
+                    pass
+                    # logging.debug('Region: could not retrieve reference %s' % identifier)
+
+            # Write full-length transcripts
+            for identifier in self.transcripts:
+                record = self.sequence_proxy.get_transcript_record(identifier)
+                if record:
+                    record = SeqRecord(record.seq, record.id, '', '')
+                    SeqIO.write([record], output_handler, "fasta")
+                else:
+                    logging.debug(
+                        'Region: could not retrieve reference %s' % identifier)
+
+            # Write reads
+            for read_id in sorted(self.reads):
+                record = self.sequence_proxy.get_read_record(read_id)
+                if record:
+                    # Transform read ID so that LASTZ can work with it (it
+                    # makes nicknames based on some characters)
+                    identifier = record.id.replace(
+                        ':', '_').replace('|', '_').replace('/', '_')
+                    record = SeqRecord(record.seq, identifier, '', '')
+                    SeqIO.write([record], output_handler, "fasta")
+                else:
+                    logging.debug(
+                        'Region: could not retrieve read %s' % read_id)
+
+        self.unaligned_fasta_path = output_path
+
+        return self.unaligned_fasta_path
+
+    def get_unaligned_fasta_path(self):
+
+        if not self.unaligned_fasta_path:
+            self._get_unaligned_fasta_path()
+
+        return self.unaligned_fasta_path
+
+    def _align_fastas(self, aligner, assert_record=None):
+
+        # Write reference fasta
+        queries_path = self.get_unaligned_fasta_path()
+        first_record = SeqIO.parse(open(queries_path, "rU"), "fasta").next()
+        reference_path = os.path.join(self.get_tmp_path(), 'reference.fa')
+
+        logging.debug(
+            'Region: writing longest reference to %s' % reference_path)
+
+        SeqIO.write([first_record], reference_path, "fasta")
+
+        # Do alignment
+        bam_path = os.path.join(self.get_tmp_path(), 'aligned.bam')
+        logging.debug(
+            'Region: starting alignment using queries %s to sam path %s' %
+            (queries_path, bam_path))
+
+        aligner.align_to_bam_file(
+            reference_path, queries_path, bam_path, assert_record=assert_record)
+
+        return bam_path
+
+    def _build_alignment_consensus(self, consensus_builder, alignment_bam_path):
+
+        consensus_path = os.path.join(self.get_tmp_path(), 'consensus.fa')
+        logging.debug('Region: writing consensus to %s' % consensus_path)
+
+        consensus_builder.run(alignment_bam_path, consensus_path)
+
+        return consensus_path
+
+    def get_alignment_bam_path(self, aligner, assert_record=None):
+
+        return self._align_fastas(aligner, assert_record)
+
+    def get_consensus_fasta_path(self, consensus_builder, alignment_bam_path):
+
+        return self._build_alignment_consensus(consensus_builder, alignment_bam_path)
+
+    def get_consensus_figure_path(self, jalview_runner, consensus_fasta_path):
+
+        png_path = os.path.join(self.get_tmp_path(), 'consensus_figure.png')
+
+        return jalview_runner.run(consensus_fasta_path, png_path)
+
+    def overlaps(self, other, max_gap_length):
+
+        overlap = (set(self.references) & set(other.references))
+
+        # Overlap is solely based on non-human references
+        overlap = [ref for ref in overlap if ';Homo_sapiens;' not in ref]
+
+        if not overlap:
+            return False
+
+        for reference in overlap:
+            reflist = self.references[reference]
+            for start, end in reflist:
+                for other_start, other_end in other.references[reference]:
+                    distance = self._distance((start, end),
+                                              (other_start, other_end))
+                    if distance <= max_gap_length:
+                        return True
+        return False
+
+    def merge(self, other):
+
+        for reference, reflist in other.references.iteritems():
+            for (start, end) in reflist:
+                self.add_reference(reference, start, end)
+
+        for read in other.reads:
+            self.add_read(read)
+
+    def clean_references(self, max_gap_length):
+
+        for reference, reflist in self.references.iteritems():
+
+            start_list = sorted(reflist)
+            saved = list(start_list[0])
+            result = set()
+
+            for item in start_list:
+                if self._distance(saved, item) <= max_gap_length:
+                    saved[1] = max(saved[1], item[1])
+                else:
+                    result.add(tuple(saved))
+                    saved[0] = item[0]
+                    saved[1] = item[1]
+            result.add(tuple(saved))
+            self.references[reference] = result
+
+    def to_sorted_record_ids(self):
+
+        references = []
+        for name, locations in self.references.iteritems():
+            for start, end in locations:
+                references.append((end - start, name))
+
+        reads = defaultdict(list)
+        for name, locations in self.references.iteritems():
+            condition = name.split(';')[1]
+            for start, end in locations:
+                reads[condition].append((end - start, name))
+
+        all_record_ids = sorted(references, reverse=True)
+        for condition, the_reads in reads.iteritems():
+            all_record_ids += sorted(the_reads, reverse=True)
+
+        for length, record_id in all_record_ids:
+            yield record_id
+
+    def __len__(self):
+
+        return len(self.reads)
+
+    def __cmp__(self, other):
+
+        if self.get_longest_reference_length() <\
+                other.get_longest_reference_length():
+            return -1
+        elif self.get_longest_reference_length() ==\
+                other.get_longest_reference_length():
+            return 0
+        return 1
+
+    def get_longest_reference_length(self):
+
+        if self.length is not None:
+            return self.length
+
+        positions = self.get_sorted_reference_positions()
+        if positions:
+            self.length = positions[0][0]
+        else:
+            self.length = 0
+
+        return self.length
+
+    def get_sorted_reference_positions(self):
+
+        if self.sorted_reference_positions is not None:
+            return self.sorted_reference_positions
+
+        lengths = []
+        for reference, the_set in self.references.iteritems():
+            for start, end in the_set:
+                lengths.append([end - start, reference, start, end])
+
+        self.sorted_reference_positions = sorted(lengths, reverse=True)
+
+        return self.sorted_reference_positions
+
+    def __str__(self):
+        return '<Region> of length %10i with %10i reads and %5i references' %\
+            (self.get_longest_reference_length(),
+             len(self.reads), len(self.references))
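+
+# Regions compare by the length of their longest reference interval, so
+# sorted(regions, reverse=True) puts the region with the longest reference
+# first; Group.filter_regions relies on this when reporting the longest
+# remaining region.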
+
+
+class RegionStatistics:
+
+    def __init__(self, input_dir):
+
+        self.input_dir = input_dir
+        self.stats = []
+
+    def _add_sequence(self, qualified_name, region_id,
+                      longest_human_reference, longest_pathogen_reference, record):
+
+        state = 'pathogen'
+        if longest_human_reference:
+            state = 'ambiguous'
+
+        reference_type = qualified_name.split('_')[0]
+        family_name = '_'.join(qualified_name.split('_')[1:])
+
+        read_type, sample_id, number_reads, basepairs = record.id.split(';')
+
+        coverage = int(basepairs) / float(len(longest_pathogen_reference))
+
+        self.stats.append(
+            [reference_type, family_name, region_id, sample_id, state,
+             number_reads, str(len(longest_pathogen_reference)), basepairs, '%.5f' % coverage])
+
+    def run(self, output_file):
+
+        for qualified_family_name in os.listdir(self.input_dir):
+
+            family_dir = os.path.join(self.input_dir, qualified_family_name)
+
+            if not os.path.isdir(family_dir):
+                continue
+
+            consensus_regions = glob.glob(os.path.join(
+                family_dir, 'region_*_consensus.fa'))
+
+            for consensus_region in consensus_regions:
+
+                base_name = os.path.basename(consensus_region)
+
+                region_id = base_name.split('_')[1]
+
+                reads = []
+                longest_human_reference = None
+                longest_pathogen_reference = None
+
+                for consensus_record in SeqIO.parse(consensus_region, 'fasta'):
+                    if consensus_record.id.lower().startswith('read'):
+                        # ends with read number; cumulative bases
+                        reads.append(consensus_record)
+                    elif ';Homo_sapiens;' in consensus_record.id:
+                        if longest_human_reference is None or\
+                                len(consensus_record) > len(longest_human_reference):
+                            longest_human_reference = consensus_record
+                    else:
+                        if longest_pathogen_reference is None or\
+                                len(consensus_record) > len(longest_pathogen_reference):
+                            longest_pathogen_reference = consensus_record
+
+                for read in reads:
+                    self._add_sequence(qualified_family_name, region_id,
+                                       longest_human_reference, longest_pathogen_reference, read)
+
+        with open(os.path.abspath(os.path.expandvars(output_file)), 'w') as output_handler:
+            output_handler.write(
+                '\t'.join(['reference_type', 'family_name', 'region_id',
+                           'sample_id', 'taxonomic_origin', 'number_reads',
+                           'region_length', 'basepairs', 'coverage']) + '\n')
+            for entries in sorted(self.stats):
+                line = '\t'.join(entries) + '\n'
+                output_handler.write(line)
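+
+# Statistics, a worked example: a region whose longest pathogen reference is
+# 500 bp, covered by reads totalling 1250 bases, is reported with coverage
+# 1250 / 500.0 = 2.50000 in the output table.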
+
+
+@CLI.subcommand("regions")
+class RegionRunner(cli.Application):
+
+    """ Derive homologous groups and regions from hit files """
+
+    hit_files = cli.SwitchAttr(
+        ['-v', '--virana_hits'], str, list=True, mandatory=True,
+        help="Add hit file for analysis. May be supplied multiple times.")
+
+    lastz_path = cli.SwitchAttr(['-z', '--lastz_path'], str, mandatory=False,
+                                help="Path to lastz executable",
+                                default='')
+
+    jalview_jar_dir = cli.SwitchAttr(
+        ['-j', '--jalview_jar_dir'], str, mandatory=False,
+        help="Directory containing the jalview.jar file",
+        default=None)
+
+    references_path = cli.SwitchAttr(
+        ['-r', '--references'], str, mandatory=True,
+        help="Input fasta file containing genomic references of pathogens and possibly of human chromosomes as well (the latter is experimental)",
+        default='')
+
+    cdna_path = cli.SwitchAttr(['-c', '--cdna'], str, mandatory=False,
+                               help="Input fasta file containing human cDNA records.",
+                               default=None)
+
+    output_dir = cli.SwitchAttr(['-o', '--output_dir'], str, mandatory=True,
+                                help="Output directory that will be filled with subdirectories corresponding to homologous regions. The directory will be generated if it does not exist.",
+                                default='')
+
+    tmp_dir = cli.SwitchAttr(['-t', '--tmp_dir'], str, mandatory=False,
+                             help="Directory in which temporary files are stored. Temporary files will be stored in subdirectories of the provided directory and will be deleted after completion.",
+                             default=None)
+
+    stat_path = cli.SwitchAttr(['-s', '--region_stats'], str, mandatory=False,
+                               help="Path to output statistics tab-delimited text file.",
+                               default='')
+
+    reference_database_filter = cli.SwitchAttr(
+        ['-b', '--reference_database_filter'], str, list=True, mandatory=False,
+        help="Specifies which reference databases are considered when extracting hits from hit files. May be specified multiple times. If any are specified, all reference databases not specified are filtered out. By default, this parameter is empty.",
+        default=[])
+
+    pathogen_family_filter = cli.SwitchAttr(
+        ['-f', '--pathogen_family_filter'], str, list=True, mandatory=False,
+        help="Specifies which pathogen families are considered when extracting hits from hit files. May be specified multiple times. If any are specified, all families not specified are filtered out. By default, this parameter is empty.",
+        default=[])
+
+    min_read_number = cli.SwitchAttr(
+        ['-m', '--min_read_number'], cli.Range(1, 1000), mandatory=False,
+        help="Minimum number of reads that are required to be present in a homologous region. Regions with fewer reads will be omitted from the results.",
+        default=5)
+
+    max_gap_length = cli.SwitchAttr(
+        ['-l', '--max_gap_length'], cli.Range(1, 1000), mandatory=False,
+        help="Maximum number of bases by which two candidate homologous regions may be separated on a common reference sequence and still be eligible for merging.",
+        default=25)
+
+    min_region_length = cli.SwitchAttr(
+        ['-x', '--min_region_length'], cli.Range(1, 1000), mandatory=False,
+        help="Minimum length, in bases, of the longest reference sequence of each homologous region that is generated. Shorter regions will be omitted from the results.",
+        default=50)
+
+    word_length = cli.SwitchAttr(
+        ['-w', '--word_length'], cli.Range(1, 21), mandatory=False,
+        help="Word length of the lastz alignment process. Shorter word lengths allow more sensitive but less specific alignments.",
+        default=7)
+
+    ambiguity_cutoff = cli.SwitchAttr(
+        ['-a', '--ambiguity_cutoff'], float, mandatory=False,
+        help="Minimum fraction of reads supporting a variant base within an alignment column for the consensus sequence to show an ambiguous base at that position instead of the majority base.",
+        default=0.3)
+
+    debug = cli.Flag(
+        ["-d", "--debug"], help="Enable debug information")
+
+    def main(self):
+
+        if self.debug:
+            logging.getLogger().setLevel(logging.DEBUG)
+
+        # Make sequence proxy for managing hit files, references, and cdna
+        proxy = SequenceProxy(self.references_path, self.cdna_path)
+        for hit_file in self.hit_files:
+            proxy.add_hit_file(hit_file)
+
+        # Generate homologous groups
+        generator = GroupGenerator(proxy,
+                                   reference_database_filter=self.reference_database_filter,
+                                   pathogen_family_filter=self.pathogen_family_filter,
+                                   tmp_dir=self.tmp_dir)
+        groups = generator.get_groups(min_read_number=self.min_read_number)
+
+        # Prepare analysis modules ('runners') for postprocessing homologous
+        # regions
+        consensus = ConsensusRunner(ambiguity_cutoff=self.ambiguity_cutoff)
+        # Pass through the lastz executable path supplied via --lastz_path
+        lastz = LastzRunner(
+            lastz_path=self.lastz_path, word_length=self.word_length)
+
+        if self.jalview_jar_dir:
+            jalview = JalviewRunner(
+                jalview_jar_dir=self.jalview_jar_dir, tmp_dir=self.tmp_dir)
+        else:
+            jalview = None
+
+        # Make homologous regions within each homologous group
+        for name, group in groups.iteritems():
+            group.merge_regions(max_gap_length=self.max_gap_length)
+            group.filter_regions(
+                min_region_length=self.min_region_length, min_read_number=self.min_read_number)
+
+            if self.cdna_path:
+                group.add_transcripts_to_regions()
+
+            group.write_outputs(self.output_dir, lastz, consensus, jalview)
+            group._delete_temporary_dir()
+
+        # Run statistics on all homologous regions
+        if self.stat_path:
+            statistics = RegionStatistics(self.output_dir)
+            statistics.run(self.stat_path)
+
+if __name__ == "__main__":
+    CLI.run()
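+
+# Example invocation of the 'regions' subcommand (paths are illustrative):
+#
+#     python vhom.py regions -v sample_hits.fa.bz2 -r references.fa \
+#         -c human_cdna.fa -o output_regions -s region_stats.tsv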