vhom.py @ changeset 0:3ba5983012cf (draft)

summary: Uploaded
author:  mzeidler
date:    Tue, 24 Sep 2013 10:19:40 -0400
1 #!/usr/bin/env python
2
3 import sys
4 import os
5 import logging
6 import tempfile
7 import shutil
8
9 import subprocess
10 import bz2
11 import re
12 import glob
13
14 from shutil import rmtree
15 from subprocess import PIPE
16 from collections import defaultdict, Counter
17
18 try:
19 import pysam
20 except ImportError:
21 message = 'This script requires the pysam python package\n'
22 sys.stderr.write(message)
23 sys.exit(1)
24
25 try:
26 from plumbum import cli
27 except ImportError:
28 message = 'This script requires the plumbum python package\n'
29 sys.stderr.write(message)
30 sys.exit(1)
31
32 try:
33 from Bio.SeqRecord import SeqRecord
34 from Bio import SeqIO
35 from Bio.Seq import Seq
36 from Bio.Alphabet import IUPAC, Gapped
37 except ImportError:
38 message = 'This script requires the Biopython python package\n'
39 sys.stderr.write(message)
40 sys.exit(1)
41
42 try:
43 import HTSeq
44 except ImportError:
45 message = 'This script requires the HTSeq python package\n'
46 sys.stderr.write(message)
47 sys.exit(1)
48
49 logging.basicConfig(level=logging.INFO, format='%(message)s')
50
51
52 class CLI(cli.Application):
53
54 """ Homologous region analysis of hit files"""
55 PROGNAME = "vhom"
56 VERSION = "1.0.0"
57 DESCRIPTION = """"""
58 USAGE = """"""
59
60 def main(self, *args):
61
62 if args:
63 print("Unknown command %r" % (args[0]))
64 print(self.USAGE)
65 return 1
66
67 if not self.nested_command:
68 print("No command given")
69 print(self.USAGE)
70 return 1
71
72 class SequenceProxy:
73
74 def __init__(self, references_path, human_transcripts_path=None):
75
76 self.references_path = references_path
77 self.human_transcripts_path = human_transcripts_path
78
79 self.human_transcript_records = None
80 self.reference_records = None
81 self.hit_records = []
82
83 self.transcript_ranges = None
84
85 def add_hit_file(self, hit_file_path):
86
87 logging.debug('SequenceProxy: adding new hit file %s' % hit_file_path)
88
89 hit_handle = bz2.BZ2File(hit_file_path, 'rb')
90 try:
91 hit_handle.read(10)
92 hit_handle.seek(0)  # rewind after the bz2 sniff so no records are lost
93 except IOError:
94 hit_handle = open(hit_file_path, 'rb')
95 hit_dict = SeqIO.to_dict(SeqIO.parse(hit_handle, "fasta"))
96
97 logging.debug('SequenceProxy: hit file has %i entries' % len(hit_dict))
98
99 self.hit_records.append(hit_dict)
100
101 def get_read_record(self, read_id):
102
103 for record_dict in self.hit_records:
104 if read_id in record_dict:
105 record = record_dict[read_id]
106 return SeqRecord(record.seq, record.id, '', '')
107
108 return None
109
110 def get_reference_record(self, identifier):
111
112 if not self.reference_records:
113 logging.debug('SequenceProxy: indexing reference records')
114 self.reference_records = SeqIO.index(self.references_path, 'fasta')
115
116 try:
117 record = self.reference_records[identifier]
118 return SeqRecord(record.seq, record.id, '', '')
119 except KeyError:
120 return None
121
122 def get_transcript_record(self, identifier):
123
124 if not self.human_transcripts_path:
125 return None
126
127 if not self.human_transcript_records:
128 logging.debug('SequenceProxy: indexing human transcripts')
129 self.human_transcript_records = SeqIO.index(
130 self.human_transcripts_path, 'fasta')
131
132 try:
133 record = self.human_transcript_records[identifier]
134 return SeqRecord(record.seq, record.id, '', '')
135
136 except KeyError:
137 return None
138
139 def get_overlapping_human_transcript_identifiers(self, chromosome, start, end):
140
141 if not self.human_transcripts_path:
142 return set()
143
144 if not self.transcript_ranges:
145 self._set_transcript_ranges()
146
147 query_interval = HTSeq.GenomicInterval(chromosome, start, end, '.')
148 all_transcript_ids = set()
149 for step_interval, transcript_ids in self.transcript_ranges[query_interval].steps():
150 all_transcript_ids = all_transcript_ids.union(transcript_ids)
151
152 return all_transcript_ids
153
154 def _set_transcript_ranges(self):
155
156 logging.debug(
157 'SequenceProxy: Preparing transcript ranges, please wait...')
158
159 self.transcript_ranges = HTSeq.GenomicArrayOfSets(
160 'auto', stranded=False)
161
162 for record in SeqIO.parse(self.human_transcripts_path, 'fasta'):
163 transcript_id = record.id
164 for exon in record.description.strip().split(' ')[1].split('|'):
165 chromosome, start, end = exon.split(';')
166 start, end = sorted((int(start), int(end)))
167 if start == end:
168 continue
169 interval = HTSeq.GenomicInterval(chromosome, start, end, ".")
170 self.transcript_ranges[interval] += transcript_id
171
172
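# Illustrative sketch, not called anywhere in the pipeline: how the HTSeq
# structures used by SequenceProxy behave. A GenomicArrayOfSets maps genomic
# intervals to sets of values, and querying it with an interval yields
# "steps", i.e. sub-intervals with a constant set of overlapping values.
# All identifiers and coordinates below are made up.
def _example_transcript_lookup():

    ranges = HTSeq.GenomicArrayOfSets('auto', stranded=False)
    ranges[HTSeq.GenomicInterval('chr1', 100, 200, '.')] += 'ENST_A'
    ranges[HTSeq.GenomicInterval('chr1', 150, 300, '.')] += 'ENST_B'

    query = HTSeq.GenomicInterval('chr1', 180, 250, '.')
    hits = set()
    for step_interval, transcript_ids in ranges[query].steps():
        hits = hits.union(transcript_ids)

    return hits  # set(['ENST_A', 'ENST_B'])
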
173 class JalviewRunner:
174
175 def __init__(self, jalview_jar_dir, tmp_dir=None):
176
177 self.jalview_jar_dir = jalview_jar_dir
178
179 if tmp_dir:
180 self.tmp_dir = tempfile.mkdtemp(
181 dir=os.path.abspath(os.path.expandvars(tmp_dir)))
182 else:
183 self.tmp_dir = tempfile.mkdtemp()
184
185 self.config = ['#---JalviewX Properties File---',
186 '#Fri Nov 04 14:23:53 CET 2011',
187 'FIGURE_AUTOIDWIDTH=true',
188 'ID_ITALICS=false',
189 'SORT_ALIGNMENT=No sort',
190 'SHOW_IDENTITY=true',
191 'FONT_NAME=SansSerif',
192 'GAP_SYMBOL=.',
193 'SHOW_QUALITY=false',
194 'SHOW_GROUP_CONSERVATION=false',
195 'FONT_STYLE=plain',
196 'ANTI_ALIAS=false',
197 'SORT_BY_TREE=false',
198 'SHOW_CONSENSUS_HISTOGRAM=false',
199 'SHOW_OVERVIEW=false',
200 'DEFAULT_COLOUR=Nucleotide',
201 'SHOW_CONSENSUS_LOGO=false',
202 'SHOW_ANNOTATIONS=true',
203 'SHOW_UNCONSERVED=true',
204 'AUTO_CALC_CONSENSUS=true',
205 'PAD_GAPS=true',
206 'FONT_SIZE=10',
207 'RIGHT_ALIGN_IDS=false',
208 'WRAP_ALIGNMENT=false',
209 'FASTA_JVSUFFIX=false',
210 'PILEUP_JVSUFFIX=false',
211 'SHOW_JVSUFFIX=false']
212
213 def _make_configuration(self, sub_tmp_dir):
214
215 config_path = os.path.join(sub_tmp_dir, 'config.txt')
216 logging.debug(
217 'JalviewRunner: writing configuration file %s' % config_path)
218 with open(config_path, 'w') as config_file:
219 for c in self.config:
220 config_file.write(c + '\n')
221
222 return config_path
223
224 def _delete_tmp_dir(self):
225
226 try:
227 rmtree(self.tmp_dir)
228 except OSError:
229 pass
230
231 def _delete_sub_tmp_dir(self, sub_tmp_dir):
232
233 try:
234 rmtree(sub_tmp_dir)
235 except OSError:
236 pass
237
238 def run(self, fasta_path, png_output_path):
239
240 fasta_path = os.path.abspath(os.path.expandvars(fasta_path))
241
242 png_output_path = os.path.abspath(os.path.expandvars(png_output_path))
243
244 sub_tmp_dir = tempfile.mkdtemp(dir=self.tmp_dir)
245 config_path = self._make_configuration(sub_tmp_dir)
246
247 logging.debug(
248 'JalviewRunner: running Jalview on fasta %s with temp dir %s' %
249 (fasta_path, sub_tmp_dir))
250
251 cline = ['java', '-Djava.ext.dirs=%s/lib' % self.jalview_jar_dir,
252 '-cp %s/jalview.jar' % self.jalview_jar_dir, 'jalview.bin.Jalview',
253 '-open', fasta_path, '-nodisplay', '-png', png_output_path,
254 '-colour', 'nucleotide', '-props', config_path]
255
256 # Run Jalview process
257 java_process = subprocess.Popen(
258 ' '.join(cline), shell=True, stdout=PIPE, stderr=PIPE)
259
260 # Block until streams are closed by the process
261 stdout, stderr = java_process.communicate()
262
263 if stderr:
264 sys.stderr.write('JalviewRunner: ' + stderr.replace('\n', ' '))
265
266 self._delete_sub_tmp_dir(sub_tmp_dir)
267
268 return png_output_path
269
270 def __del__(self):
271
272 self._delete_tmp_dir()
273
274
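# For orientation: the command assembled by JalviewRunner.run() above
# corresponds to a shell invocation of roughly this shape (all paths are
# placeholders):
#
#   java -Djava.ext.dirs=/opt/jalview/lib -cp /opt/jalview/jalview.jar \
#       jalview.bin.Jalview -open region.fa -nodisplay -png region.png \
#       -colour nucleotide -props config.txt
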
275 class LastzRunner:
276
277 def __init__(self, lastz_path=None, word_length=7):
278
279 self.lastz_path = lastz_path
280 self.word_length = word_length
281
282 def align_to_bam_file(self, reference_fasta_path, query_fasta_path, output_bam_path, multiple=False, assert_record=None):
283
284 logging.debug('LastzRunner: running on reference %s and query %s' %
285 (reference_fasta_path, query_fasta_path))
286 output_sam_path = os.path.abspath(
287 os.path.expandvars(output_bam_path.replace('.bam', '.sam')))
288 output_bam_unsorted_path = os.path.abspath(
289 os.path.expandvars(output_bam_path + '.unsorted'))
290
291 logging.debug(
292 'LastzRunner: aligning with output in temporary sam file %s' %
293 output_sam_path)
294 with open(output_sam_path, 'w') as output_sam_handler:
295 for line in self._align(reference_fasta_path, query_fasta_path, multiple):
296 output_sam_handler.write(line)
297
298 logging.debug(
299 'LastzRunner: transforming sam into unsorted bam file %s' %
300 output_bam_unsorted_path)
301 input_sam_handler = pysam.Samfile(output_sam_path, "r")
302 output_bam_file = pysam.Samfile(
303 output_bam_unsorted_path, "wb", template=input_sam_handler)
304
305 logging.debug(
306 'LastzRunner: copying from sam file to bam file')
307 for s in input_sam_handler:
308 output_bam_file.write(s)
309 output_bam_file.close()
310
311 logging.debug('LastzRunner: sorting and indexing bam file %s' %
312 output_bam_path)
313 pysam.sort(output_bam_unsorted_path,
314 output_bam_path.replace('.bam', ''))
315
316 pysam.index(output_bam_path)
317
318 def align_to_samlines(self, reference_fasta_path, query_fasta_path, multiple=False):
319
320 logging.debug('LastzRunner: running on reference %s and query %s' %
321 (reference_fasta_path, query_fasta_path))
322
323 for line in self._align(reference_fasta_path, query_fasta_path, multiple):
324 yield line
325
326 def _align(self, reference_fasta_path, query_fasta_path, multiple):
327
328 reference_fasta_path = os.path.abspath(
329 os.path.expandvars(reference_fasta_path))
330 query_fasta_path = os.path.abspath(
331 os.path.expandvars(query_fasta_path))
332
333 lastz = [self.lastz_path or 'lastz']
334 if multiple:
335 cline = lastz + [
336 reference_fasta_path +
337 '[multiple]', query_fasta_path + '[nameparse=darkspace]',
338 '--format=sam', '--strand=both', '--gapped', '--chain',
339 '--nogfextend', '--seed=match%i' % self.word_length]
340
341 else:
342 cline = lastz + [
343 reference_fasta_path, query_fasta_path +
344 '[nameparse=darkspace]',
345 '--format=sam', '--strand=both', '--gapped',
346 '--chain', '--nogfextend', '--seed=match%i' % self.word_length]
347
348 # Run lastz process
349 lastz_process = subprocess.Popen(
350 ' '.join(cline), shell=True, stdout=PIPE, stderr=PIPE)
351
352 # Filter multiple read mappings (one to each strand) by only keeping
353 # the one with the most matches
354 alignments = []
355 for i, line in enumerate(iter(lastz_process.stdout.readline, '')):
356 if line.startswith('@'):
357 yield line
358 elif line.lower().startswith('read'):
359
360 fields = line.split('\t')
361 read_name = fields[0]
362 read_cigar = fields[5]
363 if not alignments:
364 alignments = [read_name, line, read_cigar]
365 continue
366 if alignments[0] == read_name:
367 this_mapping = sum(
368 int(c[:-1]) for c in re.findall('[0-9]+M', read_cigar))
369 other_mapping = sum(
370 int(c[:-1]) for c in re.findall('[0-9]+M', alignments[2]))
371 if other_mapping > this_mapping:
372 yield alignments[1]
373 else:
374 yield line
375 alignments = []
376 else:
377 # Flush the buffered alignment before buffering the new read
378 yield alignments[1]
379 alignments = [read_name, line, read_cigar]
380 else:
381 yield line
382 if alignments: yield alignments[1]  # flush any read still buffered at end of stream
383
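# Illustrative sketch, not called anywhere in the pipeline: the strand
# filter in LastzRunner._align() scores each SAM line by the number of
# matched bases in its CIGAR string. This helper reproduces that scoring
# for a single CIGAR; the example value is made up.
def _example_cigar_matched_bases(cigar='20M1I30M2D10M'):

    return sum(int(c[:-1]) for c in re.findall('[0-9]+M', cigar))  # 60
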
384 class ConsensusRunner:
385
386 def __init__(self, ambiguity_cutoff=0.3):
387
388 self.ambiguity_cutoff = ambiguity_cutoff
389 self.ambiguity = {
390 'GT': 'K', 'AC': 'M', 'AG': 'R', 'CT': 'Y', 'CG': 'S',
391 'AT': 'W', 'CGT': 'B', 'ACG': 'V', 'ACT': 'H', 'AGT': 'D',
392 'ACGT': 'N'}
393
394 self.alphabet = Gapped(IUPAC.unambiguous_dna, "-")
395
396 def run(self, input_bam_path, output_fasta_path):
397
398 samfile = pysam.Samfile(input_bam_path, "rb")
399 references = samfile.references
400
401 logging.debug(
402 'ConsensusRunner: writing consensus of bam file %s' % input_bam_path)
403
404 assert len(references) == 1
405
406 consensus = defaultdict(list)
407 conditions = {}
408
409 base_mapper = {'A': 0, 'C': 1, 'G': 2, 'T': 3, 'N': 4, '-': 5}
410 ambiguity = self.ambiguity
411
412 alphabet = Gapped(IUPAC.ambiguous_dna)
413 frequency_cutoff = self.ambiguity_cutoff
414
415 # Get all conditions of aligned reads
416 for alignedread in samfile.fetch(references[0]):
417
418 name = alignedread.qname
419 length = len(alignedread.seq)
420 condition = None
421
422 if name.lower().startswith('read'):
423 condition = ';'.join(name.split(';')[:2])
424 else:
425 condition = ';'.join(name.split(';')[:-1])
426
427 if condition in conditions:
428 if name not in conditions[condition][0]:
429 conditions[condition][0].add(name)
430 conditions[condition][1] += 1
431 conditions[condition][2] += length
432 else:
433 conditions[condition] = [set([name]), 1, length]
434
435 # Prepare count dictionaries
436 consensus = defaultdict(list)
437
438 for i, pileupcolumn in enumerate(samfile.pileup(references[0], max_depth=100000)):
439
440 if i % 1000 == 0:
441 logging.debug(
442 'ConsensusRunner: making consensus at position %i' % i)
443
444 counters = {}
445 for condition in conditions:
446 counters[condition] = [0, 0, 0, 0, 0, 0] # ACGTN-
447
448 for pileupread in pileupcolumn.pileups:
449
450 alignment = pileupread.alignment
451 name = alignment.qname
452
453 if pileupread.is_del:
454 base = '-'
455 else:
456 base = alignment.seq[pileupread.qpos].upper()
457
458 if name.lower().startswith('read'):
459 condition = ';'.join(name.split(';')[:2])
460 counters[condition][base_mapper[base]] += 1
461 else:
462 condition = ';'.join(name.split(';')[:-1])
463 counters[condition][base_mapper[base]] += 1
464
465 for condition, counts in counters.iteritems():
466 depth = float(sum(counts))
467 bases = []
468 a, c, g, t, n, gap = counts
469
470 if depth > 0:
471 if (n / depth) >= frequency_cutoff:
472 consensus[condition].append('N')
473 else:
474 if (a / depth) >= frequency_cutoff:
475 bases.append('A')
476 if (c / depth) >= frequency_cutoff:
477 bases.append('C')
478 if (g / depth) >= frequency_cutoff:
479 bases.append('G')
480 if (t / depth) >= frequency_cutoff:
481 bases.append('T')
482
483 if len(bases) > 1:
484 consensus[condition].append(
485 ambiguity[''.join(sorted(bases))])
486 elif len(bases) == 1:
487 consensus[condition].append(bases[0])
488 else:
489 consensus[condition].append('-')
490
491 # Split consensuses by type
492 reference_consensuses = []
493 read_consensuses = []
494 for condition, sequence_list in consensus.iteritems():
495 if condition.lower().startswith('read'):
496 read_consensuses.append((''.join(sequence_list), condition))
497 else:
498 reference_consensuses.append(
499 (''.join(sequence_list), condition))
500
501 reference_consensuses.sort(reverse=True)
502 read_consensuses.sort(reverse=True)
503
504 # Get start and end of reads (remove fully gapped positions)
505 start = None
506 end = None
507 for read_consensus, condition in read_consensuses:
508
509 # Forward direction
510 for i, char in enumerate(read_consensus):
511 if char != '-':
512 if start is None:
513 start = i
514 else:
515 start = min(start, i)
516 break
517
518 # Reverse direction
519 for i, char in enumerate(read_consensus[::-1]):
520 if char != '-':
521 if end is None:
522 end = i
523 else:
524 end = min(end, i)
525 break
526
527 if end == 0 or end is None:
528 end = None
529 else:
530 end = -end
531
532 # Filter all records by start and end of reads
533 reference_consensuses =\
534 [(r[start:end], c) for r, c in reference_consensuses]
535
536 read_consensuses =\
537 [(r[start:end], c) for r, c in read_consensuses]
538
539 # Write consensus records to file
540 with open(output_fasta_path, 'w', buffering=100 * 1024 * 1024) as output_handler:
541 for consensus, condition in read_consensuses + reference_consensuses:
542 stats = ';'.join(map(str, conditions[condition][1:]))
543 record = SeqRecord(
544 Seq(consensus, alphabet=alphabet), id=condition + ';' + stats, name='', description='')
545 SeqIO.write([record], output_handler, "fasta")
546
547
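# Illustrative sketch, not called anywhere in the pipeline: how
# ConsensusRunner collapses one alignment column into a consensus symbol.
# Every base whose frequency reaches the ambiguity cutoff contributes, and
# two or more contributing bases yield an IUPAC ambiguity code. The counts
# below are made up.
def _example_consensus_symbol(a=6, c=4, g=0, t=0, cutoff=0.3):

    ambiguity = {
        'GT': 'K', 'AC': 'M', 'AG': 'R', 'CT': 'Y', 'CG': 'S',
        'AT': 'W', 'CGT': 'B', 'ACG': 'V', 'ACT': 'H', 'AGT': 'D',
        'ACGT': 'N'}

    depth = float(a + c + g + t)
    bases = [base for base, count in zip('ACGT', (a, c, g, t))
             if depth > 0 and (count / depth) >= cutoff]

    if len(bases) > 1:
        return ambiguity[''.join(sorted(bases))]  # 'M' for the defaults
    return bases[0] if bases else '-'
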
548 class Group:
549
550 def __init__(self, family_name, qualified_family_name, sequence_proxy, tmp_dir=None):
551
552 self.family_name = family_name
553 self.qualified_family_name = qualified_family_name
554 self.sequence_proxy = sequence_proxy
555
556 self.regions = []
557 self.fasta_path = None
558
559 if tmp_dir:
560 self.tmp_dir = tempfile.mkdtemp(
561 dir=os.path.abspath(os.path.expandvars(tmp_dir)))
562 else:
563 self.tmp_dir = tempfile.mkdtemp()
564
565 def add_region(self, read_id, pathogen_mapping_locations, human_mapping_locations):
566
567 region = Region(self.sequence_proxy, self.tmp_dir)
568
569 region.add_read(read_id)
570
571 for mapping_location in list(pathogen_mapping_locations) + list(human_mapping_locations):
572 reference = ';'.join(mapping_location[:-2])
573 region.add_reference(
574 reference, int(mapping_location[-2]), int(mapping_location[-1]))
575
576 self.regions.append(region)
577
578 def write_outputs(self, output_dir, lastz_runner, consensus_builder, jalview_runner):
579
580 if not self.regions:
581 logging.debug('Group %s: no regions for family' % self.family_name)
582 return
583
584 made_output_dir = False
585 for i, region in enumerate(self.regions):
586
587 if not made_output_dir:
588 output_dir = os.path.abspath(
589 os.path.expandvars(os.path.join(output_dir, self.qualified_family_name)))
590 logging.debug('Group %s: writing output files to %s' %
591 (self.family_name, output_dir))
592 try:
593 os.makedirs(output_dir)
594 except OSError:
595 pass
596 made_output_dir = True
597
598 logging.debug('Group %s: writing region number %i files to %s' %
599 (self.family_name, i + 1, output_dir))
600
601 unaligned_path = os.path.join(
602 output_dir, 'region_%i_unaligned.fa.bzip2' % (i + 1))
603 with open(region.get_unaligned_fasta_path()) as unaligned_fasta:
604 with bz2.BZ2File(unaligned_path, 'wb') as compressed_unaligned:
605 for line in unaligned_fasta:
606 compressed_unaligned.write(line)
607
608 bam_path = os.path.join(
609 output_dir, 'region_%i_alignment.bam' % (i + 1))
610 temporary_alignment_bam_path = region.get_alignment_bam_path(
611 lastz_runner, assert_record='Read')
612 shutil.copy(temporary_alignment_bam_path, bam_path)
613
614 shutil.copy(
615 temporary_alignment_bam_path + '.bai', bam_path + '.bai')
616
617 consensus_path = os.path.join(
618 output_dir, 'region_%i_consensus.fa' % (i + 1))
619 temporary_consensus_path = region.get_consensus_fasta_path(
620 consensus_builder, temporary_alignment_bam_path)
621 shutil.copy(temporary_consensus_path, consensus_path)
622
623 if jalview_runner:
624
625 jalview_path = os.path.join(
626 output_dir, 'region_%i_consensus.png' % (i + 1))
627 temporary_jalview_path = region.get_consensus_figure_path(
628 jalview_runner, temporary_consensus_path)
629 shutil.copy(temporary_jalview_path, jalview_path)
630
631 def _delete_temporary_dir(self):
632
633 self.fasta_path = None
634 try:
635 rmtree(self.tmp_dir)
636 except OSError:
637 pass
638
639 def filter_regions(self, min_region_length=50, min_read_number=5):
640
641 logging.debug('Group %s: filtering %i candidate regions' %
642 (self.family_name, len(self.regions)))
643
644 filtered = [r for r in self.regions if len(
645 r) >= min_read_number and r.get_longest_reference_length() >= min_region_length]
646
647 ordered = sorted(filtered, reverse=True)
648
649 if ordered:
650 length = ordered[0].length
651 else:
652 length = 0
653
654 logging.debug(
655 'Group %s: %i candidate regions remained after filtering, the longest is %i bp long' %
656 (self.family_name, len(ordered), length))
657
658 self.regions = ordered
659
660 return ordered
661
662 def merge_regions(self, max_gap_length):
663 """ Merges candidate regions into homologous regions. """
664
665 logging.debug('Group %s: merging %i candidate regions' %
666 (self.family_name, len(self.regions)))
667
668 if len(self.regions) > 1:
669
670 potentially_mergable = self.regions
671 not_mergable = []
672
673 while len(potentially_mergable) > 1:
674
675 merged = False
676 current = potentially_mergable[0]
677 compared_to = potentially_mergable[1:]
678
679 for region in compared_to:
680 if region.overlaps(current, max_gap_length):
681 region.merge(current)
682 region.clean_references(max_gap_length)
683 #logging.debug('Group %s: merged a region. %i potentially mergable candidate regions remaining' % (self.family_name, len(potentially_mergable)))
684 potentially_mergable = compared_to
685 merged = True
686 break
687
688 if not merged:
689 not_mergable.append(current)
690 potentially_mergable = compared_to
691 #logging.debug('Group %s: not merged a region. %i potentially mergable candidate regions remaining' % (self.family_name, len(potentially_mergable)))
692
693 results = not_mergable + potentially_mergable
694
695 logging.debug('Group %s: merged into %i regions' %
696 (self.family_name, len(results)))
697
698 self.regions = results
699
700 else:
701 logging.debug(
702 'Group %s: found only 1 region, no merging necessary' % self.family_name)
703
704 def add_transcripts_to_regions(self):
705
706 logging.debug('Group %s: enriching %i regions with human transcript information'
707 % (self.family_name, len(self.regions)))
708
709 for region in self.regions:
710
711 added_transcripts = 0
712
713 for identifier, locations in region.references.iteritems():
714 chromosome = identifier.split(';')[-1]
715
716 for start, end in locations:
717 transcript_identifiers = self.sequence_proxy.get_overlapping_human_transcript_identifiers(
718 chromosome, start, end)
719
720 for transcript_identifier in transcript_identifiers:
721 region.add_transcript(transcript_identifier)
722 added_transcripts += 1
723
724 logging.debug('Group %s: added %i transcripts to a region' %
725 (self.family_name, added_transcripts))
726
727 def __str__(self):
728
729 s = "Group %s with %i regions" % (self.family_name, len(self.regions))
730
731 return s
732
733
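# Illustrative sketch, not called anywhere in the pipeline: the merging
# done by Group.merge_regions() and Region.clean_references() boils down to
# collapsing intervals on a common reference whose gap does not exceed
# max_gap_length. The coordinates below are made up.
def _example_merge_intervals(intervals=((10, 50), (60, 90), (200, 250)),
                             max_gap_length=25):

    merged = []
    for start, end in sorted(intervals):
        if merged and start - merged[-1][1] <= max_gap_length:
            merged[-1][1] = max(merged[-1][1], end)
        else:
            merged.append([start, end])

    return [tuple(i) for i in merged]  # [(10, 90), (200, 250)]
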
734 class GroupGenerator:
735
736 """ Produces Groups from Hitfiles. Reads are assigned to groups based on
737 taxonomic families. Transcripts overlapping human read mappings as well
738 as genome sequences of pathogens the reads map to are added to the
739 Groups.
740 """
741
742 def __init__(self, sequence_proxy, reference_database_filter=None, pathogen_family_filter=None, tmp_dir=None):
743
744 self.sequence_proxy = sequence_proxy
745
746 self.tmp_path = tmp_dir
747 self.reference_database_filter = reference_database_filter
748 self.pathogen_family_filter = pathogen_family_filter
749
750 self.groups = {}
751
752 if tmp_dir:
753 self.tmp_dir = tempfile.mkdtemp(
754 dir=os.path.abspath(os.path.expandvars(tmp_dir)))
755 else:
756 self.tmp_dir = tempfile.mkdtemp()
757
758 def get_groups(self, min_read_number=5):
759
760 logging.debug(
761 'GroupGenerator: generating groups from %i hit files, please wait...' %
762 len(self.sequence_proxy.hit_records))
763
764 for hit_index in self.sequence_proxy.hit_records:
765 for read_id, record in hit_index.iteritems():
766
767 # Extract mapping locations to human DNA, human transcript, and
768 # pathogen genomes
769 mapping_locations = record.description.strip().split(
770 ' ')[1].split('|')
771
772 pathogen_families = defaultdict(set)
773 human_mapping_locations = set()
774
775 for mapping_location in mapping_locations:
776 fields = mapping_location.split(';')
777
778 # Convert start and end to integers
779 fields[-2] = int(fields[-2]) # start
780 fields[-1] = int(fields[-1]) # end
781
782 # Add mapping locations to human cDNA or human DNA
783 if fields[2] == 'Homo_sapiens':
784 human_mapping_locations.add(tuple(fields))
785
786 # Add mapping locations to non-human references
787 else:
788 reference_database, family = fields[:2]
789 if self.reference_database_filter and reference_database\
790 not in self.reference_database_filter:
791 continue
792 if self.pathogen_family_filter and family\
793 not in self.pathogen_family_filter:
794 continue
795 pathogen_families[
796 (reference_database, family)].add(tuple(fields))
797
798 if not pathogen_families:
799 continue
800
801 # Process the hits to different pathogen families
802 for (reference_database, family), pathogen_locations in pathogen_families.iteritems():
803
804 qualified_family_name = reference_database + '_' + family
805
806 # Obtain existing group or make new group
807 if qualified_family_name in self.groups:
808 group = self.groups[qualified_family_name]
809 else:
810 group = Group(family, qualified_family_name,
811 self.sequence_proxy, self.tmp_dir)
812 self.groups[qualified_family_name] = group
813
814 group.add_region(
815 read_id, pathogen_locations, human_mapping_locations)
816
817 # Exclude groups that have collected too few reads
818 for qualified_family_name, group in self.groups.items():
819 if len(group.regions) < min_read_number:
820 del self.groups[qualified_family_name]
821 else:
822 logging.debug(
823 'GroupGenerator: keeping group for family %s' % qualified_family_name)
824
825 return self.groups
826
827
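# Illustrative sketch, not called anywhere in the pipeline:
# GroupGenerator.get_groups() appears to expect each hit record description
# to carry the mapping locations as its second whitespace-separated token,
# with locations separated by '|' and fields by ';' (reference database and
# family first, organism third, start and end last). The header below is a
# fabricated example of that layout.
def _example_parse_hit_description():

    description = ('Read1 '
                   'refseq;Herpesviridae;Human_herpesvirus_4;NC_007605;1200;1260|'
                   'ensembl;Hominidae;Homo_sapiens;chr1;5000;5060')

    parsed = []
    for location in description.strip().split(' ')[1].split('|'):
        fields = location.split(';')
        fields[-2], fields[-1] = int(fields[-2]), int(fields[-1])
        parsed.append(tuple(fields))

    return parsed
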
828 class Region:
829
830 def __init__(self, sequence_proxy, tmp_dir=None):
831
832 self.sequence_proxy = sequence_proxy
833
834 self.references = defaultdict(set)
835 self.reads = set()
836 self.transcripts = set()
837 self.length = None
838
839 self.sorted_reference_positions = None
840
841 self.master_tmp = tmp_dir
842 self.tmp_dir = None
843
844 self.unaligned_fasta_path = None
845 self.alignment_sam_path = None
846 self.consensus_fasta_path = None
847
848 def add_reference(self, name, start, end):
849
850 assert start < end
851 self.length = None
852 self.longest_reference_id = None
853 self.references[name].add((start, end))
854
855 def add_read(self, name):
856
857 self.reads.add(name)
858
859 def add_transcript(self, name):
860
861 self.transcripts.add(name)
862
863 def get_tmp_path(self):
864
865 if self.tmp_dir:
866 return self.tmp_dir
867
868 if self.master_tmp:
869 self.tmp_dir = tempfile.mkdtemp(
870 dir=os.path.abspath(os.path.expandvars(self.master_tmp)))
871 else:
872 self.tmp_dir = tempfile.mkdtemp()
873
874 return self.tmp_dir
875
876 def _distance(self, range1, range2):
877
878 first, second = sorted((range1, range2))
879
880 if first[0] <= first[1] < second[0]:
881 return (second[0] - first[1])
882 else:
883 return 0
884
885 def _delete_temporary_dir(self):
886
887 self.unaligned_fasta_path = None
888 if self.tmp_dir:
889 try:
890 rmtree(self.tmp_dir)
891 except OSError:
892 pass
893
894 def _get_unaligned_fasta_path(self):
895
896 output_path = os.path.join(self.get_tmp_path(), 'unaligned_region.fa')
897 logging.debug('Region: writing unaligned fasta to %s' % output_path)
898
899 assert self.reads
900
901 with open(output_path, 'w', buffering=100 * 1024 * 1024) as output_handler:
902
903 # Cut and write references
904 human_positions = []
905 pathogen_positions = []
906 for length, identifier, start, end in self.get_sorted_reference_positions():
907 if ';Homo_sapiens;' in identifier:
908 human_positions.append((identifier, start, end))
909 else:
910 pathogen_positions.append((identifier, start, end))
911
912 for identifier, start, end in pathogen_positions + human_positions:
913 record = self.sequence_proxy.get_reference_record(identifier)
914 if record:
915 record = SeqRecord(record.seq, record.id, '', '')
916 #record.id += ';%i-%i' % (start, end)
917 SeqIO.write(
918 [record[start - 1:end]], output_handler, "fasta")
919 else:
920 pass
921 #logging.debug('Region: could not retrieve reference %s' % identifier)
922
923 # Write full-length transcripts
924 for identifier in self.transcripts:
925 record = self.sequence_proxy.get_transcript_record(identifier)
926 if record:
927 record = SeqRecord(record.seq, record.id, '', '')
928 SeqIO.write([record], output_handler, "fasta")
929 else:
930 logging.debug(
931 'Region: could not retrieve reference %s' % identifier)
932
933 # Write reads
934 for read_id in sorted(self.reads):
935 record = self.sequence_proxy.get_read_record(read_id)
936 if record:
937 # Transform the read ID so that LASTZ can work with it (it
938 # creates nicknames based on certain characters)
939 identifier = record.id.replace(
940 ':', '_').replace('|', '_').replace('/', '_')
941 record = SeqRecord(record.seq, identifier, '', '')
942 SeqIO.write([record], output_handler, "fasta")
943 else:
944 logging.debug(
945 'Region: could not retrieve read %s' % read_id)
946
947 self.unaligned_fasta_path = output_path
948
949 return self.unaligned_fasta_path
950
951 def get_unaligned_fasta_path(self):
952
953 if not self.unaligned_fasta_path:
954 self._get_unaligned_fasta_path()
955
956 return self.unaligned_fasta_path
957
958 def _align_fastas(self, aligner, assert_record=None):
959
960 # Write reference fasta
961 queries_path = self.get_unaligned_fasta_path()
962 first_record = SeqIO.parse(open(queries_path, "rU"), "fasta").next()
963 reference_path = os.path.join(self.get_tmp_path(), 'reference.fa')
964
965 logging.debug(
966 'Region: writing longest reference to %s' % reference_path)
967
968 SeqIO.write([first_record], reference_path, "fasta")
969
970 # Do alignment
971 bam_path = os.path.join(self.get_tmp_path(), 'aligned.bam')
972 logging.debug(
973 'Region: starting alignment using queries %s to sam path %s' %
974 (queries_path, bam_path))
975
976 aligner.align_to_bam_file(
977 reference_path, queries_path, bam_path, assert_record=assert_record)
978
979 return bam_path
980
981 def _build_alignment_consensus(self, consensus_builder, alignment_bam_path):
982
983 consensus_path = os.path.join(self.get_tmp_path(), 'consensus.fa')
984 logging.debug('Region: writing consensus to %s' % consensus_path)
985
986 consensus_builder.run(alignment_bam_path, consensus_path)
987
988 return consensus_path
989
990 def get_alignment_bam_path(self, aligner, assert_record=None):
991
992 return self._align_fastas(aligner, assert_record)
993
994 def get_consensus_fasta_path(self, consensus_builder, alignment_bam_path):
995
996 return self._build_alignment_consensus(consensus_builder, alignment_bam_path)
997
998 def get_consensus_figure_path(self, jalview_runner, consensus_fasta_path):
999
1000 png_path = os.path.join(self.get_tmp_path(), 'consensus_figure.png')
1001
1002 return jalview_runner.run(consensus_fasta_path, png_path)
1003
1004 def overlaps(self, other, max_gap_length):
1005
1006 overlap = (set(self.references) & set(other.references))
1007
1008 # Overlap is solely based on non-human references
1009 overlap = [ref for ref in overlap if ';Homo_sapiens;' not in ref]
1010
1011 if not overlap:
1012 return False
1013
1014 for reference in overlap:
1015 reflist = self.references[reference]
1016 for start, end in reflist:
1017 for other_start, other_end in other.references[reference]:
1018 distance = self._distance((start, end),
1019 (other_start, other_end))
1020 if distance <= max_gap_length:
1021 return True
1022 return False
1023
1024 def merge(self, other):
1025
1026 for reference, reflist in other.references.iteritems():
1027 for (start, end) in reflist:
1028 self.add_reference(reference, start, end)
1029
1030 for read in other.reads:
1031 self.add_read(read)
1032
1033 def clean_references(self, max_gap_length):
1034
1035 for reference, reflist in self.references.iteritems():
1036
1037 start_list = sorted(reflist)
1038 saved = list(start_list[0])
1039 result = set()
1040
1041 for item in start_list:
1042 if self._distance(saved, item) <= max_gap_length:
1043 saved[1] = max(saved[1], item[1])
1044 else:
1045 result.add(tuple(saved))
1046 saved[0] = item[0]
1047 saved[1] = item[1]
1048 result.add(tuple(saved))
1049 self.references[reference] = result
1050
1051 def to_sorted_record_ids(self):
1052
1053 references = []
1054 for name, locations in self.references.iteritems():
1055 for start, end in locations:
1056 references.append((end - start, name))
1057
1058 reads = defaultdict(list)
1059 for name, locations in self.references.iteritems():
1060 condition = name.split(';')[1]
1061 for start, end in locations:
1062 reads[condition].append((end - start, name))
1063
1064 all_record_ids = sorted(references, reverse=True)
1065 for condition, the_reads in reads.iteritems():
1066 all_record_ids += sorted(the_reads, reverse=True)
1067
1068 for length, record_id in all_record_ids:
1069 yield record_id
1070
1071 def __len__(self):
1072
1073 return len(self.reads)
1074
1075 def __cmp__(self, other):
1076
1077 if self.get_longest_reference_length() <\
1078 other.get_longest_reference_length():
1079 return -1
1080 elif self.get_longest_reference_length() ==\
1081 other.get_longest_reference_length():
1082 return 0
1083 return 1
1084
1085 def get_longest_reference_length(self):
1086
1087 if self.length is not None:
1088 return self.length
1089
1090 positions = self.get_sorted_reference_positions()
1091 if positions:
1092 self.length = positions[0][0]
1093 else:
1094 self.length = 0
1095
1096 return self.length
1097
1098 def get_sorted_reference_positions(self):
1099
1100 if self.sorted_reference_positions is not None:
1101 return self.sorted_reference_positions
1102
1103 lengths = []
1104 for reference, the_set in self.references.iteritems():
1105 for start, end in the_set:
1106 lengths.append([end - start, reference, start, end])
1107
1108 self.sorted_reference_positions = sorted(lengths, reverse=True)
1109
1110 return self.sorted_reference_positions
1111
1112 def __str__(self):
1113 return '<Region> of length %10i with %10i reads and %5i references' %\
1114 (self.get_longest_reference_length(),
1115 len(self.reads), len(self.references))
1116
1117
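# Illustrative sketch, not called anywhere in the pipeline: the gap measure
# behind Region.overlaps() and Region.clean_references(). Overlapping or
# touching ranges have distance 0; otherwise the distance is the number of
# bases between the earlier range's end and the later one's start. The
# example ranges are made up.
def _example_range_distance(range1=(10, 50), range2=(60, 90)):

    first, second = sorted((range1, range2))
    if first[0] <= first[1] < second[0]:
        return second[0] - first[1]  # 10 for the example ranges
    return 0
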
1118 class RegionStatistics:
1119
1120 def __init__(self, input_dir):
1121
1122 self.input_dir = input_dir
1123 self.stats = []
1124
1125 def _add_sequence(self, qualified_name, region_id,
1126 longest_human_reference, longest_pathogen_reference, record):
1127
1128 state = 'pathogen'
1129 if longest_human_reference:
1130 state = 'ambiguous'
1131
1132 reference_type = qualified_name.split('_')[0]
1133 family_name = '_'.join(qualified_name.split('_')[1:])
1134
1135 read_type, sample_id, number_reads, basepairs = record.id.split(';')
1136
1137 coverage = int(basepairs) / float(len(longest_pathogen_reference))
1138
1139 self.stats.append(
1140 [reference_type, family_name, region_id, sample_id, state,
1141 number_reads, str(len(longest_pathogen_reference)), basepairs, '%.5f' % coverage])
1142
1143 def run(self, output_file):
1144
1145 for qualified_family_name in os.listdir(self.input_dir):
1146
1147 family_dir = os.path.join(self.input_dir, qualified_family_name)
1148
1149 if not os.path.isdir(family_dir):
1150 continue
1151
1152 consensus_regions = glob.glob(os.path.join(
1153 family_dir, 'region_*_consensus.fa'))
1154
1155 for consensus_region in consensus_regions:
1156
1157 base_name = os.path.basename(consensus_region)
1158
1159 region_id = base_name.split('_')[1]
1160
1161 reads = []
1162 longest_human_reference = None
1163 longest_pathogen_reference = None
1164
1165 for consensus_record in SeqIO.parse(consensus_region, 'fasta'):
1166 if consensus_record.id.lower().startswith('read'):
1167 # ends with read number; cumulative bases
1168 reads.append(consensus_record)
1169 elif ';Homo_sapiens;' in consensus_record.id:
1170 if longest_human_reference is None or\
1171 len(consensus_record) > len(longest_human_reference):
1172 longest_human_reference = consensus_record
1173 else:
1174 if longest_pathogen_reference is None or\
1175 len(consensus_record) > len(longest_pathogen_reference):
1176 longest_pathogen_reference = consensus_record
1177
1178 for read in reads:
1179 self._add_sequence(qualified_family_name, region_id,
1180 longest_human_reference, longest_pathogen_reference, read)
1181
1182 with open(os.path.abspath(os.path.expandvars(output_file)), 'w') as output_handler:
1183 output_handler.write(
1184 '\t'.join(['reference_type', 'family_name', 'region_id',
1185 'sample_id', 'taxonomic_origin', 'number_reads',
1186 'region_length', 'basepairs', 'coverage']) + '\n')
1187 for entries in sorted(self.stats):
1188 line = '\t'.join(entries) + '\n'
1189 output_handler.write(line)
1190
1191
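# Illustrative sketch, not called anywhere in the pipeline: the coverage
# reported by RegionStatistics is the cumulative number of read basepairs
# divided by the length of the longest pathogen consensus, e.g. 1250 read
# bases on a 500 bp region give a coverage of 2.5.
def _example_region_coverage(basepairs=1250, region_length=500):

    return int(basepairs) / float(region_length)  # 2.5
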
1192 @CLI.subcommand("regions")
1193 class RegionRunner(cli.Application):
1194
1195 """ Derive homologous groups and regions from hit files """
1196
1197 hit_files = cli.SwitchAttr(
1198 ['-v', '--virana_hits'], str, list=True, mandatory=True,
1199 help="Add hit file for analysis. May be supplied multiple times.")
1200
1201 lastz_path = cli.SwitchAttr(['-z', '--lastz_path'], str, mandatory=False,
1202 help="Path to lastz executable",
1203 default='')
1204
1205 jalview_jar_dir = cli.SwitchAttr(
1206 ['-j', '--jalview_jar_dir'], str, mandatory=False,
1207 help="Directory containing the jalview.jar file",
1208 default=None)
1209
1210 references_path = cli.SwitchAttr(
1211 ['-r', '--references'], str, mandatory=True,
1212 help="Input fasta file containing genomic references of pathogens and possibly of human chromosomes as well (the latter is experimental)",
1213 default='')
1214
1215 cdna_path = cli.SwitchAttr(['-c', '--cdna'], str, mandatory=False,
1216 help="Input fasta file containing human cDNA records.",
1217 default=None)
1218
1219 output_dir = cli.SwitchAttr(['-o', '--output_dir'], str, mandatory=True,
1220 help="Output directory that will be filled with subdirectories corresponding to homologous regions. The directory will be generated if it is not existing.",
1221 default='')
1222
1223 tmp_dir = cli.SwitchAttr(['-t', '--tmp_dir'], str, mandatory=False,
1224 help="Directory in which temorary files are stored. Temporary files will be stored in subdirectories of the provided directory and will be deleted after completion.",
1225 default=None)
1226
1227 stat_path = cli.SwitchAttr(['-s', '--region_stats'], str, mandatory=False,
1228 help="Path to output statistics tab-delimited text file.",
1229 default='')
1230
1231 reference_database_filter = cli.SwitchAttr(
1232 ['-b', '--reference_database_filter'], str, list=True, mandatory=False,
1233 help="Specifies which kind of reference databases are consisdered when extracting hits from hit files. May be specified multiple times. If any are specified, all reference databases not specified are filtered out. By default, this parameter is empty.",
1234 default=[])
1235
1236 pathogen_family_filter = cli.SwitchAttr(
1237 ['-f', '--pathogen_family_filter'], str, list=True, mandatory=False,
1238 help="Specifies which kind of pathogen families are consisdered when extracting hits from hit files. May be specified multiple times. If any are specified, all families not specified are filtered out. By default, this parameter is empty.",
1239 default=[])
1240
1241 min_read_number = cli.SwitchAttr(
1242 ['-m', '--min_read_number'], cli.Range(1, 1000), mandatory=False,
1243 help="Minimum number of reads that are required to be present in homologous region. Regions with fewer reads will be omitted from the results.",
1244 default=5)
1245
1246 max_gap_length = cli.SwitchAttr(
1247 ['-l', '--max_gap_length'], cli.Range(1, 1000), mandatory=False,
1248 help="Maximum number bases that two candidate homologous regions are distant from each other with regard to their positions on a common reference sequence in order for being eligable for merging.",
1249 default=25)
1250
1251 min_region_length = cli.SwitchAttr(
1252 ['-x', '--min_region_length'], cli.Range(1, 1000), mandatory=False,
1253 help="Minimum number bases of the longest reference sequence of each homologous region that is generated. Shoer regions will be omitted from the results.",
1254 default=50)
1255
1256 word_length = cli.SwitchAttr(
1257 ['-w', '--word_length'], cli.Range(1, 21), mandatory=False,
1258 help="Word length of the lastz alignment process. Shorter word lengths allow more sensitive but less specific alignments.",
1259 default=7)
1260
1261 ambiguity_cutoff = cli.SwitchAttr(
1262 ['-a', '--ambiguity_cutoff'], float, mandatory=False,
1263 help="Ratio of variant positions within a column of the homologous region sequence alignment so that the corresponding consensus sequence at this positions show a ambiguous base instead of the majority base.",
1264 default=0.3)
1265
1266 debug = cli.Flag(
1267 ["-d", "--debug"], help="Enable debug information")
1268
1269 def main(self):
1270
1271 if self.debug:
1272 logging.getLogger().setLevel(logging.DEBUG)
1273
1274 # Make sequence proxy for managing hit files, references, and cdna
1275 proxy = SequenceProxy(self.references_path, self.cdna_path)
1276 for hit_file in self.hit_files:
1277 proxy.add_hit_file(hit_file)
1278
1279 # Generate homologous groups
1280 generator = GroupGenerator(proxy,
1281 reference_database_filter=self.reference_database_filter,
1282 pathogen_family_filter=self.pathogen_family_filter,
1283 tmp_dir=self.tmp_dir)
1284 groups = generator.get_groups(min_read_number=self.min_read_number)
1285
1286 # Prepare analysis modules ('runners') for postprocessing homologous
1287 # regions
1288 consensus = ConsensusRunner(ambiguity_cutoff=self.ambiguity_cutoff)
1289 lastz = LastzRunner(word_length=self.word_length)
1290
1291 if self.jalview_jar_dir:
1292 jalview = JalviewRunner(
1293 jalview_jar_dir=self.jalview_jar_dir, tmp_dir=self.tmp_dir)
1294 else:
1295 jalview = None
1296
1297 # Make homologous regions within each homologous group
1298 for name, group in groups.iteritems():
1299 group.merge_regions(max_gap_length=self.max_gap_length)
1300 group.filter_regions(
1301 min_region_length=self.min_region_length, min_read_number=self.min_read_number)
1302
1303 if self.cdna_path:
1304 group.add_transcripts_to_regions()
1305
1306 group.write_outputs(self.output_dir, lastz, consensus, jalview)
1307 group._delete_temporary_dir()
1308
1309 # Run statistics on all homologous regions
1310 if self.stat_path:
1311 statistics = RegionStatistics(self.output_dir)
1312 statistics.run(self.stat_path)
1313
1314 if __name__ == "__main__":
1315 CLI.run()
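
# Example invocation (all file names are placeholders):
#
#   python vhom.py regions -v sample1_hits.fa.bz2 -v sample2_hits.fa.bz2 \
#       -r pathogen_references.fa -c human_cdna.fa -o regions_out \
#       -s region_stats.tsv -t /tmp/vhom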