0
|
1 #!/usr/bin/env python
|
|
2
|
|
3 import sys
|
|
4 import os
|
|
5 import logging
|
|
6 import tempfile
|
|
7 import shutil
|
|
8
|
|
9 import subprocess
|
|
10 import bz2
|
|
11 import re
|
|
12 import glob
|
|
13
|
|
14 from shutil import rmtree
|
|
15 from subprocess import PIPE
|
|
16 from collections import defaultdict, Counter
|
|
17
|
|
18 try:
|
|
19 import pysam
|
|
20 except ImportError:
|
|
21 message = 'This script requires the pysam python package\n'
|
|
22 sys.stderr.write(message)
|
|
23 sys.exit(1)
|
|
24
|
|
25 try:
|
|
26 from plumbum import cli
|
|
27 except ImportError:
|
|
28 message = 'This script requires the plumbum python package\n'
|
|
29 sys.stderr.write(message)
|
|
30 sys.exit(1)
|
|
31
|
|
32 try:
|
|
33 from Bio.SeqRecord import SeqRecord
|
|
34 from Bio import SeqIO
|
|
35 from Bio.Seq import Seq
|
|
36 from Bio.Alphabet import IUPAC, Gapped
|
|
37 except ImportError:
|
|
38 message = 'This script requires the BioPython python package\n'
|
|
39 sys.stderr.write(message)
|
|
40 sys.exit(1)
|
|
41
|
|
42 try:
|
|
43 import HTSeq
|
|
44 except ImportError:
|
|
45 message = 'This script requires the HTSeq python package\n'
|
|
46 sys.stderr.write(message)
|
|
47 sys.exit(1)
|
|
48
|
|
49 logging.basicConfig(level=logging.INFO, format='%(message)s')
|
|
50
|
|
51
|
|
52 class CLI(cli.Application):
|
|
53
|
|
54 """ Homologous region analysis of hit files"""
|
|
55 PROGNAME = "vmap"
|
|
56 VERSION = "1.0.0"
|
|
57 DESCRIPTION = """"""
|
|
58 USAGE = """"""
|
|
59
|
|
60 def main(self, *args):
|
|
61
|
|
62 if args:
|
|
63 print("Unknown command %r" % (args[0]))
|
|
64 print self.USAGE
|
|
65 return 1
|
|
66
|
|
67 if not self.nested_command:
|
|
68 print("No command given")
|
|
69 print self.USAGE
|
|
70 return 1
|
|
71
|
|
72 class SequenceProxy:
|
|
73
|
|
74 def __init__(self, references_path, human_transcripts_path=None):
|
|
75
|
|
76 self.references_path = references_path
|
|
77 self.human_transcripts_path = human_transcripts_path
|
|
78
|
|
79 self.human_transcript_records = None
|
|
80 self.reference_records = None
|
|
81 self.hit_records = []
|
|
82
|
|
83 self.transcript_ranges = None
|
|
84
|
|
85 def add_hit_file(self, hit_file_path):
|
|
86
|
|
87 logging.debug('SequenceProxy: adding new hit file %s' % hit_file_path)
|
|
88
|
|
89 hit_handle = bz2.BZ2File(hit_file_path, 'rb')
|
|
90 try:
|
|
91 hit_handle.read(10)
|
|
92 except IOError:
|
|
93 hit_handle = open(hit_file_path, 'rb')
|
|
94
|
|
95 hit_dict = SeqIO.to_dict(SeqIO.parse(hit_handle, "fasta"))
|
|
96
|
|
97 logging.debug('SequenceProxy: hit file has %i entries' % len(hit_dict))
|
|
98
|
|
99 self.hit_records.append(hit_dict)
|
|
100
|
|
101 def get_read_record(self, read_id):
|
|
102
|
|
103 for record_dict in self.hit_records:
|
|
104 if read_id in record_dict:
|
|
105 record = record_dict[read_id]
|
|
106 return SeqRecord(record.seq, record.id, '', '')
|
|
107
|
|
108 return None
|
|
109
|
|
110 def get_reference_record(self, identifier):
|
|
111
|
|
112 if not self.reference_records:
|
|
113 logging.debug('SequenceProxy: indexing reference records')
|
|
114 self.reference_records = SeqIO.index(self.references_path, 'fasta')
|
|
115
|
|
116 try:
|
|
117 record = self.reference_records[identifier]
|
|
118 return SeqRecord(record.seq, record.id, '', '')
|
|
119 except KeyError:
|
|
120 return None
|
|
121
|
|
122 def get_transcript_record(self, identifier):
|
|
123
|
|
124 if not self.human_transcripts_path:
|
|
125 return None
|
|
126
|
|
127 if not self.human_transcript_records:
|
|
128 logging.debug('SequenceProxy: indexing human transcripts')
|
|
129 self.human_transcript_records = SeqIO.index(
|
|
130 self.human_transcripts_path, 'fasta')
|
|
131
|
|
132 try:
|
|
133 record = self.human_transcript_records[identifier]
|
|
134 return SeqRecord(record.seq, record.id, '', '')
|
|
135
|
|
136 except KeyError:
|
|
137 return None
|
|
138
|
|
139 def get_overlapping_human_transcript_identifiers(self, chromosome, start, end):
|
|
140
|
|
141 if not self.human_transcripts_path:
|
|
142 return set()
|
|
143
|
|
144 if not self.transcript_ranges:
|
|
145 self._set_transcript_ranges()
|
|
146
|
|
147 interval = HTSeq.GenomicInterval(chromosome, start, end, '.')
|
|
148 all_transcript_ids = set()
|
|
149 for interval, transcript_ids in self.transcript_ranges[interval].steps():
|
|
150 all_transcript_ids = all_transcript_ids.union(transcript_ids)
|
|
151
|
|
152 return all_transcript_ids
|
|
153
|
|
154 def _set_transcript_ranges(self):
|
|
155
|
|
156 logging.debug(
|
|
157 'SequenceProxy: Preparing transcript ranges, please wait...')
|
|
158
|
|
159 self.transcript_ranges = HTSeq.GenomicArrayOfSets(
|
|
160 'auto', stranded=False)
|
|
161
|
|
162 for record in SeqIO.parse(self.human_transcripts_path, 'fasta'):
|
|
163 transcript_id = record.id
|
|
164 for exon in record.description.strip().split(' ')[1].split('|'):
|
|
165 chromosome, start, end = exon.split(';')
|
|
166 start, end = sorted((int(start), int(end)))
|
|
167 if start == end:
|
|
168 continue
|
|
169 interval = HTSeq.GenomicInterval(chromosome, start, end, ".")
|
|
170 self.transcript_ranges[interval] += transcript_id
|
|
171
|
|
172
|
|
173 class JalviewRunner:
|
|
174
|
|
175 def __init__(self, jalview_jar_dir, tmp_dir=None):
|
|
176
|
|
177 self.jalview_jar_dir = jalview_jar_dir
|
|
178
|
|
179 if tmp_dir:
|
|
180 self.tmp_dir = tempfile.mkdtemp(
|
|
181 dir=os.path.abspath(os.path.expandvars(tmp_dir)))
|
|
182 else:
|
|
183 self.tmp_dir = tempfile.mkdtemp()
|
|
184
|
|
185 self.config = ['#---JalviewX Properties File---',
|
|
186 '#Fri Nov 04 14:23:53 CET 2011',
|
|
187 'FIGURE_AUTOIDWIDTH=true',
|
|
188 'ID_ITALICS=false',
|
|
189 'SORT_ALIGNMENT=No sort',
|
|
190 'SHOW_IDENTITY=true',
|
|
191 'FONT_NAME=SansSerif',
|
|
192 'GAP_SYMBOL=.',
|
|
193 'SHOW_QUALITY=false',
|
|
194 'SHOW_GROUP_CONSERVATION=false',
|
|
195 'FONT_STYLE=plain',
|
|
196 'ANTI_ALIAS=false',
|
|
197 'SORT_BY_TREE=false',
|
|
198 'SHOW_CONSENSUS_HISTOGRAM=false',
|
|
199 'SHOW_OVERVIEW=false',
|
|
200 'DEFAULT_COLOUR=Nucleotide',
|
|
201 'SHOW_CONSENSUS_LOGO=false',
|
|
202 'SHOW_ANNOTATIONS=true',
|
|
203 'SHOW_UNCONSERVED=true',
|
|
204 'AUTO_CALC_CONSENSUS=true',
|
|
205 'PAD_GAPS=true',
|
|
206 'FONT_SIZE=10',
|
|
207 'RIGHT_ALIGN_IDS=false',
|
|
208 'WRAP_ALIGNMENT=false',
|
|
209 'FASTA_JVSUFFIX=false',
|
|
210 'PILEUP_JVSUFFIX=false',
|
|
211 'SHOW_JVSUFFIX=false']
|
|
212
|
|
213 def _make_configuration(self, sub_tmp_dir):
|
|
214
|
|
215 config_path = os.path.join(sub_tmp_dir, 'config.txt')
|
|
216 logging.debug(
|
|
217 'JalviewRunner: writing configuration file %s' % config_path)
|
|
218 with open(config_path, 'w') as config_file:
|
|
219 for c in self.config:
|
|
220 config_file.write(c + '\n')
|
|
221
|
|
222 return config_path
|
|
223
|
|
224 def _delete_tmp_dir(self):
|
|
225
|
|
226 try:
|
|
227 rmtree(self.tmp_dir)
|
|
228 except OSError:
|
|
229 pass
|
|
230
|
|
231 def _delete_sub_tmp_dir(self, sub_tmp_dir):
|
|
232
|
|
233 try:
|
|
234 rmtree(sub_tmp_dir)
|
|
235 except OSError:
|
|
236 pass
|
|
237
|
|
238 def run(self, fasta_path, png_output_path):
|
|
239
|
|
240 fasta_path = os.path.abspath(os.path.expandvars(fasta_path))
|
|
241
|
|
242 png_output_path = os.path.abspath(os.path.expandvars(png_output_path))
|
|
243
|
|
244 sub_tmp_dir = tempfile.mkdtemp(dir=self.tmp_dir)
|
|
245 config_path = self._make_configuration(sub_tmp_dir)
|
|
246
|
|
247 logging.debug(
|
|
248 'JalviewRunner: running Jalview on fasta %s with temp dir %s' %
|
|
249 (fasta_path, sub_tmp_dir))
|
|
250
|
|
251 cline = ['java', '-Djava.ext.dirs=%s/lib' % self.jalview_jar_dir,
|
|
252 '-cp %s/jalview.jar' % self.jalview_jar_dir, 'jalview.bin.Jalview',
|
|
253 '-open', fasta_path, '-nodisplay', '-png', png_output_path,
|
|
254 '-colour', 'nucleotide', '-props', config_path]
|
|
255
|
|
256 # Run sibelia process
|
|
257 java_process = subprocess.Popen(
|
|
258 ' '.join(cline), shell=True, stdout=PIPE, stderr=PIPE)
|
|
259
|
|
260 # Block until streams are closed by the process
|
|
261 stdout, stderr = java_process.communicate()
|
|
262
|
|
263 if stderr:
|
|
264 sys.stderr.write('JalviewRunner: ' + stderr.replace('\n', ' '))
|
|
265
|
|
266 self._delete_sub_tmp_dir(sub_tmp_dir)
|
|
267
|
|
268 return png_output_path
|
|
269
|
|
270 def __del__(self):
|
|
271
|
|
272 self._delete_tmp_dir()
|
|
273
|
|
274
|
|
275 class LastzRunner:
|
|
276
|
|
277 def __init__(self, lastz_path=None, word_length=7):
|
|
278
|
|
279 self.lastz_path = lastz_path
|
|
280 self.word_length = word_length
|
|
281
|
|
282 def align_to_bam_file(self, reference_fasta_path, query_fasta_path, output_bam_path, multiple=False, assert_record=None):
|
|
283
|
|
284 logging.debug('LastzRunner: running on reference %s and query %s' %
|
|
285 (reference_fasta_path, query_fasta_path))
|
|
286 output_sam_path = os.path.abspath(
|
|
287 os.path.expandvars(output_bam_path.replace('.bam', '.sam')))
|
|
288 output_bam_unsorted_path = os.path.abspath(
|
|
289 os.path.expandvars(output_bam_path + '.unsorted'))
|
|
290
|
|
291 logging.debug(
|
|
292 'LastzRunner: aligning with output in temporary sam file %s' %
|
|
293 output_sam_path)
|
|
294 with open(output_sam_path, 'w') as output_sam_handler:
|
|
295 for line in self._align(reference_fasta_path, query_fasta_path, multiple):
|
|
296 output_sam_handler.write(line)
|
|
297
|
|
298 logging.debug(
|
|
299 'LastzRunner: transforming sam into unsorted bam file %s' %
|
|
300 output_bam_unsorted_path)
|
|
301 input_sam_handler = pysam.Samfile(output_sam_path, "r")
|
|
302 output_bam_file = pysam.Samfile(
|
|
303 output_bam_unsorted_path, "wb", template=input_sam_handler)
|
|
304
|
|
305 logging.debug(
|
|
306 'LastzRunner: copying from sam file to bam file')
|
|
307 for s in input_sam_handler:
|
|
308 output_bam_file.write(s)
|
|
309 output_bam_file.close()
|
|
310
|
|
311 logging.debug('LastzRunner: sorting and indexing bam file %s' %
|
|
312 output_bam_path)
|
|
313 pysam.sort(output_bam_unsorted_path,
|
|
314 output_bam_path.replace('.bam', ''))
|
|
315
|
|
316 pysam.index(output_bam_path)
|
|
317
|
|
318 def align_to_samlines(self, reference_fasta_path, query_fasta_path, multiple=False):
|
|
319
|
|
320 logging.debug('LastzRunner: running on reference %s and query %s' %
|
|
321 (reference_fasta_path, query_fasta_path))
|
|
322
|
|
323 for line in self._align(reference_fasta_path, query_fasta_path, multiple):
|
|
324 yield line
|
|
325
|
|
326 def _align(self, reference_fasta_path, query_fasta_path, multiple):
|
|
327
|
|
328 reference_fasta_path = os.path.abspath(
|
|
329 os.path.expandvars(reference_fasta_path))
|
|
330 query_fasta_path = os.path.abspath(
|
|
331 os.path.expandvars(query_fasta_path))
|
|
332
|
|
333 lastz = [self.lastz_path and self.lastz_path or 'lastz']
|
|
334 if multiple:
|
|
335 cline = lastz + [
|
|
336 reference_fasta_path +
|
|
337 '[multiple]', query_fasta_path + '[nameparse=darkspace]',
|
|
338 '--format=sam', '--strand=both', '--gapped', '--chain',
|
|
339 '--nogfextend', '--seed=match%i' % self.word_length]
|
|
340
|
|
341 else:
|
|
342 cline = lastz + [
|
|
343 reference_fasta_path, query_fasta_path +
|
|
344 '[nameparse=darkspace]',
|
|
345 '--format=sam', '--strand=both', '--gapped',
|
|
346 '--chain', '--nogfextend', '--seed=match%i' % self.word_length]
|
|
347
|
|
348 # Run lastz process
|
|
349 lastz_process = subprocess.Popen(
|
|
350 ' '.join(cline), shell=True, stdout=PIPE, stderr=PIPE)
|
|
351
|
|
352 # Filter multiple read mappings (one to each strand) by only keeping
|
|
353 # the one with the most matches
|
|
354 alignments = []
|
|
355 for i, line in enumerate(iter(lastz_process.stdout.readline, '')):
|
|
356 if line.startswith('@'):
|
|
357 yield line
|
|
358 elif line.lower().startswith('read'):
|
|
359
|
|
360 fields = line.split('\t')
|
|
361 read_name = fields[0]
|
|
362 read_cigar = fields[5]
|
|
363 if not alignments:
|
|
364 alignments = [read_name, line, read_cigar]
|
|
365 continue
|
|
366 else:
|
|
367 if alignments[0] == read_name:
|
|
368 this_mapping = sum(
|
|
369 [int(c[:-1]) for c in re.findall('[0-9]*M', read_cigar)])
|
|
370 other_mapping = sum(
|
|
371 [int(c[:-1]) for c in re.findall('[0-9]*M', alignments[2])])
|
|
372
|
|
373 if other_mapping > this_mapping:
|
|
374 yield alignments[1]
|
|
375 else:
|
|
376 yield line
|
|
377 else:
|
|
378 yield line
|
|
379 alignments = []
|
|
380 else:
|
|
381 yield line
|
|
382
|
|
383
|
|
384 class ConsensusRunner:
|
|
385
|
|
386 def __init__(self, ambiguity_cutoff=0.3):
|
|
387
|
|
388 self.ambiguity_cutoff = ambiguity_cutoff
|
|
389 self.ambiguity = {
|
|
390 'GT': 'K', 'AC': 'M', 'AG': 'R', 'CT': 'Y', 'CG': 'S',
|
|
391 'AT': 'W', 'CGT': 'B', 'ACG': 'V', 'ACT': 'H', 'AGT': 'D',
|
|
392 'ACGT': 'N'}
|
|
393
|
|
394 self.alphabet = Gapped(IUPAC.unambiguous_dna, "-")
|
|
395
|
|
396 def run(self, input_sam_path, output_fasta_path):
|
|
397
|
|
398 samfile = pysam.Samfile(input_sam_path, "rb")
|
|
399 references = samfile.references
|
|
400
|
|
401 logging.debug(
|
|
402 'ConsensusRunner: writing consensus of bam file %s' % input_sam_path)
|
|
403
|
|
404 assert len(references) == 1
|
|
405
|
|
406 consensus = defaultdict(list)
|
|
407 conditions = {}
|
|
408
|
|
409 base_mapper = {'A': 0, 'C': 1, 'G': 2, 'T': 3, 'N': 4, '-': 5}
|
|
410 ambiguity = self.ambiguity
|
|
411
|
|
412 alphabet = Gapped(IUPAC.ambiguous_dna)
|
|
413 frequency_cutoff = self.ambiguity_cutoff
|
|
414
|
|
415 # Get all conditions of aligned reads
|
|
416 for alignedread in samfile.fetch(references[0]):
|
|
417
|
|
418 name = alignedread.qname
|
|
419 length = len(alignedread.seq)
|
|
420 condition = None
|
|
421
|
|
422 if name.lower().startswith('read'):
|
|
423 condition = ';'.join(name.split(';')[:2])
|
|
424 else:
|
|
425 condition = ';'.join(name.split(';')[:-1])
|
|
426
|
|
427 if condition in conditions:
|
|
428 if name not in conditions[condition][0]:
|
|
429 conditions[condition][0].add(name)
|
|
430 conditions[condition][1] += 1
|
|
431 conditions[condition][2] += length
|
|
432 else:
|
|
433 conditions[condition] = [set([name]), 1, length]
|
|
434
|
|
435 # Prepare count dictionaries
|
|
436 consensus = defaultdict(list)
|
|
437
|
|
438 for i, pileupcolumn in enumerate(samfile.pileup(references[0], max_depth=100000)):
|
|
439
|
|
440 if i % 1000 == 0:
|
|
441 logging.debug(
|
|
442 'ConsensusRunner: making consensus at position %i' % i)
|
|
443
|
|
444 counters = {}
|
|
445 for condition in conditions:
|
|
446 counters[condition] = [0, 0, 0, 0, 0, 0] # ACGTN-
|
|
447
|
|
448 for pileupread in pileupcolumn.pileups:
|
|
449
|
|
450 alignment = pileupread.alignment
|
|
451 name = alignment.qname
|
|
452
|
|
453 if pileupread.is_del:
|
|
454 base = '-'
|
|
455 else:
|
|
456 base = alignment.seq[pileupread.qpos].upper()
|
|
457
|
|
458 if name.lower().startswith('read'):
|
|
459 condition = ';'.join(name.split(';')[:2])
|
|
460 counters[condition][base_mapper[base]] += 1
|
|
461 else:
|
|
462 condition = ';'.join(name.split(';')[:-1])
|
|
463 counters[condition][base_mapper[base]] += 1
|
|
464
|
|
465 for condition, counts in counters.iteritems():
|
|
466 depth = float(sum(counts))
|
|
467 bases = []
|
|
468 a, c, g, t, n, gap = counts
|
|
469
|
|
470 if depth > 0:
|
|
471 if (n / depth) >= frequency_cutoff:
|
|
472 consensus[condition].append('N')
|
|
473 else:
|
|
474 if (a / depth) >= frequency_cutoff:
|
|
475 bases.append('A')
|
|
476 if (c / depth) >= frequency_cutoff:
|
|
477 bases.append('C')
|
|
478 if (g / depth) >= frequency_cutoff:
|
|
479 bases.append('G')
|
|
480 if (t / depth) >= frequency_cutoff:
|
|
481 bases.append('T')
|
|
482
|
|
483 if len(bases) > 1:
|
|
484 consensus[condition].append(
|
|
485 ambiguity[''.join(sorted(bases))])
|
|
486 elif len(bases) == 1:
|
|
487 consensus[condition].append(bases[0])
|
|
488 else:
|
|
489 consensus[condition].append('-')
|
|
490
|
|
491 # Split consensuses by type
|
|
492 reference_consensuses = []
|
|
493 read_consensuses = []
|
|
494 for condition, sequence_list in consensus.iteritems():
|
|
495 if condition.lower().startswith('read'):
|
|
496 read_consensuses.append((''.join(sequence_list), condition))
|
|
497 else:
|
|
498 reference_consensuses.append(
|
|
499 (''.join(sequence_list), condition))
|
|
500
|
|
501 reference_consensuses.sort(reverse=True)
|
|
502 read_consensuses.sort(reverse=True)
|
|
503
|
|
504 # Get start and end of reads (remove fully gapped positions)
|
|
505 start = None
|
|
506 end = None
|
|
507 for read_consensus, condition in read_consensuses:
|
|
508
|
|
509 # Forward direction
|
|
510 for i, char in enumerate(read_consensus):
|
|
511 if char != '-':
|
|
512 if start is None:
|
|
513 start = i
|
|
514 else:
|
|
515 start = min(start, i)
|
|
516 break
|
|
517
|
|
518 # Reverse direction
|
|
519 for i, char in enumerate(read_consensus[::-1]):
|
|
520 if char != '-':
|
|
521 if end is None:
|
|
522 end = i
|
|
523 else:
|
|
524 end = min(end, i)
|
|
525 break
|
|
526
|
|
527 if end == 0 or end is None:
|
|
528 end = None
|
|
529 else:
|
|
530 end = -end
|
|
531
|
|
532 # Filter all records by start and end of reads
|
|
533 reference_consensuses =\
|
|
534 [(r[start:end], c) for r, c in reference_consensuses]
|
|
535
|
|
536 read_consensuses =\
|
|
537 [(r[start:end], c) for r, c in read_consensuses]
|
|
538
|
|
539 # Write consensus records to file
|
|
540 with open(output_fasta_path, 'w', buffering=100 * 1024 * 1024) as output_handler:
|
|
541 for consensus, condition in read_consensuses + reference_consensuses:
|
|
542 stats = ';'.join(map(str, conditions[condition][1:]))
|
|
543 record = SeqRecord(
|
|
544 Seq(consensus, alphabet=alphabet), id=condition + ';' + stats, name='', description='')
|
|
545 SeqIO.write([record], output_handler, "fasta")
|
|
546
|
|
547
|
|
548 class Group:
|
|
549
|
|
550 def __init__(self, family_name, qualified_family_name, sequence_proxy, tmp_dir=None):
|
|
551
|
|
552 self.family_name = family_name
|
|
553 self.qualified_family_name = qualified_family_name
|
|
554 self.sequence_proxy = sequence_proxy
|
|
555
|
|
556 self.regions = []
|
|
557 self.fasta_path = None
|
|
558
|
|
559 if tmp_dir:
|
|
560 self.tmp_dir = tempfile.mkdtemp(
|
|
561 dir=os.path.abspath(os.path.expandvars(tmp_dir)))
|
|
562 else:
|
|
563 self.tmp_dir = tempfile.mkdtemp()
|
|
564
|
|
565 def add_region(self, read_id, pathogen_mapping_locations, human_mapping_locations):
|
|
566
|
|
567 region = Region(self.sequence_proxy, self.tmp_dir)
|
|
568
|
|
569 region.add_read(read_id)
|
|
570
|
|
571 for mapping_location in list(pathogen_mapping_locations) + list(human_mapping_locations):
|
|
572 reference = ';'.join(mapping_location[:-2])
|
|
573 region.add_reference(
|
|
574 reference, int(mapping_location[-2]), int(mapping_location[-1]))
|
|
575
|
|
576 self.regions.append(region)
|
|
577
|
|
578 def write_outputs(self, output_dir, lastz_runner, consensus_builder, jalview_runner):
|
|
579
|
|
580 if not self.regions:
|
|
581 logging.debug('Group %s: no regions for family' % self.family_name)
|
|
582 return
|
|
583
|
|
584 made_output_dir = False
|
|
585 for i, region in enumerate(self.regions):
|
|
586
|
|
587 if not made_output_dir:
|
|
588 output_dir = os.path.abspath(
|
|
589 os.path.expandvars(os.path.join(output_dir, self.qualified_family_name)))
|
|
590 logging.debug('Group %s: writing output files to %s' %
|
|
591 (self.family_name, output_dir))
|
|
592 try:
|
|
593 os.makedirs(output_dir)
|
|
594 except OSError:
|
|
595 pass
|
|
596 made_output_dir = True
|
|
597
|
|
598 logging.debug('Group %s: writing region number %i files to %s' %
|
|
599 (self.family_name, i + 1, output_dir))
|
|
600
|
|
601 unaligned_path = os.path.join(
|
|
602 output_dir, 'region_%i_unaligned.fa.bzip2' % (i + 1))
|
|
603 compressed_unaligned = bz2.BZ2File(unaligned_path, 'wb')
|
|
604 with open(region.get_unaligned_fasta_path()) as unaligned_fasta:
|
|
605 for line in unaligned_fasta:
|
|
606 compressed_unaligned.write(line)
|
|
607
|
|
608 bam_path = os.path.join(
|
|
609 output_dir, 'region_%i_alignment.bam' % (i + 1))
|
|
610 temporary_alignment_bam_path = region.get_alignment_bam_path(
|
|
611 lastz_runner, assert_record='Read')
|
|
612 shutil.copy(temporary_alignment_bam_path, bam_path)
|
|
613
|
|
614 shutil.copy(
|
|
615 temporary_alignment_bam_path + '.bai', bam_path + '.bai')
|
|
616
|
|
617 consensus_path = os.path.join(
|
|
618 output_dir, 'region_%i_consensus.fa' % (i + 1))
|
|
619 temporary_consensus_path = region.get_consensus_fasta_path(
|
|
620 consensus_builder, temporary_alignment_bam_path)
|
|
621 shutil.copy(temporary_consensus_path, consensus_path)
|
|
622
|
|
623 if jalview_runner:
|
|
624
|
|
625 jalview_path = os.path.join(
|
|
626 output_dir, 'region_%i_consensus.png' % (i + 1))
|
|
627 temporary_jalview_path = region.get_consensus_figure_path(
|
|
628 jalview_runner, temporary_consensus_path)
|
|
629 shutil.copy(temporary_jalview_path, jalview_path)
|
|
630
|
|
631 def _delete_temporary_dir(self):
|
|
632
|
|
633 self.fasta_path = None
|
|
634 try:
|
|
635 rmtree(self.tmp_dir)
|
|
636 except OSError:
|
|
637 pass
|
|
638
|
|
639 def filter_regions(self, min_region_length=50, min_read_number=5):
|
|
640
|
|
641 logging.debug('Group %s: filtering %i candidate regions' %
|
|
642 (self.family_name, len(self.regions)))
|
|
643
|
|
644 filtered = [r for r in self.regions if len(
|
|
645 r) >= min_read_number and r.get_longest_reference_length() >= min_region_length]
|
|
646
|
|
647 ordered = sorted(filtered, reverse=True)
|
|
648
|
|
649 if ordered:
|
|
650 length = ordered[0].length
|
|
651 else:
|
|
652 length = 0
|
|
653
|
|
654 logging.debug(
|
|
655 'Group %s: %i candidate regions remained after filtering, the longest is %i bp long' %
|
|
656 (self.family_name, len(ordered), length))
|
|
657
|
|
658 self.regions = ordered
|
|
659
|
|
660 return ordered
|
|
661
|
|
662 def merge_regions(self, max_gap_length):
|
|
663 """ Merges candidate regions into homologous regions. """
|
|
664
|
|
665 logging.debug('Group %s: merging %i candidate regions' %
|
|
666 (self.family_name, len(self.regions)))
|
|
667
|
|
668 if len(self.regions) > 1:
|
|
669
|
|
670 potentially_mergable = self.regions
|
|
671 not_mergable = []
|
|
672
|
|
673 while len(potentially_mergable) > 1:
|
|
674
|
|
675 merged = False
|
|
676 current = potentially_mergable[0]
|
|
677 compared_to = potentially_mergable[1:]
|
|
678
|
|
679 for region in compared_to:
|
|
680 if region.overlaps(current, max_gap_length):
|
|
681 region.merge(current)
|
|
682 region.clean_references(max_gap_length)
|
|
683 #logging.debug('Group %s: merged a region. %i potentially mergable candidate regions remaining' % (self.family_name, len(potentially_mergable)))
|
|
684 potentially_mergable = compared_to
|
|
685 merged = True
|
|
686 break
|
|
687
|
|
688 if not merged:
|
|
689 not_mergable.append(current)
|
|
690 potentially_mergable = compared_to
|
|
691 #logging.debug('Group %s: not merged a region. %i potentially mergable candidate regions remaining' % (self.family_name, len(potentially_mergable)))
|
|
692
|
|
693 results = not_mergable + potentially_mergable
|
|
694
|
|
695 logging.debug('Group %s: merged into %i regions' %
|
|
696 (self.family_name, len(results)))
|
|
697
|
|
698 self.regions = results
|
|
699
|
|
700 else:
|
|
701 logging.debug(
|
|
702 'Group %s: found only 1 region, no mergin necessary' % self.family_name)
|
|
703
|
|
704 def add_transcripts_to_regions(self):
|
|
705
|
|
706 logging.debug('Group %s: enriching %i regions with human transcript information'
|
|
707 % (self.family_name, len(self.regions)))
|
|
708
|
|
709 for region in self.regions:
|
|
710
|
|
711 added_transcripts = 0
|
|
712
|
|
713 for identifier, locations in region.references.iteritems():
|
|
714 chromosome = identifier.split(';')[-1]
|
|
715
|
|
716 for start, end in locations:
|
|
717 transcript_identifiers = self.sequence_proxy.get_overlapping_human_transcript_identifiers(
|
|
718 chromosome, start, end)
|
|
719
|
|
720 for transcript_identifier in transcript_identifiers:
|
|
721 region.add_transcript(transcript_identifier)
|
|
722 added_transcripts += 1
|
|
723
|
|
724 logging.debug('Group %s: added %i transcripts to a region' %
|
|
725 (self.family_name, added_transcripts))
|
|
726
|
|
727 def __str__(self):
|
|
728
|
|
729 s = "Group %s with %i regions" % (self.family_name, len(self.regions))
|
|
730
|
|
731 return s
|
|
732
|
|
733
|
|
734 class GroupGenerator:
|
|
735
|
|
736 """ Produces Groups from Hitfiles. Reads are assigned to groups based on
|
|
737 taxonomic families. Transcripts overlapping human read mappings as well
|
|
738 as genome sequences of pathogens the reads map to are added to the
|
|
739 Groups.
|
|
740 """
|
|
741
|
|
742 def __init__(self, sequence_proxy, reference_database_filter=None, pathogen_family_filter=None, tmp_dir=None):
|
|
743
|
|
744 self.sequence_proxy = sequence_proxy
|
|
745
|
|
746 self.tmp_path = tmp_dir
|
|
747 self.reference_database_filter = reference_database_filter
|
|
748 self.pathogen_family_filter = pathogen_family_filter
|
|
749
|
|
750 self.groups = {}
|
|
751
|
|
752 if tmp_dir:
|
|
753 self.tmp_dir = tempfile.mkdtemp(
|
|
754 dir=os.path.abspath(os.path.expandvars(tmp_dir)))
|
|
755 else:
|
|
756 self.tmp_dir = tempfile.mkdtemp()
|
|
757
|
|
758 def get_groups(self, min_read_number=5):
|
|
759
|
|
760 logging.debug(
|
|
761 'GroupGenerator: generating groups from %i hit files, please wait...' %
|
|
762 len(self.sequence_proxy.hit_records))
|
|
763
|
|
764 for hit_index in self.sequence_proxy.hit_records:
|
|
765 for read_id, record in hit_index.iteritems():
|
|
766
|
|
767 # Extract mapping locations to human DNA, human transcript, and
|
|
768 # pathogen genomes
|
|
769 mapping_locations = record.description.strip().split(
|
|
770 ' ')[1].split('|')
|
|
771
|
|
772 pathogen_families = defaultdict(set)
|
|
773 human_mapping_locations = set()
|
|
774
|
|
775 for mapping_location in mapping_locations:
|
|
776 fields = mapping_location.split(';')
|
|
777
|
|
778 # Convert start and end to integers
|
|
779 fields[-2] = int(fields[-2]) # start
|
|
780 fields[-1] = int(fields[-1]) # end
|
|
781
|
|
782 # Add mapping locations to human cDNA or human DNA
|
|
783 if fields[2] == 'Homo_sapiens':
|
|
784 human_mapping_locations.add(tuple(fields))
|
|
785
|
|
786 # Add mapping locations to non-human references
|
|
787 else:
|
|
788 reference_database, family = fields[:2]
|
|
789 if self.reference_database_filter and reference_database\
|
|
790 not in self.reference_database_filter:
|
|
791 continue
|
|
792 if self.pathogen_family_filter and family\
|
|
793 not in self.pathogen_family_filter:
|
|
794 continue
|
|
795 pathogen_families[
|
|
796 (reference_database, family)].add(tuple(fields))
|
|
797
|
|
798 if not pathogen_families:
|
|
799 continue
|
|
800
|
|
801 # Process the hits to different pathogen families
|
|
802 for (reference_database, family), pathogen_locations in pathogen_families.iteritems():
|
|
803
|
|
804 qualified_family_name = reference_database + '_' + family
|
|
805
|
|
806 # Obtain existing group or make new group
|
|
807 if qualified_family_name in self.groups:
|
|
808 group = self.groups[qualified_family_name]
|
|
809 else:
|
|
810 group = Group(family, qualified_family_name,
|
|
811 self.sequence_proxy, self.tmp_dir)
|
|
812 self.groups[qualified_family_name] = group
|
|
813
|
|
814 group.add_region(
|
|
815 read_id, pathogen_locations, human_mapping_locations)
|
|
816
|
|
817 # Exclude groups that have collected to few reads
|
|
818 for qualified_family_name, group in self.groups.items():
|
|
819 if len(group.regions) < min_read_number:
|
|
820 del self.groups[qualified_family_name]
|
|
821 else:
|
|
822 logging.debug(
|
|
823 'GroupGenerator: made new group for family %s' % qualified_family_name)
|
|
824
|
|
825 return self.groups
|
|
826
|
|
827
|
|
828 class Region:
|
|
829
|
|
830 def __init__(self, sequence_proxy, tmp_dir=None):
|
|
831
|
|
832 self.sequence_proxy = sequence_proxy
|
|
833
|
|
834 self.references = defaultdict(set)
|
|
835 self.reads = set()
|
|
836 self.transcripts = set()
|
|
837 self.length = None
|
|
838
|
|
839 self.sorted_reference_positions = None
|
|
840
|
|
841 self.master_tmp = tmp_dir
|
|
842 self.tmp_dir = None
|
|
843
|
|
844 self.unaligned_fasta_path = None
|
|
845 self.alignment_sam_path = None
|
|
846 self.consensus_fasta_path = None
|
|
847
|
|
848 def add_reference(self, name, start, end):
|
|
849
|
|
850 assert start < end
|
|
851 self.length = None
|
|
852 self.longest_reference_id = None
|
|
853 self.references[name].add((start, end))
|
|
854
|
|
855 def add_read(self, name):
|
|
856
|
|
857 self.reads.add(name)
|
|
858
|
|
859 def add_transcript(self, name):
|
|
860
|
|
861 self.transcripts.add(name)
|
|
862
|
|
863 def get_tmp_path(self):
|
|
864
|
|
865 if self.tmp_dir:
|
|
866 return self.tmp_dir
|
|
867
|
|
868 if self.master_tmp:
|
|
869 self.tmp_dir = tempfile.mkdtemp(
|
|
870 dir=os.path.abspath(os.path.expandvars(self.master_tmp)))
|
|
871 else:
|
|
872 self.tmp_dir = tempfile.mkdtemp()
|
|
873
|
|
874 return self.tmp_dir
|
|
875
|
|
876 def _distance(self, range1, range2):
|
|
877
|
|
878 first, second = sorted((range1, range2))
|
|
879
|
|
880 if first[0] <= first[1] < second[0]:
|
|
881 return (second[0] - first[1])
|
|
882 else:
|
|
883 return 0
|
|
884
|
|
885 def _delete_temporary_dir(self):
|
|
886
|
|
887 self.unaligned_fasta_path = None
|
|
888 if self.tmp_dir:
|
|
889 try:
|
|
890 rmtree(self.tmp_dir)
|
|
891 except OSError:
|
|
892 pass
|
|
893
|
|
894 def _get_unaligned_fasta_path(self):
|
|
895
|
|
896 output_path = os.path.join(self.get_tmp_path(), 'unaligned_region.fa')
|
|
897 logging.debug('Region: writing unaligned fasta to %s' % output_path)
|
|
898
|
|
899 assert self.reads
|
|
900
|
|
901 with open(output_path, 'w', buffering=100 * 1024 * 1024) as output_handler:
|
|
902
|
|
903 # Cut and write references
|
|
904 human_positions = []
|
|
905 pathogen_positions = []
|
|
906 for length, identifier, start, end in self.get_sorted_reference_positions():
|
|
907 if ';Homo_sapiens;' in identifier:
|
|
908 human_positions.append((identifier, start, end))
|
|
909 else:
|
|
910 pathogen_positions.append((identifier, start, end))
|
|
911
|
|
912 for identifier, start, end in pathogen_positions + human_positions:
|
|
913 record = self.sequence_proxy.get_reference_record(identifier)
|
|
914 if record:
|
|
915 record = SeqRecord(record.seq, record.id, '', '')
|
|
916 #record.id += ';%i-%i' % (start, end)
|
|
917 SeqIO.write(
|
|
918 [record[start - 1:end]], output_handler, "fasta")
|
|
919 else:
|
|
920 pass
|
|
921 #logging.debug('Region: could not retrieve reference %s' % identifier)
|
|
922
|
|
923 # Write full-length transcripts
|
|
924 for identifier in self.transcripts:
|
|
925 record = self.sequence_proxy.get_transcript_record(identifier)
|
|
926 if record:
|
|
927 record = SeqRecord(record.seq, record.id, '', '')
|
|
928 SeqIO.write([record], output_handler, "fasta")
|
|
929 else:
|
|
930 logging.debug(
|
|
931 'Region: could not retrieve reference %s' % identifier)
|
|
932
|
|
933 # Write reads
|
|
934 for read_id in sorted(self.reads):
|
|
935 record = self.sequence_proxy.get_read_record(read_id)
|
|
936 if record:
|
|
937 # Transform read ID so that LASTZ can work with it (it
|
|
938 # makes nicknames based on some characters )
|
|
939 identifier = record.id.replace(
|
|
940 ':', '_').replace('|', '_').replace('/', '_')
|
|
941 record = SeqRecord(record.seq, identifier, '', '')
|
|
942 SeqIO.write([record], output_handler, "fasta")
|
|
943 else:
|
|
944 logging.debug(
|
|
945 'Region: could not retrieve read %s' % read_id)
|
|
946
|
|
947 self.unaligned_fasta_path = output_path
|
|
948
|
|
949 return self.unaligned_fasta_path
|
|
950
|
|
951 def get_unaligned_fasta_path(self):
|
|
952
|
|
953 if not self.unaligned_fasta_path:
|
|
954 self._get_unaligned_fasta_path()
|
|
955
|
|
956 return self.unaligned_fasta_path
|
|
957
|
|
958 def _align_fastas(self, aligner, assert_record=None):
|
|
959
|
|
960 # Write reference fasta
|
|
961 queries_path = self.get_unaligned_fasta_path()
|
|
962 first_record = SeqIO.parse(open(queries_path, "rU"), "fasta").next()
|
|
963 reference_path = os.path.join(self.get_tmp_path(), 'reference.fa')
|
|
964
|
|
965 logging.debug(
|
|
966 'Region: writing longest reference to %s' % reference_path)
|
|
967
|
|
968 SeqIO.write([first_record], reference_path, "fasta")
|
|
969
|
|
970 # Do alignment
|
|
971 bam_path = os.path.join(self.get_tmp_path(), 'aligned.bam')
|
|
972 logging.debug(
|
|
973 'Region: starting alignment using queries %s to sam path %s' %
|
|
974 (queries_path, bam_path))
|
|
975
|
|
976 aligner.align_to_bam_file(
|
|
977 reference_path, queries_path, bam_path, assert_record=assert_record)
|
|
978
|
|
979 return bam_path
|
|
980
|
|
981 def _build_alignment_consensus(self, consensus_builder, alignment_bam_path):
|
|
982
|
|
983 consensus_path = os.path.join(self.get_tmp_path(), 'consensus.fa')
|
|
984 logging.debug('Region: writing consensus to %s' % consensus_path)
|
|
985
|
|
986 consensus_builder.run(alignment_bam_path, consensus_path)
|
|
987
|
|
988 return consensus_path
|
|
989
|
|
990 def get_alignment_bam_path(self, aligner, assert_record=None):
|
|
991
|
|
992 return self._align_fastas(aligner, assert_record)
|
|
993
|
|
994 def get_consensus_fasta_path(self, consensus_builder, alignment_bam_path):
|
|
995
|
|
996 return self._build_alignment_consensus(consensus_builder, alignment_bam_path)
|
|
997
|
|
998 def get_consensus_figure_path(self, jalview_runner, consensus_fasta_path):
|
|
999
|
|
1000 png_path = os.path.join(self.get_tmp_path(), 'consensus_figure.png')
|
|
1001
|
|
1002 return jalview_runner.run(consensus_fasta_path, png_path)
|
|
1003
|
|
1004 def overlaps(self, other, max_gap_length):
|
|
1005
|
|
1006 overlap = (set(self.references) & set(other.references))
|
|
1007
|
|
1008 # Overlap is solely based on non-human references
|
|
1009 overlap = [ref for ref in overlap if not ';Homo_sapiens;' in ref]
|
|
1010
|
|
1011 if not overlap:
|
|
1012 return False
|
|
1013
|
|
1014 for reference in overlap:
|
|
1015 reflist = self.references[reference]
|
|
1016 for start, end in reflist:
|
|
1017 for other_start, other_end in other.references[reference]:
|
|
1018 distance = self._distance((start, end),
|
|
1019 (other_start, other_end))
|
|
1020 if distance <= max_gap_length:
|
|
1021 return True
|
|
1022 return False
|
|
1023
|
|
1024 def merge(self, other):
|
|
1025
|
|
1026 for reference, reflist in other.references.iteritems():
|
|
1027 for (start, end) in reflist:
|
|
1028 self.add_reference(reference, start, end)
|
|
1029
|
|
1030 for read in other.reads:
|
|
1031 self.add_read(read)
|
|
1032
|
|
1033 def clean_references(self, max_gap_length):
|
|
1034
|
|
1035 for reference, reflist in self.references.iteritems():
|
|
1036
|
|
1037 start_list = sorted(reflist)
|
|
1038 saved = list(start_list[0])
|
|
1039 result = set()
|
|
1040
|
|
1041 for item in start_list:
|
|
1042 if self._distance(saved, item) <= max_gap_length:
|
|
1043 saved[1] = max(saved[1], item[1])
|
|
1044 else:
|
|
1045 result.add(tuple(saved))
|
|
1046 saved[0] = item[0]
|
|
1047 saved[1] = item[1]
|
|
1048 result.add(tuple(saved))
|
|
1049 self.references[reference] = result
|
|
1050
|
|
1051 def to_sorted_record_ids(self):
|
|
1052
|
|
1053 references = []
|
|
1054 for name, locations in self.references.iteritems():
|
|
1055 for start, end in locations:
|
|
1056 references.append(end - start, name)
|
|
1057
|
|
1058 reads = defaultdict(list)
|
|
1059 for name, locations in self.references.iteritems():
|
|
1060 condition = name.split(';')[1]
|
|
1061 for start, end in locations:
|
|
1062 reads[condition].append(end - start, name)
|
|
1063
|
|
1064 all_record_ids = sorted(references, reverse=True)
|
|
1065 for condition, the_reads in reads.iteritems():
|
|
1066 all_record_ids += sorted(the_reads, reverse=True)
|
|
1067
|
|
1068 for length, record_id in all_record_ids:
|
|
1069 yield record_id
|
|
1070
|
|
1071 def __len__(self):
|
|
1072
|
|
1073 return len(self.reads)
|
|
1074
|
|
1075 def __cmp__(self, other):
|
|
1076
|
|
1077 if self.get_longest_reference_length() <\
|
|
1078 other.get_longest_reference_length():
|
|
1079 return -1
|
|
1080 elif self.get_longest_reference_length() ==\
|
|
1081 other.get_longest_reference_length():
|
|
1082 return 0
|
|
1083 return 1
|
|
1084
|
|
1085 def get_longest_reference_length(self):
|
|
1086
|
|
1087 if self.length is not None:
|
|
1088 return self.length
|
|
1089
|
|
1090 positions = self.get_sorted_reference_positions()
|
|
1091 if positions:
|
|
1092 self.length = self.get_sorted_reference_positions()[0][0]
|
|
1093 else:
|
|
1094 self.length = 0
|
|
1095
|
|
1096 return self.length
|
|
1097
|
|
1098 def get_sorted_reference_positions(self):
|
|
1099
|
|
1100 if self.sorted_reference_positions is not None:
|
|
1101 return self.sorted_reference_positions
|
|
1102
|
|
1103 lengths = []
|
|
1104 for reference, the_set in self.references.iteritems():
|
|
1105 for start, end in the_set:
|
|
1106 lengths.append([end - start, reference, start, end])
|
|
1107
|
|
1108 self.sorted_reference_positions = sorted(lengths, reverse=True)
|
|
1109
|
|
1110 return self.sorted_reference_positions
|
|
1111
|
|
1112 def __str__(self):
|
|
1113 return '<Region> of length %10i with %10i reads and %5i references' %\
|
|
1114 (self.get_longest_reference_length(),
|
|
1115 len(self.reads), len(self.references))
|
|
1116
|
|
1117
|
|
1118 class RegionStatistics:
|
|
1119
|
|
1120 def __init__(self, input_dir):
|
|
1121
|
|
1122 self.input_dir = input_dir
|
|
1123 self.stats = []
|
|
1124
|
|
1125 def _add_sequence(self, qualified_name, region_id,
|
|
1126 longest_human_reference, longest_pathogen_reference, record):
|
|
1127
|
|
1128 state = 'pathogen'
|
|
1129 if longest_human_reference:
|
|
1130 state = 'ambiguous'
|
|
1131
|
|
1132 reference_type = qualified_name.split('_')[0]
|
|
1133 family_name = '_'.join(qualified_name.split('_')[1:])
|
|
1134
|
|
1135 read_type, sample_id, number_reads, basepairs = record.id.split(';')
|
|
1136
|
|
1137 coverage = int(basepairs) / float(len(longest_pathogen_reference))
|
|
1138
|
|
1139 self.stats.append(
|
|
1140 [reference_type, family_name, region_id, sample_id, state,
|
|
1141 number_reads, str(len(longest_pathogen_reference)), basepairs, '%.5f' % coverage])
|
|
1142
|
|
1143 def run(self, output_file):
|
|
1144
|
|
1145 for qualified_family_name in os.listdir(self.input_dir):
|
|
1146
|
|
1147 family_dir = os.path.join(self.input_dir, qualified_family_name)
|
|
1148
|
|
1149 if not os.path.isdir(family_dir):
|
|
1150 continue
|
|
1151
|
|
1152 consensus_regions = glob.glob(os.path.join(
|
|
1153 family_dir, 'region_*_consensus.fa'))
|
|
1154
|
|
1155 for consensus_region in consensus_regions:
|
|
1156
|
|
1157 base_bame = os.path.basename(consensus_region)
|
|
1158
|
|
1159 region_id = base_bame.split('_')[1]
|
|
1160
|
|
1161 reads = []
|
|
1162 longest_human_reference = None
|
|
1163 longest_pathogen_reference = None
|
|
1164
|
|
1165 for consensus_record in SeqIO.parse(consensus_region, 'fasta'):
|
|
1166 if consensus_record.id.lower().startswith('read'):
|
|
1167 # ends with read number; cumulative bases
|
|
1168 reads.append(consensus_record)
|
|
1169 elif ';Homo_sapiens;' in consensus_record.id:
|
|
1170 if longest_human_reference is None or\
|
|
1171 len(consensus_record) > longest_human_reference:
|
|
1172 longest_human_reference = consensus_record
|
|
1173 else:
|
|
1174 if longest_pathogen_reference is None or\
|
|
1175 len(consensus_record) > longest_pathogen_reference:
|
|
1176 longest_pathogen_reference = consensus_record
|
|
1177
|
|
1178 for read in reads:
|
|
1179 self._add_sequence(qualified_family_name, region_id,
|
|
1180 longest_human_reference, longest_pathogen_reference, read)
|
|
1181
|
|
1182 with open(os.path.abspath(os.path.expandvars(output_file)), 'w') as output_handler:
|
|
1183 output_handler.write(
|
|
1184 '\t'.join(['reference_type', 'family_name', 'region_id',
|
|
1185 'sample_id', 'taxonomic_origin', 'number_reads',
|
|
1186 'region_length', 'basepairs', 'coverage']) + '\n')
|
|
1187 for entries in sorted(self.stats):
|
|
1188 line = '\t'.join(entries) + '\n'
|
|
1189 output_handler.write(line)
|
|
1190
|
|
1191
|
|
1192 @CLI.subcommand("regions")
|
|
1193 class RegionRunner(cli.Application):
|
|
1194
|
|
1195 """ Derive homologous groups and regions from hit files """
|
|
1196
|
|
1197 hit_files = cli.SwitchAttr(
|
|
1198 ['-v', '--virana_hits'], str, list=True, mandatory=True,
|
|
1199 help="Add hit file for analysis. May be supplied multiple times.")
|
|
1200
|
|
1201 lastz_path = cli.SwitchAttr(['-z', '--lastz_path'], str, mandatory=False,
|
|
1202 help="Path to lastz executable",
|
|
1203 default='')
|
|
1204
|
|
1205 jalview_jar_dir = cli.SwitchAttr(
|
|
1206 ['-j', '--jalview_jar_dir'], str, mandatory=False,
|
|
1207 help="Directory containing the jalview.jar file",
|
|
1208 default=None)
|
|
1209
|
|
1210 references_path = cli.SwitchAttr(
|
|
1211 ['-r', '--references'], str, mandatory=True,
|
|
1212 help="Input fasta file containing genomic references of pathogens and possibly of human chromosomes as well (the latter is experimental)",
|
|
1213 default='')
|
|
1214
|
|
1215 cdna_path = cli.SwitchAttr(['-c', '--cdna'], str, mandatory=False,
|
|
1216 help="Input fasta file containing human cDNA records.",
|
|
1217 default=None)
|
|
1218
|
|
1219 output_dir = cli.SwitchAttr(['-o', '--output_dir'], str, mandatory=True,
|
|
1220 help="Output directory that will be filled with subdirectories corresponding to homologous regions. The directory will be generated if it is not existing.",
|
|
1221 default='')
|
|
1222
|
|
1223 tmp_dir = cli.SwitchAttr(['-t', '--tmp_dir'], str, mandatory=False,
|
|
1224 help="Directory in which temorary files are stored. Temporary files will be stored in subdirectories of the provided directory and will be deleted after completion.",
|
|
1225 default=None)
|
|
1226
|
|
1227 stat_path = cli.SwitchAttr(['-s', '--region_stats'], str, mandatory=False,
|
|
1228 help="Path to output statistics tab-delimited text file.",
|
|
1229 default='')
|
|
1230
|
|
1231 reference_database_filter = cli.SwitchAttr(
|
|
1232 ['-b', '--reference_database_filter'], str, list=True, mandatory=False,
|
|
1233 help="Specifies which kind of reference databases are consisdered when extracting hits from hit files. May be specified multiple times. If any are specified, all reference databases not specified are filtered out. By default, this parameter is empty.",
|
|
1234 default=[])
|
|
1235
|
|
1236 pathogen_family_filter = cli.SwitchAttr(
|
|
1237 ['-f', '--pathogen_family_filter'], str, list=True, mandatory=False,
|
|
1238 help="Specifies which kind of pathogen families are consisdered when extracting hits from hit files. May be specified multiple times. If any are specified, all families not specified are filtered out. By default, this parameter is empty.",
|
|
1239 default=[])
|
|
1240
|
|
1241 min_read_number = cli.SwitchAttr(
|
|
1242 ['-m', '--min_read_number'], cli.Range(1, 1000), mandatory=False,
|
|
1243 help="Minimum number of reads that are required to be present in homologous region. Regions with fewer reads will be omitted from the results.",
|
|
1244 default=5)
|
|
1245
|
|
1246 max_gap_length = cli.SwitchAttr(
|
|
1247 ['-l', '--max_gap_length'], cli.Range(1, 1000), mandatory=False,
|
|
1248 help="Maximum number bases that two candidate homologous regions are distant from each other with regard to their positions on a common reference sequence in order for being eligable for merging.",
|
|
1249 default=25)
|
|
1250
|
|
1251 min_region_length = cli.SwitchAttr(
|
|
1252 ['-x', '--min_region_length'], cli.Range(1, 1000), mandatory=False,
|
|
1253 help="Minimum number bases of the longest reference sequence of each homologous region that is generated. Shoer regions will be omitted from the results.",
|
|
1254 default=50)
|
|
1255
|
|
1256 word_length = cli.SwitchAttr(
|
|
1257 ['-w', '--word_length'], cli.Range(1, 21), mandatory=False,
|
|
1258 help="Word length of the lastz alignment process. Shorter word lengths allow more sensitive but less specific alignments.",
|
|
1259 default=7)
|
|
1260
|
|
1261 ambiguity_cutoff = cli.SwitchAttr(
|
|
1262 ['-a', '--ambiguity_cutoff'], float, mandatory=False,
|
|
1263 help="Ratio of variant positions within a column of the homologous region sequence alignment so that the corresponding consensus sequence at this positions show a ambiguous base instead of the majority base.",
|
|
1264 default=0.3)
|
|
1265
|
|
1266 debug = cli.Flag(
|
|
1267 ["-d", "--debug"], help="Enable debug information")
|
|
1268
|
|
1269 def main(self):
|
|
1270
|
|
1271 if self.debug:
|
|
1272 logging.getLogger().setLevel(logging.DEBUG)
|
|
1273
|
|
1274 # Make sequence proxy for managing hit files, references, and cdna
|
|
1275 proxy = SequenceProxy(self.references_path, self.cdna_path)
|
|
1276 for hit_file in self.hit_files:
|
|
1277 proxy.add_hit_file(hit_file)
|
|
1278
|
|
1279 # Generate homologous groups
|
|
1280 generator = GroupGenerator(proxy,
|
|
1281 reference_database_filter=self.reference_database_filter,
|
|
1282 pathogen_family_filter=self.pathogen_family_filter,
|
|
1283 tmp_dir=self.tmp_dir)
|
|
1284 groups = generator.get_groups(min_read_number=self.min_read_number)
|
|
1285
|
|
1286 # Prepare analysis modules ('runners') for postprocessing homologous
|
|
1287 # regions
|
|
1288 consensus = ConsensusRunner(ambiguity_cutoff=self.ambiguity_cutoff)
|
|
1289 lastz = LastzRunner(word_length=self.word_length)
|
|
1290
|
|
1291 if self.jalview_jar_dir:
|
|
1292 jalview = JalviewRunner(
|
|
1293 jalview_jar_dir=self.jalview_jar_dir, tmp_dir=self.tmp_dir)
|
|
1294 else:
|
|
1295 jalview = None
|
|
1296
|
|
1297 # Make homologous regions within each homologous group
|
|
1298 for name, group in groups.iteritems():
|
|
1299 group.merge_regions(max_gap_length=self.max_gap_length)
|
|
1300 group.filter_regions(
|
|
1301 min_region_length=self.min_region_length, min_read_number=self.min_read_number)
|
|
1302
|
|
1303 if self.cdna_path:
|
|
1304 group.add_transcripts_to_regions()
|
|
1305
|
|
1306 group.write_outputs(self.output_dir, lastz, consensus, jalview)
|
|
1307 group._delete_temporary_dir()
|
|
1308
|
|
1309 # Run statistics on all homologous regions
|
|
1310 if self.stat_path:
|
|
1311 statistics = RegionStatistics(self.output_dir)
|
|
1312 statistics.run(self.stat_path)
|
|
1313
|
|
1314 if __name__ == "__main__":
|
|
1315 CLI.run()
|