annotate vhom.py @ 18:d5a3b07f91c4 draft

Uploaded
author mzeidler
date Mon, 30 Sep 2013 06:22:58 -0400
parents f63d639b223c
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1 #!/usr/bin/env python
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
2
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
3 import sys
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
4 import os
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
5 import logging
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
6 import tempfile
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
7 import shutil
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
8
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
9 import subprocess
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
10 import bz2
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
11 import re
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
12 import glob
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
13
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
14 from shutil import rmtree
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
15 from subprocess import PIPE
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
16 from collections import defaultdict, Counter
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
17
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
18 try:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
19 import pysam
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
20 except ImportError:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
21 message = 'This script requires the pysam python package\n'
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
22 sys.stderr.write(message)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
23 sys.exit(1)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
24
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
25 try:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
26 from plumbum import cli
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
27 except ImportError:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
28 message = 'This script requires the plumbum python package\n'
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
29 sys.stderr.write(message)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
30 sys.exit(1)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
31
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
32 try:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
33 from Bio.SeqRecord import SeqRecord
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
34 from Bio import SeqIO
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
35 from Bio.Seq import Seq
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
36 from Bio.Alphabet import IUPAC, Gapped
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
37 except ImportError:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
38 message = 'This script requires the BioPython python package\n'
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
39 sys.stderr.write(message)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
40 sys.exit(1)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
41
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
42 try:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
43 import HTSeq
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
44 except ImportError:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
45 message = 'This script requires the HTSeq python package\n'
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
46 sys.stderr.write(message)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
47 sys.exit(1)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
48
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
49 logging.basicConfig(level=logging.INFO, format='%(message)s')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
50
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
51
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
52 class CLI(cli.Application):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
53
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
54 """ Homologous region analysis of hit files"""
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
55 PROGNAME = "vmap"
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
56 VERSION = "1.0.0"
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
57 DESCRIPTION = """"""
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
58 USAGE = """"""
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
59
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
60 def main(self, *args):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
61
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
62 if args:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
63 print("Unknown command %r" % (args[0]))
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
64 print self.USAGE
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
65 return 1
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
66
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
67 if not self.nested_command:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
68 print("No command given")
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
69 print self.USAGE
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
70 return 1
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
71
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
72 class SequenceProxy:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
73
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
74 def __init__(self, references_path, human_transcripts_path=None):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
75
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
76 self.references_path = references_path
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
77 self.human_transcripts_path = human_transcripts_path
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
78
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
79 self.human_transcript_records = None
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
80 self.reference_records = None
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
81 self.hit_records = []
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
82
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
83 self.transcript_ranges = None
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
84
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
85 def add_hit_file(self, hit_file_path):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
86
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
87 logging.debug('SequenceProxy: adding new hit file %s' % hit_file_path)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
88
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
89 hit_handle = bz2.BZ2File(hit_file_path, 'rb')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
90 try:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
91 hit_handle.read(10)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
92 except IOError:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
93 hit_handle = open(hit_file_path, 'rb')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
94
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
95 hit_dict = SeqIO.to_dict(SeqIO.parse(hit_handle, "fasta"))
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
96
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
97 logging.debug('SequenceProxy: hit file has %i entries' % len(hit_dict))
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
98
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
99 self.hit_records.append(hit_dict)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
100
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
101 def get_read_record(self, read_id):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
102
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
103 for record_dict in self.hit_records:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
104 if read_id in record_dict:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
105 record = record_dict[read_id]
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
106 return SeqRecord(record.seq, record.id, '', '')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
107
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
108 return None
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
109
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
110 def get_reference_record(self, identifier):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
111
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
112 if not self.reference_records:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
113 logging.debug('SequenceProxy: indexing reference records')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
114 self.reference_records = SeqIO.index(self.references_path, 'fasta')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
115
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
116 try:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
117 record = self.reference_records[identifier]
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
118 return SeqRecord(record.seq, record.id, '', '')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
119 except KeyError:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
120 return None
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
121
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
122 def get_transcript_record(self, identifier):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
123
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
124 if not self.human_transcripts_path:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
125 return None
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
126
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
127 if not self.human_transcript_records:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
128 logging.debug('SequenceProxy: indexing human transcripts')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
129 self.human_transcript_records = SeqIO.index(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
130 self.human_transcripts_path, 'fasta')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
131
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
132 try:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
133 record = self.human_transcript_records[identifier]
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
134 return SeqRecord(record.seq, record.id, '', '')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
135
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
136 except KeyError:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
137 return None
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
138
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
139 def get_overlapping_human_transcript_identifiers(self, chromosome, start, end):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
140
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
141 if not self.human_transcripts_path:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
142 return set()
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
143
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
144 if not self.transcript_ranges:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
145 self._set_transcript_ranges()
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
146
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
147 interval = HTSeq.GenomicInterval(chromosome, start, end, '.')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
148 all_transcript_ids = set()
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
149 for interval, transcript_ids in self.transcript_ranges[interval].steps():
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
150 all_transcript_ids = all_transcript_ids.union(transcript_ids)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
151
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
152 return all_transcript_ids
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
153
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
154 def _set_transcript_ranges(self):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
155
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
156 logging.debug(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
157 'SequenceProxy: Preparing transcript ranges, please wait...')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
158
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
159 self.transcript_ranges = HTSeq.GenomicArrayOfSets(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
160 'auto', stranded=False)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
161
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
162 for record in SeqIO.parse(self.human_transcripts_path, 'fasta'):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
163 transcript_id = record.id
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
164 for exon in record.description.strip().split(' ')[1].split('|'):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
165 chromosome, start, end = exon.split(';')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
166 start, end = sorted((int(start), int(end)))
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
167 if start == end:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
168 continue
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
169 interval = HTSeq.GenomicInterval(chromosome, start, end, ".")
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
170 self.transcript_ranges[interval] += transcript_id
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
171
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
172
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
173 class JalviewRunner:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
174
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
175 def __init__(self, jalview_jar_dir, tmp_dir=None):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
176
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
177 self.jalview_jar_dir = jalview_jar_dir
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
178
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
179 if tmp_dir:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
180 self.tmp_dir = tempfile.mkdtemp(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
181 dir=os.path.abspath(os.path.expandvars(tmp_dir)))
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
182 else:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
183 self.tmp_dir = tempfile.mkdtemp()
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
184
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
185 self.config = ['#---JalviewX Properties File---',
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
186 '#Fri Nov 04 14:23:53 CET 2011',
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
187 'FIGURE_AUTOIDWIDTH=true',
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
188 'ID_ITALICS=false',
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
189 'SORT_ALIGNMENT=No sort',
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
190 'SHOW_IDENTITY=true',
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
191 'FONT_NAME=SansSerif',
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
192 'GAP_SYMBOL=.',
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
193 'SHOW_QUALITY=false',
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
194 'SHOW_GROUP_CONSERVATION=false',
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
195 'FONT_STYLE=plain',
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
196 'ANTI_ALIAS=false',
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
197 'SORT_BY_TREE=false',
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
198 'SHOW_CONSENSUS_HISTOGRAM=false',
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
199 'SHOW_OVERVIEW=false',
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
200 'DEFAULT_COLOUR=Nucleotide',
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
201 'SHOW_CONSENSUS_LOGO=false',
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
202 'SHOW_ANNOTATIONS=true',
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
203 'SHOW_UNCONSERVED=true',
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
204 'AUTO_CALC_CONSENSUS=true',
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
205 'PAD_GAPS=true',
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
206 'FONT_SIZE=10',
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
207 'RIGHT_ALIGN_IDS=false',
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
208 'WRAP_ALIGNMENT=false',
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
209 'FASTA_JVSUFFIX=false',
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
210 'PILEUP_JVSUFFIX=false',
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
211 'SHOW_JVSUFFIX=false']
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
212
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
213 def _make_configuration(self, sub_tmp_dir):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
214
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
215 config_path = os.path.join(sub_tmp_dir, 'config.txt')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
216 logging.debug(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
217 'JalviewRunner: writing configuration file %s' % config_path)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
218 with open(config_path, 'w') as config_file:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
219 for c in self.config:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
220 config_file.write(c + '\n')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
221
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
222 return config_path
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
223
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
224 def _delete_tmp_dir(self):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
225
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
226 try:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
227 rmtree(self.tmp_dir)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
228 except OSError:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
229 pass
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
230
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
231 def _delete_sub_tmp_dir(self, sub_tmp_dir):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
232
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
233 try:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
234 rmtree(sub_tmp_dir)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
235 except OSError:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
236 pass
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
237
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
238 def run(self, fasta_path, png_output_path):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
239
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
240 fasta_path = os.path.abspath(os.path.expandvars(fasta_path))
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
241
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
242 png_output_path = os.path.abspath(os.path.expandvars(png_output_path))
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
243
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
244 sub_tmp_dir = tempfile.mkdtemp(dir=self.tmp_dir)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
245 config_path = self._make_configuration(sub_tmp_dir)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
246
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
247 logging.debug(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
248 'JalviewRunner: running Jalview on fasta %s with temp dir %s' %
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
249 (fasta_path, sub_tmp_dir))
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
250
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
251 cline = ['java', '-Djava.ext.dirs=%s/lib' % self.jalview_jar_dir,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
252 '-cp %s/jalview.jar' % self.jalview_jar_dir, 'jalview.bin.Jalview',
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
253 '-open', fasta_path, '-nodisplay', '-png', png_output_path,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
254 '-colour', 'nucleotide', '-props', config_path]
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
255
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
256 # Run sibelia process
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
257 java_process = subprocess.Popen(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
258 ' '.join(cline), shell=True, stdout=PIPE, stderr=PIPE)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
259
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
260 # Block until streams are closed by the process
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
261 stdout, stderr = java_process.communicate()
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
262
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
263 if stderr:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
264 sys.stderr.write('JalviewRunner: ' + stderr.replace('\n', ' '))
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
265
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
266 self._delete_sub_tmp_dir(sub_tmp_dir)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
267
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
268 return png_output_path
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
269
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
270 def __del__(self):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
271
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
272 self._delete_tmp_dir()
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
273
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
274
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
275 class LastzRunner:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
276
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
277 def __init__(self, lastz_path=None, word_length=7):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
278
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
279 self.lastz_path = lastz_path
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
280 self.word_length = word_length
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
281
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
282 def align_to_bam_file(self, reference_fasta_path, query_fasta_path, output_bam_path, multiple=False, assert_record=None):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
283
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
284 logging.debug('LastzRunner: running on reference %s and query %s' %
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
285 (reference_fasta_path, query_fasta_path))
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
286 output_sam_path = os.path.abspath(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
287 os.path.expandvars(output_bam_path.replace('.bam', '.sam')))
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
288 output_bam_unsorted_path = os.path.abspath(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
289 os.path.expandvars(output_bam_path + '.unsorted'))
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
290
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
291 logging.debug(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
292 'LastzRunner: aligning with output in temporary sam file %s' %
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
293 output_sam_path)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
294 with open(output_sam_path, 'w') as output_sam_handler:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
295 for line in self._align(reference_fasta_path, query_fasta_path, multiple):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
296 output_sam_handler.write(line)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
297
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
298 logging.debug(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
299 'LastzRunner: transforming sam into unsorted bam file %s' %
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
300 output_bam_unsorted_path)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
301 input_sam_handler = pysam.Samfile(output_sam_path, "r")
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
302 output_bam_file = pysam.Samfile(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
303 output_bam_unsorted_path, "wb", template=input_sam_handler)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
304
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
305 logging.debug(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
306 'LastzRunner: copying from sam file to bam file')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
307 for s in input_sam_handler:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
308 output_bam_file.write(s)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
309 output_bam_file.close()
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
310
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
311 logging.debug('LastzRunner: sorting and indexing bam file %s' %
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
312 output_bam_path)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
313 pysam.sort(output_bam_unsorted_path,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
314 output_bam_path.replace('.bam', ''))
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
315
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
316 pysam.index(output_bam_path)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
317
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
318 def align_to_samlines(self, reference_fasta_path, query_fasta_path, multiple=False):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
319
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
320 logging.debug('LastzRunner: running on reference %s and query %s' %
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
321 (reference_fasta_path, query_fasta_path))
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
322
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
323 for line in self._align(reference_fasta_path, query_fasta_path, multiple):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
324 yield line
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
325
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
326 def _align(self, reference_fasta_path, query_fasta_path, multiple):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
327
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
328 reference_fasta_path = os.path.abspath(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
329 os.path.expandvars(reference_fasta_path))
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
330 query_fasta_path = os.path.abspath(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
331 os.path.expandvars(query_fasta_path))
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
332
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
333 lastz = [self.lastz_path and self.lastz_path or 'lastz']
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
334 if multiple:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
335 cline = lastz + [
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
336 reference_fasta_path +
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
337 '[multiple]', query_fasta_path + '[nameparse=darkspace]',
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
338 '--format=sam', '--strand=both', '--gapped', '--chain',
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
339 '--nogfextend', '--seed=match%i' % self.word_length]
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
340
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
341 else:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
342 cline = lastz + [
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
343 reference_fasta_path, query_fasta_path +
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
344 '[nameparse=darkspace]',
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
345 '--format=sam', '--strand=both', '--gapped',
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
346 '--chain', '--nogfextend', '--seed=match%i' % self.word_length]
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
347
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
348 # Run lastz process
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
349 lastz_process = subprocess.Popen(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
350 ' '.join(cline), shell=True, stdout=PIPE, stderr=PIPE)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
351
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
352 # Filter multiple read mappings (one to each strand) by only keeping
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
353 # the one with the most matches
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
354 alignments = []
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
355 for i, line in enumerate(iter(lastz_process.stdout.readline, '')):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
356 if line.startswith('@'):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
357 yield line
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
358 elif line.lower().startswith('read'):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
359
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
360 fields = line.split('\t')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
361 read_name = fields[0]
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
362 read_cigar = fields[5]
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
363 if not alignments:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
364 alignments = [read_name, line, read_cigar]
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
365 continue
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
366 else:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
367 if alignments[0] == read_name:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
368 this_mapping = sum(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
369 [int(c[:-1]) for c in re.findall('[0-9]*M', read_cigar)])
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
370 other_mapping = sum(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
371 [int(c[:-1]) for c in re.findall('[0-9]*M', alignments[2])])
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
372
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
373 if other_mapping > this_mapping:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
374 yield alignments[1]
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
375 else:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
376 yield line
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
377 else:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
378 yield line
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
379 alignments = []
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
380 else:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
381 yield line
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
382
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
383
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
384 class ConsensusRunner:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
385
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
386 def __init__(self, ambiguity_cutoff=0.3):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
387
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
388 self.ambiguity_cutoff = ambiguity_cutoff
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
389 self.ambiguity = {
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
390 'GT': 'K', 'AC': 'M', 'AG': 'R', 'CT': 'Y', 'CG': 'S',
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
391 'AT': 'W', 'CGT': 'B', 'ACG': 'V', 'ACT': 'H', 'AGT': 'D',
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
392 'ACGT': 'N'}
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
393
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
394 self.alphabet = Gapped(IUPAC.unambiguous_dna, "-")
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
395
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
396 def run(self, input_sam_path, output_fasta_path):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
397
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
398 samfile = pysam.Samfile(input_sam_path, "rb")
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
399 references = samfile.references
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
400
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
401 logging.debug(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
402 'ConsensusRunner: writing consensus of bam file %s' % input_sam_path)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
403
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
404 assert len(references) == 1
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
405
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
406 consensus = defaultdict(list)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
407 conditions = {}
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
408
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
409 base_mapper = {'A': 0, 'C': 1, 'G': 2, 'T': 3, 'N': 4, '-': 5}
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
410 ambiguity = self.ambiguity
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
411
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
412 alphabet = Gapped(IUPAC.ambiguous_dna)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
413 frequency_cutoff = self.ambiguity_cutoff
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
414
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
415 # Get all conditions of aligned reads
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
416 for alignedread in samfile.fetch(references[0]):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
417
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
418 name = alignedread.qname
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
419 length = len(alignedread.seq)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
420 condition = None
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
421
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
422 if name.lower().startswith('read'):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
423 condition = ';'.join(name.split(';')[:2])
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
424 else:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
425 condition = ';'.join(name.split(';')[:-1])
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
426
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
427 if condition in conditions:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
428 if name not in conditions[condition][0]:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
429 conditions[condition][0].add(name)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
430 conditions[condition][1] += 1
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
431 conditions[condition][2] += length
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
432 else:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
433 conditions[condition] = [set([name]), 1, length]
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
434
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
435 # Prepare count dictionaries
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
436 consensus = defaultdict(list)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
437
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
438 for i, pileupcolumn in enumerate(samfile.pileup(references[0], max_depth=100000)):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
439
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
440 if i % 1000 == 0:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
441 logging.debug(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
442 'ConsensusRunner: making consensus at position %i' % i)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
443
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
444 counters = {}
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
445 for condition in conditions:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
446 counters[condition] = [0, 0, 0, 0, 0, 0] # ACGTN-
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
447
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
448 for pileupread in pileupcolumn.pileups:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
449
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
450 alignment = pileupread.alignment
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
451 name = alignment.qname
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
452
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
453 if pileupread.is_del:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
454 base = '-'
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
455 else:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
456 base = alignment.seq[pileupread.qpos].upper()
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
457
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
458 if name.lower().startswith('read'):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
459 condition = ';'.join(name.split(';')[:2])
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
460 counters[condition][base_mapper[base]] += 1
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
461 else:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
462 condition = ';'.join(name.split(';')[:-1])
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
463 counters[condition][base_mapper[base]] += 1
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
464
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
465 for condition, counts in counters.iteritems():
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
466 depth = float(sum(counts))
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
467 bases = []
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
468 a, c, g, t, n, gap = counts
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
469
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
470 if depth > 0:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
471 if (n / depth) >= frequency_cutoff:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
472 consensus[condition].append('N')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
473 else:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
474 if (a / depth) >= frequency_cutoff:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
475 bases.append('A')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
476 if (c / depth) >= frequency_cutoff:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
477 bases.append('C')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
478 if (g / depth) >= frequency_cutoff:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
479 bases.append('G')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
480 if (t / depth) >= frequency_cutoff:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
481 bases.append('T')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
482
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
483 if len(bases) > 1:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
484 consensus[condition].append(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
485 ambiguity[''.join(sorted(bases))])
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
486 elif len(bases) == 1:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
487 consensus[condition].append(bases[0])
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
488 else:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
489 consensus[condition].append('-')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
490
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
491 # Split consensuses by type
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
492 reference_consensuses = []
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
493 read_consensuses = []
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
494 for condition, sequence_list in consensus.iteritems():
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
495 if condition.lower().startswith('read'):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
496 read_consensuses.append((''.join(sequence_list), condition))
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
497 else:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
498 reference_consensuses.append(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
499 (''.join(sequence_list), condition))
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
500
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
501 reference_consensuses.sort(reverse=True)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
502 read_consensuses.sort(reverse=True)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
503
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
504 # Get start and end of reads (remove fully gapped positions)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
505 start = None
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
506 end = None
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
507 for read_consensus, condition in read_consensuses:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
508
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
509 # Forward direction
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
510 for i, char in enumerate(read_consensus):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
511 if char != '-':
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
512 if start is None:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
513 start = i
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
514 else:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
515 start = min(start, i)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
516 break
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
517
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
518 # Reverse direction
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
519 for i, char in enumerate(read_consensus[::-1]):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
520 if char != '-':
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
521 if end is None:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
522 end = i
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
523 else:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
524 end = min(end, i)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
525 break
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
526
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
527 if end == 0 or end is None:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
528 end = None
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
529 else:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
530 end = -end
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
531
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
532 # Filter all records by start and end of reads
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
533 reference_consensuses =\
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
534 [(r[start:end], c) for r, c in reference_consensuses]
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
535
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
536 read_consensuses =\
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
537 [(r[start:end], c) for r, c in read_consensuses]
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
538
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
539 # Write consensus records to file
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
540 with open(output_fasta_path, 'w', buffering=100 * 1024 * 1024) as output_handler:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
541 for consensus, condition in read_consensuses + reference_consensuses:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
542 stats = ';'.join(map(str, conditions[condition][1:]))
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
543 record = SeqRecord(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
544 Seq(consensus, alphabet=alphabet), id=condition + ';' + stats, name='', description='')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
545 SeqIO.write([record], output_handler, "fasta")
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
546
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
547
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
548 class Group:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
549
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
550 def __init__(self, family_name, qualified_family_name, sequence_proxy, tmp_dir=None):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
551
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
552 self.family_name = family_name
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
553 self.qualified_family_name = qualified_family_name
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
554 self.sequence_proxy = sequence_proxy
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
555
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
556 self.regions = []
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
557 self.fasta_path = None
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
558
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
559 if tmp_dir:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
560 self.tmp_dir = tempfile.mkdtemp(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
561 dir=os.path.abspath(os.path.expandvars(tmp_dir)))
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
562 else:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
563 self.tmp_dir = tempfile.mkdtemp()
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
564
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
565 def add_region(self, read_id, pathogen_mapping_locations, human_mapping_locations):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
566
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
567 region = Region(self.sequence_proxy, self.tmp_dir)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
568
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
569 region.add_read(read_id)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
570
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
571 for mapping_location in list(pathogen_mapping_locations) + list(human_mapping_locations):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
572 reference = ';'.join(mapping_location[:-2])
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
573 region.add_reference(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
574 reference, int(mapping_location[-2]), int(mapping_location[-1]))
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
575
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
576 self.regions.append(region)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
577
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
578 def write_outputs(self, output_dir, lastz_runner, consensus_builder, jalview_runner):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
579
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
580 if not self.regions:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
581 logging.debug('Group %s: no regions for family' % self.family_name)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
582 return
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
583
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
584 made_output_dir = False
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
585 for i, region in enumerate(self.regions):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
586
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
587 if not made_output_dir:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
588 output_dir = os.path.abspath(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
589 os.path.expandvars(os.path.join(output_dir, self.qualified_family_name)))
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
590 logging.debug('Group %s: writing output files to %s' %
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
591 (self.family_name, output_dir))
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
592 try:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
593 os.makedirs(output_dir)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
594 except OSError:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
595 pass
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
596 made_output_dir = True
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
597
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
598 logging.debug('Group %s: writing region number %i files to %s' %
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
599 (self.family_name, i + 1, output_dir))
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
600
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
601 unaligned_path = os.path.join(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
602 output_dir, 'region_%i_unaligned.fa.bzip2' % (i + 1))
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
603 compressed_unaligned = bz2.BZ2File(unaligned_path, 'wb')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
604 with open(region.get_unaligned_fasta_path()) as unaligned_fasta:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
605 for line in unaligned_fasta:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
606 compressed_unaligned.write(line)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
607
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
608 bam_path = os.path.join(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
609 output_dir, 'region_%i_alignment.bam' % (i + 1))
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
610 temporary_alignment_bam_path = region.get_alignment_bam_path(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
611 lastz_runner, assert_record='Read')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
612 shutil.copy(temporary_alignment_bam_path, bam_path)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
613
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
614 shutil.copy(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
615 temporary_alignment_bam_path + '.bai', bam_path + '.bai')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
616
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
617 consensus_path = os.path.join(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
618 output_dir, 'region_%i_consensus.fa' % (i + 1))
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
619 temporary_consensus_path = region.get_consensus_fasta_path(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
620 consensus_builder, temporary_alignment_bam_path)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
621 shutil.copy(temporary_consensus_path, consensus_path)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
622
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
623 if jalview_runner:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
624
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
625 jalview_path = os.path.join(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
626 output_dir, 'region_%i_consensus.png' % (i + 1))
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
627 temporary_jalview_path = region.get_consensus_figure_path(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
628 jalview_runner, temporary_consensus_path)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
629 shutil.copy(temporary_jalview_path, jalview_path)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
630
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
631 def _delete_temporary_dir(self):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
632
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
633 self.fasta_path = None
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
634 try:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
635 rmtree(self.tmp_dir)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
636 except OSError:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
637 pass
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
638
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
639 def filter_regions(self, min_region_length=50, min_read_number=5):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
640
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
641 logging.debug('Group %s: filtering %i candidate regions' %
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
642 (self.family_name, len(self.regions)))
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
643
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
644 filtered = [r for r in self.regions if len(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
645 r) >= min_read_number and r.get_longest_reference_length() >= min_region_length]
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
646
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
647 ordered = sorted(filtered, reverse=True)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
648
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
649 if ordered:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
650 length = ordered[0].length
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
651 else:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
652 length = 0
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
653
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
654 logging.debug(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
655 'Group %s: %i candidate regions remained after filtering, the longest is %i bp long' %
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
656 (self.family_name, len(ordered), length))
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
657
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
658 self.regions = ordered
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
659
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
660 return ordered
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
661
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
662 def merge_regions(self, max_gap_length):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
663 """ Merges candidate regions into homologous regions. """
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
664
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
665 logging.debug('Group %s: merging %i candidate regions' %
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
666 (self.family_name, len(self.regions)))
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
667
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
668 if len(self.regions) > 1:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
669
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
670 potentially_mergable = self.regions
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
671 not_mergable = []
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
672
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
673 while len(potentially_mergable) > 1:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
674
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
675 merged = False
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
676 current = potentially_mergable[0]
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
677 compared_to = potentially_mergable[1:]
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
678
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
679 for region in compared_to:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
680 if region.overlaps(current, max_gap_length):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
681 region.merge(current)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
682 region.clean_references(max_gap_length)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
683 #logging.debug('Group %s: merged a region. %i potentially mergable candidate regions remaining' % (self.family_name, len(potentially_mergable)))
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
684 potentially_mergable = compared_to
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
685 merged = True
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
686 break
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
687
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
688 if not merged:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
689 not_mergable.append(current)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
690 potentially_mergable = compared_to
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
691 #logging.debug('Group %s: not merged a region. %i potentially mergable candidate regions remaining' % (self.family_name, len(potentially_mergable)))
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
692
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
693 results = not_mergable + potentially_mergable
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
694
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
695 logging.debug('Group %s: merged into %i regions' %
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
696 (self.family_name, len(results)))
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
697
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
698 self.regions = results
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
699
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
700 else:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
701 logging.debug(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
702 'Group %s: found only 1 region, no mergin necessary' % self.family_name)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
703
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
704 def add_transcripts_to_regions(self):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
705
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
706 logging.debug('Group %s: enriching %i regions with human transcript information'
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
707 % (self.family_name, len(self.regions)))
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
708
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
709 for region in self.regions:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
710
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
711 added_transcripts = 0
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
712
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
713 for identifier, locations in region.references.iteritems():
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
714 chromosome = identifier.split(';')[-1]
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
715
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
716 for start, end in locations:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
717 transcript_identifiers = self.sequence_proxy.get_overlapping_human_transcript_identifiers(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
718 chromosome, start, end)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
719
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
720 for transcript_identifier in transcript_identifiers:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
721 region.add_transcript(transcript_identifier)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
722 added_transcripts += 1
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
723
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
724 logging.debug('Group %s: added %i transcripts to a region' %
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
725 (self.family_name, added_transcripts))
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
726
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
727 def __str__(self):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
728
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
729 s = "Group %s with %i regions" % (self.family_name, len(self.regions))
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
730
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
731 return s
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
732
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
733
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
734 class GroupGenerator:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
735
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
736 """ Produces Groups from Hitfiles. Reads are assigned to groups based on
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
737 taxonomic families. Transcripts overlapping human read mappings as well
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
738 as genome sequences of pathogens the reads map to are added to the
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
739 Groups.
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
740 """
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
741
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
742 def __init__(self, sequence_proxy, reference_database_filter=None, pathogen_family_filter=None, tmp_dir=None):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
743
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
744 self.sequence_proxy = sequence_proxy
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
745
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
746 self.tmp_path = tmp_dir
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
747 self.reference_database_filter = reference_database_filter
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
748 self.pathogen_family_filter = pathogen_family_filter
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
749
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
750 self.groups = {}
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
751
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
752 if tmp_dir:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
753 self.tmp_dir = tempfile.mkdtemp(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
754 dir=os.path.abspath(os.path.expandvars(tmp_dir)))
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
755 else:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
756 self.tmp_dir = tempfile.mkdtemp()
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
757
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
758 def get_groups(self, min_read_number=5):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
759
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
760 logging.debug(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
761 'GroupGenerator: generating groups from %i hit files, please wait...' %
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
762 len(self.sequence_proxy.hit_records))
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
763
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
764 for hit_index in self.sequence_proxy.hit_records:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
765 for read_id, record in hit_index.iteritems():
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
766
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
767 # Extract mapping locations to human DNA, human transcript, and
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
768 # pathogen genomes
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
769 mapping_locations = record.description.strip().split(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
770 ' ')[1].split('|')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
771
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
772 pathogen_families = defaultdict(set)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
773 human_mapping_locations = set()
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
774
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
775 for mapping_location in mapping_locations:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
776 fields = mapping_location.split(';')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
777
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
778 # Convert start and end to integers
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
779 fields[-2] = int(fields[-2]) # start
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
780 fields[-1] = int(fields[-1]) # end
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
781
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
782 # Add mapping locations to human cDNA or human DNA
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
783 if fields[2] == 'Homo_sapiens':
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
784 human_mapping_locations.add(tuple(fields))
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
785
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
786 # Add mapping locations to non-human references
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
787 else:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
788 reference_database, family = fields[:2]
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
789 if self.reference_database_filter and reference_database\
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
790 not in self.reference_database_filter:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
791 continue
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
792 if self.pathogen_family_filter and family\
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
793 not in self.pathogen_family_filter:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
794 continue
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
795 pathogen_families[
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
796 (reference_database, family)].add(tuple(fields))
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
797
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
798 if not pathogen_families:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
799 continue
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
800
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
801 # Process the hits to different pathogen families
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
802 for (reference_database, family), pathogen_locations in pathogen_families.iteritems():
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
803
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
804 qualified_family_name = reference_database + '_' + family
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
805
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
806 # Obtain existing group or make new group
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
807 if qualified_family_name in self.groups:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
808 group = self.groups[qualified_family_name]
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
809 else:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
810 group = Group(family, qualified_family_name,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
811 self.sequence_proxy, self.tmp_dir)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
812 self.groups[qualified_family_name] = group
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
813
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
814 group.add_region(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
815 read_id, pathogen_locations, human_mapping_locations)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
816
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
817 # Exclude groups that have collected to few reads
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
818 for qualified_family_name, group in self.groups.items():
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
819 if len(group.regions) < min_read_number:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
820 del self.groups[qualified_family_name]
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
821 else:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
822 logging.debug(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
823 'GroupGenerator: made new group for family %s' % qualified_family_name)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
824
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
825 return self.groups
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
826
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
827
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
828 class Region:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
829
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
830 def __init__(self, sequence_proxy, tmp_dir=None):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
831
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
832 self.sequence_proxy = sequence_proxy
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
833
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
834 self.references = defaultdict(set)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
835 self.reads = set()
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
836 self.transcripts = set()
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
837 self.length = None
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
838
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
839 self.sorted_reference_positions = None
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
840
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
841 self.master_tmp = tmp_dir
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
842 self.tmp_dir = None
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
843
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
844 self.unaligned_fasta_path = None
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
845 self.alignment_sam_path = None
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
846 self.consensus_fasta_path = None
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
847
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
848 def add_reference(self, name, start, end):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
849
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
850 assert start < end
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
851 self.length = None
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
852 self.longest_reference_id = None
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
853 self.references[name].add((start, end))
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
854
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
855 def add_read(self, name):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
856
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
857 self.reads.add(name)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
858
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
859 def add_transcript(self, name):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
860
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
861 self.transcripts.add(name)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
862
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
863 def get_tmp_path(self):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
864
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
865 if self.tmp_dir:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
866 return self.tmp_dir
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
867
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
868 if self.master_tmp:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
869 self.tmp_dir = tempfile.mkdtemp(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
870 dir=os.path.abspath(os.path.expandvars(self.master_tmp)))
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
871 else:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
872 self.tmp_dir = tempfile.mkdtemp()
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
873
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
874 return self.tmp_dir
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
875
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
876 def _distance(self, range1, range2):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
877
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
878 first, second = sorted((range1, range2))
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
879
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
880 if first[0] <= first[1] < second[0]:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
881 return (second[0] - first[1])
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
882 else:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
883 return 0
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
884
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
885 def _delete_temporary_dir(self):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
886
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
887 self.unaligned_fasta_path = None
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
888 if self.tmp_dir:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
889 try:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
890 rmtree(self.tmp_dir)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
891 except OSError:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
892 pass
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
893
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
894 def _get_unaligned_fasta_path(self):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
895
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
896 output_path = os.path.join(self.get_tmp_path(), 'unaligned_region.fa')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
897 logging.debug('Region: writing unaligned fasta to %s' % output_path)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
898
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
899 assert self.reads
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
900
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
901 with open(output_path, 'w', buffering=100 * 1024 * 1024) as output_handler:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
902
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
903 # Cut and write references
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
904 human_positions = []
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
905 pathogen_positions = []
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
906 for length, identifier, start, end in self.get_sorted_reference_positions():
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
907 if ';Homo_sapiens;' in identifier:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
908 human_positions.append((identifier, start, end))
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
909 else:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
910 pathogen_positions.append((identifier, start, end))
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
911
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
912 for identifier, start, end in pathogen_positions + human_positions:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
913 record = self.sequence_proxy.get_reference_record(identifier)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
914 if record:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
915 record = SeqRecord(record.seq, record.id, '', '')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
916 #record.id += ';%i-%i' % (start, end)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
917 SeqIO.write(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
918 [record[start - 1:end]], output_handler, "fasta")
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
919 else:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
920 pass
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
921 #logging.debug('Region: could not retrieve reference %s' % identifier)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
922
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
923 # Write full-length transcripts
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
924 for identifier in self.transcripts:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
925 record = self.sequence_proxy.get_transcript_record(identifier)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
926 if record:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
927 record = SeqRecord(record.seq, record.id, '', '')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
928 SeqIO.write([record], output_handler, "fasta")
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
929 else:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
930 logging.debug(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
931 'Region: could not retrieve reference %s' % identifier)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
932
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
933 # Write reads
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
934 for read_id in sorted(self.reads):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
935 record = self.sequence_proxy.get_read_record(read_id)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
936 if record:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
937 # Transform read ID so that LASTZ can work with it (it
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
938 # makes nicknames based on some characters )
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
939 identifier = record.id.replace(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
940 ':', '_').replace('|', '_').replace('/', '_')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
941 record = SeqRecord(record.seq, identifier, '', '')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
942 SeqIO.write([record], output_handler, "fasta")
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
943 else:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
944 logging.debug(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
945 'Region: could not retrieve read %s' % read_id)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
946
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
947 self.unaligned_fasta_path = output_path
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
948
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
949 return self.unaligned_fasta_path
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
950
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
951 def get_unaligned_fasta_path(self):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
952
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
953 if not self.unaligned_fasta_path:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
954 self._get_unaligned_fasta_path()
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
955
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
956 return self.unaligned_fasta_path
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
957
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
958 def _align_fastas(self, aligner, assert_record=None):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
959
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
960 # Write reference fasta
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
961 queries_path = self.get_unaligned_fasta_path()
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
962 first_record = SeqIO.parse(open(queries_path, "rU"), "fasta").next()
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
963 reference_path = os.path.join(self.get_tmp_path(), 'reference.fa')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
964
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
965 logging.debug(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
966 'Region: writing longest reference to %s' % reference_path)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
967
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
968 SeqIO.write([first_record], reference_path, "fasta")
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
969
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
970 # Do alignment
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
971 bam_path = os.path.join(self.get_tmp_path(), 'aligned.bam')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
972 logging.debug(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
973 'Region: starting alignment using queries %s to sam path %s' %
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
974 (queries_path, bam_path))
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
975
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
976 aligner.align_to_bam_file(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
977 reference_path, queries_path, bam_path, assert_record=assert_record)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
978
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
979 return bam_path
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
980
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
981 def _build_alignment_consensus(self, consensus_builder, alignment_bam_path):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
982
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
983 consensus_path = os.path.join(self.get_tmp_path(), 'consensus.fa')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
984 logging.debug('Region: writing consensus to %s' % consensus_path)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
985
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
986 consensus_builder.run(alignment_bam_path, consensus_path)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
987
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
988 return consensus_path
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
989
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
990 def get_alignment_bam_path(self, aligner, assert_record=None):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
991
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
992 return self._align_fastas(aligner, assert_record)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
993
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
994 def get_consensus_fasta_path(self, consensus_builder, alignment_bam_path):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
995
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
996 return self._build_alignment_consensus(consensus_builder, alignment_bam_path)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
997
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
998 def get_consensus_figure_path(self, jalview_runner, consensus_fasta_path):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
999
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1000 png_path = os.path.join(self.get_tmp_path(), 'consensus_figure.png')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1001
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1002 return jalview_runner.run(consensus_fasta_path, png_path)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1003
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1004 def overlaps(self, other, max_gap_length):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1005
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1006 overlap = (set(self.references) & set(other.references))
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1007
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1008 # Overlap is solely based on non-human references
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1009 overlap = [ref for ref in overlap if not ';Homo_sapiens;' in ref]
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1010
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1011 if not overlap:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1012 return False
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1013
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1014 for reference in overlap:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1015 reflist = self.references[reference]
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1016 for start, end in reflist:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1017 for other_start, other_end in other.references[reference]:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1018 distance = self._distance((start, end),
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1019 (other_start, other_end))
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1020 if distance <= max_gap_length:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1021 return True
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1022 return False
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1023
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1024 def merge(self, other):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1025
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1026 for reference, reflist in other.references.iteritems():
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1027 for (start, end) in reflist:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1028 self.add_reference(reference, start, end)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1029
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1030 for read in other.reads:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1031 self.add_read(read)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1032
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1033 def clean_references(self, max_gap_length):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1034
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1035 for reference, reflist in self.references.iteritems():
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1036
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1037 start_list = sorted(reflist)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1038 saved = list(start_list[0])
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1039 result = set()
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1040
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1041 for item in start_list:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1042 if self._distance(saved, item) <= max_gap_length:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1043 saved[1] = max(saved[1], item[1])
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1044 else:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1045 result.add(tuple(saved))
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1046 saved[0] = item[0]
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1047 saved[1] = item[1]
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1048 result.add(tuple(saved))
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1049 self.references[reference] = result
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1050
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1051 def to_sorted_record_ids(self):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1052
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1053 references = []
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1054 for name, locations in self.references.iteritems():
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1055 for start, end in locations:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1056 references.append(end - start, name)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1057
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1058 reads = defaultdict(list)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1059 for name, locations in self.references.iteritems():
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1060 condition = name.split(';')[1]
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1061 for start, end in locations:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1062 reads[condition].append(end - start, name)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1063
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1064 all_record_ids = sorted(references, reverse=True)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1065 for condition, the_reads in reads.iteritems():
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1066 all_record_ids += sorted(the_reads, reverse=True)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1067
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1068 for length, record_id in all_record_ids:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1069 yield record_id
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1070
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1071 def __len__(self):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1072
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1073 return len(self.reads)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1074
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1075 def __cmp__(self, other):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1076
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1077 if self.get_longest_reference_length() <\
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1078 other.get_longest_reference_length():
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1079 return -1
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1080 elif self.get_longest_reference_length() ==\
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1081 other.get_longest_reference_length():
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1082 return 0
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1083 return 1
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1084
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1085 def get_longest_reference_length(self):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1086
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1087 if self.length is not None:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1088 return self.length
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1089
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1090 positions = self.get_sorted_reference_positions()
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1091 if positions:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1092 self.length = self.get_sorted_reference_positions()[0][0]
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1093 else:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1094 self.length = 0
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1095
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1096 return self.length
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1097
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1098 def get_sorted_reference_positions(self):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1099
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1100 if self.sorted_reference_positions is not None:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1101 return self.sorted_reference_positions
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1102
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1103 lengths = []
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1104 for reference, the_set in self.references.iteritems():
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1105 for start, end in the_set:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1106 lengths.append([end - start, reference, start, end])
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1107
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1108 self.sorted_reference_positions = sorted(lengths, reverse=True)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1109
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1110 return self.sorted_reference_positions
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1111
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1112 def __str__(self):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1113 return '<Region> of length %10i with %10i reads and %5i references' %\
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1114 (self.get_longest_reference_length(),
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1115 len(self.reads), len(self.references))
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1116
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1117
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1118 class RegionStatistics:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1119
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1120 def __init__(self, input_dir):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1121
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1122 self.input_dir = input_dir
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1123 self.stats = []
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1124
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1125 def _add_sequence(self, qualified_name, region_id,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1126 longest_human_reference, longest_pathogen_reference, record):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1127
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1128 state = 'pathogen'
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1129 if longest_human_reference:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1130 state = 'ambiguous'
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1131
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1132 reference_type = qualified_name.split('_')[0]
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1133 family_name = '_'.join(qualified_name.split('_')[1:])
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1134
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1135 read_type, sample_id, number_reads, basepairs = record.id.split(';')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1136
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1137 coverage = int(basepairs) / float(len(longest_pathogen_reference))
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1138
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1139 self.stats.append(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1140 [reference_type, family_name, region_id, sample_id, state,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1141 number_reads, str(len(longest_pathogen_reference)), basepairs, '%.5f' % coverage])
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1142
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1143 def run(self, output_file):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1144
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1145 for qualified_family_name in os.listdir(self.input_dir):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1146
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1147 family_dir = os.path.join(self.input_dir, qualified_family_name)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1148
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1149 if not os.path.isdir(family_dir):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1150 continue
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1151
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1152 consensus_regions = glob.glob(os.path.join(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1153 family_dir, 'region_*_consensus.fa'))
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1154
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1155 for consensus_region in consensus_regions:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1156
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1157 base_bame = os.path.basename(consensus_region)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1158
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1159 region_id = base_bame.split('_')[1]
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1160
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1161 reads = []
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1162 longest_human_reference = None
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1163 longest_pathogen_reference = None
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1164
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1165 for consensus_record in SeqIO.parse(consensus_region, 'fasta'):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1166 if consensus_record.id.lower().startswith('read'):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1167 # ends with read number; cumulative bases
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1168 reads.append(consensus_record)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1169 elif ';Homo_sapiens;' in consensus_record.id:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1170 if longest_human_reference is None or\
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1171 len(consensus_record) > longest_human_reference:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1172 longest_human_reference = consensus_record
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1173 else:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1174 if longest_pathogen_reference is None or\
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1175 len(consensus_record) > longest_pathogen_reference:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1176 longest_pathogen_reference = consensus_record
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1177
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1178 for read in reads:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1179 self._add_sequence(qualified_family_name, region_id,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1180 longest_human_reference, longest_pathogen_reference, read)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1181
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1182 with open(os.path.abspath(os.path.expandvars(output_file)), 'w') as output_handler:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1183 output_handler.write(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1184 '\t'.join(['reference_type', 'family_name', 'region_id',
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1185 'sample_id', 'taxonomic_origin', 'number_reads',
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1186 'region_length', 'basepairs', 'coverage']) + '\n')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1187 for entries in sorted(self.stats):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1188 line = '\t'.join(entries) + '\n'
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1189 output_handler.write(line)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1190
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1191
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1192 @CLI.subcommand("regions")
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1193 class RegionRunner(cli.Application):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1194
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1195 """ Derive homologous groups and regions from hit files """
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1196
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1197 hit_files = cli.SwitchAttr(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1198 ['-v', '--virana_hits'], str, list=True, mandatory=True,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1199 help="Add hit file for analysis. May be supplied multiple times.")
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1200
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1201 lastz_path = cli.SwitchAttr(['-z', '--lastz_path'], str, mandatory=False,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1202 help="Path to lastz executable",
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1203 default='')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1204
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1205 jalview_jar_dir = cli.SwitchAttr(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1206 ['-j', '--jalview_jar_dir'], str, mandatory=False,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1207 help="Directory containing the jalview.jar file",
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1208 default=None)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1209
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1210 references_path = cli.SwitchAttr(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1211 ['-r', '--references'], str, mandatory=True,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1212 help="Input fasta file containing genomic references of pathogens and possibly of human chromosomes as well (the latter is experimental)",
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1213 default='')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1214
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1215 cdna_path = cli.SwitchAttr(['-c', '--cdna'], str, mandatory=False,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1216 help="Input fasta file containing human cDNA records.",
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1217 default=None)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1218
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1219 output_dir = cli.SwitchAttr(['-o', '--output_dir'], str, mandatory=True,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1220 help="Output directory that will be filled with subdirectories corresponding to homologous regions. The directory will be generated if it is not existing.",
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1221 default='')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1222
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1223 tmp_dir = cli.SwitchAttr(['-t', '--tmp_dir'], str, mandatory=False,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1224 help="Directory in which temorary files are stored. Temporary files will be stored in subdirectories of the provided directory and will be deleted after completion.",
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1225 default=None)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1226
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1227 stat_path = cli.SwitchAttr(['-s', '--region_stats'], str, mandatory=False,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1228 help="Path to output statistics tab-delimited text file.",
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1229 default='')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1230
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1231 reference_database_filter = cli.SwitchAttr(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1232 ['-b', '--reference_database_filter'], str, list=True, mandatory=False,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1233 help="Specifies which kind of reference databases are consisdered when extracting hits from hit files. May be specified multiple times. If any are specified, all reference databases not specified are filtered out. By default, this parameter is empty.",
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1234 default=[])
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1235
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1236 pathogen_family_filter = cli.SwitchAttr(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1237 ['-f', '--pathogen_family_filter'], str, list=True, mandatory=False,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1238 help="Specifies which kind of pathogen families are consisdered when extracting hits from hit files. May be specified multiple times. If any are specified, all families not specified are filtered out. By default, this parameter is empty.",
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1239 default=[])
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1240
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1241 min_read_number = cli.SwitchAttr(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1242 ['-m', '--min_read_number'], cli.Range(1, 1000), mandatory=False,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1243 help="Minimum number of reads that are required to be present in homologous region. Regions with fewer reads will be omitted from the results.",
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1244 default=5)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1245
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1246 max_gap_length = cli.SwitchAttr(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1247 ['-l', '--max_gap_length'], cli.Range(1, 1000), mandatory=False,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1248 help="Maximum number bases that two candidate homologous regions are distant from each other with regard to their positions on a common reference sequence in order for being eligable for merging.",
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1249 default=25)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1250
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1251 min_region_length = cli.SwitchAttr(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1252 ['-x', '--min_region_length'], cli.Range(1, 1000), mandatory=False,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1253 help="Minimum number bases of the longest reference sequence of each homologous region that is generated. Shoer regions will be omitted from the results.",
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1254 default=50)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1255
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1256 word_length = cli.SwitchAttr(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1257 ['-w', '--word_length'], cli.Range(1, 21), mandatory=False,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1258 help="Word length of the lastz alignment process. Shorter word lengths allow more sensitive but less specific alignments.",
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1259 default=7)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1260
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1261 ambiguity_cutoff = cli.SwitchAttr(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1262 ['-a', '--ambiguity_cutoff'], float, mandatory=False,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1263 help="Ratio of variant positions within a column of the homologous region sequence alignment so that the corresponding consensus sequence at this positions show a ambiguous base instead of the majority base.",
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1264 default=0.3)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1265
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1266 debug = cli.Flag(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1267 ["-d", "--debug"], help="Enable debug information")
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1268
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1269 def main(self):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1270
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1271 if self.debug:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1272 logging.getLogger().setLevel(logging.DEBUG)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1273
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1274 # Make sequence proxy for managing hit files, references, and cdna
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1275 proxy = SequenceProxy(self.references_path, self.cdna_path)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1276 for hit_file in self.hit_files:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1277 proxy.add_hit_file(hit_file)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1278
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1279 # Generate homologous groups
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1280 generator = GroupGenerator(proxy,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1281 reference_database_filter=self.reference_database_filter,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1282 pathogen_family_filter=self.pathogen_family_filter,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1283 tmp_dir=self.tmp_dir)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1284 groups = generator.get_groups(min_read_number=self.min_read_number)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1285
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1286 # Prepare analysis modules ('runners') for postprocessing homologous
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1287 # regions
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1288 consensus = ConsensusRunner(ambiguity_cutoff=self.ambiguity_cutoff)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1289 lastz = LastzRunner(word_length=self.word_length)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1290
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1291 if self.jalview_jar_dir:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1292 jalview = JalviewRunner(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1293 jalview_jar_dir=self.jalview_jar_dir, tmp_dir=self.tmp_dir)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1294 else:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1295 jalview = None
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1296
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1297 # Make homologous regions within each homologous group
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1298 for name, group in groups.iteritems():
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1299 group.merge_regions(max_gap_length=self.max_gap_length)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1300 group.filter_regions(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1301 min_region_length=self.min_region_length, min_read_number=self.min_read_number)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1302
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1303 if self.cdna_path:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1304 group.add_transcripts_to_regions()
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1305
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1306 group.write_outputs(self.output_dir, lastz, consensus, jalview)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1307 group._delete_temporary_dir()
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1308
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1309 # Run statistics on all homologous regions
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1310 if self.stat_path:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1311 statistics = RegionStatistics(self.output_dir)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1312 statistics.run(self.stat_path)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1313
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1314 if __name__ == "__main__":
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1315 CLI.run()