annotate vhom.py @ 0:3ba5983012cf draft

Uploaded
author mzeidler
date Tue, 24 Sep 2013 10:19:40 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1 #!/usr/bin/env python
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
2
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
3 import sys
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
4 import os
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
5 import logging
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
6 import tempfile
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
7 import shutil
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
8
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
9 import subprocess
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
10 import bz2
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
11 import re
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
12 import glob
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
13
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
14 from shutil import rmtree
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
15 from subprocess import PIPE
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
16 from collections import defaultdict, Counter
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
17
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
18 try:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
19 import pysam
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
20 except ImportError:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
21 message = 'This script requires the pysam python package\n'
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
22 sys.stderr.write(message)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
23 sys.exit(1)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
24
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
25 try:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
26 from plumbum import cli
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
27 except ImportError:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
28 message = 'This script requires the plumbum python package\n'
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
29 sys.stderr.write(message)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
30 sys.exit(1)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
31
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
32 try:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
33 from Bio.SeqRecord import SeqRecord
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
34 from Bio import SeqIO
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
35 from Bio.Seq import Seq
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
36 from Bio.Alphabet import IUPAC, Gapped
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
37 except ImportError:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
38 message = 'This script requires the BioPython python package\n'
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
39 sys.stderr.write(message)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
40 sys.exit(1)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
41
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
42 try:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
43 import HTSeq
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
44 except ImportError:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
45 message = 'This script requires the HTSeq python package\n'
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
46 sys.stderr.write(message)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
47 sys.exit(1)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
48
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
49 logging.basicConfig(level=logging.INFO, format='%(message)s')
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
50
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
51
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
52 class CLI(cli.Application):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
53
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
54 """ Homologous region analysis of hit files"""
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
55 PROGNAME = "vmap"
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
56 VERSION = "1.0.0"
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
57 DESCRIPTION = """"""
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
58 USAGE = """"""
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
59
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
60 def main(self, *args):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
61
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
62 if args:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
63 print("Unknown command %r" % (args[0]))
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
64 print self.USAGE
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
65 return 1
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
66
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
67 if not self.nested_command:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
68 print("No command given")
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
69 print self.USAGE
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
70 return 1
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
71
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
72 class SequenceProxy:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
73
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
74 def __init__(self, references_path, human_transcripts_path=None):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
75
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
76 self.references_path = references_path
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
77 self.human_transcripts_path = human_transcripts_path
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
78
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
79 self.human_transcript_records = None
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
80 self.reference_records = None
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
81 self.hit_records = []
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
82
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
83 self.transcript_ranges = None
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
84
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
85 def add_hit_file(self, hit_file_path):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
86
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
87 logging.debug('SequenceProxy: adding new hit file %s' % hit_file_path)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
88
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
89 hit_handle = bz2.BZ2File(hit_file_path, 'rb')
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
90 try:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
91 hit_handle.read(10)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
92 except IOError:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
93 hit_handle = open(hit_file_path, 'rb')
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
94
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
95 hit_dict = SeqIO.to_dict(SeqIO.parse(hit_handle, "fasta"))
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
96
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
97 logging.debug('SequenceProxy: hit file has %i entries' % len(hit_dict))
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
98
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
99 self.hit_records.append(hit_dict)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
100
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
101 def get_read_record(self, read_id):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
102
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
103 for record_dict in self.hit_records:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
104 if read_id in record_dict:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
105 record = record_dict[read_id]
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
106 return SeqRecord(record.seq, record.id, '', '')
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
107
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
108 return None
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
109
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
110 def get_reference_record(self, identifier):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
111
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
112 if not self.reference_records:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
113 logging.debug('SequenceProxy: indexing reference records')
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
114 self.reference_records = SeqIO.index(self.references_path, 'fasta')
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
115
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
116 try:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
117 record = self.reference_records[identifier]
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
118 return SeqRecord(record.seq, record.id, '', '')
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
119 except KeyError:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
120 return None
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
121
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
122 def get_transcript_record(self, identifier):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
123
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
124 if not self.human_transcripts_path:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
125 return None
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
126
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
127 if not self.human_transcript_records:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
128 logging.debug('SequenceProxy: indexing human transcripts')
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
129 self.human_transcript_records = SeqIO.index(
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
130 self.human_transcripts_path, 'fasta')
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
131
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
132 try:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
133 record = self.human_transcript_records[identifier]
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
134 return SeqRecord(record.seq, record.id, '', '')
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
135
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
136 except KeyError:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
137 return None
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
138
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
139 def get_overlapping_human_transcript_identifiers(self, chromosome, start, end):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
140
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
141 if not self.human_transcripts_path:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
142 return set()
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
143
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
144 if not self.transcript_ranges:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
145 self._set_transcript_ranges()
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
146
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
147 interval = HTSeq.GenomicInterval(chromosome, start, end, '.')
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
148 all_transcript_ids = set()
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
149 for interval, transcript_ids in self.transcript_ranges[interval].steps():
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
150 all_transcript_ids = all_transcript_ids.union(transcript_ids)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
151
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
152 return all_transcript_ids
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
153
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
154 def _set_transcript_ranges(self):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
155
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
156 logging.debug(
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
157 'SequenceProxy: Preparing transcript ranges, please wait...')
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
158
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
159 self.transcript_ranges = HTSeq.GenomicArrayOfSets(
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
160 'auto', stranded=False)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
161
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
162 for record in SeqIO.parse(self.human_transcripts_path, 'fasta'):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
163 transcript_id = record.id
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
164 for exon in record.description.strip().split(' ')[1].split('|'):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
165 chromosome, start, end = exon.split(';')
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
166 start, end = sorted((int(start), int(end)))
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
167 if start == end:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
168 continue
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
169 interval = HTSeq.GenomicInterval(chromosome, start, end, ".")
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
170 self.transcript_ranges[interval] += transcript_id
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
171
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
172
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
173 class JalviewRunner:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
174
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
175 def __init__(self, jalview_jar_dir, tmp_dir=None):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
176
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
177 self.jalview_jar_dir = jalview_jar_dir
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
178
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
179 if tmp_dir:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
180 self.tmp_dir = tempfile.mkdtemp(
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
181 dir=os.path.abspath(os.path.expandvars(tmp_dir)))
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
182 else:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
183 self.tmp_dir = tempfile.mkdtemp()
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
184
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
185 self.config = ['#---JalviewX Properties File---',
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
186 '#Fri Nov 04 14:23:53 CET 2011',
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
187 'FIGURE_AUTOIDWIDTH=true',
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
188 'ID_ITALICS=false',
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
189 'SORT_ALIGNMENT=No sort',
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
190 'SHOW_IDENTITY=true',
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
191 'FONT_NAME=SansSerif',
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
192 'GAP_SYMBOL=.',
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
193 'SHOW_QUALITY=false',
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
194 'SHOW_GROUP_CONSERVATION=false',
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
195 'FONT_STYLE=plain',
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
196 'ANTI_ALIAS=false',
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
197 'SORT_BY_TREE=false',
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
198 'SHOW_CONSENSUS_HISTOGRAM=false',
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
199 'SHOW_OVERVIEW=false',
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
200 'DEFAULT_COLOUR=Nucleotide',
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
201 'SHOW_CONSENSUS_LOGO=false',
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
202 'SHOW_ANNOTATIONS=true',
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
203 'SHOW_UNCONSERVED=true',
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
204 'AUTO_CALC_CONSENSUS=true',
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
205 'PAD_GAPS=true',
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
206 'FONT_SIZE=10',
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
207 'RIGHT_ALIGN_IDS=false',
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
208 'WRAP_ALIGNMENT=false',
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
209 'FASTA_JVSUFFIX=false',
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
210 'PILEUP_JVSUFFIX=false',
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
211 'SHOW_JVSUFFIX=false']
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
212
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
213 def _make_configuration(self, sub_tmp_dir):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
214
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
215 config_path = os.path.join(sub_tmp_dir, 'config.txt')
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
216 logging.debug(
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
217 'JalviewRunner: writing configuration file %s' % config_path)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
218 with open(config_path, 'w') as config_file:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
219 for c in self.config:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
220 config_file.write(c + '\n')
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
221
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
222 return config_path
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
223
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
224 def _delete_tmp_dir(self):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
225
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
226 try:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
227 rmtree(self.tmp_dir)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
228 except OSError:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
229 pass
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
230
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
231 def _delete_sub_tmp_dir(self, sub_tmp_dir):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
232
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
233 try:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
234 rmtree(sub_tmp_dir)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
235 except OSError:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
236 pass
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
237
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
238 def run(self, fasta_path, png_output_path):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
239
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
240 fasta_path = os.path.abspath(os.path.expandvars(fasta_path))
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
241
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
242 png_output_path = os.path.abspath(os.path.expandvars(png_output_path))
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
243
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
244 sub_tmp_dir = tempfile.mkdtemp(dir=self.tmp_dir)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
245 config_path = self._make_configuration(sub_tmp_dir)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
246
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
247 logging.debug(
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
248 'JalviewRunner: running Jalview on fasta %s with temp dir %s' %
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
249 (fasta_path, sub_tmp_dir))
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
250
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
251 cline = ['java', '-Djava.ext.dirs=%s/lib' % self.jalview_jar_dir,
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
252 '-cp %s/jalview.jar' % self.jalview_jar_dir, 'jalview.bin.Jalview',
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
253 '-open', fasta_path, '-nodisplay', '-png', png_output_path,
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
254 '-colour', 'nucleotide', '-props', config_path]
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
255
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
256 # Run sibelia process
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
257 java_process = subprocess.Popen(
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
258 ' '.join(cline), shell=True, stdout=PIPE, stderr=PIPE)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
259
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
260 # Block until streams are closed by the process
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
261 stdout, stderr = java_process.communicate()
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
262
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
263 if stderr:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
264 sys.stderr.write('JalviewRunner: ' + stderr.replace('\n', ' '))
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
265
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
266 self._delete_sub_tmp_dir(sub_tmp_dir)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
267
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
268 return png_output_path
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
269
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
270 def __del__(self):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
271
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
272 self._delete_tmp_dir()
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
273
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
274
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
275 class LastzRunner:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
276
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
277 def __init__(self, lastz_path=None, word_length=7):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
278
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
279 self.lastz_path = lastz_path
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
280 self.word_length = word_length
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
281
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
282 def align_to_bam_file(self, reference_fasta_path, query_fasta_path, output_bam_path, multiple=False, assert_record=None):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
283
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
284 logging.debug('LastzRunner: running on reference %s and query %s' %
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
285 (reference_fasta_path, query_fasta_path))
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
286 output_sam_path = os.path.abspath(
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
287 os.path.expandvars(output_bam_path.replace('.bam', '.sam')))
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
288 output_bam_unsorted_path = os.path.abspath(
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
289 os.path.expandvars(output_bam_path + '.unsorted'))
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
290
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
291 logging.debug(
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
292 'LastzRunner: aligning with output in temporary sam file %s' %
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
293 output_sam_path)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
294 with open(output_sam_path, 'w') as output_sam_handler:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
295 for line in self._align(reference_fasta_path, query_fasta_path, multiple):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
296 output_sam_handler.write(line)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
297
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
298 logging.debug(
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
299 'LastzRunner: transforming sam into unsorted bam file %s' %
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
300 output_bam_unsorted_path)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
301 input_sam_handler = pysam.Samfile(output_sam_path, "r")
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
302 output_bam_file = pysam.Samfile(
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
303 output_bam_unsorted_path, "wb", template=input_sam_handler)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
304
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
305 logging.debug(
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
306 'LastzRunner: copying from sam file to bam file')
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
307 for s in input_sam_handler:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
308 output_bam_file.write(s)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
309 output_bam_file.close()
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
310
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
311 logging.debug('LastzRunner: sorting and indexing bam file %s' %
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
312 output_bam_path)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
313 pysam.sort(output_bam_unsorted_path,
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
314 output_bam_path.replace('.bam', ''))
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
315
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
316 pysam.index(output_bam_path)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
317
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
318 def align_to_samlines(self, reference_fasta_path, query_fasta_path, multiple=False):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
319
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
320 logging.debug('LastzRunner: running on reference %s and query %s' %
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
321 (reference_fasta_path, query_fasta_path))
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
322
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
323 for line in self._align(reference_fasta_path, query_fasta_path, multiple):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
324 yield line
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
325
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
326 def _align(self, reference_fasta_path, query_fasta_path, multiple):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
327
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
328 reference_fasta_path = os.path.abspath(
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
329 os.path.expandvars(reference_fasta_path))
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
330 query_fasta_path = os.path.abspath(
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
331 os.path.expandvars(query_fasta_path))
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
332
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
333 lastz = [self.lastz_path and self.lastz_path or 'lastz']
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
334 if multiple:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
335 cline = lastz + [
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
336 reference_fasta_path +
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
337 '[multiple]', query_fasta_path + '[nameparse=darkspace]',
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
338 '--format=sam', '--strand=both', '--gapped', '--chain',
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
339 '--nogfextend', '--seed=match%i' % self.word_length]
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
340
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
341 else:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
342 cline = lastz + [
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
343 reference_fasta_path, query_fasta_path +
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
344 '[nameparse=darkspace]',
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
345 '--format=sam', '--strand=both', '--gapped',
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
346 '--chain', '--nogfextend', '--seed=match%i' % self.word_length]
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
347
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
348 # Run lastz process
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
349 lastz_process = subprocess.Popen(
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
350 ' '.join(cline), shell=True, stdout=PIPE, stderr=PIPE)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
351
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
352 # Filter multiple read mappings (one to each strand) by only keeping
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
353 # the one with the most matches
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
354 alignments = []
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
355 for i, line in enumerate(iter(lastz_process.stdout.readline, '')):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
356 if line.startswith('@'):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
357 yield line
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
358 elif line.lower().startswith('read'):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
359
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
360 fields = line.split('\t')
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
361 read_name = fields[0]
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
362 read_cigar = fields[5]
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
363 if not alignments:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
364 alignments = [read_name, line, read_cigar]
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
365 continue
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
366 else:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
367 if alignments[0] == read_name:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
368 this_mapping = sum(
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
369 [int(c[:-1]) for c in re.findall('[0-9]*M', read_cigar)])
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
370 other_mapping = sum(
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
371 [int(c[:-1]) for c in re.findall('[0-9]*M', alignments[2])])
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
372
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
373 if other_mapping > this_mapping:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
374 yield alignments[1]
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
375 else:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
376 yield line
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
377 else:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
378 yield line
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
379 alignments = []
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
380 else:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
381 yield line
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
382
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
383
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
384 class ConsensusRunner:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
385
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
386 def __init__(self, ambiguity_cutoff=0.3):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
387
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
388 self.ambiguity_cutoff = ambiguity_cutoff
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
389 self.ambiguity = {
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
390 'GT': 'K', 'AC': 'M', 'AG': 'R', 'CT': 'Y', 'CG': 'S',
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
391 'AT': 'W', 'CGT': 'B', 'ACG': 'V', 'ACT': 'H', 'AGT': 'D',
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
392 'ACGT': 'N'}
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
393
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
394 self.alphabet = Gapped(IUPAC.unambiguous_dna, "-")
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
395
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
396 def run(self, input_sam_path, output_fasta_path):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
397
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
398 samfile = pysam.Samfile(input_sam_path, "rb")
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
399 references = samfile.references
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
400
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
401 logging.debug(
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
402 'ConsensusRunner: writing consensus of bam file %s' % input_sam_path)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
403
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
404 assert len(references) == 1
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
405
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
406 consensus = defaultdict(list)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
407 conditions = {}
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
408
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
409 base_mapper = {'A': 0, 'C': 1, 'G': 2, 'T': 3, 'N': 4, '-': 5}
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
410 ambiguity = self.ambiguity
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
411
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
412 alphabet = Gapped(IUPAC.ambiguous_dna)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
413 frequency_cutoff = self.ambiguity_cutoff
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
414
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
415 # Get all conditions of aligned reads
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
416 for alignedread in samfile.fetch(references[0]):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
417
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
418 name = alignedread.qname
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
419 length = len(alignedread.seq)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
420 condition = None
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
421
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
422 if name.lower().startswith('read'):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
423 condition = ';'.join(name.split(';')[:2])
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
424 else:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
425 condition = ';'.join(name.split(';')[:-1])
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
426
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
427 if condition in conditions:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
428 if name not in conditions[condition][0]:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
429 conditions[condition][0].add(name)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
430 conditions[condition][1] += 1
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
431 conditions[condition][2] += length
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
432 else:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
433 conditions[condition] = [set([name]), 1, length]
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
434
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
435 # Prepare count dictionaries
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
436 consensus = defaultdict(list)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
437
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
438 for i, pileupcolumn in enumerate(samfile.pileup(references[0], max_depth=100000)):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
439
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
440 if i % 1000 == 0:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
441 logging.debug(
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
442 'ConsensusRunner: making consensus at position %i' % i)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
443
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
444 counters = {}
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
445 for condition in conditions:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
446 counters[condition] = [0, 0, 0, 0, 0, 0] # ACGTN-
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
447
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
448 for pileupread in pileupcolumn.pileups:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
449
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
450 alignment = pileupread.alignment
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
451 name = alignment.qname
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
452
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
453 if pileupread.is_del:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
454 base = '-'
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
455 else:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
456 base = alignment.seq[pileupread.qpos].upper()
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
457
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
458 if name.lower().startswith('read'):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
459 condition = ';'.join(name.split(';')[:2])
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
460 counters[condition][base_mapper[base]] += 1
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
461 else:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
462 condition = ';'.join(name.split(';')[:-1])
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
463 counters[condition][base_mapper[base]] += 1
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
464
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
465 for condition, counts in counters.iteritems():
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
466 depth = float(sum(counts))
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
467 bases = []
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
468 a, c, g, t, n, gap = counts
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
469
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
470 if depth > 0:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
471 if (n / depth) >= frequency_cutoff:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
472 consensus[condition].append('N')
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
473 else:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
474 if (a / depth) >= frequency_cutoff:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
475 bases.append('A')
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
476 if (c / depth) >= frequency_cutoff:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
477 bases.append('C')
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
478 if (g / depth) >= frequency_cutoff:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
479 bases.append('G')
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
480 if (t / depth) >= frequency_cutoff:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
481 bases.append('T')
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
482
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
483 if len(bases) > 1:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
484 consensus[condition].append(
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
485 ambiguity[''.join(sorted(bases))])
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
486 elif len(bases) == 1:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
487 consensus[condition].append(bases[0])
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
488 else:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
489 consensus[condition].append('-')
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
490
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
491 # Split consensuses by type
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
492 reference_consensuses = []
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
493 read_consensuses = []
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
494 for condition, sequence_list in consensus.iteritems():
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
495 if condition.lower().startswith('read'):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
496 read_consensuses.append((''.join(sequence_list), condition))
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
497 else:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
498 reference_consensuses.append(
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
499 (''.join(sequence_list), condition))
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
500
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
501 reference_consensuses.sort(reverse=True)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
502 read_consensuses.sort(reverse=True)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
503
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
504 # Get start and end of reads (remove fully gapped positions)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
505 start = None
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
506 end = None
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
507 for read_consensus, condition in read_consensuses:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
508
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
509 # Forward direction
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
510 for i, char in enumerate(read_consensus):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
511 if char != '-':
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
512 if start is None:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
513 start = i
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
514 else:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
515 start = min(start, i)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
516 break
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
517
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
518 # Reverse direction
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
519 for i, char in enumerate(read_consensus[::-1]):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
520 if char != '-':
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
521 if end is None:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
522 end = i
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
523 else:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
524 end = min(end, i)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
525 break
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
526
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
527 if end == 0 or end is None:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
528 end = None
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
529 else:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
530 end = -end
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
531
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
532 # Filter all records by start and end of reads
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
533 reference_consensuses =\
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
534 [(r[start:end], c) for r, c in reference_consensuses]
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
535
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
536 read_consensuses =\
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
537 [(r[start:end], c) for r, c in read_consensuses]
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
538
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
539 # Write consensus records to file
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
540 with open(output_fasta_path, 'w', buffering=100 * 1024 * 1024) as output_handler:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
541 for consensus, condition in read_consensuses + reference_consensuses:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
542 stats = ';'.join(map(str, conditions[condition][1:]))
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
543 record = SeqRecord(
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
544 Seq(consensus, alphabet=alphabet), id=condition + ';' + stats, name='', description='')
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
545 SeqIO.write([record], output_handler, "fasta")
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
546
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
547
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
548 class Group:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
549
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
550 def __init__(self, family_name, qualified_family_name, sequence_proxy, tmp_dir=None):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
551
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
552 self.family_name = family_name
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
553 self.qualified_family_name = qualified_family_name
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
554 self.sequence_proxy = sequence_proxy
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
555
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
556 self.regions = []
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
557 self.fasta_path = None
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
558
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
559 if tmp_dir:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
560 self.tmp_dir = tempfile.mkdtemp(
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
561 dir=os.path.abspath(os.path.expandvars(tmp_dir)))
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
562 else:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
563 self.tmp_dir = tempfile.mkdtemp()
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
564
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
565 def add_region(self, read_id, pathogen_mapping_locations, human_mapping_locations):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
566
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
567 region = Region(self.sequence_proxy, self.tmp_dir)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
568
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
569 region.add_read(read_id)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
570
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
571 for mapping_location in list(pathogen_mapping_locations) + list(human_mapping_locations):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
572 reference = ';'.join(mapping_location[:-2])
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
573 region.add_reference(
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
574 reference, int(mapping_location[-2]), int(mapping_location[-1]))
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
575
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
576 self.regions.append(region)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
577
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
578 def write_outputs(self, output_dir, lastz_runner, consensus_builder, jalview_runner):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
579
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
580 if not self.regions:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
581 logging.debug('Group %s: no regions for family' % self.family_name)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
582 return
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
583
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
584 made_output_dir = False
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
585 for i, region in enumerate(self.regions):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
586
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
587 if not made_output_dir:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
588 output_dir = os.path.abspath(
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
589 os.path.expandvars(os.path.join(output_dir, self.qualified_family_name)))
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
590 logging.debug('Group %s: writing output files to %s' %
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
591 (self.family_name, output_dir))
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
592 try:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
593 os.makedirs(output_dir)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
594 except OSError:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
595 pass
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
596 made_output_dir = True
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
597
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
598 logging.debug('Group %s: writing region number %i files to %s' %
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
599 (self.family_name, i + 1, output_dir))
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
600
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
601 unaligned_path = os.path.join(
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
602 output_dir, 'region_%i_unaligned.fa.bzip2' % (i + 1))
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
603 compressed_unaligned = bz2.BZ2File(unaligned_path, 'wb')
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
604 with open(region.get_unaligned_fasta_path()) as unaligned_fasta:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
605 for line in unaligned_fasta:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
606 compressed_unaligned.write(line)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
607
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
608 bam_path = os.path.join(
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
609 output_dir, 'region_%i_alignment.bam' % (i + 1))
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
610 temporary_alignment_bam_path = region.get_alignment_bam_path(
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
611 lastz_runner, assert_record='Read')
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
612 shutil.copy(temporary_alignment_bam_path, bam_path)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
613
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
614 shutil.copy(
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
615 temporary_alignment_bam_path + '.bai', bam_path + '.bai')
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
616
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
617 consensus_path = os.path.join(
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
618 output_dir, 'region_%i_consensus.fa' % (i + 1))
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
619 temporary_consensus_path = region.get_consensus_fasta_path(
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
620 consensus_builder, temporary_alignment_bam_path)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
621 shutil.copy(temporary_consensus_path, consensus_path)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
622
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
623 if jalview_runner:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
624
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
625 jalview_path = os.path.join(
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
626 output_dir, 'region_%i_consensus.png' % (i + 1))
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
627 temporary_jalview_path = region.get_consensus_figure_path(
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
628 jalview_runner, temporary_consensus_path)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
629 shutil.copy(temporary_jalview_path, jalview_path)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
630
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
631 def _delete_temporary_dir(self):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
632
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
633 self.fasta_path = None
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
634 try:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
635 rmtree(self.tmp_dir)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
636 except OSError:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
637 pass
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
638
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
639 def filter_regions(self, min_region_length=50, min_read_number=5):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
640
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
641 logging.debug('Group %s: filtering %i candidate regions' %
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
642 (self.family_name, len(self.regions)))
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
643
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
644 filtered = [r for r in self.regions if len(
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
645 r) >= min_read_number and r.get_longest_reference_length() >= min_region_length]
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
646
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
647 ordered = sorted(filtered, reverse=True)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
648
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
649 if ordered:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
650 length = ordered[0].length
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
651 else:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
652 length = 0
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
653
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
654 logging.debug(
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
655 'Group %s: %i candidate regions remained after filtering, the longest is %i bp long' %
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
656 (self.family_name, len(ordered), length))
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
657
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
658 self.regions = ordered
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
659
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
660 return ordered
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
661
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
662 def merge_regions(self, max_gap_length):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
663 """ Merges candidate regions into homologous regions. """
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
664
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
665 logging.debug('Group %s: merging %i candidate regions' %
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
666 (self.family_name, len(self.regions)))
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
667
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
668 if len(self.regions) > 1:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
669
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
670 potentially_mergable = self.regions
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
671 not_mergable = []
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
672
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
673 while len(potentially_mergable) > 1:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
674
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
675 merged = False
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
676 current = potentially_mergable[0]
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
677 compared_to = potentially_mergable[1:]
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
678
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
679 for region in compared_to:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
680 if region.overlaps(current, max_gap_length):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
681 region.merge(current)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
682 region.clean_references(max_gap_length)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
683 #logging.debug('Group %s: merged a region. %i potentially mergable candidate regions remaining' % (self.family_name, len(potentially_mergable)))
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
684 potentially_mergable = compared_to
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
685 merged = True
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
686 break
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
687
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
688 if not merged:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
689 not_mergable.append(current)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
690 potentially_mergable = compared_to
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
691 #logging.debug('Group %s: not merged a region. %i potentially mergable candidate regions remaining' % (self.family_name, len(potentially_mergable)))
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
692
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
693 results = not_mergable + potentially_mergable
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
694
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
695 logging.debug('Group %s: merged into %i regions' %
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
696 (self.family_name, len(results)))
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
697
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
698 self.regions = results
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
699
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
700 else:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
701 logging.debug(
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
702 'Group %s: found only 1 region, no mergin necessary' % self.family_name)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
703
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
704 def add_transcripts_to_regions(self):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
705
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
706 logging.debug('Group %s: enriching %i regions with human transcript information'
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
707 % (self.family_name, len(self.regions)))
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
708
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
709 for region in self.regions:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
710
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
711 added_transcripts = 0
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
712
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
713 for identifier, locations in region.references.iteritems():
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
714 chromosome = identifier.split(';')[-1]
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
715
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
716 for start, end in locations:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
717 transcript_identifiers = self.sequence_proxy.get_overlapping_human_transcript_identifiers(
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
718 chromosome, start, end)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
719
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
720 for transcript_identifier in transcript_identifiers:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
721 region.add_transcript(transcript_identifier)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
722 added_transcripts += 1
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
723
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
724 logging.debug('Group %s: added %i transcripts to a region' %
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
725 (self.family_name, added_transcripts))
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
726
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
727 def __str__(self):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
728
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
729 s = "Group %s with %i regions" % (self.family_name, len(self.regions))
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
730
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
731 return s
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
732
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
733
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
734 class GroupGenerator:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
735
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
736 """ Produces Groups from Hitfiles. Reads are assigned to groups based on
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
737 taxonomic families. Transcripts overlapping human read mappings as well
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
738 as genome sequences of pathogens the reads map to are added to the
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
739 Groups.
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
740 """
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
741
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
742 def __init__(self, sequence_proxy, reference_database_filter=None, pathogen_family_filter=None, tmp_dir=None):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
743
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
744 self.sequence_proxy = sequence_proxy
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
745
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
746 self.tmp_path = tmp_dir
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
747 self.reference_database_filter = reference_database_filter
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
748 self.pathogen_family_filter = pathogen_family_filter
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
749
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
750 self.groups = {}
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
751
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
752 if tmp_dir:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
753 self.tmp_dir = tempfile.mkdtemp(
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
754 dir=os.path.abspath(os.path.expandvars(tmp_dir)))
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
755 else:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
756 self.tmp_dir = tempfile.mkdtemp()
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
757
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
758 def get_groups(self, min_read_number=5):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
759
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
760 logging.debug(
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
761 'GroupGenerator: generating groups from %i hit files, please wait...' %
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
762 len(self.sequence_proxy.hit_records))
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
763
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
764 for hit_index in self.sequence_proxy.hit_records:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
765 for read_id, record in hit_index.iteritems():
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
766
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
767 # Extract mapping locations to human DNA, human transcript, and
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
768 # pathogen genomes
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
769 mapping_locations = record.description.strip().split(
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
770 ' ')[1].split('|')
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
771
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
772 pathogen_families = defaultdict(set)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
773 human_mapping_locations = set()
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
774
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
775 for mapping_location in mapping_locations:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
776 fields = mapping_location.split(';')
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
777
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
778 # Convert start and end to integers
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
779 fields[-2] = int(fields[-2]) # start
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
780 fields[-1] = int(fields[-1]) # end
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
781
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
782 # Add mapping locations to human cDNA or human DNA
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
783 if fields[2] == 'Homo_sapiens':
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
784 human_mapping_locations.add(tuple(fields))
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
785
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
786 # Add mapping locations to non-human references
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
787 else:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
788 reference_database, family = fields[:2]
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
789 if self.reference_database_filter and reference_database\
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
790 not in self.reference_database_filter:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
791 continue
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
792 if self.pathogen_family_filter and family\
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
793 not in self.pathogen_family_filter:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
794 continue
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
795 pathogen_families[
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
796 (reference_database, family)].add(tuple(fields))
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
797
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
798 if not pathogen_families:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
799 continue
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
800
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
801 # Process the hits to different pathogen families
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
802 for (reference_database, family), pathogen_locations in pathogen_families.iteritems():
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
803
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
804 qualified_family_name = reference_database + '_' + family
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
805
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
806 # Obtain existing group or make new group
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
807 if qualified_family_name in self.groups:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
808 group = self.groups[qualified_family_name]
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
809 else:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
810 group = Group(family, qualified_family_name,
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
811 self.sequence_proxy, self.tmp_dir)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
812 self.groups[qualified_family_name] = group
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
813
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
814 group.add_region(
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
815 read_id, pathogen_locations, human_mapping_locations)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
816
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
817 # Exclude groups that have collected to few reads
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
818 for qualified_family_name, group in self.groups.items():
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
819 if len(group.regions) < min_read_number:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
820 del self.groups[qualified_family_name]
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
821 else:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
822 logging.debug(
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
823 'GroupGenerator: made new group for family %s' % qualified_family_name)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
824
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
825 return self.groups
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
826
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
827
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
828 class Region:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
829
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
830 def __init__(self, sequence_proxy, tmp_dir=None):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
831
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
832 self.sequence_proxy = sequence_proxy
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
833
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
834 self.references = defaultdict(set)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
835 self.reads = set()
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
836 self.transcripts = set()
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
837 self.length = None
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
838
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
839 self.sorted_reference_positions = None
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
840
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
841 self.master_tmp = tmp_dir
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
842 self.tmp_dir = None
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
843
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
844 self.unaligned_fasta_path = None
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
845 self.alignment_sam_path = None
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
846 self.consensus_fasta_path = None
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
847
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
848 def add_reference(self, name, start, end):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
849
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
850 assert start < end
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
851 self.length = None
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
852 self.longest_reference_id = None
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
853 self.references[name].add((start, end))
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
854
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
855 def add_read(self, name):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
856
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
857 self.reads.add(name)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
858
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
859 def add_transcript(self, name):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
860
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
861 self.transcripts.add(name)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
862
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
863 def get_tmp_path(self):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
864
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
865 if self.tmp_dir:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
866 return self.tmp_dir
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
867
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
868 if self.master_tmp:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
869 self.tmp_dir = tempfile.mkdtemp(
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
870 dir=os.path.abspath(os.path.expandvars(self.master_tmp)))
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
871 else:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
872 self.tmp_dir = tempfile.mkdtemp()
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
873
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
874 return self.tmp_dir
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
875
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
876 def _distance(self, range1, range2):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
877
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
878 first, second = sorted((range1, range2))
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
879
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
880 if first[0] <= first[1] < second[0]:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
881 return (second[0] - first[1])
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
882 else:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
883 return 0
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
884
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
885 def _delete_temporary_dir(self):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
886
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
887 self.unaligned_fasta_path = None
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
888 if self.tmp_dir:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
889 try:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
890 rmtree(self.tmp_dir)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
891 except OSError:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
892 pass
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
893
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
894 def _get_unaligned_fasta_path(self):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
895
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
896 output_path = os.path.join(self.get_tmp_path(), 'unaligned_region.fa')
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
897 logging.debug('Region: writing unaligned fasta to %s' % output_path)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
898
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
899 assert self.reads
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
900
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
901 with open(output_path, 'w', buffering=100 * 1024 * 1024) as output_handler:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
902
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
903 # Cut and write references
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
904 human_positions = []
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
905 pathogen_positions = []
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
906 for length, identifier, start, end in self.get_sorted_reference_positions():
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
907 if ';Homo_sapiens;' in identifier:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
908 human_positions.append((identifier, start, end))
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
909 else:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
910 pathogen_positions.append((identifier, start, end))
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
911
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
912 for identifier, start, end in pathogen_positions + human_positions:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
913 record = self.sequence_proxy.get_reference_record(identifier)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
914 if record:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
915 record = SeqRecord(record.seq, record.id, '', '')
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
916 #record.id += ';%i-%i' % (start, end)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
917 SeqIO.write(
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
918 [record[start - 1:end]], output_handler, "fasta")
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
919 else:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
920 pass
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
921 #logging.debug('Region: could not retrieve reference %s' % identifier)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
922
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
923 # Write full-length transcripts
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
924 for identifier in self.transcripts:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
925 record = self.sequence_proxy.get_transcript_record(identifier)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
926 if record:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
927 record = SeqRecord(record.seq, record.id, '', '')
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
928 SeqIO.write([record], output_handler, "fasta")
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
929 else:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
930 logging.debug(
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
931 'Region: could not retrieve reference %s' % identifier)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
932
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
933 # Write reads
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
934 for read_id in sorted(self.reads):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
935 record = self.sequence_proxy.get_read_record(read_id)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
936 if record:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
937 # Transform read ID so that LASTZ can work with it (it
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
938 # makes nicknames based on some characters )
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
939 identifier = record.id.replace(
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
940 ':', '_').replace('|', '_').replace('/', '_')
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
941 record = SeqRecord(record.seq, identifier, '', '')
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
942 SeqIO.write([record], output_handler, "fasta")
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
943 else:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
944 logging.debug(
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
945 'Region: could not retrieve read %s' % read_id)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
946
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
947 self.unaligned_fasta_path = output_path
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
948
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
949 return self.unaligned_fasta_path
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
950
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
951 def get_unaligned_fasta_path(self):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
952
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
953 if not self.unaligned_fasta_path:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
954 self._get_unaligned_fasta_path()
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
955
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
956 return self.unaligned_fasta_path
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
957
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
958 def _align_fastas(self, aligner, assert_record=None):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
959
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
960 # Write reference fasta
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
961 queries_path = self.get_unaligned_fasta_path()
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
962 first_record = SeqIO.parse(open(queries_path, "rU"), "fasta").next()
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
963 reference_path = os.path.join(self.get_tmp_path(), 'reference.fa')
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
964
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
965 logging.debug(
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
966 'Region: writing longest reference to %s' % reference_path)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
967
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
968 SeqIO.write([first_record], reference_path, "fasta")
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
969
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
970 # Do alignment
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
971 bam_path = os.path.join(self.get_tmp_path(), 'aligned.bam')
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
972 logging.debug(
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
973 'Region: starting alignment using queries %s to sam path %s' %
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
974 (queries_path, bam_path))
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
975
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
976 aligner.align_to_bam_file(
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
977 reference_path, queries_path, bam_path, assert_record=assert_record)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
978
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
979 return bam_path
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
980
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
981 def _build_alignment_consensus(self, consensus_builder, alignment_bam_path):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
982
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
983 consensus_path = os.path.join(self.get_tmp_path(), 'consensus.fa')
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
984 logging.debug('Region: writing consensus to %s' % consensus_path)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
985
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
986 consensus_builder.run(alignment_bam_path, consensus_path)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
987
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
988 return consensus_path
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
989
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
990 def get_alignment_bam_path(self, aligner, assert_record=None):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
991
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
992 return self._align_fastas(aligner, assert_record)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
993
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
994 def get_consensus_fasta_path(self, consensus_builder, alignment_bam_path):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
995
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
996 return self._build_alignment_consensus(consensus_builder, alignment_bam_path)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
997
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
998 def get_consensus_figure_path(self, jalview_runner, consensus_fasta_path):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
999
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1000 png_path = os.path.join(self.get_tmp_path(), 'consensus_figure.png')
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1001
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1002 return jalview_runner.run(consensus_fasta_path, png_path)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1003
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1004 def overlaps(self, other, max_gap_length):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1005
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1006 overlap = (set(self.references) & set(other.references))
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1007
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1008 # Overlap is solely based on non-human references
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1009 overlap = [ref for ref in overlap if not ';Homo_sapiens;' in ref]
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1010
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1011 if not overlap:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1012 return False
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1013
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1014 for reference in overlap:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1015 reflist = self.references[reference]
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1016 for start, end in reflist:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1017 for other_start, other_end in other.references[reference]:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1018 distance = self._distance((start, end),
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1019 (other_start, other_end))
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1020 if distance <= max_gap_length:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1021 return True
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1022 return False
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1023
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1024 def merge(self, other):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1025
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1026 for reference, reflist in other.references.iteritems():
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1027 for (start, end) in reflist:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1028 self.add_reference(reference, start, end)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1029
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1030 for read in other.reads:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1031 self.add_read(read)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1032
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1033 def clean_references(self, max_gap_length):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1034
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1035 for reference, reflist in self.references.iteritems():
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1036
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1037 start_list = sorted(reflist)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1038 saved = list(start_list[0])
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1039 result = set()
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1040
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1041 for item in start_list:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1042 if self._distance(saved, item) <= max_gap_length:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1043 saved[1] = max(saved[1], item[1])
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1044 else:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1045 result.add(tuple(saved))
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1046 saved[0] = item[0]
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1047 saved[1] = item[1]
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1048 result.add(tuple(saved))
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1049 self.references[reference] = result
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1050
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1051 def to_sorted_record_ids(self):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1052
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1053 references = []
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1054 for name, locations in self.references.iteritems():
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1055 for start, end in locations:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1056 references.append(end - start, name)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1057
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1058 reads = defaultdict(list)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1059 for name, locations in self.references.iteritems():
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1060 condition = name.split(';')[1]
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1061 for start, end in locations:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1062 reads[condition].append(end - start, name)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1063
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1064 all_record_ids = sorted(references, reverse=True)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1065 for condition, the_reads in reads.iteritems():
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1066 all_record_ids += sorted(the_reads, reverse=True)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1067
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1068 for length, record_id in all_record_ids:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1069 yield record_id
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1070
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1071 def __len__(self):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1072
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1073 return len(self.reads)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1074
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1075 def __cmp__(self, other):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1076
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1077 if self.get_longest_reference_length() <\
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1078 other.get_longest_reference_length():
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1079 return -1
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1080 elif self.get_longest_reference_length() ==\
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1081 other.get_longest_reference_length():
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1082 return 0
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1083 return 1
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1084
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1085 def get_longest_reference_length(self):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1086
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1087 if self.length is not None:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1088 return self.length
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1089
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1090 positions = self.get_sorted_reference_positions()
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1091 if positions:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1092 self.length = self.get_sorted_reference_positions()[0][0]
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1093 else:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1094 self.length = 0
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1095
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1096 return self.length
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1097
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1098 def get_sorted_reference_positions(self):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1099
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1100 if self.sorted_reference_positions is not None:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1101 return self.sorted_reference_positions
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1102
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1103 lengths = []
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1104 for reference, the_set in self.references.iteritems():
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1105 for start, end in the_set:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1106 lengths.append([end - start, reference, start, end])
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1107
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1108 self.sorted_reference_positions = sorted(lengths, reverse=True)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1109
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1110 return self.sorted_reference_positions
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1111
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1112 def __str__(self):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1113 return '<Region> of length %10i with %10i reads and %5i references' %\
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1114 (self.get_longest_reference_length(),
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1115 len(self.reads), len(self.references))
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1116
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1117
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1118 class RegionStatistics:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1119
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1120 def __init__(self, input_dir):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1121
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1122 self.input_dir = input_dir
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1123 self.stats = []
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1124
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1125 def _add_sequence(self, qualified_name, region_id,
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1126 longest_human_reference, longest_pathogen_reference, record):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1127
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1128 state = 'pathogen'
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1129 if longest_human_reference:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1130 state = 'ambiguous'
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1131
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1132 reference_type = qualified_name.split('_')[0]
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1133 family_name = '_'.join(qualified_name.split('_')[1:])
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1134
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1135 read_type, sample_id, number_reads, basepairs = record.id.split(';')
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1136
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1137 coverage = int(basepairs) / float(len(longest_pathogen_reference))
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1138
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1139 self.stats.append(
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1140 [reference_type, family_name, region_id, sample_id, state,
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1141 number_reads, str(len(longest_pathogen_reference)), basepairs, '%.5f' % coverage])
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1142
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1143 def run(self, output_file):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1144
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1145 for qualified_family_name in os.listdir(self.input_dir):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1146
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1147 family_dir = os.path.join(self.input_dir, qualified_family_name)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1148
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1149 if not os.path.isdir(family_dir):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1150 continue
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1151
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1152 consensus_regions = glob.glob(os.path.join(
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1153 family_dir, 'region_*_consensus.fa'))
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1154
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1155 for consensus_region in consensus_regions:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1156
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1157 base_bame = os.path.basename(consensus_region)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1158
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1159 region_id = base_bame.split('_')[1]
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1160
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1161 reads = []
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1162 longest_human_reference = None
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1163 longest_pathogen_reference = None
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1164
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1165 for consensus_record in SeqIO.parse(consensus_region, 'fasta'):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1166 if consensus_record.id.lower().startswith('read'):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1167 # ends with read number; cumulative bases
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1168 reads.append(consensus_record)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1169 elif ';Homo_sapiens;' in consensus_record.id:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1170 if longest_human_reference is None or\
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1171 len(consensus_record) > longest_human_reference:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1172 longest_human_reference = consensus_record
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1173 else:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1174 if longest_pathogen_reference is None or\
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1175 len(consensus_record) > longest_pathogen_reference:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1176 longest_pathogen_reference = consensus_record
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1177
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1178 for read in reads:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1179 self._add_sequence(qualified_family_name, region_id,
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1180 longest_human_reference, longest_pathogen_reference, read)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1181
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1182 with open(os.path.abspath(os.path.expandvars(output_file)), 'w') as output_handler:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1183 output_handler.write(
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1184 '\t'.join(['reference_type', 'family_name', 'region_id',
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1185 'sample_id', 'taxonomic_origin', 'number_reads',
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1186 'region_length', 'basepairs', 'coverage']) + '\n')
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1187 for entries in sorted(self.stats):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1188 line = '\t'.join(entries) + '\n'
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1189 output_handler.write(line)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1190
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1191
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1192 @CLI.subcommand("regions")
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1193 class RegionRunner(cli.Application):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1194
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1195 """ Derive homologous groups and regions from hit files """
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1196
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1197 hit_files = cli.SwitchAttr(
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1198 ['-v', '--virana_hits'], str, list=True, mandatory=True,
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1199 help="Add hit file for analysis. May be supplied multiple times.")
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1200
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1201 lastz_path = cli.SwitchAttr(['-z', '--lastz_path'], str, mandatory=False,
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1202 help="Path to lastz executable",
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1203 default='')
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1204
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1205 jalview_jar_dir = cli.SwitchAttr(
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1206 ['-j', '--jalview_jar_dir'], str, mandatory=False,
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1207 help="Directory containing the jalview.jar file",
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1208 default=None)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1209
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1210 references_path = cli.SwitchAttr(
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1211 ['-r', '--references'], str, mandatory=True,
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1212 help="Input fasta file containing genomic references of pathogens and possibly of human chromosomes as well (the latter is experimental)",
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1213 default='')
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1214
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1215 cdna_path = cli.SwitchAttr(['-c', '--cdna'], str, mandatory=False,
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1216 help="Input fasta file containing human cDNA records.",
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1217 default=None)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1218
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1219 output_dir = cli.SwitchAttr(['-o', '--output_dir'], str, mandatory=True,
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1220 help="Output directory that will be filled with subdirectories corresponding to homologous regions. The directory will be generated if it is not existing.",
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1221 default='')
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1222
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1223 tmp_dir = cli.SwitchAttr(['-t', '--tmp_dir'], str, mandatory=False,
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1224 help="Directory in which temorary files are stored. Temporary files will be stored in subdirectories of the provided directory and will be deleted after completion.",
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1225 default=None)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1226
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1227 stat_path = cli.SwitchAttr(['-s', '--region_stats'], str, mandatory=False,
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1228 help="Path to output statistics tab-delimited text file.",
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1229 default='')
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1230
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1231 reference_database_filter = cli.SwitchAttr(
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1232 ['-b', '--reference_database_filter'], str, list=True, mandatory=False,
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1233 help="Specifies which kind of reference databases are consisdered when extracting hits from hit files. May be specified multiple times. If any are specified, all reference databases not specified are filtered out. By default, this parameter is empty.",
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1234 default=[])
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1235
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1236 pathogen_family_filter = cli.SwitchAttr(
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1237 ['-f', '--pathogen_family_filter'], str, list=True, mandatory=False,
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1238 help="Specifies which kind of pathogen families are consisdered when extracting hits from hit files. May be specified multiple times. If any are specified, all families not specified are filtered out. By default, this parameter is empty.",
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1239 default=[])
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1240
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1241 min_read_number = cli.SwitchAttr(
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1242 ['-m', '--min_read_number'], cli.Range(1, 1000), mandatory=False,
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1243 help="Minimum number of reads that are required to be present in homologous region. Regions with fewer reads will be omitted from the results.",
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1244 default=5)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1245
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1246 max_gap_length = cli.SwitchAttr(
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1247 ['-l', '--max_gap_length'], cli.Range(1, 1000), mandatory=False,
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1248 help="Maximum number bases that two candidate homologous regions are distant from each other with regard to their positions on a common reference sequence in order for being eligable for merging.",
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1249 default=25)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1250
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1251 min_region_length = cli.SwitchAttr(
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1252 ['-x', '--min_region_length'], cli.Range(1, 1000), mandatory=False,
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1253 help="Minimum number bases of the longest reference sequence of each homologous region that is generated. Shoer regions will be omitted from the results.",
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1254 default=50)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1255
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1256 word_length = cli.SwitchAttr(
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1257 ['-w', '--word_length'], cli.Range(1, 21), mandatory=False,
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1258 help="Word length of the lastz alignment process. Shorter word lengths allow more sensitive but less specific alignments.",
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1259 default=7)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1260
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1261 ambiguity_cutoff = cli.SwitchAttr(
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1262 ['-a', '--ambiguity_cutoff'], float, mandatory=False,
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1263 help="Ratio of variant positions within a column of the homologous region sequence alignment so that the corresponding consensus sequence at this positions show a ambiguous base instead of the majority base.",
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1264 default=0.3)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1265
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1266 debug = cli.Flag(
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1267 ["-d", "--debug"], help="Enable debug information")
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1268
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1269 def main(self):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1270
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1271 if self.debug:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1272 logging.getLogger().setLevel(logging.DEBUG)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1273
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1274 # Make sequence proxy for managing hit files, references, and cdna
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1275 proxy = SequenceProxy(self.references_path, self.cdna_path)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1276 for hit_file in self.hit_files:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1277 proxy.add_hit_file(hit_file)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1278
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1279 # Generate homologous groups
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1280 generator = GroupGenerator(proxy,
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1281 reference_database_filter=self.reference_database_filter,
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1282 pathogen_family_filter=self.pathogen_family_filter,
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1283 tmp_dir=self.tmp_dir)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1284 groups = generator.get_groups(min_read_number=self.min_read_number)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1285
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1286 # Prepare analysis modules ('runners') for postprocessing homologous
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1287 # regions
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1288 consensus = ConsensusRunner(ambiguity_cutoff=self.ambiguity_cutoff)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1289 lastz = LastzRunner(word_length=self.word_length)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1290
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1291 if self.jalview_jar_dir:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1292 jalview = JalviewRunner(
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1293 jalview_jar_dir=self.jalview_jar_dir, tmp_dir=self.tmp_dir)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1294 else:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1295 jalview = None
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1296
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1297 # Make homologous regions within each homologous group
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1298 for name, group in groups.iteritems():
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1299 group.merge_regions(max_gap_length=self.max_gap_length)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1300 group.filter_regions(
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1301 min_region_length=self.min_region_length, min_read_number=self.min_read_number)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1302
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1303 if self.cdna_path:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1304 group.add_transcripts_to_regions()
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1305
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1306 group.write_outputs(self.output_dir, lastz, consensus, jalview)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1307 group._delete_temporary_dir()
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1308
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1309 # Run statistics on all homologous regions
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1310 if self.stat_path:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1311 statistics = RegionStatistics(self.output_dir)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1312 statistics.run(self.stat_path)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1313
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1314 if __name__ == "__main__":
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1315 CLI.run()