0
|
1 import cPickle
|
|
2 import numpy
|
|
3 import os
|
|
4 import string
|
|
5 import sys
|
|
6
|
|
7 COMPLEMENT = lambda s: s.translate(string.maketrans('ATCGatcgNnXx', 'TAGCtagcNnXx'))
|
|
8 MAGIC = "@flattened@"
|
|
9
|
|
10
|
|
11 def ext_is_flat(ext):
|
|
12 fh = open(ext)
|
|
13 t = fh.read(len(MAGIC))
|
|
14 fh.close()
|
|
15 return MAGIC == t
|
|
16
|
|
17
|
|
18 def is_up_to_date(a, b):
|
|
19 return os.path.exists(a) and os.stat(a).st_mtime >= os.stat(b).st_mtime
|
|
20
|
|
21
|
|
22 class FastaRecord(object):
|
|
23 __slots__ = ('fh', 'start', 'stop')
|
|
24 ext = ".flat"
|
|
25 idx = ".gdx"
|
|
26
|
|
27 def __init__(self, fh, start, stop):
|
|
28 self.fh = fh
|
|
29 self.stop = stop
|
|
30 self.start = start
|
|
31
|
|
32 @property
|
|
33 def __array_interface__(self):
|
|
34 return {'shape': (len(self), ),
|
|
35 'typestr': '|S1',
|
|
36 'version': 3,
|
|
37 'data': buffer(self)}
|
|
38
|
|
39 def __getitem__(self, islice):
|
|
40 fh = self.fh
|
|
41 fh.seek(self.start)
|
|
42 if isinstance(islice, (int, long)):
|
|
43 if islice < 0:
|
|
44 if -islice > self.stop - self.start:
|
|
45 raise IndexError
|
|
46 fh.seek(self.stop + islice)
|
|
47 else:
|
|
48 fh.seek(self.start + islice)
|
|
49 return fh.read(1)
|
|
50 if islice.start in (0, None) and islice.stop in (None, sys.maxint):
|
|
51 if islice.step in (1, None):
|
|
52 return fh.read(self.stop - self.start)
|
|
53 return fh.read(self.stop - self.start)[::islice.step]
|
|
54 istart, istop = self._adjust_slice(islice)
|
|
55 if istart is None:
|
|
56 return ""
|
|
57 l = istop - istart
|
|
58 if l == 0:
|
|
59 return ""
|
|
60 fh.seek(istart)
|
|
61 if islice.step in (1, None):
|
|
62 return fh.read(l)
|
|
63 return fh.read(l)[::islice.step]
|
|
64
|
|
65 def __len__(self):
|
|
66 return self.stop - self.start
|
|
67
|
|
68 def __repr__(self):
|
|
69 return "%s('%s', %i..%i)" % (self.__class__.__name__,
|
|
70 self.fh.name,
|
|
71 self.start,
|
|
72 self.stop)
|
|
73
|
|
74 def __str__(self):
|
|
75 return self[:]
|
|
76
|
|
77 def _adjust_slice(self, islice):
|
|
78 if islice.start is not None and islice.start < 0:
|
|
79 istart = self.stop + islice.start
|
|
80 else:
|
|
81 if islice.start is None:
|
|
82 istart = self.start
|
|
83 else:
|
|
84 istart = self.start + islice.start
|
|
85 if islice.stop is not None and islice.stop < 0:
|
|
86 istop = self.stop + islice.stop
|
|
87 else:
|
|
88 istop = islice.stop is None and self.stop or (self.start + islice.stop)
|
|
89 # This will give empty string.
|
|
90 if istart > self.stop:
|
|
91 return self.stop, self.stop
|
|
92 if istart < self.start:
|
|
93 istart = self.start
|
|
94 if istop < self.start:
|
|
95 istop = self.start
|
|
96 elif istop > self.stop:
|
|
97 istop = self.stop
|
|
98 return istart, istop
|
|
99
|
|
100 @classmethod
|
|
101 def copy_inplace(klass, flat_name, fasta_name):
|
|
102 """
|
|
103 Overwrite the .fasta file with the .fasta.flat file and save
|
|
104 something in the .flat as a place-holder.
|
|
105 """
|
|
106 os.rename(flat_name, fasta_name)
|
|
107 # Still need the flattened file to show it's current.
|
|
108 flatfh = open(fasta_name + klass.ext, 'wb')
|
|
109 flatfh.write(MAGIC)
|
|
110 flatfh.close()
|
|
111
|
|
112 @classmethod
|
|
113 def is_current(klass, fasta_name):
|
|
114 utd = is_up_to_date(fasta_name + klass.idx, fasta_name)
|
|
115 if not utd:
|
|
116 return False
|
|
117 return is_up_to_date(fasta_name + klass.ext, fasta_name)
|
|
118
|
|
119 @classmethod
|
|
120 def modify_flat(klass, flat_file):
|
|
121 return open(flat_file, 'rb')
|
|
122
|
|
123 @classmethod
|
|
124 def prepare(klass, fasta_obj, seqinfo_generator, flatten_inplace):
|
|
125 """
|
|
126 Returns the __getitem__'able index. and the thing from which to get the seqs.
|
|
127 """
|
|
128 f = fasta_obj.fasta_name
|
|
129 if klass.is_current(f):
|
|
130 fh = open(f + klass.idx, 'rb')
|
|
131 idx = cPickle.load(fh)
|
|
132 fh.close()
|
|
133 if flatten_inplace or ext_is_flat(f + klass.ext):
|
|
134 flat = klass.modify_flat(f)
|
|
135 else:
|
|
136 flat = klass.modify_flat(f + klass.ext)
|
|
137 if flatten_inplace and not ext_is_flat(f + klass.ext):
|
|
138 del flat
|
|
139 else:
|
|
140 return idx, flat
|
|
141 idx = {}
|
|
142 flatfh = open(f + klass.ext, 'wb')
|
|
143 for i, (seqid, seq) in enumerate(seqinfo_generator):
|
|
144 if flatten_inplace:
|
|
145 if i == 0:
|
|
146 flatfh.write('>%s\n' % seqid)
|
|
147 else:
|
|
148 flatfh.write('\n>%s\n' % seqid)
|
|
149 start = flatfh.tell()
|
|
150 flatfh.write(seq)
|
|
151 stop = flatfh.tell()
|
|
152 idx[seqid] = (start, stop)
|
|
153 flatfh.close()
|
|
154 if flatten_inplace:
|
|
155 klass.copy_inplace(flatfh.name, f)
|
|
156 fh = open(f + klass.idx, 'wb')
|
|
157 cPickle.dump(idx, fh, -1)
|
|
158 fh.close()
|
|
159 return idx, klass.modify_flat(f)
|
|
160 fh = open(f + klass.idx, 'wb')
|
|
161 cPickle.dump(idx, fh, -1)
|
|
162 fh.close()
|
|
163 return idx, klass.modify_flat(f + klass.ext)
|
|
164
|
|
165
|
|
166 class NpyFastaRecord(FastaRecord):
|
|
167 __slots__ = ('start', 'stop', 'mm', 'as_string')
|
|
168
|
|
169 def __init__(self, mm, start, stop, as_string=True):
|
|
170 self.mm = mm
|
|
171 self.start = start
|
|
172 self.stop = stop
|
|
173 self.as_string = as_string
|
|
174
|
|
175 def __repr__(self):
|
|
176 return "%s(%i..%i)" % (self.__class__.__name__, self.start, self.stop)
|
|
177
|
|
178 @classmethod
|
|
179 def modify_flat(klass, flat_file):
|
|
180 mm = numpy.memmap(flat_file, dtype="S1", mode="r")
|
|
181 return mm
|
|
182
|
|
183 def getdata(self, islice):
|
|
184 if isinstance(islice, (int, long)):
|
|
185 if islice >= 0:
|
|
186 islice += self.start
|
|
187 else:
|
|
188 islice += self.stop
|
|
189 if islice < 0:
|
|
190 raise IndexError
|
|
191 return self.mm[islice]
|
|
192 start, stop = self._adjust_slice(islice)
|
|
193 return self.mm[start:stop:islice.step]
|
|
194
|
|
195 def __getitem__(self, islice):
|
|
196 d = self.getdata(islice)
|
|
197 return d.tostring() if self.as_string else d
|
|
198
|
|
199 @property
|
|
200 def __array_interface__(self):
|
|
201 return {'shape': (len(self), ),
|
|
202 'typestr': '|S1',
|
|
203 'version': 3,
|
|
204 'data': self[:]}
|
|
205
|
|
206
|
|
207 class DuplicateHeaderException(Exception):
|
|
208
|
|
209 def __init__(self, header):
|
|
210 Exception.__init__(self, 'headers must be unique: %s is duplicated' % header)
|
|
211
|
|
212
|
|
213 class Fasta(dict):
|
|
214
|
|
215 def __init__(self, fasta_name, record_class=NpyFastaRecord, flatten_inplace=False, key_fn=None):
|
|
216 self.fasta_name = fasta_name
|
|
217 self.record_class = record_class
|
|
218 self.key_fn = key_fn
|
|
219 self.index, self.prepared = self.record_class.prepare(self,
|
|
220 self.gen_seqs_with_headers(key_fn),
|
|
221 flatten_inplace)
|
|
222 self.chr = {}
|
|
223
|
|
224 def __contains__(self, key):
|
|
225 return key in self.index
|
|
226
|
|
227 def __getitem__(self, i):
|
|
228 # Implements the lazy loading
|
|
229 if self.key_fn is not None:
|
|
230 i = self.key_fn(i)
|
|
231 if i in self.chr:
|
|
232 return self.chr[i]
|
|
233 c = self.index[i]
|
|
234 self.chr[i] = self.record_class(self.prepared, c[0], c[1])
|
|
235 return self.chr[i]
|
|
236
|
|
237 def __len__(self):
|
|
238 return len(self.index)
|
|
239
|
|
240 @classmethod
|
|
241 def as_kmers(klass, seq, k, overlap=0):
|
|
242 kmax = len(seq)
|
|
243 assert overlap < k, ('overlap must be < kmer length')
|
|
244 i = 0
|
|
245 while i < kmax:
|
|
246 yield i, seq[i:i + k]
|
|
247 i += k - overlap
|
|
248
|
|
249 def gen_seqs_with_headers(self, key_fn=None):
|
|
250 """
|
|
251 Remove all newlines from the sequence in a fasta file
|
|
252 and generate starts, stops to be used by the record class.
|
|
253 """
|
|
254 fh = open(self.fasta_name, 'r')
|
|
255 # Do the flattening (remove newlines)
|
|
256 # check of unique-ness of headers.
|
|
257 seen_headers = {}
|
|
258 header = None
|
|
259 seqs = None
|
|
260 for line in fh:
|
|
261 line = line.rstrip()
|
|
262 if not line:
|
|
263 continue
|
|
264 if line[0] == ">":
|
|
265 if seqs is not None:
|
|
266 if header in seen_headers:
|
|
267 raise DuplicateHeaderException(header)
|
|
268 seen_headers[header] = None
|
|
269 yield header, "".join(seqs)
|
|
270 header = line[1:].strip()
|
|
271 if key_fn is not None:
|
|
272 header = key_fn(header)
|
|
273 seqs = []
|
|
274 else:
|
|
275 seqs.append(line)
|
|
276 if seqs != []:
|
|
277 if header in seen_headers:
|
|
278 raise DuplicateHeaderException(header)
|
|
279 yield header, "".join(seqs)
|
|
280 fh.close()
|
|
281
|
|
282 def iterkeys(self):
|
|
283 for k in self.index.iterkeys():
|
|
284 yield k
|
|
285
|
|
286 def iteritems(self):
|
|
287 for k in self.keys():
|
|
288 yield k, self[k]
|
|
289
|
|
290 def keys(self):
|
|
291 return self.index.keys()
|
|
292
|
|
293 def sequence(self, f, asstring=True, auto_rc=True, exon_keys=None, one_based=True):
|
|
294 """
|
|
295 Take a feature and use the start/stop or exon_keys to return the sequence from the
|
|
296 associated fasta file. By default, just return the full sequence between start
|
|
297 and stop, but if exon_keys is set to an iterable, search for those keys and use the
|
|
298 first to create a sequence and return the concatenated result.
|
|
299
|
|
300 Note that sequence is 2 characters shorter than the entire feature, to account for
|
|
301 the introns at base-pairs 12 and 16.
|
|
302
|
|
303 Also note, this function looks for an item with key of 'rnas'. If one is not found,
|
|
304 it continues on to 'exons'. If it doesn't find any of the exon keys it will fall
|
|
305 back on the start, stop of the feature:
|
|
306
|
|
307 f: a feature, a feature can have exons.
|
|
308 asstring: if true, return the sequence as a string, if false, return as a numpy array
|
|
309 auto_rc : if True and the strand of the feature == -1, returnthe reverse complement of the sequence
|
|
310 one_based: if true, query is using 1 based closed intervals, if false semi-open zero based intervals
|
|
311 """
|
|
312 assert 'chr' in f and f['chr'] in self, (f, f['chr'], self.keys())
|
|
313 fasta = self[f['chr']]
|
|
314 sequence = None
|
|
315 if exon_keys is not None:
|
|
316 sequence = self._seq_from_keys(f, fasta, exon_keys, one_based=one_based)
|
|
317 if sequence is None:
|
|
318 start = f['start'] - int(one_based)
|
|
319 sequence = fasta[start: f['stop']]
|
|
320 if auto_rc and f.get('strand') in (-1, '-1', '-'):
|
|
321 sequence = COMPLEMENT(sequence)[::-1]
|
|
322 if asstring:
|
|
323 return sequence
|
|
324 return numpy.array(sequence, dtype='c')
|
|
325
|
|
326 def _seq_from_keys(self, f, fasta, exon_keys, base='locations', one_based=True):
|
|
327 """
|
|
328 f: a feature dict
|
|
329 fasta: a Fasta object
|
|
330 exon_keys: an iterable of keys, to look for start/stop arrays to get sequence.
|
|
331 base: if base ('locations') exists, look there fore the exon_keys, not in the base
|
|
332 of the object.
|
|
333 """
|
|
334 fbase = f.get(base, f)
|
|
335 for ek in exon_keys:
|
|
336 if ek not in fbase:
|
|
337 continue
|
|
338 locs = fbase[ek]
|
|
339 seq = ""
|
|
340 for start, stop in locs:
|
|
341 seq += fasta[start - int(one_based):stop]
|
|
342 return seq
|
|
343 return None
|