comparison fasta_extract_utils.py @ 0:bc3f2a5c7b53 draft

Uploaded
author greg
date Sun, 10 Jan 2016 13:03:12 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:bc3f2a5c7b53
1 import cPickle
2 import numpy
3 import os
4 import string
5 import sys
6
7 COMPLEMENT = lambda s: s.translate(string.maketrans('ATCGatcgNnXx', 'TAGCtagcNnXx'))
8 MAGIC = "@flattened@"
9
10
11 def ext_is_flat(ext):
12 fh = open(ext)
13 t = fh.read(len(MAGIC))
14 fh.close()
15 return MAGIC == t
16
17
18 def is_up_to_date(a, b):
19 return os.path.exists(a) and os.stat(a).st_mtime >= os.stat(b).st_mtime
20
21
22 class FastaRecord(object):
23 __slots__ = ('fh', 'start', 'stop')
24 ext = ".flat"
25 idx = ".gdx"
26
27 def __init__(self, fh, start, stop):
28 self.fh = fh
29 self.stop = stop
30 self.start = start
31
32 @property
33 def __array_interface__(self):
34 return {'shape': (len(self), ),
35 'typestr': '|S1',
36 'version': 3,
37 'data': buffer(self)}
38
39 def __getitem__(self, islice):
40 fh = self.fh
41 fh.seek(self.start)
42 if isinstance(islice, (int, long)):
43 if islice < 0:
44 if -islice > self.stop - self.start:
45 raise IndexError
46 fh.seek(self.stop + islice)
47 else:
48 fh.seek(self.start + islice)
49 return fh.read(1)
50 if islice.start in (0, None) and islice.stop in (None, sys.maxint):
51 if islice.step in (1, None):
52 return fh.read(self.stop - self.start)
53 return fh.read(self.stop - self.start)[::islice.step]
54 istart, istop = self._adjust_slice(islice)
55 if istart is None:
56 return ""
57 l = istop - istart
58 if l == 0:
59 return ""
60 fh.seek(istart)
61 if islice.step in (1, None):
62 return fh.read(l)
63 return fh.read(l)[::islice.step]
64
65 def __len__(self):
66 return self.stop - self.start
67
68 def __repr__(self):
69 return "%s('%s', %i..%i)" % (self.__class__.__name__,
70 self.fh.name,
71 self.start,
72 self.stop)
73
74 def __str__(self):
75 return self[:]
76
77 def _adjust_slice(self, islice):
78 if islice.start is not None and islice.start < 0:
79 istart = self.stop + islice.start
80 else:
81 if islice.start is None:
82 istart = self.start
83 else:
84 istart = self.start + islice.start
85 if islice.stop is not None and islice.stop < 0:
86 istop = self.stop + islice.stop
87 else:
88 istop = islice.stop is None and self.stop or (self.start + islice.stop)
89 # This will give empty string.
90 if istart > self.stop:
91 return self.stop, self.stop
92 if istart < self.start:
93 istart = self.start
94 if istop < self.start:
95 istop = self.start
96 elif istop > self.stop:
97 istop = self.stop
98 return istart, istop
99
100 @classmethod
101 def copy_inplace(klass, flat_name, fasta_name):
102 """
103 Overwrite the .fasta file with the .fasta.flat file and save
104 something in the .flat as a place-holder.
105 """
106 os.rename(flat_name, fasta_name)
107 # Still need the flattened file to show it's current.
108 flatfh = open(fasta_name + klass.ext, 'wb')
109 flatfh.write(MAGIC)
110 flatfh.close()
111
112 @classmethod
113 def is_current(klass, fasta_name):
114 utd = is_up_to_date(fasta_name + klass.idx, fasta_name)
115 if not utd:
116 return False
117 return is_up_to_date(fasta_name + klass.ext, fasta_name)
118
119 @classmethod
120 def modify_flat(klass, flat_file):
121 return open(flat_file, 'rb')
122
123 @classmethod
124 def prepare(klass, fasta_obj, seqinfo_generator, flatten_inplace):
125 """
126 Returns the __getitem__'able index. and the thing from which to get the seqs.
127 """
128 f = fasta_obj.fasta_name
129 if klass.is_current(f):
130 fh = open(f + klass.idx, 'rb')
131 idx = cPickle.load(fh)
132 fh.close()
133 if flatten_inplace or ext_is_flat(f + klass.ext):
134 flat = klass.modify_flat(f)
135 else:
136 flat = klass.modify_flat(f + klass.ext)
137 if flatten_inplace and not ext_is_flat(f + klass.ext):
138 del flat
139 else:
140 return idx, flat
141 idx = {}
142 flatfh = open(f + klass.ext, 'wb')
143 for i, (seqid, seq) in enumerate(seqinfo_generator):
144 if flatten_inplace:
145 if i == 0:
146 flatfh.write('>%s\n' % seqid)
147 else:
148 flatfh.write('\n>%s\n' % seqid)
149 start = flatfh.tell()
150 flatfh.write(seq)
151 stop = flatfh.tell()
152 idx[seqid] = (start, stop)
153 flatfh.close()
154 if flatten_inplace:
155 klass.copy_inplace(flatfh.name, f)
156 fh = open(f + klass.idx, 'wb')
157 cPickle.dump(idx, fh, -1)
158 fh.close()
159 return idx, klass.modify_flat(f)
160 fh = open(f + klass.idx, 'wb')
161 cPickle.dump(idx, fh, -1)
162 fh.close()
163 return idx, klass.modify_flat(f + klass.ext)
164
165
166 class NpyFastaRecord(FastaRecord):
167 __slots__ = ('start', 'stop', 'mm', 'as_string')
168
169 def __init__(self, mm, start, stop, as_string=True):
170 self.mm = mm
171 self.start = start
172 self.stop = stop
173 self.as_string = as_string
174
175 def __repr__(self):
176 return "%s(%i..%i)" % (self.__class__.__name__, self.start, self.stop)
177
178 @classmethod
179 def modify_flat(klass, flat_file):
180 mm = numpy.memmap(flat_file, dtype="S1", mode="r")
181 return mm
182
183 def getdata(self, islice):
184 if isinstance(islice, (int, long)):
185 if islice >= 0:
186 islice += self.start
187 else:
188 islice += self.stop
189 if islice < 0:
190 raise IndexError
191 return self.mm[islice]
192 start, stop = self._adjust_slice(islice)
193 return self.mm[start:stop:islice.step]
194
195 def __getitem__(self, islice):
196 d = self.getdata(islice)
197 return d.tostring() if self.as_string else d
198
199 @property
200 def __array_interface__(self):
201 return {'shape': (len(self), ),
202 'typestr': '|S1',
203 'version': 3,
204 'data': self[:]}
205
206
207 class DuplicateHeaderException(Exception):
208
209 def __init__(self, header):
210 Exception.__init__(self, 'headers must be unique: %s is duplicated' % header)
211
212
213 class Fasta(dict):
214
215 def __init__(self, fasta_name, record_class=NpyFastaRecord, flatten_inplace=False, key_fn=None):
216 self.fasta_name = fasta_name
217 self.record_class = record_class
218 self.key_fn = key_fn
219 self.index, self.prepared = self.record_class.prepare(self,
220 self.gen_seqs_with_headers(key_fn),
221 flatten_inplace)
222 self.chr = {}
223
224 def __contains__(self, key):
225 return key in self.index
226
227 def __getitem__(self, i):
228 # Implements the lazy loading
229 if self.key_fn is not None:
230 i = self.key_fn(i)
231 if i in self.chr:
232 return self.chr[i]
233 c = self.index[i]
234 self.chr[i] = self.record_class(self.prepared, c[0], c[1])
235 return self.chr[i]
236
237 def __len__(self):
238 return len(self.index)
239
240 @classmethod
241 def as_kmers(klass, seq, k, overlap=0):
242 kmax = len(seq)
243 assert overlap < k, ('overlap must be < kmer length')
244 i = 0
245 while i < kmax:
246 yield i, seq[i:i + k]
247 i += k - overlap
248
249 def gen_seqs_with_headers(self, key_fn=None):
250 """
251 Remove all newlines from the sequence in a fasta file
252 and generate starts, stops to be used by the record class.
253 """
254 fh = open(self.fasta_name, 'r')
255 # Do the flattening (remove newlines)
256 # check of unique-ness of headers.
257 seen_headers = {}
258 header = None
259 seqs = None
260 for line in fh:
261 line = line.rstrip()
262 if not line:
263 continue
264 if line[0] == ">":
265 if seqs is not None:
266 if header in seen_headers:
267 raise DuplicateHeaderException(header)
268 seen_headers[header] = None
269 yield header, "".join(seqs)
270 header = line[1:].strip()
271 if key_fn is not None:
272 header = key_fn(header)
273 seqs = []
274 else:
275 seqs.append(line)
276 if seqs != []:
277 if header in seen_headers:
278 raise DuplicateHeaderException(header)
279 yield header, "".join(seqs)
280 fh.close()
281
282 def iterkeys(self):
283 for k in self.index.iterkeys():
284 yield k
285
286 def iteritems(self):
287 for k in self.keys():
288 yield k, self[k]
289
290 def keys(self):
291 return self.index.keys()
292
293 def sequence(self, f, asstring=True, auto_rc=True, exon_keys=None, one_based=True):
294 """
295 Take a feature and use the start/stop or exon_keys to return the sequence from the
296 associated fasta file. By default, just return the full sequence between start
297 and stop, but if exon_keys is set to an iterable, search for those keys and use the
298 first to create a sequence and return the concatenated result.
299
300 Note that sequence is 2 characters shorter than the entire feature, to account for
301 the introns at base-pairs 12 and 16.
302
303 Also note, this function looks for an item with key of 'rnas'. If one is not found,
304 it continues on to 'exons'. If it doesn't find any of the exon keys it will fall
305 back on the start, stop of the feature:
306
307 f: a feature, a feature can have exons.
308 asstring: if true, return the sequence as a string, if false, return as a numpy array
309 auto_rc : if True and the strand of the feature == -1, returnthe reverse complement of the sequence
310 one_based: if true, query is using 1 based closed intervals, if false semi-open zero based intervals
311 """
312 assert 'chr' in f and f['chr'] in self, (f, f['chr'], self.keys())
313 fasta = self[f['chr']]
314 sequence = None
315 if exon_keys is not None:
316 sequence = self._seq_from_keys(f, fasta, exon_keys, one_based=one_based)
317 if sequence is None:
318 start = f['start'] - int(one_based)
319 sequence = fasta[start: f['stop']]
320 if auto_rc and f.get('strand') in (-1, '-1', '-'):
321 sequence = COMPLEMENT(sequence)[::-1]
322 if asstring:
323 return sequence
324 return numpy.array(sequence, dtype='c')
325
326 def _seq_from_keys(self, f, fasta, exon_keys, base='locations', one_based=True):
327 """
328 f: a feature dict
329 fasta: a Fasta object
330 exon_keys: an iterable of keys, to look for start/stop arrays to get sequence.
331 base: if base ('locations') exists, look there fore the exon_keys, not in the base
332 of the object.
333 """
334 fbase = f.get(base, f)
335 for ek in exon_keys:
336 if ek not in fbase:
337 continue
338 locs = fbase[ek]
339 seq = ""
340 for start, stop in locs:
341 seq += fasta[start - int(one_based):stop]
342 return seq
343 return None