Mercurial > repos > greg > fasta_extract
comparison fasta_extract_utils.py @ 0:bc3f2a5c7b53 draft
Uploaded
author | greg |
---|---|
date | Sun, 10 Jan 2016 13:03:12 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:bc3f2a5c7b53 |
---|---|
1 import cPickle | |
2 import numpy | |
3 import os | |
4 import string | |
5 import sys | |
6 | |
7 COMPLEMENT = lambda s: s.translate(string.maketrans('ATCGatcgNnXx', 'TAGCtagcNnXx')) | |
8 MAGIC = "@flattened@" | |
9 | |
10 | |
11 def ext_is_flat(ext): | |
12 fh = open(ext) | |
13 t = fh.read(len(MAGIC)) | |
14 fh.close() | |
15 return MAGIC == t | |
16 | |
17 | |
18 def is_up_to_date(a, b): | |
19 return os.path.exists(a) and os.stat(a).st_mtime >= os.stat(b).st_mtime | |
20 | |
21 | |
22 class FastaRecord(object): | |
23 __slots__ = ('fh', 'start', 'stop') | |
24 ext = ".flat" | |
25 idx = ".gdx" | |
26 | |
27 def __init__(self, fh, start, stop): | |
28 self.fh = fh | |
29 self.stop = stop | |
30 self.start = start | |
31 | |
32 @property | |
33 def __array_interface__(self): | |
34 return {'shape': (len(self), ), | |
35 'typestr': '|S1', | |
36 'version': 3, | |
37 'data': buffer(self)} | |
38 | |
39 def __getitem__(self, islice): | |
40 fh = self.fh | |
41 fh.seek(self.start) | |
42 if isinstance(islice, (int, long)): | |
43 if islice < 0: | |
44 if -islice > self.stop - self.start: | |
45 raise IndexError | |
46 fh.seek(self.stop + islice) | |
47 else: | |
48 fh.seek(self.start + islice) | |
49 return fh.read(1) | |
50 if islice.start in (0, None) and islice.stop in (None, sys.maxint): | |
51 if islice.step in (1, None): | |
52 return fh.read(self.stop - self.start) | |
53 return fh.read(self.stop - self.start)[::islice.step] | |
54 istart, istop = self._adjust_slice(islice) | |
55 if istart is None: | |
56 return "" | |
57 l = istop - istart | |
58 if l == 0: | |
59 return "" | |
60 fh.seek(istart) | |
61 if islice.step in (1, None): | |
62 return fh.read(l) | |
63 return fh.read(l)[::islice.step] | |
64 | |
65 def __len__(self): | |
66 return self.stop - self.start | |
67 | |
68 def __repr__(self): | |
69 return "%s('%s', %i..%i)" % (self.__class__.__name__, | |
70 self.fh.name, | |
71 self.start, | |
72 self.stop) | |
73 | |
74 def __str__(self): | |
75 return self[:] | |
76 | |
77 def _adjust_slice(self, islice): | |
78 if islice.start is not None and islice.start < 0: | |
79 istart = self.stop + islice.start | |
80 else: | |
81 if islice.start is None: | |
82 istart = self.start | |
83 else: | |
84 istart = self.start + islice.start | |
85 if islice.stop is not None and islice.stop < 0: | |
86 istop = self.stop + islice.stop | |
87 else: | |
88 istop = islice.stop is None and self.stop or (self.start + islice.stop) | |
89 # This will give empty string. | |
90 if istart > self.stop: | |
91 return self.stop, self.stop | |
92 if istart < self.start: | |
93 istart = self.start | |
94 if istop < self.start: | |
95 istop = self.start | |
96 elif istop > self.stop: | |
97 istop = self.stop | |
98 return istart, istop | |
99 | |
100 @classmethod | |
101 def copy_inplace(klass, flat_name, fasta_name): | |
102 """ | |
103 Overwrite the .fasta file with the .fasta.flat file and save | |
104 something in the .flat as a place-holder. | |
105 """ | |
106 os.rename(flat_name, fasta_name) | |
107 # Still need the flattened file to show it's current. | |
108 flatfh = open(fasta_name + klass.ext, 'wb') | |
109 flatfh.write(MAGIC) | |
110 flatfh.close() | |
111 | |
112 @classmethod | |
113 def is_current(klass, fasta_name): | |
114 utd = is_up_to_date(fasta_name + klass.idx, fasta_name) | |
115 if not utd: | |
116 return False | |
117 return is_up_to_date(fasta_name + klass.ext, fasta_name) | |
118 | |
119 @classmethod | |
120 def modify_flat(klass, flat_file): | |
121 return open(flat_file, 'rb') | |
122 | |
123 @classmethod | |
124 def prepare(klass, fasta_obj, seqinfo_generator, flatten_inplace): | |
125 """ | |
126 Returns the __getitem__'able index. and the thing from which to get the seqs. | |
127 """ | |
128 f = fasta_obj.fasta_name | |
129 if klass.is_current(f): | |
130 fh = open(f + klass.idx, 'rb') | |
131 idx = cPickle.load(fh) | |
132 fh.close() | |
133 if flatten_inplace or ext_is_flat(f + klass.ext): | |
134 flat = klass.modify_flat(f) | |
135 else: | |
136 flat = klass.modify_flat(f + klass.ext) | |
137 if flatten_inplace and not ext_is_flat(f + klass.ext): | |
138 del flat | |
139 else: | |
140 return idx, flat | |
141 idx = {} | |
142 flatfh = open(f + klass.ext, 'wb') | |
143 for i, (seqid, seq) in enumerate(seqinfo_generator): | |
144 if flatten_inplace: | |
145 if i == 0: | |
146 flatfh.write('>%s\n' % seqid) | |
147 else: | |
148 flatfh.write('\n>%s\n' % seqid) | |
149 start = flatfh.tell() | |
150 flatfh.write(seq) | |
151 stop = flatfh.tell() | |
152 idx[seqid] = (start, stop) | |
153 flatfh.close() | |
154 if flatten_inplace: | |
155 klass.copy_inplace(flatfh.name, f) | |
156 fh = open(f + klass.idx, 'wb') | |
157 cPickle.dump(idx, fh, -1) | |
158 fh.close() | |
159 return idx, klass.modify_flat(f) | |
160 fh = open(f + klass.idx, 'wb') | |
161 cPickle.dump(idx, fh, -1) | |
162 fh.close() | |
163 return idx, klass.modify_flat(f + klass.ext) | |
164 | |
165 | |
166 class NpyFastaRecord(FastaRecord): | |
167 __slots__ = ('start', 'stop', 'mm', 'as_string') | |
168 | |
169 def __init__(self, mm, start, stop, as_string=True): | |
170 self.mm = mm | |
171 self.start = start | |
172 self.stop = stop | |
173 self.as_string = as_string | |
174 | |
175 def __repr__(self): | |
176 return "%s(%i..%i)" % (self.__class__.__name__, self.start, self.stop) | |
177 | |
178 @classmethod | |
179 def modify_flat(klass, flat_file): | |
180 mm = numpy.memmap(flat_file, dtype="S1", mode="r") | |
181 return mm | |
182 | |
183 def getdata(self, islice): | |
184 if isinstance(islice, (int, long)): | |
185 if islice >= 0: | |
186 islice += self.start | |
187 else: | |
188 islice += self.stop | |
189 if islice < 0: | |
190 raise IndexError | |
191 return self.mm[islice] | |
192 start, stop = self._adjust_slice(islice) | |
193 return self.mm[start:stop:islice.step] | |
194 | |
195 def __getitem__(self, islice): | |
196 d = self.getdata(islice) | |
197 return d.tostring() if self.as_string else d | |
198 | |
199 @property | |
200 def __array_interface__(self): | |
201 return {'shape': (len(self), ), | |
202 'typestr': '|S1', | |
203 'version': 3, | |
204 'data': self[:]} | |
205 | |
206 | |
207 class DuplicateHeaderException(Exception): | |
208 | |
209 def __init__(self, header): | |
210 Exception.__init__(self, 'headers must be unique: %s is duplicated' % header) | |
211 | |
212 | |
213 class Fasta(dict): | |
214 | |
215 def __init__(self, fasta_name, record_class=NpyFastaRecord, flatten_inplace=False, key_fn=None): | |
216 self.fasta_name = fasta_name | |
217 self.record_class = record_class | |
218 self.key_fn = key_fn | |
219 self.index, self.prepared = self.record_class.prepare(self, | |
220 self.gen_seqs_with_headers(key_fn), | |
221 flatten_inplace) | |
222 self.chr = {} | |
223 | |
224 def __contains__(self, key): | |
225 return key in self.index | |
226 | |
227 def __getitem__(self, i): | |
228 # Implements the lazy loading | |
229 if self.key_fn is not None: | |
230 i = self.key_fn(i) | |
231 if i in self.chr: | |
232 return self.chr[i] | |
233 c = self.index[i] | |
234 self.chr[i] = self.record_class(self.prepared, c[0], c[1]) | |
235 return self.chr[i] | |
236 | |
237 def __len__(self): | |
238 return len(self.index) | |
239 | |
240 @classmethod | |
241 def as_kmers(klass, seq, k, overlap=0): | |
242 kmax = len(seq) | |
243 assert overlap < k, ('overlap must be < kmer length') | |
244 i = 0 | |
245 while i < kmax: | |
246 yield i, seq[i:i + k] | |
247 i += k - overlap | |
248 | |
249 def gen_seqs_with_headers(self, key_fn=None): | |
250 """ | |
251 Remove all newlines from the sequence in a fasta file | |
252 and generate starts, stops to be used by the record class. | |
253 """ | |
254 fh = open(self.fasta_name, 'r') | |
255 # Do the flattening (remove newlines) | |
256 # check of unique-ness of headers. | |
257 seen_headers = {} | |
258 header = None | |
259 seqs = None | |
260 for line in fh: | |
261 line = line.rstrip() | |
262 if not line: | |
263 continue | |
264 if line[0] == ">": | |
265 if seqs is not None: | |
266 if header in seen_headers: | |
267 raise DuplicateHeaderException(header) | |
268 seen_headers[header] = None | |
269 yield header, "".join(seqs) | |
270 header = line[1:].strip() | |
271 if key_fn is not None: | |
272 header = key_fn(header) | |
273 seqs = [] | |
274 else: | |
275 seqs.append(line) | |
276 if seqs != []: | |
277 if header in seen_headers: | |
278 raise DuplicateHeaderException(header) | |
279 yield header, "".join(seqs) | |
280 fh.close() | |
281 | |
282 def iterkeys(self): | |
283 for k in self.index.iterkeys(): | |
284 yield k | |
285 | |
286 def iteritems(self): | |
287 for k in self.keys(): | |
288 yield k, self[k] | |
289 | |
290 def keys(self): | |
291 return self.index.keys() | |
292 | |
293 def sequence(self, f, asstring=True, auto_rc=True, exon_keys=None, one_based=True): | |
294 """ | |
295 Take a feature and use the start/stop or exon_keys to return the sequence from the | |
296 associated fasta file. By default, just return the full sequence between start | |
297 and stop, but if exon_keys is set to an iterable, search for those keys and use the | |
298 first to create a sequence and return the concatenated result. | |
299 | |
300 Note that sequence is 2 characters shorter than the entire feature, to account for | |
301 the introns at base-pairs 12 and 16. | |
302 | |
303 Also note, this function looks for an item with key of 'rnas'. If one is not found, | |
304 it continues on to 'exons'. If it doesn't find any of the exon keys it will fall | |
305 back on the start, stop of the feature: | |
306 | |
307 f: a feature, a feature can have exons. | |
308 asstring: if true, return the sequence as a string, if false, return as a numpy array | |
309 auto_rc : if True and the strand of the feature == -1, returnthe reverse complement of the sequence | |
310 one_based: if true, query is using 1 based closed intervals, if false semi-open zero based intervals | |
311 """ | |
312 assert 'chr' in f and f['chr'] in self, (f, f['chr'], self.keys()) | |
313 fasta = self[f['chr']] | |
314 sequence = None | |
315 if exon_keys is not None: | |
316 sequence = self._seq_from_keys(f, fasta, exon_keys, one_based=one_based) | |
317 if sequence is None: | |
318 start = f['start'] - int(one_based) | |
319 sequence = fasta[start: f['stop']] | |
320 if auto_rc and f.get('strand') in (-1, '-1', '-'): | |
321 sequence = COMPLEMENT(sequence)[::-1] | |
322 if asstring: | |
323 return sequence | |
324 return numpy.array(sequence, dtype='c') | |
325 | |
326 def _seq_from_keys(self, f, fasta, exon_keys, base='locations', one_based=True): | |
327 """ | |
328 f: a feature dict | |
329 fasta: a Fasta object | |
330 exon_keys: an iterable of keys, to look for start/stop arrays to get sequence. | |
331 base: if base ('locations') exists, look there fore the exon_keys, not in the base | |
332 of the object. | |
333 """ | |
334 fbase = f.get(base, f) | |
335 for ek in exon_keys: | |
336 if ek not in fbase: | |
337 continue | |
338 locs = fbase[ek] | |
339 seq = "" | |
340 for start, stop in locs: | |
341 seq += fasta[start - int(one_based):stop] | |
342 return seq | |
343 return None |