Mercurial > repos > greg > fasta_extract
comparison fasta_extract_utils.py @ 0:bc3f2a5c7b53 draft
Uploaded
| author | greg |
|---|---|
| date | Sun, 10 Jan 2016 13:03:12 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:bc3f2a5c7b53 |
|---|---|
| 1 import cPickle | |
| 2 import numpy | |
| 3 import os | |
| 4 import string | |
| 5 import sys | |
| 6 | |
| 7 COMPLEMENT = lambda s: s.translate(string.maketrans('ATCGatcgNnXx', 'TAGCtagcNnXx')) | |
| 8 MAGIC = "@flattened@" | |
| 9 | |
| 10 | |
| 11 def ext_is_flat(ext): | |
| 12 fh = open(ext) | |
| 13 t = fh.read(len(MAGIC)) | |
| 14 fh.close() | |
| 15 return MAGIC == t | |
| 16 | |
| 17 | |
| 18 def is_up_to_date(a, b): | |
| 19 return os.path.exists(a) and os.stat(a).st_mtime >= os.stat(b).st_mtime | |
| 20 | |
| 21 | |
| 22 class FastaRecord(object): | |
| 23 __slots__ = ('fh', 'start', 'stop') | |
| 24 ext = ".flat" | |
| 25 idx = ".gdx" | |
| 26 | |
| 27 def __init__(self, fh, start, stop): | |
| 28 self.fh = fh | |
| 29 self.stop = stop | |
| 30 self.start = start | |
| 31 | |
| 32 @property | |
| 33 def __array_interface__(self): | |
| 34 return {'shape': (len(self), ), | |
| 35 'typestr': '|S1', | |
| 36 'version': 3, | |
| 37 'data': buffer(self)} | |
| 38 | |
| 39 def __getitem__(self, islice): | |
| 40 fh = self.fh | |
| 41 fh.seek(self.start) | |
| 42 if isinstance(islice, (int, long)): | |
| 43 if islice < 0: | |
| 44 if -islice > self.stop - self.start: | |
| 45 raise IndexError | |
| 46 fh.seek(self.stop + islice) | |
| 47 else: | |
| 48 fh.seek(self.start + islice) | |
| 49 return fh.read(1) | |
| 50 if islice.start in (0, None) and islice.stop in (None, sys.maxint): | |
| 51 if islice.step in (1, None): | |
| 52 return fh.read(self.stop - self.start) | |
| 53 return fh.read(self.stop - self.start)[::islice.step] | |
| 54 istart, istop = self._adjust_slice(islice) | |
| 55 if istart is None: | |
| 56 return "" | |
| 57 l = istop - istart | |
| 58 if l == 0: | |
| 59 return "" | |
| 60 fh.seek(istart) | |
| 61 if islice.step in (1, None): | |
| 62 return fh.read(l) | |
| 63 return fh.read(l)[::islice.step] | |
| 64 | |
| 65 def __len__(self): | |
| 66 return self.stop - self.start | |
| 67 | |
| 68 def __repr__(self): | |
| 69 return "%s('%s', %i..%i)" % (self.__class__.__name__, | |
| 70 self.fh.name, | |
| 71 self.start, | |
| 72 self.stop) | |
| 73 | |
| 74 def __str__(self): | |
| 75 return self[:] | |
| 76 | |
| 77 def _adjust_slice(self, islice): | |
| 78 if islice.start is not None and islice.start < 0: | |
| 79 istart = self.stop + islice.start | |
| 80 else: | |
| 81 if islice.start is None: | |
| 82 istart = self.start | |
| 83 else: | |
| 84 istart = self.start + islice.start | |
| 85 if islice.stop is not None and islice.stop < 0: | |
| 86 istop = self.stop + islice.stop | |
| 87 else: | |
| 88 istop = islice.stop is None and self.stop or (self.start + islice.stop) | |
| 89 # This will give empty string. | |
| 90 if istart > self.stop: | |
| 91 return self.stop, self.stop | |
| 92 if istart < self.start: | |
| 93 istart = self.start | |
| 94 if istop < self.start: | |
| 95 istop = self.start | |
| 96 elif istop > self.stop: | |
| 97 istop = self.stop | |
| 98 return istart, istop | |
| 99 | |
| 100 @classmethod | |
| 101 def copy_inplace(klass, flat_name, fasta_name): | |
| 102 """ | |
| 103 Overwrite the .fasta file with the .fasta.flat file and save | |
| 104 something in the .flat as a place-holder. | |
| 105 """ | |
| 106 os.rename(flat_name, fasta_name) | |
| 107 # Still need the flattened file to show it's current. | |
| 108 flatfh = open(fasta_name + klass.ext, 'wb') | |
| 109 flatfh.write(MAGIC) | |
| 110 flatfh.close() | |
| 111 | |
| 112 @classmethod | |
| 113 def is_current(klass, fasta_name): | |
| 114 utd = is_up_to_date(fasta_name + klass.idx, fasta_name) | |
| 115 if not utd: | |
| 116 return False | |
| 117 return is_up_to_date(fasta_name + klass.ext, fasta_name) | |
| 118 | |
| 119 @classmethod | |
| 120 def modify_flat(klass, flat_file): | |
| 121 return open(flat_file, 'rb') | |
| 122 | |
| 123 @classmethod | |
| 124 def prepare(klass, fasta_obj, seqinfo_generator, flatten_inplace): | |
| 125 """ | |
| 126 Returns the __getitem__'able index. and the thing from which to get the seqs. | |
| 127 """ | |
| 128 f = fasta_obj.fasta_name | |
| 129 if klass.is_current(f): | |
| 130 fh = open(f + klass.idx, 'rb') | |
| 131 idx = cPickle.load(fh) | |
| 132 fh.close() | |
| 133 if flatten_inplace or ext_is_flat(f + klass.ext): | |
| 134 flat = klass.modify_flat(f) | |
| 135 else: | |
| 136 flat = klass.modify_flat(f + klass.ext) | |
| 137 if flatten_inplace and not ext_is_flat(f + klass.ext): | |
| 138 del flat | |
| 139 else: | |
| 140 return idx, flat | |
| 141 idx = {} | |
| 142 flatfh = open(f + klass.ext, 'wb') | |
| 143 for i, (seqid, seq) in enumerate(seqinfo_generator): | |
| 144 if flatten_inplace: | |
| 145 if i == 0: | |
| 146 flatfh.write('>%s\n' % seqid) | |
| 147 else: | |
| 148 flatfh.write('\n>%s\n' % seqid) | |
| 149 start = flatfh.tell() | |
| 150 flatfh.write(seq) | |
| 151 stop = flatfh.tell() | |
| 152 idx[seqid] = (start, stop) | |
| 153 flatfh.close() | |
| 154 if flatten_inplace: | |
| 155 klass.copy_inplace(flatfh.name, f) | |
| 156 fh = open(f + klass.idx, 'wb') | |
| 157 cPickle.dump(idx, fh, -1) | |
| 158 fh.close() | |
| 159 return idx, klass.modify_flat(f) | |
| 160 fh = open(f + klass.idx, 'wb') | |
| 161 cPickle.dump(idx, fh, -1) | |
| 162 fh.close() | |
| 163 return idx, klass.modify_flat(f + klass.ext) | |
| 164 | |
| 165 | |
| 166 class NpyFastaRecord(FastaRecord): | |
| 167 __slots__ = ('start', 'stop', 'mm', 'as_string') | |
| 168 | |
| 169 def __init__(self, mm, start, stop, as_string=True): | |
| 170 self.mm = mm | |
| 171 self.start = start | |
| 172 self.stop = stop | |
| 173 self.as_string = as_string | |
| 174 | |
| 175 def __repr__(self): | |
| 176 return "%s(%i..%i)" % (self.__class__.__name__, self.start, self.stop) | |
| 177 | |
| 178 @classmethod | |
| 179 def modify_flat(klass, flat_file): | |
| 180 mm = numpy.memmap(flat_file, dtype="S1", mode="r") | |
| 181 return mm | |
| 182 | |
| 183 def getdata(self, islice): | |
| 184 if isinstance(islice, (int, long)): | |
| 185 if islice >= 0: | |
| 186 islice += self.start | |
| 187 else: | |
| 188 islice += self.stop | |
| 189 if islice < 0: | |
| 190 raise IndexError | |
| 191 return self.mm[islice] | |
| 192 start, stop = self._adjust_slice(islice) | |
| 193 return self.mm[start:stop:islice.step] | |
| 194 | |
| 195 def __getitem__(self, islice): | |
| 196 d = self.getdata(islice) | |
| 197 return d.tostring() if self.as_string else d | |
| 198 | |
| 199 @property | |
| 200 def __array_interface__(self): | |
| 201 return {'shape': (len(self), ), | |
| 202 'typestr': '|S1', | |
| 203 'version': 3, | |
| 204 'data': self[:]} | |
| 205 | |
| 206 | |
| 207 class DuplicateHeaderException(Exception): | |
| 208 | |
| 209 def __init__(self, header): | |
| 210 Exception.__init__(self, 'headers must be unique: %s is duplicated' % header) | |
| 211 | |
| 212 | |
| 213 class Fasta(dict): | |
| 214 | |
| 215 def __init__(self, fasta_name, record_class=NpyFastaRecord, flatten_inplace=False, key_fn=None): | |
| 216 self.fasta_name = fasta_name | |
| 217 self.record_class = record_class | |
| 218 self.key_fn = key_fn | |
| 219 self.index, self.prepared = self.record_class.prepare(self, | |
| 220 self.gen_seqs_with_headers(key_fn), | |
| 221 flatten_inplace) | |
| 222 self.chr = {} | |
| 223 | |
| 224 def __contains__(self, key): | |
| 225 return key in self.index | |
| 226 | |
| 227 def __getitem__(self, i): | |
| 228 # Implements the lazy loading | |
| 229 if self.key_fn is not None: | |
| 230 i = self.key_fn(i) | |
| 231 if i in self.chr: | |
| 232 return self.chr[i] | |
| 233 c = self.index[i] | |
| 234 self.chr[i] = self.record_class(self.prepared, c[0], c[1]) | |
| 235 return self.chr[i] | |
| 236 | |
| 237 def __len__(self): | |
| 238 return len(self.index) | |
| 239 | |
| 240 @classmethod | |
| 241 def as_kmers(klass, seq, k, overlap=0): | |
| 242 kmax = len(seq) | |
| 243 assert overlap < k, ('overlap must be < kmer length') | |
| 244 i = 0 | |
| 245 while i < kmax: | |
| 246 yield i, seq[i:i + k] | |
| 247 i += k - overlap | |
| 248 | |
| 249 def gen_seqs_with_headers(self, key_fn=None): | |
| 250 """ | |
| 251 Remove all newlines from the sequence in a fasta file | |
| 252 and generate starts, stops to be used by the record class. | |
| 253 """ | |
| 254 fh = open(self.fasta_name, 'r') | |
| 255 # Do the flattening (remove newlines) | |
| 256 # check of unique-ness of headers. | |
| 257 seen_headers = {} | |
| 258 header = None | |
| 259 seqs = None | |
| 260 for line in fh: | |
| 261 line = line.rstrip() | |
| 262 if not line: | |
| 263 continue | |
| 264 if line[0] == ">": | |
| 265 if seqs is not None: | |
| 266 if header in seen_headers: | |
| 267 raise DuplicateHeaderException(header) | |
| 268 seen_headers[header] = None | |
| 269 yield header, "".join(seqs) | |
| 270 header = line[1:].strip() | |
| 271 if key_fn is not None: | |
| 272 header = key_fn(header) | |
| 273 seqs = [] | |
| 274 else: | |
| 275 seqs.append(line) | |
| 276 if seqs != []: | |
| 277 if header in seen_headers: | |
| 278 raise DuplicateHeaderException(header) | |
| 279 yield header, "".join(seqs) | |
| 280 fh.close() | |
| 281 | |
| 282 def iterkeys(self): | |
| 283 for k in self.index.iterkeys(): | |
| 284 yield k | |
| 285 | |
| 286 def iteritems(self): | |
| 287 for k in self.keys(): | |
| 288 yield k, self[k] | |
| 289 | |
| 290 def keys(self): | |
| 291 return self.index.keys() | |
| 292 | |
| 293 def sequence(self, f, asstring=True, auto_rc=True, exon_keys=None, one_based=True): | |
| 294 """ | |
| 295 Take a feature and use the start/stop or exon_keys to return the sequence from the | |
| 296 associated fasta file. By default, just return the full sequence between start | |
| 297 and stop, but if exon_keys is set to an iterable, search for those keys and use the | |
| 298 first to create a sequence and return the concatenated result. | |
| 299 | |
| 300 Note that sequence is 2 characters shorter than the entire feature, to account for | |
| 301 the introns at base-pairs 12 and 16. | |
| 302 | |
| 303 Also note, this function looks for an item with key of 'rnas'. If one is not found, | |
| 304 it continues on to 'exons'. If it doesn't find any of the exon keys it will fall | |
| 305 back on the start, stop of the feature: | |
| 306 | |
| 307 f: a feature, a feature can have exons. | |
| 308 asstring: if true, return the sequence as a string, if false, return as a numpy array | |
| 309 auto_rc : if True and the strand of the feature == -1, returnthe reverse complement of the sequence | |
| 310 one_based: if true, query is using 1 based closed intervals, if false semi-open zero based intervals | |
| 311 """ | |
| 312 assert 'chr' in f and f['chr'] in self, (f, f['chr'], self.keys()) | |
| 313 fasta = self[f['chr']] | |
| 314 sequence = None | |
| 315 if exon_keys is not None: | |
| 316 sequence = self._seq_from_keys(f, fasta, exon_keys, one_based=one_based) | |
| 317 if sequence is None: | |
| 318 start = f['start'] - int(one_based) | |
| 319 sequence = fasta[start: f['stop']] | |
| 320 if auto_rc and f.get('strand') in (-1, '-1', '-'): | |
| 321 sequence = COMPLEMENT(sequence)[::-1] | |
| 322 if asstring: | |
| 323 return sequence | |
| 324 return numpy.array(sequence, dtype='c') | |
| 325 | |
| 326 def _seq_from_keys(self, f, fasta, exon_keys, base='locations', one_based=True): | |
| 327 """ | |
| 328 f: a feature dict | |
| 329 fasta: a Fasta object | |
| 330 exon_keys: an iterable of keys, to look for start/stop arrays to get sequence. | |
| 331 base: if base ('locations') exists, look there fore the exon_keys, not in the base | |
| 332 of the object. | |
| 333 """ | |
| 334 fbase = f.get(base, f) | |
| 335 for ek in exon_keys: | |
| 336 if ek not in fbase: | |
| 337 continue | |
| 338 locs = fbase[ek] | |
| 339 seq = "" | |
| 340 for start, stop in locs: | |
| 341 seq += fasta[start - int(one_based):stop] | |
| 342 return seq | |
| 343 return None |
