annotate fasta_extract_utils.py @ 5:24769b1ca957 draft

Uploaded
author greg
date Sun, 10 Jan 2016 14:15:26 -0500
parents bc3f2a5c7b53
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
1 import cPickle
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
2 import numpy
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
3 import os
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
4 import string
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
5 import sys
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
6
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
7 COMPLEMENT = lambda s: s.translate(string.maketrans('ATCGatcgNnXx', 'TAGCtagcNnXx'))
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
8 MAGIC = "@flattened@"
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
9
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
10
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
11 def ext_is_flat(ext):
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
12 fh = open(ext)
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
13 t = fh.read(len(MAGIC))
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
14 fh.close()
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
15 return MAGIC == t
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
16
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
17
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
18 def is_up_to_date(a, b):
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
19 return os.path.exists(a) and os.stat(a).st_mtime >= os.stat(b).st_mtime
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
20
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
21
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
22 class FastaRecord(object):
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
23 __slots__ = ('fh', 'start', 'stop')
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
24 ext = ".flat"
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
25 idx = ".gdx"
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
26
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
27 def __init__(self, fh, start, stop):
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
28 self.fh = fh
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
29 self.stop = stop
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
30 self.start = start
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
31
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
32 @property
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
33 def __array_interface__(self):
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
34 return {'shape': (len(self), ),
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
35 'typestr': '|S1',
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
36 'version': 3,
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
37 'data': buffer(self)}
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
38
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
39 def __getitem__(self, islice):
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
40 fh = self.fh
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
41 fh.seek(self.start)
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
42 if isinstance(islice, (int, long)):
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
43 if islice < 0:
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
44 if -islice > self.stop - self.start:
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
45 raise IndexError
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
46 fh.seek(self.stop + islice)
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
47 else:
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
48 fh.seek(self.start + islice)
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
49 return fh.read(1)
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
50 if islice.start in (0, None) and islice.stop in (None, sys.maxint):
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
51 if islice.step in (1, None):
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
52 return fh.read(self.stop - self.start)
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
53 return fh.read(self.stop - self.start)[::islice.step]
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
54 istart, istop = self._adjust_slice(islice)
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
55 if istart is None:
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
56 return ""
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
57 l = istop - istart
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
58 if l == 0:
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
59 return ""
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
60 fh.seek(istart)
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
61 if islice.step in (1, None):
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
62 return fh.read(l)
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
63 return fh.read(l)[::islice.step]
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
64
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
65 def __len__(self):
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
66 return self.stop - self.start
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
67
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
68 def __repr__(self):
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
69 return "%s('%s', %i..%i)" % (self.__class__.__name__,
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
70 self.fh.name,
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
71 self.start,
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
72 self.stop)
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
73
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
74 def __str__(self):
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
75 return self[:]
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
76
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
77 def _adjust_slice(self, islice):
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
78 if islice.start is not None and islice.start < 0:
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
79 istart = self.stop + islice.start
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
80 else:
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
81 if islice.start is None:
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
82 istart = self.start
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
83 else:
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
84 istart = self.start + islice.start
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
85 if islice.stop is not None and islice.stop < 0:
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
86 istop = self.stop + islice.stop
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
87 else:
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
88 istop = islice.stop is None and self.stop or (self.start + islice.stop)
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
89 # This will give empty string.
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
90 if istart > self.stop:
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
91 return self.stop, self.stop
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
92 if istart < self.start:
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
93 istart = self.start
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
94 if istop < self.start:
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
95 istop = self.start
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
96 elif istop > self.stop:
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
97 istop = self.stop
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
98 return istart, istop
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
99
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
100 @classmethod
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
101 def copy_inplace(klass, flat_name, fasta_name):
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
102 """
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
103 Overwrite the .fasta file with the .fasta.flat file and save
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
104 something in the .flat as a place-holder.
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
105 """
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
106 os.rename(flat_name, fasta_name)
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
107 # Still need the flattened file to show it's current.
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
108 flatfh = open(fasta_name + klass.ext, 'wb')
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
109 flatfh.write(MAGIC)
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
110 flatfh.close()
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
111
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
112 @classmethod
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
113 def is_current(klass, fasta_name):
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
114 utd = is_up_to_date(fasta_name + klass.idx, fasta_name)
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
115 if not utd:
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
116 return False
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
117 return is_up_to_date(fasta_name + klass.ext, fasta_name)
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
118
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
119 @classmethod
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
120 def modify_flat(klass, flat_file):
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
121 return open(flat_file, 'rb')
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
122
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
123 @classmethod
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
124 def prepare(klass, fasta_obj, seqinfo_generator, flatten_inplace):
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
125 """
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
126 Returns the __getitem__'able index. and the thing from which to get the seqs.
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
127 """
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
128 f = fasta_obj.fasta_name
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
129 if klass.is_current(f):
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
130 fh = open(f + klass.idx, 'rb')
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
131 idx = cPickle.load(fh)
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
132 fh.close()
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
133 if flatten_inplace or ext_is_flat(f + klass.ext):
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
134 flat = klass.modify_flat(f)
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
135 else:
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
136 flat = klass.modify_flat(f + klass.ext)
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
137 if flatten_inplace and not ext_is_flat(f + klass.ext):
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
138 del flat
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
139 else:
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
140 return idx, flat
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
141 idx = {}
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
142 flatfh = open(f + klass.ext, 'wb')
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
143 for i, (seqid, seq) in enumerate(seqinfo_generator):
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
144 if flatten_inplace:
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
145 if i == 0:
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
146 flatfh.write('>%s\n' % seqid)
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
147 else:
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
148 flatfh.write('\n>%s\n' % seqid)
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
149 start = flatfh.tell()
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
150 flatfh.write(seq)
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
151 stop = flatfh.tell()
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
152 idx[seqid] = (start, stop)
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
153 flatfh.close()
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
154 if flatten_inplace:
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
155 klass.copy_inplace(flatfh.name, f)
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
156 fh = open(f + klass.idx, 'wb')
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
157 cPickle.dump(idx, fh, -1)
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
158 fh.close()
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
159 return idx, klass.modify_flat(f)
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
160 fh = open(f + klass.idx, 'wb')
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
161 cPickle.dump(idx, fh, -1)
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
162 fh.close()
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
163 return idx, klass.modify_flat(f + klass.ext)
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
164
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
165
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
166 class NpyFastaRecord(FastaRecord):
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
167 __slots__ = ('start', 'stop', 'mm', 'as_string')
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
168
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
169 def __init__(self, mm, start, stop, as_string=True):
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
170 self.mm = mm
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
171 self.start = start
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
172 self.stop = stop
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
173 self.as_string = as_string
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
174
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
175 def __repr__(self):
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
176 return "%s(%i..%i)" % (self.__class__.__name__, self.start, self.stop)
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
177
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
178 @classmethod
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
179 def modify_flat(klass, flat_file):
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
180 mm = numpy.memmap(flat_file, dtype="S1", mode="r")
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
181 return mm
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
182
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
183 def getdata(self, islice):
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
184 if isinstance(islice, (int, long)):
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
185 if islice >= 0:
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
186 islice += self.start
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
187 else:
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
188 islice += self.stop
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
189 if islice < 0:
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
190 raise IndexError
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
191 return self.mm[islice]
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
192 start, stop = self._adjust_slice(islice)
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
193 return self.mm[start:stop:islice.step]
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
194
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
195 def __getitem__(self, islice):
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
196 d = self.getdata(islice)
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
197 return d.tostring() if self.as_string else d
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
198
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
199 @property
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
200 def __array_interface__(self):
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
201 return {'shape': (len(self), ),
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
202 'typestr': '|S1',
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
203 'version': 3,
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
204 'data': self[:]}
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
205
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
206
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
207 class DuplicateHeaderException(Exception):
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
208
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
209 def __init__(self, header):
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
210 Exception.__init__(self, 'headers must be unique: %s is duplicated' % header)
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
211
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
212
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
213 class Fasta(dict):
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
214
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
215 def __init__(self, fasta_name, record_class=NpyFastaRecord, flatten_inplace=False, key_fn=None):
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
216 self.fasta_name = fasta_name
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
217 self.record_class = record_class
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
218 self.key_fn = key_fn
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
219 self.index, self.prepared = self.record_class.prepare(self,
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
220 self.gen_seqs_with_headers(key_fn),
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
221 flatten_inplace)
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
222 self.chr = {}
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
223
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
224 def __contains__(self, key):
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
225 return key in self.index
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
226
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
227 def __getitem__(self, i):
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
228 # Implements the lazy loading
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
229 if self.key_fn is not None:
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
230 i = self.key_fn(i)
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
231 if i in self.chr:
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
232 return self.chr[i]
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
233 c = self.index[i]
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
234 self.chr[i] = self.record_class(self.prepared, c[0], c[1])
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
235 return self.chr[i]
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
236
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
237 def __len__(self):
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
238 return len(self.index)
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
239
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
240 @classmethod
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
241 def as_kmers(klass, seq, k, overlap=0):
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
242 kmax = len(seq)
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
243 assert overlap < k, ('overlap must be < kmer length')
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
244 i = 0
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
245 while i < kmax:
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
246 yield i, seq[i:i + k]
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
247 i += k - overlap
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
248
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
249 def gen_seqs_with_headers(self, key_fn=None):
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
250 """
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
251 Remove all newlines from the sequence in a fasta file
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
252 and generate starts, stops to be used by the record class.
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
253 """
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
254 fh = open(self.fasta_name, 'r')
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
255 # Do the flattening (remove newlines)
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
256 # check of unique-ness of headers.
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
257 seen_headers = {}
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
258 header = None
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
259 seqs = None
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
260 for line in fh:
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
261 line = line.rstrip()
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
262 if not line:
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
263 continue
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
264 if line[0] == ">":
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
265 if seqs is not None:
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
266 if header in seen_headers:
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
267 raise DuplicateHeaderException(header)
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
268 seen_headers[header] = None
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
269 yield header, "".join(seqs)
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
270 header = line[1:].strip()
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
271 if key_fn is not None:
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
272 header = key_fn(header)
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
273 seqs = []
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
274 else:
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
275 seqs.append(line)
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
276 if seqs != []:
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
277 if header in seen_headers:
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
278 raise DuplicateHeaderException(header)
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
279 yield header, "".join(seqs)
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
280 fh.close()
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
281
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
282 def iterkeys(self):
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
283 for k in self.index.iterkeys():
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
284 yield k
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
285
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
286 def iteritems(self):
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
287 for k in self.keys():
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
288 yield k, self[k]
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
289
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
290 def keys(self):
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
291 return self.index.keys()
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
292
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
293 def sequence(self, f, asstring=True, auto_rc=True, exon_keys=None, one_based=True):
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
294 """
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
295 Take a feature and use the start/stop or exon_keys to return the sequence from the
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
296 associated fasta file. By default, just return the full sequence between start
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
297 and stop, but if exon_keys is set to an iterable, search for those keys and use the
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
298 first to create a sequence and return the concatenated result.
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
299
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
300 Note that sequence is 2 characters shorter than the entire feature, to account for
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
301 the introns at base-pairs 12 and 16.
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
302
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
303 Also note, this function looks for an item with key of 'rnas'. If one is not found,
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
304 it continues on to 'exons'. If it doesn't find any of the exon keys it will fall
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
305 back on the start, stop of the feature:
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
306
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
307 f: a feature, a feature can have exons.
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
308 asstring: if true, return the sequence as a string, if false, return as a numpy array
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
309 auto_rc : if True and the strand of the feature == -1, returnthe reverse complement of the sequence
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
310 one_based: if true, query is using 1 based closed intervals, if false semi-open zero based intervals
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
311 """
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
312 assert 'chr' in f and f['chr'] in self, (f, f['chr'], self.keys())
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
313 fasta = self[f['chr']]
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
314 sequence = None
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
315 if exon_keys is not None:
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
316 sequence = self._seq_from_keys(f, fasta, exon_keys, one_based=one_based)
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
317 if sequence is None:
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
318 start = f['start'] - int(one_based)
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
319 sequence = fasta[start: f['stop']]
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
320 if auto_rc and f.get('strand') in (-1, '-1', '-'):
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
321 sequence = COMPLEMENT(sequence)[::-1]
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
322 if asstring:
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
323 return sequence
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
324 return numpy.array(sequence, dtype='c')
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
325
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
326 def _seq_from_keys(self, f, fasta, exon_keys, base='locations', one_based=True):
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
327 """
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
328 f: a feature dict
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
329 fasta: a Fasta object
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
330 exon_keys: an iterable of keys, to look for start/stop arrays to get sequence.
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
331 base: if base ('locations') exists, look there fore the exon_keys, not in the base
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
332 of the object.
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
333 """
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
334 fbase = f.get(base, f)
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
335 for ek in exon_keys:
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
336 if ek not in fbase:
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
337 continue
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
338 locs = fbase[ek]
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
339 seq = ""
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
340 for start, stop in locs:
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
341 seq += fasta[start - int(one_based):stop]
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
342 return seq
bc3f2a5c7b53 Uploaded
greg
parents:
diff changeset
343 return None