Mercurial > repos > melissacline > ucsc_cancer_utilities
comparison seg2matrix/CGData/__init__.py @ 31:ab20c0d04f4a
add seg2matrix tool
author | jingchunzhu |
---|---|
date | Fri, 24 Jul 2015 13:10:11 -0700 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
30:7a7a52e9b019 | 31:ab20c0d04f4a |
---|---|
1 | |
2 import os | |
3 import re | |
4 import json | |
5 import functools | |
6 from zipfile import ZipFile | |
7 import sys | |
8 import hashlib | |
9 """ | |
10 CGData object style: | |
11 | |
12 Every file type documented in the CGData specification has an equivalent object | |
13 to parse and manipulate the contents of that file type. For <dataType> there | |
14 should be a CGData.<dataType> object with a <CGData> class. These classes | |
15 should extend the baseObject class. For loading they implement the 'read' | |
16 function which will parse the contents of a file from a passed file handle. | |
17 """ | |
18 | |
19 | |
# Registry of the official CGData types.  Each type name maps to a
# (module path, class name) pair; every class lives in the CGData
# sub-module that carries the same name as the class.
OBJECT_MAP = dict(
    (type_name, ('CGData.' + class_name, class_name))
    for type_name, class_name in (
        ('genomicSegment', 'GenomicSegment'),
        ('genomicMatrix', 'GenomicMatrix'),
        ('probeMap', 'ProbeMap'),
        ('probeLoc', 'ProbeLoc'),
        ('aliasMap', 'AliasMap'),
        ('idDAG', 'IDDag'),
        ('clinicalMatrix', 'ClinicalMatrix'),
        ('dataSubType', 'DataSubType'),
        ('assembly', 'Assembly'),
        ('featureDescription', 'FeatureDescription'),
        ('refGene', 'RefGene'),
        ('idList', 'IDList'),
    )
)
34 | |
class FormatException(Exception):
    """
    Error raised by the loaders when a meta-info file cannot be found
    or names a type that has no registered class.
    """

    def __init__(self, str):
        # The parameter keeps its historical name so keyword callers
        # (FormatException(str=...)) continue to work.
        super(FormatException, self).__init__(str)
39 | |
40 | |
def has_type(type_str):
    """
    Tell whether type_str is one of the type names registered in
    OBJECT_MAP.
    """
    is_registered = type_str in OBJECT_MAP
    return is_registered
43 | |
def get_type(type_str):
    """
    Look up type_str in OBJECT_MAP, import the module it names and
    return the class object (not an instance).

    Raises KeyError when type_str is not a key of OBJECT_MAP.
    """
    entry = OBJECT_MAP[type_str]
    module_path = entry[0]
    class_name = entry[1]
    # A non-empty fromlist makes __import__ return the sub-module itself
    # rather than the top-level package.
    loaded = __import__(module_path, globals(), locals(), [ class_name ])
    return getattr(loaded, class_name)
49 | |
class UnimplementedException(Exception):
    """
    Raised by base-class placeholder methods that a subclass is
    expected to override.
    """

    def __init__(self, str="Method not implemented"):
        # Parameter name kept as-is for keyword-argument callers.
        super(UnimplementedException, self).__init__(str)
53 | |
class CGObjectBase(dict):
    """
    This is the base object for CGData loadable objects.

    The instance is itself a dict holding the meta-information.  The
    methods in this base class cover loading/unloading and manipulation
    of that meta-information as well as zip (cgz) file access.
    Subclasses supply 'read' and 'write' (and optionally 'read_keyset').
    'unload' calls self.free(), which is not defined in this class and
    therefore has to be provided by the subclass as well.
    """

    # Format description that __init__ copies into self['cgformat'] when
    # it is not None.
    # NOTE(review): this attribute name shadows the object.__format__
    # hook, so format(instance) cannot be used; presumably subclasses
    # override it, so the name is left untouched.
    __format__ = None

    # Python 2 needs the 'U' flag for universal newlines; current
    # Python 3 releases reject it (text mode already translates
    # newlines), so pick the mode once per interpreter.
    # NOTE(review): on Python 3 ZipFile.open() yields bytes, so
    # subclass read() implementations may need to decode zip members.
    _READ_MODE = 'rU' if sys.version_info[0] < 3 else 'r'

    # String types accepted for a link given as a plain name.
    try:
        _STRING_TYPES = (str, unicode)
    except NameError:  # Python 3: no separate unicode type
        _STRING_TYPES = (str,)

    def __init__(self):
        self.path = None
        self.zip = None
        self.light_mode = False
        self.loaded = False
        if 'cgformat' not in self and self.__format__ is not None:
            self['cgformat'] = self.__format__
        super(CGObjectBase, self).__init__()

    def load(self, path=None, **kw):
        """
        Load a data object in from path.

        path -- location of the data; defaults to self.path.  When
                self.zip is set it is the member name inside that zip
                file, otherwise a file system path.
        kw   -- passed through unchanged to self.read().

        If self.zip is None and the data file does not exist, the data
        read is skipped silently.  A 'path.json' file on the file
        system, when present, is merged into the meta-information
        (null values are dropped).  Raises OSError when no path is
        available.
        """
        if path is None:
            path = self.path
        if path is None:
            raise OSError( "Path not defined" )

        if self.zip is None:
            if os.path.exists(path):
                # 'with' closes the handle even if read() raises.
                with open(path, self._READ_MODE) as dhandle:
                    self.read(dhandle, **kw)
        else:
            z = ZipFile(self.zip)
            try:
                # Honour the resolved 'path' (the original used
                # self.path here and ignored an explicit argument).
                dhandle = z.open(path, self._READ_MODE)
                try:
                    self.read(dhandle, **kw)
                finally:
                    dhandle.close()
            finally:
                z.close()

        self.path = path
        meta_path = path + ".json"
        if os.path.exists(meta_path):
            with open(meta_path, self._READ_MODE) as mhandle:
                meta = json.loads(mhandle.read())
            # Throw away empty (null) values before merging.
            self.update(dict((k, v) for k, v in meta.items() if v is not None))
        self.loaded = True

    def unload(self):
        """Call to start freeing up memory"""
        # free() is expected to come from the subclass.
        self.free()
        self.loaded = False

    def store(self, path=None):
        """
        Store an object onto the path provided.

        Always writes the meta-information (without the 'cgformat'
        entry) to path.json.  Unless the object is in light mode it
        also records path as self.path and writes the data to path
        through self.write().  Raises OSError when no path is available.
        """
        if path is None:
            path = self.path
        if path is None:
            raise OSError( "Path not defined" )
        meta = dict(self)
        meta.pop('cgformat', None)
        with open(path + ".json", "w") as mhandle:
            mhandle.write(json.dumps(meta))
        if not self.light_mode:
            self.path = path
            with open(path, "w") as dhandle:
                self.write(dhandle)

    def load_keyset(self, key_predicate):
        """
        Generator yielding whatever self.read_keyset() produces for the
        data at self.path (a zip member when self.zip is set).

        Yields nothing when self.path is None, or when self.zip is None
        and the file does not exist.  Handles are closed even if the
        generator is abandoned or read_keyset() raises.
        """
        if self.path is None:
            return
        if self.zip is None:
            if os.path.exists(self.path):
                dhandle = open(self.path, self._READ_MODE)
                try:
                    for a in self.read_keyset(dhandle, key_predicate):
                        yield a
                finally:
                    dhandle.close()
        else:
            z = ZipFile(self.zip)
            try:
                dhandle = z.open(self.path, self._READ_MODE)
                try:
                    for a in self.read_keyset(dhandle, key_predicate):
                        yield a
                finally:
                    dhandle.close()
            finally:
                z.close()

    def read_keyset(self, handle, key_predicate=None):
        """Placeholder; subclasses that support load_keyset override it."""
        raise UnimplementedException()

    def read(self, handle):
        """
        The read method is implemented by the subclass that
        inherits from CGObjectBase. It is passed a handle
        to a file (which may be on file, in a compressed object, or
        from a network source). The implementing class then uses this
        handle to populate its data structures.
        """
        raise UnimplementedException()

    def write(self, handle):
        """
        The write method is implemented by the subclass that
        inherits from CGObjectBase. It is passed a handle to an
        output file, which it can use 'write' method calls to emit
        its data.
        """
        raise UnimplementedException()

    def get_name(self):
        """
        Get object name (the 'name' entry of the 'cgdata' meta-info),
        or None when it is not set.
        """
        return self.get( 'cgdata', {} ).get( 'name', None )

    def get_type(self):
        """
        Get object type (the 'type' entry of the 'cgdata' meta-info),
        or None when it is not set.
        """
        return self.get('cgdata', {}).get('type', None)

    def get_link_map(self):
        """
        Get a dict that represents the declared file relationships from
        the meta-info.

        Each field listed in self['cgformat']['links'] that is present
        in the 'cgdata' section contributes an entry
        { 'type' : ..., 'name' : ... }.  A plain string value is taken
        as the name, with the field itself as the type; otherwise the
        value's own 'type' and 'name' are used.  'columnKeySrc' and
        'rowKeySrc' entries are added the same way.  Returns an empty
        dict when there is no 'cgdata' section.
        """
        out = {}
        # A missing 'cgdata' section means "no links", matching the
        # tolerance of get_name()/get_type().
        cgdata = self.get('cgdata', {})

        links = ()
        if "cgformat" in self and "links" in self["cgformat"]:
            links = self['cgformat']['links']

        for field in links:
            if field in cgdata:
                value = cgdata[field]
                if isinstance(value, self._STRING_TYPES):
                    out[field] = { 'type' : field, 'name' : value }
                else:
                    out[field] = { 'type' : value['type'], 'name' : value['name'] }

        for e in ['columnKeySrc', 'rowKeySrc' ]:
            if e in cgdata:
                link = cgdata[e]
                out[e] = { 'type' : link['type'], 'name' : link['name'] }
        return out

    def add_history(self, desc):
        """Append desc to the 'history' list, creating the list if needed."""
        if 'history' not in self:
            self[ 'history' ] = []
        self[ 'history' ].append( desc )
205 | |
206 | |
class CGDataMatrixObject(CGObjectBase):
    """
    Interface for matrix-shaped CGData objects.

    Every accessor below is a placeholder that raises
    UnimplementedException until a subclass overrides it.
    """

    def __init__(self):
        super(CGDataMatrixObject, self).__init__()

    def get_col_namespace(self):
        """Name of the namespace the column identifiers belong to."""
        raise UnimplementedException()

    def get_row_namespace(self):
        """Name of the namespace the row identifiers belong to."""
        raise UnimplementedException()

    def get_col_list(self):
        """Names of the columns."""
        raise UnimplementedException()

    def get_row_list(self):
        """Names of the rows."""
        raise UnimplementedException()

    def get_row_map(self):
        """Map of row name indexes."""
        raise UnimplementedException()

    def get_col_map(self):
        """Map of column name indexes."""
        raise UnimplementedException()

    def get_row_pos(self, row):
        """Placeholder; subclasses override it."""
        raise UnimplementedException()

    def get_col_pos(self, col):
        """Placeholder; subclasses override it."""
        raise UnimplementedException()

    def get_row_count(self):
        """Placeholder; subclasses override it."""
        raise UnimplementedException()

    def get_col_count(self):
        """Placeholder; subclasses override it."""
        raise UnimplementedException()

    def get_row(self, row_name):
        """Placeholder; subclasses override it."""
        raise UnimplementedException()

    def get_col(self, col_name):
        """Placeholder; subclasses override it."""
        raise UnimplementedException()
267 | |
268 | |
def cg_new(type_str):
    """
    Create a fresh, empty object for the CGData type named by type_str.

    The class is resolved through the internal OBJECT_MAP registry of
    official CGData data types, so asking for 'genomicMatrix' yields a
    newly constructed CGData.GenomicMatrix.GenomicMatrix.

    type_str -- A string name of a CGData type, ie 'genomicMatrix'
    """
    # get_type performs the same registry lookup and import; only the
    # no-argument instantiation is added here.
    return get_type(type_str)()
283 | |
def _cg_meta_paths(path):
    """Return (meta-info path, data path) for path given with or without '.json'."""
    suffix = ".json"
    if not path.endswith(suffix):
        path = path + suffix
    return path, path[:-len(suffix)]


def _cg_read_meta(path, zip=None):
    """
    Parse the '.json' meta-info at path (a member of the zip file when
    zip is given) and return it as a dict without null values.

    Raises FormatException when the file cannot be opened on the file
    system; errors from the zip file or the JSON parser propagate.
    """
    # Python 2 needs 'U' for universal newlines; current Python 3
    # releases reject that flag.
    mode = 'rU' if sys.version_info[0] < 3 else 'r'
    if zip is None:
        try:
            with open(path, mode) as handle:
                meta = json.loads(handle.read())
        except IOError:
            raise FormatException("Meta-info (%s) file not found" % (path,))
    else:
        z = ZipFile(zip)
        try:
            handle = z.open(path, mode)
            try:
                meta = json.loads(handle.read())
            finally:
                handle.close()
        finally:
            z.close()

    # Throw away empty values
    return dict((k, v) for k, v in meta.items() if v is not None)


def _cg_object_from_meta(meta, data_path, zip=None):
    """
    Build the object for meta['cgdata']['type'], fill in the meta-info
    and record where its data lives.  No data is read here.

    Raises FormatException when the type has no registered class.
    """
    type_name = meta['cgdata']['type']
    if type_name not in OBJECT_MAP:
        raise FormatException("%s class not found" % (type_name,))
    out = cg_new(type_name)
    out.update( meta )
    out.path = data_path
    out.zip = zip
    return out


def load(path, zip=None):
    """
    load is the automatic CGData loading function.  There has to
    be a '.json' file for this function to work.  It inspects the
    '.json' file and uses the 'type' field to determine the
    appropriate object loader to use.  The object is created
    (using the cg_new function) and its 'load' method reads the data
    file.  If the 'zip' parameter is not None, then it is used as the
    path to a zipfile, and the path parameter is used as a path inside
    the zip file to the object data.

    path -- path to file (in file system space if zip is None, otherwise
            it is the location in the zip file)
    zip  -- path to zip file (None by default)

    Raises FormatException when the meta-info file is missing on the
    file system or names an unknown type.
    """
    meta_path, data_path = _cg_meta_paths(path)
    meta = _cg_read_meta(meta_path, zip)
    out = _cg_object_from_meta(meta, data_path, zip)
    out.load(data_path)
    return out


def light_load(path, zip=None):
    """
    Like load, but only the meta-information is read: the returned
    object has its path/zip recorded and light_mode switched on, and
    its data is not loaded.

    path -- path to file (in file system space if zip is None, otherwise
            it is the location in the zip file)
    zip  -- path to zip file (None by default)

    Raises FormatException when the meta-info file is missing on the
    file system or names an unknown type.
    """
    meta_path, data_path = _cg_meta_paths(path)
    meta = _cg_read_meta(meta_path, zip)
    out = _cg_object_from_meta(meta, data_path, zip)
    out.light_mode = True
    return out
354 | |
# Verbosity threshold for the logging helpers in this module:
# debug() prints when LOG_LEVEL < 1, info() when LOG_LEVEL < 2 and
# warn() when LOG_LEVEL < 3; error() always prints.  The default of 2
# therefore shows warnings and errors only.
# (A 'global' statement at module level has no effect, so none is used.)
LOG_LEVEL = 2
357 | |
def info(eStr):
    """
    Write eStr to stderr as a "LOG: ..." line when LOG_LEVEL is below 2.
    """
    if LOG_LEVEL < 2:
        # One-element tuple so a tuple-valued eStr is printed as-is
        # instead of being unpacked by the % operator.
        sys.stderr.write("LOG: %s\n" % (eStr,))
362 | |
def debug(eStr):
    """
    Write eStr to stderr as a "DEBUG: ..." line when LOG_LEVEL is below 1.
    """
    if LOG_LEVEL < 1:
        # One-element tuple so a tuple-valued eStr is printed as-is
        # instead of being unpacked by the % operator.
        sys.stderr.write("DEBUG: %s\n" % (eStr,))
367 | |
def warn(eStr):
    """
    Write eStr to stderr as a "WARNING: ..." line when LOG_LEVEL is
    below 3.
    """
    if LOG_LEVEL < 3:
        # One-element tuple so a tuple-valued eStr is printed as-is
        # instead of being unpacked by the % operator.
        sys.stderr.write("WARNING: %s\n" % (eStr,))
372 | |
373 | |
def error(eStr):
    """
    Write eStr to stderr as an "ERROR: ..." line, whatever the log level.
    """
    # One-element tuple so a tuple-valued eStr is printed as-is instead
    # of being unpacked by the % operator.
    sys.stderr.write("ERROR: %s\n" % (eStr,))
377 |