31
|
1
|
|
2 import os
|
|
3 import re
|
|
4 import json
|
|
5 import functools
|
|
6 from zipfile import ZipFile
|
|
7 import sys
|
|
8 import hashlib
|
|
9 """
|
|
10 CGData object style:
|
|
11
|
|
12 Every file type documented in the CGData specification has an equivalent object
|
|
13 to parse and manipulate the contents of that file type. For <dataType> there
|
|
14 should be a CGData.<dataType> object with a <CGData> class. These classes
|
|
15 should extend the baseObject class. For loading they implement the 'read'
|
|
16 function which will parse the contents of a file from a passed file handle.
|
|
17 """
|
|
18
|
|
19
|
|
# Registry of the official CGData types.  Each key is the type name found in
# a meta-info file; the value is a (module path, class name) pair that
# get_type() and cg_new() import and resolve on demand.
OBJECT_MAP = {
    'genomicSegment': ('CGData.GenomicSegment', 'GenomicSegment'),
    'genomicMatrix': ('CGData.GenomicMatrix', 'GenomicMatrix'),
    'probeMap': ('CGData.ProbeMap', 'ProbeMap'),
    'probeLoc': ('CGData.ProbeLoc', 'ProbeLoc'),
    'aliasMap': ('CGData.AliasMap', 'AliasMap'),
    'idDAG': ('CGData.IDDag', 'IDDag'),
    'clinicalMatrix': ('CGData.ClinicalMatrix', 'ClinicalMatrix'),
    'dataSubType': ('CGData.DataSubType', 'DataSubType'),
    'assembly': ('CGData.Assembly', 'Assembly'),
    'featureDescription': ('CGData.FeatureDescription', 'FeatureDescription'),
    'refGene': ('CGData.RefGene', 'RefGene'),
    'idList': ('CGData.IDList', 'IDList'),
}
|
|
34
|
|
class FormatException(Exception):
    """Raised when CGData meta-information is missing or names an unknown type."""

    def __init__(self, str):
        # The parameter name is part of the public signature, so it is kept
        # even though it shadows the built-in inside this method.
        super(FormatException, self).__init__(str)
|
|
39
|
|
40
|
|
def has_type(type_str):
    """Return True when type_str is one of the type names registered in OBJECT_MAP."""
    is_registered = type_str in OBJECT_MAP
    return is_registered
|
|
43
|
|
def get_type(type_str):
    """
    Return the class registered for a CGData type name.

    type_str -- A string name of a CGData type, ie 'genomicMatrix'

    Raises KeyError when type_str is not a key of OBJECT_MAP.
    """
    module_path, class_name = OBJECT_MAP[type_str]
    # A non-empty fromlist makes __import__ return the named submodule
    # itself rather than the top-level package.
    loaded_module = __import__(module_path, globals(), locals(), [class_name])
    return getattr(loaded_module, class_name)
|
|
49
|
|
class UnimplementedException(Exception):
    """Raised by base-class hooks that a subclass is expected to override."""

    def __init__(self, str="Method not implemented"):
        # The parameter name is part of the public signature, so it is kept
        # even though it shadows the built-in inside this method.
        super(UnimplementedException, self).__init__(str)
|
|
53
|
|
class CGObjectBase(dict):
    """
    This is the base object for CGData loadable objects.

    The instance itself is a dict holding the object's meta-information
    (the content of the '.json' file stored next to the data file).  The
    base class covers loading/unloading and storing of that meta-information
    together with the data, as well as zip (cgz) file access.

    Subclasses implement 'read' and 'write'; 'read_keyset' is needed for
    'load_keyset' and a 'free' method is needed for 'unload' (this class
    does not define 'free').
    """
    # Copied into self['cgformat'] by __init__ when it is not None.
    # NOTE: the name is also that of the special method used by the built-in
    # format(); it is kept unchanged for backward compatibility.
    __format__ = None

    # Text types accepted as a plain link name: (str, unicode) on Python 2,
    # (str, str) on Python 3.
    _STRING_TYPES = (type(''), type(u''))

    # Universal-newline read mode.  The 'U' flag is only accepted by
    # Python 2 (open() dropped it in 3.11, ZipFile.open() in 3.6); on
    # Python 3 plain text mode already translates newlines.
    _READ_MODE = 'rU' if sys.version_info[0] < 3 else 'r'

    def __init__(self):
        self.path = None
        self.zip = None
        self.light_mode = False
        self.loaded = False
        if 'cgformat' not in self and self.__format__ is not None:
            self['cgformat'] = self.__format__
        super(CGObjectBase, self).__init__()

    def load(self, path=None, **kw):
        """
        Load a data object in from path.

        path -- location of the data file; defaults to self.path.  When
                self.zip is set it is the member name inside that zip file.
        kw   -- passed through unchanged to the subclass 'read' method.

        When self.zip is None and the data file does not exist, the read
        step is skipped.  If a 'path.json' file exists on the file system,
        its non-null entries are merged into this object.

        Raises OSError when no path is available.
        """
        if path is None:
            path = self.path
        if path is None:
            raise OSError("Path not defined")

        if self.zip is None:
            if os.path.exists(path):
                with open(path, self._READ_MODE) as dhandle:
                    self.read(dhandle, **kw)
        else:
            archive = ZipFile(self.zip)
            try:
                # Use the resolved path, so an explicit argument is honoured
                # even when self.path has not been set yet.
                # NOTE(review): on Python 3 a zip member is a binary stream.
                dhandle = archive.open(path, self._READ_MODE)
                try:
                    self.read(dhandle, **kw)
                finally:
                    dhandle.close()
            finally:
                archive.close()

        self.path = path
        meta_path = path + ".json"
        if os.path.exists(meta_path):
            # JSON treats every newline flavour as whitespace, so the
            # default text mode is sufficient here.
            with open(meta_path) as mhandle:
                meta = json.loads(mhandle.read())
            # Throw away empty values
            self.update(dict((k, v) for k, v in meta.items() if v is not None))
        self.loaded = True

    def unload(self):
        """
        Call to start freeing up memory.

        Delegates to self.free(), which must be supplied by the subclass.
        """
        self.free()
        self.loaded = False

    def store(self, path=None):
        """
        Store an object onto the path provided.
        Will write a path and a path.json file.

        path -- destination of the data file; defaults to self.path.

        The 'cgformat' entry is left out of the stored meta-information.
        In light mode only the '.json' file is written.

        Raises OSError when no path is available.
        """
        if path is None:
            path = self.path
        if path is None:
            raise OSError("Path not defined")

        meta = dict(self)
        meta.pop('cgformat', None)
        with open(path + ".json", "w") as mhandle:
            mhandle.write(json.dumps(meta))

        if not self.light_mode:
            self.path = path
            with open(path, "w") as dhandle:
                self.write(dhandle)

    def load_keyset(self, key_predicate):
        """
        Generator over whatever 'read_keyset' produces for the data file.

        Yields nothing when self.path is None, or when self.zip is None and
        the file does not exist.  The handles are closed when the generator
        finishes or is closed early.
        """
        if self.path is None:
            return
        if self.zip is None:
            if not os.path.exists(self.path):
                return
            with open(self.path, self._READ_MODE) as dhandle:
                for item in self.read_keyset(dhandle, key_predicate):
                    yield item
        else:
            archive = ZipFile(self.zip)
            try:
                dhandle = archive.open(self.path, self._READ_MODE)
                try:
                    for item in self.read_keyset(dhandle, key_predicate):
                        yield item
                finally:
                    dhandle.close()
            finally:
                archive.close()

    def read_keyset(self, handle, key_predicate=None):
        """Hook for subclasses; the base implementation always raises."""
        raise UnimplementedException()

    def read(self, handle):
        """
        The read method is implemented by the subclass that
        inherits from CGObjectBase. It is passed a handle
        to a file (which may be on file, in a compressed object, or
        from a network source). The implementing class then uses this
        handle to populate its data structures.
        """
        raise UnimplementedException()

    def write(self, handle):
        """
        The write method is implemented by the subclass that
        inherits from CGObjectBase. It is passed a handle to an
        output file, which it can use 'write' method calls to emit
        its data.
        """
        raise UnimplementedException()

    def get_name(self):
        """
        Get object name (None when the meta-info does not declare one)
        """
        return self.get('cgdata', {}).get('name', None)

    def get_type(self):
        """
        Get object type (None when the meta-info does not declare one)
        """
        return self.get('cgdata', {}).get('type', None)

    def get_link_map(self):
        """
        Get a dict that represents the declared file relationships from the meta-info

        Every field listed in self['cgformat']['links'] that is present in
        self['cgdata'] contributes one entry; a plain string value is taken
        as the name of an object whose type is the field name.  The
        'columnKeySrc' and 'rowKeySrc' entries of 'cgdata' are always
        included when present.  Each value of the result is a dict with the
        keys 'type' and 'name'.

        Returns an empty dict when there is no 'cgdata' section.
        """
        out = {}
        cgdata = self.get('cgdata', {})
        cgformat = self.get('cgformat', {})

        if 'links' in cgformat:
            for field in cgformat['links']:
                if field not in cgdata:
                    continue
                target = cgdata[field]
                if isinstance(target, self._STRING_TYPES):
                    out[field] = {'type': field, 'name': target}
                else:
                    out[field] = {'type': target['type'], 'name': target['name']}

        for key in ('columnKeySrc', 'rowKeySrc'):
            if key in cgdata:
                link = cgdata[key]
                out[key] = {'type': link['type'], 'name': link['name']}
        return out

    def add_history(self, desc):
        """Append desc to the 'history' list, creating the list if needed."""
        if 'history' not in self:
            self['history'] = []
        self['history'].append(desc)
|
|
205
|
|
206
|
|
class CGDataMatrixObject(CGObjectBase):
    """
    Interface for matrix-shaped CGData objects.

    Every accessor below raises UnimplementedException in this class;
    subclasses are expected to override them.
    """

    def __init__(self):
        CGObjectBase.__init__(self)

    # -- namespaces ---------------------------------------------------

    def get_col_namespace(self):
        """
        Return the name of the column namespace
        """
        raise UnimplementedException()

    def get_row_namespace(self):
        """
        Return the name of the row namespace
        """
        raise UnimplementedException()

    # -- names and name-to-index maps ---------------------------------

    def get_col_list(self):
        """
        Returns names of columns
        """
        raise UnimplementedException()

    def get_row_list(self):
        """
        Returns names of rows
        """
        raise UnimplementedException()

    def get_row_map(self):
        """
        Returns map of row name indexes
        """
        raise UnimplementedException()

    def get_col_map(self):
        """
        Returns map of column name indexes
        """
        raise UnimplementedException()

    # -- positions, sizes and content ---------------------------------

    def get_row_pos(self, row):
        """Hook for subclasses; takes a row argument."""
        raise UnimplementedException()

    def get_col_pos(self, col):
        """Hook for subclasses; takes a col argument."""
        raise UnimplementedException()

    def get_row_count(self):
        """Hook for subclasses."""
        raise UnimplementedException()

    def get_col_count(self):
        """Hook for subclasses."""
        raise UnimplementedException()

    def get_row(self, row_name):
        """Hook for subclasses; takes a row name."""
        raise UnimplementedException()

    def get_col(self, col_name):
        """Hook for subclasses; takes a column name."""
        raise UnimplementedException()
|
|
267
|
|
268
|
|
def cg_new(type_str):
    """
    cg_new takes a type string and creates a new object from the
    class named; it uses an internally defined map to find all
    official CGData data types. So if a 'genomicMatrix' is requested
    a CGData.GenomicMatrix.GenomicMatrix is initialized.

    type_str -- A string name of a CGData type, ie 'genomicMatrix'

    Raises KeyError when type_str is not a key of OBJECT_MAP.
    """
    # The module-level get_type() performs the same OBJECT_MAP lookup and
    # import; the class is instantiated without arguments.
    cls = get_type(type_str)
    return cls()
|
|
283
|
|
def load(path, zip=None):
    """
    load is the automatic CGData loading function. There has to
    be a '.json' file for this function to work. It inspects the
    '.json' file and uses the 'type' field to determine the
    appropriate object loader to use. The object is created
    (using the cg_new function) and its 'load' method is called with the
    data path. If the 'zip' parameter is not None, then it is used as the
    path to a zipfile, and the path parameter is used as a path inside
    the zip file to the object data.

    path -- path to file (in file system space if zip is None, otherwise
            it is the location in the zip file); a trailing '.json' is
            optional
    zip  -- path to zip file (None by default)

    Raises FormatException when the meta-info file cannot be opened on the
    file system or when its type is not in OBJECT_MAP.
    """
    if not path.endswith(".json"):
        path = path + ".json"
    # path is guaranteed to end with '.json' here, so strip that suffix.
    data_path = path[:-len(".json")]

    if zip is None:
        try:
            handle = open(path)
        except IOError:
            raise FormatException("Meta-info (%s) file not found" % (path))
        try:
            meta = json.loads(handle.read())
        finally:
            handle.close()
    else:
        # Previously the zip argument was ignored; read the meta-info from
        # inside the archive, as light_load does.
        archive = ZipFile(zip)
        try:
            handle = archive.open(path)
            try:
                meta = json.loads(handle.read())
            finally:
                handle.close()
        finally:
            archive.close()

    # Throw away empty values
    meta = dict((k, v) for k, v in meta.items() if v is not None)

    cg_type = meta['cgdata']['type']
    if cg_type not in OBJECT_MAP:
        raise FormatException("%s class not found" % (cg_type))

    out = cg_new(cg_type)
    out.update(meta)
    out.path = data_path
    if zip is not None:
        # Tells the object's own load() to read the data from the archive.
        out.zip = zip
    out.load(data_path)
    return out
|
|
321
|
|
322
|
|
def light_load(path, zip=None):
    """
    Create a CGData object from its meta-info only.

    The '.json' file is read (from the file system, or from inside the zip
    file when 'zip' is not None) and an object of the declared type is
    created with cg_new.  The data file itself is not read: the returned
    object has 'path' and 'zip' recorded and 'light_mode' set to True.

    path -- path to file (in file system space if zip is None, otherwise
            it is the location in the zip file); a trailing '.json' is
            optional
    zip  -- path to zip file (None by default)

    Raises FormatException when the meta-info file cannot be opened on the
    file system or when its type is not in OBJECT_MAP.
    """
    if not path.endswith(".json"):
        path = path + ".json"
    # path is guaranteed to end with '.json' here, so strip that suffix.
    data_path = path[:-len(".json")]

    if zip is None:
        try:
            handle = open(path)
        except IOError:
            raise FormatException("Meta-info (%s) file not found" % (path))
        try:
            meta = json.loads(handle.read())
        finally:
            handle.close()
    else:
        archive = ZipFile(zip)
        try:
            # JSON treats every newline flavour as whitespace, so the
            # default read mode is sufficient.
            handle = archive.open(path)
            try:
                meta = json.loads(handle.read())
            finally:
                handle.close()
        finally:
            archive.close()

    # Throw away empty values
    meta = dict((k, v) for k, v in meta.items() if v is not None)

    cg_type = meta['cgdata']['type']
    if cg_type not in OBJECT_MAP:
        raise FormatException("%s class not found" % (cg_type))

    out = cg_new(cg_type)
    out.update(meta)
    out.path = data_path
    out.zip = zip
    out.light_mode = True
    return out
|
|
354
|
|
# Verbosity threshold read by the logging helpers below: debug() writes when
# it is below 1, info() below 2, warn() below 3; error() always writes.
LOG_LEVEL = 2
|
|
357
|
|
def info(eStr):
    """Write eStr to stderr with a 'LOG: ' prefix when LOG_LEVEL is below 2."""
    if LOG_LEVEL < 2:
        # One-element tuple so that a tuple message is printed as-is rather
        # than being unpacked as format arguments.
        sys.stderr.write("LOG: %s\n" % (eStr,))
|
|
362
|
|
def debug(eStr):
    """Write eStr to stderr with a 'DEBUG: ' prefix when LOG_LEVEL is below 1."""
    if LOG_LEVEL < 1:
        # One-element tuple so that a tuple message is printed as-is rather
        # than being unpacked as format arguments.
        sys.stderr.write("DEBUG: %s\n" % (eStr,))
|
|
367
|
|
def warn(eStr):
    """Write eStr to stderr with a 'WARNING: ' prefix when LOG_LEVEL is below 3."""
    if LOG_LEVEL < 3:
        # One-element tuple so that a tuple message is printed as-is rather
        # than being unpacked as format arguments.
        sys.stderr.write("WARNING: %s\n" % (eStr,))
|
|
372
|
|
373
|
|
def error(eStr):
    """Write eStr to stderr with an 'ERROR: ' prefix, regardless of LOG_LEVEL."""
    # One-element tuple so that a tuple message is printed as-is rather than
    # being unpacked as format arguments.
    sys.stderr.write("ERROR: %s\n" % (eStr,))
|
|
377
|