comparison seg2matrix/CGData/__init__.py @ 31:ab20c0d04f4a

add seg2matrix tool
author jingchunzhu
date Fri, 24 Jul 2015 13:10:11 -0700
parents
children
comparison
equal deleted inserted replaced
30:7a7a52e9b019 31:ab20c0d04f4a
1
2 import os
3 import re
4 import json
5 import functools
6 from zipfile import ZipFile
7 import sys
8 import hashlib
9 """
10 CGData object style:
11
12 Every file type documented in the CGData specification has an equivalent object
13 to parse and manipulate the contents of that file type. For <dataType> there
14 should be a CGData.<dataType> object with a <CGData> class. These classes
15 should extend the baseObject class. For loading they implement the 'read'
16 function which will parse the contents of a file from a passed file handle.
17 """
18
19
# Maps each CGData type name to the (module path, class name) pair of the
# class that implements it.
OBJECT_MAP = dict(
    genomicSegment=('CGData.GenomicSegment', 'GenomicSegment'),
    genomicMatrix=('CGData.GenomicMatrix', 'GenomicMatrix'),
    probeMap=('CGData.ProbeMap', 'ProbeMap'),
    probeLoc=('CGData.ProbeLoc', 'ProbeLoc'),
    aliasMap=('CGData.AliasMap', 'AliasMap'),
    idDAG=('CGData.IDDag', 'IDDag'),
    clinicalMatrix=('CGData.ClinicalMatrix', 'ClinicalMatrix'),
    dataSubType=('CGData.DataSubType', 'DataSubType'),
    assembly=('CGData.Assembly', 'Assembly'),
    featureDescription=('CGData.FeatureDescription', 'FeatureDescription'),
    refGene=('CGData.RefGene', 'RefGene'),
    idList=('CGData.IDList', 'IDList'),
)
34
class FormatException(Exception):
    """Exception carrying a single message string."""

    def __init__(self, str):
        # The parameter keeps its original name so keyword callers still work.
        super(FormatException, self).__init__(str)
39
40
def has_type(type_str):
    """Return True when *type_str* is one of the keys of OBJECT_MAP."""
    is_known = type_str in OBJECT_MAP
    return is_known
43
def get_type(type_str):
    """
    Return the class registered in OBJECT_MAP under *type_str*.

    The implementing module is imported on demand. A KeyError propagates
    when *type_str* is not a key of OBJECT_MAP.
    """
    module_path, class_name = OBJECT_MAP[type_str]
    # A non-empty fromlist makes __import__ return the named submodule
    # itself instead of the top-level package.
    loaded = __import__(module_path, globals(), locals(), [class_name])
    return getattr(loaded, class_name)
49
class UnimplementedException(Exception):
    """Exception whose message defaults to "Method not implemented"."""

    def __init__(self, str="Method not implemented"):
        # The parameter keeps its original name so keyword callers still work.
        super(UnimplementedException, self).__init__(str)
53
class CGObjectBase(dict):
    """
    This is the base object for CGData loadable objects.

    The instance is itself a dict holding the object's meta-information
    (the contents of the companion ``<path>.json`` file). Subclasses parse
    and emit the data file through ``read`` and ``write``. The base class
    covers meta-information loading/storing and manipulation as well as
    zip (cgz) file access.
    """
    # Class-level format description. When it is not None, __init__ copies it
    # into self['cgformat']; get_link_map() looks for a 'links' entry in it.
    __format__ = None

    # Python 2 needs the 'U' flag for universal newlines. Python 3 applies
    # universal newlines by default and rejects 'U' from version 3.11 on.
    _READ_MODE = 'rU' if sys.version_info[0] < 3 else 'r'

    def __init__(self):
        self.path = None          # location of the data file
        self.zip = None           # path of the enclosing zip file, if any
        self.light_mode = False   # True: store() writes meta-information only
        self.loaded = False       # set by load(), cleared by unload()
        if 'cgformat' not in self and self.__format__ is not None:
            self['cgformat'] = self.__format__
        super(CGObjectBase, self).__init__()

    def load(self, path=None, **kw):
        """
        Load a data object in from path.

        path -- location of the data file (inside the zip file when
                self.zip is set); defaults to self.path
        kw   -- extra keyword arguments handed to self.read

        On the filesystem a missing data file is skipped without error. If
        ``path + ".json"`` exists on the filesystem its non-null entries are
        merged into this dict. Raises OSError when no path is available.
        """
        if path is None:
            path = self.path
        if path is None:
            raise OSError("Path not defined")

        if self.zip is None:
            if os.path.exists(path):
                with open(path, self._READ_MODE) as dhandle:
                    self.read(dhandle, **kw)
        else:
            archive = ZipFile(self.zip)
            try:
                # Open the resolved path, not self.path, so an explicitly
                # passed path is honoured for zipped objects as well.
                dhandle = archive.open(path, self._READ_MODE)
                try:
                    self.read(dhandle, **kw)
                finally:
                    dhandle.close()
            finally:
                archive.close()

        self.path = path
        meta_path = path + ".json"
        if os.path.exists(meta_path):
            with open(meta_path, self._READ_MODE) as mhandle:
                meta = json.loads(mhandle.read())
            # Throw away empty (null) values.
            self.update(dict((k, v) for k, v in meta.items() if v is not None))
        self.loaded = True

    def unload(self):
        """
        Call to start freeing up memory.

        Delegates to self.free(), which this base class does not define;
        presumably subclasses provide it.
        """
        self.free()
        self.loaded = False

    def store(self, path=None):
        """
        Store an object onto the path provided.

        Always writes ``path + ".json"`` holding the meta-information
        without its 'cgformat' entry. Unless light_mode is set, it also
        records the path on the object and writes the data file through
        self.write. Raises OSError when no path is available.
        """
        if path is None:
            path = self.path
        if path is None:
            raise OSError("Path not defined")

        meta = dict(self)
        meta.pop('cgformat', None)
        with open(path + ".json", "w") as mhandle:
            mhandle.write(json.dumps(meta))

        if not self.light_mode:
            self.path = path
            with open(path, "w") as dhandle:
                self.write(dhandle)

    def load_keyset(self, key_predicate):
        """
        Generator yielding whatever self.read_keyset produces for the data
        file at self.path (read from the zip file when self.zip is set).

        Yields nothing when self.path is None or, on the filesystem, when
        the file does not exist. The handles are closed even if the caller
        stops iterating early.
        """
        if self.path is None:
            return

        if self.zip is None:
            if os.path.exists(self.path):
                with open(self.path, self._READ_MODE) as dhandle:
                    for item in self.read_keyset(dhandle, key_predicate):
                        yield item
        else:
            archive = ZipFile(self.zip)
            try:
                dhandle = archive.open(self.path, self._READ_MODE)
                try:
                    for item in self.read_keyset(dhandle, key_predicate):
                        yield item
                finally:
                    dhandle.close()
            finally:
                archive.close()

    def read_keyset(self, handle, key_predicate=None):
        """To be implemented by subclasses; the base version always raises."""
        raise UnimplementedException()

    def read(self, handle, **kw):
        """
        The read method is implemented by the subclass that inherits from
        CGObjectBase. It is passed a handle to a file (which may be on
        file, in a compressed object, or from a network source). The
        implementing class then uses this handle to populate its data
        structures. Keyword arguments given to load() are forwarded here.
        """
        raise UnimplementedException()

    def write(self, handle):
        """
        The write method is implemented by the subclass that inherits from
        CGObjectBase. It is passed a handle to an output file, on which it
        can use 'write' method calls to emit its data.
        """
        raise UnimplementedException()

    def get_name(self):
        """
        Get object name (the 'name' entry of 'cgdata'), or None.
        """
        return self.get('cgdata', {}).get('name', None)

    def get_type(self):
        """
        Get object type (the 'type' entry of 'cgdata'), or None.
        """
        return self.get('cgdata', {}).get('type', None)

    def get_link_map(self):
        """
        Get a dict that represents the declared file relationships from
        the meta-info.

        Each value has the form {'type': ..., 'name': ...}. Keys are the
        fields named in self['cgformat']['links'] that occur in
        self['cgdata'], plus 'columnKeySrc' / 'rowKeySrc' when present.
        Returns an empty dict when there is no 'cgdata' entry.
        """
        try:
            string_types = basestring   # Python 2: str and unicode
        except NameError:
            string_types = str          # Python 3
        cgdata = self.get('cgdata', {})
        cgformat = self.get('cgformat')
        out = {}

        if cgformat and 'links' in cgformat:
            for field in cgformat['links']:
                if field not in cgdata:
                    continue
                target = cgdata[field]
                if isinstance(target, string_types):
                    # A bare string names an object whose type is the field.
                    out[field] = {'type': field, 'name': target}
                else:
                    out[field] = {'type': target['type'],
                                  'name': target['name']}

        # Key-source links are collected whether or not 'cgformat' is set.
        for key_src in ('columnKeySrc', 'rowKeySrc'):
            if key_src in cgdata:
                link = cgdata[key_src]
                out[key_src] = {'type': link['type'], 'name': link['name']}
        return out

    def add_history(self, desc):
        """Append desc to the 'history' list, creating the list if needed."""
        if 'history' not in self:
            self['history'] = []
        self['history'].append(desc)
205
206
class CGDataMatrixObject(CGObjectBase):
    """
    Interface for matrix-shaped CGData objects.

    Every accessor below raises UnimplementedException in this class.
    """

    def __init__(self):
        CGObjectBase.__init__(self)

    # -- namespaces ---------------------------------------------------------

    def get_col_namespace(self):
        """
        Return the name of the column namespace
        """
        raise UnimplementedException()

    def get_row_namespace(self):
        """
        Return the name of the row namespace
        """
        raise UnimplementedException()

    # -- names and name-to-index maps ---------------------------------------

    def get_col_list(self):
        """
        Returns names of columns
        """
        raise UnimplementedException()

    def get_row_list(self):
        """
        Returns names of rows
        """
        raise UnimplementedException()

    def get_row_map(self):
        """
        Returns map of row name indexes
        """
        raise UnimplementedException()

    def get_col_map(self):
        """
        Returns map of column name indexes
        """
        raise UnimplementedException()

    # -- positions, counts and content --------------------------------------

    def get_row_pos(self, row):
        """Not implemented in this class."""
        raise UnimplementedException()

    def get_col_pos(self, col):
        """Not implemented in this class."""
        raise UnimplementedException()

    def get_row_count(self):
        """Not implemented in this class."""
        raise UnimplementedException()

    def get_col_count(self):
        """Not implemented in this class."""
        raise UnimplementedException()

    def get_row(self, row_name):
        """Not implemented in this class."""
        raise UnimplementedException()

    def get_col(self, col_name):
        """Not implemented in this class."""
        raise UnimplementedException()
267
268
def cg_new(type_str):
    """
    cg_new takes a type string and creates a new object from the
    class named. It uses an internally defined map to find all
    official CGData data types, so if a 'genomicMatrix' is requested
    a CGData.GenomicMatrix.GenomicMatrix is initialized.

    type_str -- A string name of a CGData type, ie 'genomicMatrix'

    Raises KeyError when type_str is not a registered type.
    """
    # get_type() already performs the OBJECT_MAP lookup and the import;
    # reuse it rather than repeating that logic here.
    cls = get_type(type_str)
    return cls()
283
def load(path, zip=None):
    """
    load is the automatic CGData loading function. There has to
    be a '.json' file for this function to work. It inspects the
    '.json' file and uses the 'type' field to determine the
    appropriate object loader to use. The object is created
    (using the cg_new function), updated with the meta-information
    and asked to load its data. If the 'zip' parameter is not None,
    then it is used as the path to a zipfile, and the path parameter
    is used as a path inside the zip file to the object data.

    path -- path to file (in file system space if zip is None, otherwise
            it is the location in the zip file); a trailing '.json' is
            optional
    zip  -- path to zip file (None by default)

    Raises FormatException when the meta-info file cannot be opened on
    the filesystem or when its type is not a key of OBJECT_MAP.
    """
    if not path.endswith(".json"):
        path = path + ".json"

    # path now always ends in ".json"; the data file is the same name
    # without that suffix.
    data_path = path[:-len(".json")]

    if zip is None:
        try:
            with open(path) as handle:
                meta = json.loads(handle.read())
        except IOError:
            raise FormatException("Meta-info (%s) file not found" % (path))
    else:
        # Read the meta-information from inside the zip file, the same
        # way light_load does.
        archive = ZipFile(zip)
        try:
            handle = archive.open(path)
            try:
                meta = json.loads(handle.read())
            finally:
                handle.close()
        finally:
            archive.close()

    # Throw away empty values
    meta = dict((k, v) for k, v in meta.items() if v is not None)

    type_str = meta['cgdata']['type']
    if type_str not in OBJECT_MAP:
        raise FormatException("%s class not found" % (type_str))

    out = cg_new(type_str)
    out.update(meta)
    out.path = data_path
    # Record the zip file so the object's own load() reads from it.
    out.zip = zip
    out.load(data_path)
    return out
321
322
def light_load(path, zip=None):
    """
    Create a CGData object from its meta-information only; the data file
    itself is not read.

    The '.json' file is parsed, null values are dropped, and an object of
    the declared type is created with the meta-information, its data
    path, its zip file and light_mode = True.

    path -- path to file (in file system space if zip is None, otherwise
            it is the location in the zip file); a trailing '.json' is
            optional
    zip  -- path to zip file (None by default)

    Raises FormatException when the meta-info file cannot be opened on
    the filesystem or when its type is not a key of OBJECT_MAP.
    """
    if not path.endswith(".json"):
        path = path + ".json"

    # path now always ends in ".json"; the data file is the same name
    # without that suffix.
    data_path = path[:-len(".json")]

    if zip is None:
        try:
            with open(path) as handle:
                meta = json.loads(handle.read())
        except IOError:
            raise FormatException("Meta-info (%s) file not found" % (path))
    else:
        archive = ZipFile(zip)
        try:
            handle = archive.open(path)
            try:
                meta = json.loads(handle.read())
            finally:
                handle.close()
        finally:
            archive.close()

    # Throw away empty values
    meta = dict((k, v) for k, v in meta.items() if v is not None)

    type_str = meta['cgdata']['type']
    if type_str not in OBJECT_MAP:
        raise FormatException("%s class not found" % (type_str))

    out = cg_new(type_str)
    out.update(meta)
    out.path = data_path
    out.zip = zip
    out.light_mode = True
    return out
354
# Verbosity threshold for the logging helpers below: debug() writes when the
# level is below 1, info() below 2 and warn() below 3; error() always writes.
LOG_LEVEL = 2
357
def info(eStr):
    """Write "LOG: <eStr>" to stderr when LOG_LEVEL is below 2."""
    if LOG_LEVEL < 2:
        # The one-element tuple keeps a tuple-valued eStr from being
        # unpacked as several format arguments.
        sys.stderr.write("LOG: %s\n" % (eStr,))
362
def debug(eStr):
    """Write "DEBUG: <eStr>" to stderr when LOG_LEVEL is below 1."""
    if LOG_LEVEL < 1:
        # The one-element tuple keeps a tuple-valued eStr from being
        # unpacked as several format arguments.
        sys.stderr.write("DEBUG: %s\n" % (eStr,))
367
def warn(eStr):
    """Write "WARNING: <eStr>" to stderr when LOG_LEVEL is below 3."""
    if LOG_LEVEL < 3:
        # The one-element tuple keeps a tuple-valued eStr from being
        # unpacked as several format arguments.
        sys.stderr.write("WARNING: %s\n" % (eStr,))
372
373
def error(eStr):
    """Write "ERROR: <eStr>" to stderr, regardless of LOG_LEVEL."""
    # The one-element tuple keeps a tuple-valued eStr from being unpacked
    # as several format arguments.
    sys.stderr.write("ERROR: %s\n" % (eStr,))
377