Mercurial > repos > melissacline > ucsc_cancer_utilities
diff seg2matrix/CGData/__init__.py @ 31:ab20c0d04f4a
add seg2matrix tool
author | jingchunzhu |
---|---|
date | Fri, 24 Jul 2015 13:10:11 -0700 |
parents | |
children |
line wrap: on
line diff
# Source file: seg2matrix/CGData/__init__.py
"""
CGData object style:

Every file type documented in the CGData specification has an equivalent object
to parse and manipulate the contents of that file type. For <dataType> there
should be a CGData.<dataType> object with a <CGData> class. These classes
should extend the CGObjectBase class. For loading they implement the 'read'
function which will parse the contents of a file from a passed file handle.
"""

import os
import re
import json
import functools
from zipfile import ZipFile
import sys
import hashlib
import io


_PY2 = sys.version_info[0] < 3

# Python 2 needs the explicit 'U' flag to get universal newlines; Python 3
# text mode translates newlines by default and recent versions reject 'U'.
_TEXT_READ_MODE = 'rU' if _PY2 else 'r'

try:
    _STRING_TYPES = (str, unicode)
except NameError:
    # Python 3: there is no separate unicode type.
    _STRING_TYPES = (str,)

_META_SUFFIX = ".json"


# Maps a CGData type name to the (module name, class name) that implements it.
OBJECT_MAP = {
    'genomicSegment': ('CGData.GenomicSegment', 'GenomicSegment'),
    'genomicMatrix': ('CGData.GenomicMatrix', 'GenomicMatrix'),
    'probeMap': ('CGData.ProbeMap', 'ProbeMap'),
    'probeLoc': ('CGData.ProbeLoc', 'ProbeLoc'),
    'aliasMap': ('CGData.AliasMap', 'AliasMap'),
    'idDAG': ('CGData.IDDag', 'IDDag'),
    'clinicalMatrix': ('CGData.ClinicalMatrix', 'ClinicalMatrix'),
    'dataSubType': ('CGData.DataSubType', 'DataSubType'),
    'assembly': ('CGData.Assembly', 'Assembly'),
    'featureDescription': ('CGData.FeatureDescription', 'FeatureDescription'),
    'refGene': ('CGData.RefGene', 'RefGene'),
    'idList': ('CGData.IDList', 'IDList'),
}


class FormatException(Exception):
    """Raised when meta-info is missing or names an unknown CGData type."""

    # The parameter keeps its original name 'str' so that keyword callers
    # continue to work, even though it shadows the builtin inside __init__.
    def __init__(self, str):
        Exception.__init__(self, str)


class UnimplementedException(Exception):
    """Raised by base-class methods that a subclass is expected to override."""

    def __init__(self, str="Method not implemented"):
        Exception.__init__(self, str)


def has_type(type_str):
    """Return True if type_str is a key of OBJECT_MAP."""
    return type_str in OBJECT_MAP


def get_type(type_str):
    """
    Import and return the class registered for type_str in OBJECT_MAP.

    Raises KeyError if type_str is not registered, and whatever the import
    raises if the implementing module cannot be imported.
    """
    mod_name, cls_name = OBJECT_MAP[type_str]
    # A non-empty fromlist makes __import__ return the named submodule
    # instead of the top-level package.
    module = __import__(mod_name, globals(), locals(), [cls_name])
    return getattr(module, cls_name)


def _open_zip_member(archive, member):
    """Open member of an already opened ZipFile for text reading."""
    if _PY2:
        return archive.open(member, 'rU')
    # Python 3 ZipFile.open only accepts 'r'/'w' and yields bytes, so wrap
    # the member to hand text to the readers, as the plain-file branch does.
    return io.TextIOWrapper(archive.open(member, 'r'))


def _drop_empty(meta):
    """Return a copy of meta without the entries whose value is None."""
    return dict((k, v) for k, v in meta.items() if v is not None)


def _split_paths(path):
    """Return (meta path, data path); the meta path is the data path + '.json'."""
    if not path.endswith(_META_SUFFIX):
        path = path + _META_SUFFIX
    return path, path[:-len(_META_SUFFIX)]


def _read_meta(path, zip=None):
    """
    Parse the JSON meta-info at path and drop its None-valued entries.

    When zip is None, path is read from the file system and an IOError is
    reported as a FormatException. Otherwise zip is the path of a zip file
    and path names a member inside it.
    """
    if zip is None:
        try:
            with open(path, _TEXT_READ_MODE) as handle:
                meta = json.loads(handle.read())
        except IOError:
            raise FormatException("Meta-info (%s) file not found" % (path))
    else:
        with ZipFile(zip) as archive:
            with _open_zip_member(archive, path) as handle:
                meta = json.loads(handle.read())
    return _drop_empty(meta)


def _new_from_meta(meta):
    """Create the object named by meta['cgdata']['type'] and copy meta into it."""
    type_str = meta['cgdata']['type']
    if type_str not in OBJECT_MAP:
        raise FormatException("%s class not found" % (type_str))
    out = cg_new(type_str)
    out.update(meta)
    return out


class CGObjectBase(dict):
    """
    This is the base object for CGData loadable objects.

    The methods covered in the base case cover usage meta-information
    loading/unloading and manipulation as well as zip (cgz) file access.
    The object itself is a dict holding the meta-information.
    """

    # Default value stored under the 'cgformat' key of new instances.
    # NOTE(review): this name shadows object.__format__, so format(obj) will
    # not work on instances; it is kept because it is the existing interface.
    __format__ = None

    def __init__(self):
        super(CGObjectBase, self).__init__()
        self.path = None
        self.zip = None
        self.light_mode = False
        self.loaded = False
        if 'cgformat' not in self and self.__format__ is not None:
            self['cgformat'] = self.__format__

    def load(self, path=None, **kw):
        """
        Load a data object in from path.

        path -- location of the data; defaults to self.path. When self.zip
                is set, path is the member name inside that zip file.
        kw   -- passed through to self.read.

        Raises OSError if neither path nor self.path is defined. If a
        'path.json' file exists on the file system, its non-None entries
        are merged into this object.
        """
        if path is None and self.path is not None:
            path = self.path
        if path is None:
            raise OSError("Path not defined")

        if self.zip is None:
            # A missing data file is tolerated: only the meta-info is loaded.
            if os.path.exists(path):
                with open(path, _TEXT_READ_MODE) as dhandle:
                    self.read(dhandle, **kw)
        else:
            with ZipFile(self.zip) as archive:
                # Use the resolved path, so an explicit argument works even
                # when self.path has not been set yet.
                with _open_zip_member(archive, path) as dhandle:
                    self.read(dhandle, **kw)

        self.path = path
        meta_path = path + _META_SUFFIX
        if os.path.exists(meta_path):
            with open(meta_path, _TEXT_READ_MODE) as mhandle:
                self.update(_drop_empty(json.loads(mhandle.read())))
        self.loaded = True

    def unload(self):
        """Call to start freeing up memory"""
        # free() is not defined on this base class; it has to be supplied by
        # the subclass.
        self.free()
        self.loaded = False

    def store(self, path=None):
        """
        Store an object onto the path provided.

        Writes the meta-information (without 'cgformat') to 'path.json' and,
        unless the object is in light mode, the data to path.
        Raises OSError if neither path nor self.path is defined.
        """
        if path is None and self.path is not None:
            path = self.path
        if path is None:
            raise OSError("Path not defined")

        meta = dict(self)
        meta.pop('cgformat', None)
        with open(path + _META_SUFFIX, "w") as mhandle:
            mhandle.write(json.dumps(meta))

        if not self.light_mode:
            self.path = path
            with open(path, "w") as dhandle:
                self.write(dhandle)

    def load_keyset(self, key_predicate):
        """
        Yield the items produced by self.read_keyset for the data at self.path.

        Nothing is yielded when self.path is None, or when the object is not
        zip-backed and the file does not exist. The underlying handle is
        closed even if the caller stops iterating early.
        """
        if self.path is None:
            return
        if self.zip is None:
            if os.path.exists(self.path):
                with open(self.path, _TEXT_READ_MODE) as dhandle:
                    for item in self.read_keyset(dhandle, key_predicate):
                        yield item
        else:
            with ZipFile(self.zip) as archive:
                with _open_zip_member(archive, self.path) as dhandle:
                    for item in self.read_keyset(dhandle, key_predicate):
                        yield item

    def read_keyset(self, handle, key_predicate=None):
        """Implemented by subclasses; the base version always raises."""
        raise UnimplementedException()

    def read(self, handle):
        """
        The read method is implemented by the subclass that
        inherits from CGObjectBase. It is passed a handle
        to a file (which may be on file, in a compressed object, or
        from a network source). The implementing class then uses this handle
        to populate its data structures.
        """
        raise UnimplementedException()

    def write(self, handle):
        """
        The write method is implemented by the subclass that
        inherits from CGObjectBase. It is passed a handle to an
        output file, which it can use 'write' method calls to emit
        its data.
        """
        raise UnimplementedException()

    def get_name(self):
        """
        Get object name (None if the meta-info does not define one)
        """
        return self.get('cgdata', {}).get('name', None)

    def get_type(self):
        """
        Get object type (None if the meta-info does not define one)
        """
        return self.get('cgdata', {}).get('type', None)

    def get_link_map(self):
        """
        Get a dict that represents the declared file relationships from the
        meta-info.

        Each value is a dict with 'type' and 'name' keys. Entries come from
        the fields listed in self['cgformat']['links'] that are present in
        self['cgdata'], plus 'columnKeySrc' and 'rowKeySrc' when present.
        An object without 'cgdata' has no links.
        """
        out = {}
        cgdata = self.get('cgdata', {})
        links = (self.get('cgformat') or {}).get('links', ())

        for field in links:
            if field not in cgdata:
                continue
            target = cgdata[field]
            if isinstance(target, _STRING_TYPES):
                # A bare string is the linked name; the field is its type.
                out[field] = {'type': field, 'name': target}
            else:
                out[field] = {'type': target['type'], 'name': target['name']}

        for key in ('columnKeySrc', 'rowKeySrc'):
            if key in cgdata:
                link = cgdata[key]
                out[key] = {'type': link['type'], 'name': link['name']}
        return out

    def add_history(self, desc):
        """Append desc to the 'history' list, creating the list if needed."""
        self.setdefault('history', []).append(desc)


class CGDataMatrixObject(CGObjectBase):
    """
    Base class for matrix-shaped CGData objects.

    Every accessor below raises UnimplementedException until a subclass
    overrides it.
    """

    def __init__(self):
        CGObjectBase.__init__(self)

    def get_col_namespace(self):
        """
        Return the name of the column namespace
        """
        raise UnimplementedException()

    def get_row_namespace(self):
        """
        Return the name of the row namespace
        """
        raise UnimplementedException()

    def get_col_list(self):
        """
        Returns names of columns
        """
        raise UnimplementedException()

    def get_row_list(self):
        """
        Returns names of rows
        """
        raise UnimplementedException()

    def get_row_map(self):
        """
        Returns map of row name indexes
        """
        raise UnimplementedException()

    def get_col_map(self):
        """
        Returns map of column name indexes
        """
        raise UnimplementedException()

    def get_row_pos(self, row):
        raise UnimplementedException()

    def get_col_pos(self, col):
        raise UnimplementedException()

    def get_row_count(self):
        raise UnimplementedException()

    def get_col_count(self):
        raise UnimplementedException()

    def get_row(self, row_name):
        raise UnimplementedException()

    def get_col(self, col_name):
        raise UnimplementedException()


def cg_new(type_str):
    """
    cg_new takes a type string and creates a new object from the
    class named, it uses an internally defined map to find all
    official CGData data types. So if a 'genomicMatrix' is requested
    a CGData.GenomicMatrix.GenomicMatrix is initialized.

    type_str -- A string name of a CGData type, ie 'genomicMatrix'
    """
    return get_type(type_str)()


def load(path, zip=None):
    """
    load is the automatic CGData loading function. There has to
    be a '.json' file for this function to work. It inspects the
    '.json' file and uses the 'type' field to determine the
    appropriate object loader to use. The object is created
    (using the cg_new function) and the 'read' method is passed
    a handle to the data file. If the 'zip' parameter is not None,
    then it is used as the path to a zipfile, and the path parameter
    is used as a path inside the zip file to the object data

    path -- path to file (in file system space if zip is None, otherwise
            it is the location in the zip file)
    zip -- path to zip file (None by default)

    Raises FormatException if the meta-info file cannot be read from the
    file system or if its type is not in OBJECT_MAP.
    """
    meta_path, data_path = _split_paths(path)
    out = _new_from_meta(_read_meta(meta_path, zip))
    out.path = data_path
    out.zip = zip
    out.load(data_path)
    return out


def light_load(path, zip=None):
    """
    Create the object described by the '.json' meta-info without reading
    its data.

    The returned object has its path and zip attributes set and light_mode
    switched on; the data can be read later through its own load method.

    path -- path to file (in file system space if zip is None, otherwise
            it is the location in the zip file)
    zip -- path to zip file (None by default)

    Raises FormatException if the meta-info file cannot be read from the
    file system or if its type is not in OBJECT_MAP.
    """
    meta_path, data_path = _split_paths(path)
    out = _new_from_meta(_read_meta(meta_path, zip))
    out.path = data_path
    out.zip = zip
    out.light_mode = True
    return out


# Verbosity threshold read by the logging helpers below at call time:
# debug prints when LOG_LEVEL < 1, info when < 2, warn when < 3, and error
# always. The default of 2 therefore shows warnings and errors only.
LOG_LEVEL = 2


def info(eStr):
    """Write eStr to stderr with a 'LOG: ' prefix when LOG_LEVEL < 2."""
    if LOG_LEVEL < 2:
        sys.stderr.write("LOG: %s\n" % (eStr))


def debug(eStr):
    """Write eStr to stderr with a 'DEBUG: ' prefix when LOG_LEVEL < 1."""
    if LOG_LEVEL < 1:
        sys.stderr.write("DEBUG: %s\n" % (eStr))


def warn(eStr):
    """Write eStr to stderr with a 'WARNING: ' prefix when LOG_LEVEL < 3."""
    if LOG_LEVEL < 3:
        sys.stderr.write("WARNING: %s\n" % (eStr))


def error(eStr):
    """Write eStr to stderr with an 'ERROR: ' prefix, at any LOG_LEVEL."""
    sys.stderr.write("ERROR: %s\n" % (eStr))