Mercurial > repos > melissacline > ucsc_cancer_utilities
diff seg2matrix/CGData/RefGene.py @ 31:ab20c0d04f4a
add seg2matrix tool
author | jingchunzhu |
---|---|
date | Fri, 24 Jul 2015 13:10:11 -0700 |
parents | |
children |
line wrap: on
line diff
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/seg2matrix/CGData/RefGene.py Fri Jul 24 13:10:11 2015 -0700
# @@ -0,0 +1,124 @@
"""Read a tab-delimited refGene table into gene records.

Records are indexed two ways: by gene name (``hugo_map``) and by chromosome
(``chrom_map``, each list sorted by gene start position).
"""

import csv
import re

import CGData

# Column definitions for the current refGene_hg18.table (0-based indices
# into each tab-separated row).
COL_CHROM = 2
COL_STRAND = 3
COL_START = 4
COL_END = 5
COL_CDSSTART = 6
COL_CDSEND = 7
COL_EXCOUNT = 8
COL_EXSTART = 9
COL_EXEND = 10
COL_HUGO = 12


# Sometimes the ref table ends with a comma, which makes
# arrays that end with '' when you split.
re_comma_end = re.compile(r',$')


def _parse_positions(field, offset=0):
    """Turn a comma-separated list of integers into a list of ints.

    A single trailing comma is ignored.  ``offset`` is added to every value.
    An empty field yields an empty list instead of failing on ``int('')``.
    """
    trimmed = re_comma_end.sub("", field)
    if not trimmed:
        return []
    return [int(pos) + offset for pos in trimmed.split(',')]


class GeneInfo(object):
    """
    Class to hold information about gene, including exon start/stops.

    All coordinate arguments may be strings; they are converted with
    ``int``.  Every *start* coordinate (gene, CDS and each exon) is stored
    as the input value plus one, while *end* coordinates are stored
    unchanged.
    NOTE(review): presumably this converts 0-based, half-open table
    coordinates to 1-based inclusive ones -- confirm against the table
    schema.

    Attributes:
        chrom, strand: stored as given.
        chrom_start, chrom_end: gene start (+1) and end.
        cds_start, cds_end: coding-region start (+1) and end.
        ex_count: exon count as an int.
        ex_start, ex_end: lists of exon starts (+1) and exon ends.
        name: the ``hugo`` argument; also used as the ``repr``.
    """

    def __init__(self,
                 chrom, strand, start, end, cds_start, cds_end,
                 ex_count, ex_start, ex_end, hugo):
        self.chrom = chrom
        self.strand = strand
        self.chrom_start = int(start) + 1
        self.chrom_end = int(end)
        self.cds_start = int(cds_start) + 1
        self.cds_end = int(cds_end)

        self.ex_count = int(ex_count)
        self.ex_start = _parse_positions(ex_start, 1)
        self.ex_end = _parse_positions(ex_end)
        self.name = hugo

    def __repr__(self):
        """Return the gene name."""
        return self.name


class RefGene(CGData.CGObjectBase):
    """Collection of ``GeneInfo`` records read from a refGene table.

    ``hugo_map`` maps gene name -> list of ``GeneInfo`` (a name may occur on
    several rows); ``chrom_map`` maps chromosome -> list of ``GeneInfo``.

    The ``get_*`` / ``has_chrom`` accessors call ``self.load()`` first when
    ``self.loaded`` is false.
    NOTE(review): ``load`` and the initial value of ``loaded`` are not
    defined in this module; they are assumed to come from
    ``CGData.CGObjectBase`` -- confirm.
    """

    def __init__(self):
        CGData.CGObjectBase.__init__(self)
        self.hugo_map = {}
        self.chrom_map = {}

    def read(self, handle):
        """Replace both indexes with the genes parsed from ``handle``.

        ``handle`` is any iterable of tab-delimited lines accepted by
        ``csv.reader``.  Blank rows and rows whose first field starts with
        ``#`` (header/comment lines) are skipped.  After reading, each
        chromosome's gene list is sorted by ``chrom_start`` and
        ``self.loaded`` is set to ``True``.

        Raises:
            IndexError: a data row has fewer columns than expected.
            ValueError: a coordinate field is not an integer.
        """
        reader = csv.reader(handle, delimiter="\t")

        self.hugo_map = {}
        self.chrom_map = {}
        for row in reader:
            if not row or row[0].startswith('#'):
                continue
            gene = GeneInfo(
                row[COL_CHROM],
                row[COL_STRAND],
                row[COL_START],
                row[COL_END],
                row[COL_CDSSTART],
                row[COL_CDSEND],
                row[COL_EXCOUNT],
                row[COL_EXSTART],
                row[COL_EXEND],
                row[COL_HUGO])
            self.hugo_map.setdefault(gene.name, []).append(gene)
            self.chrom_map.setdefault(gene.chrom, []).append(gene)

        # Use key= rather than a positional cmp function: the cmp form is
        # rejected by Python 3, while key= works on Python 2 and 3 alike.
        # The sort is stable, so genes sharing a start keep file order.
        for genes in self.chrom_map.values():
            genes.sort(key=lambda gene: gene.chrom_start)
        self.loaded = True

    def add(self, gene):
        """Register ``gene`` in both indexes.

        The gene is appended to its chromosome's list without re-sorting,
        and ``self.loaded`` is left untouched.
        """
        self.chrom_map.setdefault(gene.chrom, []).append(gene)
        self.hugo_map.setdefault(gene.name, []).append(gene)

    def _ensure_loaded(self):
        """Call ``self.load()`` if the data has not been marked as loaded."""
        if not self.loaded:
            self.load()

    def get_chrom_list(self):
        """Return a list of the chromosome names present."""
        self._ensure_loaded()
        return list(self.chrom_map)

    def has_chrom(self, chrom):
        """Return True if any gene is recorded on ``chrom``."""
        self._ensure_loaded()
        return chrom in self.chrom_map

    def get_chrom(self, chrom):
        """Return the list of genes on ``chrom``.

        Raises:
            KeyError: ``chrom`` is not present.
        """
        self._ensure_loaded()
        return self.chrom_map[chrom]

    def get_gene_list(self):
        """Return a list of the gene names present."""
        self._ensure_loaded()
        return list(self.hugo_map)

    def get_gene(self, gene):
        """Return the list of ``GeneInfo`` records named ``gene``.

        Raises:
            KeyError: ``gene`` is not present.
        """
        self._ensure_loaded()
        return self.hugo_map[gene]