comparison seg2matrix/CGData/RefGene.py @ 31:ab20c0d04f4a

add seg2matrix tool
author jingchunzhu
date Fri, 24 Jul 2015 13:10:11 -0700
parents
children
comparison
equal deleted inserted replaced
30:7a7a52e9b019 31:ab20c0d04f4a
1
2 import csv
3 import re
4 import CGData
5
# Zero-based positions of the fields this module reads from each
# tab-separated row of the current refGene_hg18.table.
COL_CHROM = 2       # chromosome
COL_STRAND = 3      # strand
COL_START = 4       # gene start
COL_END = 5         # gene end
COL_CDSSTART = 6    # CDS start
COL_CDSEND = 7      # CDS end
COL_EXCOUNT = 8     # number of exons
COL_EXSTART = 9     # comma-separated exon starts
COL_EXEND = 10      # comma-separated exon ends
COL_HUGO = 12       # gene name used as the lookup key
17
18
#sometimes the ref table ends with a comma, which makes
#arrays that end with '' when you split
re_comma_end = re.compile(r',$')


def _parse_positions(field, offset=0):
    """
    Convert a comma-separated string of integers into a list of ints.

    A single trailing comma is removed first. A field that is empty after
    that step yields an empty list rather than failing on int('').
    offset is added to every parsed value.
    """
    field = re_comma_end.sub("", field)
    if not field:
        return []
    return [int(p) + offset for p in field.split(',')]


class GeneInfo(object):
    """
    Class to hold information about gene, including exon start/stops

    All coordinate arguments may be ints or strings accepted by int().
    Start coordinates (gene, CDS and exon) are stored shifted by +1 while
    end coordinates are stored unchanged.
    NOTE(review): this looks like a conversion from 0-based half-open to
    1-based closed intervals -- confirm against the table format.
    """

    def __init__(self,
            chrom, strand, start, end, cds_start, cds_end, ex_count, ex_start, ex_end, hugo):
        """
        chrom, strand -- stored as given.
        start, end -- gene bounds, stored as chrom_start / chrom_end.
        cds_start, cds_end -- CDS bounds.
        ex_count -- number of exons.
        ex_start, ex_end -- comma-separated exon starts / ends; a trailing
            comma is tolerated and an empty field gives an empty list.
        hugo -- gene name, stored as self.name.

        Raises ValueError when a coordinate cannot be converted by int().
        """
        self.chrom = chrom
        self.strand = strand
        self.chrom_start = int(start) + 1
        self.chrom_end = int(end)
        self.cds_start = int(cds_start) + 1
        self.cds_end = int(cds_end)

        self.ex_count = int(ex_count)
        self.ex_start = _parse_positions(ex_start, 1)
        self.ex_end = _parse_positions(ex_end)
        self.name = hugo

    def __repr__(self):
        """Return the gene name."""
        return self.name
50
51
class RefGene(CGData.CGObjectBase):
    """
    Collection of GeneInfo records indexed by gene name and by chromosome.

    hugo_map maps a gene name to the list of GeneInfo records with that
    name; chrom_map maps a chromosome to the list of GeneInfo records on it.

    The getters check self.loaded and call self.load() when it is false.
    Neither is defined in this class.
    NOTE(review): presumably both come from CGData.CGObjectBase -- confirm.
    """

    def __init__(self):
        CGData.CGObjectBase.__init__(self)
        self.hugo_map = {}
        self.chrom_map = {}

    def read(self, handle):
        """
        Replace the current contents with the genes parsed from handle.

        handle is consumed by csv.reader as tab-delimited text; every row
        must provide the columns named by the module-level COL_* constants.
        Afterwards each chromosome's gene list is ordered by chrom_start
        and self.loaded is set to True.
        """
        rows = csv.reader(handle, delimiter="\t")

        self.hugo_map = {}
        for row in rows:
            gene = GeneInfo(
                row[COL_CHROM],
                row[COL_STRAND],
                row[COL_START],
                row[COL_END],
                row[COL_CDSSTART],
                row[COL_CDSEND],
                row[COL_EXCOUNT],
                row[COL_EXSTART],
                row[COL_EXEND],
                row[COL_HUGO])
            self.hugo_map.setdefault(row[COL_HUGO], []).append(gene)

        self.chrom_map = {}
        for genes in self.hugo_map.values():
            for gene in genes:
                self.chrom_map.setdefault(gene.chrom, []).append(gene)

        # Use key= instead of a positional comparison function: the
        # comparison-function form of list.sort() is rejected by Python 3,
        # while key= gives the same stable ordering on Python 2 and 3.
        for genes in self.chrom_map.values():
            genes.sort(key=lambda gene: gene.chrom_start)
        self.loaded = True

    def add(self, gene):
        """
        Register one GeneInfo under its chromosome and its name.

        The gene is appended to both lists; the chromosome list is not
        re-sorted and self.loaded is left untouched.
        """
        self.chrom_map.setdefault(gene.chrom, []).append(gene)
        self.hugo_map.setdefault(gene.name, []).append(gene)

    def get_chrom_list(self):
        """Return the chromosomes that have at least one gene."""
        if not self.loaded:
            self.load()
        return self.chrom_map.keys()

    def has_chrom(self, chrom):
        """Return True when chrom has at least one gene."""
        if not self.loaded:
            self.load()
        return chrom in self.chrom_map

    def get_chrom(self, chrom):
        """
        Return the list of GeneInfo records on chrom.

        Raises KeyError when chrom is unknown.
        """
        if not self.loaded:
            self.load()
        return self.chrom_map[chrom]

    def get_gene_list(self):
        """Return the gene names that have at least one record."""
        if not self.loaded:
            self.load()
        return self.hugo_map.keys()

    def get_gene(self, gene):
        """
        Return the list of GeneInfo records named gene.

        Raises KeyError when the name is unknown.
        """
        if not self.loaded:
            self.load()
        return self.hugo_map[gene]