31
|
1
|
|
2 import csv
|
|
3 import re
|
|
4 import CGData
|
|
5
|
|
# Zero-based indexes of the fields read from each tab-separated row of the
# current refGene_hg18.table layout.
COL_CHROM = 2
COL_STRAND = 3
COL_START = 4
COL_END = 5
COL_CDSSTART = 6
COL_CDSEND = 7
COL_EXCOUNT = 8
COL_EXSTART = 9
COL_EXEND = 10
COL_HUGO = 12


# Matches a single trailing comma.  Comma-separated list fields in the ref
# table sometimes end with one, and splitting such a field as-is would leave
# an empty '' element at the end of the resulting list.
re_comma_end = re.compile(r',$')
|
|
22
|
|
23
|
|
class GeneInfo(object):
    """
    Class to hold information about a gene, including exon start/stops.

    Every start coordinate (``chrom_start``, ``cds_start`` and each entry of
    ``ex_start``) is stored as the supplied value plus one; end coordinates
    are stored unchanged.
    NOTE(review): this looks like a conversion from 0-based, half-open table
    coordinates to 1-based inclusive ones -- confirm against the table spec.

    Attributes:
        chrom, strand -- stored exactly as passed in
        chrom_start, chrom_end -- ints derived from ``start`` / ``end``
        cds_start, cds_end -- ints derived from ``cds_start`` / ``cds_end``
        ex_count -- int(ex_count)
        ex_start, ex_end -- lists of ints parsed from comma-separated strings
        name -- the ``hugo`` argument; also used as the repr
    """

    def __init__(self, chrom, strand, start, end, cds_start, cds_end,
                 ex_count, ex_start, ex_end, hugo):
        """
        Build a gene record from raw (typically string) table fields.

        ``start``, ``end``, ``cds_start``, ``cds_end`` and ``ex_count`` must
        be accepted by int().  ``ex_start`` and ``ex_end`` are comma-separated
        strings of integers and may carry one trailing comma.

        Raises ValueError when a numeric field cannot be converted.
        """
        self.chrom = chrom
        self.strand = strand
        self.chrom_start = int(start) + 1
        self.chrom_end = int(end)
        self.cds_start = int(cds_start) + 1
        self.cds_end = int(cds_end)

        self.ex_count = int(ex_count)
        # Starts get the same +1 shift as chrom_start/cds_start; ends do not.
        self.ex_start = self._parse_positions(ex_start, 1)
        self.ex_end = self._parse_positions(ex_end)
        self.name = hugo

    @staticmethod
    def _parse_positions(field, offset=0):
        """
        Turn a comma-separated string of integers into a list of ints,
        adding ``offset`` to each value.

        Surrounding whitespace and one trailing comma are ignored, so
        "10,20," yields two values.  A field with nothing left after that
        yields an empty list rather than failing on int('').
        """
        text = field.strip()
        if text.endswith(','):
            # Drop only ONE comma: an empty element elsewhere in the list is
            # still malformed data and must raise ValueError below.
            text = text[:-1]
        if not text:
            return []
        return [int(token) + offset for token in text.split(',')]

    def __repr__(self):
        return self.name
|
|
50
|
|
51
|
|
class RefGene(CGData.CGObjectBase):
    """
    Collection of GeneInfo records parsed from a refGene table.

    Two indexes are maintained:
        hugo_map  -- gene name -> list of GeneInfo records with that name
        chrom_map -- chromosome -> list of GeneInfo records on it

    The accessor methods check ``self.loaded`` and call ``self.load()`` first
    when it is false.  Neither is defined in this class; presumably both are
    provided by CGData.CGObjectBase -- verify against that base class.
    """

    def __init__(self):
        CGData.CGObjectBase.__init__(self)
        self.hugo_map = {}
        self.chrom_map = {}

    def read(self, handle):
        """
        Rebuild both indexes from a tab-delimited refGene table.

        ``handle`` is anything csv.reader accepts (e.g. an open text file).
        Any previously held genes are discarded.  After reading, each
        chromosome's gene list is sorted by ``chrom_start`` and
        ``self.loaded`` is set to True.

        Raises IndexError when a non-blank row has too few columns, and
        whatever GeneInfo raises for unparsable fields.
        """
        self.hugo_map = {}
        for row in csv.reader(handle, delimiter="\t"):
            if not row:
                # csv.reader yields [] for a blank line (commonly the last
                # line of a file); there is nothing to parse in it.
                continue
            gene = GeneInfo(
                row[COL_CHROM],
                row[COL_STRAND],
                row[COL_START],
                row[COL_END],
                row[COL_CDSSTART],
                row[COL_CDSEND],
                row[COL_EXCOUNT],
                row[COL_EXSTART],
                row[COL_EXEND],
                row[COL_HUGO])
            self.hugo_map.setdefault(row[COL_HUGO], []).append(gene)

        self.chrom_map = {}
        for genes in self.hugo_map.values():
            for gene in genes:
                self.chrom_map.setdefault(gene.chrom, []).append(gene)

        for genes in self.chrom_map.values():
            # key= rather than a positional cmp function: list.sort() no
            # longer accepts cmp on Python 3, and key= gives the same stable
            # ordering on Python 2 as well.
            genes.sort(key=lambda gene: gene.chrom_start)
        self.loaded = True

    def add(self, gene):
        """
        Register one gene record under ``gene.chrom`` and ``gene.name``.

        The record is appended to both indexes; the chromosome list is not
        re-sorted and ``self.loaded`` is left untouched.
        """
        self.chrom_map.setdefault(gene.chrom, []).append(gene)
        self.hugo_map.setdefault(gene.name, []).append(gene)

    def _load_if_needed(self):
        """Call ``self.load()`` unless ``self.loaded`` is already true."""
        if not self.loaded:
            self.load()

    def get_chrom_list(self):
        """Return a list of the chromosome names that have genes."""
        self._load_if_needed()
        # list(...) so callers get a real list on Python 3 too, where
        # dict.keys() is a view.
        return list(self.chrom_map)

    def has_chrom(self, chrom):
        """Return True when at least one gene is indexed under ``chrom``."""
        self._load_if_needed()
        return chrom in self.chrom_map

    def get_chrom(self, chrom):
        """
        Return the list of GeneInfo records indexed under ``chrom``.

        Raises KeyError when the chromosome is unknown.
        """
        self._load_if_needed()
        return self.chrom_map[chrom]

    def get_gene_list(self):
        """Return a list of all gene names in the collection."""
        self._load_if_needed()
        return list(self.hugo_map)

    def get_gene(self, gene):
        """
        Return the list of GeneInfo records named ``gene``.

        Raises KeyError when the name is unknown.
        """
        self._load_if_needed()
        return self.hugo_map[gene]
|