annotate seg2matrix/CGData/RefGene.py @ 52:3a036a34c362

better handle of input file
author jingchunzhu
date Thu, 17 Sep 2015 15:00:45 -0700
parents ab20c0d04f4a
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
31
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
1
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
2 import csv
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
3 import re
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
4 import CGData
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
5
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
# Zero-based column indexes into one tab-separated row of the current
# refGene_hg18.table; RefGene.read() uses them to pick the fields that
# are handed to GeneInfo.
COL_CHROM = 2
COL_STRAND = 3
COL_START = 4
COL_END = 5
COL_CDSSTART = 6
COL_CDSEND = 7
COL_EXCOUNT = 8
COL_EXSTART = 9
COL_EXEND = 10
COL_HUGO = 12


# The comma-separated list fields of the ref table sometimes end with a
# comma, and splitting such a value on ',' leaves a trailing '' element.
# This pattern matches that final comma so it can be removed first.
re_comma_end = re.compile(r',$')
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
22
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
23
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
class GeneInfo(object):
    """
    Class to hold information about gene, including exon start/stops

    Attributes set by the constructor:
      chrom, strand          -- stored exactly as passed in
      chrom_start, chrom_end -- int(start) + 1 and int(end)
      cds_start, cds_end     -- int(cds_start) + 1 and int(cds_end)
      ex_count               -- int(ex_count)
      ex_start, ex_end       -- lists of ints parsed from the
                                comma-separated exon fields; every exon
                                start is shifted by +1, ends are not
      name                   -- the ``hugo`` argument

    NOTE(review): the "+1 on starts only" rule looks like a conversion
    from 0-based half-open to 1-based inclusive coordinates -- confirm
    against the ref table format.
    """

    def __init__(self,
                 chrom, strand, start, end, cds_start, cds_end,
                 ex_count, ex_start, ex_end, hugo):
        """
        Build a gene record from the raw (string) fields of a table row.

        ``ex_start`` and ``ex_end`` are comma-separated strings that may
        carry one trailing comma; an empty field gives an empty list.
        Raises ValueError when a numeric field is not a valid integer.
        """
        self.chrom = chrom
        self.strand = strand
        self.chrom_start = int(start) + 1
        self.chrom_end = int(end)
        self.cds_start = int(cds_start) + 1
        self.cds_end = int(cds_end)

        self.ex_count = int(ex_count)
        self.ex_start = self._parse_positions(ex_start, 1)
        self.ex_end = self._parse_positions(ex_end, 0)
        self.name = hugo

    @staticmethod
    def _parse_positions(field, offset):
        """
        Turn a comma-separated string of integers into a list of ints,
        adding ``offset`` to each value.

        A single trailing comma is dropped first, because splitting a
        value such as "1,2," would otherwise leave a '' element.  An
        empty (or whitespace-only) field returns [] instead of failing
        on int('').  Any other empty element, e.g. in "1,,2", still
        raises ValueError.
        """
        text = field.strip()
        if text.endswith(','):
            text = text[:-1]
        if not text:
            return []
        return [int(part) + offset for part in text.split(',')]

    def __repr__(self):
        """Return the gene name given as ``hugo`` to the constructor."""
        return self.name
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
50
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
51
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
class RefGene(CGData.CGObjectBase):
    """
    Index of GeneInfo records read from a tab-separated ref gene table.

    Two dictionaries are maintained:
      hugo_map  -- gene name (the COL_HUGO column) -> list of GeneInfo
      chrom_map -- chromosome -> list of GeneInfo; after read() each
                   list is ordered by ascending chrom_start

    The accessors call self.load() when self.loaded is false.  Neither
    is defined in this class -- presumably CGData.CGObjectBase provides
    them; confirm there.
    """

    def __init__(self):
        CGData.CGObjectBase.__init__(self)
        self.hugo_map = {}
        self.chrom_map = {}

    def read(self, handle):
        """
        Rebuild both indexes from ``handle``, an iterable of
        tab-separated lines, then mark the object as loaded.

        Blank rows and rows whose first cell starts with '#' (header or
        comment lines) are skipped.  Any other row must contain every
        COL_* column; a short row raises IndexError and a non-numeric
        coordinate raises ValueError.
        """
        reader = csv.reader(handle, delimiter="\t")

        self.hugo_map = {}
        for row in reader:
            # csv yields [] for an empty line; also skip rows made only
            # of whitespace and '#'-prefixed header/comment rows rather
            # than crashing on them.
            if not any(cell.strip() for cell in row) or row[0].startswith('#'):
                continue
            gene = GeneInfo(
                row[COL_CHROM],
                row[COL_STRAND],
                row[COL_START],
                row[COL_END],
                row[COL_CDSSTART],
                row[COL_CDSEND],
                row[COL_EXCOUNT],
                row[COL_EXSTART],
                row[COL_EXEND],
                row[COL_HUGO])
            self.hugo_map.setdefault(row[COL_HUGO], []).append(gene)

        self.chrom_map = {}
        for genes in self.hugo_map.values():
            for gene in genes:
                self.chrom_map.setdefault(gene.chrom, []).append(gene)

        # A key function gives the same ascending, stable ordering as the
        # former cmp-style comparator, and is accepted by Python 3 too
        # (list.sort() no longer takes a positional cmp function there).
        for genes in self.chrom_map.values():
            genes.sort(key=lambda gene: gene.chrom_start)
        self.loaded = True

    def add(self, gene):
        """
        Register one GeneInfo-like object (needs ``chrom`` and ``name``)
        in both indexes.

        The gene is appended: the chromosome list is not re-sorted and
        ``loaded`` is left untouched.
        """
        self.chrom_map.setdefault(gene.chrom, []).append(gene)
        self.hugo_map.setdefault(gene.name, []).append(gene)

    def get_chrom_list(self):
        """Return the chromosome names present in chrom_map, as a list."""
        if not self.loaded:
            self.load()
        return list(self.chrom_map.keys())

    def has_chrom(self, chrom):
        """Return True when ``chrom`` has at least one indexed gene."""
        if not self.loaded:
            self.load()
        return chrom in self.chrom_map

    def get_chrom(self, chrom):
        """
        Return the list of genes indexed under ``chrom``.

        Raises KeyError for an unknown chromosome.
        """
        if not self.loaded:
            self.load()
        return self.chrom_map[chrom]

    def get_gene_list(self):
        """Return the gene names present in hugo_map, as a list."""
        if not self.loaded:
            self.load()
        return list(self.hugo_map.keys())

    def get_gene(self, gene):
        """
        Return the list of GeneInfo records stored under the name
        ``gene``.

        Raises KeyError for an unknown name.
        """
        if not self.loaded:
            self.load()
        return self.hugo_map[gene]