Mercurial > repos > melissacline > ucsc_cancer_utilities
comparison seg2matrix/CGData/GeneMap.py @ 31:ab20c0d04f4a
add seg2matrix tool
| author | jingchunzhu |
|---|---|
| date | Fri, 24 Jul 2015 13:10:11 -0700 |
| parents | |
| children | 3e5680fecd7a |
comparison
equal
deleted
inserted
replaced
| 30:7a7a52e9b019 | 31:ab20c0d04f4a |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 | |
| 3 import csv | |
| 4 import sys | |
| 5 import re | |
| 6 | |
| 7 import CGData.ProbeMap | |
| 8 import CGData.GenomicMatrix | |
| 9 | |
| 10 class ProbeMapper(object): | |
| 11 """ | |
| 12 Class to map the probes. Expects handle to the refGene_hg18.table file | |
| 13 """ | |
| 14 | |
| 15 def __init__(self, mode='g'): | |
| 16 self.cmp_func = optionMap[mode] | |
| 17 | |
| 18 def find_overlap(self, segment, ref_gene, cmp_func=None): | |
| 19 """ | |
| 20 Function to find overlaps for a given probe description. | |
| 21 the cmp_func arg is a function that returns a 'True' or 'False' for | |
| 22 a given probe description and a gene, examples include 'gene_overlap' | |
| 23 and 'gene_simple_meth_overlap' | |
| 24 """ | |
| 25 if cmp_func is None: | |
| 26 cmp_func = self.cmp_func | |
| 27 if not ref_gene.has_chrom(segment.chrom): | |
| 28 return [] | |
| 29 chromList = ref_gene.get_chrom(segment.chrom) | |
| 30 | |
| 31 out = [] | |
| 32 for gene in chromList: | |
| 33 if cmp_func(segment.chrom_start, | |
| 34 segment.chrom_end, segment.strand, gene): | |
| 35 out.append(gene) | |
| 36 return out | |
| 37 | |
| 38 | |
| 39 # | |
| 40 # The set of functions that can be used to do comparisons | |
| 41 # | |
| 42 | |
| 43 | |
| 44 def block_both_strand(start, end, strand, gene): | |
| 45 """ | |
| 46 Check is segment is between gene start and end, either strand | |
| 47 | |
| 48 **Code 'b'** | |
| 49 """ | |
| 50 if gene.chrom_end >= start and gene.chrom_start <= end: | |
| 51 return True | |
| 52 return False | |
| 53 | |
| 54 def block_same_strand(start, end, strand, gene): | |
| 55 """ | |
| 56 Check is segment is on same strand, between gene start and end | |
| 57 | |
| 58 **Code 's'** | |
| 59 """ | |
| 60 if gene.chrom_end >= start and gene.chrom_start <= end and strand == gene.strand: | |
| 61 return True | |
| 62 return False | |
| 63 | |
| 64 | |
| 65 def exon_same_strand(start, end, strand, gene): | |
| 66 """ | |
| 67 Check is segment is on same strand, and occurs on an exon | |
| 68 | |
| 69 **Code 'g'** | |
| 70 """ | |
| 71 | |
| 72 if gene.strand != strand: | |
| 73 return False | |
| 74 for i in range(int(gene.ex_count)): | |
| 75 if gene.ex_end[i] >= start and gene.ex_start[i] <= end: | |
| 76 return True | |
| 77 return False | |
| 78 | |
| 79 | |
| 80 def exon_both_strand(start, end, strand, gene): | |
| 81 """ | |
| 82 Check is segment occurs on an exon, on either stand | |
| 83 | |
| 84 **Code 'e'** | |
| 85 """ | |
| 86 for i in range(int(gene.ex_count)): | |
| 87 if gene.ex_end[i] >= start and gene.ex_start[i] <= end: | |
| 88 return True | |
| 89 return False | |
| 90 | |
| 91 def block_same_strand_coverage75(start, end, strand, gene): | |
| 92 if gene.chrom_end >= start and gene.chrom_start <= end and strand == gene.strand: | |
| 93 cov = min(gene.chrom_end,end) - max(gene.chrom_start, start) | |
| 94 if float(cov) / float(end - start) > 0.75: | |
| 95 return True | |
| 96 return False | |
| 97 | |
| 98 | |
| 99 def exon_same_strand_coverage75(start, end, strand, gene): | |
| 100 if strand != gene.strand: | |
| 101 return False | |
| 102 cov = 0 | |
| 103 for i in range(int(gene.ex_count)): | |
| 104 if gene.ex_end[i] >= start and gene.ex_start[i] <= end: | |
| 105 cov += min(gene.ex_end[i],end) - max(gene.ex_start[i], start) | |
| 106 if float(cov) / float(end - start) > 0.75: | |
| 107 return True | |
| 108 return False | |
| 109 | |
| 110 | |
| 111 class Intersector: | |
| 112 def __init__(self, same_strand=True, exons=False, coverage=None, start_rel_cdsStart=None, end_rel_cdsStart=None, start_rel_tss=None, end_rel_tss=None): | |
| 113 self.same_strand = same_strand | |
| 114 self.exons = exons | |
| 115 self.coverage = coverage | |
| 116 self.start_rel_tss = start_rel_tss | |
| 117 self.end_rel_tss = end_rel_tss | |
| 118 self.start_rel_cdsStart = start_rel_cdsStart | |
| 119 self.end_rel_cdsStart = end_rel_cdsStart | |
| 120 | |
| 121 def hit(self, start, end, strand, gene): | |
| 122 if self.same_strand and strand != gene.strand: | |
| 123 return False | |
| 124 | |
| 125 if self.exons: | |
| 126 cov = 0 | |
| 127 for i in range(int(gene.ex_count)): | |
| 128 if gene.ex_end[i] >= start and gene.ex_start[i] <= end: | |
| 129 cov += min(gene.ex_end[i],end) - max(gene.ex_start[i], start) | |
| 130 if self.coverage is None and cov > 0: | |
| 131 return True | |
| 132 else: | |
| 133 if float(cov) / float(end - start) > self.coverage: | |
| 134 return True | |
| 135 else: | |
| 136 wStart = gene.chrom_start | |
| 137 wEnd = gene.chrom_end | |
| 138 | |
| 139 if self.start_rel_cdsStart is not None: | |
| 140 if gene.strand == "+": | |
| 141 wStart = gene.cds_start + self.start_rel_cdsStart | |
| 142 if gene.strand == "-": | |
| 143 wStart = gene.cds_end - self.start_rel_cdsStart | |
| 144 if self.end_rel_cdsStart is not None: | |
| 145 if gene.strand == "+": | |
| 146 wEnd = gene.cds_start + self.end_rel_cdsStart | |
| 147 if gene.strand == "-": | |
| 148 wEnd = gene.cds_end - self.end_rel_cdsStart | |
| 149 | |
| 150 if self.start_rel_tss is not None: | |
| 151 if gene.strand == "+": | |
| 152 wStart = gene.chrom_start + self.start_rel_tss | |
| 153 if gene.strand == "-": | |
| 154 wStart = gene.chrom_end - self.start_rel_tss | |
| 155 if self.end_rel_tss is not None: | |
| 156 if gene.strand == "+": | |
| 157 wEnd = gene.chrom_start + self.end_rel_tss | |
| 158 if gene.strand == "-": | |
| 159 wEnd = gene.chrom_end - self.end_rel_tss | |
| 160 | |
| 161 cstart = min(wEnd, wStart) | |
| 162 cend = max(wEnd, wStart) | |
| 163 if cend >= start and cstart <= end: | |
| 164 cov = min(gene.chrom_end,end) - max(gene.chrom_start, start) | |
| 165 if self.coverage is None and cov > 0: | |
| 166 return True | |
| 167 else: | |
| 168 if float(cov) / float(end - start) > 0.75: | |
| 169 return True | |
| 170 | |
| 171 return False | |
| 172 | |
| 173 | |
| 174 | |
| 175 ###ADD MORE FUNCTIONS HERE | |
| 176 | |
| 177 | |
| 178 #### | |
| 179 | |
| 180 ###To add options to the command line, map the option character to a function | |
| 181 ###for example '-m' maps to gene_simple_meth_overlap | |
| 182 | |
| 183 optionMap = { | |
| 184 "b": block_both_strand, | |
| 185 "s": block_same_strand, | |
| 186 "g": exon_same_strand, | |
| 187 "e": exon_both_strand | |
| 188 } | |
| 189 | |
| 190 | |
| 191 | |
| 192 def genomicSegment2Matrix(genomicSegment, refGene, probeMapper): | |
| 193 """ | |
| 194 Take a genomicSegment map, compare it against a refGene table, | |
| 195 and contruct a genomicMatrix | |
| 196 """ | |
| 197 out = CGData.GenomicMatrix.GenomicMatrix() | |
| 198 out.init_blank( rows=refGene.get_gene_list(), cols=genomicSegment.get_key_list() ) | |
| 199 for id in genomicSegment.get_key_list(): | |
| 200 for segment in genomicSegment.get_by(id): | |
| 201 for hit in probeMapper.find_overlap( segment, refGene ): | |
| 202 out.set_val(row_name=hit.name, col_name=segment.id, value=segment.value ) | |
| 203 return out | |
| 204 | |
| 205 | |
| 206 def filter_longest_form(refgene): | |
| 207 """ | |
| 208 take a refgene table and filter multiple gene isoforms down to the longest | |
| 209 """ | |
| 210 ng = CGData.RefGene.RefGene() | |
| 211 for g in refgene.get_gene_list(): | |
| 212 longest = None | |
| 213 length = 0 | |
| 214 for elem in refgene.get_gene(g): | |
| 215 newLength = elem.chrom_end - elem.chrom_start | |
| 216 if newLength > length: | |
| 217 length = newLength | |
| 218 longest = elem | |
| 219 ng.add(longest) | |
| 220 ng.loaded = True | |
| 221 return ng | |
| 222 | |
| 223 | |
| 224 def genomicSegment2MatrixNorm(genomicSegment, refGene, probeMapper): | |
| 225 """ | |
| 226 Given | |
| 227 """ | |
| 228 ng = filter_longest_form(refGene) | |
| 229 #enumerate the col order of the sample ids | |
| 230 idList = genomicSegment.get_key_list() | |
| 231 | |
| 232 geneList = ng.get_gene_list() | |
| 233 | |
| 234 tmp = CGData.GenomicMatrix.GenomicMatrix() | |
| 235 tmp.init_blank( rows=geneList, cols=idList ) | |
| 236 geneHits = {} | |
| 237 #read through the segment one sample id at a time | |
| 238 for id in idList: | |
| 239 segmentMap = {} | |
| 240 for segment in genomicSegment.get_by(id): | |
| 241 for hit in probeMapper.find_overlap( segment, ng ): | |
| 242 span = float(min(segment.chrom_end, hit.chrom_end) - max(segment.chrom_start, hit.chrom_start)) / float(hit.chrom_end - hit.chrom_start) | |
| 243 #if hit.name not in segmentMap: | |
| 244 # segmentMap[hit.name] = [] | |
| 245 try: | |
| 246 segmentMap[hit.name].append( | |
| 247 ( span, segment.value ) | |
| 248 ) | |
| 249 except KeyError: | |
| 250 segmentMap[hit.name] = [ | |
| 251 ( span, segment.value ) | |
| 252 ] | |
| 253 | |
| 254 for gene in segmentMap: | |
| 255 geneHits[gene] = True | |
| 256 mapInfo = segmentMap[gene] | |
| 257 coverage = sum( i[0] for i in mapInfo ) | |
| 258 assert coverage <= 1.0 | |
| 259 value = sum( i[0]*i[1] for i in mapInfo ) | |
| 260 #print coverage, value, value/coverage, segmentMap[gene] | |
| 261 tmp.set_val(row_name=gene, col_name=id, value=value/coverage) | |
| 262 | |
| 263 #now remove the blanks | |
| 264 out = CGData.GenomicMatrix.GenomicMatrix() | |
| 265 out.init_blank( rows=geneHits, cols=idList ) | |
| 266 for gene in geneHits: | |
| 267 for sample in idList: | |
| 268 out.set_val( row_name=gene, col_name=sample, value=tmp.get_val(row_name=gene, col_name=sample)) | |
| 269 return out | |
| 270 | |
| 271 | |
| 272 | |
| 273 def aliasRemap(genomicMatrix, aliasMap): | |
| 274 """ | |
| 275 Given a genomicMatrix and an alias map, create a new genomic matrix | |
| 276 with the probes from the original matrix remapped to the connected aliases | |
| 277 from the map | |
| 278 """ | |
| 279 | |
| 280 am = {} | |
| 281 for probe in aliasMap.get_key_list(): | |
| 282 for alias in aliasMap.get_by(probe): | |
| 283 if alias not in am: | |
| 284 am[alias.alias] = {} | |
| 285 am[alias.alias][probe] = True | |
| 286 | |
| 287 out = CGData.GenomicMatrix.GenomicMatrix() | |
| 288 out.init_blank( rows=am.keys(), cols=genomicMatrix.get_col_list() ) | |
| 289 probeMap = genomicMatrix.get_row_map() | |
| 290 for a in am: | |
| 291 for sample in genomicMatrix.get_col_list(): | |
| 292 o = [] | |
| 293 for p in am[a]: | |
| 294 if p in probeMap: | |
| 295 o.append( genomicMatrix.get_val( col_name=sample, row_name=p) ) | |
| 296 if len(o): | |
| 297 out.set_val(col_name=sample, row_name=a, value=sum(o) / float(len(o))) | |
| 298 | |
| 299 return out | |
| 300 | |
| 301 def refGeneLink2ProbeLoc(aliasMap, refGene): | |
| 302 """ | |
| 303 given an alias map, and a refGene produce a probeMap by connecting | |
| 304 alias symbols. Returns the coordinates of the longest form | |
| 305 """ | |
| 306 out = CGData.ProbeLoc.ProbeLoc() | |
| 307 out.init_blank() | |
| 308 | |
| 309 for probe in aliasMap.get_key_list(): | |
| 310 for link in aliasMap.get_by(probe): | |
| 311 probe = link.probe | |
| 312 geneName = link.alias | |
| 313 sGene = None | |
| 314 try: | |
| 315 for gene in refGene.get_gene(geneName): | |
| 316 if sGene is None or gene.chrom_end - gene.chrom_start > sGene.chrom_end - sGene.chrom_start: | |
| 317 sGene = gene | |
| 318 except KeyError: | |
| 319 pass | |
| 320 if sGene is not None: | |
| 321 out.insert(probe, { 'probe' : probe, 'chrom' : sGene.chrom, 'strand' : sGene.strand, 'chrom_start' : sGene.chrom_start, 'chrom_end' : sGene.chrom_end }) | |
| 322 return out | |
| 323 | |
| 324 | |
| 325 | |
| 326 | |
| 327 |
