| 31 | 1 #!/usr/bin/env python | 
|  | 2 | 
|  | 3 import csv | 
|  | 4 import sys | 
|  | 5 import re | 
|  | 6 | 
|  | 7 import CGData.ProbeMap | 
|  | 8 import CGData.GenomicMatrix | 
|  | 9 | 
|  | 10 class ProbeMapper(object): | 
|  | 11     """ | 
|  | 12     Class to map the probes. Expects handle to the refGene_hg18.table file | 
|  | 13     """ | 
|  | 14 | 
|  | 15     def __init__(self, mode='g'): | 
|  | 16         self.cmp_func = optionMap[mode] | 
|  | 17 | 
|  | 18     def find_overlap(self, segment, ref_gene, cmp_func=None): | 
|  | 19         """ | 
|  | 20         Function to find overlaps for a given probe description. | 
|  | 21         the cmp_func arg is a function that returns a 'True' or 'False' for | 
|  | 22         a given probe description and a gene, examples include 'gene_overlap' | 
|  | 23         and 'gene_simple_meth_overlap' | 
|  | 24         """ | 
|  | 25         if cmp_func is None: | 
|  | 26             cmp_func = self.cmp_func | 
| 58 | 27 | 
| 31 | 28         if not ref_gene.has_chrom(segment.chrom): | 
|  | 29             return [] | 
|  | 30         chromList = ref_gene.get_chrom(segment.chrom) | 
|  | 31 | 
|  | 32         out = [] | 
|  | 33         for gene in chromList: | 
|  | 34             if cmp_func(segment.chrom_start, | 
|  | 35                         segment.chrom_end, segment.strand, gene): | 
|  | 36                 out.append(gene) | 
|  | 37         return out | 
|  | 38 | 
|  | 39 | 
|  | 40 # | 
|  | 41 # The set of functions that can be used to do comparisons | 
|  | 42 # | 
|  | 43 | 
|  | 44 | 
|  | 45 def block_both_strand(start, end, strand, gene): | 
|  | 46     """ | 
|  | 47     Check is segment is between gene start and end, either strand | 
|  | 48 | 
|  | 49     **Code 'b'** | 
|  | 50     """ | 
|  | 51     if gene.chrom_end >= start and gene.chrom_start <= end: | 
|  | 52         return True | 
|  | 53     return False | 
|  | 54 | 
|  | 55 def block_same_strand(start, end, strand, gene): | 
|  | 56     """ | 
|  | 57     Check is segment is on same strand, between gene start and end | 
|  | 58 | 
|  | 59     **Code 's'** | 
|  | 60     """ | 
|  | 61     if gene.chrom_end >= start and gene.chrom_start <= end and strand == gene.strand: | 
|  | 62         return True | 
|  | 63     return False | 
|  | 64 | 
|  | 65 | 
|  | 66 def exon_same_strand(start, end, strand, gene): | 
|  | 67     """ | 
|  | 68     Check is segment is on same strand, and occurs on an exon | 
|  | 69 | 
|  | 70     **Code 'g'** | 
|  | 71     """ | 
|  | 72 | 
|  | 73     if gene.strand != strand: | 
|  | 74         return False | 
|  | 75     for i in range(int(gene.ex_count)): | 
|  | 76         if gene.ex_end[i] >= start and gene.ex_start[i] <= end: | 
|  | 77             return True | 
|  | 78     return False | 
|  | 79 | 
|  | 80 | 
|  | 81 def exon_both_strand(start, end, strand, gene): | 
|  | 82     """ | 
|  | 83     Check is segment occurs on an exon, on either stand | 
|  | 84 | 
|  | 85     **Code 'e'** | 
|  | 86     """ | 
|  | 87     for i in range(int(gene.ex_count)): | 
|  | 88         if gene.ex_end[i] >= start and gene.ex_start[i] <= end: | 
|  | 89             return True | 
|  | 90     return False | 
|  | 91 | 
|  | 92 def block_same_strand_coverage75(start, end, strand, gene): | 
|  | 93     if gene.chrom_end >= start and gene.chrom_start <= end and strand == gene.strand: | 
|  | 94         cov = min(gene.chrom_end,end) - max(gene.chrom_start, start) | 
|  | 95         if float(cov) / float(end - start) > 0.75: | 
|  | 96             return True | 
|  | 97     return False | 
|  | 98 | 
|  | 99 | 
|  | 100 def exon_same_strand_coverage75(start, end, strand, gene): | 
|  | 101     if strand != gene.strand: | 
|  | 102         return False | 
|  | 103     cov = 0 | 
|  | 104     for i in range(int(gene.ex_count)): | 
|  | 105         if gene.ex_end[i] >= start and gene.ex_start[i] <= end: | 
|  | 106             cov += min(gene.ex_end[i],end) - max(gene.ex_start[i], start) | 
|  | 107     if float(cov) / float(end - start) > 0.75: | 
|  | 108         return True | 
|  | 109     return False | 
|  | 110 | 
|  | 111 | 
|  | 112 class Intersector: | 
|  | 113     def __init__(self, same_strand=True, exons=False, coverage=None, start_rel_cdsStart=None, end_rel_cdsStart=None, start_rel_tss=None, end_rel_tss=None): | 
|  | 114         self.same_strand = same_strand | 
|  | 115         self.exons = exons | 
|  | 116         self.coverage = coverage | 
|  | 117         self.start_rel_tss = start_rel_tss | 
|  | 118         self.end_rel_tss = end_rel_tss | 
|  | 119         self.start_rel_cdsStart = start_rel_cdsStart | 
|  | 120         self.end_rel_cdsStart = end_rel_cdsStart | 
|  | 121 | 
|  | 122     def hit(self, start, end, strand, gene): | 
|  | 123         if self.same_strand and strand != gene.strand: | 
|  | 124             return False | 
|  | 125 | 
|  | 126         if self.exons: | 
|  | 127             cov = 0 | 
|  | 128             for i in range(int(gene.ex_count)): | 
|  | 129                 if gene.ex_end[i] >= start and gene.ex_start[i] <= end: | 
|  | 130                     cov += min(gene.ex_end[i],end) - max(gene.ex_start[i], start) | 
|  | 131             if self.coverage is None and cov > 0: | 
|  | 132                 return True | 
|  | 133             else: | 
|  | 134                 if float(cov) / float(end - start) > self.coverage: | 
|  | 135                     return True | 
|  | 136         else: | 
|  | 137                 wStart = gene.chrom_start | 
|  | 138                 wEnd   = gene.chrom_end | 
|  | 139 | 
|  | 140                 if self.start_rel_cdsStart is not None: | 
|  | 141                     if gene.strand == "+": | 
|  | 142                         wStart = gene.cds_start + self.start_rel_cdsStart | 
|  | 143                     if gene.strand == "-": | 
|  | 144                         wStart = gene.cds_end - self.start_rel_cdsStart | 
|  | 145                 if self.end_rel_cdsStart is not None: | 
|  | 146                     if gene.strand == "+": | 
|  | 147                         wEnd = gene.cds_start + self.end_rel_cdsStart | 
|  | 148                     if gene.strand == "-": | 
|  | 149                         wEnd = gene.cds_end - self.end_rel_cdsStart | 
|  | 150 | 
|  | 151                 if self.start_rel_tss is not None: | 
|  | 152                     if gene.strand == "+": | 
|  | 153                         wStart = gene.chrom_start + self.start_rel_tss | 
|  | 154                     if gene.strand == "-": | 
|  | 155                         wStart = gene.chrom_end - self.start_rel_tss | 
|  | 156                 if self.end_rel_tss is not None: | 
|  | 157                     if gene.strand == "+": | 
|  | 158                         wEnd = gene.chrom_start + self.end_rel_tss | 
|  | 159                     if gene.strand == "-": | 
|  | 160                         wEnd = gene.chrom_end - self.end_rel_tss | 
|  | 161 | 
|  | 162                 cstart = min(wEnd, wStart) | 
|  | 163                 cend = max(wEnd, wStart) | 
|  | 164                 if cend >= start and cstart <= end: | 
|  | 165                     cov = min(gene.chrom_end,end) - max(gene.chrom_start, start) | 
|  | 166                     if self.coverage is None and cov > 0: | 
|  | 167                         return True | 
|  | 168                     else: | 
|  | 169                         if float(cov) / float(end - start) > 0.75: | 
|  | 170                             return True | 
|  | 171 | 
|  | 172         return False | 
|  | 173 | 
|  | 174 | 
|  | 175 | 
|  | 176 ###ADD MORE FUNCTIONS HERE | 
|  | 177 | 
|  | 178 | 
|  | 179 #### | 
|  | 180 | 
|  | 181 ###To add options to the command line, map the option character to a function | 
|  | 182 ###for example '-m' maps to gene_simple_meth_overlap | 
|  | 183 | 
|  | 184 optionMap = { | 
|  | 185     "b": block_both_strand, | 
|  | 186     "s": block_same_strand, | 
|  | 187     "g": exon_same_strand, | 
|  | 188     "e": exon_both_strand | 
|  | 189 } | 
|  | 190 | 
|  | 191 | 
|  | 192 | 
|  | 193 def genomicSegment2Matrix(genomicSegment, refGene, probeMapper): | 
|  | 194     """ | 
|  | 195     Take a genomicSegment map, compare it against a refGene table, | 
|  | 196     and contruct a genomicMatrix | 
|  | 197     """ | 
|  | 198     out = CGData.GenomicMatrix.GenomicMatrix() | 
|  | 199     out.init_blank( rows=refGene.get_gene_list(), cols=genomicSegment.get_key_list() ) | 
|  | 200     for id in genomicSegment.get_key_list(): | 
|  | 201         for segment in genomicSegment.get_by(id): | 
|  | 202             for hit in probeMapper.find_overlap( segment, refGene ): | 
|  | 203                 out.set_val(row_name=hit.name, col_name=segment.id, value=segment.value ) | 
|  | 204     return out | 
|  | 205 | 
|  | 206 | 
|  | 207 def filter_longest_form(refgene): | 
|  | 208     """ | 
|  | 209     take a refgene table and filter multiple gene isoforms down to the longest | 
|  | 210     """ | 
|  | 211     ng = CGData.RefGene.RefGene() | 
|  | 212     for g in refgene.get_gene_list(): | 
|  | 213         longest = None | 
|  | 214         length = 0 | 
|  | 215         for elem in refgene.get_gene(g): | 
|  | 216             newLength = elem.chrom_end - elem.chrom_start | 
|  | 217             if newLength > length: | 
|  | 218                 length = newLength | 
|  | 219                 longest = elem | 
|  | 220         ng.add(longest) | 
|  | 221     ng.loaded = True | 
|  | 222     return ng | 
|  | 223 | 
|  | 224 | 
|  | 225 def genomicSegment2MatrixNorm(genomicSegment, refGene, probeMapper): | 
|  | 226     """ | 
|  | 227     Given | 
|  | 228     """ | 
|  | 229     ng = filter_longest_form(refGene) | 
|  | 230     #enumerate the col order of the sample ids | 
|  | 231     idList = genomicSegment.get_key_list() | 
|  | 232 | 
|  | 233     geneList = ng.get_gene_list() | 
|  | 234 | 
|  | 235     tmp = CGData.GenomicMatrix.GenomicMatrix() | 
|  | 236     tmp.init_blank( rows=geneList, cols=idList ) | 
|  | 237     geneHits = {} | 
|  | 238     #read through the segment one sample id at a time | 
|  | 239     for id in idList: | 
|  | 240         segmentMap = {} | 
|  | 241         for segment in genomicSegment.get_by(id): | 
|  | 242             for hit in probeMapper.find_overlap( segment, ng ): | 
|  | 243                 span = float(min(segment.chrom_end, hit.chrom_end) - max(segment.chrom_start, hit.chrom_start)) / float(hit.chrom_end - hit.chrom_start) | 
|  | 244                 #if hit.name not in segmentMap: | 
|  | 245                 #    segmentMap[hit.name] = [] | 
|  | 246                 try: | 
|  | 247                     segmentMap[hit.name].append( | 
|  | 248                         ( span, segment.value ) | 
|  | 249                     ) | 
|  | 250                 except KeyError: | 
|  | 251                     segmentMap[hit.name] = [ | 
|  | 252                         ( span, segment.value ) | 
|  | 253                     ] | 
|  | 254 | 
|  | 255         for gene in segmentMap: | 
|  | 256             geneHits[gene] = True | 
|  | 257             mapInfo = segmentMap[gene] | 
|  | 258             coverage = sum( i[0] for i in mapInfo ) | 
|  | 259             assert coverage <= 1.0 | 
|  | 260             value = sum( i[0]*i[1] for i in mapInfo ) | 
|  | 261             #print coverage, value, value/coverage, segmentMap[gene] | 
|  | 262             tmp.set_val(row_name=gene, col_name=id, value=value/coverage) | 
|  | 263 | 
|  | 264     #now remove the blanks | 
|  | 265     out = CGData.GenomicMatrix.GenomicMatrix() | 
|  | 266     out.init_blank( rows=geneHits, cols=idList ) | 
|  | 267     for gene in geneHits: | 
|  | 268         for sample in idList: | 
|  | 269             out.set_val( row_name=gene, col_name=sample, value=tmp.get_val(row_name=gene, col_name=sample)) | 
|  | 270     return out | 
|  | 271 | 
|  | 272 | 
|  | 273 | 
|  | 274 def aliasRemap(genomicMatrix, aliasMap): | 
|  | 275     """ | 
|  | 276     Given a genomicMatrix and an alias map, create a new genomic matrix | 
|  | 277     with the probes from the original matrix remapped to the connected aliases | 
|  | 278     from the map | 
|  | 279     """ | 
|  | 280 | 
|  | 281     am = {} | 
|  | 282     for probe in aliasMap.get_key_list(): | 
|  | 283         for alias in aliasMap.get_by(probe): | 
|  | 284             if alias not in am: | 
|  | 285                 am[alias.alias] = {} | 
|  | 286             am[alias.alias][probe] = True | 
|  | 287 | 
|  | 288     out = CGData.GenomicMatrix.GenomicMatrix() | 
|  | 289     out.init_blank( rows=am.keys(), cols=genomicMatrix.get_col_list() ) | 
|  | 290     probeMap = genomicMatrix.get_row_map() | 
|  | 291     for a in am: | 
|  | 292         for sample in genomicMatrix.get_col_list(): | 
|  | 293             o = [] | 
|  | 294             for p in am[a]: | 
|  | 295                 if p in probeMap: | 
|  | 296                     o.append( genomicMatrix.get_val( col_name=sample, row_name=p) ) | 
|  | 297             if len(o): | 
|  | 298                 out.set_val(col_name=sample, row_name=a, value=sum(o) / float(len(o))) | 
|  | 299 | 
|  | 300     return out | 
|  | 301 | 
|  | 302 def refGeneLink2ProbeLoc(aliasMap, refGene): | 
|  | 303     """ | 
|  | 304     given an alias map, and a refGene produce a probeMap by connecting | 
|  | 305     alias symbols. Returns the coordinates of the longest form | 
|  | 306     """ | 
|  | 307     out = CGData.ProbeLoc.ProbeLoc() | 
|  | 308     out.init_blank() | 
|  | 309 | 
|  | 310     for probe in aliasMap.get_key_list(): | 
|  | 311         for link in aliasMap.get_by(probe): | 
|  | 312             probe = link.probe | 
|  | 313             geneName = link.alias | 
|  | 314             sGene = None | 
|  | 315             try: | 
|  | 316                 for gene in refGene.get_gene(geneName): | 
|  | 317                     if sGene is None or gene.chrom_end - gene.chrom_start > sGene.chrom_end - sGene.chrom_start: | 
|  | 318                         sGene = gene | 
|  | 319             except KeyError: | 
|  | 320                 pass | 
|  | 321         if sGene is not None: | 
|  | 322             out.insert(probe, { 'probe' : probe, 'chrom' : sGene.chrom, 'strand' : sGene.strand, 'chrom_start' : sGene.chrom_start, 'chrom_end' : sGene.chrom_end }) | 
|  | 323     return out | 
|  | 324 | 
|  | 325 | 
|  | 326 | 
|  | 327 | 
|  | 328 |