Mercurial > repos > melissacline > ucsc_cancer_utilities
diff seg2matrix/CGData/GeneMap.py @ 31:ab20c0d04f4a
add seg2matrix tool
author | jingchunzhu |
---|---|
date | Fri, 24 Jul 2015 13:10:11 -0700 |
parents | |
children | 3e5680fecd7a |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/seg2matrix/CGData/GeneMap.py Fri Jul 24 13:10:11 2015 -0700 @@ -0,0 +1,327 @@ +#!/usr/bin/env python + +import csv +import sys +import re + +import CGData.ProbeMap +import CGData.GenomicMatrix + +class ProbeMapper(object): + """ + Class to map the probes. Expects handle to the refGene_hg18.table file + """ + + def __init__(self, mode='g'): + self.cmp_func = optionMap[mode] + + def find_overlap(self, segment, ref_gene, cmp_func=None): + """ + Function to find overlaps for a given probe description. + the cmp_func arg is a function that returns a 'True' or 'False' for + a given probe description and a gene, examples include 'gene_overlap' + and 'gene_simple_meth_overlap' + """ + if cmp_func is None: + cmp_func = self.cmp_func + if not ref_gene.has_chrom(segment.chrom): + return [] + chromList = ref_gene.get_chrom(segment.chrom) + + out = [] + for gene in chromList: + if cmp_func(segment.chrom_start, + segment.chrom_end, segment.strand, gene): + out.append(gene) + return out + + +# +# The set of functions that can be used to do comparisons +# + + +def block_both_strand(start, end, strand, gene): + """ + Check is segment is between gene start and end, either strand + + **Code 'b'** + """ + if gene.chrom_end >= start and gene.chrom_start <= end: + return True + return False + +def block_same_strand(start, end, strand, gene): + """ + Check is segment is on same strand, between gene start and end + + **Code 's'** + """ + if gene.chrom_end >= start and gene.chrom_start <= end and strand == gene.strand: + return True + return False + + +def exon_same_strand(start, end, strand, gene): + """ + Check is segment is on same strand, and occurs on an exon + + **Code 'g'** + """ + + if gene.strand != strand: + return False + for i in range(int(gene.ex_count)): + if gene.ex_end[i] >= start and gene.ex_start[i] <= end: + return True + return False + + +def exon_both_strand(start, end, strand, gene): + """ + Check is segment occurs on an exon, on either stand + + **Code 'e'** + """ + for i in range(int(gene.ex_count)): + if gene.ex_end[i] >= start and gene.ex_start[i] <= end: + return True + return False + +def block_same_strand_coverage75(start, end, strand, gene): + if gene.chrom_end >= start and gene.chrom_start <= end and strand == gene.strand: + cov = min(gene.chrom_end,end) - max(gene.chrom_start, start) + if float(cov) / float(end - start) > 0.75: + return True + return False + + +def exon_same_strand_coverage75(start, end, strand, gene): + if strand != gene.strand: + return False + cov = 0 + for i in range(int(gene.ex_count)): + if gene.ex_end[i] >= start and gene.ex_start[i] <= end: + cov += min(gene.ex_end[i],end) - max(gene.ex_start[i], start) + if float(cov) / float(end - start) > 0.75: + return True + return False + + +class Intersector: + def __init__(self, same_strand=True, exons=False, coverage=None, start_rel_cdsStart=None, end_rel_cdsStart=None, start_rel_tss=None, end_rel_tss=None): + self.same_strand = same_strand + self.exons = exons + self.coverage = coverage + self.start_rel_tss = start_rel_tss + self.end_rel_tss = end_rel_tss + self.start_rel_cdsStart = start_rel_cdsStart + self.end_rel_cdsStart = end_rel_cdsStart + + def hit(self, start, end, strand, gene): + if self.same_strand and strand != gene.strand: + return False + + if self.exons: + cov = 0 + for i in range(int(gene.ex_count)): + if gene.ex_end[i] >= start and gene.ex_start[i] <= end: + cov += min(gene.ex_end[i],end) - max(gene.ex_start[i], start) + if self.coverage is None and cov > 0: + return True + else: + if float(cov) / float(end - start) > self.coverage: + return True + else: + wStart = gene.chrom_start + wEnd = gene.chrom_end + + if self.start_rel_cdsStart is not None: + if gene.strand == "+": + wStart = gene.cds_start + self.start_rel_cdsStart + if gene.strand == "-": + wStart = gene.cds_end - self.start_rel_cdsStart + if self.end_rel_cdsStart is not None: + if gene.strand == "+": + wEnd = gene.cds_start + self.end_rel_cdsStart + if gene.strand == "-": + wEnd = gene.cds_end - self.end_rel_cdsStart + + if self.start_rel_tss is not None: + if gene.strand == "+": + wStart = gene.chrom_start + self.start_rel_tss + if gene.strand == "-": + wStart = gene.chrom_end - self.start_rel_tss + if self.end_rel_tss is not None: + if gene.strand == "+": + wEnd = gene.chrom_start + self.end_rel_tss + if gene.strand == "-": + wEnd = gene.chrom_end - self.end_rel_tss + + cstart = min(wEnd, wStart) + cend = max(wEnd, wStart) + if cend >= start and cstart <= end: + cov = min(gene.chrom_end,end) - max(gene.chrom_start, start) + if self.coverage is None and cov > 0: + return True + else: + if float(cov) / float(end - start) > 0.75: + return True + + return False + + + +###ADD MORE FUNCTIONS HERE + + +#### + +###To add options to the command line, map the option character to a function +###for example '-m' maps to gene_simple_meth_overlap + +optionMap = { + "b": block_both_strand, + "s": block_same_strand, + "g": exon_same_strand, + "e": exon_both_strand +} + + + +def genomicSegment2Matrix(genomicSegment, refGene, probeMapper): + """ + Take a genomicSegment map, compare it against a refGene table, + and contruct a genomicMatrix + """ + out = CGData.GenomicMatrix.GenomicMatrix() + out.init_blank( rows=refGene.get_gene_list(), cols=genomicSegment.get_key_list() ) + for id in genomicSegment.get_key_list(): + for segment in genomicSegment.get_by(id): + for hit in probeMapper.find_overlap( segment, refGene ): + out.set_val(row_name=hit.name, col_name=segment.id, value=segment.value ) + return out + + +def filter_longest_form(refgene): + """ + take a refgene table and filter multiple gene isoforms down to the longest + """ + ng = CGData.RefGene.RefGene() + for g in refgene.get_gene_list(): + longest = None + length = 0 + for elem in refgene.get_gene(g): + newLength = elem.chrom_end - elem.chrom_start + if newLength > length: + length = newLength + longest = elem + ng.add(longest) + ng.loaded = True + return ng + + +def genomicSegment2MatrixNorm(genomicSegment, refGene, probeMapper): + """ + Given + """ + ng = filter_longest_form(refGene) + #enumerate the col order of the sample ids + idList = genomicSegment.get_key_list() + + geneList = ng.get_gene_list() + + tmp = CGData.GenomicMatrix.GenomicMatrix() + tmp.init_blank( rows=geneList, cols=idList ) + geneHits = {} + #read through the segment one sample id at a time + for id in idList: + segmentMap = {} + for segment in genomicSegment.get_by(id): + for hit in probeMapper.find_overlap( segment, ng ): + span = float(min(segment.chrom_end, hit.chrom_end) - max(segment.chrom_start, hit.chrom_start)) / float(hit.chrom_end - hit.chrom_start) + #if hit.name not in segmentMap: + # segmentMap[hit.name] = [] + try: + segmentMap[hit.name].append( + ( span, segment.value ) + ) + except KeyError: + segmentMap[hit.name] = [ + ( span, segment.value ) + ] + + for gene in segmentMap: + geneHits[gene] = True + mapInfo = segmentMap[gene] + coverage = sum( i[0] for i in mapInfo ) + assert coverage <= 1.0 + value = sum( i[0]*i[1] for i in mapInfo ) + #print coverage, value, value/coverage, segmentMap[gene] + tmp.set_val(row_name=gene, col_name=id, value=value/coverage) + + #now remove the blanks + out = CGData.GenomicMatrix.GenomicMatrix() + out.init_blank( rows=geneHits, cols=idList ) + for gene in geneHits: + for sample in idList: + out.set_val( row_name=gene, col_name=sample, value=tmp.get_val(row_name=gene, col_name=sample)) + return out + + + +def aliasRemap(genomicMatrix, aliasMap): + """ + Given a genomicMatrix and an alias map, create a new genomic matrix + with the probes from the original matrix remapped to the connected aliases + from the map + """ + + am = {} + for probe in aliasMap.get_key_list(): + for alias in aliasMap.get_by(probe): + if alias not in am: + am[alias.alias] = {} + am[alias.alias][probe] = True + + out = CGData.GenomicMatrix.GenomicMatrix() + out.init_blank( rows=am.keys(), cols=genomicMatrix.get_col_list() ) + probeMap = genomicMatrix.get_row_map() + for a in am: + for sample in genomicMatrix.get_col_list(): + o = [] + for p in am[a]: + if p in probeMap: + o.append( genomicMatrix.get_val( col_name=sample, row_name=p) ) + if len(o): + out.set_val(col_name=sample, row_name=a, value=sum(o) / float(len(o))) + + return out + +def refGeneLink2ProbeLoc(aliasMap, refGene): + """ + given an alias map, and a refGene produce a probeMap by connecting + alias symbols. Returns the coordinates of the longest form + """ + out = CGData.ProbeLoc.ProbeLoc() + out.init_blank() + + for probe in aliasMap.get_key_list(): + for link in aliasMap.get_by(probe): + probe = link.probe + geneName = link.alias + sGene = None + try: + for gene in refGene.get_gene(geneName): + if sGene is None or gene.chrom_end - gene.chrom_start > sGene.chrom_end - sGene.chrom_start: + sGene = gene + except KeyError: + pass + if sGene is not None: + out.insert(probe, { 'probe' : probe, 'chrom' : sGene.chrom, 'strand' : sGene.strand, 'chrom_start' : sGene.chrom_start, 'chrom_end' : sGene.chrom_end }) + return out + + + + +