view seg2matrix/CGData/GeneMap.py @ 37:e81019e3ac99

Updated synapseGetDataset to look at the filename rather than the (no longer existant) content type field to determine if the data is in zip format
author melissacline
date Mon, 27 Jul 2015 16:29:24 -0700
parents ab20c0d04f4a
children 3e5680fecd7a
line wrap: on
line source

#!/usr/bin/env python

import csv
import sys
import re

import CGData.ProbeMap
import CGData.GenomicMatrix

class ProbeMapper(object):
    """
    Class to map the probes. Expects handle to the refGene_hg18.table file
    """

    def __init__(self, mode='g'):
        self.cmp_func = optionMap[mode]

    def find_overlap(self, segment, ref_gene, cmp_func=None):
        """
        Function to find overlaps for a given probe description.
        the cmp_func arg is a function that returns a 'True' or 'False' for
        a given probe description and a gene, examples include 'gene_overlap'
        and 'gene_simple_meth_overlap'
        """
        if cmp_func is None:
            cmp_func = self.cmp_func
        if not ref_gene.has_chrom(segment.chrom):
            return []
        chromList = ref_gene.get_chrom(segment.chrom)

        out = []
        for gene in chromList:
            if cmp_func(segment.chrom_start,
                        segment.chrom_end, segment.strand, gene):
                out.append(gene)
        return out


#
# The set of functions that can be used to do comparisons
#


def block_both_strand(start, end, strand, gene):
    """
    Check is segment is between gene start and end, either strand
    
    **Code 'b'**
    """
    if gene.chrom_end >= start and gene.chrom_start <= end:
        return True
    return False

def block_same_strand(start, end, strand, gene):
    """
    Check is segment is on same strand, between gene start and end
    
    **Code 's'**
    """
    if gene.chrom_end >= start and gene.chrom_start <= end and strand == gene.strand:
        return True
    return False


def exon_same_strand(start, end, strand, gene):
    """
    Check is segment is on same strand, and occurs on an exon
    
    **Code 'g'**
    """

    if gene.strand != strand:
        return False
    for i in range(int(gene.ex_count)):
        if gene.ex_end[i] >= start and gene.ex_start[i] <= end:
            return True
    return False


def exon_both_strand(start, end, strand, gene):
    """
    Check is segment occurs on an exon, on either stand
    
    **Code 'e'**
    """
    for i in range(int(gene.ex_count)):
        if gene.ex_end[i] >= start and gene.ex_start[i] <= end:
            return True
    return False

def block_same_strand_coverage75(start, end, strand, gene):
    if gene.chrom_end >= start and gene.chrom_start <= end and strand == gene.strand:
        cov = min(gene.chrom_end,end) - max(gene.chrom_start, start)
        if float(cov) / float(end - start) > 0.75:
            return True
    return False


def exon_same_strand_coverage75(start, end, strand, gene):
    if strand != gene.strand:
        return False
    cov = 0
    for i in range(int(gene.ex_count)):
        if gene.ex_end[i] >= start and gene.ex_start[i] <= end:
            cov += min(gene.ex_end[i],end) - max(gene.ex_start[i], start)
    if float(cov) / float(end - start) > 0.75:
        return True
    return False


class Intersector:
    def __init__(self, same_strand=True, exons=False, coverage=None, start_rel_cdsStart=None, end_rel_cdsStart=None, start_rel_tss=None, end_rel_tss=None):
        self.same_strand = same_strand
        self.exons = exons
        self.coverage = coverage
        self.start_rel_tss = start_rel_tss
        self.end_rel_tss = end_rel_tss
        self.start_rel_cdsStart = start_rel_cdsStart
        self.end_rel_cdsStart = end_rel_cdsStart
    
    def hit(self, start, end, strand, gene):
        if self.same_strand and strand != gene.strand:
            return False

        if self.exons:
            cov = 0
            for i in range(int(gene.ex_count)):
                if gene.ex_end[i] >= start and gene.ex_start[i] <= end:
                    cov += min(gene.ex_end[i],end) - max(gene.ex_start[i], start)
            if self.coverage is None and cov > 0:
                return True
            else:
                if float(cov) / float(end - start) > self.coverage:
                    return True
        else:
                wStart = gene.chrom_start
                wEnd   = gene.chrom_end
                
                if self.start_rel_cdsStart is not None:                
                    if gene.strand == "+":
                        wStart = gene.cds_start + self.start_rel_cdsStart
                    if gene.strand == "-":
                        wStart = gene.cds_end - self.start_rel_cdsStart
                if self.end_rel_cdsStart is not None:                
                    if gene.strand == "+":
                        wEnd = gene.cds_start + self.end_rel_cdsStart
                    if gene.strand == "-":
                        wEnd = gene.cds_end - self.end_rel_cdsStart
                
                if self.start_rel_tss is not None:
                    if gene.strand == "+":
                        wStart = gene.chrom_start + self.start_rel_tss
                    if gene.strand == "-":
                        wStart = gene.chrom_end - self.start_rel_tss
                if self.end_rel_tss is not None:
                    if gene.strand == "+":
                        wEnd = gene.chrom_start + self.end_rel_tss
                    if gene.strand == "-":
                        wEnd = gene.chrom_end - self.end_rel_tss
                
                cstart = min(wEnd, wStart)
                cend = max(wEnd, wStart)
                if cend >= start and cstart <= end:
                    cov = min(gene.chrom_end,end) - max(gene.chrom_start, start)
                    if self.coverage is None and cov > 0:
                        return True
                    else:
                        if float(cov) / float(end - start) > 0.75:
                            return True
    
        return False

        

###ADD MORE FUNCTIONS HERE


####

###To add options to the command line, map the option character to a function
###for example '-m' maps to gene_simple_meth_overlap

optionMap = {
    "b": block_both_strand,
    "s": block_same_strand,
    "g": exon_same_strand,
    "e": exon_both_strand
}



def genomicSegment2Matrix(genomicSegment, refGene, probeMapper):
    """
    Take a genomicSegment map, compare it against a refGene table,
    and contruct a genomicMatrix
    """
    out = CGData.GenomicMatrix.GenomicMatrix()
    out.init_blank( rows=refGene.get_gene_list(), cols=genomicSegment.get_key_list() )
    for id in genomicSegment.get_key_list():
        for segment in genomicSegment.get_by(id):
            for hit in probeMapper.find_overlap( segment, refGene ):
                out.set_val(row_name=hit.name, col_name=segment.id, value=segment.value )
    return out


def filter_longest_form(refgene):
    """
    take a refgene table and filter multiple gene isoforms down to the longest
    """
    ng = CGData.RefGene.RefGene()
    for g in refgene.get_gene_list():
        longest = None
        length = 0
        for elem in refgene.get_gene(g):
            newLength = elem.chrom_end - elem.chrom_start
            if newLength > length:
                length = newLength
                longest = elem
        ng.add(longest)
    ng.loaded = True
    return ng


def genomicSegment2MatrixNorm(genomicSegment, refGene, probeMapper):
    """
    Given 
    """
    ng = filter_longest_form(refGene)
    #enumerate the col order of the sample ids
    idList = genomicSegment.get_key_list()
    
    geneList = ng.get_gene_list()
    
    tmp = CGData.GenomicMatrix.GenomicMatrix()
    tmp.init_blank( rows=geneList, cols=idList )
    geneHits = {}
    #read through the segment one sample id at a time
    for id in idList:   
        segmentMap = {}
        for segment in genomicSegment.get_by(id):
            for hit in probeMapper.find_overlap( segment, ng ):
                span = float(min(segment.chrom_end, hit.chrom_end) - max(segment.chrom_start, hit.chrom_start)) / float(hit.chrom_end - hit.chrom_start)
                #if hit.name not in segmentMap:
                #    segmentMap[hit.name] = []
                try:
                    segmentMap[hit.name].append(
                        ( span, segment.value )
                    )
                except KeyError:
                    segmentMap[hit.name] = [
                        ( span, segment.value )
                    ]
        
        for gene in segmentMap:
            geneHits[gene] = True
            mapInfo = segmentMap[gene]
            coverage = sum( i[0] for i in mapInfo )
            assert coverage <= 1.0
            value = sum( i[0]*i[1] for i in mapInfo )
            #print coverage, value, value/coverage, segmentMap[gene]
            tmp.set_val(row_name=gene, col_name=id, value=value/coverage)
    
    #now remove the blanks
    out = CGData.GenomicMatrix.GenomicMatrix()
    out.init_blank( rows=geneHits, cols=idList )
    for gene in geneHits:
        for sample in idList:
            out.set_val( row_name=gene, col_name=sample, value=tmp.get_val(row_name=gene, col_name=sample))
    return out



def aliasRemap(genomicMatrix, aliasMap):
    """
    Given a genomicMatrix and an alias map, create a new genomic matrix 
    with the probes from the original matrix remapped to the connected aliases
    from the map
    """
    
    am = {}
    for probe in aliasMap.get_key_list():
        for alias in aliasMap.get_by(probe):
            if alias not in am:
                am[alias.alias] = {}
            am[alias.alias][probe] = True
        
    out = CGData.GenomicMatrix.GenomicMatrix()
    out.init_blank( rows=am.keys(), cols=genomicMatrix.get_col_list() )
    probeMap = genomicMatrix.get_row_map()
    for a in am:
        for sample in genomicMatrix.get_col_list():
            o = []
            for p in am[a]:
                if p in probeMap:
                    o.append( genomicMatrix.get_val( col_name=sample, row_name=p) )
            if len(o):
                out.set_val(col_name=sample, row_name=a, value=sum(o) / float(len(o)))
    
    return out
    
def refGeneLink2ProbeLoc(aliasMap, refGene):
    """
    given an alias map, and a refGene produce a probeMap by connecting 
    alias symbols. Returns the coordinates of the longest form
    """
    out = CGData.ProbeLoc.ProbeLoc()
    out.init_blank()
    
    for probe in aliasMap.get_key_list():
        for link in aliasMap.get_by(probe):
            probe = link.probe
            geneName = link.alias
            sGene = None
            try:
                for gene in refGene.get_gene(geneName):
                    if sGene is None or gene.chrom_end - gene.chrom_start > sGene.chrom_end - sGene.chrom_start:
                        sGene = gene
            except KeyError:
                pass
        if sGene is not None:
            out.insert(probe, { 'probe' : probe, 'chrom' : sGene.chrom, 'strand' : sGene.strand, 'chrom_start' : sGene.chrom_start, 'chrom_end' : sGene.chrom_end })
    return out