view seg2matrix/mapSegToGeneMatrix.py @ 46:ebf3bc09c383

add snpEff code
author jingchunzhu
date Thu, 13 Aug 2015 21:49:03 -0700
parents 61f03b481b0d
children 728eda331f07
line wrap: on
line source

#!/usr/bin/env python

import sys,string,copy
import CGData.RefGene
import CGData.GeneMap
import segToProbeMap

if __name__ == "__main__":
    
    if len(sys.argv[:])!=4:
        print "python mapSegToGeneMatrix.py genomicsSegmentIn refGene GeneLevelMatrixOut\n"
        sys.exit()
    refgene = CGData.RefGene.RefGene()
    refgene.load( sys.argv[2] )

    #* b for cnv 
    probeMapper = CGData.GeneMap.ProbeMapper('b')

    fin =open(sys.argv[1],'r')
    genes= {}
    samples={}
    matrix=[]  #sample then gene
    for gene in refgene.get_gene_list():
        genes[gene]=len(genes)

    Ngene = len(genes.keys())
    oneSample=[]    
    for i in range(0, Ngene):
        oneSample.append([]);

    print "genes: ", len(genes)

    count =0

    while 1:
        count = count+1
        #print count
        line =string.strip(fin.readline())
        if line =="":
            break
        if line[0]=="#":
            continue
        tmp = string.split(line,"\t")
        if len(tmp)!= 6:
            continue
        seg = segToProbeMap.probeseg("", tmp[1], int(tmp[2]), int(tmp[3]),tmp[4])
        sample = tmp[0]
        value = float(tmp[5])
        if sample not in samples:
            samples[sample]=len(samples)
            matrix.append(copy.deepcopy(oneSample))
            
        hits={}
        for hit in probeMapper.find_overlap( seg, refgene ):
            gene = hit.name
            if gene in hits:
                continue
            hits[gene]=0
            matrix[samples[sample]][genes[gene]].append(value)
    fin.close()

    print "segments: ", count

    fout =open(sys.argv[3],'w')
    sample_list =samples.keys()
    fout.write("sample\t"+string.join(sample_list,"\t")+"\n")
    for gene in genes.keys():
        fout.write(gene)
        for sample in sample_list:
            list = matrix[samples[sample]][genes[gene]]
            if len(list)==0:
                average =0
            elif len(list)==1:
                average = list[0]
                average =round(average,3)
            else:
                total=0.0
                for value in list:
                    total = total +value
                average = total/len(list)
                average =round(average,3)
            fout.write("\t"+str(average))
        fout.write("\n")
    fout.close()