# HG changeset patch # User jingchunzhu # Date 1438400307 25200 # Node ID 61f03b481b0d8437096d0c2f850c06096c921c0d # Parent 84eb11adc22f89ccee540838b74f61c17419e4f4 new tool diff -r 84eb11adc22f -r 61f03b481b0d seg2matrix/mapSegToGeneMatrix.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/seg2matrix/mapSegToGeneMatrix.py Fri Jul 31 20:38:27 2015 -0700 @@ -0,0 +1,85 @@ +#!/usr/bin/env python + +import sys,string,copy +import CGData.RefGene +import CGData.GeneMap +import segToProbeMap + +if __name__ == "__main__": + + if len(sys.argv[:])!=4: + print "python mapSegToGeneMatrix.py genomicsSegmentIn refGene GeneLevelMatrixOut\n" + sys.exit() + refgene = CGData.RefGene.RefGene() + refgene.load( sys.argv[2] ) + + #* b for cnv + probeMapper = CGData.GeneMap.ProbeMapper('b') + + fin =open(sys.argv[1],'r') + genes= {} + samples={} + matrix=[] #sample then gene + for gene in refgene.get_gene_list(): + genes[gene]=len(genes) + + Ngene = len(genes.keys()) + oneSample=[] + for i in range(0, Ngene): + oneSample.append([]); + + print "genes: ", len(genes) + + count =0 + + while 1: + count = count+1 + #print count + line =string.strip(fin.readline()) + if line =="": + break + if line[0]=="#": + continue + tmp = string.split(line,"\t") + if len(tmp)!= 6: + continue + seg = segToProbeMap.probeseg("", tmp[1], int(tmp[2]), int(tmp[3]),tmp[4]) + sample = tmp[0] + value = float(tmp[5]) + if sample not in samples: + samples[sample]=len(samples) + matrix.append(copy.deepcopy(oneSample)) + + hits={} + for hit in probeMapper.find_overlap( seg, refgene ): + gene = hit.name + if gene in hits: + continue + hits[gene]=0 + matrix[samples[sample]][genes[gene]].append(value) + fin.close() + + print "segments: ", count + + fout =open(sys.argv[3],'w') + sample_list =samples.keys() + fout.write("sample\t"+string.join(sample_list,"\t")+"\n") + for gene in genes.keys(): + fout.write(gene) + for sample in sample_list: + list = matrix[samples[sample]][genes[gene]] + if len(list)==0: + average =0 + elif len(list)==1: + average = list[0] + average =round(average,3) + else: + total=0.0 + for value in list: + total = total +value + average = total/len(list) + average =round(average,3) + fout.write("\t"+str(average)) + fout.write("\n") + fout.close() + diff -r 84eb11adc22f -r 61f03b481b0d seg2matrix/segToProbeMap.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/seg2matrix/segToProbeMap.py Fri Jul 31 20:38:27 2015 -0700 @@ -0,0 +1,77 @@ +#!/usr/bin/env python + +import sys,string +sys.path.insert(0,"../CGDataNew") + +import CGData.GenomicSegment +import CGData.SegToMatrix +import CGData.RefGene +import CGData.GeneMap +import optparse + +class segs: + def __init__(self): + self.probes = [] + + def load (self, handle): #handle bed6 + fin =open(handle,'r') + while 1: + line =string.strip(fin.readline()) + if line =="": + break + if line[0]=="#": + continue + tmp = string.split(line,"\t") + if len(tmp)== 5: + p = probeseg(tmp[0], tmp[1], int(tmp[3]), int(tmp[4]),tmp[2]) + self.probes.append(p) + fin.close() + + +class probeseg: + def __init__(self, name, chrom, chrom_start, chrom_end, strand): + self.name = name + self.chrom = chrom + self.chrom_start = chrom_start + self.chrom_end = chrom_end + self.strand = strand + + +if __name__ == "__main__": + def printUsage(): + print "python segToProbeMap.py segInput(name,chr,strand,start,end) refGene(eg hg18) probeMapOut --mode=cnv|exp\n" + + if len(sys.argv) != 5: + printUsage() + sys.exit() + + parser = optparse.OptionParser() + parser.add_option("--mode", action="store", type="string", dest="mode") + (options, args) = parser.parse_args() + + if (not options.mode) or (options.mode not in ["cnv","exp"]): + printUsage() + sys.exit() + + probes=segs() + probes.load(sys.argv[1]) + + refgene = CGData.RefGene.RefGene() + refgene.load(sys.argv[2]) + + handle = open(sys.argv[3], "w") + if options.mode=="cnv": + probeMapper = CGData.GeneMap.ProbeMapper('b') + if options.mode=="exp": + probeMapper = CGData.GeneMap.ProbeMapper('g') + + handle.write("%s\t%s\t%s\t%s\t%s\t%s\n" % ("#id", "gene","chrom","chromStart","chromEnd","strand")) + for probe in probes.probes: + hits = [] + for hit in probeMapper.find_overlap( probe, refgene ): + if hit.name not in hits: + hits.append(hit.name) + handle.write("%s\t%s\t%s\t%s\t%s\t%s\n" % (probe.name, ",".join(hits), probe.chrom, probe.chrom_start, probe.chrom_end, probe.strand)) + handle.close() + + diff -r 84eb11adc22f -r 61f03b481b0d segToGeneMatrix.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/segToGeneMatrix.xml Fri Jul 31 20:38:27 2015 -0700 @@ -0,0 +1,35 @@ + + + Convert segmented copy number data to gene based matrix data + + + seg2matrix/mapSegToGeneMatrix.py $input $__tool_directory__/seg2matrix/$refGene.assembly $outputMatrix + + + + + + + + + + + + + + + + + + + + + ***Convert segmented copy number data for input into xena*** + + Given a segmented copy number data file, convert it into gene based matrix data + + Output File no 1. matrix file + + + + diff -r 84eb11adc22f -r 61f03b481b0d segToMatrix.xml --- a/segToMatrix.xml Fri Jul 31 15:29:53 2015 -0700 +++ b/segToMatrix.xml Fri Jul 31 20:38:27 2015 -0700 @@ -3,7 +3,7 @@ Prep segmented copy number data for Xena - seg2matrix/segToMatrixGalaxy.py $input seg2matrix/$refGene.assembly $outputMatrix $outputProbeMap + seg2matrix/segToMatrixGalaxy.py $input $__tool_directory__/seg2matrix/$refGene.assembly $outputMatrix $outputProbeMap