# HG changeset patch # User yong # Date 1332281979 14400 # Node ID 263c680a104ca5c9421300efde7c42f45e1a8b4c Uploaded diff -r 000000000000 -r 263c680a104c ConComp.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ConComp.py Tue Mar 20 18:19:39 2012 -0400 @@ -0,0 +1,209 @@ +#!/usr/bin/env python + + +######## compare TFs occupied segements between human and mouse +######## input: Two ChIP-seq peak of the same factor (human and mouse, bed format) +######## output:Conserved peaks group + +######## required tools : LiftOver +####### Liftover http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/liftOver +####### requried python library pybedtools +####### pybedtools http://pypi.python.org/pypi/pybedtools + +from optparse import OptionParser +import sys +import os +import pybedtools +#### parameters +parser=OptionParser("Usage: %prog [optinos] hg_mm_liftover_chain mm_hg_liftover_chain hg_homologous_TSS mm_homologous_TSS human_peaks.bed mouse_peaks.bed") +parser.add_option("-p","--percent",dest="liftOver_threshold",default=0.6,help="The -minMatch parameters in liftOver program, with default is 0.6") +parser.add_option("-l","--overlap",dest="intersect_th",default=1,help="The overlap requirement between the human and mouse peaks, the -f option in interserBed, default is 1E-9 ,1bp") +parser.add_option("-o","--outfile",dest="outfilename",default="TF",help="prefix of the final outputs, , default is TF") + +(options,args)=parser.parse_args() +if len(args) !=6: + parser.error("incorrect number of input peaks, two liftover chians , two tss files and two peak files ") + + + +#### data sets used +LO_Per=options.liftOver_threshold +hg_peak=args[4] +mm_peak=args[5] +hg_mm_chain=args[0] +mm_hg_chain=args[1] +hg_homo_TSS=args[2] +mm_homo_TSS=args[3] + +##### define output file names +cons_out=options.outfilename + "_conserved_OSs.txt" +hg_out=options.outfilename + "_hg_conserved_OSs.bed" +mm_out=options.outfilename + "_mm_conserved_OSs.bed" +hg_only_out=options.outfilename + "_hg_only_OSs.bed" +mm_only_out=options.outfilename + "_mm_only_OSs.bed" +gene_label_out=options.outfilename + "_nearest_homogenes.txt" + + + +def label_bed(file,outname,id): ###label input bedformat + outfile=open(outname,"w") + ct=0 + for line in open(file): + f=line.split() + chr=f[0] + start=int(f[1]) + end=int(f[2]) + if start <0: ##### need to add format check late + continue + elif end <0: + continue + elif start >end: + continue + else: + outfile.write("%s\t%d\t%d\t%s%06d\n"%(chr,start,end,id,ct)) + ct=ct+1 + outfile.close() + +def bed2dic(file): ### transform bed file into dictionary + out={} + for line in open(file): + f=line.split() + chr=f[0] + start=f[1] + end=f[2] + id=f[3] + if id not in out: + out[id]=[] + out[id]="\t".join([chr,start,end]) + return out +### label the input bed files +print "add id to peaks.." +label_bed(hg_peak,"hg_peaks_label.bed.tp","hg_peaks") +label_bed(mm_peak,"mm_peaks_label.bed.tp","mm_peaks") + +### reciprocal liftOver use labeled peaks +print "running reciprocal liftOver..." +hg_liftover_cmd="liftOver -minMatch=%s hg_peaks_label.bed.tp %s hg_peaks_mm9.bed.tp hg_peaks_unmapped" % (LO_Per,hg_mm_chain) +print "liftOver human peaks to mouse..." +os.system(hg_liftover_cmd) +print "liftOver mouse peaks to human..." +mm_liftover_cmd="liftOver -minMatch=%s mm_peaks_label.bed.tp %s mm_peaks_hg19.bed.tp mm_peaks_unmapped" % (LO_Per,mm_hg_chain) +os.system(mm_liftover_cmd) +### chose conserved peaks supported by two directions +hg_mm9=pybedtools.BedTool('hg_peaks_mm9.bed.tp') +hg_mm9.intersect("mm_peaks_label.bed.tp",wa=True,wb=True,f=options.intersect_th).saveas('hg_mm_conserved_mm9.txt.tp') + +mm_hg19=pybedtools.BedTool('mm_peaks_hg19.bed.tp') +mm_hg19.intersect("hg_peaks_label.bed.tp",wa=True,wb=True,f=options.intersect_th).saveas('mm_hg_conserved_hg19.txt.tp') +### merged the two files and assign group ID to each pair +print "printing output ..." +mm9_pair=set() +hg19_pair=set() +for line in open("hg_mm_conserved_mm9.txt.tp"): + f=line.split() + hg_id=f[3] + mm_id=f[7] + mm9_pair.add(hg_id+":"+mm_id) +for line in open("mm_hg_conserved_hg19.txt.tp"): + f=line.split() + hg_id=f[7] + mm_id=f[3] + hg19_pair.add(hg_id+":"+mm_id) +conserved_pair=hg19_pair.intersection(mm9_pair) + + +### output results : conserved peaks with group id +gp_ct=0 +hgpeak_dic={} +mmpeak_dic={} +def bed2dic(file): + out={} + for line in open(file): + f=line.split() + chr=f[0] + start=f[1] + end=f[2] + id=f[3] + if id not in out: + out[id]=[] + out[id]="\t".join([chr,start,end]) + return out + +### converted peaks in to dictinoary list, used for the follwing intersection +hgpeak_dic=bed2dic("hg_peaks_label.bed.tp") +mmpeak_dic=bed2dic("mm_peaks_label.bed.tp") +##### define output file names +cons_out=options.outfilename + "_conserved_OSs.txt" +hg_out=options.outfilename + "_hg_conserved_OSs.bed" +mm_out=options.outfilename + "_mm_conserved_OSs.bed" +hg_only_out=options.outfilename + "_hg_only_OSs.bed" +mm_only_out=options.outfilename + "_mm_only_OSs.bed" + +pair_out=file(cons_out,"w") +hg_conf=file(hg_out,"w") +mm_conf=file(mm_out,"w") +pair_out.write("chr_hg\tstart_hg\tend_hg\tchr_mm\tstart_mm\tend_mm\tgroup_id\n") + + +hg_ocs=set() +mm_ocs=set() +for item in conserved_pair: + hg_id=item.split(":")[0] + mm_id=item.split(":")[1] + group_id="group%06d"%(gp_ct) + gp_ct=gp_ct+1 + hg_ocs.add(hg_id) +# pair_out.write("%s\t%s\t%s\t%s\t%s\n"%(hgpeak_dic[hg_id],hg_id,mmpeak_dic[mm_id],mm_id,group_id)) + pair_out.write("%s\t%s\t%s\n"%(hgpeak_dic[hg_id],mmpeak_dic[mm_id],group_id)) + hg_conf.write("%s\t%s\n"%(hgpeak_dic[hg_id],group_id)) + mm_conf.write("%s\t%s\n"%(mmpeak_dic[mm_id],group_id)) +pair_out.close() +hg_conf.close() +mm_conf.close() + +pybedtools.BedTool(hg_peak).intersect(hg_out,wa=True,v=True).saveas(hg_only_out) +pybedtools.BedTool(mm_peak).intersect(mm_out,wa=True,v=True).saveas(mm_only_out) +################ neaerest homologous genes label ######################### +print "labeling nearest homologous genes..." + +outfile=file(gene_label_out,"w") +pybedtools.BedTool(hg_out).closest(hg_homo_TSS,d=True).saveas("hg_nearest.tp") +pybedtools.BedTool(mm_out).closest(mm_homo_TSS,d=True).saveas("mm_nearest.tp") +hg_rn={} + +for line in open("hg_nearest.tp"): + f=line.split() + gid=f[3] + chr=f[0] + start=f[1] + end=f[2] + gene=f[7] + ggid=f[8] + d=f[9] + info=[chr,start,end,gene,d] + if gid not in hg_rn: + hg_rn[gid]=[] + hg_rn[gid].append(ggid) + hg_rn[gid].append(info) +outfile.write("chr_hg\tstart_hg\tend_hg\tgene_hg\tdistance_hg\tchr_mm\tstart_mm\tend_mm\tgene_mm\tdistance_mm\tgroup_id\n" ) +for line in open("mm_nearest.tp"): + f=line.split() + gid=f[3] + chr=f[0] + start=f[1] + end=f[2] + gene=f[7] + ggid=f[8] + d=f[9] + info=[chr,start,end,gene,d] + if gid in hg_rn and ggid==hg_rn[gid][0]: + outfile.write("%s\t%s\t%s\n"%("\t".join(hg_rn[gid][1]),"\t".join(info),gid) ) + +outfile.close() + +#### clean the temporary file +cmd="rm *.tp" +os.system(cmd) +cmd="rm *unmapped" +os.system(cmd) +print "Done! check %s"%(cons_out)