changeset 0:263c680a104c default tip

Uploaded
author yong
date Tue, 20 Mar 2012 18:19:39 -0400
parents
children
files ConComp.py
diffstat 1 files changed, 209 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ConComp.py	Tue Mar 20 18:19:39 2012 -0400
@@ -0,0 +1,209 @@
+#!/usr/bin/env python
+
+
+######## compare TFs occupied segements between human and mouse 
+######## input: Two ChIP-seq peak of the same factor (human and mouse, bed format)
+######## output:Conserved peaks group
+       
+######## required tools : LiftOver
+#######  Liftover http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/liftOver
+#######  requried python library  pybedtools
+#######  pybedtools http://pypi.python.org/pypi/pybedtools
+
+from optparse import OptionParser
+import sys
+import os
+import pybedtools
+#### parameters
+parser=OptionParser("Usage: %prog [optinos] hg_mm_liftover_chain mm_hg_liftover_chain hg_homologous_TSS mm_homologous_TSS human_peaks.bed mouse_peaks.bed")
+parser.add_option("-p","--percent",dest="liftOver_threshold",default=0.6,help="The -minMatch parameters in liftOver program, with default is 0.6")
+parser.add_option("-l","--overlap",dest="intersect_th",default=1,help="The overlap requirement between the human and mouse peaks, the -f option in interserBed, default is 1E-9 ,1bp")
+parser.add_option("-o","--outfile",dest="outfilename",default="TF",help="prefix  of the final outputs, , default  is TF")
+
+(options,args)=parser.parse_args()
+if len(args) !=6:
+        parser.error("incorrect number of input peaks, two liftover chians , two tss files and two peak files ")
+
+
+
+#### data sets used 
+LO_Per=options.liftOver_threshold
+hg_peak=args[4]
+mm_peak=args[5]
+hg_mm_chain=args[0]
+mm_hg_chain=args[1]
+hg_homo_TSS=args[2]
+mm_homo_TSS=args[3]
+
+##### define output file names
+cons_out=options.outfilename + "_conserved_OSs.txt"
+hg_out=options.outfilename + "_hg_conserved_OSs.bed"
+mm_out=options.outfilename + "_mm_conserved_OSs.bed"
+hg_only_out=options.outfilename + "_hg_only_OSs.bed"
+mm_only_out=options.outfilename + "_mm_only_OSs.bed"
+gene_label_out=options.outfilename + "_nearest_homogenes.txt"
+
+
+
+def label_bed(file,outname,id):  ###label input bedformat 
+        outfile=open(outname,"w")
+        ct=0
+        for line in open(file):
+                f=line.split()
+                chr=f[0]
+                start=int(f[1])
+                end=int(f[2])
+                if start <0: ##### need to add format check late
+                        continue
+                elif end <0:
+                        continue
+                elif start >end:
+                        continue
+                else:
+                        outfile.write("%s\t%d\t%d\t%s%06d\n"%(chr,start,end,id,ct)) 
+                        ct=ct+1
+        outfile.close()
+
+def bed2dic(file):  ### transform bed file into dictionary
+        out={}
+        for line in open(file):
+                f=line.split()
+                chr=f[0]
+                start=f[1]
+                end=f[2]
+                id=f[3]
+                if id not in out:
+                        out[id]=[]
+                out[id]="\t".join([chr,start,end])
+        return out
+###  label the input bed files 
+print "add id to peaks.."
+label_bed(hg_peak,"hg_peaks_label.bed.tp","hg_peaks")
+label_bed(mm_peak,"mm_peaks_label.bed.tp","mm_peaks")
+
+###     reciprocal liftOver use labeled peaks 
+print "running reciprocal liftOver..."
+hg_liftover_cmd="liftOver -minMatch=%s  hg_peaks_label.bed.tp  %s   hg_peaks_mm9.bed.tp  hg_peaks_unmapped" % (LO_Per,hg_mm_chain)
+print "liftOver human peaks to mouse..."
+os.system(hg_liftover_cmd)
+print "liftOver mouse peaks to human..."
+mm_liftover_cmd="liftOver -minMatch=%s  mm_peaks_label.bed.tp  %s  mm_peaks_hg19.bed.tp  mm_peaks_unmapped" % (LO_Per,mm_hg_chain)
+os.system(mm_liftover_cmd)
+###    chose conserved peaks supported by two directions 
+hg_mm9=pybedtools.BedTool('hg_peaks_mm9.bed.tp')
+hg_mm9.intersect("mm_peaks_label.bed.tp",wa=True,wb=True,f=options.intersect_th).saveas('hg_mm_conserved_mm9.txt.tp')
+
+mm_hg19=pybedtools.BedTool('mm_peaks_hg19.bed.tp')
+mm_hg19.intersect("hg_peaks_label.bed.tp",wa=True,wb=True,f=options.intersect_th).saveas('mm_hg_conserved_hg19.txt.tp')
+###     merged the two files and assign group ID to each pair
+print "printing output ..."
+mm9_pair=set()
+hg19_pair=set()
+for line in open("hg_mm_conserved_mm9.txt.tp"):
+        f=line.split()
+        hg_id=f[3]
+        mm_id=f[7]
+        mm9_pair.add(hg_id+":"+mm_id)
+for line in open("mm_hg_conserved_hg19.txt.tp"):
+        f=line.split()
+        hg_id=f[7]
+        mm_id=f[3]
+        hg19_pair.add(hg_id+":"+mm_id)
+conserved_pair=hg19_pair.intersection(mm9_pair)
+
+
+###     output results : conserved peaks with group id 
+gp_ct=0
+hgpeak_dic={}
+mmpeak_dic={}
+def bed2dic(file):
+        out={}
+        for line in open(file):
+                f=line.split()
+                chr=f[0]
+                start=f[1]
+                end=f[2]
+                id=f[3]
+                if id not in out:
+                        out[id]=[]
+                out[id]="\t".join([chr,start,end])
+        return out
+
+### converted peaks in to dictinoary list, used for the follwing intersection
+hgpeak_dic=bed2dic("hg_peaks_label.bed.tp")
+mmpeak_dic=bed2dic("mm_peaks_label.bed.tp")
+##### define output file names
+cons_out=options.outfilename + "_conserved_OSs.txt"
+hg_out=options.outfilename + "_hg_conserved_OSs.bed"
+mm_out=options.outfilename + "_mm_conserved_OSs.bed"
+hg_only_out=options.outfilename + "_hg_only_OSs.bed"
+mm_only_out=options.outfilename + "_mm_only_OSs.bed"
+
+pair_out=file(cons_out,"w")
+hg_conf=file(hg_out,"w")
+mm_conf=file(mm_out,"w")
+pair_out.write("chr_hg\tstart_hg\tend_hg\tchr_mm\tstart_mm\tend_mm\tgroup_id\n")
+
+
+hg_ocs=set()
+mm_ocs=set()
+for item in conserved_pair:
+        hg_id=item.split(":")[0]
+        mm_id=item.split(":")[1]
+        group_id="group%06d"%(gp_ct)
+        gp_ct=gp_ct+1
+        hg_ocs.add(hg_id)
+#        pair_out.write("%s\t%s\t%s\t%s\t%s\n"%(hgpeak_dic[hg_id],hg_id,mmpeak_dic[mm_id],mm_id,group_id))       
+        pair_out.write("%s\t%s\t%s\n"%(hgpeak_dic[hg_id],mmpeak_dic[mm_id],group_id))
+        hg_conf.write("%s\t%s\n"%(hgpeak_dic[hg_id],group_id))
+        mm_conf.write("%s\t%s\n"%(mmpeak_dic[mm_id],group_id))
+pair_out.close()
+hg_conf.close()
+mm_conf.close()
+
+pybedtools.BedTool(hg_peak).intersect(hg_out,wa=True,v=True).saveas(hg_only_out)
+pybedtools.BedTool(mm_peak).intersect(mm_out,wa=True,v=True).saveas(mm_only_out)
+################  neaerest homologous genes label #########################
+print "labeling nearest homologous genes..."
+
+outfile=file(gene_label_out,"w")
+pybedtools.BedTool(hg_out).closest(hg_homo_TSS,d=True).saveas("hg_nearest.tp")
+pybedtools.BedTool(mm_out).closest(mm_homo_TSS,d=True).saveas("mm_nearest.tp")
+hg_rn={}
+
+for line in open("hg_nearest.tp"):
+        f=line.split()
+        gid=f[3]
+        chr=f[0]
+        start=f[1]
+        end=f[2]
+        gene=f[7]
+        ggid=f[8]
+        d=f[9]
+        info=[chr,start,end,gene,d]
+        if gid not in hg_rn:
+                hg_rn[gid]=[]
+        hg_rn[gid].append(ggid)
+        hg_rn[gid].append(info)
+outfile.write("chr_hg\tstart_hg\tend_hg\tgene_hg\tdistance_hg\tchr_mm\tstart_mm\tend_mm\tgene_mm\tdistance_mm\tgroup_id\n" )
+for line in open("mm_nearest.tp"):
+        f=line.split()
+        gid=f[3]
+        chr=f[0]
+        start=f[1]
+        end=f[2]
+        gene=f[7]
+        ggid=f[8]
+        d=f[9]
+        info=[chr,start,end,gene,d]
+        if gid in hg_rn and ggid==hg_rn[gid][0]:
+                outfile.write("%s\t%s\t%s\n"%("\t".join(hg_rn[gid][1]),"\t".join(info),gid) )
+
+outfile.close()
+
+#### clean the temporary file 
+cmd="rm *.tp"
+os.system(cmd)
+cmd="rm *unmapped"
+os.system(cmd)
+print "Done! check %s"%(cons_out)