go_enrichment: GO_enrichment/GO_enrichment.py comparison

comparison GO_enrichment/GO_enrichment.py @ 0:bde64415f03b draft

Uploaded

author	nanettec
date	Wed, 16 Mar 2016 10:15:05 -0400
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:bde64415f03b
+"""
+@summary: GO enrichment analysis (hotspots)
+@author: nanette.coetzer@gmail.com
+@version 5
+"""
+import optparse, sys
+import subprocess
+import tempfile
+import os, re
+def stop_err( msg ):
+sys.stderr.write( "%s\n" % msg )
+sys.exit()
+def __main__():
+#Parse Command Line
+parser = optparse.OptionParser()
+parser.add_option("-n", "--rscript", default=None, dest="rscript",
+help="R script")
+parser.add_option("-i", "--input1", default=None, dest="input1",
+help="Gene universe (Gene positions file)")
+parser.add_option("-j", "--input2", default=None, dest="input2",
+help="Hotspots genelists all eQTLs")
+parser.add_option("-k", "--input3", default=None, dest="input3",
+help="Hotspots genelists cis-eQTLs")
+parser.add_option("-l", "--input4", default=None, dest="input4",
+help="Hotspots genelists trans-eQTLs")
+parser.add_option("-m", "--input5", default=None, dest="input5",
+help="Gene2GO mapping")
+parser.add_option("-o", "--output1", default=None, dest="output1",
+help="GO enrichment output; all eQTLs")
+parser.add_option("-p", "--output2", default=None, dest="output2",
+help="GO enrichment output; cis eQTLs")
+parser.add_option("-q", "--output3", default=None, dest="output3",
+help="GO enrichment output; trans eQTLs")
+parser.add_option("-r", "--output4", default=None, dest="output4",
+help="Hotspot summary file")
+(options, args) = parser.parse_args()
+try:
+open(options.input1, "r").close()
+except TypeError, e:
+stop_err("You need to supply the Gene Universe file:\n" + str(e))
+except IOError, e:
+stop_err("Can not open the Gene Universe file:\n" + str(e))
+try:
+open(options.input2, "r").close()
+except TypeError, e:
+stop_err("You need to supply the Genes of Interest file (all eQTLs):\n" + str(e))
+except IOError, e:
+stop_err("Can not open the Hotspots genelists file (all eQTLs):\n" + str(e))
+try:
+open(options.input3, "r").close()
+except TypeError, e:
+stop_err("You need to supply the Genes of Interest file (cis-eQTLs):\n" + str(e))
+except IOError, e:
+stop_err("Can not open the Hotspots genelists file (cis-eQTLs):\n" + str(e))
+try:
+open(options.input4, "r").close()
+except TypeError, e:
+stop_err("You need to supply the Genes of Interest file (trnas-eQTLs):\n" + str(e))
+except IOError, e:
+stop_err("Can not open the Hotspots genelists file (trans-eQTLs):\n" + str(e))
+try:
+open(options.input5, "r").close()
+except TypeError, e:
+stop_err("You need to supply the Gene2GO mapping file:\n" + str(e))
+except IOError, e:
+stop_err("Can not open the Gene2GO mapping file:\n" + str(e))
+##########################################################
+def uniq(input):
+	output = []
+	for x in input:
+	  if x not in output:
+	    output.append(x)
+	return output
+def manage_output(hotspot_nr, tempdir, type):
+	sign_list = [];	p_BP = 0; adjp_BP = 0; p_MF = 0; adjp_MF = 0; p_CC = 0;	adjp_CC = 0
+	bpfile = open(tempdir+"/GO_table_BP.txt","r")
+	new_file = open(tempdir+"/hotspot"+str(hotspot_nr)+"_"+str(type)+"_GO_table.txt","w")
+	i = 0
+	stars = 0
+	for line in bpfile:
+	    i += 1
+	    if i > 1:
+		l = line.strip().split("\t")
+		new_file.write("\t".join(l[1:])+"\n")
+		if l[9] == "\"*\"":
+		    p_BP += 1
+		if l[10] == "\"*\"":
+		    adjp_BP += 1
+	    else:
+		l = line.strip().split("\t")
+		l[0] = "BP_GO.ID"
+		newl = "\t".join(l)
+		new_file.write(newl+"\n")
+	bpfile.close()
+	stars = 5
+	name = "tGO_BP_classic_"+str(stars)+"_all.pdf"
+	full_name = tempdir+"/"+name
+	new_name = tempdir+"/hotspot"+str(hotspot_nr)+"_"+str(type)+"_"+name
+	s = "mv %s %s" %(full_name, new_name)
+	subprocess.call(s, shell=True)
+	sign_list.append(p_BP)
+	sign_list.append(adjp_BP)
+	mffile = open(tempdir+"/GO_table_MF.txt","r")
+	i = 0
+	stars = 0
+	for line in mffile:
+	    i += 1
+	    if i > 1:
+		l = line.strip().split("\t")
+		new_file.write("\t".join(l[1:])+"\n")
+		if l[9] == "\"*\"":
+		    p_MF += 1
+		if l[10] == "\"*\"":
+		    adjp_MF += 1
+	    else:
+		l = line.strip().split("\t")
+		l[0] = "MF_GO.ID"
+		newl = "\t".join(l)
+		new_file.write("\n"+newl+"\n")
+	mffile.close()
+	stars = 5
+	name = "tGO_MF_classic_"+str(stars)+"_all.pdf"
+	full_name = tempdir+"/"+name
+	new_name = tempdir+"/hotspot"+str(hotspot_nr)+"_"+str(type)+"_"+name
+	s = "mv %s %s" %(full_name, new_name)
+	subprocess.call(s, shell=True)
+	sign_list.append(p_MF)
+	sign_list.append(adjp_MF)
+	ccfile = open(tempdir+"/GO_table_CC.txt","r")
+	i = 0
+	stars = 0
+	for line in ccfile:
+	    i += 1
+	    if i > 1:
+		l = line.strip().split("\t")
+		new_file.write("\t".join(l[1:])+"\n")
+		if l[9] == "\"*\"":
+		    p_CC += 1
+		if l[10] == "\"*\"":
+		    adjp_CC += 1
+	    else:
+		l = line.strip().split("\t")
+		l[0] = "CC_GO.ID"
+		newl = "\t".join(l)
+		new_file.write("\n"+newl+"\n")
+	ccfile.close()
+	stars = 5
+	name = "tGO_CC_classic_"+str(stars)+"_all.pdf"
+	full_name = tempdir+"/"+name
+	new_name = tempdir+"/hotspot"+str(hotspot_nr)+"_"+str(type)+"_"+name
+	s = "mv %s %s" %(full_name, new_name)
+	subprocess.call(s, shell=True)
+	new_file.close()
+	sign_list.append(p_CC)
+	sign_list.append(adjp_CC)
+	return sign_list
+# Create temp direcotry
+tempdir = tempfile.mkdtemp()
+#print tempdir
+#fixdir = "/lustre/galchew/MeQTL/eQTL_pipeline"
+#fixdir = "/Users/nanettecoetzer/Documents/Bioinformatics/MAIZE_project/Hotspots_eQTLs/enrichment_analysis/TopGO/add_to_hotspots_pipeline"
+# copy INPUT file to the temp directory
+s = "cp %s %s/gene_universe.txt" %(options.input1, tempdir)
+subprocess.call(s, shell=True)
+s = "cp %s %s/gene2GO_mapping.map" %(options.input5, tempdir)
+subprocess.call(s, shell=True)
+# create R script => save in temp directory
+# generate new header
+new_script = open(tempdir+"/new_script.r","w")
+header = "setwd(\"%s\")" %tempdir
+new_script.write(header+"\n")
+# add script body
+script = open(options.rscript,"r")
+#script = open(fixdir+"/TopGO_pipeline_new.txt","r")
+for line in script:
+new_script.write(line.strip()+"\n")
+new_script.close()
+# get 2 parents
+#infile = open(options.input2,'r')
+#parents = []
+#for line in infile:
+#	l = line.strip().split("\t")
+#	if not line.startswith("=") and len(l) > 4:
+#	    parents.append(l[12])
+#parent_list = uniq(parents)
+#infile.close()
+summary_file = open(options.output4,'w')
+summary_file.write("Type\tNumber\tChr\tSliding_IDs\tInterval_IDs\tNum_eQTLs\tParent_neg_additive\tParent_pos_additive\tBP_pVal<0.01\tBP_adj_pVal<0.05\tMF_pVal<0.01\tMF_adj_pVal<0.05\tCC_pVal<0.01\tCC_adj_pVal<0.05\n")
+############################## all eQTLs ##############################
+hotspot_nr = 0
+i = 0
+parent1 = 0 # Parent_neg_additive
+parent2 = 0 # Parent_pos_additive
+count_eqtl = 0
+infile2 = open(options.input2,'r')
+for line in infile2:
+	l = line.strip().split("\t")
+	if line.startswith("="):
+	    hotspot = l[0].split("Hotspot")[1].split(" ")[1]
+	    chr = l[0].split("Hotspot")[1].split(" ")[4]
+	    sliding_IDs = l[1].split(": ")[1]
+	    int_IDs = l[2].split(": ")[1]
+	    nr_eqtl = l[3].split(" ")[1]
+	    if hotspot_nr == 0:
+		summary_file.write("All\t"+hotspot+"\t"+chr+"\t"+sliding_IDs+"\t"+int_IDs+"\t"+nr_eqtl+"\t")
+		#print "All\t"+hotspot+"\t"+chr+"\t"+sliding_IDs+"\t"+int_IDs+"\t"+nr_eqtl+"\t"
+	    if hotspot_nr != 0:
+		interesting_genes.close()
+		# run R script from temp directory --> for the previous hotspot
+		s = "R CMD BATCH %s/new_script.r out.txt" %tempdir
+		subprocess.call(s, shell=True)
+		# remove old interesting genes file
+		#s = "rm %s/interesting_genes.txt" %(tempdir)
+		subprocess.call(s, shell=True)
+		sign_GO = manage_output(hotspot_nr, tempdir, "all")
+		sGO = str(sign_GO[0])+"\t"+str(sign_GO[1])+"\t"+str(sign_GO[2])+"\t"+str(sign_GO[3])+"\t"+str(sign_GO[4])+"\t"+str(sign_GO[5])
+		percentage_1 = round(int(parent1)/float(count_eqtl)*100,1)
+		percentage_2 = round(int(parent2)/float(count_eqtl)*100,1)
+		summary_file.write(str(percentage_1)+"\t"+str(percentage_2)+"\t"+str(sGO)+"\n")
+		#print str(percentage_1)+"\t"+str(percentage_2)+"\t"+str(sGO)+"\n"
+		summary_file.write("All\t"+hotspot+"\t"+chr+"\t"+sliding_IDs+"\t"+int_IDs+"\t"+nr_eqtl+"\t")
+		#print "All\t"+hotspot+"\t"+chr+"\t"+sliding_IDs+"\t"+int_IDs+"\t"+nr_eqtl+"\t"
+		parent1 = 0
+		parent2 = 0
+		count_eqtl = 0
+	    # write a new interesting genes file
+	    hotspot_nr += 1
+	    interesting_genes = open(tempdir+"/interesting_genes.txt",'w')
+	else:
+	    if len(l) > 4:
+		# populate the new interesting genes file
+		gene_id = l[0]
+		parent = l[12]
+		# populate the new interesting genes file
+		if gene_id != "":
+		    interesting_genes.write(gene_id+"\n")
+		    count_eqtl += 1
+		    #if parent == parent_list[0]:
+		    #	parent1 += 1
+		    #if parent == parent_list[1]:
+		    #	parent2 += 1
+		    if float(parent) < 0:
+			parent1 += 1
+		    else:
+		    	parent2 += 1
+interesting_genes.close()
+# run R script from temp directory --> for the previous hotspot
+s = "R CMD BATCH %s/new_script.r out.txt" %tempdir
+subprocess.call(s, shell=True)
+# remove old interesting genes file
+#s = "rm %s/interesting_genes.txt" %(tempdir)
+subprocess.call(s, shell=True)
+sign_GO = manage_output(hotspot_nr, tempdir, "all")
+percentage_1 = round(int(parent1)/float(count_eqtl)*100,1)
+percentage_2 = round(int(parent2)/float(count_eqtl)*100,1)
+sGO = str(sign_GO[0])+"\t"+str(sign_GO[1])+"\t"+str(sign_GO[2])+"\t"+str(sign_GO[3])+"\t"+str(sign_GO[4])+"\t"+str(sign_GO[5])
+summary_file.write(str(percentage_1)+"\t"+str(percentage_2)+"\t"+str(sGO)+"\n")
+#print str(percentage_1)+"\t"+str(percentage_2)+"\t"+str(sGO)+"\n"
+parent1 = 0
+parent2 = 0
+count_eqtl = 0
+infile2.close()
+############
+filenames = []
+os.chdir(tempdir+"/")
+for files in os.listdir("."):
+	if (files.endswith(".txt") and files.startswith("hotspot")):
+	    filenames.append(files)
+filenames = sorted(filenames)
+with open(tempdir+"/"+"hotspots_all_GO_tables.txt","w") as outfile:
+	for fname in filenames:
+	    outfile.write("\n"+fname+"\n")
+	    with open(tempdir+"/"+fname) as infile:
+		for line in infile:
+		    outfile.write(line)
+#for f in filenames:
+#	os.remove(f)
+############
+tempdir2 = tempfile.mkdtemp()
+s = "zip -j %s/output.zip %s/hotspot*" %(tempdir2,tempdir)
+subprocess.call(s, shell=True)
+s = "rm %s/hotspot*" %(tempdir)
+subprocess.call(s, shell=True)
+# move the zipped temp directory to options.output
+os.system("mv %s/output.zip %s" %(tempdir2,options.output1))
+############################## cis eQTLs ##############################
+hotspot_nr = 0
+i = 0
+parent1 = 0
+parent2 = 0
+count_eqtl = 0
+infile3 = open(options.input3,'r')
+for line in infile3:
+	l = line.strip().split("\t")
+	if line.startswith("="):
+	    hotspot = l[0].split("Hotspot")[1].split(" ")[1]
+	    chr = l[0].split("Hotspot")[1].split(" ")[4]
+	    sliding_IDs = l[1].split(": ")[1]
+	    int_IDs = l[2].split(": ")[1]
+	    nr_eqtl = l[3].split(" ")[1]
+	    if hotspot_nr == 0:
+		summary_file.write("Cis\t"+hotspot+"\t"+chr+"\t"+sliding_IDs+"\t"+int_IDs+"\t"+nr_eqtl+"\t")
+		#print "Cis\t"+hotspot+"\t"+chr+"\t"+sliding_IDs+"\t"+int_IDs+"\t"+nr_eqtl+"\t"
+	    if hotspot_nr != 0:
+		interesting_genes.close()
+		# run R script from temp directory --> for the previous hotspot
+		s = "R CMD BATCH %s/new_script.r out.txt" %tempdir
+		subprocess.call(s, shell=True)
+		# remove old interesting genes file
+		#s = "rm %s/interesting_genes.txt" %(tempdir)
+		subprocess.call(s, shell=True)
+		sign_GO = manage_output(hotspot_nr, tempdir, "cis")
+		sGO = str(sign_GO[0])+"\t"+str(sign_GO[1])+"\t"+str(sign_GO[2])+"\t"+str(sign_GO[3])+"\t"+str(sign_GO[4])+"\t"+str(sign_GO[5])
+		percentage_1 = round(int(parent1)/float(count_eqtl)*100,1)
+		percentage_2 = round(int(parent2)/float(count_eqtl)*100,1)
+		summary_file.write(str(percentage_1)+"\t"+str(percentage_2)+"\t"+str(sGO)+"\n")
+		#print str(percentage_1)+"\t"+str(percentage_2)+"\t"+str(sGO)+"\n"
+		summary_file.write("Cis\t"+hotspot+"\t"+chr+"\t"+sliding_IDs+"\t"+int_IDs+"\t"+nr_eqtl+"\t")
+		#print "Cis\t"+hotspot+"\t"+chr+"\t"+sliding_IDs+"\t"+int_IDs+"\t"+nr_eqtl+"\t"
+		parent1 = 0
+		parent2 = 0
+		count_eqtl = 0
+	    # write a new interesting genes file
+	    hotspot_nr += 1
+	    interesting_genes = open(tempdir+"/interesting_genes.txt",'w')
+	else:
+	    if len(l) > 4:
+		# populate the new interesting genes file
+		gene_id = l[0]
+		parent = l[12]
+		# populate the new interesting genes file
+		if gene_id != "":
+		    interesting_genes.write(gene_id+"\n")
+		    count_eqtl += 1
+		    #if parent == parent_list[0]:
+		    #	parent1 += 1
+		    #if parent == parent_list[1]:
+		    #	parent2 += 1
+		    if float(parent) < 0:
+			parent1 += 1
+		    else:
+		    	parent2 += 1
+interesting_genes.close()
+# run R script from temp directory --> for the previous hotspot
+s = "R CMD BATCH %s/new_script.r out.txt" %tempdir
+subprocess.call(s, shell=True)
+# remove old interesting genes file
+#s = "rm %s/interesting_genes.txt" %(tempdir)
+subprocess.call(s, shell=True)
+sign_GO = manage_output(hotspot_nr, tempdir, "cis")
+percentage_1 = round(int(parent1)/float(count_eqtl)*100,1)
+percentage_2 = round(int(parent2)/float(count_eqtl)*100,1)
+sGO = str(sign_GO[0])+"\t"+str(sign_GO[1])+"\t"+str(sign_GO[2])+"\t"+str(sign_GO[3])+"\t"+str(sign_GO[4])+"\t"+str(sign_GO[5])
+summary_file.write(str(percentage_1)+"\t"+str(percentage_2)+"\t"+str(sGO)+"\n")
+#print str(percentage_1)+"\t"+str(percentage_2)+"\t"+str(sGO)+"\n"
+parent1 = 0
+parent2 = 0
+count_eqtl = 0
+infile3.close()
+############
+filenames = []
+os.chdir(tempdir+"/")
+for files in os.listdir("."):
+	if (files.endswith(".txt") and files.startswith("hotspot")):
+	    filenames.append(files)
+filenames = sorted(filenames)
+with open(tempdir+"/"+"hotspots_cis_GO_tables.txt","w") as outfile:
+	for fname in filenames:
+	    outfile.write("\n"+fname+"\n")
+	    with open(tempdir+"/"+fname) as infile:
+		for line in infile:
+		    outfile.write(line)
+#for f in filenames:
+#	os.remove(f)
+############
+tempdir2 = tempfile.mkdtemp()
+s = "zip -j %s/output.zip %s/hotspot*" %(tempdir2,tempdir)
+subprocess.call(s, shell=True)
+s = "rm %s/hotspot*" %(tempdir)
+subprocess.call(s, shell=True)
+# move the zipped temp directory to options.output
+os.system("mv %s/output.zip %s" %(tempdir2,options.output2))
+############################## trans eQTLs ##############################
+hotspot_nr = 0
+i = 0
+parent1 = 0
+parent2 = 0
+count_eqtl = 0
+infile4 = open(options.input4,'r')
+for line in infile4:
+	l = line.strip().split("\t")
+	if line.startswith("="):
+	    hotspot = l[0].split("Hotspot")[1].split(" ")[1]
+	    chr = l[0].split("Hotspot")[1].split(" ")[4]
+	    sliding_IDs = l[1].split(": ")[1]
+	    int_IDs = l[2].split(": ")[1]
+	    nr_eqtl = l[3].split(" ")[1]
+	    if hotspot_nr == 0:
+		summary_file.write("Trans\t"+hotspot+"\t"+chr+"\t"+sliding_IDs+"\t"+int_IDs+"\t"+nr_eqtl+"\t")
+		#print "Trans\t"+hotspot+"\t"+chr+"\t"+sliding_IDs+"\t"+int_IDs+"\t"+nr_eqtl+"\t"
+	    if hotspot_nr != 0:
+		interesting_genes.close()
+		# run R script from temp directory --> for the previous hotspot
+		s = "R CMD BATCH %s/new_script.r out.txt" %tempdir
+		subprocess.call(s, shell=True)
+		# remove old interesting genes file
+		#s = "rm %s/interesting_genes.txt" %(tempdir)
+		subprocess.call(s, shell=True)
+		sign_GO = manage_output(hotspot_nr, tempdir, "trans")
+		sGO = str(sign_GO[0])+"\t"+str(sign_GO[1])+"\t"+str(sign_GO[2])+"\t"+str(sign_GO[3])+"\t"+str(sign_GO[4])+"\t"+str(sign_GO[5])
+		percentage_1 = round(int(parent1)/float(count_eqtl)*100,1)
+		percentage_2 = round(int(parent2)/float(count_eqtl)*100,1)
+		summary_file.write(str(percentage_1)+"\t"+str(percentage_2)+"\t"+str(sGO)+"\n")
+		#print str(percentage_1)+"\t"+str(percentage_2)+"\t"+str(sGO)+"\n"
+		summary_file.write("Trans\t"+hotspot+"\t"+chr+"\t"+sliding_IDs+"\t"+int_IDs+"\t"+nr_eqtl+"\t")
+		#print "Trans\t"+hotspot+"\t"+chr+"\t"+sliding_IDs+"\t"+int_IDs+"\t"+nr_eqtl+"\t"
+		parent1 = 0
+		parent2 = 0
+		count_eqtl = 0
+	    # write a new interesting genes file
+	    hotspot_nr += 1
+	    interesting_genes = open(tempdir+"/interesting_genes.txt",'w')
+	else:
+	    if len(l) > 4:
+		# populate the new interesting genes file
+		gene_id = l[0]
+		parent = l[12]
+		# populate the new interesting genes file
+		if gene_id != "":
+		    interesting_genes.write(gene_id+"\n")
+		    count_eqtl += 1
+		    #if parent == parent_list[0]:
+		    #	parent1 += 1
+		    #if parent == parent_list[1]:
+		    #	parent2 += 1
+		    if float(parent) < 0:
+			parent1 += 1
+		    else:
+		    	parent2 += 1
+interesting_genes.close()
+# run R script from temp directory --> for the previous hotspot
+s = "R CMD BATCH %s/new_script.r out.txt" %tempdir
+subprocess.call(s, shell=True)
+# remove old interesting genes file
+#s = "rm %s/interesting_genes.txt" %(tempdir)
+subprocess.call(s, shell=True)
+sign_GO = manage_output(hotspot_nr, tempdir, "trans")
+percentage_1 = round(int(parent1)/float(count_eqtl)*100,1)
+percentage_2 = round(int(parent2)/float(count_eqtl)*100,1)
+sGO = str(sign_GO[0])+"\t"+str(sign_GO[1])+"\t"+str(sign_GO[2])+"\t"+str(sign_GO[3])+"\t"+str(sign_GO[4])+"\t"+str(sign_GO[5])
+summary_file.write(str(percentage_1)+"\t"+str(percentage_2)+"\t"+str(sGO)+"\n")
+#print str(percentage_1)+"\t"+str(percentage_2)+"\t"+str(sGO)+"\n"
+parent1 = 0
+parent2 = 0
+count_eqtl = 0
+infile4.close()
+############
+filenames = []
+os.chdir(tempdir+"/")
+for files in os.listdir("."):
+	if (files.endswith(".txt") and files.startswith("hotspot")):
+	    filenames.append(files)
+filenames = sorted(filenames)
+with open(tempdir+"/"+"hotspots_trans_GO_tables.txt","w") as outfile:
+	for fname in filenames:
+	    outfile.write("\n"+fname+"\n")
+	    with open(tempdir+"/"+fname) as infile:
+		for line in infile:
+		    outfile.write(line)
+#for f in filenames:
+#	os.remove(f)
+############
+tempdir2 = tempfile.mkdtemp()
+s = "zip -j %s/output.zip %s/hotspot*" %(tempdir2,tempdir)
+subprocess.call(s, shell=True)
+s = "rm %s/hotspot*" %(tempdir)
+subprocess.call(s, shell=True)
+# move the zipped temp directory to options.output
+os.system("mv %s/output.zip %s" %(tempdir2,options.output3))
+summary_file.close()
+##############################################
+if __name__=="__main__":
+__main__()

Mercurial > repos > nanettec > go_enrichment

comparison GO_enrichment/GO_enrichment.py @ 0:bde64415f03b draft