Mercurial > repos > nanettec > go_enrichment

"""
@summary: GO enrichment analysis (hotspots)
@author: nanette.coetzer@gmail.com
@version 5

"""

import optparse, sys
import subprocess
import tempfile
import os, re

def stop_err( msg ):
    sys.stderr.write( "%s\n" % msg )
    sys.exit()

def __main__():
    #Parse Command Line
    parser = optparse.OptionParser()
    parser.add_option("-n", "--rscript", default=None, dest="rscript",
                      help="R script")
    parser.add_option("-i", "--input1", default=None, dest="input1",
                      help="Gene universe (Gene positions file)")
    parser.add_option("-j", "--input2", default=None, dest="input2",
                      help="Hotspots genelists all eQTLs")
    parser.add_option("-k", "--input3", default=None, dest="input3",
                      help="Hotspots genelists cis-eQTLs")
    parser.add_option("-l", "--input4", default=None, dest="input4",
                      help="Hotspots genelists trans-eQTLs")
    parser.add_option("-m", "--input5", default=None, dest="input5",
                      help="Gene2GO mapping")
    parser.add_option("-o", "--output1", default=None, dest="output1",
                      help="GO enrichment output; all eQTLs")
    parser.add_option("-p", "--output2", default=None, dest="output2",
                      help="GO enrichment output; cis eQTLs")
    parser.add_option("-q", "--output3", default=None, dest="output3",
                      help="GO enrichment output; trans eQTLs")
    parser.add_option("-r", "--output4", default=None, dest="output4",
                      help="Hotspot summary file")

    (options, args) = parser.parse_args()

    try:
        open(options.input1, "r").close()
    except TypeError, e:
        stop_err("You need to supply the Gene Universe file:\n" + str(e))
    except IOError, e:
        stop_err("Can not open the Gene Universe file:\n" + str(e))

    try:
        open(options.input2, "r").close()
    except TypeError, e:
        stop_err("You need to supply the Genes of Interest file (all eQTLs):\n" + str(e))
    except IOError, e:
        stop_err("Can not open the Hotspots genelists file (all eQTLs):\n" + str(e))

    try:
        open(options.input3, "r").close()
    except TypeError, e:
        stop_err("You need to supply the Genes of Interest file (cis-eQTLs):\n" + str(e))
    except IOError, e:
        stop_err("Can not open the Hotspots genelists file (cis-eQTLs):\n" + str(e))

    try:
        open(options.input4, "r").close()
    except TypeError, e:
        stop_err("You need to supply the Genes of Interest file (trnas-eQTLs):\n" + str(e))
    except IOError, e:
        stop_err("Can not open the Hotspots genelists file (trans-eQTLs):\n" + str(e))

    try:
        open(options.input5, "r").close()
    except TypeError, e:
        stop_err("You need to supply the Gene2GO mapping file:\n" + str(e))
    except IOError, e:
        stop_err("Can not open the Gene2GO mapping file:\n" + str(e))

    ##########################################################

    def uniq(input):
	output = []
	for x in input:
	  if x not in output:
	    output.append(x)
	return output

    def manage_output(hotspot_nr, tempdir, type):
	sign_list = [];	p_BP = 0; adjp_BP = 0; p_MF = 0; adjp_MF = 0; p_CC = 0;	adjp_CC = 0

	bpfile = open(tempdir+"/GO_table_BP.txt","r")
	new_file = open(tempdir+"/hotspot"+str(hotspot_nr)+"_"+str(type)+"_GO_table.txt","w")
	i = 0
	stars = 0
	for line in bpfile:
	    i += 1
	    if i > 1:
		l = line.strip().split("\t")
		new_file.write("\t".join(l[1:])+"\n")
		if l[9] == "\"*\"":
		    p_BP += 1
		if l[10] == "\"*\"":
		    adjp_BP += 1
	    else:
		l = line.strip().split("\t")
		l[0] = "BP_GO.ID"
		newl = "\t".join(l)
		new_file.write(newl+"\n")
	bpfile.close()
	stars = 5
	name = "tGO_BP_classic_"+str(stars)+"_all.pdf"
	full_name = tempdir+"/"+name
	new_name = tempdir+"/hotspot"+str(hotspot_nr)+"_"+str(type)+"_"+name
	s = "mv %s %s" %(full_name, new_name)
	subprocess.call(s, shell=True)
	sign_list.append(p_BP)
	sign_list.append(adjp_BP)

	mffile = open(tempdir+"/GO_table_MF.txt","r")
	i = 0
	stars = 0
	for line in mffile:
	    i += 1
	    if i > 1:
		l = line.strip().split("\t")
		new_file.write("\t".join(l[1:])+"\n")
		if l[9] == "\"*\"":
		    p_MF += 1
		if l[10] == "\"*\"":
		    adjp_MF += 1
	    else:
		l = line.strip().split("\t")
		l[0] = "MF_GO.ID"
		newl = "\t".join(l)
		new_file.write("\n"+newl+"\n")
	mffile.close()
	stars = 5
    	name = "tGO_MF_classic_"+str(stars)+"_all.pdf"
	full_name = tempdir+"/"+name
	new_name = tempdir+"/hotspot"+str(hotspot_nr)+"_"+str(type)+"_"+name
	s = "mv %s %s" %(full_name, new_name)
	subprocess.call(s, shell=True)
	sign_list.append(p_MF)
	sign_list.append(adjp_MF)

	ccfile = open(tempdir+"/GO_table_CC.txt","r")
	i = 0
	stars = 0
	for line in ccfile:
	    i += 1
	    if i > 1:
		l = line.strip().split("\t")
		new_file.write("\t".join(l[1:])+"\n")
		if l[9] == "\"*\"":
		    p_CC += 1
		if l[10] == "\"*\"":
		    adjp_CC += 1
	    else:
		l = line.strip().split("\t")
		l[0] = "CC_GO.ID"
		newl = "\t".join(l)
		new_file.write("\n"+newl+"\n")
	ccfile.close()
	stars = 5
	name = "tGO_CC_classic_"+str(stars)+"_all.pdf"
	full_name = tempdir+"/"+name
	new_name = tempdir+"/hotspot"+str(hotspot_nr)+"_"+str(type)+"_"+name
	s = "mv %s %s" %(full_name, new_name)
	subprocess.call(s, shell=True)
	new_file.close()
	sign_list.append(p_CC)
	sign_list.append(adjp_CC)
	return sign_list

    # Create temp direcotry
    tempdir = tempfile.mkdtemp()
    #print tempdir
    #fixdir = "/lustre/galchew/MeQTL/eQTL_pipeline"
    #fixdir = "/Users/nanettecoetzer/Documents/Bioinformatics/MAIZE_project/Hotspots_eQTLs/enrichment_analysis/TopGO/add_to_hotspots_pipeline"

    # copy INPUT file to the temp directory
    s = "cp %s %s/gene_universe.txt" %(options.input1, tempdir)
    subprocess.call(s, shell=True)
    s = "cp %s %s/gene2GO_mapping.map" %(options.input5, tempdir)
    subprocess.call(s, shell=True)

    # create R script => save in temp directory
    # generate new header
    new_script = open(tempdir+"/new_script.r","w")
    header = "setwd(\"%s\")" %tempdir
    new_script.write(header+"\n")
    # add script body
    script = open(options.rscript,"r")
    #script = open(fixdir+"/TopGO_pipeline_new.txt","r")
    for line in script:
        new_script.write(line.strip()+"\n")
    new_script.close()

    # get 2 parents
    #infile = open(options.input2,'r')
    #parents = []
    #for line in infile:
    #	l = line.strip().split("\t")
    #	if not line.startswith("=") and len(l) > 4:
    #	    parents.append(l[12])
    #parent_list = uniq(parents)
    #infile.close()

    summary_file = open(options.output4,'w')
    summary_file.write("Type\tNumber\tChr\tSliding_IDs\tInterval_IDs\tNum_eQTLs\tParent_neg_additive\tParent_pos_additive\tBP_pVal<0.01\tBP_adj_pVal<0.05\tMF_pVal<0.01\tMF_adj_pVal<0.05\tCC_pVal<0.01\tCC_adj_pVal<0.05\n")

    ############################## all eQTLs ##############################
    hotspot_nr = 0
    i = 0
    parent1 = 0 # Parent_neg_additive
    parent2 = 0 # Parent_pos_additive
    count_eqtl = 0
    infile2 = open(options.input2,'r')
    for line in infile2:
	l = line.strip().split("\t")
	if line.startswith("="):
	    hotspot = l[0].split("Hotspot")[1].split(" ")[1]
	    chr = l[0].split("Hotspot")[1].split(" ")[4]
	    sliding_IDs = l[1].split(": ")[1]
	    int_IDs = l[2].split(": ")[1]
	    nr_eqtl = l[3].split(" ")[1]
	    if hotspot_nr == 0:
		summary_file.write("All\t"+hotspot+"\t"+chr+"\t"+sliding_IDs+"\t"+int_IDs+"\t"+nr_eqtl+"\t")
		#print "All\t"+hotspot+"\t"+chr+"\t"+sliding_IDs+"\t"+int_IDs+"\t"+nr_eqtl+"\t"
	    if hotspot_nr != 0:
		interesting_genes.close()
		# run R script from temp directory --> for the previous hotspot
		s = "R CMD BATCH %s/new_script.r out.txt" %tempdir
		subprocess.call(s, shell=True)
		# remove old interesting genes file
		#s = "rm %s/interesting_genes.txt" %(tempdir)
		subprocess.call(s, shell=True)
		sign_GO = manage_output(hotspot_nr, tempdir, "all")
		sGO = str(sign_GO[0])+"\t"+str(sign_GO[1])+"\t"+str(sign_GO[2])+"\t"+str(sign_GO[3])+"\t"+str(sign_GO[4])+"\t"+str(sign_GO[5])
		percentage_1 = round(int(parent1)/float(count_eqtl)*100,1)
		percentage_2 = round(int(parent2)/float(count_eqtl)*100,1)
		summary_file.write(str(percentage_1)+"\t"+str(percentage_2)+"\t"+str(sGO)+"\n")
		#print str(percentage_1)+"\t"+str(percentage_2)+"\t"+str(sGO)+"\n"
		summary_file.write("All\t"+hotspot+"\t"+chr+"\t"+sliding_IDs+"\t"+int_IDs+"\t"+nr_eqtl+"\t")
		#print "All\t"+hotspot+"\t"+chr+"\t"+sliding_IDs+"\t"+int_IDs+"\t"+nr_eqtl+"\t"
		parent1 = 0
		parent2 = 0
		count_eqtl = 0
	    # write a new interesting genes file
	    hotspot_nr += 1
	    interesting_genes = open(tempdir+"/interesting_genes.txt",'w')
	else:
	    if len(l) > 4:
		# populate the new interesting genes file
		gene_id = l[0]
		parent = l[12]
		# populate the new interesting genes file
		if gene_id != "":
		    interesting_genes.write(gene_id+"\n")
		    count_eqtl += 1
		    #if parent == parent_list[0]:
		    #	parent1 += 1
		    #if parent == parent_list[1]:
		    #	parent2 += 1
		    if float(parent) < 0:
			parent1 += 1
		    else:
		    	parent2 += 1
    interesting_genes.close()
    # run R script from temp directory --> for the previous hotspot
    s = "R CMD BATCH %s/new_script.r out.txt" %tempdir
    subprocess.call(s, shell=True)
    # remove old interesting genes file
    #s = "rm %s/interesting_genes.txt" %(tempdir)
    subprocess.call(s, shell=True)
    sign_GO = manage_output(hotspot_nr, tempdir, "all")
    percentage_1 = round(int(parent1)/float(count_eqtl)*100,1)
    percentage_2 = round(int(parent2)/float(count_eqtl)*100,1)
    sGO = str(sign_GO[0])+"\t"+str(sign_GO[1])+"\t"+str(sign_GO[2])+"\t"+str(sign_GO[3])+"\t"+str(sign_GO[4])+"\t"+str(sign_GO[5])
    summary_file.write(str(percentage_1)+"\t"+str(percentage_2)+"\t"+str(sGO)+"\n")
    #print str(percentage_1)+"\t"+str(percentage_2)+"\t"+str(sGO)+"\n"
    parent1 = 0
    parent2 = 0
    count_eqtl = 0
    infile2.close()

    ############
    filenames = []
    os.chdir(tempdir+"/")
    for files in os.listdir("."):
	if (files.endswith(".txt") and files.startswith("hotspot")):
	    filenames.append(files)
    filenames = sorted(filenames)

    with open(tempdir+"/"+"hotspots_all_GO_tables.txt","w") as outfile:
	for fname in filenames:
	    outfile.write("\n"+fname+"\n")
	    with open(tempdir+"/"+fname) as infile:
		for line in infile:
		    outfile.write(line)

    #for f in filenames:
    #	os.remove(f)

    ############

    tempdir2 = tempfile.mkdtemp()
    s = "zip -j %s/output.zip %s/hotspot*" %(tempdir2,tempdir)
    subprocess.call(s, shell=True)
    s = "rm %s/hotspot*" %(tempdir)
    subprocess.call(s, shell=True)

    # move the zipped temp directory to options.output
    os.system("mv %s/output.zip %s" %(tempdir2,options.output1))


    ############################## cis eQTLs ##############################
    hotspot_nr = 0
    i = 0
    parent1 = 0
    parent2 = 0
    count_eqtl = 0
    infile3 = open(options.input3,'r')
    for line in infile3:
	l = line.strip().split("\t")
	if line.startswith("="):
	    hotspot = l[0].split("Hotspot")[1].split(" ")[1]
	    chr = l[0].split("Hotspot")[1].split(" ")[4]
	    sliding_IDs = l[1].split(": ")[1]
	    int_IDs = l[2].split(": ")[1]
	    nr_eqtl = l[3].split(" ")[1]
	    if hotspot_nr == 0:
		summary_file.write("Cis\t"+hotspot+"\t"+chr+"\t"+sliding_IDs+"\t"+int_IDs+"\t"+nr_eqtl+"\t")
		#print "Cis\t"+hotspot+"\t"+chr+"\t"+sliding_IDs+"\t"+int_IDs+"\t"+nr_eqtl+"\t"
	    if hotspot_nr != 0:
		interesting_genes.close()
		# run R script from temp directory --> for the previous hotspot
		s = "R CMD BATCH %s/new_script.r out.txt" %tempdir
		subprocess.call(s, shell=True)
		# remove old interesting genes file
		#s = "rm %s/interesting_genes.txt" %(tempdir)
		subprocess.call(s, shell=True)
		sign_GO = manage_output(hotspot_nr, tempdir, "cis")
		sGO = str(sign_GO[0])+"\t"+str(sign_GO[1])+"\t"+str(sign_GO[2])+"\t"+str(sign_GO[3])+"\t"+str(sign_GO[4])+"\t"+str(sign_GO[5])
		percentage_1 = round(int(parent1)/float(count_eqtl)*100,1)
		percentage_2 = round(int(parent2)/float(count_eqtl)*100,1)
		summary_file.write(str(percentage_1)+"\t"+str(percentage_2)+"\t"+str(sGO)+"\n")
		#print str(percentage_1)+"\t"+str(percentage_2)+"\t"+str(sGO)+"\n"
		summary_file.write("Cis\t"+hotspot+"\t"+chr+"\t"+sliding_IDs+"\t"+int_IDs+"\t"+nr_eqtl+"\t")
		#print "Cis\t"+hotspot+"\t"+chr+"\t"+sliding_IDs+"\t"+int_IDs+"\t"+nr_eqtl+"\t"
		parent1 = 0
		parent2 = 0
		count_eqtl = 0
	    # write a new interesting genes file
	    hotspot_nr += 1
	    interesting_genes = open(tempdir+"/interesting_genes.txt",'w')
	else:
	    if len(l) > 4:
		# populate the new interesting genes file
		gene_id = l[0]
		parent = l[12]
		# populate the new interesting genes file
		if gene_id != "":
		    interesting_genes.write(gene_id+"\n")
		    count_eqtl += 1
		    #if parent == parent_list[0]:
		    #	parent1 += 1
		    #if parent == parent_list[1]:
		    #	parent2 += 1
		    if float(parent) < 0:
			parent1 += 1
		    else:
		    	parent2 += 1
    interesting_genes.close()
    # run R script from temp directory --> for the previous hotspot
    s = "R CMD BATCH %s/new_script.r out.txt" %tempdir
    subprocess.call(s, shell=True)
    # remove old interesting genes file
    #s = "rm %s/interesting_genes.txt" %(tempdir)
    subprocess.call(s, shell=True)
    sign_GO = manage_output(hotspot_nr, tempdir, "cis")
    percentage_1 = round(int(parent1)/float(count_eqtl)*100,1)
    percentage_2 = round(int(parent2)/float(count_eqtl)*100,1)
    sGO = str(sign_GO[0])+"\t"+str(sign_GO[1])+"\t"+str(sign_GO[2])+"\t"+str(sign_GO[3])+"\t"+str(sign_GO[4])+"\t"+str(sign_GO[5])
    summary_file.write(str(percentage_1)+"\t"+str(percentage_2)+"\t"+str(sGO)+"\n")
    #print str(percentage_1)+"\t"+str(percentage_2)+"\t"+str(sGO)+"\n"
    parent1 = 0
    parent2 = 0
    count_eqtl = 0
    infile3.close()

    ############
    filenames = []
    os.chdir(tempdir+"/")
    for files in os.listdir("."):
	if (files.endswith(".txt") and files.startswith("hotspot")):
	    filenames.append(files)
    filenames = sorted(filenames)

    with open(tempdir+"/"+"hotspots_cis_GO_tables.txt","w") as outfile:
	for fname in filenames:
	    outfile.write("\n"+fname+"\n")
	    with open(tempdir+"/"+fname) as infile:
		for line in infile:
		    outfile.write(line)

    #for f in filenames:
    #	os.remove(f)

    ############

    tempdir2 = tempfile.mkdtemp()
    s = "zip -j %s/output.zip %s/hotspot*" %(tempdir2,tempdir)
    subprocess.call(s, shell=True)
    s = "rm %s/hotspot*" %(tempdir)
    subprocess.call(s, shell=True)

    # move the zipped temp directory to options.output
    os.system("mv %s/output.zip %s" %(tempdir2,options.output2))


    ############################## trans eQTLs ##############################
    hotspot_nr = 0
    i = 0
    parent1 = 0
    parent2 = 0
    count_eqtl = 0
    infile4 = open(options.input4,'r')
    for line in infile4:
	l = line.strip().split("\t")
	if line.startswith("="):
	    hotspot = l[0].split("Hotspot")[1].split(" ")[1]
	    chr = l[0].split("Hotspot")[1].split(" ")[4]
	    sliding_IDs = l[1].split(": ")[1]
	    int_IDs = l[2].split(": ")[1]
	    nr_eqtl = l[3].split(" ")[1]
	    if hotspot_nr == 0:
		summary_file.write("Trans\t"+hotspot+"\t"+chr+"\t"+sliding_IDs+"\t"+int_IDs+"\t"+nr_eqtl+"\t")
		#print "Trans\t"+hotspot+"\t"+chr+"\t"+sliding_IDs+"\t"+int_IDs+"\t"+nr_eqtl+"\t"
	    if hotspot_nr != 0:
		interesting_genes.close()
		# run R script from temp directory --> for the previous hotspot
		s = "R CMD BATCH %s/new_script.r out.txt" %tempdir
		subprocess.call(s, shell=True)
		# remove old interesting genes file
		#s = "rm %s/interesting_genes.txt" %(tempdir)
		subprocess.call(s, shell=True)
		sign_GO = manage_output(hotspot_nr, tempdir, "trans")
		sGO = str(sign_GO[0])+"\t"+str(sign_GO[1])+"\t"+str(sign_GO[2])+"\t"+str(sign_GO[3])+"\t"+str(sign_GO[4])+"\t"+str(sign_GO[5])
		percentage_1 = round(int(parent1)/float(count_eqtl)*100,1)
		percentage_2 = round(int(parent2)/float(count_eqtl)*100,1)
		summary_file.write(str(percentage_1)+"\t"+str(percentage_2)+"\t"+str(sGO)+"\n")
		#print str(percentage_1)+"\t"+str(percentage_2)+"\t"+str(sGO)+"\n"
		summary_file.write("Trans\t"+hotspot+"\t"+chr+"\t"+sliding_IDs+"\t"+int_IDs+"\t"+nr_eqtl+"\t")
		#print "Trans\t"+hotspot+"\t"+chr+"\t"+sliding_IDs+"\t"+int_IDs+"\t"+nr_eqtl+"\t"
		parent1 = 0
		parent2 = 0
		count_eqtl = 0
	    # write a new interesting genes file
	    hotspot_nr += 1
	    interesting_genes = open(tempdir+"/interesting_genes.txt",'w')
	else:
	    if len(l) > 4:
		# populate the new interesting genes file
		gene_id = l[0]
		parent = l[12]
		# populate the new interesting genes file
		if gene_id != "":
		    interesting_genes.write(gene_id+"\n")
		    count_eqtl += 1
		    #if parent == parent_list[0]:
		    #	parent1 += 1
		    #if parent == parent_list[1]:
		    #	parent2 += 1
		    if float(parent) < 0:
			parent1 += 1
		    else:
		    	parent2 += 1
    interesting_genes.close()
    # run R script from temp directory --> for the previous hotspot
    s = "R CMD BATCH %s/new_script.r out.txt" %tempdir
    subprocess.call(s, shell=True)
    # remove old interesting genes file
    #s = "rm %s/interesting_genes.txt" %(tempdir)
    subprocess.call(s, shell=True)
    sign_GO = manage_output(hotspot_nr, tempdir, "trans")
    percentage_1 = round(int(parent1)/float(count_eqtl)*100,1)
    percentage_2 = round(int(parent2)/float(count_eqtl)*100,1)
    sGO = str(sign_GO[0])+"\t"+str(sign_GO[1])+"\t"+str(sign_GO[2])+"\t"+str(sign_GO[3])+"\t"+str(sign_GO[4])+"\t"+str(sign_GO[5])
    summary_file.write(str(percentage_1)+"\t"+str(percentage_2)+"\t"+str(sGO)+"\n")
    #print str(percentage_1)+"\t"+str(percentage_2)+"\t"+str(sGO)+"\n"
    parent1 = 0
    parent2 = 0
    count_eqtl = 0
    infile4.close()

    ############
    filenames = []
    os.chdir(tempdir+"/")
    for files in os.listdir("."):
	if (files.endswith(".txt") and files.startswith("hotspot")):
	    filenames.append(files)
    filenames = sorted(filenames)

    with open(tempdir+"/"+"hotspots_trans_GO_tables.txt","w") as outfile:
	for fname in filenames:
	    outfile.write("\n"+fname+"\n")
	    with open(tempdir+"/"+fname) as infile:
		for line in infile:
		    outfile.write(line)

    #for f in filenames:
    #	os.remove(f)

    ############

    tempdir2 = tempfile.mkdtemp()
    s = "zip -j %s/output.zip %s/hotspot*" %(tempdir2,tempdir)
    subprocess.call(s, shell=True)
    s = "rm %s/hotspot*" %(tempdir)
    subprocess.call(s, shell=True)

    # move the zipped temp directory to options.output
    os.system("mv %s/output.zip %s" %(tempdir2,options.output3))
    summary_file.close()

    ##############################################

if __name__=="__main__":
    __main__()
author	nanettec
date	Wed, 16 Mar 2016 10:16:23 -0400
parents	bde64415f03b
children