Mercurial > repos > nanettec > go_enrichment
diff GO_enrichment/GO_enrichment.py @ 0:bde64415f03b draft
Uploaded
author | nanettec |
---|---|
date | Wed, 16 Mar 2016 10:15:05 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/GO_enrichment/GO_enrichment.py Wed Mar 16 10:15:05 2016 -0400 @@ -0,0 +1,529 @@ +""" +@summary: GO enrichment analysis (hotspots) +@author: nanette.coetzer@gmail.com +@version 5 + +""" + +import optparse, sys +import subprocess +import tempfile +import os, re + +def stop_err( msg ): + sys.stderr.write( "%s\n" % msg ) + sys.exit() + +def __main__(): + #Parse Command Line + parser = optparse.OptionParser() + parser.add_option("-n", "--rscript", default=None, dest="rscript", + help="R script") + parser.add_option("-i", "--input1", default=None, dest="input1", + help="Gene universe (Gene positions file)") + parser.add_option("-j", "--input2", default=None, dest="input2", + help="Hotspots genelists all eQTLs") + parser.add_option("-k", "--input3", default=None, dest="input3", + help="Hotspots genelists cis-eQTLs") + parser.add_option("-l", "--input4", default=None, dest="input4", + help="Hotspots genelists trans-eQTLs") + parser.add_option("-m", "--input5", default=None, dest="input5", + help="Gene2GO mapping") + parser.add_option("-o", "--output1", default=None, dest="output1", + help="GO enrichment output; all eQTLs") + parser.add_option("-p", "--output2", default=None, dest="output2", + help="GO enrichment output; cis eQTLs") + parser.add_option("-q", "--output3", default=None, dest="output3", + help="GO enrichment output; trans eQTLs") + parser.add_option("-r", "--output4", default=None, dest="output4", + help="Hotspot summary file") + + (options, args) = parser.parse_args() + + try: + open(options.input1, "r").close() + except TypeError, e: + stop_err("You need to supply the Gene Universe file:\n" + str(e)) + except IOError, e: + stop_err("Can not open the Gene Universe file:\n" + str(e)) + + try: + open(options.input2, "r").close() + except TypeError, e: + stop_err("You need to supply the Genes of Interest file (all eQTLs):\n" + str(e)) + except IOError, e: + stop_err("Can not open the Hotspots genelists file (all eQTLs):\n" + str(e)) + + try: + open(options.input3, "r").close() + except TypeError, e: + stop_err("You need to supply the Genes of Interest file (cis-eQTLs):\n" + str(e)) + except IOError, e: + stop_err("Can not open the Hotspots genelists file (cis-eQTLs):\n" + str(e)) + + try: + open(options.input4, "r").close() + except TypeError, e: + stop_err("You need to supply the Genes of Interest file (trnas-eQTLs):\n" + str(e)) + except IOError, e: + stop_err("Can not open the Hotspots genelists file (trans-eQTLs):\n" + str(e)) + + try: + open(options.input5, "r").close() + except TypeError, e: + stop_err("You need to supply the Gene2GO mapping file:\n" + str(e)) + except IOError, e: + stop_err("Can not open the Gene2GO mapping file:\n" + str(e)) + + ########################################################## + + def uniq(input): + output = [] + for x in input: + if x not in output: + output.append(x) + return output + + def manage_output(hotspot_nr, tempdir, type): + sign_list = []; p_BP = 0; adjp_BP = 0; p_MF = 0; adjp_MF = 0; p_CC = 0; adjp_CC = 0 + + bpfile = open(tempdir+"/GO_table_BP.txt","r") + new_file = open(tempdir+"/hotspot"+str(hotspot_nr)+"_"+str(type)+"_GO_table.txt","w") + i = 0 + stars = 0 + for line in bpfile: + i += 1 + if i > 1: + l = line.strip().split("\t") + new_file.write("\t".join(l[1:])+"\n") + if l[9] == "\"*\"": + p_BP += 1 + if l[10] == "\"*\"": + adjp_BP += 1 + else: + l = line.strip().split("\t") + l[0] = "BP_GO.ID" + newl = "\t".join(l) + new_file.write(newl+"\n") + bpfile.close() + stars = 5 + name = "tGO_BP_classic_"+str(stars)+"_all.pdf" + full_name = tempdir+"/"+name + new_name = tempdir+"/hotspot"+str(hotspot_nr)+"_"+str(type)+"_"+name + s = "mv %s %s" %(full_name, new_name) + subprocess.call(s, shell=True) + sign_list.append(p_BP) + sign_list.append(adjp_BP) + + mffile = open(tempdir+"/GO_table_MF.txt","r") + i = 0 + stars = 0 + for line in mffile: + i += 1 + if i > 1: + l = line.strip().split("\t") + new_file.write("\t".join(l[1:])+"\n") + if l[9] == "\"*\"": + p_MF += 1 + if l[10] == "\"*\"": + adjp_MF += 1 + else: + l = line.strip().split("\t") + l[0] = "MF_GO.ID" + newl = "\t".join(l) + new_file.write("\n"+newl+"\n") + mffile.close() + stars = 5 + name = "tGO_MF_classic_"+str(stars)+"_all.pdf" + full_name = tempdir+"/"+name + new_name = tempdir+"/hotspot"+str(hotspot_nr)+"_"+str(type)+"_"+name + s = "mv %s %s" %(full_name, new_name) + subprocess.call(s, shell=True) + sign_list.append(p_MF) + sign_list.append(adjp_MF) + + ccfile = open(tempdir+"/GO_table_CC.txt","r") + i = 0 + stars = 0 + for line in ccfile: + i += 1 + if i > 1: + l = line.strip().split("\t") + new_file.write("\t".join(l[1:])+"\n") + if l[9] == "\"*\"": + p_CC += 1 + if l[10] == "\"*\"": + adjp_CC += 1 + else: + l = line.strip().split("\t") + l[0] = "CC_GO.ID" + newl = "\t".join(l) + new_file.write("\n"+newl+"\n") + ccfile.close() + stars = 5 + name = "tGO_CC_classic_"+str(stars)+"_all.pdf" + full_name = tempdir+"/"+name + new_name = tempdir+"/hotspot"+str(hotspot_nr)+"_"+str(type)+"_"+name + s = "mv %s %s" %(full_name, new_name) + subprocess.call(s, shell=True) + new_file.close() + sign_list.append(p_CC) + sign_list.append(adjp_CC) + return sign_list + + # Create temp direcotry + tempdir = tempfile.mkdtemp() + #print tempdir + #fixdir = "/lustre/galchew/MeQTL/eQTL_pipeline" + #fixdir = "/Users/nanettecoetzer/Documents/Bioinformatics/MAIZE_project/Hotspots_eQTLs/enrichment_analysis/TopGO/add_to_hotspots_pipeline" + + # copy INPUT file to the temp directory + s = "cp %s %s/gene_universe.txt" %(options.input1, tempdir) + subprocess.call(s, shell=True) + s = "cp %s %s/gene2GO_mapping.map" %(options.input5, tempdir) + subprocess.call(s, shell=True) + + # create R script => save in temp directory + # generate new header + new_script = open(tempdir+"/new_script.r","w") + header = "setwd(\"%s\")" %tempdir + new_script.write(header+"\n") + # add script body + script = open(options.rscript,"r") + #script = open(fixdir+"/TopGO_pipeline_new.txt","r") + for line in script: + new_script.write(line.strip()+"\n") + new_script.close() + + # get 2 parents + #infile = open(options.input2,'r') + #parents = [] + #for line in infile: + # l = line.strip().split("\t") + # if not line.startswith("=") and len(l) > 4: + # parents.append(l[12]) + #parent_list = uniq(parents) + #infile.close() + + summary_file = open(options.output4,'w') + summary_file.write("Type\tNumber\tChr\tSliding_IDs\tInterval_IDs\tNum_eQTLs\tParent_neg_additive\tParent_pos_additive\tBP_pVal<0.01\tBP_adj_pVal<0.05\tMF_pVal<0.01\tMF_adj_pVal<0.05\tCC_pVal<0.01\tCC_adj_pVal<0.05\n") + + ############################## all eQTLs ############################## + hotspot_nr = 0 + i = 0 + parent1 = 0 # Parent_neg_additive + parent2 = 0 # Parent_pos_additive + count_eqtl = 0 + infile2 = open(options.input2,'r') + for line in infile2: + l = line.strip().split("\t") + if line.startswith("="): + hotspot = l[0].split("Hotspot")[1].split(" ")[1] + chr = l[0].split("Hotspot")[1].split(" ")[4] + sliding_IDs = l[1].split(": ")[1] + int_IDs = l[2].split(": ")[1] + nr_eqtl = l[3].split(" ")[1] + if hotspot_nr == 0: + summary_file.write("All\t"+hotspot+"\t"+chr+"\t"+sliding_IDs+"\t"+int_IDs+"\t"+nr_eqtl+"\t") + #print "All\t"+hotspot+"\t"+chr+"\t"+sliding_IDs+"\t"+int_IDs+"\t"+nr_eqtl+"\t" + if hotspot_nr != 0: + interesting_genes.close() + # run R script from temp directory --> for the previous hotspot + s = "R CMD BATCH %s/new_script.r out.txt" %tempdir + subprocess.call(s, shell=True) + # remove old interesting genes file + #s = "rm %s/interesting_genes.txt" %(tempdir) + subprocess.call(s, shell=True) + sign_GO = manage_output(hotspot_nr, tempdir, "all") + sGO = str(sign_GO[0])+"\t"+str(sign_GO[1])+"\t"+str(sign_GO[2])+"\t"+str(sign_GO[3])+"\t"+str(sign_GO[4])+"\t"+str(sign_GO[5]) + percentage_1 = round(int(parent1)/float(count_eqtl)*100,1) + percentage_2 = round(int(parent2)/float(count_eqtl)*100,1) + summary_file.write(str(percentage_1)+"\t"+str(percentage_2)+"\t"+str(sGO)+"\n") + #print str(percentage_1)+"\t"+str(percentage_2)+"\t"+str(sGO)+"\n" + summary_file.write("All\t"+hotspot+"\t"+chr+"\t"+sliding_IDs+"\t"+int_IDs+"\t"+nr_eqtl+"\t") + #print "All\t"+hotspot+"\t"+chr+"\t"+sliding_IDs+"\t"+int_IDs+"\t"+nr_eqtl+"\t" + parent1 = 0 + parent2 = 0 + count_eqtl = 0 + # write a new interesting genes file + hotspot_nr += 1 + interesting_genes = open(tempdir+"/interesting_genes.txt",'w') + else: + if len(l) > 4: + # populate the new interesting genes file + gene_id = l[0] + parent = l[12] + # populate the new interesting genes file + if gene_id != "": + interesting_genes.write(gene_id+"\n") + count_eqtl += 1 + #if parent == parent_list[0]: + # parent1 += 1 + #if parent == parent_list[1]: + # parent2 += 1 + if float(parent) < 0: + parent1 += 1 + else: + parent2 += 1 + interesting_genes.close() + # run R script from temp directory --> for the previous hotspot + s = "R CMD BATCH %s/new_script.r out.txt" %tempdir + subprocess.call(s, shell=True) + # remove old interesting genes file + #s = "rm %s/interesting_genes.txt" %(tempdir) + subprocess.call(s, shell=True) + sign_GO = manage_output(hotspot_nr, tempdir, "all") + percentage_1 = round(int(parent1)/float(count_eqtl)*100,1) + percentage_2 = round(int(parent2)/float(count_eqtl)*100,1) + sGO = str(sign_GO[0])+"\t"+str(sign_GO[1])+"\t"+str(sign_GO[2])+"\t"+str(sign_GO[3])+"\t"+str(sign_GO[4])+"\t"+str(sign_GO[5]) + summary_file.write(str(percentage_1)+"\t"+str(percentage_2)+"\t"+str(sGO)+"\n") + #print str(percentage_1)+"\t"+str(percentage_2)+"\t"+str(sGO)+"\n" + parent1 = 0 + parent2 = 0 + count_eqtl = 0 + infile2.close() + + ############ + filenames = [] + os.chdir(tempdir+"/") + for files in os.listdir("."): + if (files.endswith(".txt") and files.startswith("hotspot")): + filenames.append(files) + filenames = sorted(filenames) + + with open(tempdir+"/"+"hotspots_all_GO_tables.txt","w") as outfile: + for fname in filenames: + outfile.write("\n"+fname+"\n") + with open(tempdir+"/"+fname) as infile: + for line in infile: + outfile.write(line) + + #for f in filenames: + # os.remove(f) + + ############ + + tempdir2 = tempfile.mkdtemp() + s = "zip -j %s/output.zip %s/hotspot*" %(tempdir2,tempdir) + subprocess.call(s, shell=True) + s = "rm %s/hotspot*" %(tempdir) + subprocess.call(s, shell=True) + + # move the zipped temp directory to options.output + os.system("mv %s/output.zip %s" %(tempdir2,options.output1)) + + + ############################## cis eQTLs ############################## + hotspot_nr = 0 + i = 0 + parent1 = 0 + parent2 = 0 + count_eqtl = 0 + infile3 = open(options.input3,'r') + for line in infile3: + l = line.strip().split("\t") + if line.startswith("="): + hotspot = l[0].split("Hotspot")[1].split(" ")[1] + chr = l[0].split("Hotspot")[1].split(" ")[4] + sliding_IDs = l[1].split(": ")[1] + int_IDs = l[2].split(": ")[1] + nr_eqtl = l[3].split(" ")[1] + if hotspot_nr == 0: + summary_file.write("Cis\t"+hotspot+"\t"+chr+"\t"+sliding_IDs+"\t"+int_IDs+"\t"+nr_eqtl+"\t") + #print "Cis\t"+hotspot+"\t"+chr+"\t"+sliding_IDs+"\t"+int_IDs+"\t"+nr_eqtl+"\t" + if hotspot_nr != 0: + interesting_genes.close() + # run R script from temp directory --> for the previous hotspot + s = "R CMD BATCH %s/new_script.r out.txt" %tempdir + subprocess.call(s, shell=True) + # remove old interesting genes file + #s = "rm %s/interesting_genes.txt" %(tempdir) + subprocess.call(s, shell=True) + sign_GO = manage_output(hotspot_nr, tempdir, "cis") + sGO = str(sign_GO[0])+"\t"+str(sign_GO[1])+"\t"+str(sign_GO[2])+"\t"+str(sign_GO[3])+"\t"+str(sign_GO[4])+"\t"+str(sign_GO[5]) + percentage_1 = round(int(parent1)/float(count_eqtl)*100,1) + percentage_2 = round(int(parent2)/float(count_eqtl)*100,1) + summary_file.write(str(percentage_1)+"\t"+str(percentage_2)+"\t"+str(sGO)+"\n") + #print str(percentage_1)+"\t"+str(percentage_2)+"\t"+str(sGO)+"\n" + summary_file.write("Cis\t"+hotspot+"\t"+chr+"\t"+sliding_IDs+"\t"+int_IDs+"\t"+nr_eqtl+"\t") + #print "Cis\t"+hotspot+"\t"+chr+"\t"+sliding_IDs+"\t"+int_IDs+"\t"+nr_eqtl+"\t" + parent1 = 0 + parent2 = 0 + count_eqtl = 0 + # write a new interesting genes file + hotspot_nr += 1 + interesting_genes = open(tempdir+"/interesting_genes.txt",'w') + else: + if len(l) > 4: + # populate the new interesting genes file + gene_id = l[0] + parent = l[12] + # populate the new interesting genes file + if gene_id != "": + interesting_genes.write(gene_id+"\n") + count_eqtl += 1 + #if parent == parent_list[0]: + # parent1 += 1 + #if parent == parent_list[1]: + # parent2 += 1 + if float(parent) < 0: + parent1 += 1 + else: + parent2 += 1 + interesting_genes.close() + # run R script from temp directory --> for the previous hotspot + s = "R CMD BATCH %s/new_script.r out.txt" %tempdir + subprocess.call(s, shell=True) + # remove old interesting genes file + #s = "rm %s/interesting_genes.txt" %(tempdir) + subprocess.call(s, shell=True) + sign_GO = manage_output(hotspot_nr, tempdir, "cis") + percentage_1 = round(int(parent1)/float(count_eqtl)*100,1) + percentage_2 = round(int(parent2)/float(count_eqtl)*100,1) + sGO = str(sign_GO[0])+"\t"+str(sign_GO[1])+"\t"+str(sign_GO[2])+"\t"+str(sign_GO[3])+"\t"+str(sign_GO[4])+"\t"+str(sign_GO[5]) + summary_file.write(str(percentage_1)+"\t"+str(percentage_2)+"\t"+str(sGO)+"\n") + #print str(percentage_1)+"\t"+str(percentage_2)+"\t"+str(sGO)+"\n" + parent1 = 0 + parent2 = 0 + count_eqtl = 0 + infile3.close() + + ############ + filenames = [] + os.chdir(tempdir+"/") + for files in os.listdir("."): + if (files.endswith(".txt") and files.startswith("hotspot")): + filenames.append(files) + filenames = sorted(filenames) + + with open(tempdir+"/"+"hotspots_cis_GO_tables.txt","w") as outfile: + for fname in filenames: + outfile.write("\n"+fname+"\n") + with open(tempdir+"/"+fname) as infile: + for line in infile: + outfile.write(line) + + #for f in filenames: + # os.remove(f) + + ############ + + tempdir2 = tempfile.mkdtemp() + s = "zip -j %s/output.zip %s/hotspot*" %(tempdir2,tempdir) + subprocess.call(s, shell=True) + s = "rm %s/hotspot*" %(tempdir) + subprocess.call(s, shell=True) + + # move the zipped temp directory to options.output + os.system("mv %s/output.zip %s" %(tempdir2,options.output2)) + + + ############################## trans eQTLs ############################## + hotspot_nr = 0 + i = 0 + parent1 = 0 + parent2 = 0 + count_eqtl = 0 + infile4 = open(options.input4,'r') + for line in infile4: + l = line.strip().split("\t") + if line.startswith("="): + hotspot = l[0].split("Hotspot")[1].split(" ")[1] + chr = l[0].split("Hotspot")[1].split(" ")[4] + sliding_IDs = l[1].split(": ")[1] + int_IDs = l[2].split(": ")[1] + nr_eqtl = l[3].split(" ")[1] + if hotspot_nr == 0: + summary_file.write("Trans\t"+hotspot+"\t"+chr+"\t"+sliding_IDs+"\t"+int_IDs+"\t"+nr_eqtl+"\t") + #print "Trans\t"+hotspot+"\t"+chr+"\t"+sliding_IDs+"\t"+int_IDs+"\t"+nr_eqtl+"\t" + if hotspot_nr != 0: + interesting_genes.close() + # run R script from temp directory --> for the previous hotspot + s = "R CMD BATCH %s/new_script.r out.txt" %tempdir + subprocess.call(s, shell=True) + # remove old interesting genes file + #s = "rm %s/interesting_genes.txt" %(tempdir) + subprocess.call(s, shell=True) + sign_GO = manage_output(hotspot_nr, tempdir, "trans") + sGO = str(sign_GO[0])+"\t"+str(sign_GO[1])+"\t"+str(sign_GO[2])+"\t"+str(sign_GO[3])+"\t"+str(sign_GO[4])+"\t"+str(sign_GO[5]) + percentage_1 = round(int(parent1)/float(count_eqtl)*100,1) + percentage_2 = round(int(parent2)/float(count_eqtl)*100,1) + summary_file.write(str(percentage_1)+"\t"+str(percentage_2)+"\t"+str(sGO)+"\n") + #print str(percentage_1)+"\t"+str(percentage_2)+"\t"+str(sGO)+"\n" + summary_file.write("Trans\t"+hotspot+"\t"+chr+"\t"+sliding_IDs+"\t"+int_IDs+"\t"+nr_eqtl+"\t") + #print "Trans\t"+hotspot+"\t"+chr+"\t"+sliding_IDs+"\t"+int_IDs+"\t"+nr_eqtl+"\t" + parent1 = 0 + parent2 = 0 + count_eqtl = 0 + # write a new interesting genes file + hotspot_nr += 1 + interesting_genes = open(tempdir+"/interesting_genes.txt",'w') + else: + if len(l) > 4: + # populate the new interesting genes file + gene_id = l[0] + parent = l[12] + # populate the new interesting genes file + if gene_id != "": + interesting_genes.write(gene_id+"\n") + count_eqtl += 1 + #if parent == parent_list[0]: + # parent1 += 1 + #if parent == parent_list[1]: + # parent2 += 1 + if float(parent) < 0: + parent1 += 1 + else: + parent2 += 1 + interesting_genes.close() + # run R script from temp directory --> for the previous hotspot + s = "R CMD BATCH %s/new_script.r out.txt" %tempdir + subprocess.call(s, shell=True) + # remove old interesting genes file + #s = "rm %s/interesting_genes.txt" %(tempdir) + subprocess.call(s, shell=True) + sign_GO = manage_output(hotspot_nr, tempdir, "trans") + percentage_1 = round(int(parent1)/float(count_eqtl)*100,1) + percentage_2 = round(int(parent2)/float(count_eqtl)*100,1) + sGO = str(sign_GO[0])+"\t"+str(sign_GO[1])+"\t"+str(sign_GO[2])+"\t"+str(sign_GO[3])+"\t"+str(sign_GO[4])+"\t"+str(sign_GO[5]) + summary_file.write(str(percentage_1)+"\t"+str(percentage_2)+"\t"+str(sGO)+"\n") + #print str(percentage_1)+"\t"+str(percentage_2)+"\t"+str(sGO)+"\n" + parent1 = 0 + parent2 = 0 + count_eqtl = 0 + infile4.close() + + ############ + filenames = [] + os.chdir(tempdir+"/") + for files in os.listdir("."): + if (files.endswith(".txt") and files.startswith("hotspot")): + filenames.append(files) + filenames = sorted(filenames) + + with open(tempdir+"/"+"hotspots_trans_GO_tables.txt","w") as outfile: + for fname in filenames: + outfile.write("\n"+fname+"\n") + with open(tempdir+"/"+fname) as infile: + for line in infile: + outfile.write(line) + + #for f in filenames: + # os.remove(f) + + ############ + + tempdir2 = tempfile.mkdtemp() + s = "zip -j %s/output.zip %s/hotspot*" %(tempdir2,tempdir) + subprocess.call(s, shell=True) + s = "rm %s/hotspot*" %(tempdir) + subprocess.call(s, shell=True) + + # move the zipped temp directory to options.output + os.system("mv %s/output.zip %s" %(tempdir2,options.output3)) + summary_file.close() + + ############################################## + +if __name__=="__main__": + __main__()