Mercurial > repos > nanettec > go_enrichment
view GO_enrichment/GO_enrichment.py @ 1:ec4d749f467b draft
Deleted selected files
author | nanettec |
---|---|
date | Wed, 16 Mar 2016 10:16:23 -0400 |
parents | bde64415f03b |
children |
line wrap: on
line source
""" @summary: GO enrichment analysis (hotspots) @author: nanette.coetzer@gmail.com @version 5 """ import optparse, sys import subprocess import tempfile import os, re def stop_err( msg ): sys.stderr.write( "%s\n" % msg ) sys.exit() def __main__(): #Parse Command Line parser = optparse.OptionParser() parser.add_option("-n", "--rscript", default=None, dest="rscript", help="R script") parser.add_option("-i", "--input1", default=None, dest="input1", help="Gene universe (Gene positions file)") parser.add_option("-j", "--input2", default=None, dest="input2", help="Hotspots genelists all eQTLs") parser.add_option("-k", "--input3", default=None, dest="input3", help="Hotspots genelists cis-eQTLs") parser.add_option("-l", "--input4", default=None, dest="input4", help="Hotspots genelists trans-eQTLs") parser.add_option("-m", "--input5", default=None, dest="input5", help="Gene2GO mapping") parser.add_option("-o", "--output1", default=None, dest="output1", help="GO enrichment output; all eQTLs") parser.add_option("-p", "--output2", default=None, dest="output2", help="GO enrichment output; cis eQTLs") parser.add_option("-q", "--output3", default=None, dest="output3", help="GO enrichment output; trans eQTLs") parser.add_option("-r", "--output4", default=None, dest="output4", help="Hotspot summary file") (options, args) = parser.parse_args() try: open(options.input1, "r").close() except TypeError, e: stop_err("You need to supply the Gene Universe file:\n" + str(e)) except IOError, e: stop_err("Can not open the Gene Universe file:\n" + str(e)) try: open(options.input2, "r").close() except TypeError, e: stop_err("You need to supply the Genes of Interest file (all eQTLs):\n" + str(e)) except IOError, e: stop_err("Can not open the Hotspots genelists file (all eQTLs):\n" + str(e)) try: open(options.input3, "r").close() except TypeError, e: stop_err("You need to supply the Genes of Interest file (cis-eQTLs):\n" + str(e)) except IOError, e: stop_err("Can not open the Hotspots genelists file (cis-eQTLs):\n" + str(e)) try: open(options.input4, "r").close() except TypeError, e: stop_err("You need to supply the Genes of Interest file (trnas-eQTLs):\n" + str(e)) except IOError, e: stop_err("Can not open the Hotspots genelists file (trans-eQTLs):\n" + str(e)) try: open(options.input5, "r").close() except TypeError, e: stop_err("You need to supply the Gene2GO mapping file:\n" + str(e)) except IOError, e: stop_err("Can not open the Gene2GO mapping file:\n" + str(e)) ########################################################## def uniq(input): output = [] for x in input: if x not in output: output.append(x) return output def manage_output(hotspot_nr, tempdir, type): sign_list = []; p_BP = 0; adjp_BP = 0; p_MF = 0; adjp_MF = 0; p_CC = 0; adjp_CC = 0 bpfile = open(tempdir+"/GO_table_BP.txt","r") new_file = open(tempdir+"/hotspot"+str(hotspot_nr)+"_"+str(type)+"_GO_table.txt","w") i = 0 stars = 0 for line in bpfile: i += 1 if i > 1: l = line.strip().split("\t") new_file.write("\t".join(l[1:])+"\n") if l[9] == "\"*\"": p_BP += 1 if l[10] == "\"*\"": adjp_BP += 1 else: l = line.strip().split("\t") l[0] = "BP_GO.ID" newl = "\t".join(l) new_file.write(newl+"\n") bpfile.close() stars = 5 name = "tGO_BP_classic_"+str(stars)+"_all.pdf" full_name = tempdir+"/"+name new_name = tempdir+"/hotspot"+str(hotspot_nr)+"_"+str(type)+"_"+name s = "mv %s %s" %(full_name, new_name) subprocess.call(s, shell=True) sign_list.append(p_BP) sign_list.append(adjp_BP) mffile = open(tempdir+"/GO_table_MF.txt","r") i = 0 stars = 0 for line in mffile: i += 1 if i > 1: l = line.strip().split("\t") new_file.write("\t".join(l[1:])+"\n") if l[9] == "\"*\"": p_MF += 1 if l[10] == "\"*\"": adjp_MF += 1 else: l = line.strip().split("\t") l[0] = "MF_GO.ID" newl = "\t".join(l) new_file.write("\n"+newl+"\n") mffile.close() stars = 5 name = "tGO_MF_classic_"+str(stars)+"_all.pdf" full_name = tempdir+"/"+name new_name = tempdir+"/hotspot"+str(hotspot_nr)+"_"+str(type)+"_"+name s = "mv %s %s" %(full_name, new_name) subprocess.call(s, shell=True) sign_list.append(p_MF) sign_list.append(adjp_MF) ccfile = open(tempdir+"/GO_table_CC.txt","r") i = 0 stars = 0 for line in ccfile: i += 1 if i > 1: l = line.strip().split("\t") new_file.write("\t".join(l[1:])+"\n") if l[9] == "\"*\"": p_CC += 1 if l[10] == "\"*\"": adjp_CC += 1 else: l = line.strip().split("\t") l[0] = "CC_GO.ID" newl = "\t".join(l) new_file.write("\n"+newl+"\n") ccfile.close() stars = 5 name = "tGO_CC_classic_"+str(stars)+"_all.pdf" full_name = tempdir+"/"+name new_name = tempdir+"/hotspot"+str(hotspot_nr)+"_"+str(type)+"_"+name s = "mv %s %s" %(full_name, new_name) subprocess.call(s, shell=True) new_file.close() sign_list.append(p_CC) sign_list.append(adjp_CC) return sign_list # Create temp direcotry tempdir = tempfile.mkdtemp() #print tempdir #fixdir = "/lustre/galchew/MeQTL/eQTL_pipeline" #fixdir = "/Users/nanettecoetzer/Documents/Bioinformatics/MAIZE_project/Hotspots_eQTLs/enrichment_analysis/TopGO/add_to_hotspots_pipeline" # copy INPUT file to the temp directory s = "cp %s %s/gene_universe.txt" %(options.input1, tempdir) subprocess.call(s, shell=True) s = "cp %s %s/gene2GO_mapping.map" %(options.input5, tempdir) subprocess.call(s, shell=True) # create R script => save in temp directory # generate new header new_script = open(tempdir+"/new_script.r","w") header = "setwd(\"%s\")" %tempdir new_script.write(header+"\n") # add script body script = open(options.rscript,"r") #script = open(fixdir+"/TopGO_pipeline_new.txt","r") for line in script: new_script.write(line.strip()+"\n") new_script.close() # get 2 parents #infile = open(options.input2,'r') #parents = [] #for line in infile: # l = line.strip().split("\t") # if not line.startswith("=") and len(l) > 4: # parents.append(l[12]) #parent_list = uniq(parents) #infile.close() summary_file = open(options.output4,'w') summary_file.write("Type\tNumber\tChr\tSliding_IDs\tInterval_IDs\tNum_eQTLs\tParent_neg_additive\tParent_pos_additive\tBP_pVal<0.01\tBP_adj_pVal<0.05\tMF_pVal<0.01\tMF_adj_pVal<0.05\tCC_pVal<0.01\tCC_adj_pVal<0.05\n") ############################## all eQTLs ############################## hotspot_nr = 0 i = 0 parent1 = 0 # Parent_neg_additive parent2 = 0 # Parent_pos_additive count_eqtl = 0 infile2 = open(options.input2,'r') for line in infile2: l = line.strip().split("\t") if line.startswith("="): hotspot = l[0].split("Hotspot")[1].split(" ")[1] chr = l[0].split("Hotspot")[1].split(" ")[4] sliding_IDs = l[1].split(": ")[1] int_IDs = l[2].split(": ")[1] nr_eqtl = l[3].split(" ")[1] if hotspot_nr == 0: summary_file.write("All\t"+hotspot+"\t"+chr+"\t"+sliding_IDs+"\t"+int_IDs+"\t"+nr_eqtl+"\t") #print "All\t"+hotspot+"\t"+chr+"\t"+sliding_IDs+"\t"+int_IDs+"\t"+nr_eqtl+"\t" if hotspot_nr != 0: interesting_genes.close() # run R script from temp directory --> for the previous hotspot s = "R CMD BATCH %s/new_script.r out.txt" %tempdir subprocess.call(s, shell=True) # remove old interesting genes file #s = "rm %s/interesting_genes.txt" %(tempdir) subprocess.call(s, shell=True) sign_GO = manage_output(hotspot_nr, tempdir, "all") sGO = str(sign_GO[0])+"\t"+str(sign_GO[1])+"\t"+str(sign_GO[2])+"\t"+str(sign_GO[3])+"\t"+str(sign_GO[4])+"\t"+str(sign_GO[5]) percentage_1 = round(int(parent1)/float(count_eqtl)*100,1) percentage_2 = round(int(parent2)/float(count_eqtl)*100,1) summary_file.write(str(percentage_1)+"\t"+str(percentage_2)+"\t"+str(sGO)+"\n") #print str(percentage_1)+"\t"+str(percentage_2)+"\t"+str(sGO)+"\n" summary_file.write("All\t"+hotspot+"\t"+chr+"\t"+sliding_IDs+"\t"+int_IDs+"\t"+nr_eqtl+"\t") #print "All\t"+hotspot+"\t"+chr+"\t"+sliding_IDs+"\t"+int_IDs+"\t"+nr_eqtl+"\t" parent1 = 0 parent2 = 0 count_eqtl = 0 # write a new interesting genes file hotspot_nr += 1 interesting_genes = open(tempdir+"/interesting_genes.txt",'w') else: if len(l) > 4: # populate the new interesting genes file gene_id = l[0] parent = l[12] # populate the new interesting genes file if gene_id != "": interesting_genes.write(gene_id+"\n") count_eqtl += 1 #if parent == parent_list[0]: # parent1 += 1 #if parent == parent_list[1]: # parent2 += 1 if float(parent) < 0: parent1 += 1 else: parent2 += 1 interesting_genes.close() # run R script from temp directory --> for the previous hotspot s = "R CMD BATCH %s/new_script.r out.txt" %tempdir subprocess.call(s, shell=True) # remove old interesting genes file #s = "rm %s/interesting_genes.txt" %(tempdir) subprocess.call(s, shell=True) sign_GO = manage_output(hotspot_nr, tempdir, "all") percentage_1 = round(int(parent1)/float(count_eqtl)*100,1) percentage_2 = round(int(parent2)/float(count_eqtl)*100,1) sGO = str(sign_GO[0])+"\t"+str(sign_GO[1])+"\t"+str(sign_GO[2])+"\t"+str(sign_GO[3])+"\t"+str(sign_GO[4])+"\t"+str(sign_GO[5]) summary_file.write(str(percentage_1)+"\t"+str(percentage_2)+"\t"+str(sGO)+"\n") #print str(percentage_1)+"\t"+str(percentage_2)+"\t"+str(sGO)+"\n" parent1 = 0 parent2 = 0 count_eqtl = 0 infile2.close() ############ filenames = [] os.chdir(tempdir+"/") for files in os.listdir("."): if (files.endswith(".txt") and files.startswith("hotspot")): filenames.append(files) filenames = sorted(filenames) with open(tempdir+"/"+"hotspots_all_GO_tables.txt","w") as outfile: for fname in filenames: outfile.write("\n"+fname+"\n") with open(tempdir+"/"+fname) as infile: for line in infile: outfile.write(line) #for f in filenames: # os.remove(f) ############ tempdir2 = tempfile.mkdtemp() s = "zip -j %s/output.zip %s/hotspot*" %(tempdir2,tempdir) subprocess.call(s, shell=True) s = "rm %s/hotspot*" %(tempdir) subprocess.call(s, shell=True) # move the zipped temp directory to options.output os.system("mv %s/output.zip %s" %(tempdir2,options.output1)) ############################## cis eQTLs ############################## hotspot_nr = 0 i = 0 parent1 = 0 parent2 = 0 count_eqtl = 0 infile3 = open(options.input3,'r') for line in infile3: l = line.strip().split("\t") if line.startswith("="): hotspot = l[0].split("Hotspot")[1].split(" ")[1] chr = l[0].split("Hotspot")[1].split(" ")[4] sliding_IDs = l[1].split(": ")[1] int_IDs = l[2].split(": ")[1] nr_eqtl = l[3].split(" ")[1] if hotspot_nr == 0: summary_file.write("Cis\t"+hotspot+"\t"+chr+"\t"+sliding_IDs+"\t"+int_IDs+"\t"+nr_eqtl+"\t") #print "Cis\t"+hotspot+"\t"+chr+"\t"+sliding_IDs+"\t"+int_IDs+"\t"+nr_eqtl+"\t" if hotspot_nr != 0: interesting_genes.close() # run R script from temp directory --> for the previous hotspot s = "R CMD BATCH %s/new_script.r out.txt" %tempdir subprocess.call(s, shell=True) # remove old interesting genes file #s = "rm %s/interesting_genes.txt" %(tempdir) subprocess.call(s, shell=True) sign_GO = manage_output(hotspot_nr, tempdir, "cis") sGO = str(sign_GO[0])+"\t"+str(sign_GO[1])+"\t"+str(sign_GO[2])+"\t"+str(sign_GO[3])+"\t"+str(sign_GO[4])+"\t"+str(sign_GO[5]) percentage_1 = round(int(parent1)/float(count_eqtl)*100,1) percentage_2 = round(int(parent2)/float(count_eqtl)*100,1) summary_file.write(str(percentage_1)+"\t"+str(percentage_2)+"\t"+str(sGO)+"\n") #print str(percentage_1)+"\t"+str(percentage_2)+"\t"+str(sGO)+"\n" summary_file.write("Cis\t"+hotspot+"\t"+chr+"\t"+sliding_IDs+"\t"+int_IDs+"\t"+nr_eqtl+"\t") #print "Cis\t"+hotspot+"\t"+chr+"\t"+sliding_IDs+"\t"+int_IDs+"\t"+nr_eqtl+"\t" parent1 = 0 parent2 = 0 count_eqtl = 0 # write a new interesting genes file hotspot_nr += 1 interesting_genes = open(tempdir+"/interesting_genes.txt",'w') else: if len(l) > 4: # populate the new interesting genes file gene_id = l[0] parent = l[12] # populate the new interesting genes file if gene_id != "": interesting_genes.write(gene_id+"\n") count_eqtl += 1 #if parent == parent_list[0]: # parent1 += 1 #if parent == parent_list[1]: # parent2 += 1 if float(parent) < 0: parent1 += 1 else: parent2 += 1 interesting_genes.close() # run R script from temp directory --> for the previous hotspot s = "R CMD BATCH %s/new_script.r out.txt" %tempdir subprocess.call(s, shell=True) # remove old interesting genes file #s = "rm %s/interesting_genes.txt" %(tempdir) subprocess.call(s, shell=True) sign_GO = manage_output(hotspot_nr, tempdir, "cis") percentage_1 = round(int(parent1)/float(count_eqtl)*100,1) percentage_2 = round(int(parent2)/float(count_eqtl)*100,1) sGO = str(sign_GO[0])+"\t"+str(sign_GO[1])+"\t"+str(sign_GO[2])+"\t"+str(sign_GO[3])+"\t"+str(sign_GO[4])+"\t"+str(sign_GO[5]) summary_file.write(str(percentage_1)+"\t"+str(percentage_2)+"\t"+str(sGO)+"\n") #print str(percentage_1)+"\t"+str(percentage_2)+"\t"+str(sGO)+"\n" parent1 = 0 parent2 = 0 count_eqtl = 0 infile3.close() ############ filenames = [] os.chdir(tempdir+"/") for files in os.listdir("."): if (files.endswith(".txt") and files.startswith("hotspot")): filenames.append(files) filenames = sorted(filenames) with open(tempdir+"/"+"hotspots_cis_GO_tables.txt","w") as outfile: for fname in filenames: outfile.write("\n"+fname+"\n") with open(tempdir+"/"+fname) as infile: for line in infile: outfile.write(line) #for f in filenames: # os.remove(f) ############ tempdir2 = tempfile.mkdtemp() s = "zip -j %s/output.zip %s/hotspot*" %(tempdir2,tempdir) subprocess.call(s, shell=True) s = "rm %s/hotspot*" %(tempdir) subprocess.call(s, shell=True) # move the zipped temp directory to options.output os.system("mv %s/output.zip %s" %(tempdir2,options.output2)) ############################## trans eQTLs ############################## hotspot_nr = 0 i = 0 parent1 = 0 parent2 = 0 count_eqtl = 0 infile4 = open(options.input4,'r') for line in infile4: l = line.strip().split("\t") if line.startswith("="): hotspot = l[0].split("Hotspot")[1].split(" ")[1] chr = l[0].split("Hotspot")[1].split(" ")[4] sliding_IDs = l[1].split(": ")[1] int_IDs = l[2].split(": ")[1] nr_eqtl = l[3].split(" ")[1] if hotspot_nr == 0: summary_file.write("Trans\t"+hotspot+"\t"+chr+"\t"+sliding_IDs+"\t"+int_IDs+"\t"+nr_eqtl+"\t") #print "Trans\t"+hotspot+"\t"+chr+"\t"+sliding_IDs+"\t"+int_IDs+"\t"+nr_eqtl+"\t" if hotspot_nr != 0: interesting_genes.close() # run R script from temp directory --> for the previous hotspot s = "R CMD BATCH %s/new_script.r out.txt" %tempdir subprocess.call(s, shell=True) # remove old interesting genes file #s = "rm %s/interesting_genes.txt" %(tempdir) subprocess.call(s, shell=True) sign_GO = manage_output(hotspot_nr, tempdir, "trans") sGO = str(sign_GO[0])+"\t"+str(sign_GO[1])+"\t"+str(sign_GO[2])+"\t"+str(sign_GO[3])+"\t"+str(sign_GO[4])+"\t"+str(sign_GO[5]) percentage_1 = round(int(parent1)/float(count_eqtl)*100,1) percentage_2 = round(int(parent2)/float(count_eqtl)*100,1) summary_file.write(str(percentage_1)+"\t"+str(percentage_2)+"\t"+str(sGO)+"\n") #print str(percentage_1)+"\t"+str(percentage_2)+"\t"+str(sGO)+"\n" summary_file.write("Trans\t"+hotspot+"\t"+chr+"\t"+sliding_IDs+"\t"+int_IDs+"\t"+nr_eqtl+"\t") #print "Trans\t"+hotspot+"\t"+chr+"\t"+sliding_IDs+"\t"+int_IDs+"\t"+nr_eqtl+"\t" parent1 = 0 parent2 = 0 count_eqtl = 0 # write a new interesting genes file hotspot_nr += 1 interesting_genes = open(tempdir+"/interesting_genes.txt",'w') else: if len(l) > 4: # populate the new interesting genes file gene_id = l[0] parent = l[12] # populate the new interesting genes file if gene_id != "": interesting_genes.write(gene_id+"\n") count_eqtl += 1 #if parent == parent_list[0]: # parent1 += 1 #if parent == parent_list[1]: # parent2 += 1 if float(parent) < 0: parent1 += 1 else: parent2 += 1 interesting_genes.close() # run R script from temp directory --> for the previous hotspot s = "R CMD BATCH %s/new_script.r out.txt" %tempdir subprocess.call(s, shell=True) # remove old interesting genes file #s = "rm %s/interesting_genes.txt" %(tempdir) subprocess.call(s, shell=True) sign_GO = manage_output(hotspot_nr, tempdir, "trans") percentage_1 = round(int(parent1)/float(count_eqtl)*100,1) percentage_2 = round(int(parent2)/float(count_eqtl)*100,1) sGO = str(sign_GO[0])+"\t"+str(sign_GO[1])+"\t"+str(sign_GO[2])+"\t"+str(sign_GO[3])+"\t"+str(sign_GO[4])+"\t"+str(sign_GO[5]) summary_file.write(str(percentage_1)+"\t"+str(percentage_2)+"\t"+str(sGO)+"\n") #print str(percentage_1)+"\t"+str(percentage_2)+"\t"+str(sGO)+"\n" parent1 = 0 parent2 = 0 count_eqtl = 0 infile4.close() ############ filenames = [] os.chdir(tempdir+"/") for files in os.listdir("."): if (files.endswith(".txt") and files.startswith("hotspot")): filenames.append(files) filenames = sorted(filenames) with open(tempdir+"/"+"hotspots_trans_GO_tables.txt","w") as outfile: for fname in filenames: outfile.write("\n"+fname+"\n") with open(tempdir+"/"+fname) as infile: for line in infile: outfile.write(line) #for f in filenames: # os.remove(f) ############ tempdir2 = tempfile.mkdtemp() s = "zip -j %s/output.zip %s/hotspot*" %(tempdir2,tempdir) subprocess.call(s, shell=True) s = "rm %s/hotspot*" %(tempdir) subprocess.call(s, shell=True) # move the zipped temp directory to options.output os.system("mv %s/output.zip %s" %(tempdir2,options.output3)) summary_file.close() ############################################## if __name__=="__main__": __main__()