comparison GO_enrichment/GO_enrichment.py @ 0:bde64415f03b draft

Uploaded
author nanettec
date Wed, 16 Mar 2016 10:15:05 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:bde64415f03b
1 """
2 @summary: GO enrichment analysis (hotspots)
3 @author: nanette.coetzer@gmail.com
4 @version 5
5
6 """
7
8 import optparse, sys
9 import subprocess
10 import tempfile
11 import os, re
12
13 def stop_err( msg ):
14 sys.stderr.write( "%s\n" % msg )
15 sys.exit()
16
17 def __main__():
18 #Parse Command Line
19 parser = optparse.OptionParser()
20 parser.add_option("-n", "--rscript", default=None, dest="rscript",
21 help="R script")
22 parser.add_option("-i", "--input1", default=None, dest="input1",
23 help="Gene universe (Gene positions file)")
24 parser.add_option("-j", "--input2", default=None, dest="input2",
25 help="Hotspots genelists all eQTLs")
26 parser.add_option("-k", "--input3", default=None, dest="input3",
27 help="Hotspots genelists cis-eQTLs")
28 parser.add_option("-l", "--input4", default=None, dest="input4",
29 help="Hotspots genelists trans-eQTLs")
30 parser.add_option("-m", "--input5", default=None, dest="input5",
31 help="Gene2GO mapping")
32 parser.add_option("-o", "--output1", default=None, dest="output1",
33 help="GO enrichment output; all eQTLs")
34 parser.add_option("-p", "--output2", default=None, dest="output2",
35 help="GO enrichment output; cis eQTLs")
36 parser.add_option("-q", "--output3", default=None, dest="output3",
37 help="GO enrichment output; trans eQTLs")
38 parser.add_option("-r", "--output4", default=None, dest="output4",
39 help="Hotspot summary file")
40
41 (options, args) = parser.parse_args()
42
43 try:
44 open(options.input1, "r").close()
45 except TypeError, e:
46 stop_err("You need to supply the Gene Universe file:\n" + str(e))
47 except IOError, e:
48 stop_err("Can not open the Gene Universe file:\n" + str(e))
49
50 try:
51 open(options.input2, "r").close()
52 except TypeError, e:
53 stop_err("You need to supply the Genes of Interest file (all eQTLs):\n" + str(e))
54 except IOError, e:
55 stop_err("Can not open the Hotspots genelists file (all eQTLs):\n" + str(e))
56
57 try:
58 open(options.input3, "r").close()
59 except TypeError, e:
60 stop_err("You need to supply the Genes of Interest file (cis-eQTLs):\n" + str(e))
61 except IOError, e:
62 stop_err("Can not open the Hotspots genelists file (cis-eQTLs):\n" + str(e))
63
64 try:
65 open(options.input4, "r").close()
66 except TypeError, e:
67 stop_err("You need to supply the Genes of Interest file (trnas-eQTLs):\n" + str(e))
68 except IOError, e:
69 stop_err("Can not open the Hotspots genelists file (trans-eQTLs):\n" + str(e))
70
71 try:
72 open(options.input5, "r").close()
73 except TypeError, e:
74 stop_err("You need to supply the Gene2GO mapping file:\n" + str(e))
75 except IOError, e:
76 stop_err("Can not open the Gene2GO mapping file:\n" + str(e))
77
78 ##########################################################
79
80 def uniq(input):
81 output = []
82 for x in input:
83 if x not in output:
84 output.append(x)
85 return output
86
87 def manage_output(hotspot_nr, tempdir, type):
88 sign_list = []; p_BP = 0; adjp_BP = 0; p_MF = 0; adjp_MF = 0; p_CC = 0; adjp_CC = 0
89
90 bpfile = open(tempdir+"/GO_table_BP.txt","r")
91 new_file = open(tempdir+"/hotspot"+str(hotspot_nr)+"_"+str(type)+"_GO_table.txt","w")
92 i = 0
93 stars = 0
94 for line in bpfile:
95 i += 1
96 if i > 1:
97 l = line.strip().split("\t")
98 new_file.write("\t".join(l[1:])+"\n")
99 if l[9] == "\"*\"":
100 p_BP += 1
101 if l[10] == "\"*\"":
102 adjp_BP += 1
103 else:
104 l = line.strip().split("\t")
105 l[0] = "BP_GO.ID"
106 newl = "\t".join(l)
107 new_file.write(newl+"\n")
108 bpfile.close()
109 stars = 5
110 name = "tGO_BP_classic_"+str(stars)+"_all.pdf"
111 full_name = tempdir+"/"+name
112 new_name = tempdir+"/hotspot"+str(hotspot_nr)+"_"+str(type)+"_"+name
113 s = "mv %s %s" %(full_name, new_name)
114 subprocess.call(s, shell=True)
115 sign_list.append(p_BP)
116 sign_list.append(adjp_BP)
117
118 mffile = open(tempdir+"/GO_table_MF.txt","r")
119 i = 0
120 stars = 0
121 for line in mffile:
122 i += 1
123 if i > 1:
124 l = line.strip().split("\t")
125 new_file.write("\t".join(l[1:])+"\n")
126 if l[9] == "\"*\"":
127 p_MF += 1
128 if l[10] == "\"*\"":
129 adjp_MF += 1
130 else:
131 l = line.strip().split("\t")
132 l[0] = "MF_GO.ID"
133 newl = "\t".join(l)
134 new_file.write("\n"+newl+"\n")
135 mffile.close()
136 stars = 5
137 name = "tGO_MF_classic_"+str(stars)+"_all.pdf"
138 full_name = tempdir+"/"+name
139 new_name = tempdir+"/hotspot"+str(hotspot_nr)+"_"+str(type)+"_"+name
140 s = "mv %s %s" %(full_name, new_name)
141 subprocess.call(s, shell=True)
142 sign_list.append(p_MF)
143 sign_list.append(adjp_MF)
144
145 ccfile = open(tempdir+"/GO_table_CC.txt","r")
146 i = 0
147 stars = 0
148 for line in ccfile:
149 i += 1
150 if i > 1:
151 l = line.strip().split("\t")
152 new_file.write("\t".join(l[1:])+"\n")
153 if l[9] == "\"*\"":
154 p_CC += 1
155 if l[10] == "\"*\"":
156 adjp_CC += 1
157 else:
158 l = line.strip().split("\t")
159 l[0] = "CC_GO.ID"
160 newl = "\t".join(l)
161 new_file.write("\n"+newl+"\n")
162 ccfile.close()
163 stars = 5
164 name = "tGO_CC_classic_"+str(stars)+"_all.pdf"
165 full_name = tempdir+"/"+name
166 new_name = tempdir+"/hotspot"+str(hotspot_nr)+"_"+str(type)+"_"+name
167 s = "mv %s %s" %(full_name, new_name)
168 subprocess.call(s, shell=True)
169 new_file.close()
170 sign_list.append(p_CC)
171 sign_list.append(adjp_CC)
172 return sign_list
173
174 # Create temp direcotry
175 tempdir = tempfile.mkdtemp()
176 #print tempdir
177 #fixdir = "/lustre/galchew/MeQTL/eQTL_pipeline"
178 #fixdir = "/Users/nanettecoetzer/Documents/Bioinformatics/MAIZE_project/Hotspots_eQTLs/enrichment_analysis/TopGO/add_to_hotspots_pipeline"
179
180 # copy INPUT file to the temp directory
181 s = "cp %s %s/gene_universe.txt" %(options.input1, tempdir)
182 subprocess.call(s, shell=True)
183 s = "cp %s %s/gene2GO_mapping.map" %(options.input5, tempdir)
184 subprocess.call(s, shell=True)
185
186 # create R script => save in temp directory
187 # generate new header
188 new_script = open(tempdir+"/new_script.r","w")
189 header = "setwd(\"%s\")" %tempdir
190 new_script.write(header+"\n")
191 # add script body
192 script = open(options.rscript,"r")
193 #script = open(fixdir+"/TopGO_pipeline_new.txt","r")
194 for line in script:
195 new_script.write(line.strip()+"\n")
196 new_script.close()
197
198 # get 2 parents
199 #infile = open(options.input2,'r')
200 #parents = []
201 #for line in infile:
202 # l = line.strip().split("\t")
203 # if not line.startswith("=") and len(l) > 4:
204 # parents.append(l[12])
205 #parent_list = uniq(parents)
206 #infile.close()
207
208 summary_file = open(options.output4,'w')
209 summary_file.write("Type\tNumber\tChr\tSliding_IDs\tInterval_IDs\tNum_eQTLs\tParent_neg_additive\tParent_pos_additive\tBP_pVal<0.01\tBP_adj_pVal<0.05\tMF_pVal<0.01\tMF_adj_pVal<0.05\tCC_pVal<0.01\tCC_adj_pVal<0.05\n")
210
211 ############################## all eQTLs ##############################
212 hotspot_nr = 0
213 i = 0
214 parent1 = 0 # Parent_neg_additive
215 parent2 = 0 # Parent_pos_additive
216 count_eqtl = 0
217 infile2 = open(options.input2,'r')
218 for line in infile2:
219 l = line.strip().split("\t")
220 if line.startswith("="):
221 hotspot = l[0].split("Hotspot")[1].split(" ")[1]
222 chr = l[0].split("Hotspot")[1].split(" ")[4]
223 sliding_IDs = l[1].split(": ")[1]
224 int_IDs = l[2].split(": ")[1]
225 nr_eqtl = l[3].split(" ")[1]
226 if hotspot_nr == 0:
227 summary_file.write("All\t"+hotspot+"\t"+chr+"\t"+sliding_IDs+"\t"+int_IDs+"\t"+nr_eqtl+"\t")
228 #print "All\t"+hotspot+"\t"+chr+"\t"+sliding_IDs+"\t"+int_IDs+"\t"+nr_eqtl+"\t"
229 if hotspot_nr != 0:
230 interesting_genes.close()
231 # run R script from temp directory --> for the previous hotspot
232 s = "R CMD BATCH %s/new_script.r out.txt" %tempdir
233 subprocess.call(s, shell=True)
234 # remove old interesting genes file
235 #s = "rm %s/interesting_genes.txt" %(tempdir)
236 subprocess.call(s, shell=True)
237 sign_GO = manage_output(hotspot_nr, tempdir, "all")
238 sGO = str(sign_GO[0])+"\t"+str(sign_GO[1])+"\t"+str(sign_GO[2])+"\t"+str(sign_GO[3])+"\t"+str(sign_GO[4])+"\t"+str(sign_GO[5])
239 percentage_1 = round(int(parent1)/float(count_eqtl)*100,1)
240 percentage_2 = round(int(parent2)/float(count_eqtl)*100,1)
241 summary_file.write(str(percentage_1)+"\t"+str(percentage_2)+"\t"+str(sGO)+"\n")
242 #print str(percentage_1)+"\t"+str(percentage_2)+"\t"+str(sGO)+"\n"
243 summary_file.write("All\t"+hotspot+"\t"+chr+"\t"+sliding_IDs+"\t"+int_IDs+"\t"+nr_eqtl+"\t")
244 #print "All\t"+hotspot+"\t"+chr+"\t"+sliding_IDs+"\t"+int_IDs+"\t"+nr_eqtl+"\t"
245 parent1 = 0
246 parent2 = 0
247 count_eqtl = 0
248 # write a new interesting genes file
249 hotspot_nr += 1
250 interesting_genes = open(tempdir+"/interesting_genes.txt",'w')
251 else:
252 if len(l) > 4:
253 # populate the new interesting genes file
254 gene_id = l[0]
255 parent = l[12]
256 # populate the new interesting genes file
257 if gene_id != "":
258 interesting_genes.write(gene_id+"\n")
259 count_eqtl += 1
260 #if parent == parent_list[0]:
261 # parent1 += 1
262 #if parent == parent_list[1]:
263 # parent2 += 1
264 if float(parent) < 0:
265 parent1 += 1
266 else:
267 parent2 += 1
268 interesting_genes.close()
269 # run R script from temp directory --> for the previous hotspot
270 s = "R CMD BATCH %s/new_script.r out.txt" %tempdir
271 subprocess.call(s, shell=True)
272 # remove old interesting genes file
273 #s = "rm %s/interesting_genes.txt" %(tempdir)
274 subprocess.call(s, shell=True)
275 sign_GO = manage_output(hotspot_nr, tempdir, "all")
276 percentage_1 = round(int(parent1)/float(count_eqtl)*100,1)
277 percentage_2 = round(int(parent2)/float(count_eqtl)*100,1)
278 sGO = str(sign_GO[0])+"\t"+str(sign_GO[1])+"\t"+str(sign_GO[2])+"\t"+str(sign_GO[3])+"\t"+str(sign_GO[4])+"\t"+str(sign_GO[5])
279 summary_file.write(str(percentage_1)+"\t"+str(percentage_2)+"\t"+str(sGO)+"\n")
280 #print str(percentage_1)+"\t"+str(percentage_2)+"\t"+str(sGO)+"\n"
281 parent1 = 0
282 parent2 = 0
283 count_eqtl = 0
284 infile2.close()
285
286 ############
287 filenames = []
288 os.chdir(tempdir+"/")
289 for files in os.listdir("."):
290 if (files.endswith(".txt") and files.startswith("hotspot")):
291 filenames.append(files)
292 filenames = sorted(filenames)
293
294 with open(tempdir+"/"+"hotspots_all_GO_tables.txt","w") as outfile:
295 for fname in filenames:
296 outfile.write("\n"+fname+"\n")
297 with open(tempdir+"/"+fname) as infile:
298 for line in infile:
299 outfile.write(line)
300
301 #for f in filenames:
302 # os.remove(f)
303
304 ############
305
306 tempdir2 = tempfile.mkdtemp()
307 s = "zip -j %s/output.zip %s/hotspot*" %(tempdir2,tempdir)
308 subprocess.call(s, shell=True)
309 s = "rm %s/hotspot*" %(tempdir)
310 subprocess.call(s, shell=True)
311
312 # move the zipped temp directory to options.output
313 os.system("mv %s/output.zip %s" %(tempdir2,options.output1))
314
315
316 ############################## cis eQTLs ##############################
317 hotspot_nr = 0
318 i = 0
319 parent1 = 0
320 parent2 = 0
321 count_eqtl = 0
322 infile3 = open(options.input3,'r')
323 for line in infile3:
324 l = line.strip().split("\t")
325 if line.startswith("="):
326 hotspot = l[0].split("Hotspot")[1].split(" ")[1]
327 chr = l[0].split("Hotspot")[1].split(" ")[4]
328 sliding_IDs = l[1].split(": ")[1]
329 int_IDs = l[2].split(": ")[1]
330 nr_eqtl = l[3].split(" ")[1]
331 if hotspot_nr == 0:
332 summary_file.write("Cis\t"+hotspot+"\t"+chr+"\t"+sliding_IDs+"\t"+int_IDs+"\t"+nr_eqtl+"\t")
333 #print "Cis\t"+hotspot+"\t"+chr+"\t"+sliding_IDs+"\t"+int_IDs+"\t"+nr_eqtl+"\t"
334 if hotspot_nr != 0:
335 interesting_genes.close()
336 # run R script from temp directory --> for the previous hotspot
337 s = "R CMD BATCH %s/new_script.r out.txt" %tempdir
338 subprocess.call(s, shell=True)
339 # remove old interesting genes file
340 #s = "rm %s/interesting_genes.txt" %(tempdir)
341 subprocess.call(s, shell=True)
342 sign_GO = manage_output(hotspot_nr, tempdir, "cis")
343 sGO = str(sign_GO[0])+"\t"+str(sign_GO[1])+"\t"+str(sign_GO[2])+"\t"+str(sign_GO[3])+"\t"+str(sign_GO[4])+"\t"+str(sign_GO[5])
344 percentage_1 = round(int(parent1)/float(count_eqtl)*100,1)
345 percentage_2 = round(int(parent2)/float(count_eqtl)*100,1)
346 summary_file.write(str(percentage_1)+"\t"+str(percentage_2)+"\t"+str(sGO)+"\n")
347 #print str(percentage_1)+"\t"+str(percentage_2)+"\t"+str(sGO)+"\n"
348 summary_file.write("Cis\t"+hotspot+"\t"+chr+"\t"+sliding_IDs+"\t"+int_IDs+"\t"+nr_eqtl+"\t")
349 #print "Cis\t"+hotspot+"\t"+chr+"\t"+sliding_IDs+"\t"+int_IDs+"\t"+nr_eqtl+"\t"
350 parent1 = 0
351 parent2 = 0
352 count_eqtl = 0
353 # write a new interesting genes file
354 hotspot_nr += 1
355 interesting_genes = open(tempdir+"/interesting_genes.txt",'w')
356 else:
357 if len(l) > 4:
358 # populate the new interesting genes file
359 gene_id = l[0]
360 parent = l[12]
361 # populate the new interesting genes file
362 if gene_id != "":
363 interesting_genes.write(gene_id+"\n")
364 count_eqtl += 1
365 #if parent == parent_list[0]:
366 # parent1 += 1
367 #if parent == parent_list[1]:
368 # parent2 += 1
369 if float(parent) < 0:
370 parent1 += 1
371 else:
372 parent2 += 1
373 interesting_genes.close()
374 # run R script from temp directory --> for the previous hotspot
375 s = "R CMD BATCH %s/new_script.r out.txt" %tempdir
376 subprocess.call(s, shell=True)
377 # remove old interesting genes file
378 #s = "rm %s/interesting_genes.txt" %(tempdir)
379 subprocess.call(s, shell=True)
380 sign_GO = manage_output(hotspot_nr, tempdir, "cis")
381 percentage_1 = round(int(parent1)/float(count_eqtl)*100,1)
382 percentage_2 = round(int(parent2)/float(count_eqtl)*100,1)
383 sGO = str(sign_GO[0])+"\t"+str(sign_GO[1])+"\t"+str(sign_GO[2])+"\t"+str(sign_GO[3])+"\t"+str(sign_GO[4])+"\t"+str(sign_GO[5])
384 summary_file.write(str(percentage_1)+"\t"+str(percentage_2)+"\t"+str(sGO)+"\n")
385 #print str(percentage_1)+"\t"+str(percentage_2)+"\t"+str(sGO)+"\n"
386 parent1 = 0
387 parent2 = 0
388 count_eqtl = 0
389 infile3.close()
390
391 ############
392 filenames = []
393 os.chdir(tempdir+"/")
394 for files in os.listdir("."):
395 if (files.endswith(".txt") and files.startswith("hotspot")):
396 filenames.append(files)
397 filenames = sorted(filenames)
398
399 with open(tempdir+"/"+"hotspots_cis_GO_tables.txt","w") as outfile:
400 for fname in filenames:
401 outfile.write("\n"+fname+"\n")
402 with open(tempdir+"/"+fname) as infile:
403 for line in infile:
404 outfile.write(line)
405
406 #for f in filenames:
407 # os.remove(f)
408
409 ############
410
411 tempdir2 = tempfile.mkdtemp()
412 s = "zip -j %s/output.zip %s/hotspot*" %(tempdir2,tempdir)
413 subprocess.call(s, shell=True)
414 s = "rm %s/hotspot*" %(tempdir)
415 subprocess.call(s, shell=True)
416
417 # move the zipped temp directory to options.output
418 os.system("mv %s/output.zip %s" %(tempdir2,options.output2))
419
420
421 ############################## trans eQTLs ##############################
422 hotspot_nr = 0
423 i = 0
424 parent1 = 0
425 parent2 = 0
426 count_eqtl = 0
427 infile4 = open(options.input4,'r')
428 for line in infile4:
429 l = line.strip().split("\t")
430 if line.startswith("="):
431 hotspot = l[0].split("Hotspot")[1].split(" ")[1]
432 chr = l[0].split("Hotspot")[1].split(" ")[4]
433 sliding_IDs = l[1].split(": ")[1]
434 int_IDs = l[2].split(": ")[1]
435 nr_eqtl = l[3].split(" ")[1]
436 if hotspot_nr == 0:
437 summary_file.write("Trans\t"+hotspot+"\t"+chr+"\t"+sliding_IDs+"\t"+int_IDs+"\t"+nr_eqtl+"\t")
438 #print "Trans\t"+hotspot+"\t"+chr+"\t"+sliding_IDs+"\t"+int_IDs+"\t"+nr_eqtl+"\t"
439 if hotspot_nr != 0:
440 interesting_genes.close()
441 # run R script from temp directory --> for the previous hotspot
442 s = "R CMD BATCH %s/new_script.r out.txt" %tempdir
443 subprocess.call(s, shell=True)
444 # remove old interesting genes file
445 #s = "rm %s/interesting_genes.txt" %(tempdir)
446 subprocess.call(s, shell=True)
447 sign_GO = manage_output(hotspot_nr, tempdir, "trans")
448 sGO = str(sign_GO[0])+"\t"+str(sign_GO[1])+"\t"+str(sign_GO[2])+"\t"+str(sign_GO[3])+"\t"+str(sign_GO[4])+"\t"+str(sign_GO[5])
449 percentage_1 = round(int(parent1)/float(count_eqtl)*100,1)
450 percentage_2 = round(int(parent2)/float(count_eqtl)*100,1)
451 summary_file.write(str(percentage_1)+"\t"+str(percentage_2)+"\t"+str(sGO)+"\n")
452 #print str(percentage_1)+"\t"+str(percentage_2)+"\t"+str(sGO)+"\n"
453 summary_file.write("Trans\t"+hotspot+"\t"+chr+"\t"+sliding_IDs+"\t"+int_IDs+"\t"+nr_eqtl+"\t")
454 #print "Trans\t"+hotspot+"\t"+chr+"\t"+sliding_IDs+"\t"+int_IDs+"\t"+nr_eqtl+"\t"
455 parent1 = 0
456 parent2 = 0
457 count_eqtl = 0
458 # write a new interesting genes file
459 hotspot_nr += 1
460 interesting_genes = open(tempdir+"/interesting_genes.txt",'w')
461 else:
462 if len(l) > 4:
463 # populate the new interesting genes file
464 gene_id = l[0]
465 parent = l[12]
466 # populate the new interesting genes file
467 if gene_id != "":
468 interesting_genes.write(gene_id+"\n")
469 count_eqtl += 1
470 #if parent == parent_list[0]:
471 # parent1 += 1
472 #if parent == parent_list[1]:
473 # parent2 += 1
474 if float(parent) < 0:
475 parent1 += 1
476 else:
477 parent2 += 1
478 interesting_genes.close()
479 # run R script from temp directory --> for the previous hotspot
480 s = "R CMD BATCH %s/new_script.r out.txt" %tempdir
481 subprocess.call(s, shell=True)
482 # remove old interesting genes file
483 #s = "rm %s/interesting_genes.txt" %(tempdir)
484 subprocess.call(s, shell=True)
485 sign_GO = manage_output(hotspot_nr, tempdir, "trans")
486 percentage_1 = round(int(parent1)/float(count_eqtl)*100,1)
487 percentage_2 = round(int(parent2)/float(count_eqtl)*100,1)
488 sGO = str(sign_GO[0])+"\t"+str(sign_GO[1])+"\t"+str(sign_GO[2])+"\t"+str(sign_GO[3])+"\t"+str(sign_GO[4])+"\t"+str(sign_GO[5])
489 summary_file.write(str(percentage_1)+"\t"+str(percentage_2)+"\t"+str(sGO)+"\n")
490 #print str(percentage_1)+"\t"+str(percentage_2)+"\t"+str(sGO)+"\n"
491 parent1 = 0
492 parent2 = 0
493 count_eqtl = 0
494 infile4.close()
495
496 ############
497 filenames = []
498 os.chdir(tempdir+"/")
499 for files in os.listdir("."):
500 if (files.endswith(".txt") and files.startswith("hotspot")):
501 filenames.append(files)
502 filenames = sorted(filenames)
503
504 with open(tempdir+"/"+"hotspots_trans_GO_tables.txt","w") as outfile:
505 for fname in filenames:
506 outfile.write("\n"+fname+"\n")
507 with open(tempdir+"/"+fname) as infile:
508 for line in infile:
509 outfile.write(line)
510
511 #for f in filenames:
512 # os.remove(f)
513
514 ############
515
516 tempdir2 = tempfile.mkdtemp()
517 s = "zip -j %s/output.zip %s/hotspot*" %(tempdir2,tempdir)
518 subprocess.call(s, shell=True)
519 s = "rm %s/hotspot*" %(tempdir)
520 subprocess.call(s, shell=True)
521
522 # move the zipped temp directory to options.output
523 os.system("mv %s/output.zip %s" %(tempdir2,options.output3))
524 summary_file.close()
525
526 ##############################################
527
528 if __name__=="__main__":
529 __main__()