comparison get_codon_frequency.py @ 15:702e60e819c2 draft

Uploaded
author rlegendre
date Mon, 11 May 2015 09:53:08 -0400
parents 344bacf6acb8
children fcfdb2607cb8
comparison
equal deleted inserted replaced
14:344bacf6acb8 15:702e60e819c2
9 ''' 9 '''
10 10
11 from __future__ import division 11 from __future__ import division
12 import os, sys, optparse, tempfile, subprocess, re, shutil, commands, urllib, time 12 import os, sys, optparse, tempfile, subprocess, re, shutil, commands, urllib, time
13 import itertools 13 import itertools
14 import math 14 from math import log10
15 from decimal import Decimal 15 from decimal import Decimal
16 from Bio import SeqIO 16 from Bio import SeqIO
17 from Bio.Seq import Seq 17 from Bio.Seq import Seq
18 from numpy import arange, std, array, linspace, average 18 from numpy import arange, std, array, linspace, average
19 #from matplotlib import pyplot as pl 19 #from matplotlib import pyplot as pl
27 from collections import OrderedDict 27 from collections import OrderedDict
28 import ribo_functions 28 import ribo_functions
29 import HTSeq 29 import HTSeq
30 # #libraries for debugg 30 # #libraries for debugg
31 #import pdb 31 #import pdb
32 # import cPickle 32 import cPickle
33 33
34 def stop_err(msg): 34 def stop_err(msg):
35 sys.stderr.write("%s\n" % msg) 35 sys.stderr.write("%s\n" % msg)
36 sys.stderr.write("Programme aborted at %s\n" % time.asctime(time.localtime(time.time()))) 36 sys.stderr.write("Programme aborted at %s\n" % time.asctime(time.localtime(time.time())))
37 sys.exit() 37 sys.exit()
300 cond1_2 = result[1].copy() 300 cond1_2 = result[1].copy()
301 cond2_1 = result[2].copy() 301 cond2_1 = result[2].copy()
302 cond2_2 = result[3].copy() 302 cond2_2 = result[3].copy()
303 # get codon order in one of list 303 # get codon order in one of list
304 codon_sorted = sorted(cond1_1.iterkeys(), reverse=False) 304 codon_sorted = sorted(cond1_1.iterkeys(), reverse=False)
305 # get max of each list 305 try:
306 sum11 = sum(list(cond1_1.itervalues())) 306 # get max of each list
307 sum12 = sum(list(cond1_2.itervalues())) 307 sum11 = sum(list(cond1_1.itervalues()))
308 sum21 = sum(list(cond2_1.itervalues())) 308 sum12 = sum(list(cond1_2.itervalues()))
309 sum22 = sum(list(cond2_2.itervalues())) 309 sum21 = sum(list(cond2_1.itervalues()))
310 # for each codon, get values and sd in each condition 310 sum22 = sum(list(cond2_2.itervalues()))
311 cond1_val = {} 311 # for each codon, get values and sd in each condition
312 cond1 = {} 312 cond1_val = {}
313 cond2_val = {} 313 cond1 = {}
314 cond2 = {} 314 cond2_val = {}
315 std_cond1 = [] 315 cond2 = {}
316 std_cond2 = [] 316 std_cond1 = []
317 max_val = [] # # max value for graph 317 std_cond2 = []
318 for i in codon_sorted: 318 max_val = [] # # max value for graph
319 # # cond1 = moyenne of replicats cond1 divided by max 319 for i in codon_sorted:
320 cond1_val[i] = ((cond1_1[i] / sum11 + cond1_2[i] / sum12) / 2) 320 # # cond1 = moyenne of replicats cond1 divided by max
321 cond1[i] = ((cond1_1[i] + cond1_2[i]) / 2) 321 cond1_val[i] = ((cond1_1[i] / sum11 + cond1_2[i] / sum12) / 2)
322 # # standard deviation = absolute value of diffence between replicats of cond1 322 cond1[i] = ((cond1_1[i] + cond1_2[i]) / 2)
323 std_cond1.append(std(array([(cond1_1[i] * 100 / sum11), (cond1_2[i] * 100 / sum12)]))) 323 # # standard deviation = absolute value of diffence between replicats of cond1
324 # # cond2 = moyenne of replicats cond1divided by max 324 std_cond1.append(std(array([(cond1_1[i] * 100 / sum11), (cond1_2[i] * 100 / sum12)])))
325 cond2_val[i] = ((cond2_1[i] / sum21 + cond2_2[i] / sum22) / 2) 325 # # cond2 = moyenne of replicats cond1divided by max
326 cond2[i] = ((cond2_1[i] + cond2_2[i]) / 2) 326 cond2_val[i] = ((cond2_1[i] / sum21 + cond2_2[i] / sum22) / 2)
327 # # standard deviation = absolute value of diffence between replicats of cond2 327 cond2[i] = ((cond2_1[i] + cond2_2[i]) / 2)
328 std_cond2.append(std(array([((cond2_1[i]) * 100 / sum21), ((cond2_2[i]) * 100 / sum22)]))) 328 # # standard deviation = absolute value of diffence between replicats of cond2
329 # # max value for each codon 329 std_cond2.append(std(array([((cond2_1[i]) * 100 / sum21), ((cond2_2[i]) * 100 / sum22)])))
330 max_val.append(max((cond1_1[i] / sum11 + cond1_2[i] / sum12) / 2, (cond2_1[i] / sum21 + cond2_2[i] / sum22) / 2)) 330 # # max value for each codon
331 331 max_val.append(max((cond1_1[i] / sum11 + cond1_2[i] / sum12) / 2, (cond2_1[i] / sum21 + cond2_2[i] / sum22) / 2))
332 # for graph design 332
333 cond1_norm = OrderedDict(sorted(cond1_val.items(), key=lambda t: t[0])) 333 # for graph design
334 cond1_norm.update ((x, y * 100) for x, y in cond1_norm.items()) 334 cond1_norm = OrderedDict(sorted(cond1_val.items(), key=lambda t: t[0]))
335 cond2_norm = OrderedDict(sorted(cond2_val.items(), key=lambda t: t[0])) 335 cond1_norm.update ((x, y * 100) for x, y in cond1_norm.items())
336 cond2_norm.update ((x, y * 100) for x, y in cond2_norm.items()) 336 cond2_norm = OrderedDict(sorted(cond2_val.items(), key=lambda t: t[0]))
337 max_val = [x * 100 for x in max_val] 337 cond2_norm.update ((x, y * 100) for x, y in cond2_norm.items())
338 max_val = [x * 100 for x in max_val]
339 except ZeroDivisionError:
340 stop_err("Not enough reads to compute the codon occupancy")
338 341
339 AA = get_aa_dict(cond1_norm, cond2_norm) 342 AA = get_aa_dict(cond1_norm, cond2_norm)
340 max_valaa = [] 343 max_valaa = []
341 cond1_aa = [] 344 cond1_aa = []
342 cond2_aa = [] 345 cond2_aa = []
344 for z in AA.itervalues(): 347 for z in AA.itervalues():
345 cond1_aa.append(z[0]) 348 cond1_aa.append(z[0])
346 cond2_aa.append(z[1]) 349 cond2_aa.append(z[1])
347 max_valaa.append(max(z)) 350 max_valaa.append(max(z))
348 # # plot amino acid profile : 351 # # plot amino acid profile :
349 fig = pl.figure(figsize=(30, 10), num=1) 352 fig = pl.figure(figsize=(15,10), num=1)
350 width = .50 353 width = .50
351 ax = fig.add_subplot(111) 354 ax = fig.add_subplot(111)
352 ax.xaxis.set_ticks([]) 355 ax.xaxis.set_ticks([])
353 ind = arange(21) 356 ind = arange(21)
354 pl.xlim(0, 21) 357 pl.xlim(0, 21)
384 out.write('T : ' + str(chi[0]) + '; p-value : ' + str(chi[1]) + '\n') 387 out.write('T : ' + str(chi[0]) + '; p-value : ' + str(chi[1]) + '\n')
385 388
386 389
387 390
388 # plot result 391 # plot result
389 fig = pl.figure(figsize=(30, 10), num=1) 392 fig = pl.figure(figsize=(20,10), num=1)
390 width = .40 393 width = .40
391 ind = arange(len(codon_sorted)) 394 ind = arange(len(codon_sorted))
392 ax = fig.add_subplot(111) 395 ax = fig.add_subplot(111)
393 pl.xlim(0, len(codon_sorted) + 1) 396 pl.xlim(0, len(codon_sorted) + 1)
394 ax.spines['right'].set_color('none') 397 ax.spines['right'].set_color('none')
404 ax.set_xlabel('Codons') 407 ax.set_xlabel('Codons')
405 handles, labels = ax.get_legend_handles_labels() 408 handles, labels = ax.get_legend_handles_labels()
406 ax.legend(handles, labels) 409 ax.legend(handles, labels)
407 pl.savefig(dirout + '/hist_codons.png', format="png", dpi=340) 410 pl.savefig(dirout + '/hist_codons.png', format="png", dpi=340)
408 pl.clf() 411 pl.clf()
409
410 412
411 413
412 elif len(result) == 2 : 414 elif len(result) == 2 :
413 415
414 # store each dict in OrderedDict sorted by key to make code more readable 416 # store each dict in OrderedDict sorted by key to make code more readable
417 cond1_norm = result[0].copy() 419 cond1_norm = result[0].copy()
418 cond2_norm = result[1].copy() 420 cond2_norm = result[1].copy()
419 # pdb.set_trace() 421 # pdb.set_trace()
420 # get codon order in one of list 422 # get codon order in one of list
421 codon_sorted = sorted(cond1.iterkeys(), reverse=False) 423 codon_sorted = sorted(cond1.iterkeys(), reverse=False)
422 424 try:
423 # get sum of each list 425 # get sum of each list
424 sum1 = sum(list(cond1.itervalues())) 426 sum1 = sum(list(cond1.itervalues()))
425 sum2 = sum(list(cond2.itervalues())) 427 sum2 = sum(list(cond2.itervalues()))
426 # #Normalize values by sum of each libraries 428 # #Normalize values by sum of each libraries
427 cond1_norm.update ((x, (y / sum1) * 100.0) for x, y in cond1_norm.items()) 429 cond1_norm.update ((x, (y / sum1) * 100.0) for x, y in cond1_norm.items())
428 cond2_norm.update((x, (y / sum2) * 100.0) for x, y in cond2_norm.items()) 430 cond2_norm.update((x, (y / sum2) * 100.0) for x, y in cond2_norm.items())
429 431 except ZeroDivisionError:
432 stop_err("Not enough reads to compute the codon occupancy")
433
430 # # compute theorical count in COND2 434 # # compute theorical count in COND2
431 cond2_count = [] 435 cond2_count = []
432 for z in cond1_norm.itervalues() : 436 for z in cond1_norm.itervalues() :
433 count = int(z * sum2 / 100.0) 437 count = int(z * sum2 / 100.0)
434 cond2_count.append(count) 438 cond2_count.append(count)
446 cond1_aa.append(z[0]) 450 cond1_aa.append(z[0])
447 cond2_aa.append(z[1]) 451 cond2_aa.append(z[1])
448 max_val.append(max(z)) 452 max_val.append(max(z))
449 453
450 # # plot amino acid profile : 454 # # plot amino acid profile :
451 fig = pl.figure(num=1) 455 fig = pl.figure(figsize=(15,10), num=1)
452 width = .45 456 width = .45
453 ax = fig.add_subplot(111) 457 ax = fig.add_subplot(111)
454 ind = arange(21) 458 ind = arange(21)
455 pl.xlim(0, 21) 459 pl.xlim(0, 21)
456 #kwargs = {"hatch":'x'} 460 #kwargs = {"hatch":'x'}
490 for i in cond1: 494 for i in cond1:
491 # # max value for each codon 495 # # max value for each codon
492 max_val.append(max(cond1_norm[i], cond2_norm[i])) 496 max_val.append(max(cond1_norm[i], cond2_norm[i]))
493 497
494 # plot result 498 # plot result
495 fig = pl.figure(figsize=(40, 10), num=1) 499 fig = pl.figure(figsize=(20,10), num=1)
496 #fig = pl.figure(num=1) 500 #fig = pl.figure(num=1)
497 width = .40 501 width = .40
498 ind = arange(len(codon_sorted)) 502 ind = arange(len(codon_sorted))
499 ax = fig.add_subplot(111) 503 ax = fig.add_subplot(111)
500 pl.xlim(0, len(codon_sorted) + 1) 504 pl.xlim(0, len(codon_sorted) + 1)
535 <link href="/static/june_2007_style/blue/base.css" media="screen" rel="Stylesheet" type="text/css" /> 539 <link href="/static/june_2007_style/blue/base.css" media="screen" rel="Stylesheet" type="text/css" />
536 </head> 540 </head>
537 <body> 541 <body>
538 <h3>Global visualization</h3> 542 <h3>Global visualization</h3>
539 <p> 543 <p>
540 <h5>Visualization of density footprint in each codon.</h5><br> If user has selected analyse with replicats, standart error deviation between each replicate as plotting as error bar in histogram.<br> 544 <h5>Visualization of density footprint in each codon.</h5><br> If user has selected "Yes" for the replicate option the standard deviation between each replicate is plotted as an error bar in histogram.<br>
541 <img border="0" src="hist_codons.png" width="1040"/> 545 <img border="0" src="hist_codons.png" width="1040"/>
542 </p> 546 </p>
543 <p> 547 <p>
544 <h5>Test for homogeneity distribution between each condition</h5><br> 548 <h5>Test for homogeneity distribution between each condition</h5><br>
545 H0 : %s and %s are same distribution <br> 549 H0 : %s and %s are same distribution <br>
580 cmd = "samtools index %s " % (bamfile) 584 cmd = "samtools index %s " % (bamfile)
581 proc = subprocess.Popen(args=cmd, shell=True, stderr=subprocess.PIPE) 585 proc = subprocess.Popen(args=cmd, shell=True, stderr=subprocess.PIPE)
582 returncode = proc.wait() 586 returncode = proc.wait()
583 # if returncode != 0: 587 # if returncode != 0:
584 # raise Exception 588 # raise Exception
589
590 def plot_fc (cond1, cond2, site, dirout):
591
592 fc = cond1.copy()
593
594 for key, value in fc.iteritems():
595 fc[key] = cond2[key]/cond1[key]
596
597 index = arange(len(fc.keys()))
598 label = fc.keys()
599 label = [w.replace('T','U') for w in label]
600 pl.figure(figsize=(15,10), num=1)
601 ax = pl.subplot(1,1,1)
602 pl.xticks([])
603 pl.scatter(index, fc.values(), color='b')
604 pl.axhline(y=1,color='r')
605 pl.xticks(index, label, rotation=90)
606 pl.ylabel('Foldchange of codon occupancy')
607 ax.yaxis.set_ticks_position('left')
608 ax.xaxis.set_ticks_position('bottom')
609 pl.title(site+" site")
610 pl.savefig(dirout + '/fc_codons.png', format="png", dpi=340)
611
585 612
586 def __main__(): 613 def __main__():
587 614
588 615
589 # Parse command line options 616 # Parse command line options
673 for fh in itertools.chain(cond1, cond2): 700 for fh in itertools.chain(cond1, cond2):
674 check_index_bam (fh) 701 check_index_bam (fh)
675 result.append(get_codon_usage(fh, GFF, options.site, options.kmer, options.asite)) 702 result.append(get_codon_usage(fh, GFF, options.site, options.kmer, options.asite))
676 (cond1, cond2, chi_pval) = plot_codon_usage(result, html_dir, options.c1, options.c2, options.outfile,options.color1, options.color2) 703 (cond1, cond2, chi_pval) = plot_codon_usage(result, html_dir, options.c1, options.c2, options.outfile,options.color1, options.color2)
677 # t_pval = compute_FC_plot(cond1,cond2,codons,html_dir) 704 # t_pval = compute_FC_plot(cond1,cond2,codons,html_dir)
678 705 plot_fc (cond1, cond2, options.site, html_dir)
679 706
680 # #If there are no replicat 707 # #If there are no replicat
681 elif options.rep == "no" : 708 elif options.rep == "no" :
682 result = [] 709 result = []
683 # #calcul for each cond 710 # #calcul for each cond
684 for fh in (options.file1, options.file2): 711 for fh in (options.file1, options.file2):
685 check_index_bam (fh) 712 check_index_bam (fh)
686 result.append(get_codon_usage(fh, GFF, options.site, options.kmer,options.asite)) 713 result.append(get_codon_usage(fh, GFF, options.site, options.kmer,options.asite))
687 (cond1, cond2, chi_pval) = plot_codon_usage(result, html_dir, options.c1, options.c2, options.outfile,options.color1, options.color2) 714 (cond1, cond2, chi_pval) = plot_codon_usage(result, html_dir, options.c1, options.c2, options.outfile,options.color1, options.color2)
688 # t_pval = compute_FC_plot(cond1,cond2,codons,html_dir) 715 # t_pval = compute_FC_plot(cond1,cond2,codons,html_dir)
689 716 plot_fc (cond1, cond2, options.site, html_dir)
690 else : 717 else :
691 sys.stderr.write("Please enter yes or no for --rep option. Programme aborted at %s" % time.asctime(time.localtime(time.time()))) 718 sys.stderr.write("Please enter yes or no for --rep option. Programme aborted at %s" % time.asctime(time.localtime(time.time())))
692 sys.exit() 719 sys.exit()
693 720
694 # write_html_file(html,chi_pval,t_pval,codons,options.c1, options.c2) 721 # write_html_file(html,chi_pval,t_pval,codons,options.c1, options.c2)