Mercurial > repos > rlegendre > ribo_tools
comparison get_codon_frequency.py @ 15:702e60e819c2 draft
Uploaded
author | rlegendre |
---|---|
date | Mon, 11 May 2015 09:53:08 -0400 |
parents | 344bacf6acb8 |
children | fcfdb2607cb8 |
comparison
equal
deleted
inserted
replaced
14:344bacf6acb8 | 15:702e60e819c2 |
---|---|
9 ''' | 9 ''' |
10 | 10 |
11 from __future__ import division | 11 from __future__ import division |
12 import os, sys, optparse, tempfile, subprocess, re, shutil, commands, urllib, time | 12 import os, sys, optparse, tempfile, subprocess, re, shutil, commands, urllib, time |
13 import itertools | 13 import itertools |
14 import math | 14 from math import log10 |
15 from decimal import Decimal | 15 from decimal import Decimal |
16 from Bio import SeqIO | 16 from Bio import SeqIO |
17 from Bio.Seq import Seq | 17 from Bio.Seq import Seq |
18 from numpy import arange, std, array, linspace, average | 18 from numpy import arange, std, array, linspace, average |
19 #from matplotlib import pyplot as pl | 19 #from matplotlib import pyplot as pl |
27 from collections import OrderedDict | 27 from collections import OrderedDict |
28 import ribo_functions | 28 import ribo_functions |
29 import HTSeq | 29 import HTSeq |
30 # #libraries for debugg | 30 # #libraries for debugg |
31 #import pdb | 31 #import pdb |
32 # import cPickle | 32 import cPickle |
33 | 33 |
34 def stop_err(msg): | 34 def stop_err(msg): |
35 sys.stderr.write("%s\n" % msg) | 35 sys.stderr.write("%s\n" % msg) |
36 sys.stderr.write("Programme aborted at %s\n" % time.asctime(time.localtime(time.time()))) | 36 sys.stderr.write("Programme aborted at %s\n" % time.asctime(time.localtime(time.time()))) |
37 sys.exit() | 37 sys.exit() |
300 cond1_2 = result[1].copy() | 300 cond1_2 = result[1].copy() |
301 cond2_1 = result[2].copy() | 301 cond2_1 = result[2].copy() |
302 cond2_2 = result[3].copy() | 302 cond2_2 = result[3].copy() |
303 # get codon order in one of list | 303 # get codon order in one of list |
304 codon_sorted = sorted(cond1_1.iterkeys(), reverse=False) | 304 codon_sorted = sorted(cond1_1.iterkeys(), reverse=False) |
305 # get max of each list | 305 try: |
306 sum11 = sum(list(cond1_1.itervalues())) | 306 # get max of each list |
307 sum12 = sum(list(cond1_2.itervalues())) | 307 sum11 = sum(list(cond1_1.itervalues())) |
308 sum21 = sum(list(cond2_1.itervalues())) | 308 sum12 = sum(list(cond1_2.itervalues())) |
309 sum22 = sum(list(cond2_2.itervalues())) | 309 sum21 = sum(list(cond2_1.itervalues())) |
310 # for each codon, get values and sd in each condition | 310 sum22 = sum(list(cond2_2.itervalues())) |
311 cond1_val = {} | 311 # for each codon, get values and sd in each condition |
312 cond1 = {} | 312 cond1_val = {} |
313 cond2_val = {} | 313 cond1 = {} |
314 cond2 = {} | 314 cond2_val = {} |
315 std_cond1 = [] | 315 cond2 = {} |
316 std_cond2 = [] | 316 std_cond1 = [] |
317 max_val = [] # # max value for graph | 317 std_cond2 = [] |
318 for i in codon_sorted: | 318 max_val = [] # # max value for graph |
319 # # cond1 = moyenne of replicats cond1 divided by max | 319 for i in codon_sorted: |
320 cond1_val[i] = ((cond1_1[i] / sum11 + cond1_2[i] / sum12) / 2) | 320 # # cond1 = moyenne of replicats cond1 divided by max |
321 cond1[i] = ((cond1_1[i] + cond1_2[i]) / 2) | 321 cond1_val[i] = ((cond1_1[i] / sum11 + cond1_2[i] / sum12) / 2) |
322 # # standard deviation = absolute value of diffence between replicats of cond1 | 322 cond1[i] = ((cond1_1[i] + cond1_2[i]) / 2) |
323 std_cond1.append(std(array([(cond1_1[i] * 100 / sum11), (cond1_2[i] * 100 / sum12)]))) | 323 # # standard deviation = absolute value of diffence between replicats of cond1 |
324 # # cond2 = moyenne of replicats cond1divided by max | 324 std_cond1.append(std(array([(cond1_1[i] * 100 / sum11), (cond1_2[i] * 100 / sum12)]))) |
325 cond2_val[i] = ((cond2_1[i] / sum21 + cond2_2[i] / sum22) / 2) | 325 # # cond2 = moyenne of replicats cond1divided by max |
326 cond2[i] = ((cond2_1[i] + cond2_2[i]) / 2) | 326 cond2_val[i] = ((cond2_1[i] / sum21 + cond2_2[i] / sum22) / 2) |
327 # # standard deviation = absolute value of diffence between replicats of cond2 | 327 cond2[i] = ((cond2_1[i] + cond2_2[i]) / 2) |
328 std_cond2.append(std(array([((cond2_1[i]) * 100 / sum21), ((cond2_2[i]) * 100 / sum22)]))) | 328 # # standard deviation = absolute value of diffence between replicats of cond2 |
329 # # max value for each codon | 329 std_cond2.append(std(array([((cond2_1[i]) * 100 / sum21), ((cond2_2[i]) * 100 / sum22)]))) |
330 max_val.append(max((cond1_1[i] / sum11 + cond1_2[i] / sum12) / 2, (cond2_1[i] / sum21 + cond2_2[i] / sum22) / 2)) | 330 # # max value for each codon |
331 | 331 max_val.append(max((cond1_1[i] / sum11 + cond1_2[i] / sum12) / 2, (cond2_1[i] / sum21 + cond2_2[i] / sum22) / 2)) |
332 # for graph design | 332 |
333 cond1_norm = OrderedDict(sorted(cond1_val.items(), key=lambda t: t[0])) | 333 # for graph design |
334 cond1_norm.update ((x, y * 100) for x, y in cond1_norm.items()) | 334 cond1_norm = OrderedDict(sorted(cond1_val.items(), key=lambda t: t[0])) |
335 cond2_norm = OrderedDict(sorted(cond2_val.items(), key=lambda t: t[0])) | 335 cond1_norm.update ((x, y * 100) for x, y in cond1_norm.items()) |
336 cond2_norm.update ((x, y * 100) for x, y in cond2_norm.items()) | 336 cond2_norm = OrderedDict(sorted(cond2_val.items(), key=lambda t: t[0])) |
337 max_val = [x * 100 for x in max_val] | 337 cond2_norm.update ((x, y * 100) for x, y in cond2_norm.items()) |
338 max_val = [x * 100 for x in max_val] | |
339 except ZeroDivisionError: | |
340 stop_err("Not enough reads to compute the codon occupancy") | |
338 | 341 |
339 AA = get_aa_dict(cond1_norm, cond2_norm) | 342 AA = get_aa_dict(cond1_norm, cond2_norm) |
340 max_valaa = [] | 343 max_valaa = [] |
341 cond1_aa = [] | 344 cond1_aa = [] |
342 cond2_aa = [] | 345 cond2_aa = [] |
344 for z in AA.itervalues(): | 347 for z in AA.itervalues(): |
345 cond1_aa.append(z[0]) | 348 cond1_aa.append(z[0]) |
346 cond2_aa.append(z[1]) | 349 cond2_aa.append(z[1]) |
347 max_valaa.append(max(z)) | 350 max_valaa.append(max(z)) |
348 # # plot amino acid profile : | 351 # # plot amino acid profile : |
349 fig = pl.figure(figsize=(30, 10), num=1) | 352 fig = pl.figure(figsize=(15,10), num=1) |
350 width = .50 | 353 width = .50 |
351 ax = fig.add_subplot(111) | 354 ax = fig.add_subplot(111) |
352 ax.xaxis.set_ticks([]) | 355 ax.xaxis.set_ticks([]) |
353 ind = arange(21) | 356 ind = arange(21) |
354 pl.xlim(0, 21) | 357 pl.xlim(0, 21) |
384 out.write('T : ' + str(chi[0]) + '; p-value : ' + str(chi[1]) + '\n') | 387 out.write('T : ' + str(chi[0]) + '; p-value : ' + str(chi[1]) + '\n') |
385 | 388 |
386 | 389 |
387 | 390 |
388 # plot result | 391 # plot result |
389 fig = pl.figure(figsize=(30, 10), num=1) | 392 fig = pl.figure(figsize=(20,10), num=1) |
390 width = .40 | 393 width = .40 |
391 ind = arange(len(codon_sorted)) | 394 ind = arange(len(codon_sorted)) |
392 ax = fig.add_subplot(111) | 395 ax = fig.add_subplot(111) |
393 pl.xlim(0, len(codon_sorted) + 1) | 396 pl.xlim(0, len(codon_sorted) + 1) |
394 ax.spines['right'].set_color('none') | 397 ax.spines['right'].set_color('none') |
404 ax.set_xlabel('Codons') | 407 ax.set_xlabel('Codons') |
405 handles, labels = ax.get_legend_handles_labels() | 408 handles, labels = ax.get_legend_handles_labels() |
406 ax.legend(handles, labels) | 409 ax.legend(handles, labels) |
407 pl.savefig(dirout + '/hist_codons.png', format="png", dpi=340) | 410 pl.savefig(dirout + '/hist_codons.png', format="png", dpi=340) |
408 pl.clf() | 411 pl.clf() |
409 | |
410 | 412 |
411 | 413 |
412 elif len(result) == 2 : | 414 elif len(result) == 2 : |
413 | 415 |
414 # store each dict in OrderedDict sorted by key to make code more readable | 416 # store each dict in OrderedDict sorted by key to make code more readable |
417 cond1_norm = result[0].copy() | 419 cond1_norm = result[0].copy() |
418 cond2_norm = result[1].copy() | 420 cond2_norm = result[1].copy() |
419 # pdb.set_trace() | 421 # pdb.set_trace() |
420 # get codon order in one of list | 422 # get codon order in one of list |
421 codon_sorted = sorted(cond1.iterkeys(), reverse=False) | 423 codon_sorted = sorted(cond1.iterkeys(), reverse=False) |
422 | 424 try: |
423 # get sum of each list | 425 # get sum of each list |
424 sum1 = sum(list(cond1.itervalues())) | 426 sum1 = sum(list(cond1.itervalues())) |
425 sum2 = sum(list(cond2.itervalues())) | 427 sum2 = sum(list(cond2.itervalues())) |
426 # #Normalize values by sum of each libraries | 428 # #Normalize values by sum of each libraries |
427 cond1_norm.update ((x, (y / sum1) * 100.0) for x, y in cond1_norm.items()) | 429 cond1_norm.update ((x, (y / sum1) * 100.0) for x, y in cond1_norm.items()) |
428 cond2_norm.update((x, (y / sum2) * 100.0) for x, y in cond2_norm.items()) | 430 cond2_norm.update((x, (y / sum2) * 100.0) for x, y in cond2_norm.items()) |
429 | 431 except ZeroDivisionError: |
432 stop_err("Not enough reads to compute the codon occupancy") | |
433 | |
430 # # compute theorical count in COND2 | 434 # # compute theorical count in COND2 |
431 cond2_count = [] | 435 cond2_count = [] |
432 for z in cond1_norm.itervalues() : | 436 for z in cond1_norm.itervalues() : |
433 count = int(z * sum2 / 100.0) | 437 count = int(z * sum2 / 100.0) |
434 cond2_count.append(count) | 438 cond2_count.append(count) |
446 cond1_aa.append(z[0]) | 450 cond1_aa.append(z[0]) |
447 cond2_aa.append(z[1]) | 451 cond2_aa.append(z[1]) |
448 max_val.append(max(z)) | 452 max_val.append(max(z)) |
449 | 453 |
450 # # plot amino acid profile : | 454 # # plot amino acid profile : |
451 fig = pl.figure(num=1) | 455 fig = pl.figure(figsize=(15,10), num=1) |
452 width = .45 | 456 width = .45 |
453 ax = fig.add_subplot(111) | 457 ax = fig.add_subplot(111) |
454 ind = arange(21) | 458 ind = arange(21) |
455 pl.xlim(0, 21) | 459 pl.xlim(0, 21) |
456 #kwargs = {"hatch":'x'} | 460 #kwargs = {"hatch":'x'} |
490 for i in cond1: | 494 for i in cond1: |
491 # # max value for each codon | 495 # # max value for each codon |
492 max_val.append(max(cond1_norm[i], cond2_norm[i])) | 496 max_val.append(max(cond1_norm[i], cond2_norm[i])) |
493 | 497 |
494 # plot result | 498 # plot result |
495 fig = pl.figure(figsize=(40, 10), num=1) | 499 fig = pl.figure(figsize=(20,10), num=1) |
496 #fig = pl.figure(num=1) | 500 #fig = pl.figure(num=1) |
497 width = .40 | 501 width = .40 |
498 ind = arange(len(codon_sorted)) | 502 ind = arange(len(codon_sorted)) |
499 ax = fig.add_subplot(111) | 503 ax = fig.add_subplot(111) |
500 pl.xlim(0, len(codon_sorted) + 1) | 504 pl.xlim(0, len(codon_sorted) + 1) |
535 <link href="/static/june_2007_style/blue/base.css" media="screen" rel="Stylesheet" type="text/css" /> | 539 <link href="/static/june_2007_style/blue/base.css" media="screen" rel="Stylesheet" type="text/css" /> |
536 </head> | 540 </head> |
537 <body> | 541 <body> |
538 <h3>Global visualization</h3> | 542 <h3>Global visualization</h3> |
539 <p> | 543 <p> |
540 <h5>Visualization of density footprint in each codon.</h5><br> If user has selected analyse with replicats, standart error deviation between each replicate as plotting as error bar in histogram.<br> | 544 <h5>Visualization of density footprint in each codon.</h5><br> If user has selected "Yes" for the replicate option the standard deviation between each replicate is plotted as an error bar in histogram.<br> |
541 <img border="0" src="hist_codons.png" width="1040"/> | 545 <img border="0" src="hist_codons.png" width="1040"/> |
542 </p> | 546 </p> |
543 <p> | 547 <p> |
544 <h5>Test for homogeneity distribution between each condition</h5><br> | 548 <h5>Test for homogeneity distribution between each condition</h5><br> |
545 H0 : %s and %s are same distribution <br> | 549 H0 : %s and %s are same distribution <br> |
580 cmd = "samtools index %s " % (bamfile) | 584 cmd = "samtools index %s " % (bamfile) |
581 proc = subprocess.Popen(args=cmd, shell=True, stderr=subprocess.PIPE) | 585 proc = subprocess.Popen(args=cmd, shell=True, stderr=subprocess.PIPE) |
582 returncode = proc.wait() | 586 returncode = proc.wait() |
583 # if returncode != 0: | 587 # if returncode != 0: |
584 # raise Exception | 588 # raise Exception |
589 | |
590 def plot_fc (cond1, cond2, site, dirout): | |
591 | |
592 fc = cond1.copy() | |
593 | |
594 for key, value in fc.iteritems(): | |
595 fc[key] = cond2[key]/cond1[key] | |
596 | |
597 index = arange(len(fc.keys())) | |
598 label = fc.keys() | |
599 label = [w.replace('T','U') for w in label] | |
600 pl.figure(figsize=(15,10), num=1) | |
601 ax = pl.subplot(1,1,1) | |
602 pl.xticks([]) | |
603 pl.scatter(index, fc.values(), color='b') | |
604 pl.axhline(y=1,color='r') | |
605 pl.xticks(index, label, rotation=90) | |
606 pl.ylabel('Foldchange of codon occupancy') | |
607 ax.yaxis.set_ticks_position('left') | |
608 ax.xaxis.set_ticks_position('bottom') | |
609 pl.title(site+" site") | |
610 pl.savefig(dirout + '/fc_codons.png', format="png", dpi=340) | |
611 | |
585 | 612 |
586 def __main__(): | 613 def __main__(): |
587 | 614 |
588 | 615 |
589 # Parse command line options | 616 # Parse command line options |
673 for fh in itertools.chain(cond1, cond2): | 700 for fh in itertools.chain(cond1, cond2): |
674 check_index_bam (fh) | 701 check_index_bam (fh) |
675 result.append(get_codon_usage(fh, GFF, options.site, options.kmer, options.asite)) | 702 result.append(get_codon_usage(fh, GFF, options.site, options.kmer, options.asite)) |
676 (cond1, cond2, chi_pval) = plot_codon_usage(result, html_dir, options.c1, options.c2, options.outfile,options.color1, options.color2) | 703 (cond1, cond2, chi_pval) = plot_codon_usage(result, html_dir, options.c1, options.c2, options.outfile,options.color1, options.color2) |
677 # t_pval = compute_FC_plot(cond1,cond2,codons,html_dir) | 704 # t_pval = compute_FC_plot(cond1,cond2,codons,html_dir) |
678 | 705 plot_fc (cond1, cond2, options.site, html_dir) |
679 | 706 |
680 # #If there are no replicat | 707 # #If there are no replicat |
681 elif options.rep == "no" : | 708 elif options.rep == "no" : |
682 result = [] | 709 result = [] |
683 # #calcul for each cond | 710 # #calcul for each cond |
684 for fh in (options.file1, options.file2): | 711 for fh in (options.file1, options.file2): |
685 check_index_bam (fh) | 712 check_index_bam (fh) |
686 result.append(get_codon_usage(fh, GFF, options.site, options.kmer,options.asite)) | 713 result.append(get_codon_usage(fh, GFF, options.site, options.kmer,options.asite)) |
687 (cond1, cond2, chi_pval) = plot_codon_usage(result, html_dir, options.c1, options.c2, options.outfile,options.color1, options.color2) | 714 (cond1, cond2, chi_pval) = plot_codon_usage(result, html_dir, options.c1, options.c2, options.outfile,options.color1, options.color2) |
688 # t_pval = compute_FC_plot(cond1,cond2,codons,html_dir) | 715 # t_pval = compute_FC_plot(cond1,cond2,codons,html_dir) |
689 | 716 plot_fc (cond1, cond2, options.site, html_dir) |
690 else : | 717 else : |
691 sys.stderr.write("Please enter yes or no for --rep option. Programme aborted at %s" % time.asctime(time.localtime(time.time()))) | 718 sys.stderr.write("Please enter yes or no for --rep option. Programme aborted at %s" % time.asctime(time.localtime(time.time()))) |
692 sys.exit() | 719 sys.exit() |
693 | 720 |
694 # write_html_file(html,chi_pval,t_pval,codons,options.c1, options.c2) | 721 # write_html_file(html,chi_pval,t_pval,codons,options.c1, options.c2) |