Mercurial > repos > rlegendre > ribo_tools
comparison get_codon_frequency.py @ 10:707807fee542
(none)
author | rlegendre |
---|---|
date | Thu, 22 Jan 2015 14:34:38 +0100 |
parents | b8c070add3b7 |
children | 7c944fd9907e |
comparison
equal
deleted
inserted
replaced
9:d7739f797a26 | 10:707807fee542 |
---|---|
23 from matplotlib import font_manager | 23 from matplotlib import font_manager |
24 from matplotlib import colors | 24 from matplotlib import colors |
25 import csv | 25 import csv |
26 from scipy import stats | 26 from scipy import stats |
27 from collections import OrderedDict | 27 from collections import OrderedDict |
28 import ribo_functions | |
29 import HTSeq | |
28 # #libraries for debugg | 30 # #libraries for debugg |
29 # import pdb | 31 import pdb |
30 # import cPickle | 32 # import cPickle |
31 | 33 |
32 def stop_err(msg): | 34 def stop_err(msg): |
33 sys.stderr.write("%s\n" % msg) | 35 sys.stderr.write("%s\n" % msg) |
34 sys.stderr.write("Programme aborted at %s\n" % time.asctime(time.localtime(time.time()))) | 36 sys.stderr.write("Programme aborted at %s\n" % time.asctime(time.localtime(time.time()))) |
35 sys.exit() | 37 sys.exit() |
36 | 38 |
37 | |
38 def store_gff(gff): | 39 def store_gff(gff): |
39 ''' | 40 ''' |
40 parse and store gff file in a dictionnary | 41 parse and store gff file in a dictionnary |
41 ''' | 42 ''' |
42 try: | 43 try: |
96 #7,GO:0006906,GO:0030658,GO:0031201;Note=Vesicle%20membrane%20receptor%20protein%20%28v-SNARE%29%3B%20involved%20in%20the%20fusion%20between%20Golgi-derived%20secretory%20vesicles%20with%20the%20plasma%20membra | 97 #7,GO:0006906,GO:0030658,GO:0031201;Note=Vesicle%20membrane%20receptor%20protein%20%28v-SNARE%29%3B%20involved%20in%20the%20fusion%20between%20Golgi-derived%20secretory%20vesicles%20with%20the%20plasma%20membra |
97 #ne%3B%20proposed%20to%20be%20involved%20in%20endocytosis%3B%20member%20of%20the%20synaptobrevin%2FVAMP%20family%20of%20R-type%20v-SNARE%20proteins%3B%20SNC1%20has%20a%20paralog%2C%20SNC2%2C%20that%20arose%20fr | 98 #ne%3B%20proposed%20to%20be%20involved%20in%20endocytosis%3B%20member%20of%20the%20synaptobrevin%2FVAMP%20family%20of%20R-type%20v-SNARE%20proteins%3B%20SNC1%20has%20a%20paralog%2C%20SNC2%2C%20that%20arose%20fr |
98 #om%20the%20whole%20genome%20duplication;display=Vesicle%20membrane%20receptor%20protein%20%28v-SNARE%29;dbxref=SGD:S000000028;orf_classification=Verified | 99 #om%20the%20whole%20genome%20duplication;display=Vesicle%20membrane%20receptor%20protein%20%28v-SNARE%29;dbxref=SGD:S000000028;orf_classification=Verified |
99 #chrI SGD CDS 87286 87387 . + 0 Parent=YAL030W_mRNA;Name=YAL030W_CDS;orf_classification=Verified | 100 #chrI SGD CDS 87286 87387 . + 0 Parent=YAL030W_mRNA;Name=YAL030W_CDS;orf_classification=Verified |
100 #chrI SGD CDS 87501 87752 . + 0 Parent=YAL030W_mRNA;Name=YAL030W_CDS;orf_classification=Verified | 101 #chrI SGD CDS 87501 87752 . + 0 Parent=YAL030W_mRNA;Name=YAL030W_CDS;orf_classification=Verified |
101 | 102 |
102 | |
103 | 103 |
104 def init_codon_dict(): | 104 def init_codon_dict(): |
105 | 105 |
106 Codon_dict = OrderedDict([('AAA', 0), ('AAC', 0), ('AAG', 0), ('AAT', 0), ('ACA', 0), ('ACC', 0), ('ACG', 0), ('ACT', 0), ('AGA', 0), ('AGC', 0), ('AGG', 0), ('AGT', 0), ('ATA', 0), ('ATC', 0), ('ATG', 0), ('ATT', 0), ('CAA', 0), ('CAC', 0), ('CAG', 0), ('CAT', 0), ('CCA', 0), ('CCC', 0), ('CCG', 0), ('CCT', 0), ('CGA', 0), ('CGC', 0), ('CGG', 0), ('CGT', 0), ('CTA', 0), ('CTC', 0), ('CTG', 0), ('CTT', 0), ('GAA', 0), ('GAC', 0), ('GAG', 0), ('GAT', 0), ('GCA', 0), ('GCC', 0), ('GCG', 0), ('GCT', 0), ('GGA', 0), ('GGC', 0), ('GGG', 0), ('GGT', 0), ('GTA', 0), ('GTC', 0), ('GTG', 0), ('GTT', 0), ('TAA', 0), ('TAC', 0), ('TAG', 0), ('TAT', 0), ('TCA', 0), ('TCC', 0), ('TCG', 0), ('TCT', 0), ('TGA', 0), ('TGC', 0), ('TGG', 0), ('TGT', 0), ('TTA', 0), ('TTC', 0), ('TTG', 0), ('TTT', 0)]) | 106 Codon_dict = OrderedDict([('AAA', 0), ('AAC', 0), ('AAG', 0), ('AAT', 0), ('ACA', 0), ('ACC', 0), ('ACG', 0), ('ACT', 0), ('AGA', 0), ('AGC', 0), ('AGG', 0), ('AGT', 0), ('ATA', 0), ('ATC', 0), ('ATG', 0), ('ATT', 0), ('CAA', 0), ('CAC', 0), ('CAG', 0), ('CAT', 0), ('CCA', 0), ('CCC', 0), ('CCG', 0), ('CCT', 0), ('CGA', 0), ('CGC', 0), ('CGG', 0), ('CGT', 0), ('CTA', 0), ('CTC', 0), ('CTG', 0), ('CTT', 0), ('GAA', 0), ('GAC', 0), ('GAG', 0), ('GAT', 0), ('GCA', 0), ('GCC', 0), ('GCG', 0), ('GCT', 0), ('GGA', 0), ('GGC', 0), ('GGG', 0), ('GGT', 0), ('GTA', 0), ('GTC', 0), ('GTG', 0), ('GTT', 0), ('TAA', 0), ('TAC', 0), ('TAG', 0), ('TAT', 0), ('TCA', 0), ('TCC', 0), ('TCG', 0), ('TCT', 0), ('TGA', 0), ('TGC', 0), ('TGG', 0), ('TGT', 0), ('TTA', 0), ('TTC', 0), ('TTG', 0), ('TTT', 0)]) |
107 return Codon_dict | 107 return Codon_dict |
113 Read GFF dict and get gene codon usage. | 113 Read GFF dict and get gene codon usage. |
114 Return dict of codons usage | 114 Return dict of codons usage |
115 ''' | 115 ''' |
116 try: | 116 try: |
117 codon = init_codon_dict() | 117 codon = init_codon_dict() |
118 | 118 for feature in GFF : |
119 for chrom in GFF.iterkeys(): | 119 if feature.type == 'gene' : |
120 for gene in GFF[chrom] : | |
121 codon_dict = init_codon_dict() | 120 codon_dict = init_codon_dict() |
122 start = GFF[chrom][gene]['start'] | 121 chrom = feature.iv.chrom |
123 stop = GFF[chrom][gene]['stop'] | 122 start = feature.iv.start |
123 stop = feature.iv.end | |
124 region = chrom + ':' + str(start) + '-' + str(stop) | 124 region = chrom + ':' + str(start) + '-' + str(stop) |
125 | |
126 ## DEPRECATED | |
127 #for chrom in GFF.iterkeys(): | |
128 #for gene in GFF[chrom] : | |
129 # codon_dict = init_codon_dict() | |
130 #start = GFF[chrom][gene]['start'] | |
131 #print start | |
132 #stop = GFF[chrom][gene]['stop'] | |
133 #print stop | |
134 #region = chrom + ':' + str(start) + '-' + str(stop) | |
135 ####### | |
125 # #get all reads in this gene | 136 # #get all reads in this gene |
126 reads = subprocess.check_output(["samtools", "view", bamfile, region]) | 137 reads = subprocess.check_output(["samtools", "view", bamfile, region]) |
127 head = subprocess.check_output(["samtools", "view", "-H", bamfile]) | 138 head = subprocess.check_output(["samtools", "view", "-H", bamfile]) |
128 read_tab = reads.split('\n') | 139 read_tab = reads.split('\n') |
129 for read in read_tab: | 140 for read in read_tab: |
130 # # search mapper for eliminate multiple alignements | 141 # # search mapper for eliminate multiple alignements |
131 if 'bowtie' in head: | 142 if 'bowtie' in head: |
132 multi_tag = "XS:i:" | 143 multi_tag = "XS:i:" |
133 elif 'bwa' in head: | 144 elif 'bwa' in head: |
134 multi_tag = "XT:A:R" | 145 multi_tag = "XT:A:R" |
135 else : | 146 else : |
136 stop_err("No PG tag find in"+samfile+". Please use bowtie or bwa for mapping") | 147 stop_err("No PG tag find in"+samfile+". Please use bowtie or bwa for mapping") |
137 if len(read) == 0: | 148 if len(read) == 0: |
138 continue | 149 continue |
139 | |
140 len_read = len(read.split('\t')[9]) | 150 len_read = len(read.split('\t')[9]) |
141 # if it's read of good length | 151 # if it's read of good length |
142 if len_read == kmer and multi_tag not in read: | 152 if len_read == kmer and multi_tag not in read: |
143 feat = read.split('\t') | 153 feat = read.split('\t') |
144 seq = feat[9] | 154 seq = feat[9] |
177 | 187 |
178 except Exception, e: | 188 except Exception, e: |
179 stop_err('Error during codon usage calcul: ' + str(e)) | 189 stop_err('Error during codon usage calcul: ' + str(e)) |
180 | 190 |
181 | 191 |
192 | |
193 | |
182 ''' | 194 ''' |
183 http://pyinsci.blogspot.fr/2009/09/violin-plot-with-matplotlib.html | 195 http://pyinsci.blogspot.fr/2009/09/violin-plot-with-matplotlib.html |
184 ''' | 196 ''' |
185 def violin_plot(ax, data, pos, bp=False): | 197 def violin_plot(ax, data, pos, bp=False): |
186 ''' | 198 ''' |
489 cond2_aa.append(z[1]) | 501 cond2_aa.append(z[1]) |
490 max_val.append(max(z)) | 502 max_val.append(max(z)) |
491 | 503 |
492 # # plot amino acid profile : | 504 # # plot amino acid profile : |
493 fig = pl.figure(num=1) | 505 fig = pl.figure(num=1) |
494 width = .35 | 506 width = .45 |
495 ax = fig.add_subplot(111) | 507 ax = fig.add_subplot(111) |
496 ind = arange(21) | 508 ind = arange(21) |
497 pl.xlim(0, 21) | 509 pl.xlim(0, 21) |
498 #kwargs = {"hatch":'x'} | 510 #kwargs = {"hatch":'x'} |
499 #ax.bar(ind, cond1_aa, width, facecolor=color1, label=c1, **kwargs) | 511 #ax.bar(ind, cond1_aa, width, facecolor=color1, label=c1, **kwargs) |
501 #ax.bar(ind + width, cond2_aa, width, facecolor=color2, label=c2, **kwargs) | 513 #ax.bar(ind + width, cond2_aa, width, facecolor=color2, label=c2, **kwargs) |
502 ax.bar(ind, cond1_aa, width, facecolor=color1, label=c1) | 514 ax.bar(ind, cond1_aa, width, facecolor=color1, label=c1) |
503 ax.bar(ind + width, cond2_aa, width, facecolor=color2, label=c2) | 515 ax.bar(ind + width, cond2_aa, width, facecolor=color2, label=c2) |
504 #for x, y, z in zip(ind, max_val, aa_name): | 516 #for x, y, z in zip(ind, max_val, aa_name): |
505 # ax.text(x + width, y + 0.2, '%s' % z, ha='center', va='bottom', fontsize=14) | 517 # ax.text(x + width, y + 0.2, '%s' % z, ha='center', va='bottom', fontsize=14) |
506 axis_font = {'size':'16'} | 518 axis_font = {'size':'10'} |
507 pl.xticks(ind + width, aa_name,**axis_font) | 519 pl.xticks(ind + width, aa_name,**axis_font) |
508 ax.spines['right'].set_visible(False) | 520 ax.spines['right'].set_visible(False) |
509 ax.spines['top'].set_visible(False) | 521 ax.spines['top'].set_visible(False) |
510 ax.yaxis.set_ticks_position('left') | 522 ax.yaxis.set_ticks_position('left') |
511 ax.xaxis.set_ticks_position('bottom') | 523 ax.xaxis.set_ticks_position('bottom') |
512 #ax.xaxis.set_ticks([]) | 524 #ax.xaxis.set_ticks([]) |
513 ax.set_ylabel('Ribosome Occupancy (percent of normalized reads)',**axis_font) | 525 ax.set_ylabel('Ribosome Occupancy (percent of normalized reads)',**axis_font) |
514 ax.set_xlabel('Amino Acids', **axis_font) | 526 ax.set_xlabel('Amino Acids', **axis_font) |
515 handles, labels = ax.get_legend_handles_labels() | 527 handles, labels = ax.get_legend_handles_labels() |
516 font_prop = font_manager.FontProperties(size=12) | 528 font_prop = font_manager.FontProperties(size=8) |
517 ax.legend(handles, labels, prop=font_prop) | 529 ax.legend(handles, labels, prop=font_prop) |
518 pl.savefig(dirout + '/hist_amino_acid.png', format="png", dpi=340) | 530 pl.savefig(dirout + '/hist_amino_acid.png', format="png", dpi=340) |
519 pl.clf() | 531 pl.clf() |
520 | 532 |
521 # write result | 533 # write result |
532 for i in cond1: | 544 for i in cond1: |
533 # # max value for each codon | 545 # # max value for each codon |
534 max_val.append(max(cond1_norm[i], cond2_norm[i])) | 546 max_val.append(max(cond1_norm[i], cond2_norm[i])) |
535 | 547 |
536 # plot result | 548 # plot result |
537 fig = pl.figure(figsize=(24, 10), num=1) | 549 fig = pl.figure(figsize=(30, 10), num=1) |
538 width = .50 | 550 #fig = pl.figure(num=1) |
551 width = .40 | |
539 ind = arange(len(codon_sorted)) | 552 ind = arange(len(codon_sorted)) |
540 ax = fig.add_subplot(111) | 553 ax = fig.add_subplot(111) |
541 pl.xlim(0, len(codon_sorted) + 1) | 554 pl.xlim(0, len(codon_sorted) + 1) |
542 ax.spines['right'].set_color('none') | 555 ax.spines['right'].set_color('none') |
543 ax.spines['top'].set_color('none') | 556 ax.spines['top'].set_color('none') |
623 returncode = proc.wait() | 636 returncode = proc.wait() |
624 # if returncode != 0: | 637 # if returncode != 0: |
625 # raise Exception | 638 # raise Exception |
626 | 639 |
627 def __main__(): | 640 def __main__(): |
628 ''' | 641 |
629 python /home/rlegendre/galaxy/galaxy-dist/tools/rib_profiling/get_codon_frequency.py -i /home/rlegendre/galaxy/galaxy-dist/SharedData/Ribo/Saccer3.fa -g Saccer3.gff -t tAI.csv -1 psiM1_sorted.bam,psiM2_sorted.bam -2 psiP1_sorted.bam,psiP2_sorted.bam -c psiM -C psiP -l TAG,TAA,TGA -r yes -o psi_count -d psi.html,html_dir > log2 | |
630 python /home/rlegendre/galaxy/galaxy-dist/tools/rib_profiling/get_codon_frequency.py -i /home/rlegendre/galaxy/galaxy-dist/SharedData/Ribo/Saccer3.fa -g Saccer3.gff -t tAI.csv -c psiM -C psiP -1 RPF_psi-_28sorted.bam -2 RPF_psi+_28sorted.bam -l TAG,TAA,TGA -n Stop Codon -r no -o psi_count -d psi.html,html_dir > log2 | |
631 ''' | |
632 | 642 |
633 # Parse command line options | 643 # Parse command line options |
634 parser = optparse.OptionParser() | 644 parser = optparse.OptionParser() |
635 parser.add_option("-g", "--gff", dest="gff", type="string", | 645 parser.add_option("-g", "--gff", dest="gff", type="string", |
636 help="gff file", metavar="FILE") | 646 help="gff file", metavar="FILE") |
646 | 656 |
647 parser.add_option("-C", "--cond2", dest="c2", type="string", | 657 parser.add_option("-C", "--cond2", dest="c2", type="string", |
648 help="Name of second condition", metavar="STR") | 658 help="Name of second condition", metavar="STR") |
649 | 659 |
650 parser.add_option("-k", "--kmer", dest="kmer", type="int", | 660 parser.add_option("-k", "--kmer", dest="kmer", type="int", |
651 help="Longer of your phasing reads", metavar="INT") | 661 help="Length of your phasing reads", metavar="INT") |
652 | 662 |
653 # parser.add_option("-l", "--list", dest="list_cod", type= "string", | 663 # parser.add_option("-l", "--list", dest="list_cod", type= "string", |
654 # help="list of codons to compare to other", metavar="STR") | 664 # help="list of codons to compare to other", metavar="STR") |
655 | 665 |
656 parser.add_option("-o", "--out", dest="outfile", type="string", | 666 parser.add_option("-o", "--out", dest="outfile", type="string", |
690 if not colors.is_color_like(options.color1) : | 700 if not colors.is_color_like(options.color1) : |
691 stop_err( options.color1+' is not a proper color' ) | 701 stop_err( options.color1+' is not a proper color' ) |
692 if not colors.is_color_like(options.color2) : | 702 if not colors.is_color_like(options.color2) : |
693 stop_err( options.color2+' is not a proper color' ) | 703 stop_err( options.color2+' is not a proper color' ) |
694 | 704 |
695 GFF = store_gff(options.gff) | 705 ## identify GFF or GTF format from 9th column |
706 #with open (options.gff,"r") as gffile : | |
707 # for line in gffile : | |
708 # if '#' in line : | |
709 # ## skip header | |
710 # gffile.next() | |
711 # elif 'gene_id' in line : | |
712 # ## launch gtf reader : | |
713 # GFF = ribo_functions.store_gtf(options.gff) | |
714 # break | |
715 # elif 'ID=' in line : | |
716 # ## launch gff reader | |
717 # GFF = ribo_functions.store_gff(options.gff) | |
718 # break | |
719 # else : | |
720 # stop_err( 'Please check your annotation file is in correct format, GFF or GTF' ) | |
721 | |
722 #GFF = store_gff(options.gff) | |
723 #GFF = ribo_functions.store_gtf(options.gff) | |
724 ## check gff reading | |
725 #if not GFF['order'] : | |
726 # stop_err( 'Incorrect GFF file' + str( e ) ) | |
696 | 727 |
697 #### NOT USE IN FINAL VERSION | 728 #### NOT USE IN FINAL VERSION |
698 # # get codon list | 729 # # get codon list |
699 # codons = options.list_cod.upper().split(',') | 730 # codons = options.list_cod.upper().split(',') |
700 # check_codons_list(codons) | 731 # check_codons_list(codons) |
701 | 732 GFF = HTSeq.GFF_Reader(options.gff) |
702 # # get html file and directory : | 733 # # get html file and directory : |
703 (html, html_dir) = options.dirout.split(',') | 734 (html, html_dir) = options.dirout.split(',') |
704 if os.path.exists(html_dir): | 735 if os.path.exists(html_dir): |
705 raise | 736 raise |
706 try: | 737 try: |