comparison get_codon_frequency.py @ 10:707807fee542

(none)
author rlegendre
date Thu, 22 Jan 2015 14:34:38 +0100
parents b8c070add3b7
children 7c944fd9907e
comparison
equal deleted inserted replaced
9:d7739f797a26 10:707807fee542
23 from matplotlib import font_manager 23 from matplotlib import font_manager
24 from matplotlib import colors 24 from matplotlib import colors
25 import csv 25 import csv
26 from scipy import stats 26 from scipy import stats
27 from collections import OrderedDict 27 from collections import OrderedDict
28 import ribo_functions
29 import HTSeq
28 # #libraries for debugg 30 # #libraries for debugg
29 # import pdb 31 import pdb
30 # import cPickle 32 # import cPickle
31 33
32 def stop_err(msg): 34 def stop_err(msg):
33 sys.stderr.write("%s\n" % msg) 35 sys.stderr.write("%s\n" % msg)
34 sys.stderr.write("Programme aborted at %s\n" % time.asctime(time.localtime(time.time()))) 36 sys.stderr.write("Programme aborted at %s\n" % time.asctime(time.localtime(time.time())))
35 sys.exit() 37 sys.exit()
36 38
37
38 def store_gff(gff): 39 def store_gff(gff):
39 ''' 40 '''
40 parse and store gff file in a dictionnary 41 parse and store gff file in a dictionnary
41 ''' 42 '''
42 try: 43 try:
96 #7,GO:0006906,GO:0030658,GO:0031201;Note=Vesicle%20membrane%20receptor%20protein%20%28v-SNARE%29%3B%20involved%20in%20the%20fusion%20between%20Golgi-derived%20secretory%20vesicles%20with%20the%20plasma%20membra 97 #7,GO:0006906,GO:0030658,GO:0031201;Note=Vesicle%20membrane%20receptor%20protein%20%28v-SNARE%29%3B%20involved%20in%20the%20fusion%20between%20Golgi-derived%20secretory%20vesicles%20with%20the%20plasma%20membra
97 #ne%3B%20proposed%20to%20be%20involved%20in%20endocytosis%3B%20member%20of%20the%20synaptobrevin%2FVAMP%20family%20of%20R-type%20v-SNARE%20proteins%3B%20SNC1%20has%20a%20paralog%2C%20SNC2%2C%20that%20arose%20fr 98 #ne%3B%20proposed%20to%20be%20involved%20in%20endocytosis%3B%20member%20of%20the%20synaptobrevin%2FVAMP%20family%20of%20R-type%20v-SNARE%20proteins%3B%20SNC1%20has%20a%20paralog%2C%20SNC2%2C%20that%20arose%20fr
98 #om%20the%20whole%20genome%20duplication;display=Vesicle%20membrane%20receptor%20protein%20%28v-SNARE%29;dbxref=SGD:S000000028;orf_classification=Verified 99 #om%20the%20whole%20genome%20duplication;display=Vesicle%20membrane%20receptor%20protein%20%28v-SNARE%29;dbxref=SGD:S000000028;orf_classification=Verified
99 #chrI SGD CDS 87286 87387 . + 0 Parent=YAL030W_mRNA;Name=YAL030W_CDS;orf_classification=Verified 100 #chrI SGD CDS 87286 87387 . + 0 Parent=YAL030W_mRNA;Name=YAL030W_CDS;orf_classification=Verified
100 #chrI SGD CDS 87501 87752 . + 0 Parent=YAL030W_mRNA;Name=YAL030W_CDS;orf_classification=Verified 101 #chrI SGD CDS 87501 87752 . + 0 Parent=YAL030W_mRNA;Name=YAL030W_CDS;orf_classification=Verified
101 102
102
103 103
104 def init_codon_dict(): 104 def init_codon_dict():
105 105
106 Codon_dict = OrderedDict([('AAA', 0), ('AAC', 0), ('AAG', 0), ('AAT', 0), ('ACA', 0), ('ACC', 0), ('ACG', 0), ('ACT', 0), ('AGA', 0), ('AGC', 0), ('AGG', 0), ('AGT', 0), ('ATA', 0), ('ATC', 0), ('ATG', 0), ('ATT', 0), ('CAA', 0), ('CAC', 0), ('CAG', 0), ('CAT', 0), ('CCA', 0), ('CCC', 0), ('CCG', 0), ('CCT', 0), ('CGA', 0), ('CGC', 0), ('CGG', 0), ('CGT', 0), ('CTA', 0), ('CTC', 0), ('CTG', 0), ('CTT', 0), ('GAA', 0), ('GAC', 0), ('GAG', 0), ('GAT', 0), ('GCA', 0), ('GCC', 0), ('GCG', 0), ('GCT', 0), ('GGA', 0), ('GGC', 0), ('GGG', 0), ('GGT', 0), ('GTA', 0), ('GTC', 0), ('GTG', 0), ('GTT', 0), ('TAA', 0), ('TAC', 0), ('TAG', 0), ('TAT', 0), ('TCA', 0), ('TCC', 0), ('TCG', 0), ('TCT', 0), ('TGA', 0), ('TGC', 0), ('TGG', 0), ('TGT', 0), ('TTA', 0), ('TTC', 0), ('TTG', 0), ('TTT', 0)]) 106 Codon_dict = OrderedDict([('AAA', 0), ('AAC', 0), ('AAG', 0), ('AAT', 0), ('ACA', 0), ('ACC', 0), ('ACG', 0), ('ACT', 0), ('AGA', 0), ('AGC', 0), ('AGG', 0), ('AGT', 0), ('ATA', 0), ('ATC', 0), ('ATG', 0), ('ATT', 0), ('CAA', 0), ('CAC', 0), ('CAG', 0), ('CAT', 0), ('CCA', 0), ('CCC', 0), ('CCG', 0), ('CCT', 0), ('CGA', 0), ('CGC', 0), ('CGG', 0), ('CGT', 0), ('CTA', 0), ('CTC', 0), ('CTG', 0), ('CTT', 0), ('GAA', 0), ('GAC', 0), ('GAG', 0), ('GAT', 0), ('GCA', 0), ('GCC', 0), ('GCG', 0), ('GCT', 0), ('GGA', 0), ('GGC', 0), ('GGG', 0), ('GGT', 0), ('GTA', 0), ('GTC', 0), ('GTG', 0), ('GTT', 0), ('TAA', 0), ('TAC', 0), ('TAG', 0), ('TAT', 0), ('TCA', 0), ('TCC', 0), ('TCG', 0), ('TCT', 0), ('TGA', 0), ('TGC', 0), ('TGG', 0), ('TGT', 0), ('TTA', 0), ('TTC', 0), ('TTG', 0), ('TTT', 0)])
107 return Codon_dict 107 return Codon_dict
113 Read GFF dict and get gene codon usage. 113 Read GFF dict and get gene codon usage.
114 Return dict of codons usage 114 Return dict of codons usage
115 ''' 115 '''
116 try: 116 try:
117 codon = init_codon_dict() 117 codon = init_codon_dict()
118 118 for feature in GFF :
119 for chrom in GFF.iterkeys(): 119 if feature.type == 'gene' :
120 for gene in GFF[chrom] :
121 codon_dict = init_codon_dict() 120 codon_dict = init_codon_dict()
122 start = GFF[chrom][gene]['start'] 121 chrom = feature.iv.chrom
123 stop = GFF[chrom][gene]['stop'] 122 start = feature.iv.start
123 stop = feature.iv.end
124 region = chrom + ':' + str(start) + '-' + str(stop) 124 region = chrom + ':' + str(start) + '-' + str(stop)
125
126 ## DEPRECATED
127 #for chrom in GFF.iterkeys():
128 #for gene in GFF[chrom] :
129 # codon_dict = init_codon_dict()
130 #start = GFF[chrom][gene]['start']
131 #print start
132 #stop = GFF[chrom][gene]['stop']
133 #print stop
134 #region = chrom + ':' + str(start) + '-' + str(stop)
135 #######
125 # #get all reads in this gene 136 # #get all reads in this gene
126 reads = subprocess.check_output(["samtools", "view", bamfile, region]) 137 reads = subprocess.check_output(["samtools", "view", bamfile, region])
127 head = subprocess.check_output(["samtools", "view", "-H", bamfile]) 138 head = subprocess.check_output(["samtools", "view", "-H", bamfile])
128 read_tab = reads.split('\n') 139 read_tab = reads.split('\n')
129 for read in read_tab: 140 for read in read_tab:
130 # # search mapper for eliminate multiple alignements 141 # # search mapper for eliminate multiple alignements
131 if 'bowtie' in head: 142 if 'bowtie' in head:
132 multi_tag = "XS:i:" 143 multi_tag = "XS:i:"
133 elif 'bwa' in head: 144 elif 'bwa' in head:
134 multi_tag = "XT:A:R" 145 multi_tag = "XT:A:R"
135 else : 146 else :
136 stop_err("No PG tag find in"+samfile+". Please use bowtie or bwa for mapping") 147 stop_err("No PG tag find in"+samfile+". Please use bowtie or bwa for mapping")
137 if len(read) == 0: 148 if len(read) == 0:
138 continue 149 continue
139
140 len_read = len(read.split('\t')[9]) 150 len_read = len(read.split('\t')[9])
141 # if it's read of good length 151 # if it's read of good length
142 if len_read == kmer and multi_tag not in read: 152 if len_read == kmer and multi_tag not in read:
143 feat = read.split('\t') 153 feat = read.split('\t')
144 seq = feat[9] 154 seq = feat[9]
177 187
178 except Exception, e: 188 except Exception, e:
179 stop_err('Error during codon usage calcul: ' + str(e)) 189 stop_err('Error during codon usage calcul: ' + str(e))
180 190
181 191
192
193
182 ''' 194 '''
183 http://pyinsci.blogspot.fr/2009/09/violin-plot-with-matplotlib.html 195 http://pyinsci.blogspot.fr/2009/09/violin-plot-with-matplotlib.html
184 ''' 196 '''
185 def violin_plot(ax, data, pos, bp=False): 197 def violin_plot(ax, data, pos, bp=False):
186 ''' 198 '''
489 cond2_aa.append(z[1]) 501 cond2_aa.append(z[1])
490 max_val.append(max(z)) 502 max_val.append(max(z))
491 503
492 # # plot amino acid profile : 504 # # plot amino acid profile :
493 fig = pl.figure(num=1) 505 fig = pl.figure(num=1)
494 width = .35 506 width = .45
495 ax = fig.add_subplot(111) 507 ax = fig.add_subplot(111)
496 ind = arange(21) 508 ind = arange(21)
497 pl.xlim(0, 21) 509 pl.xlim(0, 21)
498 #kwargs = {"hatch":'x'} 510 #kwargs = {"hatch":'x'}
499 #ax.bar(ind, cond1_aa, width, facecolor=color1, label=c1, **kwargs) 511 #ax.bar(ind, cond1_aa, width, facecolor=color1, label=c1, **kwargs)
501 #ax.bar(ind + width, cond2_aa, width, facecolor=color2, label=c2, **kwargs) 513 #ax.bar(ind + width, cond2_aa, width, facecolor=color2, label=c2, **kwargs)
502 ax.bar(ind, cond1_aa, width, facecolor=color1, label=c1) 514 ax.bar(ind, cond1_aa, width, facecolor=color1, label=c1)
503 ax.bar(ind + width, cond2_aa, width, facecolor=color2, label=c2) 515 ax.bar(ind + width, cond2_aa, width, facecolor=color2, label=c2)
504 #for x, y, z in zip(ind, max_val, aa_name): 516 #for x, y, z in zip(ind, max_val, aa_name):
505 # ax.text(x + width, y + 0.2, '%s' % z, ha='center', va='bottom', fontsize=14) 517 # ax.text(x + width, y + 0.2, '%s' % z, ha='center', va='bottom', fontsize=14)
506 axis_font = {'size':'16'} 518 axis_font = {'size':'10'}
507 pl.xticks(ind + width, aa_name,**axis_font) 519 pl.xticks(ind + width, aa_name,**axis_font)
508 ax.spines['right'].set_visible(False) 520 ax.spines['right'].set_visible(False)
509 ax.spines['top'].set_visible(False) 521 ax.spines['top'].set_visible(False)
510 ax.yaxis.set_ticks_position('left') 522 ax.yaxis.set_ticks_position('left')
511 ax.xaxis.set_ticks_position('bottom') 523 ax.xaxis.set_ticks_position('bottom')
512 #ax.xaxis.set_ticks([]) 524 #ax.xaxis.set_ticks([])
513 ax.set_ylabel('Ribosome Occupancy (percent of normalized reads)',**axis_font) 525 ax.set_ylabel('Ribosome Occupancy (percent of normalized reads)',**axis_font)
514 ax.set_xlabel('Amino Acids', **axis_font) 526 ax.set_xlabel('Amino Acids', **axis_font)
515 handles, labels = ax.get_legend_handles_labels() 527 handles, labels = ax.get_legend_handles_labels()
516 font_prop = font_manager.FontProperties(size=12) 528 font_prop = font_manager.FontProperties(size=8)
517 ax.legend(handles, labels, prop=font_prop) 529 ax.legend(handles, labels, prop=font_prop)
518 pl.savefig(dirout + '/hist_amino_acid.png', format="png", dpi=340) 530 pl.savefig(dirout + '/hist_amino_acid.png', format="png", dpi=340)
519 pl.clf() 531 pl.clf()
520 532
521 # write result 533 # write result
532 for i in cond1: 544 for i in cond1:
533 # # max value for each codon 545 # # max value for each codon
534 max_val.append(max(cond1_norm[i], cond2_norm[i])) 546 max_val.append(max(cond1_norm[i], cond2_norm[i]))
535 547
536 # plot result 548 # plot result
537 fig = pl.figure(figsize=(24, 10), num=1) 549 fig = pl.figure(figsize=(30, 10), num=1)
538 width = .50 550 #fig = pl.figure(num=1)
551 width = .40
539 ind = arange(len(codon_sorted)) 552 ind = arange(len(codon_sorted))
540 ax = fig.add_subplot(111) 553 ax = fig.add_subplot(111)
541 pl.xlim(0, len(codon_sorted) + 1) 554 pl.xlim(0, len(codon_sorted) + 1)
542 ax.spines['right'].set_color('none') 555 ax.spines['right'].set_color('none')
543 ax.spines['top'].set_color('none') 556 ax.spines['top'].set_color('none')
623 returncode = proc.wait() 636 returncode = proc.wait()
624 # if returncode != 0: 637 # if returncode != 0:
625 # raise Exception 638 # raise Exception
626 639
627 def __main__(): 640 def __main__():
628 ''' 641
629 python /home/rlegendre/galaxy/galaxy-dist/tools/rib_profiling/get_codon_frequency.py -i /home/rlegendre/galaxy/galaxy-dist/SharedData/Ribo/Saccer3.fa -g Saccer3.gff -t tAI.csv -1 psiM1_sorted.bam,psiM2_sorted.bam -2 psiP1_sorted.bam,psiP2_sorted.bam -c psiM -C psiP -l TAG,TAA,TGA -r yes -o psi_count -d psi.html,html_dir > log2
630 python /home/rlegendre/galaxy/galaxy-dist/tools/rib_profiling/get_codon_frequency.py -i /home/rlegendre/galaxy/galaxy-dist/SharedData/Ribo/Saccer3.fa -g Saccer3.gff -t tAI.csv -c psiM -C psiP -1 RPF_psi-_28sorted.bam -2 RPF_psi+_28sorted.bam -l TAG,TAA,TGA -n Stop Codon -r no -o psi_count -d psi.html,html_dir > log2
631 '''
632 642
633 # Parse command line options 643 # Parse command line options
634 parser = optparse.OptionParser() 644 parser = optparse.OptionParser()
635 parser.add_option("-g", "--gff", dest="gff", type="string", 645 parser.add_option("-g", "--gff", dest="gff", type="string",
636 help="gff file", metavar="FILE") 646 help="gff file", metavar="FILE")
646 656
647 parser.add_option("-C", "--cond2", dest="c2", type="string", 657 parser.add_option("-C", "--cond2", dest="c2", type="string",
648 help="Name of second condition", metavar="STR") 658 help="Name of second condition", metavar="STR")
649 659
650 parser.add_option("-k", "--kmer", dest="kmer", type="int", 660 parser.add_option("-k", "--kmer", dest="kmer", type="int",
651 help="Longer of your phasing reads", metavar="INT") 661 help="Length of your phasing reads", metavar="INT")
652 662
653 # parser.add_option("-l", "--list", dest="list_cod", type= "string", 663 # parser.add_option("-l", "--list", dest="list_cod", type= "string",
654 # help="list of codons to compare to other", metavar="STR") 664 # help="list of codons to compare to other", metavar="STR")
655 665
656 parser.add_option("-o", "--out", dest="outfile", type="string", 666 parser.add_option("-o", "--out", dest="outfile", type="string",
690 if not colors.is_color_like(options.color1) : 700 if not colors.is_color_like(options.color1) :
691 stop_err( options.color1+' is not a proper color' ) 701 stop_err( options.color1+' is not a proper color' )
692 if not colors.is_color_like(options.color2) : 702 if not colors.is_color_like(options.color2) :
693 stop_err( options.color2+' is not a proper color' ) 703 stop_err( options.color2+' is not a proper color' )
694 704
695 GFF = store_gff(options.gff) 705 ## identify GFF or GTF format from 9th column
706 #with open (options.gff,"r") as gffile :
707 # for line in gffile :
708 # if '#' in line :
709 # ## skip header
710 # gffile.next()
711 # elif 'gene_id' in line :
712 # ## launch gtf reader :
713 # GFF = ribo_functions.store_gtf(options.gff)
714 # break
715 # elif 'ID=' in line :
716 # ## launch gff reader
717 # GFF = ribo_functions.store_gff(options.gff)
718 # break
719 # else :
720 # stop_err( 'Please check your annotation file is in correct format, GFF or GTF' )
721
722 #GFF = store_gff(options.gff)
723 #GFF = ribo_functions.store_gtf(options.gff)
724 ## check gff reading
725 #if not GFF['order'] :
726 # stop_err( 'Incorrect GFF file' + str( e ) )
696 727
697 #### NOT USE IN FINAL VERSION 728 #### NOT USE IN FINAL VERSION
698 # # get codon list 729 # # get codon list
699 # codons = options.list_cod.upper().split(',') 730 # codons = options.list_cod.upper().split(',')
700 # check_codons_list(codons) 731 # check_codons_list(codons)
701 732 GFF = HTSeq.GFF_Reader(options.gff)
702 # # get html file and directory : 733 # # get html file and directory :
703 (html, html_dir) = options.dirout.split(',') 734 (html, html_dir) = options.dirout.split(',')
704 if os.path.exists(html_dir): 735 if os.path.exists(html_dir):
705 raise 736 raise
706 try: 737 try: