Mercurial > repos > rlegendre > ribo_tools
comparison get_codon_frequency.py @ 13:7c944fd9907e draft
release 2
author | rlegendre |
---|---|
date | Thu, 09 Apr 2015 11:35:48 -0400 |
parents | 707807fee542 |
children | 344bacf6acb8 |
comparison
equal
deleted
inserted
replaced
12:ee3a3435ce43 | 13:7c944fd9907e |
---|---|
26 from scipy import stats | 26 from scipy import stats |
27 from collections import OrderedDict | 27 from collections import OrderedDict |
28 import ribo_functions | 28 import ribo_functions |
29 import HTSeq | 29 import HTSeq |
30 # #libraries for debugg | 30 # #libraries for debugg |
31 import pdb | 31 #import pdb |
32 # import cPickle | 32 # import cPickle |
33 | 33 |
34 def stop_err(msg): | 34 def stop_err(msg): |
35 sys.stderr.write("%s\n" % msg) | 35 sys.stderr.write("%s\n" % msg) |
36 sys.stderr.write("Programme aborted at %s\n" % time.asctime(time.localtime(time.time()))) | 36 sys.stderr.write("Programme aborted at %s\n" % time.asctime(time.localtime(time.time()))) |
37 sys.exit() | 37 sys.exit() |
38 | 38 |
39 def store_gff(gff): | |
40 ''' | |
41 parse and store gff file in a dictionnary | |
42 ''' | |
43 try: | |
44 GFF = OrderedDict({}) | |
45 with open(gff, 'r') as f_gff : | |
46 # GFF['order'] = [] | |
47 for line in f_gff: | |
48 # # switch commented lines | |
49 head = line.split("#")[0] | |
50 if head != "" : | |
51 feature = (line.split('\t')[8]).split(';') | |
52 chrom = line.split('\t')[0] | |
53 if chrom not in GFF : | |
54 GFF[chrom] = {} | |
55 # first line is already gene line : | |
56 if line.split('\t')[2] == 'gene' : | |
57 gene = feature[0].replace("ID=", "") | |
58 if re.search('gene', feature[2]) : | |
59 Name = feature[2].replace("gene=", "") | |
60 else : | |
61 Name = "Unknown" | |
62 # #get annotation | |
63 note = re.sub(r".+\;Note\=(.+)\;display\=.+", r"\1", line) | |
64 note = urllib.unquote(str(note)).replace("\n", "") | |
65 # # store gene information | |
66 # GFF['order'].append(gene) | |
67 GFF[chrom][gene] = {} | |
68 GFF[chrom][gene]['chrom'] = line.split('\t')[0] | |
69 GFF[chrom][gene]['start'] = int(line.split('\t')[3]) | |
70 GFF[chrom][gene]['stop'] = int(line.split('\t')[4]) | |
71 GFF[chrom][gene]['strand'] = line.split('\t')[6] | |
72 GFF[chrom][gene]['name'] = Name | |
73 GFF[chrom][gene]['note'] = note | |
74 GFF[chrom][gene]['exon'] = {} | |
75 GFF[chrom][gene]['exon_number'] = 0 | |
76 # print Name | |
77 elif line.split('\t')[2] == 'CDS' : | |
78 gene = re.sub(r"Parent\=(.+)_mRNA", r"\1", feature[0]) | |
79 if GFF[chrom].has_key(gene) : | |
80 GFF[chrom][gene]['exon_number'] += 1 | |
81 exon_number = GFF[chrom][gene]['exon_number'] | |
82 GFF[chrom][gene]['exon'][exon_number] = {} | |
83 GFF[chrom][gene]['exon'][exon_number]['frame'] = line.split('\t')[7] | |
84 GFF[chrom][gene]['exon'][exon_number]['start'] = int(line.split('\t')[3]) | |
85 GFF[chrom][gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4]) | |
86 # # if there is a five prim UTR intron, we change start of gene | |
87 elif line.split('\t')[2] == 'five_prime_UTR_intron' : | |
88 if GFF[chrom][gene]['strand'] == "+" : | |
89 GFF[chrom][gene]['start'] = GFF[chrom][gene]['exon'][1]['start'] | |
90 else : | |
91 GFF[chrom][gene]['stop'] = GFF[chrom][gene]['exon'][exon_number]['stop'] | |
92 return GFF | |
93 except Exception, e: | |
94 stop_err('Error during gff storage : ' + str(e)) | |
95 | |
96 #chrI SGD gene 87286 87752 . + . ID=YAL030W;Name=YAL030W;gene=SNC1;Alias=SNC1;Ontology_term=GO:0005484,GO:0005768,GO:0005802,GO:0005886,GO:0005935,GO:0006887,GO:0006893,GO:000689 | |
97 #7,GO:0006906,GO:0030658,GO:0031201;Note=Vesicle%20membrane%20receptor%20protein%20%28v-SNARE%29%3B%20involved%20in%20the%20fusion%20between%20Golgi-derived%20secretory%20vesicles%20with%20the%20plasma%20membra | |
98 #ne%3B%20proposed%20to%20be%20involved%20in%20endocytosis%3B%20member%20of%20the%20synaptobrevin%2FVAMP%20family%20of%20R-type%20v-SNARE%20proteins%3B%20SNC1%20has%20a%20paralog%2C%20SNC2%2C%20that%20arose%20fr | |
99 #om%20the%20whole%20genome%20duplication;display=Vesicle%20membrane%20receptor%20protein%20%28v-SNARE%29;dbxref=SGD:S000000028;orf_classification=Verified | |
100 #chrI SGD CDS 87286 87387 . + 0 Parent=YAL030W_mRNA;Name=YAL030W_CDS;orf_classification=Verified | |
101 #chrI SGD CDS 87501 87752 . + 0 Parent=YAL030W_mRNA;Name=YAL030W_CDS;orf_classification=Verified | |
102 | |
103 | 39 |
104 def init_codon_dict(): | 40 def init_codon_dict(): |
105 | 41 |
106 Codon_dict = OrderedDict([('AAA', 0), ('AAC', 0), ('AAG', 0), ('AAT', 0), ('ACA', 0), ('ACC', 0), ('ACG', 0), ('ACT', 0), ('AGA', 0), ('AGC', 0), ('AGG', 0), ('AGT', 0), ('ATA', 0), ('ATC', 0), ('ATG', 0), ('ATT', 0), ('CAA', 0), ('CAC', 0), ('CAG', 0), ('CAT', 0), ('CCA', 0), ('CCC', 0), ('CCG', 0), ('CCT', 0), ('CGA', 0), ('CGC', 0), ('CGG', 0), ('CGT', 0), ('CTA', 0), ('CTC', 0), ('CTG', 0), ('CTT', 0), ('GAA', 0), ('GAC', 0), ('GAG', 0), ('GAT', 0), ('GCA', 0), ('GCC', 0), ('GCG', 0), ('GCT', 0), ('GGA', 0), ('GGC', 0), ('GGG', 0), ('GGT', 0), ('GTA', 0), ('GTC', 0), ('GTG', 0), ('GTT', 0), ('TAA', 0), ('TAC', 0), ('TAG', 0), ('TAT', 0), ('TCA', 0), ('TCC', 0), ('TCG', 0), ('TCT', 0), ('TGA', 0), ('TGC', 0), ('TGG', 0), ('TGT', 0), ('TTA', 0), ('TTC', 0), ('TTG', 0), ('TTT', 0)]) | 42 Codon_dict = OrderedDict([('AAA', 0), ('AAC', 0), ('AAG', 0), ('AAT', 0), ('ACA', 0), ('ACC', 0), ('ACG', 0), ('ACT', 0), ('AGA', 0), ('AGC', 0), ('AGG', 0), ('AGT', 0), ('ATA', 0), ('ATC', 0), ('ATG', 0), ('ATT', 0), ('CAA', 0), ('CAC', 0), ('CAG', 0), ('CAT', 0), ('CCA', 0), ('CCC', 0), ('CCG', 0), ('CCT', 0), ('CGA', 0), ('CGC', 0), ('CGG', 0), ('CGT', 0), ('CTA', 0), ('CTC', 0), ('CTG', 0), ('CTT', 0), ('GAA', 0), ('GAC', 0), ('GAG', 0), ('GAT', 0), ('GCA', 0), ('GCC', 0), ('GCG', 0), ('GCT', 0), ('GGA', 0), ('GGC', 0), ('GGG', 0), ('GGT', 0), ('GTA', 0), ('GTC', 0), ('GTG', 0), ('GTT', 0), ('TAA', 0), ('TAC', 0), ('TAG', 0), ('TAT', 0), ('TCA', 0), ('TCC', 0), ('TCG', 0), ('TCT', 0), ('TGA', 0), ('TGC', 0), ('TGG', 0), ('TGT', 0), ('TTA', 0), ('TTC', 0), ('TTG', 0), ('TTT', 0)]) |
107 return Codon_dict | 43 return Codon_dict |
113 Read GFF dict and get gene codon usage. | 49 Read GFF dict and get gene codon usage. |
114 Return dict of codons usage | 50 Return dict of codons usage |
115 ''' | 51 ''' |
116 try: | 52 try: |
117 codon = init_codon_dict() | 53 codon = init_codon_dict() |
54 multi_tag = "XS:i:" ## bowtie Tag | |
55 tag = "IH:i:1" ## RUM tag | |
56 | |
118 for feature in GFF : | 57 for feature in GFF : |
119 if feature.type == 'gene' : | 58 if feature.type == 'gene' : |
120 codon_dict = init_codon_dict() | 59 codon_dict = init_codon_dict() |
121 chrom = feature.iv.chrom | 60 chrom = feature.iv.chrom |
122 start = feature.iv.start | 61 start = feature.iv.start |
123 stop = feature.iv.end | 62 stop = feature.iv.end |
124 region = chrom + ':' + str(start) + '-' + str(stop) | 63 if start+50 < stop-50 : |
64 region = chrom + ':' + str(start+50) + '-' + str(stop-50) | |
65 else : | |
66 break | |
125 | 67 |
126 ## DEPRECATED | 68 ## DEPRECATED |
127 #for chrom in GFF.iterkeys(): | 69 #for chrom in GFF.iterkeys(): |
128 #for gene in GFF[chrom] : | 70 #for gene in GFF[chrom] : |
129 # codon_dict = init_codon_dict() | 71 # codon_dict = init_codon_dict() |
141 # # search mapper for eliminate multiple alignements | 83 # # search mapper for eliminate multiple alignements |
142 if 'bowtie' in head: | 84 if 'bowtie' in head: |
143 multi_tag = "XS:i:" | 85 multi_tag = "XS:i:" |
144 elif 'bwa' in head: | 86 elif 'bwa' in head: |
145 multi_tag = "XT:A:R" | 87 multi_tag = "XT:A:R" |
88 elif 'TopHat' in head: | |
89 tag = "NH:i:1" | |
146 else : | 90 else : |
147 stop_err("No PG tag find in"+samfile+". Please use bowtie or bwa for mapping") | 91 stop_err("No PG tag find in "+samfile+". Please use bowtie, bwa or Tophat for mapping") |
92 | |
148 if len(read) == 0: | 93 if len(read) == 0: |
149 continue | 94 continue |
150 len_read = len(read.split('\t')[9]) | 95 len_read = len(read.split('\t')[9]) |
151 # if it's read of good length | 96 # if it's read of good length |
152 if len_read == kmer and multi_tag not in read: | 97 if len_read == kmer and (tag in line or multi_tag not in line): |
153 feat = read.split('\t') | 98 feat = read.split('\t') |
154 seq = feat[9] | 99 seq = feat[9] |
155 # if it's a reverse read | 100 # if it's a reverse read |
156 if feat[1] == '16' : | 101 if feat[1] == '16' : |
157 if site == "A" : | 102 if site == "A" : |
177 else : | 122 else : |
178 # #get site-E | 123 # #get site-E |
179 cod = seq[a_pos-6:a_pos-3] | 124 cod = seq[a_pos-6:a_pos-3] |
180 if codon_dict.has_key(cod) : | 125 if codon_dict.has_key(cod) : |
181 codon_dict[cod] += 1 | 126 codon_dict[cod] += 1 |
127 del(read) | |
182 # # add in global dict | 128 # # add in global dict |
183 for cod, count in codon_dict.iteritems() : | 129 for cod, count in codon_dict.iteritems() : |
184 codon[cod] += count | 130 codon[cod] += count |
185 | 131 |
186 return codon | 132 return codon |
398 for z in AA.itervalues(): | 344 for z in AA.itervalues(): |
399 cond1_aa.append(z[0]) | 345 cond1_aa.append(z[0]) |
400 cond2_aa.append(z[1]) | 346 cond2_aa.append(z[1]) |
401 max_valaa.append(max(z)) | 347 max_valaa.append(max(z)) |
402 # # plot amino acid profile : | 348 # # plot amino acid profile : |
403 fig = pl.figure(figsize=(24, 10), num=1) | 349 fig = pl.figure(figsize=(30, 10), num=1) |
404 width = .50 | 350 width = .50 |
405 ax = fig.add_subplot(111) | 351 ax = fig.add_subplot(111) |
406 ax.xaxis.set_ticks([]) | 352 ax.xaxis.set_ticks([]) |
407 ind = arange(21) | 353 ind = arange(21) |
408 pl.xlim(0, 21) | 354 pl.xlim(0, 21) |
438 out.write('T : ' + str(chi[0]) + '; p-value : ' + str(chi[1]) + '\n') | 384 out.write('T : ' + str(chi[0]) + '; p-value : ' + str(chi[1]) + '\n') |
439 | 385 |
440 | 386 |
441 | 387 |
442 # plot result | 388 # plot result |
443 fig = pl.figure(figsize=(24, 10), num=1) | 389 fig = pl.figure(figsize=(30, 10), num=1) |
444 width = .50 | 390 width = .40 |
445 ind = arange(len(codon_sorted)) | 391 ind = arange(len(codon_sorted)) |
446 ax = fig.add_subplot(111) | 392 ax = fig.add_subplot(111) |
447 pl.xlim(0, len(codon_sorted) + 1) | 393 pl.xlim(0, len(codon_sorted) + 1) |
448 ax.spines['right'].set_color('none') | 394 ax.spines['right'].set_color('none') |
449 ax.spines['top'].set_color('none') | 395 ax.spines['top'].set_color('none') |
544 for i in cond1: | 490 for i in cond1: |
545 # # max value for each codon | 491 # # max value for each codon |
546 max_val.append(max(cond1_norm[i], cond2_norm[i])) | 492 max_val.append(max(cond1_norm[i], cond2_norm[i])) |
547 | 493 |
548 # plot result | 494 # plot result |
549 fig = pl.figure(figsize=(30, 10), num=1) | 495 fig = pl.figure(figsize=(40, 10), num=1) |
550 #fig = pl.figure(num=1) | 496 #fig = pl.figure(num=1) |
551 width = .40 | 497 width = .40 |
552 ind = arange(len(codon_sorted)) | 498 ind = arange(len(codon_sorted)) |
553 ax = fig.add_subplot(111) | 499 ax = fig.add_subplot(111) |
554 pl.xlim(0, len(codon_sorted) + 1) | 500 pl.xlim(0, len(codon_sorted) + 1) |
558 ax.spines['left'].set_smart_bounds(True) | 504 ax.spines['left'].set_smart_bounds(True) |
559 ax.yaxis.set_ticks_position('left') | 505 ax.yaxis.set_ticks_position('left') |
560 ax.bar(ind, list(cond1_norm.itervalues()), width, facecolor=color1, label=c1) | 506 ax.bar(ind, list(cond1_norm.itervalues()), width, facecolor=color1, label=c1) |
561 ax.bar(ind + width, list(cond2_norm.itervalues()), width, facecolor=color2, label=c2) | 507 ax.bar(ind + width, list(cond2_norm.itervalues()), width, facecolor=color2, label=c2) |
562 for x, y, z in zip(ind, max_val, codon_sorted): | 508 for x, y, z in zip(ind, max_val, codon_sorted): |
563 ax.text(x + width, y + 0.02, '%s' % z, ha='center', va='bottom', fontsize=8) | 509 ax.text(x + width, y + 0.2, '%s' % z, ha='center', va='bottom', fontsize=8) |
564 ax.set_ylabel('Ribosome Occupancy (percent of normalized reads)') | 510 ax.set_ylabel('Ribosome Occupancy (percent of normalized reads)') |
565 ax.set_xlabel('Codons') | 511 ax.set_xlabel('Codons') |
566 handles, labels = ax.get_legend_handles_labels() | 512 handles, labels = ax.get_legend_handles_labels() |
567 ax.legend(handles, labels) | 513 ax.legend(handles, labels) |
568 pl.savefig(dirout + '/hist_codons.png', format="png", dpi=340) | 514 pl.savefig(dirout + '/hist_codons.png', format="png", dpi=340) |
655 help="Name for first condition", metavar="STR") | 601 help="Name for first condition", metavar="STR") |
656 | 602 |
657 parser.add_option("-C", "--cond2", dest="c2", type="string", | 603 parser.add_option("-C", "--cond2", dest="c2", type="string", |
658 help="Name of second condition", metavar="STR") | 604 help="Name of second condition", metavar="STR") |
659 | 605 |
660 parser.add_option("-k", "--kmer", dest="kmer", type="int", | 606 parser.add_option("-k", "--kmer", dest="kmer", type="int", default = 28 , |
661 help="Length of your phasing reads", metavar="INT") | 607 help="Length of your phasing reads", metavar="INT") |
662 | 608 |
663 # parser.add_option("-l", "--list", dest="list_cod", type= "string", | 609 # parser.add_option("-l", "--list", dest="list_cod", type= "string", |
664 # help="list of codons to compare to other", metavar="STR") | 610 # help="list of codons to compare to other", metavar="STR") |
665 | 611 |
667 help="write report to FILE", metavar="FILE") | 613 help="write report to FILE", metavar="FILE") |
668 | 614 |
669 parser.add_option("-d", "--dirout", dest="dirout", type="string", | 615 parser.add_option("-d", "--dirout", dest="dirout", type="string", |
670 help="write report to PNG files", metavar="FILE") | 616 help="write report to PNG files", metavar="FILE") |
671 | 617 |
672 parser.add_option("-a", "--asite", dest="asite", type="int", | 618 parser.add_option("-a", "--asite", dest="asite", type="int", default = 15 , |
673 help="Off-set from the 5'end of the footprint to the A-site", metavar="INT") | 619 help="Off-set from the 5'end of the footprint to the A-site (default is 15)", metavar="INT") |
674 | 620 |
675 parser.add_option("-s", "--site", dest="site", type="string", | 621 parser.add_option("-s", "--site", dest="site", type="string", default = "A" , |
676 help="Script can compute in site A, P or E", metavar="A|P|E") | 622 help="Script can compute in site A, P or E (default is A-site)", metavar="A|P|E") |
677 | 623 |
678 parser.add_option("-r", "--rep", dest="rep", type="string", | 624 parser.add_option("-r", "--rep", dest="rep", type="string", default = "no" , |
679 help="if replicate or not", metavar="yes|no") | 625 help="if replicate or not", metavar="yes|no") |
680 | 626 |
681 parser.add_option("-x", "--hex_col1", dest="color1", type= "string", | 627 parser.add_option("-x", "--hex_col1", dest="color1", type= "string", default = "SkyBlue" , |
682 help="Color for first condition", metavar="STR") | 628 help="Color for first condition", metavar="STR") |
683 | 629 |
684 parser.add_option("-X", "--hex_col2", dest="color2", type= "string", | 630 parser.add_option("-X", "--hex_col2", dest="color2", type= "string", default = "Plum" , |
685 help="Color for second condition", metavar="STR") | 631 help="Color for second condition", metavar="STR") |
686 | 632 |
687 parser.add_option("-q", "--quiet", | 633 parser.add_option("-q", "--quiet", |
688 action="store_false", dest="verbose", default=True, | 634 action="store_false", dest="verbose", default=True, |
689 help="don't print status messages to stdout") | 635 help="don't print status messages to stdout") |
700 if not colors.is_color_like(options.color1) : | 646 if not colors.is_color_like(options.color1) : |
701 stop_err( options.color1+' is not a proper color' ) | 647 stop_err( options.color1+' is not a proper color' ) |
702 if not colors.is_color_like(options.color2) : | 648 if not colors.is_color_like(options.color2) : |
703 stop_err( options.color2+' is not a proper color' ) | 649 stop_err( options.color2+' is not a proper color' ) |
704 | 650 |
705 ## identify GFF or GTF format from 9th column | |
706 #with open (options.gff,"r") as gffile : | |
707 # for line in gffile : | |
708 # if '#' in line : | |
709 # ## skip header | |
710 # gffile.next() | |
711 # elif 'gene_id' in line : | |
712 # ## launch gtf reader : | |
713 # GFF = ribo_functions.store_gtf(options.gff) | |
714 # break | |
715 # elif 'ID=' in line : | |
716 # ## launch gff reader | |
717 # GFF = ribo_functions.store_gff(options.gff) | |
718 # break | |
719 # else : | |
720 # stop_err( 'Please check your annotation file is in correct format, GFF or GTF' ) | |
721 | |
722 #GFF = store_gff(options.gff) | |
723 #GFF = ribo_functions.store_gtf(options.gff) | |
724 ## check gff reading | |
725 #if not GFF['order'] : | |
726 # stop_err( 'Incorrect GFF file' + str( e ) ) | |
727 | 651 |
728 #### NOT USE IN FINAL VERSION | 652 #### NOT USE IN FINAL VERSION |
729 # # get codon list | 653 # # get codon list |
730 # codons = options.list_cod.upper().split(',') | 654 # codons = options.list_cod.upper().split(',') |
731 # check_codons_list(codons) | 655 # check_codons_list(codons) |