comparison get_codon_frequency.py @ 13:7c944fd9907e draft

release 2
author rlegendre
date Thu, 09 Apr 2015 11:35:48 -0400
parents 707807fee542
children 344bacf6acb8
comparison
equal deleted inserted replaced
12:ee3a3435ce43 13:7c944fd9907e
26 from scipy import stats 26 from scipy import stats
27 from collections import OrderedDict 27 from collections import OrderedDict
28 import ribo_functions 28 import ribo_functions
29 import HTSeq 29 import HTSeq
30 # #libraries for debugg 30 # #libraries for debugg
31 import pdb 31 #import pdb
32 # import cPickle 32 # import cPickle
33 33
34 def stop_err(msg): 34 def stop_err(msg):
35 sys.stderr.write("%s\n" % msg) 35 sys.stderr.write("%s\n" % msg)
36 sys.stderr.write("Programme aborted at %s\n" % time.asctime(time.localtime(time.time()))) 36 sys.stderr.write("Programme aborted at %s\n" % time.asctime(time.localtime(time.time())))
37 sys.exit() 37 sys.exit()
38 38
39 def store_gff(gff):
40 '''
41 parse and store gff file in a dictionnary
42 '''
43 try:
44 GFF = OrderedDict({})
45 with open(gff, 'r') as f_gff :
46 # GFF['order'] = []
47 for line in f_gff:
48 # # switch commented lines
49 head = line.split("#")[0]
50 if head != "" :
51 feature = (line.split('\t')[8]).split(';')
52 chrom = line.split('\t')[0]
53 if chrom not in GFF :
54 GFF[chrom] = {}
55 # first line is already gene line :
56 if line.split('\t')[2] == 'gene' :
57 gene = feature[0].replace("ID=", "")
58 if re.search('gene', feature[2]) :
59 Name = feature[2].replace("gene=", "")
60 else :
61 Name = "Unknown"
62 # #get annotation
63 note = re.sub(r".+\;Note\=(.+)\;display\=.+", r"\1", line)
64 note = urllib.unquote(str(note)).replace("\n", "")
65 # # store gene information
66 # GFF['order'].append(gene)
67 GFF[chrom][gene] = {}
68 GFF[chrom][gene]['chrom'] = line.split('\t')[0]
69 GFF[chrom][gene]['start'] = int(line.split('\t')[3])
70 GFF[chrom][gene]['stop'] = int(line.split('\t')[4])
71 GFF[chrom][gene]['strand'] = line.split('\t')[6]
72 GFF[chrom][gene]['name'] = Name
73 GFF[chrom][gene]['note'] = note
74 GFF[chrom][gene]['exon'] = {}
75 GFF[chrom][gene]['exon_number'] = 0
76 # print Name
77 elif line.split('\t')[2] == 'CDS' :
78 gene = re.sub(r"Parent\=(.+)_mRNA", r"\1", feature[0])
79 if GFF[chrom].has_key(gene) :
80 GFF[chrom][gene]['exon_number'] += 1
81 exon_number = GFF[chrom][gene]['exon_number']
82 GFF[chrom][gene]['exon'][exon_number] = {}
83 GFF[chrom][gene]['exon'][exon_number]['frame'] = line.split('\t')[7]
84 GFF[chrom][gene]['exon'][exon_number]['start'] = int(line.split('\t')[3])
85 GFF[chrom][gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4])
86 # # if there is a five prim UTR intron, we change start of gene
87 elif line.split('\t')[2] == 'five_prime_UTR_intron' :
88 if GFF[chrom][gene]['strand'] == "+" :
89 GFF[chrom][gene]['start'] = GFF[chrom][gene]['exon'][1]['start']
90 else :
91 GFF[chrom][gene]['stop'] = GFF[chrom][gene]['exon'][exon_number]['stop']
92 return GFF
93 except Exception, e:
94 stop_err('Error during gff storage : ' + str(e))
95
96 #chrI SGD gene 87286 87752 . + . ID=YAL030W;Name=YAL030W;gene=SNC1;Alias=SNC1;Ontology_term=GO:0005484,GO:0005768,GO:0005802,GO:0005886,GO:0005935,GO:0006887,GO:0006893,GO:000689
97 #7,GO:0006906,GO:0030658,GO:0031201;Note=Vesicle%20membrane%20receptor%20protein%20%28v-SNARE%29%3B%20involved%20in%20the%20fusion%20between%20Golgi-derived%20secretory%20vesicles%20with%20the%20plasma%20membra
98 #ne%3B%20proposed%20to%20be%20involved%20in%20endocytosis%3B%20member%20of%20the%20synaptobrevin%2FVAMP%20family%20of%20R-type%20v-SNARE%20proteins%3B%20SNC1%20has%20a%20paralog%2C%20SNC2%2C%20that%20arose%20fr
99 #om%20the%20whole%20genome%20duplication;display=Vesicle%20membrane%20receptor%20protein%20%28v-SNARE%29;dbxref=SGD:S000000028;orf_classification=Verified
100 #chrI SGD CDS 87286 87387 . + 0 Parent=YAL030W_mRNA;Name=YAL030W_CDS;orf_classification=Verified
101 #chrI SGD CDS 87501 87752 . + 0 Parent=YAL030W_mRNA;Name=YAL030W_CDS;orf_classification=Verified
102
103 39
104 def init_codon_dict(): 40 def init_codon_dict():
105 41
106 Codon_dict = OrderedDict([('AAA', 0), ('AAC', 0), ('AAG', 0), ('AAT', 0), ('ACA', 0), ('ACC', 0), ('ACG', 0), ('ACT', 0), ('AGA', 0), ('AGC', 0), ('AGG', 0), ('AGT', 0), ('ATA', 0), ('ATC', 0), ('ATG', 0), ('ATT', 0), ('CAA', 0), ('CAC', 0), ('CAG', 0), ('CAT', 0), ('CCA', 0), ('CCC', 0), ('CCG', 0), ('CCT', 0), ('CGA', 0), ('CGC', 0), ('CGG', 0), ('CGT', 0), ('CTA', 0), ('CTC', 0), ('CTG', 0), ('CTT', 0), ('GAA', 0), ('GAC', 0), ('GAG', 0), ('GAT', 0), ('GCA', 0), ('GCC', 0), ('GCG', 0), ('GCT', 0), ('GGA', 0), ('GGC', 0), ('GGG', 0), ('GGT', 0), ('GTA', 0), ('GTC', 0), ('GTG', 0), ('GTT', 0), ('TAA', 0), ('TAC', 0), ('TAG', 0), ('TAT', 0), ('TCA', 0), ('TCC', 0), ('TCG', 0), ('TCT', 0), ('TGA', 0), ('TGC', 0), ('TGG', 0), ('TGT', 0), ('TTA', 0), ('TTC', 0), ('TTG', 0), ('TTT', 0)]) 42 Codon_dict = OrderedDict([('AAA', 0), ('AAC', 0), ('AAG', 0), ('AAT', 0), ('ACA', 0), ('ACC', 0), ('ACG', 0), ('ACT', 0), ('AGA', 0), ('AGC', 0), ('AGG', 0), ('AGT', 0), ('ATA', 0), ('ATC', 0), ('ATG', 0), ('ATT', 0), ('CAA', 0), ('CAC', 0), ('CAG', 0), ('CAT', 0), ('CCA', 0), ('CCC', 0), ('CCG', 0), ('CCT', 0), ('CGA', 0), ('CGC', 0), ('CGG', 0), ('CGT', 0), ('CTA', 0), ('CTC', 0), ('CTG', 0), ('CTT', 0), ('GAA', 0), ('GAC', 0), ('GAG', 0), ('GAT', 0), ('GCA', 0), ('GCC', 0), ('GCG', 0), ('GCT', 0), ('GGA', 0), ('GGC', 0), ('GGG', 0), ('GGT', 0), ('GTA', 0), ('GTC', 0), ('GTG', 0), ('GTT', 0), ('TAA', 0), ('TAC', 0), ('TAG', 0), ('TAT', 0), ('TCA', 0), ('TCC', 0), ('TCG', 0), ('TCT', 0), ('TGA', 0), ('TGC', 0), ('TGG', 0), ('TGT', 0), ('TTA', 0), ('TTC', 0), ('TTG', 0), ('TTT', 0)])
107 return Codon_dict 43 return Codon_dict
113 Read GFF dict and get gene codon usage. 49 Read GFF dict and get gene codon usage.
114 Return dict of codons usage 50 Return dict of codons usage
115 ''' 51 '''
116 try: 52 try:
117 codon = init_codon_dict() 53 codon = init_codon_dict()
54 multi_tag = "XS:i:" ## bowtie Tag
55 tag = "IH:i:1" ## RUM tag
56
118 for feature in GFF : 57 for feature in GFF :
119 if feature.type == 'gene' : 58 if feature.type == 'gene' :
120 codon_dict = init_codon_dict() 59 codon_dict = init_codon_dict()
121 chrom = feature.iv.chrom 60 chrom = feature.iv.chrom
122 start = feature.iv.start 61 start = feature.iv.start
123 stop = feature.iv.end 62 stop = feature.iv.end
124 region = chrom + ':' + str(start) + '-' + str(stop) 63 if start+50 < stop-50 :
64 region = chrom + ':' + str(start+50) + '-' + str(stop-50)
65 else :
66 break
125 67
126 ## DEPRECATED 68 ## DEPRECATED
127 #for chrom in GFF.iterkeys(): 69 #for chrom in GFF.iterkeys():
128 #for gene in GFF[chrom] : 70 #for gene in GFF[chrom] :
129 # codon_dict = init_codon_dict() 71 # codon_dict = init_codon_dict()
141 # # search mapper for eliminate multiple alignements 83 # # search mapper for eliminate multiple alignements
142 if 'bowtie' in head: 84 if 'bowtie' in head:
143 multi_tag = "XS:i:" 85 multi_tag = "XS:i:"
144 elif 'bwa' in head: 86 elif 'bwa' in head:
145 multi_tag = "XT:A:R" 87 multi_tag = "XT:A:R"
88 elif 'TopHat' in head:
89 tag = "NH:i:1"
146 else : 90 else :
147 stop_err("No PG tag find in"+samfile+". Please use bowtie or bwa for mapping") 91 stop_err("No PG tag find in "+samfile+". Please use bowtie, bwa or Tophat for mapping")
92
148 if len(read) == 0: 93 if len(read) == 0:
149 continue 94 continue
150 len_read = len(read.split('\t')[9]) 95 len_read = len(read.split('\t')[9])
151 # if it's read of good length 96 # if it's read of good length
152 if len_read == kmer and multi_tag not in read: 97 if len_read == kmer and (tag in line or multi_tag not in line):
153 feat = read.split('\t') 98 feat = read.split('\t')
154 seq = feat[9] 99 seq = feat[9]
155 # if it's a reverse read 100 # if it's a reverse read
156 if feat[1] == '16' : 101 if feat[1] == '16' :
157 if site == "A" : 102 if site == "A" :
177 else : 122 else :
178 # #get site-E 123 # #get site-E
179 cod = seq[a_pos-6:a_pos-3] 124 cod = seq[a_pos-6:a_pos-3]
180 if codon_dict.has_key(cod) : 125 if codon_dict.has_key(cod) :
181 codon_dict[cod] += 1 126 codon_dict[cod] += 1
127 del(read)
182 # # add in global dict 128 # # add in global dict
183 for cod, count in codon_dict.iteritems() : 129 for cod, count in codon_dict.iteritems() :
184 codon[cod] += count 130 codon[cod] += count
185 131
186 return codon 132 return codon
398 for z in AA.itervalues(): 344 for z in AA.itervalues():
399 cond1_aa.append(z[0]) 345 cond1_aa.append(z[0])
400 cond2_aa.append(z[1]) 346 cond2_aa.append(z[1])
401 max_valaa.append(max(z)) 347 max_valaa.append(max(z))
402 # # plot amino acid profile : 348 # # plot amino acid profile :
403 fig = pl.figure(figsize=(24, 10), num=1) 349 fig = pl.figure(figsize=(30, 10), num=1)
404 width = .50 350 width = .50
405 ax = fig.add_subplot(111) 351 ax = fig.add_subplot(111)
406 ax.xaxis.set_ticks([]) 352 ax.xaxis.set_ticks([])
407 ind = arange(21) 353 ind = arange(21)
408 pl.xlim(0, 21) 354 pl.xlim(0, 21)
438 out.write('T : ' + str(chi[0]) + '; p-value : ' + str(chi[1]) + '\n') 384 out.write('T : ' + str(chi[0]) + '; p-value : ' + str(chi[1]) + '\n')
439 385
440 386
441 387
442 # plot result 388 # plot result
443 fig = pl.figure(figsize=(24, 10), num=1) 389 fig = pl.figure(figsize=(30, 10), num=1)
444 width = .50 390 width = .40
445 ind = arange(len(codon_sorted)) 391 ind = arange(len(codon_sorted))
446 ax = fig.add_subplot(111) 392 ax = fig.add_subplot(111)
447 pl.xlim(0, len(codon_sorted) + 1) 393 pl.xlim(0, len(codon_sorted) + 1)
448 ax.spines['right'].set_color('none') 394 ax.spines['right'].set_color('none')
449 ax.spines['top'].set_color('none') 395 ax.spines['top'].set_color('none')
544 for i in cond1: 490 for i in cond1:
545 # # max value for each codon 491 # # max value for each codon
546 max_val.append(max(cond1_norm[i], cond2_norm[i])) 492 max_val.append(max(cond1_norm[i], cond2_norm[i]))
547 493
548 # plot result 494 # plot result
549 fig = pl.figure(figsize=(30, 10), num=1) 495 fig = pl.figure(figsize=(40, 10), num=1)
550 #fig = pl.figure(num=1) 496 #fig = pl.figure(num=1)
551 width = .40 497 width = .40
552 ind = arange(len(codon_sorted)) 498 ind = arange(len(codon_sorted))
553 ax = fig.add_subplot(111) 499 ax = fig.add_subplot(111)
554 pl.xlim(0, len(codon_sorted) + 1) 500 pl.xlim(0, len(codon_sorted) + 1)
558 ax.spines['left'].set_smart_bounds(True) 504 ax.spines['left'].set_smart_bounds(True)
559 ax.yaxis.set_ticks_position('left') 505 ax.yaxis.set_ticks_position('left')
560 ax.bar(ind, list(cond1_norm.itervalues()), width, facecolor=color1, label=c1) 506 ax.bar(ind, list(cond1_norm.itervalues()), width, facecolor=color1, label=c1)
561 ax.bar(ind + width, list(cond2_norm.itervalues()), width, facecolor=color2, label=c2) 507 ax.bar(ind + width, list(cond2_norm.itervalues()), width, facecolor=color2, label=c2)
562 for x, y, z in zip(ind, max_val, codon_sorted): 508 for x, y, z in zip(ind, max_val, codon_sorted):
563 ax.text(x + width, y + 0.02, '%s' % z, ha='center', va='bottom', fontsize=8) 509 ax.text(x + width, y + 0.2, '%s' % z, ha='center', va='bottom', fontsize=8)
564 ax.set_ylabel('Ribosome Occupancy (percent of normalized reads)') 510 ax.set_ylabel('Ribosome Occupancy (percent of normalized reads)')
565 ax.set_xlabel('Codons') 511 ax.set_xlabel('Codons')
566 handles, labels = ax.get_legend_handles_labels() 512 handles, labels = ax.get_legend_handles_labels()
567 ax.legend(handles, labels) 513 ax.legend(handles, labels)
568 pl.savefig(dirout + '/hist_codons.png', format="png", dpi=340) 514 pl.savefig(dirout + '/hist_codons.png', format="png", dpi=340)
655 help="Name for first condition", metavar="STR") 601 help="Name for first condition", metavar="STR")
656 602
657 parser.add_option("-C", "--cond2", dest="c2", type="string", 603 parser.add_option("-C", "--cond2", dest="c2", type="string",
658 help="Name of second condition", metavar="STR") 604 help="Name of second condition", metavar="STR")
659 605
660 parser.add_option("-k", "--kmer", dest="kmer", type="int", 606 parser.add_option("-k", "--kmer", dest="kmer", type="int", default = 28 ,
661 help="Length of your phasing reads", metavar="INT") 607 help="Length of your phasing reads", metavar="INT")
662 608
663 # parser.add_option("-l", "--list", dest="list_cod", type= "string", 609 # parser.add_option("-l", "--list", dest="list_cod", type= "string",
664 # help="list of codons to compare to other", metavar="STR") 610 # help="list of codons to compare to other", metavar="STR")
665 611
667 help="write report to FILE", metavar="FILE") 613 help="write report to FILE", metavar="FILE")
668 614
669 parser.add_option("-d", "--dirout", dest="dirout", type="string", 615 parser.add_option("-d", "--dirout", dest="dirout", type="string",
670 help="write report to PNG files", metavar="FILE") 616 help="write report to PNG files", metavar="FILE")
671 617
672 parser.add_option("-a", "--asite", dest="asite", type="int", 618 parser.add_option("-a", "--asite", dest="asite", type="int", default = 15 ,
673 help="Off-set from the 5'end of the footprint to the A-site", metavar="INT") 619 help="Off-set from the 5'end of the footprint to the A-site (default is 15)", metavar="INT")
674 620
675 parser.add_option("-s", "--site", dest="site", type="string", 621 parser.add_option("-s", "--site", dest="site", type="string", default = "A" ,
676 help="Script can compute in site A, P or E", metavar="A|P|E") 622 help="Script can compute in site A, P or E (default is A-site)", metavar="A|P|E")
677 623
678 parser.add_option("-r", "--rep", dest="rep", type="string", 624 parser.add_option("-r", "--rep", dest="rep", type="string", default = "no" ,
679 help="if replicate or not", metavar="yes|no") 625 help="if replicate or not", metavar="yes|no")
680 626
681 parser.add_option("-x", "--hex_col1", dest="color1", type= "string", 627 parser.add_option("-x", "--hex_col1", dest="color1", type= "string", default = "SkyBlue" ,
682 help="Color for first condition", metavar="STR") 628 help="Color for first condition", metavar="STR")
683 629
684 parser.add_option("-X", "--hex_col2", dest="color2", type= "string", 630 parser.add_option("-X", "--hex_col2", dest="color2", type= "string", default = "Plum" ,
685 help="Color for second condition", metavar="STR") 631 help="Color for second condition", metavar="STR")
686 632
687 parser.add_option("-q", "--quiet", 633 parser.add_option("-q", "--quiet",
688 action="store_false", dest="verbose", default=True, 634 action="store_false", dest="verbose", default=True,
689 help="don't print status messages to stdout") 635 help="don't print status messages to stdout")
700 if not colors.is_color_like(options.color1) : 646 if not colors.is_color_like(options.color1) :
701 stop_err( options.color1+' is not a proper color' ) 647 stop_err( options.color1+' is not a proper color' )
702 if not colors.is_color_like(options.color2) : 648 if not colors.is_color_like(options.color2) :
703 stop_err( options.color2+' is not a proper color' ) 649 stop_err( options.color2+' is not a proper color' )
704 650
705 ## identify GFF or GTF format from 9th column
706 #with open (options.gff,"r") as gffile :
707 # for line in gffile :
708 # if '#' in line :
709 # ## skip header
710 # gffile.next()
711 # elif 'gene_id' in line :
712 # ## launch gtf reader :
713 # GFF = ribo_functions.store_gtf(options.gff)
714 # break
715 # elif 'ID=' in line :
716 # ## launch gff reader
717 # GFF = ribo_functions.store_gff(options.gff)
718 # break
719 # else :
720 # stop_err( 'Please check your annotation file is in correct format, GFF or GTF' )
721
722 #GFF = store_gff(options.gff)
723 #GFF = ribo_functions.store_gtf(options.gff)
724 ## check gff reading
725 #if not GFF['order'] :
726 # stop_err( 'Incorrect GFF file' + str( e ) )
727 651
728 #### NOT USE IN FINAL VERSION 652 #### NOT USE IN FINAL VERSION
729 # # get codon list 653 # # get codon list
730 # codons = options.list_cod.upper().split(',') 654 # codons = options.list_cod.upper().split(',')
731 # check_codons_list(codons) 655 # check_codons_list(codons)