comparison get_codon_frequency.py @ 0:b8c070add3b7

Uploaded
author rlegendre
date Mon, 20 Oct 2014 11:06:17 -0400
parents
children 707807fee542
comparison
equal deleted inserted replaced
-1:000000000000 0:b8c070add3b7
1 #!/usr/bin/env python2.7
2 # -*- coding: utf-8 -*-
3
4 '''
5 Created on sep. 2013
6 @author: rachel legendre
7 @copyright: rachel.legendre@igmors.u-psud.fr
8 @license: GPL v3
9 '''
10
11 from __future__ import division
12 import os, sys, optparse, tempfile, subprocess, re, shutil, commands, urllib, time
13 import itertools
14 import math
15 from decimal import Decimal
16 from Bio import SeqIO
17 from Bio.Seq import Seq
18 from numpy import arange, std, array, linspace, average
19 #from matplotlib import pyplot as pl
20 import matplotlib
21 matplotlib.use('Agg')
22 import matplotlib.pyplot as pl
23 from matplotlib import font_manager
24 from matplotlib import colors
25 import csv
26 from scipy import stats
27 from collections import OrderedDict
28 # #libraries for debugg
29 # import pdb
30 # import cPickle
31
32 def stop_err(msg):
33 sys.stderr.write("%s\n" % msg)
34 sys.stderr.write("Programme aborted at %s\n" % time.asctime(time.localtime(time.time())))
35 sys.exit()
36
37
38 def store_gff(gff):
39 '''
40 parse and store gff file in a dictionnary
41 '''
42 try:
43 GFF = OrderedDict({})
44 with open(gff, 'r') as f_gff :
45 # GFF['order'] = []
46 for line in f_gff:
47 # # switch commented lines
48 head = line.split("#")[0]
49 if head != "" :
50 feature = (line.split('\t')[8]).split(';')
51 chrom = line.split('\t')[0]
52 if chrom not in GFF :
53 GFF[chrom] = {}
54 # first line is already gene line :
55 if line.split('\t')[2] == 'gene' :
56 gene = feature[0].replace("ID=", "")
57 if re.search('gene', feature[2]) :
58 Name = feature[2].replace("gene=", "")
59 else :
60 Name = "Unknown"
61 # #get annotation
62 note = re.sub(r".+\;Note\=(.+)\;display\=.+", r"\1", line)
63 note = urllib.unquote(str(note)).replace("\n", "")
64 # # store gene information
65 # GFF['order'].append(gene)
66 GFF[chrom][gene] = {}
67 GFF[chrom][gene]['chrom'] = line.split('\t')[0]
68 GFF[chrom][gene]['start'] = int(line.split('\t')[3])
69 GFF[chrom][gene]['stop'] = int(line.split('\t')[4])
70 GFF[chrom][gene]['strand'] = line.split('\t')[6]
71 GFF[chrom][gene]['name'] = Name
72 GFF[chrom][gene]['note'] = note
73 GFF[chrom][gene]['exon'] = {}
74 GFF[chrom][gene]['exon_number'] = 0
75 # print Name
76 elif line.split('\t')[2] == 'CDS' :
77 gene = re.sub(r"Parent\=(.+)_mRNA", r"\1", feature[0])
78 if GFF[chrom].has_key(gene) :
79 GFF[chrom][gene]['exon_number'] += 1
80 exon_number = GFF[chrom][gene]['exon_number']
81 GFF[chrom][gene]['exon'][exon_number] = {}
82 GFF[chrom][gene]['exon'][exon_number]['frame'] = line.split('\t')[7]
83 GFF[chrom][gene]['exon'][exon_number]['start'] = int(line.split('\t')[3])
84 GFF[chrom][gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4])
85 # # if there is a five prim UTR intron, we change start of gene
86 elif line.split('\t')[2] == 'five_prime_UTR_intron' :
87 if GFF[chrom][gene]['strand'] == "+" :
88 GFF[chrom][gene]['start'] = GFF[chrom][gene]['exon'][1]['start']
89 else :
90 GFF[chrom][gene]['stop'] = GFF[chrom][gene]['exon'][exon_number]['stop']
91 return GFF
92 except Exception, e:
93 stop_err('Error during gff storage : ' + str(e))
94
95 #chrI SGD gene 87286 87752 . + . ID=YAL030W;Name=YAL030W;gene=SNC1;Alias=SNC1;Ontology_term=GO:0005484,GO:0005768,GO:0005802,GO:0005886,GO:0005935,GO:0006887,GO:0006893,GO:000689
96 #7,GO:0006906,GO:0030658,GO:0031201;Note=Vesicle%20membrane%20receptor%20protein%20%28v-SNARE%29%3B%20involved%20in%20the%20fusion%20between%20Golgi-derived%20secretory%20vesicles%20with%20the%20plasma%20membra
97 #ne%3B%20proposed%20to%20be%20involved%20in%20endocytosis%3B%20member%20of%20the%20synaptobrevin%2FVAMP%20family%20of%20R-type%20v-SNARE%20proteins%3B%20SNC1%20has%20a%20paralog%2C%20SNC2%2C%20that%20arose%20fr
98 #om%20the%20whole%20genome%20duplication;display=Vesicle%20membrane%20receptor%20protein%20%28v-SNARE%29;dbxref=SGD:S000000028;orf_classification=Verified
99 #chrI SGD CDS 87286 87387 . + 0 Parent=YAL030W_mRNA;Name=YAL030W_CDS;orf_classification=Verified
100 #chrI SGD CDS 87501 87752 . + 0 Parent=YAL030W_mRNA;Name=YAL030W_CDS;orf_classification=Verified
101
102
103
104 def init_codon_dict():
105
106 Codon_dict = OrderedDict([('AAA', 0), ('AAC', 0), ('AAG', 0), ('AAT', 0), ('ACA', 0), ('ACC', 0), ('ACG', 0), ('ACT', 0), ('AGA', 0), ('AGC', 0), ('AGG', 0), ('AGT', 0), ('ATA', 0), ('ATC', 0), ('ATG', 0), ('ATT', 0), ('CAA', 0), ('CAC', 0), ('CAG', 0), ('CAT', 0), ('CCA', 0), ('CCC', 0), ('CCG', 0), ('CCT', 0), ('CGA', 0), ('CGC', 0), ('CGG', 0), ('CGT', 0), ('CTA', 0), ('CTC', 0), ('CTG', 0), ('CTT', 0), ('GAA', 0), ('GAC', 0), ('GAG', 0), ('GAT', 0), ('GCA', 0), ('GCC', 0), ('GCG', 0), ('GCT', 0), ('GGA', 0), ('GGC', 0), ('GGG', 0), ('GGT', 0), ('GTA', 0), ('GTC', 0), ('GTG', 0), ('GTT', 0), ('TAA', 0), ('TAC', 0), ('TAG', 0), ('TAT', 0), ('TCA', 0), ('TCC', 0), ('TCG', 0), ('TCT', 0), ('TGA', 0), ('TGC', 0), ('TGG', 0), ('TGT', 0), ('TTA', 0), ('TTC', 0), ('TTG', 0), ('TTT', 0)])
107 return Codon_dict
108
109
110
111 def get_codon_usage(bamfile, GFF, site, kmer, a_pos):
112 '''
113 Read GFF dict and get gene codon usage.
114 Return dict of codons usage
115 '''
116 try:
117 codon = init_codon_dict()
118
119 for chrom in GFF.iterkeys():
120 for gene in GFF[chrom] :
121 codon_dict = init_codon_dict()
122 start = GFF[chrom][gene]['start']
123 stop = GFF[chrom][gene]['stop']
124 region = chrom + ':' + str(start) + '-' + str(stop)
125 # #get all reads in this gene
126 reads = subprocess.check_output(["samtools", "view", bamfile, region])
127 head = subprocess.check_output(["samtools", "view", "-H", bamfile])
128 read_tab = reads.split('\n')
129 for read in read_tab:
130 # # search mapper for eliminate multiple alignements
131 if 'bowtie' in head:
132 multi_tag = "XS:i:"
133 elif 'bwa' in head:
134 multi_tag = "XT:A:R"
135 else :
136 stop_err("No PG tag find in"+samfile+". Please use bowtie or bwa for mapping")
137 if len(read) == 0:
138 continue
139
140 len_read = len(read.split('\t')[9])
141 # if it's read of good length
142 if len_read == kmer and multi_tag not in read:
143 feat = read.split('\t')
144 seq = feat[9]
145 # if it's a reverse read
146 if feat[1] == '16' :
147 if site == "A" :
148 # #get A-site
149 cod = str(Seq(seq[a_pos-5:a_pos-2]).reverse_complement())
150 elif site == "P" :
151 # #get P-site
152 cod = str(Seq(seq[a_pos-2:a_pos+1]).reverse_complement())
153 else :
154 # #get site-E
155 cod = str(Seq(seq[a_pos+1:a_pos+4]).reverse_complement())
156 # # test if it's a true codon not a CNG codon for example
157 if codon_dict.has_key(cod) :
158 codon_dict[cod] += 1
159 # if it's a forward read
160 elif feat[1] == '0' :
161 if site == "A" :
162 # #get A-site
163 cod = seq[a_pos:a_pos+3]
164 elif site == "P" :
165 # #get P-site
166 cod = seq[a_pos-3:a_pos]
167 else :
168 # #get site-E
169 cod = seq[a_pos-6:a_pos-3]
170 if codon_dict.has_key(cod) :
171 codon_dict[cod] += 1
172 # # add in global dict
173 for cod, count in codon_dict.iteritems() :
174 codon[cod] += count
175
176 return codon
177
178 except Exception, e:
179 stop_err('Error during codon usage calcul: ' + str(e))
180
181
182 '''
183 http://pyinsci.blogspot.fr/2009/09/violin-plot-with-matplotlib.html
184 '''
185 def violin_plot(ax, data, pos, bp=False):
186 '''
187 create violin plots on an axis
188 '''
189 dist = max(pos) - min(pos)
190 w = min(0.15 * max(dist, 1.0), 0.5)
191 for d, p in zip(data, pos):
192 k = stats.gaussian_kde(d) # calculates the kernel density
193 m = k.dataset.min() # lower bound of violin
194 M = k.dataset.max() # upper bound of violin
195 x = arange(m, M, (M - m) / 100.) # support for violin
196 v = k.evaluate(x) # violin profile (density curve)
197 v = v / v.max() * w # scaling the violin to the available space
198 ax.fill_betweenx(x, p, v + p, facecolor=color1, alpha=0.3)
199 ax.fill_betweenx(x, p, -v + p, facecolor=color2, alpha=0.3)
200 if bp:
201 ax.boxplot(data, notch=1, positions=pos, vert=1)
202
203
204
205 '''
206 http://log.ooz.ie/2013/02/matplotlib-comparative-histogram-recipe.html
207 '''
208 def comphist(x1, x2, orientation='vertical', **kwargs):
209 """Draw a comparative histogram."""
210 # Split keyword args:
211 kwargs1 = {}
212 kwargs2 = {}
213 kwcommon = {}
214 for arg in kwargs:
215 tgt_arg = arg[:-1]
216 if arg.endswith('1'):
217 arg_dict = kwargs1
218 elif arg.endswith('2'):
219 arg_dict = kwargs2
220 else:
221 arg_dict = kwcommon
222 tgt_arg = arg
223 arg_dict[tgt_arg] = kwargs[arg]
224 kwargs1.update(kwcommon)
225 kwargs2.update(kwcommon)
226
227 fig = pl.figure()
228
229 # Have both histograms share one axis.
230 if orientation == 'vertical':
231 ax1 = pl.subplot(211)
232 ax2 = pl.subplot(212, sharex=ax1)
233 # Flip the ax2 histogram horizontally.
234 ax2.set_ylim(ax1.get_ylim()[::-1])
235 pl.setp(ax1.get_xticklabels(), visible=False)
236 legend_loc = (1, 4)
237 else:
238 ax1 = pl.subplot(122)
239 ax2 = pl.subplot(121, sharey=ax1)
240 # Flip the ax2 histogram vertically.
241 ax2.set_xlim(ax2.get_xlim()[::-1])
242 pl.setp(ax1.get_yticklabels(), visible=False)
243 legend_loc = (1, 2)
244
245 ax1.hist(x1, orientation=orientation, **kwargs1)
246 ax2.hist(x2, orientation=orientation, **kwargs2)
247 ax2.set_ylim(ax1.get_ylim()[::-1])
248 ax1.legend(loc=legend_loc[0])
249 ax2.legend(loc=legend_loc[1])
250 # Tighten up the layout.
251 pl.subplots_adjust(wspace=0.0, hspace=0.0)
252 return fig
253
254
255 def compute_FC_plot(cond1_norm, cond2_norm, cod_name, codon_to_test, dirout):
256
257 FC_tab = []
258 for z, y in zip(cond1_norm.itervalues(), cond2_norm.itervalues()):
259 fc = z - y
260 FC_tab.append(fc)
261 # #codon_to_test = ['TGA','TAG','TAA']
262
263 a = []
264 b = []
265 cod = []
266 for codon in cond1_norm.iterkeys():
267 if codon in codon_to_test :
268 fc = cond1_norm[codon] - cond2_norm[codon]
269 b.append(fc)
270 cod.append(codon)
271 else :
272 fc = cond1_norm[codon] - cond2_norm[codon]
273 a.append(fc)
274
275
276 fig = pl.figure(num=1)
277 comphist(array(a), array(b), label1='All codon', label2=cod_name, color2='green', bins=30, rwidth=1)
278 # pl.show()
279 pl.savefig(dirout + '/hist_codon_fc.png', format="png", dpi=340)
280 pl.clf()
281
282
283 # #violin plot
284 pos = range(2)
285 dat = array([array(a), array(b)])
286 fig = pl.figure()
287 pl.title("Distribution of codons FoldChange between two conditions")
288 ax = fig.add_subplot(1, 1, 1)
289 lab = array(['All codons', cod_name])
290 violin_plot(ax, dat, pos, bp=1)
291 for x, z in zip(dat, pos):
292 ax.plot(z, average(x), color='r', marker='*', markeredgecolor='r')
293 xtickNames = pl.setp(ax, xticklabels=lab)
294 pl.savefig(dirout + '/violinplot_codon.png', format="png", dpi=340)
295 pl.clf()
296
297 # (Fval,pval) = stats.ttest_ind(a, b, axis=0, equal_var=True)
298 (Fval, pval) = stats.mannwhitneyu(a, b)
299 return pval
300
301
302 def get_aa_dict(cond1_norm, cond2_norm):
303
304 # ## create amino acid dictionnary:
305 AA = OrderedDict({})
306 AA['Phe'] = [cond1_norm['TTT'] + cond1_norm['TTC'], cond2_norm['TTT'] + cond2_norm['TTC']]
307 AA['Leu'] = [cond1_norm['TTA'] + cond1_norm['TTG'] + cond1_norm['CTT'] + cond1_norm['CTC'] + cond1_norm['CTA'] + cond1_norm['CTG'], cond2_norm['TTA'] + cond2_norm['TTG'] + cond2_norm['CTT'] + cond2_norm['CTC'] + cond2_norm['CTA'] + cond2_norm['CTG']]
308 AA['Ile'] = [cond1_norm['ATT'] + cond1_norm['ATC'] + cond1_norm['ATA'], cond2_norm['ATT'] + cond2_norm['ATC'] + cond2_norm['ATA']]
309 AA['Met'] = [cond1_norm['ATG'], cond2_norm['ATG']]
310 AA['Val'] = [cond1_norm['GTT'] + cond1_norm['GTC'] + cond1_norm['GTA'] + cond1_norm['GTG'] + cond1_norm['AGT'] + cond1_norm['AGC'], cond2_norm['GTT'] + cond2_norm['GTC'] + cond2_norm['GTA'] + cond2_norm['GTG'] + cond2_norm['AGT'] + cond2_norm['AGC']]
311 AA['Ser'] = [cond1_norm['TCT'] + cond1_norm['TCC'] + cond1_norm['TCA'] + cond1_norm['TCG'], cond2_norm['TCT'] + cond2_norm['TCC'] + cond2_norm['TCA'] + cond2_norm['TCG']]
312 AA['Pro'] = [cond1_norm['CCT'] + cond1_norm['CCC'] + cond1_norm['CCA'] + cond1_norm['CCG'], cond2_norm['CCT'] + cond2_norm['CCC'] + cond2_norm['CCA'] + cond2_norm['CCG']]
313 AA['Thr'] = [cond1_norm['ACT'] + cond1_norm['ACC'] + cond1_norm['ACA'] + cond1_norm['ACG'], cond2_norm['ACT'] + cond2_norm['ACC'] + cond2_norm['ACA'] + cond2_norm['ACG']]
314 AA['Ala'] = [cond1_norm['GCT'] + cond1_norm['GCC'] + cond1_norm['GCA'] + cond1_norm['GCG'], cond2_norm['GCT'] + cond2_norm['GCC'] + cond2_norm['GCA'] + cond2_norm['GCG']]
315 AA['Tyr'] = [cond1_norm['TAT'] + cond1_norm['TAC'], cond2_norm['TAT'] + cond2_norm['TAC']]
316 AA['Stop'] = [cond1_norm['TAA'] + cond1_norm['TAG'] + cond1_norm['TGA'], cond2_norm['TAA'] + cond2_norm['TAG'] + cond2_norm['TGA']]
317 AA['His'] = [cond1_norm['CAT'] + cond1_norm['CAC'], cond2_norm['CAT'] + cond2_norm['CAC']]
318 AA['Gln'] = [cond1_norm['CAA'] + cond1_norm['CAG'], cond2_norm['CAA'] + cond2_norm['CAG']]
319 AA['Asn'] = [cond1_norm['AAT'] + cond1_norm['AAC'], cond2_norm['AAT'] + cond2_norm['AAC']]
320 AA['Lys'] = [cond1_norm['AAA'] + cond1_norm['AAG'], cond2_norm['AAA'] + cond2_norm['AAG']]
321 AA['Asp'] = [cond1_norm['GAT'] + cond1_norm['GAC'], cond2_norm['GAT'] + cond2_norm['GAC']]
322 AA['Glu'] = [cond1_norm['GAA'] + cond1_norm['GAG'], cond2_norm['GAA'] + cond2_norm['GAG']]
323 AA['Cys'] = [cond1_norm['TGT'] + cond1_norm['TGC'], cond2_norm['TGT'] + cond2_norm['TGC']]
324 AA['Trp'] = [cond1_norm['TGG'], cond2_norm['TGG']]
325 AA['Arg'] = [cond1_norm['CGT'] + cond1_norm['CGC'] + cond1_norm['CGA'] + cond1_norm['CGG'] + cond1_norm['AGA'] + cond1_norm['AGG'], cond2_norm['CGT'] + cond2_norm['CGC'] + cond2_norm['CGA'] + cond2_norm['CGG'] + cond2_norm['AGA'] + cond2_norm['AGG']]
326 AA['Gly'] = [cond1_norm['GGT'] + cond1_norm['GGC'] + cond1_norm['GGA'] + cond1_norm['GGG'], cond2_norm['GGT'] + cond2_norm['GGC'] + cond2_norm['GGA'] + cond2_norm['GGG']]
327
328
329 return AA
330
331
332
333 def plot_codon_usage(result, dirout, c1, c2, outfile, color1, color2):
334 '''
335 Take list of dict of codon usage and use matplotlib for do graph
336 '''
337
338 # #if there are replicat
339 if len(result) == 4 :
340 # store each dict in variables to make code more readable
341 cond1_1 = result[0].copy()
342 cond1_2 = result[1].copy()
343 cond2_1 = result[2].copy()
344 cond2_2 = result[3].copy()
345 # get codon order in one of list
346 codon_sorted = sorted(cond1_1.iterkeys(), reverse=False)
347 # get max of each list
348 sum11 = sum(list(cond1_1.itervalues()))
349 sum12 = sum(list(cond1_2.itervalues()))
350 sum21 = sum(list(cond2_1.itervalues()))
351 sum22 = sum(list(cond2_2.itervalues()))
352 # for each codon, get values and sd in each condition
353 cond1_val = {}
354 cond1 = {}
355 cond2_val = {}
356 cond2 = {}
357 std_cond1 = []
358 std_cond2 = []
359 max_val = [] # # max value for graph
360 for i in codon_sorted:
361 # # cond1 = moyenne of replicats cond1 divided by max
362 cond1_val[i] = ((cond1_1[i] / sum11 + cond1_2[i] / sum12) / 2)
363 cond1[i] = ((cond1_1[i] + cond1_2[i]) / 2)
364 # # standard deviation = absolute value of diffence between replicats of cond1
365 std_cond1.append(std(array([(cond1_1[i] * 100 / sum11), (cond1_2[i] * 100 / sum12)])))
366 # # cond2 = moyenne of replicats cond1divided by max
367 cond2_val[i] = ((cond2_1[i] / sum21 + cond2_2[i] / sum22) / 2)
368 cond2[i] = ((cond2_1[i] + cond2_2[i]) / 2)
369 # # standard deviation = absolute value of diffence between replicats of cond2
370 std_cond2.append(std(array([((cond2_1[i]) * 100 / sum21), ((cond2_2[i]) * 100 / sum22)])))
371 # # max value for each codon
372 max_val.append(max((cond1_1[i] / sum11 + cond1_2[i] / sum12) / 2, (cond2_1[i] / sum21 + cond2_2[i] / sum22) / 2))
373
374 # for graph design
375 cond1_norm = OrderedDict(sorted(cond1_val.items(), key=lambda t: t[0]))
376 cond1_norm.update ((x, y * 100) for x, y in cond1_norm.items())
377 cond2_norm = OrderedDict(sorted(cond2_val.items(), key=lambda t: t[0]))
378 cond2_norm.update ((x, y * 100) for x, y in cond2_norm.items())
379 max_val = [x * 100 for x in max_val]
380
381 AA = get_aa_dict(cond1_norm, cond2_norm)
382 max_valaa = []
383 cond1_aa = []
384 cond2_aa = []
385 aa_name = list(AA.iterkeys())
386 for z in AA.itervalues():
387 cond1_aa.append(z[0])
388 cond2_aa.append(z[1])
389 max_valaa.append(max(z))
390 # # plot amino acid profile :
391 fig = pl.figure(figsize=(24, 10), num=1)
392 width = .50
393 ax = fig.add_subplot(111)
394 ax.xaxis.set_ticks([])
395 ind = arange(21)
396 pl.xlim(0, 21)
397 ax.bar(ind, cond1_aa, width, facecolor=color1, label=c1)
398 ax.bar(ind + width, cond2_aa, width, facecolor=color2, label=c2)
399 for x, y, z in zip(ind, max_valaa, aa_name):
400 ax.text(x + width, y + 0.2, '%s' % z, ha='center', va='bottom', fontsize=14)
401 ax.set_ylabel('Ribosome Occupancy (percent of normalized reads)')
402 ax.set_xlabel('Amino Acid')
403 handles, labels = ax.get_legend_handles_labels()
404 ax.legend(handles, labels)
405 pl.savefig(dirout + '/hist_amino_acid.png', format="png", dpi=340)
406 pl.clf()
407
408
409 # # compute theorical count in COND2
410 sum2 = (sum21 + sum22) / 2
411 cond2_count = []
412 for z in cond1_norm.itervalues() :
413 count = int(z * sum2 / 100)
414 cond2_count.append(count)
415
416 expected = array(cond2_count)
417 observed = array(list(cond2.itervalues()))
418
419 # write result
420 with open(outfile, 'w') as out :
421 out.write('Codon\tRaw_' + c1 + '\tRaw_' + c2 + '\tNorm_' + c1 + '\tNorm_' + c2 + '\tFC\tFC_' + c1 + '\tFC_' + c2 + '\n')
422 for i in codon_sorted:
423 out.write(i + '\t' + str(cond1[i]) + '\t' + str(cond2[i]) + '\t' + str(cond1_norm[i]) + '\t' + str(cond2_norm[i]) + '\t' + str(cond2_norm[i] / cond1_norm[i]) + '\t' + str((cond2_1[i] / sum21) / (cond1_1[i] / sum11)) + '\t' + str((cond2_2[i] / sum22) / (cond1_1[i] / sum11)) + '\n')
424 chi = stats.chisquare(observed, expected)
425 out.write('Khi2 test\n')
426 out.write('T : ' + str(chi[0]) + '; p-value : ' + str(chi[1]) + '\n')
427
428
429
430 # plot result
431 fig = pl.figure(figsize=(24, 10), num=1)
432 width = .50
433 ind = arange(len(codon_sorted))
434 ax = fig.add_subplot(111)
435 pl.xlim(0, len(codon_sorted) + 1)
436 ax.spines['right'].set_color('none')
437 ax.spines['top'].set_color('none')
438 ax.xaxis.set_ticks([])
439 ax.spines['left'].set_smart_bounds(True)
440 ax.yaxis.set_ticks_position('left')
441 ax.bar(ind, list(cond1_norm.itervalues()), width, facecolor=color1, yerr=std_cond1, error_kw={'elinewidth':1, 'ecolor':'black'}, label=c1)
442 ax.bar(ind + width, list(cond2_norm.itervalues()), width, yerr=std_cond2, facecolor=color2, error_kw={'elinewidth':1, 'ecolor':'black'}, label=c2)
443 for x, y, z in zip(ind, max_val, codon_sorted):
444 ax.text(x + width, y + 0.2, '%s' % z, ha='center', va='bottom', fontsize=8)
445 ax.set_ylabel('Ribosome Occupancy (percent of normalized reads)')
446 ax.set_xlabel('Codons')
447 handles, labels = ax.get_legend_handles_labels()
448 ax.legend(handles, labels)
449 pl.savefig(dirout + '/hist_codons.png', format="png", dpi=340)
450 pl.clf()
451
452
453
454 elif len(result) == 2 :
455
456 # store each dict in OrderedDict sorted by key to make code more readable
457 cond1 = result[0]
458 cond2 = result[1]
459 cond1_norm = result[0].copy()
460 cond2_norm = result[1].copy()
461 # pdb.set_trace()
462 # get codon order in one of list
463 codon_sorted = sorted(cond1.iterkeys(), reverse=False)
464
465 # get sum of each list
466 sum1 = sum(list(cond1.itervalues()))
467 sum2 = sum(list(cond2.itervalues()))
468 # #Normalize values by sum of each libraries
469 cond1_norm.update ((x, (y / sum1) * 100.0) for x, y in cond1_norm.items())
470 cond2_norm.update((x, (y / sum2) * 100.0) for x, y in cond2_norm.items())
471
472 # # compute theorical count in COND2
473 cond2_count = []
474 for z in cond1_norm.itervalues() :
475 count = int(z * sum2 / 100.0)
476 cond2_count.append(count)
477
478 expected = array(cond2_count)
479 observed = array(list(cond2.itervalues()))
480
481 AA = get_aa_dict(cond1_norm, cond2_norm)
482
483 max_val = []
484 cond1_aa = []
485 cond2_aa = []
486 aa_name = list(AA.iterkeys())
487 for z in AA.itervalues():
488 cond1_aa.append(z[0])
489 cond2_aa.append(z[1])
490 max_val.append(max(z))
491
492 # # plot amino acid profile :
493 fig = pl.figure(num=1)
494 width = .35
495 ax = fig.add_subplot(111)
496 ind = arange(21)
497 pl.xlim(0, 21)
498 #kwargs = {"hatch":'x'}
499 #ax.bar(ind, cond1_aa, width, facecolor=color1, label=c1, **kwargs)
500 #kwargs = {"hatch":'.'}
501 #ax.bar(ind + width, cond2_aa, width, facecolor=color2, label=c2, **kwargs)
502 ax.bar(ind, cond1_aa, width, facecolor=color1, label=c1)
503 ax.bar(ind + width, cond2_aa, width, facecolor=color2, label=c2)
504 #for x, y, z in zip(ind, max_val, aa_name):
505 # ax.text(x + width, y + 0.2, '%s' % z, ha='center', va='bottom', fontsize=14)
506 axis_font = {'size':'16'}
507 pl.xticks(ind + width, aa_name,**axis_font)
508 ax.spines['right'].set_visible(False)
509 ax.spines['top'].set_visible(False)
510 ax.yaxis.set_ticks_position('left')
511 ax.xaxis.set_ticks_position('bottom')
512 #ax.xaxis.set_ticks([])
513 ax.set_ylabel('Ribosome Occupancy (percent of normalized reads)',**axis_font)
514 ax.set_xlabel('Amino Acids', **axis_font)
515 handles, labels = ax.get_legend_handles_labels()
516 font_prop = font_manager.FontProperties(size=12)
517 ax.legend(handles, labels, prop=font_prop)
518 pl.savefig(dirout + '/hist_amino_acid.png', format="png", dpi=340)
519 pl.clf()
520
521 # write result
522 with open(outfile, 'w') as out :
523 out.write('Codon\tRaw_' + c1 + '\tRaw_' + c2 + '\tNorm_' + c1 + '\tNorm_' + c2 + '\tFC(Mut/WT)\n')
524 for i in codon_sorted:
525 out.write(i + '\t' + str(cond1[i]) + '\t' + str(cond2[i]) + '\t' + str(cond1_norm[i]) + '\t' + str(cond2_norm[i]) + '\t' + str(cond2_norm[i] / cond1_norm[i]) + '\n')
526 out.write('Khi2 test\n')
527 chi = stats.chisquare(observed, expected)
528 out.write('T : ' + str(chi[0]) + '; p-value : ' + str(chi[1]) + '\n')
529
530 # # get max value for each codon for histogram
531 max_val = [] # # max value for graph
532 for i in cond1:
533 # # max value for each codon
534 max_val.append(max(cond1_norm[i], cond2_norm[i]))
535
536 # plot result
537 fig = pl.figure(figsize=(24, 10), num=1)
538 width = .50
539 ind = arange(len(codon_sorted))
540 ax = fig.add_subplot(111)
541 pl.xlim(0, len(codon_sorted) + 1)
542 ax.spines['right'].set_color('none')
543 ax.spines['top'].set_color('none')
544 ax.xaxis.set_ticks([])
545 ax.spines['left'].set_smart_bounds(True)
546 ax.yaxis.set_ticks_position('left')
547 ax.bar(ind, list(cond1_norm.itervalues()), width, facecolor=color1, label=c1)
548 ax.bar(ind + width, list(cond2_norm.itervalues()), width, facecolor=color2, label=c2)
549 for x, y, z in zip(ind, max_val, codon_sorted):
550 ax.text(x + width, y + 0.02, '%s' % z, ha='center', va='bottom', fontsize=8)
551 ax.set_ylabel('Ribosome Occupancy (percent of normalized reads)')
552 ax.set_xlabel('Codons')
553 handles, labels = ax.get_legend_handles_labels()
554 ax.legend(handles, labels)
555 pl.savefig(dirout + '/hist_codons.png', format="png", dpi=340)
556 pl.clf()
557
558
559 else :
560 stop_err('Error running codon usage plotting : ' + str(e))
561
562
563 return (cond1_norm, cond2_norm, chi[1])
564
565 def write_html_file(html, chi_pval, cond1, cond2):
566 try :
567
568
569 html_str = """
570 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
571 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
572
573 <html xmlns="http://www.w3.org/1999/xhtml">
574 <head>
575 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
576 <link href="/static/june_2007_style/blue/base.css" media="screen" rel="Stylesheet" type="text/css" />
577 </head>
578 <body>
579 <h3>Global visualization</h3>
580 <p>
581 <h5>Visualization of density footprint in each codon.</h5><br> If user has selected analyse with replicats, standart error deviation between each replicate as plotting as error bar in histogram.<br>
582 <img border="0" src="hist_codons.png" width="1040"/>
583 </p>
584 <p>
585 <h5>Test for homogeneity distribution between each condition</h5><br>
586 H0 : %s and %s are same distribution <br>
587 Khi2 test p-value: %s<br><br>
588 If p-value less than 0.05, we can reject homogeneity distribution so we can hypothesize that distributions are not the same. Otherwise, we accept H0<br>
589
590 </p>
591 <p>
592 <h5>Visualization of density footprint in each codon groupe by amino acid</h5><br>
593 <img border="0" src="hist_amino_acid.png" width="1040"/>
594 </p>
595 </body>
596 </html> """ % (cond1,cond2,chi_pval)
597
598
599 html_file = open(html, "w")
600 html_file.write(html_str)
601 html_file.close()
602
603 except Exception, e :
604 stop_err('Error during html page creation : ' + str(e))
605
606
607
608
609 def check_codons_list (codons) :
610
611 for codon in codons :
612 if codon not in init_codon_dict().iterkeys() :
613 stop_err('Please to enter a valid codon : ' + codon + ' is not find\n')
614
615
616 def check_index_bam (bamfile) :
617 # #testing indexed bam file
618 if os.path.isfile(bamfile + ".bai") :
619 pass
620 else :
621 cmd = "samtools index %s " % (bamfile)
622 proc = subprocess.Popen(args=cmd, shell=True, stderr=subprocess.PIPE)
623 returncode = proc.wait()
624 # if returncode != 0:
625 # raise Exception
626
627 def __main__():
628 '''
629 python /home/rlegendre/galaxy/galaxy-dist/tools/rib_profiling/get_codon_frequency.py -i /home/rlegendre/galaxy/galaxy-dist/SharedData/Ribo/Saccer3.fa -g Saccer3.gff -t tAI.csv -1 psiM1_sorted.bam,psiM2_sorted.bam -2 psiP1_sorted.bam,psiP2_sorted.bam -c psiM -C psiP -l TAG,TAA,TGA -r yes -o psi_count -d psi.html,html_dir > log2
630 python /home/rlegendre/galaxy/galaxy-dist/tools/rib_profiling/get_codon_frequency.py -i /home/rlegendre/galaxy/galaxy-dist/SharedData/Ribo/Saccer3.fa -g Saccer3.gff -t tAI.csv -c psiM -C psiP -1 RPF_psi-_28sorted.bam -2 RPF_psi+_28sorted.bam -l TAG,TAA,TGA -n Stop Codon -r no -o psi_count -d psi.html,html_dir > log2
631 '''
632
633 # Parse command line options
634 parser = optparse.OptionParser()
635 parser.add_option("-g", "--gff", dest="gff", type="string",
636 help="gff file", metavar="FILE")
637
638 parser.add_option("-1", "--file1", dest="file1", type="string",
639 help="Bam Ribo-Seq alignments cond 1, if rep option, separate files by commas ", metavar="FILE")
640
641 parser.add_option("-2", "--file2", dest="file2", type="string",
642 help="Bam Ribo-Seq alignments cond 2, if rep option, separate files by commas", metavar="FILE")
643
644 parser.add_option("-c", "--cond1", dest="c1", type="string",
645 help="Name for first condition", metavar="STR")
646
647 parser.add_option("-C", "--cond2", dest="c2", type="string",
648 help="Name of second condition", metavar="STR")
649
650 parser.add_option("-k", "--kmer", dest="kmer", type="int",
651 help="Longer of your phasing reads", metavar="INT")
652
653 # parser.add_option("-l", "--list", dest="list_cod", type= "string",
654 # help="list of codons to compare to other", metavar="STR")
655
656 parser.add_option("-o", "--out", dest="outfile", type="string",
657 help="write report to FILE", metavar="FILE")
658
659 parser.add_option("-d", "--dirout", dest="dirout", type="string",
660 help="write report to PNG files", metavar="FILE")
661
662 parser.add_option("-a", "--asite", dest="asite", type="int",
663 help="Off-set from the 5'end of the footprint to the A-site", metavar="INT")
664
665 parser.add_option("-s", "--site", dest="site", type="string",
666 help="Script can compute in site A, P or E", metavar="A|P|E")
667
668 parser.add_option("-r", "--rep", dest="rep", type="string",
669 help="if replicate or not", metavar="yes|no")
670
671 parser.add_option("-x", "--hex_col1", dest="color1", type= "string",
672 help="Color for first condition", metavar="STR")
673
674 parser.add_option("-X", "--hex_col2", dest="color2", type= "string",
675 help="Color for second condition", metavar="STR")
676
677 parser.add_option("-q", "--quiet",
678 action="store_false", dest="verbose", default=True,
679 help="don't print status messages to stdout")
680
681 (options, args) = parser.parse_args()
682 print "Begin codon frequency analysis at", time.asctime(time.localtime(time.time()))
683
684 try:
685 authorized_site = ["A", "P", "E"]
686 if options.site not in authorized_site :
687 stop_err(options.site + ' is not a authorized ribosome site')
688
689 ## Check if colors exist
690 if not colors.is_color_like(options.color1) :
691 stop_err( options.color1+' is not a proper color' )
692 if not colors.is_color_like(options.color2) :
693 stop_err( options.color2+' is not a proper color' )
694
695 GFF = store_gff(options.gff)
696
697 #### NOT USE IN FINAL VERSION
698 # # get codon list
699 # codons = options.list_cod.upper().split(',')
700 # check_codons_list(codons)
701
702 # # get html file and directory :
703 (html, html_dir) = options.dirout.split(',')
704 if os.path.exists(html_dir):
705 raise
706 try:
707 os.mkdir(html_dir)
708 except:
709 raise Exception(html_dir + ' mkdir')
710 # #RUN analysis
711 # #If there are replicats
712 if options.rep == "yes" :
713 result = []
714 # split name of each file options by ","
715 cond1 = options.file1.split(',')
716 cond2 = options.file2.split(',')
717 # # calcul for each file
718 for fh in itertools.chain(cond1, cond2):
719 check_index_bam (fh)
720 result.append(get_codon_usage(fh, GFF, options.site, options.kmer, options.asite))
721 (cond1, cond2, chi_pval) = plot_codon_usage(result, html_dir, options.c1, options.c2, options.outfile,options.color1, options.color2)
722 # t_pval = compute_FC_plot(cond1,cond2,codons,html_dir)
723
724
725 # #If there are no replicat
726 elif options.rep == "no" :
727 result = []
728 # #calcul for each cond
729 for fh in (options.file1, options.file2):
730 check_index_bam (fh)
731 result.append(get_codon_usage(fh, GFF, options.site, options.kmer,options.asite))
732 (cond1, cond2, chi_pval) = plot_codon_usage(result, html_dir, options.c1, options.c2, options.outfile,options.color1, options.color2)
733 # t_pval = compute_FC_plot(cond1,cond2,codons,html_dir)
734
735 else :
736 sys.stderr.write("Please enter yes or no for --rep option. Programme aborted at %s" % time.asctime(time.localtime(time.time())))
737 sys.exit()
738
739 # write_html_file(html,chi_pval,t_pval,codons,options.c1, options.c2)
740 write_html_file(html, chi_pval, options.c1, options.c2)
741
742 print "Finish codon frequency analysis at", time.asctime(time.localtime(time.time()))
743 except Exception, e:
744 stop_err('Error running codon frequency analysis (main program) : ' + str(e))
745
746
747 if __name__=="__main__":
748 __main__()