0
|
1 #!/usr/bin/env python2.7
|
|
2 # -*- coding: utf-8 -*-
|
|
3
|
|
4 '''
|
|
5 Created on sep. 2013
|
|
6 @author: rachel legendre
|
|
7 @copyright: rachel.legendre@igmors.u-psud.fr
|
|
8 @license: GPL v3
|
|
9 '''
|
|
10
|
|
11 from __future__ import division
|
|
12 import os, sys, optparse, tempfile, subprocess, re, shutil, commands, urllib, time
|
|
13 import itertools
|
15
|
14 from math import log10
|
0
|
15 from decimal import Decimal
|
|
16 from Bio import SeqIO
|
|
17 from Bio.Seq import Seq
|
|
18 from numpy import arange, std, array, linspace, average
|
|
19 #from matplotlib import pyplot as pl
|
|
20 import matplotlib
|
|
21 matplotlib.use('Agg')
|
|
22 import matplotlib.pyplot as pl
|
|
23 from matplotlib import font_manager
|
|
24 from matplotlib import colors
|
|
25 import csv
|
|
26 from scipy import stats
|
|
27 from collections import OrderedDict
|
10
|
28 import ribo_functions
|
|
29 import HTSeq
|
0
|
30 # #libraries for debugg
|
13
|
31 #import pdb
|
15
|
32 import cPickle
|
0
|
33
|
|
def stop_err(msg):
    '''
    Report a fatal error on stderr and terminate the program.

    Writes msg, then a time-stamped "Programme aborted" line, to standard
    error and raises SystemExit with status 1.

    msg : human-readable description of the error
    '''
    sys.stderr.write("%s\n" % msg)
    sys.stderr.write("Programme aborted at %s\n" % time.asctime(time.localtime(time.time())))
    # A bare sys.exit() would report success (status 0) to the caller even
    # though the run was aborted.
    sys.exit(1)
|
|
38
|
|
39
|
|
def init_codon_dict():
    '''
    Build a fresh codon counter.

    Return an OrderedDict whose keys are the 64 DNA triplets over A, C, G, T
    in lexicographic order ('AAA' first, 'TTT' last), every value set to 0.
    '''
    triplets = (''.join(bases) for bases in itertools.product('ACGT', repeat=3))
    return OrderedDict((triplet, 0) for triplet in triplets)
|
|
44
|
|
45
|
|
46
|
|
def get_codon_usage(bamfile, GFF, site, kmer, a_pos):
    '''
    Count the codons found at one ribosome site over all annotated genes.

    For every feature of type 'gene' yielded by GFF, the reads overlapping
    the gene (trimmed by 50 positions at each end) are fetched with
    `samtools view`. A read is counted when its sequence length equals kmer
    and it is not marked as a multiple alignment; the triplet located at the
    requested site is then added to the tally if it is one of the 64 plain
    A/C/G/T codons.

    bamfile : path of the BAM file handed to samtools
    GFF     : iterable of features exposing .type, .iv.chrom, .iv.start, .iv.end
    site    : "A" or "P"; any other value selects the E-site
    kmer    : read length to keep
    a_pos   : offset from the 5' end of the read to the A-site

    Return an OrderedDict mapping each codon to its count.
    Any exception is reported through stop_err, which ends the program.
    '''
    try:
        codon = init_codon_dict()

        # The header does not depend on the gene, so it is read once and the
        # mapper is identified once instead of for every read.
        head = subprocess.check_output(["samtools", "view", "-H", bamfile])
        multi_tag = "XS:i:"  # bowtie tag
        tag = "IH:i:1"  # RUM tag
        if 'bowtie' in head:
            multi_tag = "XS:i:"
        elif 'bwa' in head:
            multi_tag = "XT:A:R"
        elif 'TopHat' in head:
            tag = "NH:i:1"
        else:
            stop_err("No PG tag find in " + bamfile + ". Please use bowtie, bwa or Tophat for mapping")

        for feature in GFF:
            if feature.type != 'gene':
                continue
            start = feature.iv.start
            stop = feature.iv.end
            if start + 50 >= stop - 50:
                # Nothing is left of this gene once both ends are trimmed:
                # skip it, but keep going through the remaining genes.
                continue
            region = feature.iv.chrom + ':' + str(start + 50) + '-' + str(stop - 50)

            # get all reads in this gene
            reads = subprocess.check_output(["samtools", "view", bamfile, region])
            for read in reads.split('\n'):
                if len(read) == 0:
                    continue
                feat = read.split('\t')
                seq = feat[9]
                # keep reads of the requested length that are uniquely mapped
                if len(seq) != kmer or not (tag in read or multi_tag not in read):
                    continue

                if feat[1] == '16':
                    # Reverse read.
                    # NOTE(review): these slices depend on a_pos only; they
                    # mirror the forward-strand slices when
                    # kmer == 2 * a_pos - 2 (true for 28 / 15) - confirm for
                    # other kmer / a_pos combinations.
                    if site == "A":
                        cod = str(Seq(seq[a_pos - 5:a_pos - 2]).reverse_complement())
                    elif site == "P":
                        cod = str(Seq(seq[a_pos - 2:a_pos + 1]).reverse_complement())
                    else:
                        cod = str(Seq(seq[a_pos + 1:a_pos + 4]).reverse_complement())
                elif feat[1] == '0':
                    # Forward read.
                    if site == "A":
                        cod = seq[a_pos:a_pos + 3]
                    elif site == "P":
                        cod = seq[a_pos - 3:a_pos]
                    else:
                        cod = seq[a_pos - 6:a_pos - 3]
                else:
                    # Only flags 0 and 16 are counted.
                    continue

                # Ignore anything that is not a true codon (a CNG triplet,
                # for example).
                if cod in codon:
                    codon[cod] += 1

        return codon

    except Exception as e:
        stop_err('Error during codon usage calcul: ' + str(e))
|
|
136
|
|
137
|
10
|
138
|
|
139
|
0
|
140 '''
|
|
141 http://pyinsci.blogspot.fr/2009/09/violin-plot-with-matplotlib.html
|
|
142 '''
|
|
def violin_plot(ax, data, pos, bp=False, color1='SkyBlue', color2='Plum'):
    '''
    Draw one violin per data set on an axis.

    ax     : object providing fill_betweenx() and, when bp is true, boxplot()
    data   : sequence of data sets, one per violin
    pos    : sequence of x positions, one per data set
    bp     : when true, a notched box plot of data is drawn on top
    color1 : face colour of the right half of each violin
    color2 : face colour of the left half of each violin

    The colours used to be read from names that are not defined in this
    function's scope, which raised NameError on every call; they are now
    keyword parameters.
    '''
    dist = max(pos) - min(pos)
    w = min(0.15 * max(dist, 1.0), 0.5)
    for d, p in zip(data, pos):
        k = stats.gaussian_kde(d)  # kernel density estimate of the data set
        m = k.dataset.min()  # lower bound of violin
        M = k.dataset.max()  # upper bound of violin
        x = arange(m, M, (M - m) / 100.)  # support for violin
        v = k.evaluate(x)  # violin profile (density curve)
        v = v / v.max() * w  # scale the violin to the available width
        ax.fill_betweenx(x, p, v + p, facecolor=color1, alpha=0.3)
        ax.fill_betweenx(x, p, -v + p, facecolor=color2, alpha=0.3)
    if bp:
        ax.boxplot(data, notch=1, positions=pos, vert=1)
|
|
160
|
|
161
|
|
162
|
|
163 '''
|
|
164 http://log.ooz.ie/2013/02/matplotlib-comparative-histogram-recipe.html
|
|
165 '''
|
|
def comphist(x1, x2, orientation='vertical', **kwargs):
    """Draw two histograms facing each other on a new figure.

    Keyword arguments whose name ends with '1' are passed (without that
    suffix) to the histogram of x1, those ending with '2' to the histogram of
    x2, and all others to both. A shared keyword overrides a suffixed one of
    the same name. Returns the figure that was created.
    """
    first_opts = {}
    second_opts = {}
    shared_opts = {}
    for name, value in kwargs.items():
        if name.endswith('1'):
            first_opts[name[:-1]] = value
        elif name.endswith('2'):
            second_opts[name[:-1]] = value
        else:
            shared_opts[name] = value
    for opts in (first_opts, second_opts):
        opts.update(shared_opts)

    fig = pl.figure()

    # Both histograms share one axis.
    if orientation == 'vertical':
        ax1 = pl.subplot(211)
        ax2 = pl.subplot(212, sharex=ax1)
        # Reverse the y limits of the second histogram.
        ax2.set_ylim(ax1.get_ylim()[::-1])
        pl.setp(ax1.get_xticklabels(), visible=False)
        loc_first, loc_second = 1, 4
    else:
        ax1 = pl.subplot(122)
        ax2 = pl.subplot(121, sharey=ax1)
        # Reverse the x limits of the second histogram.
        ax2.set_xlim(ax2.get_xlim()[::-1])
        pl.setp(ax1.get_yticklabels(), visible=False)
        loc_first, loc_second = 1, 2

    ax1.hist(x1, orientation=orientation, **first_opts)
    ax2.hist(x2, orientation=orientation, **second_opts)
    # Applied for both orientations, once the data has set ax1's limits.
    ax2.set_ylim(ax1.get_ylim()[::-1])
    ax1.legend(loc=loc_first)
    ax2.legend(loc=loc_second)
    # Tighten up the layout.
    pl.subplots_adjust(wspace=0.0, hspace=0.0)
    return fig
|
|
211
|
|
212
|
|
def compute_FC_plot(cond1_norm, cond2_norm, cod_name, codon_to_test, dirout):
    '''
    Compare the codons listed in codon_to_test with all the other codons.

    For every codon the difference cond1_norm[codon] - cond2_norm[codon] is
    computed and put in one of two groups, depending on whether the codon is
    in codon_to_test. The two groups are drawn as a comparative histogram
    (<dirout>/hist_codon_fc.png) and as violin plots
    (<dirout>/violinplot_codon.png), then compared with a Mann-Whitney U test.

    cond1_norm, cond2_norm : mappings codon -> value, sharing the same keys
    cod_name               : legend / tick label of the tested group
    codon_to_test          : container of the codons forming the tested group
    dirout                 : directory receiving the two PNG files

    Return the p-value of the Mann-Whitney U test.
    '''
    others = []
    tested = []
    for codon in cond1_norm:
        diff = cond1_norm[codon] - cond2_norm[codon]
        if codon in codon_to_test:
            tested.append(diff)
        else:
            others.append(diff)

    pl.figure(num=1)
    comphist(array(others), array(tested), label1='All codon', label2=cod_name, color2='green', bins=30, rwidth=1)
    pl.savefig(dirout + '/hist_codon_fc.png', format="png", dpi=340)
    pl.clf()

    # violin plot
    pos = list(range(2))
    # Kept as a list of two arrays: wrapping two groups of different sizes
    # in a single ndarray gives a ragged array, and gives a 2-D array (not
    # two data sets) when the sizes happen to match.
    dat = [array(others), array(tested)]
    fig = pl.figure()
    pl.title("Distribution of codons FoldChange between two conditions")
    ax = fig.add_subplot(1, 1, 1)
    lab = array(['All codons', cod_name])
    violin_plot(ax, dat, pos, bp=1)
    for group, x_pos in zip(dat, pos):
        # mark the mean of each group
        ax.plot(x_pos, average(group), color='r', marker='*', markeredgecolor='r')
    pl.setp(ax, xticklabels=lab)
    pl.savefig(dirout + '/violinplot_codon.png', format="png", dpi=340)
    pl.clf()

    (Fval, pval) = stats.mannwhitneyu(others, tested)
    return pval
|
|
258
|
|
259
|
|
def get_aa_dict(cond1_norm, cond2_norm):
    '''
    Group per-codon values by amino acid.

    cond1_norm, cond2_norm : mappings codon -> value holding the 64 codons

    Return an OrderedDict mapping each of 21 names (20 amino acids plus
    'Stop') to [sum over its codons in cond1_norm, same sum in cond2_norm].

    The codons AGT and AGC are counted under 'Ser'; they used to be added to
    'Val', which inflated valine and deflated serine.
    '''
    codons_by_aa = (
        ('Phe', ('TTT', 'TTC')),
        ('Leu', ('TTA', 'TTG', 'CTT', 'CTC', 'CTA', 'CTG')),
        ('Ile', ('ATT', 'ATC', 'ATA')),
        ('Met', ('ATG',)),
        ('Val', ('GTT', 'GTC', 'GTA', 'GTG')),
        ('Ser', ('TCT', 'TCC', 'TCA', 'TCG', 'AGT', 'AGC')),
        ('Pro', ('CCT', 'CCC', 'CCA', 'CCG')),
        ('Thr', ('ACT', 'ACC', 'ACA', 'ACG')),
        ('Ala', ('GCT', 'GCC', 'GCA', 'GCG')),
        ('Tyr', ('TAT', 'TAC')),
        ('Stop', ('TAA', 'TAG', 'TGA')),
        ('His', ('CAT', 'CAC')),
        ('Gln', ('CAA', 'CAG')),
        ('Asn', ('AAT', 'AAC')),
        ('Lys', ('AAA', 'AAG')),
        ('Asp', ('GAT', 'GAC')),
        ('Glu', ('GAA', 'GAG')),
        ('Cys', ('TGT', 'TGC')),
        ('Trp', ('TGG',)),
        ('Arg', ('CGT', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG')),
        ('Gly', ('GGT', 'GGC', 'GGA', 'GGG')),
    )

    AA = OrderedDict()
    for name, codons in codons_by_aa:
        AA[name] = [sum(cond1_norm[c] for c in codons),
                    sum(cond2_norm[c] for c in codons)]
    return AA
|
|
288
|
|
289
|
|
290
|
|
def _occupancy_ratio(numerator, denominator):
    '''Return numerator / denominator, or NaN when the denominator is zero.'''
    if denominator == 0:
        return float('nan')
    return numerator / denominator


def _plot_codon_histogram(dirout, codon_sorted, cond1_norm, cond2_norm, max_val,
                          c1, c2, color1, color2, std_cond1=None, std_cond2=None):
    '''
    Save the per-codon bar chart of both conditions as <dirout>/hist_codons.png.

    Error bars are drawn only when std_cond1 / std_cond2 are given.
    '''
    fig = pl.figure(figsize=(20, 10), num=1)
    width = .40
    ind = arange(len(codon_sorted))
    ax = fig.add_subplot(111)
    pl.xlim(0, len(codon_sorted) + 1)
    ax.spines['right'].set_color('none')
    ax.spines['top'].set_color('none')
    ax.xaxis.set_ticks([])
    ax.spines['left'].set_smart_bounds(True)
    ax.yaxis.set_ticks_position('left')

    bars1 = {'facecolor': color1, 'label': c1}
    bars2 = {'facecolor': color2, 'label': c2}
    if std_cond1 is not None:
        bars1.update(yerr=std_cond1, error_kw={'elinewidth': 1, 'ecolor': 'black'})
    if std_cond2 is not None:
        bars2.update(yerr=std_cond2, error_kw={'elinewidth': 1, 'ecolor': 'black'})
    # Heights are looked up codon by codon so that bars, error bars and
    # labels always follow the same order.
    ax.bar(ind, [cond1_norm[i] for i in codon_sorted], width, **bars1)
    ax.bar(ind + width, [cond2_norm[i] for i in codon_sorted], width, **bars2)
    for x, y, z in zip(ind, max_val, codon_sorted):
        ax.text(x + width, y + 0.2, '%s' % z, ha='center', va='bottom', fontsize=8)
    ax.set_ylabel('Ribosome Occupancy (percent of normalized reads)')
    ax.set_xlabel('Codons')
    handles, labels = ax.get_legend_handles_labels()
    ax.legend(handles, labels)
    pl.savefig(dirout + '/hist_codons.png', format="png", dpi=340)
    pl.clf()


def plot_codon_usage(result, dirout, c1, c2, outfile, color1, color2):
    '''
    Normalise codon counts, write the result table and draw the histograms.

    result  : list of codon-count mappings; 2 entries (condition 1,
              condition 2) or 4 entries (two replicates of condition 1
              followed by two replicates of condition 2)
    dirout  : directory receiving hist_amino_acid.png and hist_codons.png
    c1, c2  : names of the two conditions (legends and column headers)
    outfile : path of the tab-separated table, which ends with the result of
              a chi-square test of condition 2 against the counts expected
              from condition 1
    color1, color2 : bar colours of the two conditions

    Counts are turned into percentages of the library total. Ratios whose
    denominator is zero are written as 'nan' instead of aborting the run.

    Return (cond1_norm, cond2_norm, p-value of the chi-square test), where
    the two mappings give the percentage per codon, in sorted codon order.
    Ends the program through stop_err when a library holds no read or when
    result has neither 2 nor 4 entries.
    '''
    # if there are replicates
    if len(result) == 4:
        cond1_1, cond1_2, cond2_1, cond2_2 = result
        # get codon order from one of the tables
        codon_sorted = sorted(cond1_1)
        try:
            # total count of each library
            sum11 = sum(cond1_1.values())
            sum12 = sum(cond1_2.values())
            sum21 = sum(cond2_1.values())
            sum22 = sum(cond2_2.values())
            cond1 = {}  # mean raw count per codon
            cond2 = {}
            cond1_norm = OrderedDict()  # mean percentage per codon
            cond2_norm = OrderedDict()
            std_cond1 = []
            std_cond2 = []
            max_val = []  # height above which the codon label is written
            for i in codon_sorted:
                # mean over the replicates of the fraction of the library
                mean1 = (cond1_1[i] / sum11 + cond1_2[i] / sum12) / 2
                mean2 = (cond2_1[i] / sum21 + cond2_2[i] / sum22) / 2
                cond1[i] = (cond1_1[i] + cond1_2[i]) / 2
                cond2[i] = (cond2_1[i] + cond2_2[i]) / 2
                cond1_norm[i] = mean1 * 100
                cond2_norm[i] = mean2 * 100
                # standard deviation between the replicates, in percent
                std_cond1.append(std(array([cond1_1[i] * 100 / sum11, cond1_2[i] * 100 / sum12])))
                std_cond2.append(std(array([cond2_1[i] * 100 / sum21, cond2_2[i] * 100 / sum22])))
                max_val.append(max(mean1, mean2) * 100)
        except ZeroDivisionError:
            stop_err("Not enough reads to compute the codon occupancy")

        AA = get_aa_dict(cond1_norm, cond2_norm)
        aa_name = list(AA.keys())
        cond1_aa = [z[0] for z in AA.values()]
        cond2_aa = [z[1] for z in AA.values()]
        max_valaa = [max(z) for z in AA.values()]

        # plot amino acid profile
        fig = pl.figure(figsize=(15, 10), num=1)
        width = .50
        ax = fig.add_subplot(111)
        ax.xaxis.set_ticks([])
        ind = arange(len(aa_name))
        pl.xlim(0, len(aa_name))
        ax.bar(ind, cond1_aa, width, facecolor=color1, label=c1)
        ax.bar(ind + width, cond2_aa, width, facecolor=color2, label=c2)
        for x, y, z in zip(ind, max_valaa, aa_name):
            ax.text(x + width, y + 0.2, '%s' % z, ha='center', va='bottom', fontsize=14)
        ax.set_ylabel('Ribosome Occupancy (percent of normalized reads)')
        ax.set_xlabel('Amino Acid')
        handles, labels = ax.get_legend_handles_labels()
        ax.legend(handles, labels)
        pl.savefig(dirout + '/hist_amino_acid.png', format="png", dpi=340)
        pl.clf()

        # Theoretical counts in condition 2 if it followed the distribution
        # of condition 1. Both vectors are built in codon_sorted order:
        # cond2 is a plain dict, whose own iteration order is not guaranteed
        # to match on Python 2.
        sum2 = (sum21 + sum22) / 2
        expected = array([int(cond1_norm[i] * sum2 / 100) for i in codon_sorted])
        observed = array([cond2[i] for i in codon_sorted])

        # write result
        with open(outfile, 'w') as out:
            out.write('Codon\tRaw_' + c1 + '\tRaw_' + c2 + '\tNorm_' + c1 + '\tNorm_' + c2 + '\tFC\tFC_' + c1 + '\tFC_' + c2 + '\n')
            for i in codon_sorted:
                # NOTE(review): the last column divides replicate 2 of
                # condition 2 by replicate 1 of condition 1, as the original
                # code did - confirm that replicate 2 of condition 1 was not
                # intended.
                fields = [i, cond1[i], cond2[i], cond1_norm[i], cond2_norm[i],
                          _occupancy_ratio(cond2_norm[i], cond1_norm[i]),
                          _occupancy_ratio(cond2_1[i] / sum21, cond1_1[i] / sum11),
                          _occupancy_ratio(cond2_2[i] / sum22, cond1_1[i] / sum11)]
                out.write('\t'.join(str(field) for field in fields) + '\n')
            chi = stats.chisquare(observed, expected)
            out.write('Khi2 test\n')
            out.write('T : ' + str(chi[0]) + '; p-value : ' + str(chi[1]) + '\n')

        # plot result
        _plot_codon_histogram(dirout, codon_sorted, cond1_norm, cond2_norm, max_val,
                              c1, c2, color1, color2, std_cond1, std_cond2)

    elif len(result) == 2:
        cond1, cond2 = result
        # get codon order from one of the tables
        codon_sorted = sorted(cond1)
        try:
            # total count of each library
            sum1 = sum(cond1.values())
            sum2 = sum(cond2.values())
            # normalise values by the total of each library
            cond1_norm = OrderedDict((i, (cond1[i] / sum1) * 100.0) for i in codon_sorted)
            cond2_norm = OrderedDict((i, (cond2[i] / sum2) * 100.0) for i in codon_sorted)
        except ZeroDivisionError:
            stop_err("Not enough reads to compute the codon occupancy")

        # theoretical counts in condition 2 if it followed the distribution
        # of condition 1
        expected = array([int(cond1_norm[i] * sum2 / 100.0) for i in codon_sorted])
        observed = array([cond2[i] for i in codon_sorted])

        AA = get_aa_dict(cond1_norm, cond2_norm)
        aa_name = list(AA.keys())
        cond1_aa = [z[0] for z in AA.values()]
        cond2_aa = [z[1] for z in AA.values()]

        # plot amino acid profile
        fig = pl.figure(figsize=(15, 10), num=1)
        width = .45
        ax = fig.add_subplot(111)
        ind = arange(len(aa_name))
        pl.xlim(0, len(aa_name))
        ax.bar(ind, cond1_aa, width, facecolor=color1, label=c1)
        ax.bar(ind + width, cond2_aa, width, facecolor=color2, label=c2)
        axis_font = {'size': '10'}
        pl.xticks(ind + width, aa_name, **axis_font)
        ax.spines['right'].set_visible(False)
        ax.spines['top'].set_visible(False)
        ax.yaxis.set_ticks_position('left')
        ax.xaxis.set_ticks_position('bottom')
        ax.set_ylabel('Ribosome Occupancy (percent of normalized reads)', **axis_font)
        ax.set_xlabel('Amino Acids', **axis_font)
        handles, labels = ax.get_legend_handles_labels()
        font_prop = font_manager.FontProperties(size=8)
        ax.legend(handles, labels, prop=font_prop)
        pl.savefig(dirout + '/hist_amino_acid.png', format="png", dpi=340)
        pl.clf()

        # write result
        with open(outfile, 'w') as out:
            out.write('Codon\tRaw_' + c1 + '\tRaw_' + c2 + '\tNorm_' + c1 + '\tNorm_' + c2 + '\tFC(Mut/WT)\n')
            for i in codon_sorted:
                fields = [i, cond1[i], cond2[i], cond1_norm[i], cond2_norm[i],
                          _occupancy_ratio(cond2_norm[i], cond1_norm[i])]
                out.write('\t'.join(str(field) for field in fields) + '\n')
            out.write('Khi2 test\n')
            chi = stats.chisquare(observed, expected)
            out.write('T : ' + str(chi[0]) + '; p-value : ' + str(chi[1]) + '\n')

        # height above which each codon label is written
        max_val = [max(cond1_norm[i], cond2_norm[i]) for i in codon_sorted]

        # plot result
        _plot_codon_histogram(dirout, codon_sorted, cond1_norm, cond2_norm, max_val,
                              c1, c2, color1, color2)

    else:
        # The message used to reference an exception variable that does not
        # exist here, which raised NameError instead of reporting the problem.
        stop_err('Error running codon usage plotting : expected 2 or 4 codon tables, got ' + str(len(result)))

    return (cond1_norm, cond2_norm, chi[1])
|
|
527
|
|
def write_html_file(html, chi_pval, cond1, cond2):
    '''
    Write the HTML report that displays the two histograms.

    html     : path of the HTML file to create
    chi_pval : p-value of the chi-square test, inserted in the page
    cond1    : name of the first condition
    cond2    : name of the second condition

    The page refers to hist_codons.png and hist_amino_acid.png by relative
    path. Any exception is reported through stop_err, which ends the program.
    '''
    try:
        html_str = """
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">

<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<link href="/static/june_2007_style/blue/base.css" media="screen" rel="Stylesheet" type="text/css" />
</head>
<body>
<h3>Global visualization</h3>
<p>
<h5>Visualization of density footprint in each codon.</h5><br> If user has selected "Yes" for the replicate option the standard deviation between each replicate is plotted as an error bar in histogram.<br>
<img border="0" src="hist_codons.png" width="1040"/>
</p>
<p>
<h5>Test for homogeneity distribution between each condition</h5><br>
H0 : %s and %s are same distribution <br>
Khi2 test p-value: %s<br><br>
If p-value less than 0.05, we can reject homogeneity distribution so we can hypothesize that distributions are not the same. Otherwise, we accept H0<br>

</p>
<p>
<h5>Visualization of density footprint in each codon groupe by amino acid</h5><br>
<img border="0" src="hist_amino_acid.png" width="1040"/>
</p>
</body>
</html> """ % (cond1, cond2, chi_pval)

        # The context manager closes the file even when the write fails.
        with open(html, "w") as html_file:
            html_file.write(html_str)

    except Exception as e:
        stop_err('Error during html page creation : ' + str(e))
|
|
568
|
|
569
|
|
570
|
|
571
|
|
def check_codons_list (codons) :
    '''
    Abort through stop_err at the first entry of codons that is not one of
    the keys returned by init_codon_dict().
    '''
    known_codons = init_codon_dict()
    for candidate in codons:
        if candidate in known_codons:
            continue
        stop_err('Please to enter a valid codon : ' + candidate + ' is not find\n')
|
|
577
|
|
578
|
|
def check_index_bam (bamfile) :
    '''
    Make sure an index sits next to a BAM file.

    When <bamfile>.bai does not exist, `samtools index` is run on bamfile.
    Indexing stays best-effort, as before: neither a failure to start
    samtools nor a non-zero exit status is reported here.

    bamfile : path of the BAM file
    '''
    if os.path.isfile(bamfile + ".bai"):
        return
    # The path is passed as its own argument, without a shell, so that
    # spaces or shell metacharacters in it cannot alter the command.
    try:
        proc = subprocess.Popen(["samtools", "index", bamfile], stderr=subprocess.PIPE)
    except OSError:
        # samtools could not be started
        return
    # communicate() drains stderr while waiting; wait() alone can block
    # forever once the pipe buffer is full.
    proc.communicate()
|
15
|
589
|
|
def plot_fc (cond1, cond2, site, dirout):
    '''
    Draw the fold change of every codon between two conditions.

    cond1, cond2 : mappings codon -> value, sharing the same keys
    site         : name of the ribosome site, used in the title
    dirout       : directory receiving fc_codons.png

    For each key of cond1 the ratio cond2 / cond1 is drawn as a point, with a
    horizontal line at 1. Tick labels are the codons with T replaced by U.
    A codon whose value is zero in cond1 gets NaN instead of raising
    ZeroDivisionError.
    '''
    codons = list(cond1.keys())
    fold_changes = []
    for key in codons:
        if cond1[key] == 0:
            fold_changes.append(float('nan'))
        else:
            fold_changes.append(cond2[key] / cond1[key])

    index = arange(len(codons))
    label = [w.replace('T', 'U') for w in codons]
    pl.figure(figsize=(15, 10), num=1)
    ax = pl.subplot(1, 1, 1)
    pl.xticks([])
    pl.scatter(index, fold_changes, color='b')
    pl.axhline(y=1, color='r')
    pl.xticks(index, label, rotation=90)
    pl.ylabel('Foldchange of codon occupancy')
    ax.yaxis.set_ticks_position('left')
    ax.xaxis.set_ticks_position('bottom')
    pl.title(site + " site")
    pl.savefig(dirout + '/fc_codons.png', format="png", dpi=340)
|
|
611
|
0
|
612
|
|
def __main__():
    '''
    Command-line entry point.

    Parses the options, counts the codons of every BAM file at the requested
    ribosome site, then writes the result table, the PNG plots and the HTML
    report. --dirout is expected as "<html file>,<directory>"; the directory
    must not exist yet and is created here. Every error ends the program
    through stop_err or sys.exit.
    '''
    # Parse command line options
    parser = optparse.OptionParser()
    parser.add_option("-g", "--gff", dest="gff", type="string",
                      help="gff file", metavar="FILE")

    parser.add_option("-1", "--file1", dest="file1", type="string",
                      help="Bam Ribo-Seq alignments cond 1, if rep option, separate files by commas ", metavar="FILE")

    parser.add_option("-2", "--file2", dest="file2", type="string",
                      help="Bam Ribo-Seq alignments cond 2, if rep option, separate files by commas", metavar="FILE")

    parser.add_option("-c", "--cond1", dest="c1", type="string",
                      help="Name for first condition", metavar="STR")

    parser.add_option("-C", "--cond2", dest="c2", type="string",
                      help="Name of second condition", metavar="STR")

    parser.add_option("-k", "--kmer", dest="kmer", type="int", default=28,
                      help="Length of your phasing reads", metavar="INT")

    parser.add_option("-o", "--out", dest="outfile", type="string",
                      help="write report to FILE", metavar="FILE")

    parser.add_option("-d", "--dirout", dest="dirout", type="string",
                      help="write report to PNG files", metavar="FILE")

    parser.add_option("-a", "--asite", dest="asite", type="int", default=15,
                      help="Off-set from the 5'end of the footprint to the A-site (default is 15)", metavar="INT")

    parser.add_option("-s", "--site", dest="site", type="string", default="A",
                      help="Script can compute in site A, P or E (default is A-site)", metavar="A|P|E")

    parser.add_option("-r", "--rep", dest="rep", type="string", default="no",
                      help="if replicate or not", metavar="yes|no")

    parser.add_option("-x", "--hex_col1", dest="color1", type="string", default="SkyBlue",
                      help="Color for first condition", metavar="STR")

    parser.add_option("-X", "--hex_col2", dest="color2", type="string", default="Plum",
                      help="Color for second condition", metavar="STR")

    parser.add_option("-q", "--quiet",
                      action="store_false", dest="verbose", default=True,
                      help="don't print status messages to stdout")

    (options, args) = parser.parse_args()
    print("Begin codon frequency analysis at %s" % time.asctime(time.localtime(time.time())))

    try:
        authorized_site = ["A", "P", "E"]
        if options.site not in authorized_site:
            stop_err(options.site + ' is not a authorized ribosome site')

        # Check if colors exist
        if not colors.is_color_like(options.color1):
            stop_err(options.color1 + ' is not a proper color')
        if not colors.is_color_like(options.color2):
            stop_err(options.color2 + ' is not a proper color')

        GFF = HTSeq.GFF_Reader(options.gff)
        # get html file and directory
        (html, html_dir) = options.dirout.split(',')
        if os.path.exists(html_dir):
            # A bare `raise` has no active exception to re-raise here and
            # only produced an unrelated error; state the actual problem.
            raise Exception(html_dir + ' already exists')
        try:
            os.mkdir(html_dir)
        except OSError:
            raise Exception(html_dir + ' mkdir')

        # List the BAM files: condition 1 first, then condition 2.
        if options.rep == "yes":
            # replicates: each option holds comma-separated file names
            bam_files = options.file1.split(',') + options.file2.split(',')
        elif options.rep == "no":
            bam_files = [options.file1, options.file2]
        else:
            sys.stderr.write("Please enter yes or no for --rep option. Programme aborted at %s" % time.asctime(time.localtime(time.time())))
            # non-zero status: the run did not succeed
            sys.exit(1)

        # RUN analysis
        result = []
        for fh in bam_files:
            check_index_bam(fh)
            result.append(get_codon_usage(fh, GFF, options.site, options.kmer, options.asite))
        (cond1, cond2, chi_pval) = plot_codon_usage(result, html_dir, options.c1, options.c2, options.outfile, options.color1, options.color2)
        plot_fc(cond1, cond2, options.site, html_dir)

        write_html_file(html, chi_pval, options.c1, options.c2)

        print("Finish codon frequency analysis at %s" % time.asctime(time.localtime(time.time())))
    except Exception as e:
        stop_err('Error running codon frequency analysis (main program) : ' + str(e))


if __name__ == "__main__":
    __main__()
|