comparison scripts/S01a_codons_counting.py @ 5:0ba551449008 draft

planemo upload for repository htpps://github.com/abims-sbr/adaptearch commit 273a9af69b672b2580cd5dec4c0e67a4a96fb0fe
author abims-sbr
date Tue, 27 Feb 2018 08:48:34 -0500
parents
children 04a9ada73cc4
comparison
equal deleted inserted replaced
4:5766f80370e7 5:0ba551449008
1 #!/usr/bin/env python
2 # coding: utf-8
3 # Author : Victor Mataigne
4
5 import string, os, sys, re, random, itertools, argparse, copy
6 import pandas as pd
7 import numpy as np
8
9 def buildDicts(list_codons, content, dict_genetic_code, dict_aa_classif):
10 """ Build dictionaries with values to 0. These dictionaries are used as starting point for each sequence counting
11
12 Args :
13 list_codons (list of str) : all codons except codons-stop
14 content (int or list) : an integer (for coutings and transitions), or an empty list (for resampling)
15 dict_genetic_code (dict) : the genetic code : {'aa1': [codon1, codon2,...], ...}
16 dict_aa_classif (dict) : the types of the amino-acids ( {type: [aa1, aa2, ...], ...}
17
18 Returns :
19 dico_codons, dico_aa, dico_aatypes (dicts) : keys : codons/amico-acids/amico-acids types, values : 0 or []
20 dico_codons_transitions, dico_aa_transitions, dico_aatypes_transitions (dicts of dicts) :
21 actually, the three first dictionaries nested as values of keys codons/amico-acids/amico-acids
22 """
23
24 # I could have make sub-routines here.
25 # the copy commands are mandatory, otherwise all dictionaries will reference the same variable(s)
26
27 dico_codons = {}
28 for codon in list_codons:
29 dico_codons[codon] = copy.deepcopy(content)
30
31 dico_aa = {}
32 for aa in dict_genetic_code.keys():
33 dico_aa[aa] = copy.deepcopy(content)
34
35 dico_aatypes = {}
36 for aatype in dict_aa_classif.keys():
37 dico_aatypes[aatype] = copy.deepcopy(content)
38
39 dico_codons_transitions=copy.deepcopy(dico_codons)
40 for key in dico_codons_transitions.keys():
41 dico_codons_transitions[key]=copy.deepcopy(dico_codons)
42
43 dico_aa_transitions = copy.deepcopy(dico_aa)
44 for key in dico_aa_transitions.keys():
45 dico_aa_transitions[key]=copy.deepcopy(dico_aa)
46
47 dico_aatypes_transitions = copy.deepcopy(dico_aatypes)
48 for key in dico_aatypes_transitions.keys():
49 dico_aatypes_transitions[key]=copy.deepcopy(dico_aatypes)
50
51 return dico_codons, dico_aa, dico_aatypes, dico_codons_transitions, dico_aa_transitions, dico_aatypes_transitions
52
53 def viable(seqs, pos, method):
54 """ Compute if, among a set of sequences, either at least one of the codons at the specified position has not a "-",
55 or not any codon has a "-"
56
57 Args :
58 seqs : a list (the sequences, which must all have the same size)
59 pos : an integer (the positions, <= len(seqs) -3)
60
61 Returns:
62 bool
63 """
64 codons = [seq[pos:pos+3] for seq in seqs]
65 if method == "all":
66 return not all("-" in codon for codon in codons)
67 elif method == "any":
68 return not any("-" in codon for codon in codons)
69
70 # # # Function for codons, aa, aatypes Countings -------------------------------------------------------------------------------
71
72 def computeAllCountingsAndFreqs(seq, list_codons, init_dict_codons, init_dict_aa, init_dict_classif, dict_genetic_code, dict_aa_classif):
73 """ Call all functions dedicated to the computation of occurences and frequencies of codons, amino-acids, amino-acids types
74
75 Args : see sub-routines
76
77 Returns : 6 dictionaries (occurences and frequencies for codons, amino-acids, amino-acids types). See sub-routines for details.
78 """
79
80 # ------ Sub-routines ------ #
81
82 def codonsCountings(seq, list_codons, init_dict_codons):
83 """ Count occurences of each codon in a sequence
84 First reading frame only : input sequence is supposed to be an ORF
85
86 Args :
87 seq (str) : the sequence
88 list_codons (list of str) : all codons except codons-stop
89 init_dict_codons (dict) : {codon1 : 0, codon2: 0, ...}
90
91 Return :
92 codon (dict) : codons (keys) and their occurences (values) in the sequence
93 """
94
95 codons = copy.deepcopy(init_dict_codons)
96
97 l = len(seq)
98
99 if l%3 == 0: max_indice = l-3
100 if l%3 == 1: max_indice = l-4
101 if l%3 == 2: max_indice = l-5
102
103 for codon in range(0,max_indice+1,3):
104 if "-" not in seq[codon:codon+3] :
105 codons[seq[codon:codon+3]] += 1
106
107 return codons
108
109 def codonsFreqs(dict_codons_counts):
110 """ Computes frequencies of each codon in a sequence
111
112 Args :
113 dict_codons (dict) : the output of codonsCounting()
114
115 Return :
116 codons_freqs (dict) : codons (keys) and their frequencies (values)
117 """
118
119 codons_freqs = {}
120
121 for key in dict_codons_counts.keys():
122 freq = float(dict_codons_counts[key])/sum(dict_codons_counts.values())
123 codons_freqs[key] = freq
124
125 return codons_freqs
126
127 def aaCountings(dict_codons, dict_genetic_code, init_dict_aa):
128 """ Count occurences of each amino-acid in a sequence, based on the countings of codons (1st ORF)
129
130 Args :
131 dict_codons (dict) : the output of codonsCounting()
132 dict_genetic_code (dict) : the genetic code : {'aa1': [codon1, codon2,...], ...}
133 init_dict_aa (dict) : {aa1 : 0, aa2: 0, ...}
134
135 Return :
136 dict_aa (dict) : amino-acids (keys) and their occurences (values)
137 """
138 dict_aa = copy.deepcopy(init_dict_aa)
139
140 for key in dict_codons.keys():
141 for value in dict_genetic_code.values():
142 if key in value:
143 dict_aa[dict_genetic_code.keys()[dict_genetic_code.values().index(value)]] += dict_codons[key]
144
145 return dict_aa
146
147 def aaFreqs(dict_aa_counts):
148 """ Computes frequencies of each amino-acid in a sequence, based on the countings of codons (1st ORF)
149
150 Args :
151 dict_aa_counts (dict) : the output of aaCountings()
152
153 Return :
154 dict_aa_freqs (dict) : amino-acids (keys) and their frequencies (values)
155 """
156
157 dict_aa_freqs = {}
158
159 for key in dict_aa_counts.keys():
160 freq = float(dict_aa_counts[key])/sum(dict_aa_counts.values())
161 dict_aa_freqs[key] = freq
162
163 return dict_aa_freqs
164
165 def aatypesCountings(dict_aa, dict_aa_classif, init_dict_classif):
166 """ Computes frequencies of each amino-acid type in a sequence, based on the countings of amino-acids (1st ORF)
167
168 Args :
169 - dict_aa (dict) : the output of aaCountings()
170 - dict_aa_classif (dict) : the types of the amino-acids ( {type: [aa1, aa2, ...], ...} )
171 - init_dict_classif (dict) : {'polar': 0, 'apolar': 0, ...}
172
173 Return :
174 dict_aatypes (dict) : amino-acids types (keys) and their occurences (values)
175 """
176 dict_aatypes = copy.deepcopy(init_dict_classif)
177
178 for key_classif in dict_aa_classif.keys():
179 for key_aa in dict_aa.keys():
180 if key_aa in dict_aa_classif[key_classif]:
181 dict_aatypes[key_classif] += dict_aa[key_aa]
182
183 return dict_aatypes
184
185 def aatypesFreqs(dict_aatypes):
186 """ Computes frequencies of each amino-acid type in a sequence, based on the countings of amino-acids (1st ORF)
187
188 Args :
189 dict_aatypes (dict) : the output of aatypesCountings()
190
191 Return :
192 dict_aatypes_freqs : amino-acids types (keys) and their frequencies (values)
193 """
194 dict_aatypes_freqs = {}
195
196 for key in dict_aatypes.keys():
197 freq = float(dict_aatypes[key])/sum(dict_aatypes.values())
198 dict_aatypes_freqs[key] = freq
199
200 return dict_aatypes_freqs
201
202 # ------ The function ------ #
203
204 codons_c = codonsCountings(seq, list_codons, init_dict_codons)
205 codons_f = codonsFreqs(codons_c)
206 aa_c = aaCountings(codons_c, dict_genetic_code, init_dict_aa)
207 aa_f = aaFreqs(aa_c)
208 aat_c = aatypesCountings(aa_c, dict_aa_classif, init_dict_classif)
209 aat_f = aatypesFreqs(aat_c)
210
211 return codons_c, codons_f, aa_c, aa_f, aat_c, aat_f
212
213 # # # Functions for various measures (ivywrel, ekqh...) -------------------------------------------------------------------------
214
215 def computeVarious(seq, dict_aa_counts, dict_aa_types):
216 """ Call al the functions for nucleic and amino-acids sequences description
217
218 Args : See sub-routines for details.
219
220 Returns : 6 integer or floats. See sub-routines for details.
221 """
222
223 # ------ Sub-routines ------ #
224
225 def nbCodons(seq):
226 """ Compute the number of full codons in a sequence
227 Arg : seq (str): the sequence
228 Return: nb_codons (int)
229 """
230 l = len(seq)
231 if l%3 == 0: nb_codons = l/3
232 if l%3 == 1: nb_codons = (l-1)/3
233 if l%3 == 2: nb_codons = (l-2)/3
234 return nb_codons
235
236 def maxIndice(seq):
237 """ Compute the highest indice for parsing a sequence
238 Arg : seq (str): the sequence
239 Return : max_indice (int)
240 """
241 l = len(seq)
242 if l%3 == 0: max_indice = l-3
243 if l%3 == 1: max_indice = l-4
244 if l%3 == 2: max_indice = l-5
245 return max_indice
246
247 def gc12Andgc3Count(seq, nb_codons, max_indice):
248 """ Compute the frequency of gc12 in a sequence
249 Arg : seq (str) : the sequence
250 Return : (float)
251 """
252
253 # TO IMPROVE ? : make this computation in the codonCountigns() function to avoid parsing twice the sequence ?
254 # But : involves a less readable code
255
256 gc12 = 0
257 gc3 = 0
258
259 for i in range(0, max_indice+1,3):
260 if seq[i] in ["c","g"]: gc12 += 1
261 if seq[i+1] in ["c","g"]: gc12 += 1
262 if seq[i+2] in ["c","g"] or seq[i+2] in ["c","g"]: gc3 += 1
263
264 return float(gc3)/nb_codons, float(gc12)/(2*nb_codons)
265
266 def ivywrelCount(nb_codons, dict_aa_counts):
267 """ Compute the sum of occurences of amino-acids IVYWREL divided by the number of codons
268
269 Args :
270 nb_codons (int) : the number of codons in the sequence
271 dict_aa_counts (dict) : the output of aaCountings()
272
273 return : (float)
274 """
275
276 IVYWREL = 0
277
278 for aa in ["I","V","Y","W","R","E","L"]: # Impossible to make a simple sum, in case one the aa is not in the dict keys
279 if aa in dict_aa_counts.keys():
280 IVYWREL += dict_aa_counts[aa]
281
282 return float(IVYWREL)/nb_codons
283
284 def ekqhCount(dict_aa_counts):
285 """ Compute the ratio of amino-acids EK/QH
286 Arg : dict_aa_counts (dict) : the output of aaCountings()
287 Return : (float)
288 """
289 ek = 0
290 qh = 0
291
292 ek = dict_aa_counts["E"] + dict_aa_counts["K"]
293 qh = dict_aa_counts["Q"] + dict_aa_counts["H"]
294
295 if qh != 0:
296 return float(ek)/qh
297 else : return ek
298
299 def payresdgmCount(dict_aa_counts):
300 """ Compute the ratio of amino-acids PASRE/SDGM
301 Arg : dict_aa_counts (dict) : the output of aaCountings()
302 Return : (float)
303 """
304 payre = 0
305 sdgm = 0
306
307 for aa in ["P","A","Y","R","E"]:
308 payre += dict_aa_counts[aa]
309 for aa in ["S","D","G","M"]:
310 sdgm += dict_aa_counts[aa]
311
312 if sdgm != 0:
313 return float(payre)/sdgm
314 else : return payre
315
316 def purineLoad(seq, nb_codons):
317 """ Compute the purine load indice of a sequence
318 Args :
319 seq (str) : the sequence
320 nb_codons (int) : the number of codons in the sequence
321
322 Return (float)
323 """
324
325 # TO IMPROVE ? : make this computation in the codonCountigns() function to avoid parsing twice the sequence ?
326 # But : involves a less readable code
327
328 g12, g3, A, c12, c3, T = 0.0,0.0,seq.count("a"),0.0,0.0,seq.count("t")
329
330 # g3 and c3 : g and c in 3rd position of a codon
331 s = ""
332 for i in range(2, len(seq), 3):
333 s += seq[i]
334 g3 = s.count("g")
335 c3 = s.count("c")
336
337 # g12 and c12 : g and c in 1st and 2d positions of a codon
338 s = ""
339 for i in range(0, len(seq), 3):
340 s += seq[i]
341 g12 = s.count("g")
342 c12 = s.count("c")
343 s = ""
344 for i in range(1, len(seq), 3):
345 s += seq[i]
346 g12 += s.count("g")
347 c12 += s.count("c")
348
349 return float(1000*(g12+g3+A-c12-c3-T))/(3*nb_codons)
350
351 def cvp(dict_aatypes):
352 """ Compute the difference nb_charged_aamino_acids - nb_polar_amino_acids
353 Return: (int)
354 """
355 return dict_aatypes["charged"] - dict_aatypes["polar"]
356
357 # ------ The function ------ #
358
359 nb_codons = nbCodons(seq)
360 max_indice = maxIndice(seq)
361 GC3, GC12 = gc12Andgc3Count(seq, nb_codons, max_indice)
362 IVYWREL, EKQH, PAYRESDGM = ivywrelCount(nb_codons, dict_aa_counts), ekqhCount(dict_aa_counts), payresdgmCount(dict_aa_counts)
363 purineload, CvP = purineLoad(seq, nb_codons), cvp(dict_aa_types)
364 return GC3, GC12, IVYWREL, EKQH, PAYRESDGM, purineload, CvP
365
366 # # # Function for codons, aa, aatypes Transitions -----------------------------------------------------------------------------
367
368 def computeAllBiases(seq1, seq2, dico_codons_transi, dico_aa_transi, dico_aatypes_transi, reversecode, reverseclassif) :
369 """ Compute all biases (transisitions codon->codon, aa->-aa, aa_type->aa_type) between two sequences
370
371 Args : See sub-routines for details
372
373 Returns 3 dictionaries of dictionaries. See sub-routines for details
374 """
375
376 # ------ Sub-routines ------ #
377
378 def codons_transitions(seq1, seq2, dico_codons_transi):
379 """ Compute the number of transitions from a codon of a sequence to the codon of a second sequence
380
381 Args :
382 seq1 (str) : the first sequence
383 seq2 (str) : the second sequence
384 dico_codons_transi (dict of dicts) : { codon1 : {codon1: 0, codon2 : 0, ...}, ..}
385
386 Return :
387 codons_transi (dict of dicts) : the occurences of each codon to codon transition
388 """
389
390 codons_transi = copy.deepcopy(dico_codons_transi)
391
392 for i in range(0, len(seq1), 3):
393 # check if no indel and if len seq[i:i+3] is really 3 (check for the last codon)
394 if viable([seq1, seq2], i, "any") and len(seq1[i:i+3]) == 3 and len(seq2[i:i+3]) == 3 :
395 codons_transi[seq1[i:i+3]][seq2[i:i+3]] += 1
396
397 return codons_transi
398
399 def codons_transitions_freqs(codons_transitions_counts):
400 """ Computes frequencies of codon transitions between two sequences
401
402 Arg :
403 codons_transitions_counts (dict) : the output of codons_transitions()
404
405 Return :
406 codons_transi_freqs (dict of dicts) : the frequencies of each codon to codon transition
407 """
408
409 codons_transi_freqs = {}
410
411 for key in codons_transitions_counts.keys():
412 codons_transi_freqs[key] = {}
413 for key2 in codons_transitions_counts[key].keys():
414 if sum(codons_transitions_counts[key].values()) != 0:
415 freq = float(codons_transitions_counts[key][key2])/sum(codons_transitions_counts[key].values())
416 codons_transi_freqs[key][key2] = freq
417 else :
418 codons_transi_freqs[key][key2] = 0
419 return codons_transi_freqs
420
421 def aa_transitions(dico_codons_transi, dico_aa_transi, reversecode):
422 """ Compute the number of transitions from an amino-acid of a sequence to the amino-acid of a second sequence
423
424 Args :
425 dico_codons_transi (dict of dicts) : the codons transitions computed by codons_transitions()
426 dico_aa_transi (dict of dicts) : { aa1 : {aa1: 0, aa2 : 0, ...}, ..}
427 reversecode (dict) : the reversed genetic code {aa1 : [codons],...} -> {codon1: aa1, codon2: aa2, ...}
428
429 Return :
430 aa_transi (dict of dicts) : the occurences of each aa to aa transition
431 """
432
433 aa_transi = copy.deepcopy(dico_aa_transi)
434
435 for k in dico_codons_transi.keys():
436 newk = reversecode[k]
437 for k2 in dico_codons_transi[k].keys():
438 newk2 = reversecode[k2]
439 aa_transi[newk][newk2] += dico_codons_transi[k][k2]
440
441 return aa_transi
442
443 def aa_transitions_freqs(aa_transitions_counts):
444 """ Computes frequencies of amico-acids transitions between two sequences
445 Arg : aa_transitions_counts (dict of dicts): the output of aa_transitions()
446 Return : aa_transi_freqs (dict of dicts) : the frequencies of each aa to aa transition
447 """
448
449 aa_transi_freqs = {}
450
451 for key in aa_transitions_counts.keys():
452 aa_transi_freqs[key] = {}
453 for key2 in aa_transitions_counts[key].keys():
454 if sum(aa_transitions_counts[key].values()) != 0:
455 freq = float(aa_transitions_counts[key][key2])/sum(aa_transitions_counts[key].values())
456 aa_transi_freqs[key][key2] = freq
457 else :
458 aa_transi_freqs[key][key2] = 0
459 return aa_transi_freqs
460
461 def aatypes_transitions(dico_aa_transi, dico_aatypes_transi, reverseclassif):
462 """ Compute the number of transitions from an amino-acid type of a sequence to the amino-acid type of a second sequence
463
464 Args :
465 dico_aa_transi (dict of dicts) : the output of aa_transitions()
466 dico_aatypes_transi (dict of dicts) : { type1 : {type1: 0, type2 : 0, ...}, ..}
467 reverseclassif (dict) : the reversed amino-acid clasification {aa1: type, aa2: type, ...}
468
469 Return :
470 aatypes_transi (dict of dicts) : the occurences of each aatype to aatype transition
471 """
472
473 aatypes_transi = copy.deepcopy(dico_aatypes_transi)
474 for k in dico_aa_transi.keys():
475 newk = reverseclassif[k]
476 for k2 in dico_aa_transi[k].keys():
477 newk2 = reverseclassif[k2]
478 aatypes_transi[newk][newk2] += dico_aa_transi[k][k2]
479
480 return aatypes_transi
481
482 def aatypes_transitions_freqs(aatypes_transitions_counts):
483 """ Computes frequencies of amico-acids types transitions between two sequences
484 Args : aatypes_transitions_counts (dict of dicts) : the output of aatypes_transitions()
485 Return : aatypes_transi_freqs (dict of dicts) : the frequencies of each aatype to aatype transition
486 """
487
488 aatypes_transi_freqs = {}
489
490 for key in aatypes_transitions_counts.keys():
491 aatypes_transi_freqs[key] = {}
492 for key2 in aatypes_transitions_counts[key].keys():
493 if sum(aatypes_transitions_counts[key].values()) != 0:
494 freq = float(aatypes_transitions_counts[key][key2])/sum(aatypes_transitions_counts[key].values())
495 aatypes_transi_freqs[key][key2] = freq
496 else :
497 aatypes_transi_freqs[key][key2] = 0
498 return aatypes_transi_freqs
499
500 # ------ The function ------ #
501
502 codons_transitions = codons_transitions(seq1, seq2, dico_codons_transi)
503 codons_transitions_freqs = codons_transitions_freqs(codons_transitions)
504 aa_transitions = aa_transitions(codons_transitions, dico_aa_transi, reversecode)
505 aa_transitions_freqs = aa_transitions_freqs(aa_transitions)
506 aatypes_transitions = aatypes_transitions(aa_transitions, dico_aatypes_transi, reverseclassif)
507 aatypes_transitions_freqs = aatypes_transitions_freqs(aatypes_transitions)
508
509 return codons_transitions, codons_transitions_freqs, aa_transitions, aa_transitions_freqs, aatypes_transitions, aatypes_transitions_freqs
510
511 # # # Function for random resampling --------------------------------------------------------------------------------------------
512
513 def sampling (dict_seq, nb_iter, len_sample, list_codons, genetic_code, aa_classif, reversecode, reverseclassif):
514 """ Resample randomly codons from sequences (sort of bootsrap)
515
516 Args :
517 dict_seq (dict) : contains the species name and sequences used ( { 'sp1': seq1, 'sp2': seq2, ...}) without '-' removal
518 nb_iter (int) : number of resampling iterations (better >= 1000)
519 len_sample (int) : length (in codons) of a resampled sequence (better >= 1000)
520 list_codons (list of str): all codons except codons-stop
521 genetic_code (dict) : the genetic code : {'aa1': [codon1, codon2,...], ...}
522 aa_classif (dict) : the types of the amino-acids : {type: [aa1, aa2, ...], ...}
523 reversecode (dict) : the reversed genetic code : {codon1: aa1, codon2: aa2, ...}
524 reverseclassif (dict) : the reversed amino-acid : clasification {aa1: type, aa2: type, ...}
525
526 Returns :
527 codons_lst, aa_lst, classif_lst (dicts) : keys : codons/aa/aatypes, values : []
528 codons_transitions_lst, aa_transitions_lst, classif_transitions_lst (dict of dicts) : keys : codons/aa/aatypes, values : the 3 previous dicts
529 """
530
531 # Initialize empty dictionaries for countings and transitions. It's also possible to isntanciate these ones in the main() but it would make a function with ~14 parameters
532 codons_0, aa_0, classif_0, codons_transitions_0, aa_transitions_0, classif_transitions_0 = buildDicts(list_codons, 0, genetic_code, aa_classif)
533 codons_lst, aa_lst, classif_lst, codons_transitions_lst, aa_transitions_lst, classif_transitions_lst = buildDicts(list_codons, [], genetic_code, aa_classif)
534
535 # determine the max position where sampling is possible
536 l = len(dict_seq.values()[1])
537 if l%3 == 0: max_indice = l-3
538 if l%3 == 1: max_indice = l-4
539 if l%3 == 2: max_indice = l-5
540
541 # List of positions to resample
542 viable_positions = [pos for pos in range(0,max_indice,3) if viable(dict_seq.values(), pos, "all")]
543 sample_positions = np.random.choice(viable_positions, len_sample)
544
545 # nb_iter resampled sequences
546 for i in range(nb_iter):
547 if (i+1)%(nb_iter/10) == 0:
548 print " "+str( (i+1)*100/nb_iter)+"%"
549
550 seqa, seqb = "", ""
551 for pos in sample_positions:
552 codona, codonb = "---", "---"
553 # The sequence to be resampled in this position is randomly chosen ; no "-" resampled
554 while "-" in codona :
555 codona = dict_seq.values()[random.randrange(0, len(dict_seq.keys())-1)][pos:pos+3]
556 while "-" in codonb :
557 codonb = dict_seq.values()[random.randrange(0, len(dict_seq.keys())-1)][pos:pos+3]
558 seqa += codona
559 seqb += codonb
560
561 # dictionaries : frequences of codons, aa, aatypes (seq1)
562 codons_occ_tmp, codons_freq_tmp, aa_occ_tmp, aa_freq_tmp, aatypes_occ_tmp, aatypes_freq_tmp = computeAllCountingsAndFreqs(seqa, list_codons, codons_0, aa_0, classif_0, genetic_code, aa_classif)
563 # dictionaries frequences of transitions (seqa->seqb)
564 codons_transitions_tmp, codons_transitions_freq_tmp, aa_transition_tmp, aa_transitions_freq_tmp, aatypes_transitions_tmp, aatypes_transitions_freq_tmp = computeAllBiases(seqa, seqb, codons_transitions_0, aa_transitions_0, classif_transitions_0, reversecode, reverseclassif)
565
566 # Adding occurrences in final dicts
567 for key in codons_freq_tmp.keys():
568 codons_lst[key].append(codons_freq_tmp[key])
569 for key in aa_freq_tmp.keys():
570 aa_lst[key].append(aa_freq_tmp[key])
571 for key in aatypes_freq_tmp.keys():
572 classif_lst[key].append(aatypes_freq_tmp[key])
573
574 # Adding occurrences in final dicts (transitions)
575 for key in codons_transitions_freq_tmp.keys():
576 for key2 in codons_transitions_freq_tmp[key].keys():
577 codons_transitions_lst[key][key2].append(codons_transitions_freq_tmp[key][key2])
578 for key in aa_transitions_freq_tmp.keys():
579 for key2 in aa_transitions_freq_tmp[key].keys():
580 aa_transitions_lst[key][key2].append(aa_transitions_freq_tmp[key][key2])
581 for key in aatypes_transitions_freq_tmp.keys():
582 for key2 in aatypes_transitions_freq_tmp[key].keys():
583 classif_transitions_lst[key][key2].append(aatypes_transitions_freq_tmp[key][key2])
584
585 return codons_lst, aa_lst, classif_lst, codons_transitions_lst, aa_transitions_lst, classif_transitions_lst
586
587 def testPvalues(dict_counts, dict_resampling, nb_iter):
588 """ Computes where the observed value is located in the expected counting distribution
589
590 Args :
591 dict_counts (dict) : observed frequencies obtained from the functions computeAllCountingsAndFreqs() or computeAllBiases()
592 dict_resampling (dict) : expected frequencies obtained from the functions computeAllCountingsAndFreqs() or computeAllBiases() within the sampling() function
593
594 Return :
595 pvalue (dict, dict of dicts) : the pvalues of all observed countings (dict) and transitions (dict of dicts)
596 """
597
598 def testPvalue(obs, exp, nb_iter):
599 """ Compute a pvalue
600
601 Args :
602 exp (list of floatsà : a list of length nb_iter, containing expected frequencies of a codon/aa/aatype at each iteration
603 obs (float) : the observed value
604 nb_iter (int) : the number of iterations for resampling
605
606 Returns :
607 pvalue (float)
608 """
609
610 max_val = nb_iter-1
611 min_val = 0
612 test_val = (max_val+min_val)/2
613
614 while max_val-min_val > 1:
615 if obs > exp[test_val]:
616 min_val = test_val
617 test_val = (max_val+min_val)/2
618 elif obs < exp[test_val]:
619 max_val = test_val
620 test_val = (max_val+min_val)/2
621 else:
622 break
623
624 pvalue = float(test_val+1)/(nb_iter+1)
625
626 return pvalue
627
628 # ------ The function ------ #
629
630 pvalues = {}
631
632 for key in dict_resampling.keys():
633 if type(dict_resampling.values()[1]) is not dict :
634 pvalues[key] = testPvalue(dict_counts[key], dict_resampling[key], nb_iter)
635 else :
636 pvalues[key] = {}
637 for key2 in dict_resampling[key].keys():
638 pvalues[key][key2] = testPvalue(dict_counts[key][key2], dict_resampling[key][key2], nb_iter)
639
640 return pvalues
641
642 def main():
643
644 # arguments
645 parser = argparse.ArgumentParser()
646 parser.add_argument("sequences_file", help="File containing sequences (the output of the tool 'ConcatPhyl'")
647 parser.add_argument("considered_species", help="The species name, separated by commas (must be the same than in the sequences_file). It is possible to consider only a subset of species.")
648 parser.add_argument("species_for_bootstrap", help="The species which will be used for bootstrapping, separated by commas. It is possible to consider only a subset of species.")
649 parser.add_argument("iteration", help="The number of iterations for bootstrapping (better if => 1000)", type=int)
650 parser.add_argument("sample_length", help="The lenght of a bootstrapped sequences (better if >= 1000", type=int)
651 args = parser.parse_args()
652
653 print "\n ------ Occurences and frequencies of codons, amino-acids, amino-acids types -------\n"
654
655 print "The script counts the number of codons, amino acids, and types of amino acids in sequences,"
656 print "as well as the mutation bias from one item to another between 2 sequences.\n"
657
658 print "Counting are then compared to empirical p-values, obtained from bootstrapped sequences obtained from a subset of sequences."
659 print "In the output files, the pvalues indicate the position of the observed data in a distribution of empirical countings obtained from"
660 print "a resample of the data. Values above 0.95 indicate a significantly higher counting, values under 0.05 a significantly lower counting."
661
662 print " Sequences file : {}".format(args.sequences_file)
663 print " Species retained for countings : {}\n".format(args.considered_species)
664
665 print "Processing : reading input file, opening output files, building dictionaries."
666
667 # make pairs
668 list_species = str.split(args.considered_species, ",")
669 list_species_boot = str.split(args.species_for_bootstrap, ",")
670 pairs_list=list(itertools.combinations(list_species,2))
671
672 # read sequences
673 sequences_for_counts = {}
674 sequences_for_resampling = {}
675 with open(args.sequences_file, "r") as file:
676 for line1,line2 in itertools.izip_longest(*[file]*2):
677 species = line1.strip(">\r\n")
678 sequence = line2.strip("\r\n")
679 if species in list_species:
680 sequences_for_counts[species] = sequence
681 if species in list_species_boot:
682 sequences_for_resampling[species] = sequence
683
684 print " Warning : countings might be biased and show high differences between species because of high variations of the indels proportions among sequences."
685 print " Frequences are more representative."
686
687 print "\n Indels percent :"
688
689 for k,v in sequences_for_counts.items():
690 print " {} : {} %".format(k, float(v.count("-"))/len(v)*100)
691
692 # useful dictionaries
693 dict_genetic_code={"F":["ttt","ttc"],
694 "L":["tta","ttg","ctt","ctc","cta","ctg"],
695 "I":["att","atc","ata"],
696 "M":["atg"],
697 "V":["gtt","gtc","gta","gtg"],
698 "S":["tct","tcc","tca","tcg","agt","agc"],
699 "P":["cct","cca","ccg","ccc"],
700 "T":["act","acc","aca","acg"],
701 "A":["gct","gcc","gca","gcg"],
702 "Y":["tat","tac"],
703 "H":["cat","cac"],
704 "Q":["caa","cag"],
705 "N":["aat","aac"],
706 "K":["aaa","aag"],
707 "D":["gat","gac"],
708 "E":["gaa","gag"],
709 "C":["tgt","tgc"],
710 "W":["tgg"],
711 "R":["cgt","cgc","cga","cgg","aga","agg"],
712 "G":["ggt","ggc","gga","ggg"]}
713
714 dict_aa_classif={"unpolar":["G","A","V","L","M","I"],
715 "polar":["S","T","C","P","N","Q"],
716 "charged":["K","R","H","D","E"],
717 "aromatics":["F","Y","W"]}
718
719 reversecode={v:k for k in dict_genetic_code for v in dict_genetic_code[k]}
720 reverseclassif={v:k for k in dict_aa_classif for v in dict_aa_classif[k]}
721
722 # codons list (without stop codons)
723 nucleotides = ['a', 'c', 'g', 't']
724 list_codons = [''.join(comb) for comb in itertools.product(nucleotides, repeat=3)]
725 list_codons.remove("taa")
726 list_codons.remove("tag")
727 list_codons.remove("tga")
728
729 # Store already computed species + row.names in output files
730 index = []
731 index_transi = []
732
733 # Final dictionaries writed to csv files
734 all_codons = {}
735 all_aa = {}
736 all_aatypes = {}
737 all_various = {}
738 all_codons_transitions = {} # Not used because too much : 61*61 columns
739 all_aa_transitions = {}
740 all_aatypes_transitions = {}
741
742 # RUN
743
744 print "\nProcessing : resampling ..."
745 print " Parameters : {niter} iterations, {lensample} codon per resampled sequence, species used : {species}\n".format(niter=args.iteration, lensample=args.sample_length, species=args.species_for_bootstrap)
746
747 codons_boot, aa_boot, aatypes_boot, codons_transi_boot, aa_transi_boot, aatypes_transi_boot = sampling(sequences_for_resampling, args.iteration, args.sample_length, list_codons, dict_genetic_code, dict_aa_classif, reversecode, reverseclassif)
748 print " Done.\n"
749
750 print "Processing : countings....\n"
751
752 # Initialize empty dictionaries for countings and transitions
753 init_dict_codons, init_dict_aa, init_dict_classif, dico_codons_transitions, dico_aa_transitions, dico_aatypes_transitions = buildDicts(list_codons, 0, dict_genetic_code, dict_aa_classif)
754
755 for pair in pairs_list:
756 p1, p2 = pair[0], pair[1]
757 if p1 not in index:
758 print "Countings on {}".format(p1)
759
760 p1_codons_counts, p1_codons_freqs, p1_aa_counts, p1_aa_freqs, p1_aatypes_counts, p1_aatypes_freqs = computeAllCountingsAndFreqs(sequences_for_counts[p1], list_codons, init_dict_codons, init_dict_aa, init_dict_classif, dict_genetic_code, dict_aa_classif)
761 p1_GC3, p1_GC12, p1_IVYWREL, p1_EKQH, p1_PAYRESDGM, p1_purineload, p1_CvP = computeVarious(sequences_for_counts[p1], p1_aa_counts, p1_aatypes_freqs)
762
763 p1_codons_pvalues = testPvalues(p1_codons_freqs, codons_boot, args.iteration)
764 p1_aa_pvalues = testPvalues(p1_aa_freqs, aa_boot, args.iteration)
765 p1_aatypes_pvalues = testPvalues(p1_aatypes_freqs, aatypes_boot, args.iteration)
766
767 all_codons[p1+"_obs_counts"] = p1_codons_counts
768 all_codons[p1+"_obs_freqs"] = p1_codons_freqs
769 all_codons[p1+"_pvalues"] = p1_codons_pvalues
770 all_aa[p1+"_obs_counts"] = p1_aa_counts
771 all_aa[p1+"_obs_freqs"] = p1_aa_freqs
772 all_aa[p1+"_pvalues"] = p1_aa_pvalues
773 all_aatypes[p1+"_obs_counts"] = p1_aatypes_counts
774 all_aatypes[p1+"_obs_freqs"] = p1_aatypes_freqs
775 all_aatypes[p1+"_pvalues"] = p1_aatypes_pvalues
776 all_various[p1] = [p1_GC3, p1_GC12, p1_IVYWREL, p1_EKQH, p1_PAYRESDGM, p1_purineload, p1_CvP]
777
778 index.append(p1)
779
780 if p2 not in index:
781 print "Countings on {}".format(p2)
782
783 p2_codons_counts, p2_codons_freqs, p2_aa_counts, p2_aa_freqs, p2_aatypes_counts, p2_aatypes_freqs = computeAllCountingsAndFreqs(sequences_for_counts[p2], list_codons, init_dict_codons, init_dict_aa, init_dict_classif, dict_genetic_code, dict_aa_classif)
784 p2_GC3, p2_GC12, p2_IVYWREL, p2_EKQH, p2_PAYRESDGM, p2_purineload, p2_CvP = computeVarious(sequences_for_counts[p2], p2_aa_counts, p2_aatypes_freqs)
785
786 p2_codons_pvalues = testPvalues(p2_codons_freqs, codons_boot, args.iteration)
787 p2_aa_pvalues = testPvalues(p2_aa_freqs, aa_boot, args.iteration)
788 p2_aatypes_pvalues = testPvalues(p2_aatypes_freqs, aatypes_boot, args.iteration)
789
790 all_codons[p2+"_obs_counts"] = p2_codons_counts
791 all_codons[p2+"_obs_freqs"] = p2_codons_freqs
792 all_codons[p2+"_pvalues"] = p2_codons_pvalues
793 all_aa[p2+"_obs_counts"] = p2_aa_counts
794 all_aa[p2+"_obs_freqs"] = p2_aa_freqs
795 all_aa[p2+"_pvalues"] = p2_aa_pvalues
796 all_aatypes[p2+"_obs_counts"] = p2_aatypes_counts
797 all_aatypes[p2+"_obs_freqs"] = p2_aatypes_freqs
798 all_aatypes[p2+"_pvalues"] = p2_aatypes_pvalues
799 all_various[p2] = p2_GC3, p2_GC12, p2_IVYWREL, p2_EKQH, p2_PAYRESDGM, p2_purineload, p2_CvP
800
801 index.append(p2)
802
803 if (p1, p2) not in index_transi and p1 in sequences_for_counts and p2 in sequences_for_counts:
804 print "Countings transitions between {} and {}".format(p1, p2)
805 codons_transitions, codons_transitions_freqs, aa_transitions, aa_transitions_freqs, aatypes_transitions, aatypes_transitions_freqs = computeAllBiases(sequences_for_counts[p1], sequences_for_counts[p2], dico_codons_transitions, dico_aa_transitions, dico_aatypes_transitions, reversecode, reverseclassif)
806 index_transi.append((p1,p2))
807
808 p1p2_codons_pvalues = testPvalues(codons_transitions_freqs, codons_transi_boot, args.iteration)
809 p1p2_aa_pvalues = testPvalues(aa_transitions_freqs, aa_transi_boot, args.iteration)
810 p1p2_aatypes_pvalues = testPvalues(aatypes_transitions_freqs, aatypes_transi_boot, args.iteration)
811
812 all_codons_transitions[p1+">"+p2+"_obs_counts"] = codons_transitions
813 all_codons_transitions[p1+">"+p2+"_obs_freqs"] = codons_transitions_freqs
814 all_codons_transitions[p1+">"+p2+"_pvalues"] = p1p2_codons_pvalues
815 all_aa_transitions[p1+">"+p2+"_obs_counts"] = aa_transitions
816 all_aa_transitions[p1+">"+p2+"_obs_freqs"] = aa_transitions_freqs
817 all_aa_transitions[p1+">"+p2+"_pvalues"] = p1p2_aa_pvalues
818 all_aatypes_transitions[p1+">"+p2+"_obs_counts"] = aatypes_transitions
819 all_aatypes_transitions[p1+">"+p2+"_obs_freqs"] = aatypes_transitions_freqs
820 all_aatypes_transitions[p1+">"+p2+"_pvalues"] = p1p2_aatypes_pvalues
821
822 index_transi.append((p1, p2))
823
824 print "\n Done.\n"
825
826 print "Processing : creating dataframes ..."
827
828 frame_codons = pd.DataFrame(all_codons).T.astype('object')
829 frame_aa = pd.DataFrame(all_aa).T.astype('object')
830 frame_aatypes = pd.DataFrame(all_aatypes).T.astype('object')
831
832 frame_codons_transitions = pd.concat({k: pd.DataFrame(v) for k, v in all_codons_transitions.items()}).unstack()
833 frame_codons_transitions.columns = frame_codons_transitions.columns.map('>'.join)
834
835 frame_aa_transitions = pd.concat({k: pd.DataFrame(v) for k, v in all_aa_transitions.items()}).unstack()
836 frame_aa_transitions.columns = frame_aa_transitions.columns.map('>'.join)
837
838 frame_aatypes_transitions = pd.concat({k: pd.DataFrame(v) for k, v in all_aatypes_transitions.items()}).unstack()
839 frame_aatypes_transitions.columns = frame_aatypes_transitions.columns.map('>'.join)
840
841 frame_various = pd.DataFrame(all_various).T
842 frame_various.columns = ["GC3","GC12","IVYWREL","EKQH","PAYRESDGM","purineload", "CvP"]
843
844 frame_codons.index.name, frame_aa.index.name, frame_aatypes.index.name = "Species", "Species","Species"
845 frame_aa_transitions.index.name, frame_aatypes_transitions.index.name, frame_various.index.name = "Species","Species","Species"
846
847 print "Writing dataframes to output files ...\n"
848
849 frame_codons.to_csv("codons_freqs.csv", sep=",", encoding="utf-8")
850 frame_aa.to_csv("aa_freqs.csv", sep=",", encoding="utf-8")
851 frame_aatypes.astype('object').to_csv("aatypes_freqs.csv", sep=",", encoding="utf-8")
852 frame_codons_transitions.to_csv("codons_transitions_freqs.csv", sep=",", encoding="utf-8")
853 frame_aa_transitions.to_csv("aa_transitions_freqs.csv", sep=",", encoding="utf-8")
854 frame_aatypes_transitions.to_csv("aatypes_transitions_freqs.csv", sep=",", encoding="utf-8")
855 frame_various.to_csv("gc_and_others_freqs.csv", sep=",", encoding="utf-8")
856
857 print "Done."
858
859 if __name__ == "__main__":
860 main()