Mercurial > repos > abims-sbr > mutcount
comparison scripts/S01a_codons_counting.py @ 5:0ba551449008 draft
planemo upload for repository htpps://github.com/abims-sbr/adaptearch commit 273a9af69b672b2580cd5dec4c0e67a4a96fb0fe
| author | abims-sbr |
|---|---|
| date | Tue, 27 Feb 2018 08:48:34 -0500 |
| parents | |
| children | 04a9ada73cc4 |
comparison
equal
deleted
inserted
replaced
| 4:5766f80370e7 | 5:0ba551449008 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 # coding: utf-8 | |
| 3 # Author : Victor Mataigne | |
| 4 | |
| 5 import string, os, sys, re, random, itertools, argparse, copy | |
| 6 import pandas as pd | |
| 7 import numpy as np | |
| 8 | |
| 9 def buildDicts(list_codons, content, dict_genetic_code, dict_aa_classif): | |
| 10 """ Build dictionaries with values to 0. These dictionaries are used as starting point for each sequence counting | |
| 11 | |
| 12 Args : | |
| 13 list_codons (list of str) : all codons except codons-stop | |
| 14 content (int or list) : an integer (for coutings and transitions), or an empty list (for resampling) | |
| 15 dict_genetic_code (dict) : the genetic code : {'aa1': [codon1, codon2,...], ...} | |
| 16 dict_aa_classif (dict) : the types of the amino-acids ( {type: [aa1, aa2, ...], ...} | |
| 17 | |
| 18 Returns : | |
| 19 dico_codons, dico_aa, dico_aatypes (dicts) : keys : codons/amico-acids/amico-acids types, values : 0 or [] | |
| 20 dico_codons_transitions, dico_aa_transitions, dico_aatypes_transitions (dicts of dicts) : | |
| 21 actually, the three first dictionaries nested as values of keys codons/amico-acids/amico-acids | |
| 22 """ | |
| 23 | |
| 24 # I could have make sub-routines here. | |
| 25 # the copy commands are mandatory, otherwise all dictionaries will reference the same variable(s) | |
| 26 | |
| 27 dico_codons = {} | |
| 28 for codon in list_codons: | |
| 29 dico_codons[codon] = copy.deepcopy(content) | |
| 30 | |
| 31 dico_aa = {} | |
| 32 for aa in dict_genetic_code.keys(): | |
| 33 dico_aa[aa] = copy.deepcopy(content) | |
| 34 | |
| 35 dico_aatypes = {} | |
| 36 for aatype in dict_aa_classif.keys(): | |
| 37 dico_aatypes[aatype] = copy.deepcopy(content) | |
| 38 | |
| 39 dico_codons_transitions=copy.deepcopy(dico_codons) | |
| 40 for key in dico_codons_transitions.keys(): | |
| 41 dico_codons_transitions[key]=copy.deepcopy(dico_codons) | |
| 42 | |
| 43 dico_aa_transitions = copy.deepcopy(dico_aa) | |
| 44 for key in dico_aa_transitions.keys(): | |
| 45 dico_aa_transitions[key]=copy.deepcopy(dico_aa) | |
| 46 | |
| 47 dico_aatypes_transitions = copy.deepcopy(dico_aatypes) | |
| 48 for key in dico_aatypes_transitions.keys(): | |
| 49 dico_aatypes_transitions[key]=copy.deepcopy(dico_aatypes) | |
| 50 | |
| 51 return dico_codons, dico_aa, dico_aatypes, dico_codons_transitions, dico_aa_transitions, dico_aatypes_transitions | |
| 52 | |
| 53 def viable(seqs, pos, method): | |
| 54 """ Compute if, among a set of sequences, either at least one of the codons at the specified position has not a "-", | |
| 55 or not any codon has a "-" | |
| 56 | |
| 57 Args : | |
| 58 seqs : a list (the sequences, which must all have the same size) | |
| 59 pos : an integer (the positions, <= len(seqs) -3) | |
| 60 | |
| 61 Returns: | |
| 62 bool | |
| 63 """ | |
| 64 codons = [seq[pos:pos+3] for seq in seqs] | |
| 65 if method == "all": | |
| 66 return not all("-" in codon for codon in codons) | |
| 67 elif method == "any": | |
| 68 return not any("-" in codon for codon in codons) | |
| 69 | |
| 70 # # # Function for codons, aa, aatypes Countings ------------------------------------------------------------------------------- | |
| 71 | |
| 72 def computeAllCountingsAndFreqs(seq, list_codons, init_dict_codons, init_dict_aa, init_dict_classif, dict_genetic_code, dict_aa_classif): | |
| 73 """ Call all functions dedicated to the computation of occurences and frequencies of codons, amino-acids, amino-acids types | |
| 74 | |
| 75 Args : see sub-routines | |
| 76 | |
| 77 Returns : 6 dictionaries (occurences and frequencies for codons, amino-acids, amino-acids types). See sub-routines for details. | |
| 78 """ | |
| 79 | |
| 80 # ------ Sub-routines ------ # | |
| 81 | |
| 82 def codonsCountings(seq, list_codons, init_dict_codons): | |
| 83 """ Count occurences of each codon in a sequence | |
| 84 First reading frame only : input sequence is supposed to be an ORF | |
| 85 | |
| 86 Args : | |
| 87 seq (str) : the sequence | |
| 88 list_codons (list of str) : all codons except codons-stop | |
| 89 init_dict_codons (dict) : {codon1 : 0, codon2: 0, ...} | |
| 90 | |
| 91 Return : | |
| 92 codon (dict) : codons (keys) and their occurences (values) in the sequence | |
| 93 """ | |
| 94 | |
| 95 codons = copy.deepcopy(init_dict_codons) | |
| 96 | |
| 97 l = len(seq) | |
| 98 | |
| 99 if l%3 == 0: max_indice = l-3 | |
| 100 if l%3 == 1: max_indice = l-4 | |
| 101 if l%3 == 2: max_indice = l-5 | |
| 102 | |
| 103 for codon in range(0,max_indice+1,3): | |
| 104 if "-" not in seq[codon:codon+3] : | |
| 105 codons[seq[codon:codon+3]] += 1 | |
| 106 | |
| 107 return codons | |
| 108 | |
| 109 def codonsFreqs(dict_codons_counts): | |
| 110 """ Computes frequencies of each codon in a sequence | |
| 111 | |
| 112 Args : | |
| 113 dict_codons (dict) : the output of codonsCounting() | |
| 114 | |
| 115 Return : | |
| 116 codons_freqs (dict) : codons (keys) and their frequencies (values) | |
| 117 """ | |
| 118 | |
| 119 codons_freqs = {} | |
| 120 | |
| 121 for key in dict_codons_counts.keys(): | |
| 122 freq = float(dict_codons_counts[key])/sum(dict_codons_counts.values()) | |
| 123 codons_freqs[key] = freq | |
| 124 | |
| 125 return codons_freqs | |
| 126 | |
| 127 def aaCountings(dict_codons, dict_genetic_code, init_dict_aa): | |
| 128 """ Count occurences of each amino-acid in a sequence, based on the countings of codons (1st ORF) | |
| 129 | |
| 130 Args : | |
| 131 dict_codons (dict) : the output of codonsCounting() | |
| 132 dict_genetic_code (dict) : the genetic code : {'aa1': [codon1, codon2,...], ...} | |
| 133 init_dict_aa (dict) : {aa1 : 0, aa2: 0, ...} | |
| 134 | |
| 135 Return : | |
| 136 dict_aa (dict) : amino-acids (keys) and their occurences (values) | |
| 137 """ | |
| 138 dict_aa = copy.deepcopy(init_dict_aa) | |
| 139 | |
| 140 for key in dict_codons.keys(): | |
| 141 for value in dict_genetic_code.values(): | |
| 142 if key in value: | |
| 143 dict_aa[dict_genetic_code.keys()[dict_genetic_code.values().index(value)]] += dict_codons[key] | |
| 144 | |
| 145 return dict_aa | |
| 146 | |
| 147 def aaFreqs(dict_aa_counts): | |
| 148 """ Computes frequencies of each amino-acid in a sequence, based on the countings of codons (1st ORF) | |
| 149 | |
| 150 Args : | |
| 151 dict_aa_counts (dict) : the output of aaCountings() | |
| 152 | |
| 153 Return : | |
| 154 dict_aa_freqs (dict) : amino-acids (keys) and their frequencies (values) | |
| 155 """ | |
| 156 | |
| 157 dict_aa_freqs = {} | |
| 158 | |
| 159 for key in dict_aa_counts.keys(): | |
| 160 freq = float(dict_aa_counts[key])/sum(dict_aa_counts.values()) | |
| 161 dict_aa_freqs[key] = freq | |
| 162 | |
| 163 return dict_aa_freqs | |
| 164 | |
| 165 def aatypesCountings(dict_aa, dict_aa_classif, init_dict_classif): | |
| 166 """ Computes frequencies of each amino-acid type in a sequence, based on the countings of amino-acids (1st ORF) | |
| 167 | |
| 168 Args : | |
| 169 - dict_aa (dict) : the output of aaCountings() | |
| 170 - dict_aa_classif (dict) : the types of the amino-acids ( {type: [aa1, aa2, ...], ...} ) | |
| 171 - init_dict_classif (dict) : {'polar': 0, 'apolar': 0, ...} | |
| 172 | |
| 173 Return : | |
| 174 dict_aatypes (dict) : amino-acids types (keys) and their occurences (values) | |
| 175 """ | |
| 176 dict_aatypes = copy.deepcopy(init_dict_classif) | |
| 177 | |
| 178 for key_classif in dict_aa_classif.keys(): | |
| 179 for key_aa in dict_aa.keys(): | |
| 180 if key_aa in dict_aa_classif[key_classif]: | |
| 181 dict_aatypes[key_classif] += dict_aa[key_aa] | |
| 182 | |
| 183 return dict_aatypes | |
| 184 | |
| 185 def aatypesFreqs(dict_aatypes): | |
| 186 """ Computes frequencies of each amino-acid type in a sequence, based on the countings of amino-acids (1st ORF) | |
| 187 | |
| 188 Args : | |
| 189 dict_aatypes (dict) : the output of aatypesCountings() | |
| 190 | |
| 191 Return : | |
| 192 dict_aatypes_freqs : amino-acids types (keys) and their frequencies (values) | |
| 193 """ | |
| 194 dict_aatypes_freqs = {} | |
| 195 | |
| 196 for key in dict_aatypes.keys(): | |
| 197 freq = float(dict_aatypes[key])/sum(dict_aatypes.values()) | |
| 198 dict_aatypes_freqs[key] = freq | |
| 199 | |
| 200 return dict_aatypes_freqs | |
| 201 | |
| 202 # ------ The function ------ # | |
| 203 | |
| 204 codons_c = codonsCountings(seq, list_codons, init_dict_codons) | |
| 205 codons_f = codonsFreqs(codons_c) | |
| 206 aa_c = aaCountings(codons_c, dict_genetic_code, init_dict_aa) | |
| 207 aa_f = aaFreqs(aa_c) | |
| 208 aat_c = aatypesCountings(aa_c, dict_aa_classif, init_dict_classif) | |
| 209 aat_f = aatypesFreqs(aat_c) | |
| 210 | |
| 211 return codons_c, codons_f, aa_c, aa_f, aat_c, aat_f | |
| 212 | |
| 213 # # # Functions for various measures (ivywrel, ekqh...) ------------------------------------------------------------------------- | |
| 214 | |
| 215 def computeVarious(seq, dict_aa_counts, dict_aa_types): | |
| 216 """ Call al the functions for nucleic and amino-acids sequences description | |
| 217 | |
| 218 Args : See sub-routines for details. | |
| 219 | |
| 220 Returns : 6 integer or floats. See sub-routines for details. | |
| 221 """ | |
| 222 | |
| 223 # ------ Sub-routines ------ # | |
| 224 | |
| 225 def nbCodons(seq): | |
| 226 """ Compute the number of full codons in a sequence | |
| 227 Arg : seq (str): the sequence | |
| 228 Return: nb_codons (int) | |
| 229 """ | |
| 230 l = len(seq) | |
| 231 if l%3 == 0: nb_codons = l/3 | |
| 232 if l%3 == 1: nb_codons = (l-1)/3 | |
| 233 if l%3 == 2: nb_codons = (l-2)/3 | |
| 234 return nb_codons | |
| 235 | |
| 236 def maxIndice(seq): | |
| 237 """ Compute the highest indice for parsing a sequence | |
| 238 Arg : seq (str): the sequence | |
| 239 Return : max_indice (int) | |
| 240 """ | |
| 241 l = len(seq) | |
| 242 if l%3 == 0: max_indice = l-3 | |
| 243 if l%3 == 1: max_indice = l-4 | |
| 244 if l%3 == 2: max_indice = l-5 | |
| 245 return max_indice | |
| 246 | |
| 247 def gc12Andgc3Count(seq, nb_codons, max_indice): | |
| 248 """ Compute the frequency of gc12 in a sequence | |
| 249 Arg : seq (str) : the sequence | |
| 250 Return : (float) | |
| 251 """ | |
| 252 | |
| 253 # TO IMPROVE ? : make this computation in the codonCountigns() function to avoid parsing twice the sequence ? | |
| 254 # But : involves a less readable code | |
| 255 | |
| 256 gc12 = 0 | |
| 257 gc3 = 0 | |
| 258 | |
| 259 for i in range(0, max_indice+1,3): | |
| 260 if seq[i] in ["c","g"]: gc12 += 1 | |
| 261 if seq[i+1] in ["c","g"]: gc12 += 1 | |
| 262 if seq[i+2] in ["c","g"] or seq[i+2] in ["c","g"]: gc3 += 1 | |
| 263 | |
| 264 return float(gc3)/nb_codons, float(gc12)/(2*nb_codons) | |
| 265 | |
| 266 def ivywrelCount(nb_codons, dict_aa_counts): | |
| 267 """ Compute the sum of occurences of amino-acids IVYWREL divided by the number of codons | |
| 268 | |
| 269 Args : | |
| 270 nb_codons (int) : the number of codons in the sequence | |
| 271 dict_aa_counts (dict) : the output of aaCountings() | |
| 272 | |
| 273 return : (float) | |
| 274 """ | |
| 275 | |
| 276 IVYWREL = 0 | |
| 277 | |
| 278 for aa in ["I","V","Y","W","R","E","L"]: # Impossible to make a simple sum, in case one the aa is not in the dict keys | |
| 279 if aa in dict_aa_counts.keys(): | |
| 280 IVYWREL += dict_aa_counts[aa] | |
| 281 | |
| 282 return float(IVYWREL)/nb_codons | |
| 283 | |
| 284 def ekqhCount(dict_aa_counts): | |
| 285 """ Compute the ratio of amino-acids EK/QH | |
| 286 Arg : dict_aa_counts (dict) : the output of aaCountings() | |
| 287 Return : (float) | |
| 288 """ | |
| 289 ek = 0 | |
| 290 qh = 0 | |
| 291 | |
| 292 ek = dict_aa_counts["E"] + dict_aa_counts["K"] | |
| 293 qh = dict_aa_counts["Q"] + dict_aa_counts["H"] | |
| 294 | |
| 295 if qh != 0: | |
| 296 return float(ek)/qh | |
| 297 else : return ek | |
| 298 | |
| 299 def payresdgmCount(dict_aa_counts): | |
| 300 """ Compute the ratio of amino-acids PASRE/SDGM | |
| 301 Arg : dict_aa_counts (dict) : the output of aaCountings() | |
| 302 Return : (float) | |
| 303 """ | |
| 304 payre = 0 | |
| 305 sdgm = 0 | |
| 306 | |
| 307 for aa in ["P","A","Y","R","E"]: | |
| 308 payre += dict_aa_counts[aa] | |
| 309 for aa in ["S","D","G","M"]: | |
| 310 sdgm += dict_aa_counts[aa] | |
| 311 | |
| 312 if sdgm != 0: | |
| 313 return float(payre)/sdgm | |
| 314 else : return payre | |
| 315 | |
| 316 def purineLoad(seq, nb_codons): | |
| 317 """ Compute the purine load indice of a sequence | |
| 318 Args : | |
| 319 seq (str) : the sequence | |
| 320 nb_codons (int) : the number of codons in the sequence | |
| 321 | |
| 322 Return (float) | |
| 323 """ | |
| 324 | |
| 325 # TO IMPROVE ? : make this computation in the codonCountigns() function to avoid parsing twice the sequence ? | |
| 326 # But : involves a less readable code | |
| 327 | |
| 328 g12, g3, A, c12, c3, T = 0.0,0.0,seq.count("a"),0.0,0.0,seq.count("t") | |
| 329 | |
| 330 # g3 and c3 : g and c in 3rd position of a codon | |
| 331 s = "" | |
| 332 for i in range(2, len(seq), 3): | |
| 333 s += seq[i] | |
| 334 g3 = s.count("g") | |
| 335 c3 = s.count("c") | |
| 336 | |
| 337 # g12 and c12 : g and c in 1st and 2d positions of a codon | |
| 338 s = "" | |
| 339 for i in range(0, len(seq), 3): | |
| 340 s += seq[i] | |
| 341 g12 = s.count("g") | |
| 342 c12 = s.count("c") | |
| 343 s = "" | |
| 344 for i in range(1, len(seq), 3): | |
| 345 s += seq[i] | |
| 346 g12 += s.count("g") | |
| 347 c12 += s.count("c") | |
| 348 | |
| 349 return float(1000*(g12+g3+A-c12-c3-T))/(3*nb_codons) | |
| 350 | |
| 351 def cvp(dict_aatypes): | |
| 352 """ Compute the difference nb_charged_aamino_acids - nb_polar_amino_acids | |
| 353 Return: (int) | |
| 354 """ | |
| 355 return dict_aatypes["charged"] - dict_aatypes["polar"] | |
| 356 | |
| 357 # ------ The function ------ # | |
| 358 | |
| 359 nb_codons = nbCodons(seq) | |
| 360 max_indice = maxIndice(seq) | |
| 361 GC3, GC12 = gc12Andgc3Count(seq, nb_codons, max_indice) | |
| 362 IVYWREL, EKQH, PAYRESDGM = ivywrelCount(nb_codons, dict_aa_counts), ekqhCount(dict_aa_counts), payresdgmCount(dict_aa_counts) | |
| 363 purineload, CvP = purineLoad(seq, nb_codons), cvp(dict_aa_types) | |
| 364 return GC3, GC12, IVYWREL, EKQH, PAYRESDGM, purineload, CvP | |
| 365 | |
| 366 # # # Function for codons, aa, aatypes Transitions ----------------------------------------------------------------------------- | |
| 367 | |
| 368 def computeAllBiases(seq1, seq2, dico_codons_transi, dico_aa_transi, dico_aatypes_transi, reversecode, reverseclassif) : | |
| 369 """ Compute all biases (transisitions codon->codon, aa->-aa, aa_type->aa_type) between two sequences | |
| 370 | |
| 371 Args : See sub-routines for details | |
| 372 | |
| 373 Returns 3 dictionaries of dictionaries. See sub-routines for details | |
| 374 """ | |
| 375 | |
| 376 # ------ Sub-routines ------ # | |
| 377 | |
| 378 def codons_transitions(seq1, seq2, dico_codons_transi): | |
| 379 """ Compute the number of transitions from a codon of a sequence to the codon of a second sequence | |
| 380 | |
| 381 Args : | |
| 382 seq1 (str) : the first sequence | |
| 383 seq2 (str) : the second sequence | |
| 384 dico_codons_transi (dict of dicts) : { codon1 : {codon1: 0, codon2 : 0, ...}, ..} | |
| 385 | |
| 386 Return : | |
| 387 codons_transi (dict of dicts) : the occurences of each codon to codon transition | |
| 388 """ | |
| 389 | |
| 390 codons_transi = copy.deepcopy(dico_codons_transi) | |
| 391 | |
| 392 for i in range(0, len(seq1), 3): | |
| 393 # check if no indel and if len seq[i:i+3] is really 3 (check for the last codon) | |
| 394 if viable([seq1, seq2], i, "any") and len(seq1[i:i+3]) == 3 and len(seq2[i:i+3]) == 3 : | |
| 395 codons_transi[seq1[i:i+3]][seq2[i:i+3]] += 1 | |
| 396 | |
| 397 return codons_transi | |
| 398 | |
| 399 def codons_transitions_freqs(codons_transitions_counts): | |
| 400 """ Computes frequencies of codon transitions between two sequences | |
| 401 | |
| 402 Arg : | |
| 403 codons_transitions_counts (dict) : the output of codons_transitions() | |
| 404 | |
| 405 Return : | |
| 406 codons_transi_freqs (dict of dicts) : the frequencies of each codon to codon transition | |
| 407 """ | |
| 408 | |
| 409 codons_transi_freqs = {} | |
| 410 | |
| 411 for key in codons_transitions_counts.keys(): | |
| 412 codons_transi_freqs[key] = {} | |
| 413 for key2 in codons_transitions_counts[key].keys(): | |
| 414 if sum(codons_transitions_counts[key].values()) != 0: | |
| 415 freq = float(codons_transitions_counts[key][key2])/sum(codons_transitions_counts[key].values()) | |
| 416 codons_transi_freqs[key][key2] = freq | |
| 417 else : | |
| 418 codons_transi_freqs[key][key2] = 0 | |
| 419 return codons_transi_freqs | |
| 420 | |
| 421 def aa_transitions(dico_codons_transi, dico_aa_transi, reversecode): | |
| 422 """ Compute the number of transitions from an amino-acid of a sequence to the amino-acid of a second sequence | |
| 423 | |
| 424 Args : | |
| 425 dico_codons_transi (dict of dicts) : the codons transitions computed by codons_transitions() | |
| 426 dico_aa_transi (dict of dicts) : { aa1 : {aa1: 0, aa2 : 0, ...}, ..} | |
| 427 reversecode (dict) : the reversed genetic code {aa1 : [codons],...} -> {codon1: aa1, codon2: aa2, ...} | |
| 428 | |
| 429 Return : | |
| 430 aa_transi (dict of dicts) : the occurences of each aa to aa transition | |
| 431 """ | |
| 432 | |
| 433 aa_transi = copy.deepcopy(dico_aa_transi) | |
| 434 | |
| 435 for k in dico_codons_transi.keys(): | |
| 436 newk = reversecode[k] | |
| 437 for k2 in dico_codons_transi[k].keys(): | |
| 438 newk2 = reversecode[k2] | |
| 439 aa_transi[newk][newk2] += dico_codons_transi[k][k2] | |
| 440 | |
| 441 return aa_transi | |
| 442 | |
| 443 def aa_transitions_freqs(aa_transitions_counts): | |
| 444 """ Computes frequencies of amico-acids transitions between two sequences | |
| 445 Arg : aa_transitions_counts (dict of dicts): the output of aa_transitions() | |
| 446 Return : aa_transi_freqs (dict of dicts) : the frequencies of each aa to aa transition | |
| 447 """ | |
| 448 | |
| 449 aa_transi_freqs = {} | |
| 450 | |
| 451 for key in aa_transitions_counts.keys(): | |
| 452 aa_transi_freqs[key] = {} | |
| 453 for key2 in aa_transitions_counts[key].keys(): | |
| 454 if sum(aa_transitions_counts[key].values()) != 0: | |
| 455 freq = float(aa_transitions_counts[key][key2])/sum(aa_transitions_counts[key].values()) | |
| 456 aa_transi_freqs[key][key2] = freq | |
| 457 else : | |
| 458 aa_transi_freqs[key][key2] = 0 | |
| 459 return aa_transi_freqs | |
| 460 | |
| 461 def aatypes_transitions(dico_aa_transi, dico_aatypes_transi, reverseclassif): | |
| 462 """ Compute the number of transitions from an amino-acid type of a sequence to the amino-acid type of a second sequence | |
| 463 | |
| 464 Args : | |
| 465 dico_aa_transi (dict of dicts) : the output of aa_transitions() | |
| 466 dico_aatypes_transi (dict of dicts) : { type1 : {type1: 0, type2 : 0, ...}, ..} | |
| 467 reverseclassif (dict) : the reversed amino-acid clasification {aa1: type, aa2: type, ...} | |
| 468 | |
| 469 Return : | |
| 470 aatypes_transi (dict of dicts) : the occurences of each aatype to aatype transition | |
| 471 """ | |
| 472 | |
| 473 aatypes_transi = copy.deepcopy(dico_aatypes_transi) | |
| 474 for k in dico_aa_transi.keys(): | |
| 475 newk = reverseclassif[k] | |
| 476 for k2 in dico_aa_transi[k].keys(): | |
| 477 newk2 = reverseclassif[k2] | |
| 478 aatypes_transi[newk][newk2] += dico_aa_transi[k][k2] | |
| 479 | |
| 480 return aatypes_transi | |
| 481 | |
| 482 def aatypes_transitions_freqs(aatypes_transitions_counts): | |
| 483 """ Computes frequencies of amico-acids types transitions between two sequences | |
| 484 Args : aatypes_transitions_counts (dict of dicts) : the output of aatypes_transitions() | |
| 485 Return : aatypes_transi_freqs (dict of dicts) : the frequencies of each aatype to aatype transition | |
| 486 """ | |
| 487 | |
| 488 aatypes_transi_freqs = {} | |
| 489 | |
| 490 for key in aatypes_transitions_counts.keys(): | |
| 491 aatypes_transi_freqs[key] = {} | |
| 492 for key2 in aatypes_transitions_counts[key].keys(): | |
| 493 if sum(aatypes_transitions_counts[key].values()) != 0: | |
| 494 freq = float(aatypes_transitions_counts[key][key2])/sum(aatypes_transitions_counts[key].values()) | |
| 495 aatypes_transi_freqs[key][key2] = freq | |
| 496 else : | |
| 497 aatypes_transi_freqs[key][key2] = 0 | |
| 498 return aatypes_transi_freqs | |
| 499 | |
| 500 # ------ The function ------ # | |
| 501 | |
| 502 codons_transitions = codons_transitions(seq1, seq2, dico_codons_transi) | |
| 503 codons_transitions_freqs = codons_transitions_freqs(codons_transitions) | |
| 504 aa_transitions = aa_transitions(codons_transitions, dico_aa_transi, reversecode) | |
| 505 aa_transitions_freqs = aa_transitions_freqs(aa_transitions) | |
| 506 aatypes_transitions = aatypes_transitions(aa_transitions, dico_aatypes_transi, reverseclassif) | |
| 507 aatypes_transitions_freqs = aatypes_transitions_freqs(aatypes_transitions) | |
| 508 | |
| 509 return codons_transitions, codons_transitions_freqs, aa_transitions, aa_transitions_freqs, aatypes_transitions, aatypes_transitions_freqs | |
| 510 | |
| 511 # # # Function for random resampling -------------------------------------------------------------------------------------------- | |
| 512 | |
| 513 def sampling (dict_seq, nb_iter, len_sample, list_codons, genetic_code, aa_classif, reversecode, reverseclassif): | |
| 514 """ Resample randomly codons from sequences (sort of bootsrap) | |
| 515 | |
| 516 Args : | |
| 517 dict_seq (dict) : contains the species name and sequences used ( { 'sp1': seq1, 'sp2': seq2, ...}) without '-' removal | |
| 518 nb_iter (int) : number of resampling iterations (better >= 1000) | |
| 519 len_sample (int) : length (in codons) of a resampled sequence (better >= 1000) | |
| 520 list_codons (list of str): all codons except codons-stop | |
| 521 genetic_code (dict) : the genetic code : {'aa1': [codon1, codon2,...], ...} | |
| 522 aa_classif (dict) : the types of the amino-acids : {type: [aa1, aa2, ...], ...} | |
| 523 reversecode (dict) : the reversed genetic code : {codon1: aa1, codon2: aa2, ...} | |
| 524 reverseclassif (dict) : the reversed amino-acid : clasification {aa1: type, aa2: type, ...} | |
| 525 | |
| 526 Returns : | |
| 527 codons_lst, aa_lst, classif_lst (dicts) : keys : codons/aa/aatypes, values : [] | |
| 528 codons_transitions_lst, aa_transitions_lst, classif_transitions_lst (dict of dicts) : keys : codons/aa/aatypes, values : the 3 previous dicts | |
| 529 """ | |
| 530 | |
| 531 # Initialize empty dictionaries for countings and transitions. It's also possible to isntanciate these ones in the main() but it would make a function with ~14 parameters | |
| 532 codons_0, aa_0, classif_0, codons_transitions_0, aa_transitions_0, classif_transitions_0 = buildDicts(list_codons, 0, genetic_code, aa_classif) | |
| 533 codons_lst, aa_lst, classif_lst, codons_transitions_lst, aa_transitions_lst, classif_transitions_lst = buildDicts(list_codons, [], genetic_code, aa_classif) | |
| 534 | |
| 535 # determine the max position where sampling is possible | |
| 536 l = len(dict_seq.values()[1]) | |
| 537 if l%3 == 0: max_indice = l-3 | |
| 538 if l%3 == 1: max_indice = l-4 | |
| 539 if l%3 == 2: max_indice = l-5 | |
| 540 | |
| 541 # List of positions to resample | |
| 542 viable_positions = [pos for pos in range(0,max_indice,3) if viable(dict_seq.values(), pos, "all")] | |
| 543 sample_positions = np.random.choice(viable_positions, len_sample) | |
| 544 | |
| 545 # nb_iter resampled sequences | |
| 546 for i in range(nb_iter): | |
| 547 if (i+1)%(nb_iter/10) == 0: | |
| 548 print " "+str( (i+1)*100/nb_iter)+"%" | |
| 549 | |
| 550 seqa, seqb = "", "" | |
| 551 for pos in sample_positions: | |
| 552 codona, codonb = "---", "---" | |
| 553 # The sequence to be resampled in this position is randomly chosen ; no "-" resampled | |
| 554 while "-" in codona : | |
| 555 codona = dict_seq.values()[random.randrange(0, len(dict_seq.keys())-1)][pos:pos+3] | |
| 556 while "-" in codonb : | |
| 557 codonb = dict_seq.values()[random.randrange(0, len(dict_seq.keys())-1)][pos:pos+3] | |
| 558 seqa += codona | |
| 559 seqb += codonb | |
| 560 | |
| 561 # dictionaries : frequences of codons, aa, aatypes (seq1) | |
| 562 codons_occ_tmp, codons_freq_tmp, aa_occ_tmp, aa_freq_tmp, aatypes_occ_tmp, aatypes_freq_tmp = computeAllCountingsAndFreqs(seqa, list_codons, codons_0, aa_0, classif_0, genetic_code, aa_classif) | |
| 563 # dictionaries frequences of transitions (seqa->seqb) | |
| 564 codons_transitions_tmp, codons_transitions_freq_tmp, aa_transition_tmp, aa_transitions_freq_tmp, aatypes_transitions_tmp, aatypes_transitions_freq_tmp = computeAllBiases(seqa, seqb, codons_transitions_0, aa_transitions_0, classif_transitions_0, reversecode, reverseclassif) | |
| 565 | |
| 566 # Adding occurrences in final dicts | |
| 567 for key in codons_freq_tmp.keys(): | |
| 568 codons_lst[key].append(codons_freq_tmp[key]) | |
| 569 for key in aa_freq_tmp.keys(): | |
| 570 aa_lst[key].append(aa_freq_tmp[key]) | |
| 571 for key in aatypes_freq_tmp.keys(): | |
| 572 classif_lst[key].append(aatypes_freq_tmp[key]) | |
| 573 | |
| 574 # Adding occurrences in final dicts (transitions) | |
| 575 for key in codons_transitions_freq_tmp.keys(): | |
| 576 for key2 in codons_transitions_freq_tmp[key].keys(): | |
| 577 codons_transitions_lst[key][key2].append(codons_transitions_freq_tmp[key][key2]) | |
| 578 for key in aa_transitions_freq_tmp.keys(): | |
| 579 for key2 in aa_transitions_freq_tmp[key].keys(): | |
| 580 aa_transitions_lst[key][key2].append(aa_transitions_freq_tmp[key][key2]) | |
| 581 for key in aatypes_transitions_freq_tmp.keys(): | |
| 582 for key2 in aatypes_transitions_freq_tmp[key].keys(): | |
| 583 classif_transitions_lst[key][key2].append(aatypes_transitions_freq_tmp[key][key2]) | |
| 584 | |
| 585 return codons_lst, aa_lst, classif_lst, codons_transitions_lst, aa_transitions_lst, classif_transitions_lst | |
| 586 | |
| 587 def testPvalues(dict_counts, dict_resampling, nb_iter): | |
| 588 """ Computes where the observed value is located in the expected counting distribution | |
| 589 | |
| 590 Args : | |
| 591 dict_counts (dict) : observed frequencies obtained from the functions computeAllCountingsAndFreqs() or computeAllBiases() | |
| 592 dict_resampling (dict) : expected frequencies obtained from the functions computeAllCountingsAndFreqs() or computeAllBiases() within the sampling() function | |
| 593 | |
| 594 Return : | |
| 595 pvalue (dict, dict of dicts) : the pvalues of all observed countings (dict) and transitions (dict of dicts) | |
| 596 """ | |
| 597 | |
| 598 def testPvalue(obs, exp, nb_iter): | |
| 599 """ Compute a pvalue | |
| 600 | |
| 601 Args : | |
| 602 exp (list of floatsà : a list of length nb_iter, containing expected frequencies of a codon/aa/aatype at each iteration | |
| 603 obs (float) : the observed value | |
| 604 nb_iter (int) : the number of iterations for resampling | |
| 605 | |
| 606 Returns : | |
| 607 pvalue (float) | |
| 608 """ | |
| 609 | |
| 610 max_val = nb_iter-1 | |
| 611 min_val = 0 | |
| 612 test_val = (max_val+min_val)/2 | |
| 613 | |
| 614 while max_val-min_val > 1: | |
| 615 if obs > exp[test_val]: | |
| 616 min_val = test_val | |
| 617 test_val = (max_val+min_val)/2 | |
| 618 elif obs < exp[test_val]: | |
| 619 max_val = test_val | |
| 620 test_val = (max_val+min_val)/2 | |
| 621 else: | |
| 622 break | |
| 623 | |
| 624 pvalue = float(test_val+1)/(nb_iter+1) | |
| 625 | |
| 626 return pvalue | |
| 627 | |
| 628 # ------ The function ------ # | |
| 629 | |
| 630 pvalues = {} | |
| 631 | |
| 632 for key in dict_resampling.keys(): | |
| 633 if type(dict_resampling.values()[1]) is not dict : | |
| 634 pvalues[key] = testPvalue(dict_counts[key], dict_resampling[key], nb_iter) | |
| 635 else : | |
| 636 pvalues[key] = {} | |
| 637 for key2 in dict_resampling[key].keys(): | |
| 638 pvalues[key][key2] = testPvalue(dict_counts[key][key2], dict_resampling[key][key2], nb_iter) | |
| 639 | |
| 640 return pvalues | |
| 641 | |
| 642 def main(): | |
| 643 | |
| 644 # arguments | |
| 645 parser = argparse.ArgumentParser() | |
| 646 parser.add_argument("sequences_file", help="File containing sequences (the output of the tool 'ConcatPhyl'") | |
| 647 parser.add_argument("considered_species", help="The species name, separated by commas (must be the same than in the sequences_file). It is possible to consider only a subset of species.") | |
| 648 parser.add_argument("species_for_bootstrap", help="The species which will be used for bootstrapping, separated by commas. It is possible to consider only a subset of species.") | |
| 649 parser.add_argument("iteration", help="The number of iterations for bootstrapping (better if => 1000)", type=int) | |
| 650 parser.add_argument("sample_length", help="The lenght of a bootstrapped sequences (better if >= 1000", type=int) | |
| 651 args = parser.parse_args() | |
| 652 | |
| 653 print "\n ------ Occurences and frequencies of codons, amino-acids, amino-acids types -------\n" | |
| 654 | |
| 655 print "The script counts the number of codons, amino acids, and types of amino acids in sequences," | |
| 656 print "as well as the mutation bias from one item to another between 2 sequences.\n" | |
| 657 | |
| 658 print "Counting are then compared to empirical p-values, obtained from bootstrapped sequences obtained from a subset of sequences." | |
| 659 print "In the output files, the pvalues indicate the position of the observed data in a distribution of empirical countings obtained from" | |
| 660 print "a resample of the data. Values above 0.95 indicate a significantly higher counting, values under 0.05 a significantly lower counting." | |
| 661 | |
| 662 print " Sequences file : {}".format(args.sequences_file) | |
| 663 print " Species retained for countings : {}\n".format(args.considered_species) | |
| 664 | |
| 665 print "Processing : reading input file, opening output files, building dictionaries." | |
| 666 | |
| 667 # make pairs | |
| 668 list_species = str.split(args.considered_species, ",") | |
| 669 list_species_boot = str.split(args.species_for_bootstrap, ",") | |
| 670 pairs_list=list(itertools.combinations(list_species,2)) | |
| 671 | |
| 672 # read sequences | |
| 673 sequences_for_counts = {} | |
| 674 sequences_for_resampling = {} | |
| 675 with open(args.sequences_file, "r") as file: | |
| 676 for line1,line2 in itertools.izip_longest(*[file]*2): | |
| 677 species = line1.strip(">\r\n") | |
| 678 sequence = line2.strip("\r\n") | |
| 679 if species in list_species: | |
| 680 sequences_for_counts[species] = sequence | |
| 681 if species in list_species_boot: | |
| 682 sequences_for_resampling[species] = sequence | |
| 683 | |
| 684 print " Warning : countings might be biased and show high differences between species because of high variations of the indels proportions among sequences." | |
| 685 print " Frequences are more representative." | |
| 686 | |
| 687 print "\n Indels percent :" | |
| 688 | |
| 689 for k,v in sequences_for_counts.items(): | |
| 690 print " {} : {} %".format(k, float(v.count("-"))/len(v)*100) | |
| 691 | |
| 692 # useful dictionaries | |
| 693 dict_genetic_code={"F":["ttt","ttc"], | |
| 694 "L":["tta","ttg","ctt","ctc","cta","ctg"], | |
| 695 "I":["att","atc","ata"], | |
| 696 "M":["atg"], | |
| 697 "V":["gtt","gtc","gta","gtg"], | |
| 698 "S":["tct","tcc","tca","tcg","agt","agc"], | |
| 699 "P":["cct","cca","ccg","ccc"], | |
| 700 "T":["act","acc","aca","acg"], | |
| 701 "A":["gct","gcc","gca","gcg"], | |
| 702 "Y":["tat","tac"], | |
| 703 "H":["cat","cac"], | |
| 704 "Q":["caa","cag"], | |
| 705 "N":["aat","aac"], | |
| 706 "K":["aaa","aag"], | |
| 707 "D":["gat","gac"], | |
| 708 "E":["gaa","gag"], | |
| 709 "C":["tgt","tgc"], | |
| 710 "W":["tgg"], | |
| 711 "R":["cgt","cgc","cga","cgg","aga","agg"], | |
| 712 "G":["ggt","ggc","gga","ggg"]} | |
| 713 | |
| 714 dict_aa_classif={"unpolar":["G","A","V","L","M","I"], | |
| 715 "polar":["S","T","C","P","N","Q"], | |
| 716 "charged":["K","R","H","D","E"], | |
| 717 "aromatics":["F","Y","W"]} | |
| 718 | |
| 719 reversecode={v:k for k in dict_genetic_code for v in dict_genetic_code[k]} | |
| 720 reverseclassif={v:k for k in dict_aa_classif for v in dict_aa_classif[k]} | |
| 721 | |
| 722 # codons list (without stop codons) | |
| 723 nucleotides = ['a', 'c', 'g', 't'] | |
| 724 list_codons = [''.join(comb) for comb in itertools.product(nucleotides, repeat=3)] | |
| 725 list_codons.remove("taa") | |
| 726 list_codons.remove("tag") | |
| 727 list_codons.remove("tga") | |
| 728 | |
| 729 # Store already computed species + row.names in output files | |
| 730 index = [] | |
| 731 index_transi = [] | |
| 732 | |
| 733 # Final dictionaries writed to csv files | |
| 734 all_codons = {} | |
| 735 all_aa = {} | |
| 736 all_aatypes = {} | |
| 737 all_various = {} | |
| 738 all_codons_transitions = {} # Not used because too much : 61*61 columns | |
| 739 all_aa_transitions = {} | |
| 740 all_aatypes_transitions = {} | |
| 741 | |
| 742 # RUN | |
| 743 | |
| 744 print "\nProcessing : resampling ..." | |
| 745 print " Parameters : {niter} iterations, {lensample} codon per resampled sequence, species used : {species}\n".format(niter=args.iteration, lensample=args.sample_length, species=args.species_for_bootstrap) | |
| 746 | |
| 747 codons_boot, aa_boot, aatypes_boot, codons_transi_boot, aa_transi_boot, aatypes_transi_boot = sampling(sequences_for_resampling, args.iteration, args.sample_length, list_codons, dict_genetic_code, dict_aa_classif, reversecode, reverseclassif) | |
| 748 print " Done.\n" | |
| 749 | |
| 750 print "Processing : countings....\n" | |
| 751 | |
| 752 # Initialize empty dictionaries for countings and transitions | |
| 753 init_dict_codons, init_dict_aa, init_dict_classif, dico_codons_transitions, dico_aa_transitions, dico_aatypes_transitions = buildDicts(list_codons, 0, dict_genetic_code, dict_aa_classif) | |
| 754 | |
| 755 for pair in pairs_list: | |
| 756 p1, p2 = pair[0], pair[1] | |
| 757 if p1 not in index: | |
| 758 print "Countings on {}".format(p1) | |
| 759 | |
| 760 p1_codons_counts, p1_codons_freqs, p1_aa_counts, p1_aa_freqs, p1_aatypes_counts, p1_aatypes_freqs = computeAllCountingsAndFreqs(sequences_for_counts[p1], list_codons, init_dict_codons, init_dict_aa, init_dict_classif, dict_genetic_code, dict_aa_classif) | |
| 761 p1_GC3, p1_GC12, p1_IVYWREL, p1_EKQH, p1_PAYRESDGM, p1_purineload, p1_CvP = computeVarious(sequences_for_counts[p1], p1_aa_counts, p1_aatypes_freqs) | |
| 762 | |
| 763 p1_codons_pvalues = testPvalues(p1_codons_freqs, codons_boot, args.iteration) | |
| 764 p1_aa_pvalues = testPvalues(p1_aa_freqs, aa_boot, args.iteration) | |
| 765 p1_aatypes_pvalues = testPvalues(p1_aatypes_freqs, aatypes_boot, args.iteration) | |
| 766 | |
| 767 all_codons[p1+"_obs_counts"] = p1_codons_counts | |
| 768 all_codons[p1+"_obs_freqs"] = p1_codons_freqs | |
| 769 all_codons[p1+"_pvalues"] = p1_codons_pvalues | |
| 770 all_aa[p1+"_obs_counts"] = p1_aa_counts | |
| 771 all_aa[p1+"_obs_freqs"] = p1_aa_freqs | |
| 772 all_aa[p1+"_pvalues"] = p1_aa_pvalues | |
| 773 all_aatypes[p1+"_obs_counts"] = p1_aatypes_counts | |
| 774 all_aatypes[p1+"_obs_freqs"] = p1_aatypes_freqs | |
| 775 all_aatypes[p1+"_pvalues"] = p1_aatypes_pvalues | |
| 776 all_various[p1] = [p1_GC3, p1_GC12, p1_IVYWREL, p1_EKQH, p1_PAYRESDGM, p1_purineload, p1_CvP] | |
| 777 | |
| 778 index.append(p1) | |
| 779 | |
| 780 if p2 not in index: | |
| 781 print "Countings on {}".format(p2) | |
| 782 | |
| 783 p2_codons_counts, p2_codons_freqs, p2_aa_counts, p2_aa_freqs, p2_aatypes_counts, p2_aatypes_freqs = computeAllCountingsAndFreqs(sequences_for_counts[p2], list_codons, init_dict_codons, init_dict_aa, init_dict_classif, dict_genetic_code, dict_aa_classif) | |
| 784 p2_GC3, p2_GC12, p2_IVYWREL, p2_EKQH, p2_PAYRESDGM, p2_purineload, p2_CvP = computeVarious(sequences_for_counts[p2], p2_aa_counts, p2_aatypes_freqs) | |
| 785 | |
| 786 p2_codons_pvalues = testPvalues(p2_codons_freqs, codons_boot, args.iteration) | |
| 787 p2_aa_pvalues = testPvalues(p2_aa_freqs, aa_boot, args.iteration) | |
| 788 p2_aatypes_pvalues = testPvalues(p2_aatypes_freqs, aatypes_boot, args.iteration) | |
| 789 | |
| 790 all_codons[p2+"_obs_counts"] = p2_codons_counts | |
| 791 all_codons[p2+"_obs_freqs"] = p2_codons_freqs | |
| 792 all_codons[p2+"_pvalues"] = p2_codons_pvalues | |
| 793 all_aa[p2+"_obs_counts"] = p2_aa_counts | |
| 794 all_aa[p2+"_obs_freqs"] = p2_aa_freqs | |
| 795 all_aa[p2+"_pvalues"] = p2_aa_pvalues | |
| 796 all_aatypes[p2+"_obs_counts"] = p2_aatypes_counts | |
| 797 all_aatypes[p2+"_obs_freqs"] = p2_aatypes_freqs | |
| 798 all_aatypes[p2+"_pvalues"] = p2_aatypes_pvalues | |
| 799 all_various[p2] = p2_GC3, p2_GC12, p2_IVYWREL, p2_EKQH, p2_PAYRESDGM, p2_purineload, p2_CvP | |
| 800 | |
| 801 index.append(p2) | |
| 802 | |
| 803 if (p1, p2) not in index_transi and p1 in sequences_for_counts and p2 in sequences_for_counts: | |
| 804 print "Countings transitions between {} and {}".format(p1, p2) | |
| 805 codons_transitions, codons_transitions_freqs, aa_transitions, aa_transitions_freqs, aatypes_transitions, aatypes_transitions_freqs = computeAllBiases(sequences_for_counts[p1], sequences_for_counts[p2], dico_codons_transitions, dico_aa_transitions, dico_aatypes_transitions, reversecode, reverseclassif) | |
| 806 index_transi.append((p1,p2)) | |
| 807 | |
| 808 p1p2_codons_pvalues = testPvalues(codons_transitions_freqs, codons_transi_boot, args.iteration) | |
| 809 p1p2_aa_pvalues = testPvalues(aa_transitions_freqs, aa_transi_boot, args.iteration) | |
| 810 p1p2_aatypes_pvalues = testPvalues(aatypes_transitions_freqs, aatypes_transi_boot, args.iteration) | |
| 811 | |
| 812 all_codons_transitions[p1+">"+p2+"_obs_counts"] = codons_transitions | |
| 813 all_codons_transitions[p1+">"+p2+"_obs_freqs"] = codons_transitions_freqs | |
| 814 all_codons_transitions[p1+">"+p2+"_pvalues"] = p1p2_codons_pvalues | |
| 815 all_aa_transitions[p1+">"+p2+"_obs_counts"] = aa_transitions | |
| 816 all_aa_transitions[p1+">"+p2+"_obs_freqs"] = aa_transitions_freqs | |
| 817 all_aa_transitions[p1+">"+p2+"_pvalues"] = p1p2_aa_pvalues | |
| 818 all_aatypes_transitions[p1+">"+p2+"_obs_counts"] = aatypes_transitions | |
| 819 all_aatypes_transitions[p1+">"+p2+"_obs_freqs"] = aatypes_transitions_freqs | |
| 820 all_aatypes_transitions[p1+">"+p2+"_pvalues"] = p1p2_aatypes_pvalues | |
| 821 | |
| 822 index_transi.append((p1, p2)) | |
| 823 | |
| 824 print "\n Done.\n" | |
| 825 | |
| 826 print "Processing : creating dataframes ..." | |
| 827 | |
| 828 frame_codons = pd.DataFrame(all_codons).T.astype('object') | |
| 829 frame_aa = pd.DataFrame(all_aa).T.astype('object') | |
| 830 frame_aatypes = pd.DataFrame(all_aatypes).T.astype('object') | |
| 831 | |
| 832 frame_codons_transitions = pd.concat({k: pd.DataFrame(v) for k, v in all_codons_transitions.items()}).unstack() | |
| 833 frame_codons_transitions.columns = frame_codons_transitions.columns.map('>'.join) | |
| 834 | |
| 835 frame_aa_transitions = pd.concat({k: pd.DataFrame(v) for k, v in all_aa_transitions.items()}).unstack() | |
| 836 frame_aa_transitions.columns = frame_aa_transitions.columns.map('>'.join) | |
| 837 | |
| 838 frame_aatypes_transitions = pd.concat({k: pd.DataFrame(v) for k, v in all_aatypes_transitions.items()}).unstack() | |
| 839 frame_aatypes_transitions.columns = frame_aatypes_transitions.columns.map('>'.join) | |
| 840 | |
| 841 frame_various = pd.DataFrame(all_various).T | |
| 842 frame_various.columns = ["GC3","GC12","IVYWREL","EKQH","PAYRESDGM","purineload", "CvP"] | |
| 843 | |
| 844 frame_codons.index.name, frame_aa.index.name, frame_aatypes.index.name = "Species", "Species","Species" | |
| 845 frame_aa_transitions.index.name, frame_aatypes_transitions.index.name, frame_various.index.name = "Species","Species","Species" | |
| 846 | |
| 847 print "Writing dataframes to output files ...\n" | |
| 848 | |
| 849 frame_codons.to_csv("codons_freqs.csv", sep=",", encoding="utf-8") | |
| 850 frame_aa.to_csv("aa_freqs.csv", sep=",", encoding="utf-8") | |
| 851 frame_aatypes.astype('object').to_csv("aatypes_freqs.csv", sep=",", encoding="utf-8") | |
| 852 frame_codons_transitions.to_csv("codons_transitions_freqs.csv", sep=",", encoding="utf-8") | |
| 853 frame_aa_transitions.to_csv("aa_transitions_freqs.csv", sep=",", encoding="utf-8") | |
| 854 frame_aatypes_transitions.to_csv("aatypes_transitions_freqs.csv", sep=",", encoding="utf-8") | |
| 855 frame_various.to_csv("gc_and_others_freqs.csv", sep=",", encoding="utf-8") | |
| 856 | |
| 857 print "Done." | |
| 858 | |
| 859 if __name__ == "__main__": | |
| 860 main() |
