Mercurial > repos > abims-sbr > mutcount
comparison scripts/S01a_codons_counting.py @ 9:04a9ada73cc4 draft
planemo upload for repository htpps://github.com/abims-sbr/adaptearch commit f1ba8d136e0129f3e8435b25a95f70f697d51464-dirty
| author | abims-sbr |
|---|---|
| date | Tue, 03 Jul 2018 10:55:46 -0400 |
| parents | 0ba551449008 |
| children |
comparison
equal
deleted
inserted
replaced
| 8:705a7bf4c311 | 9:04a9ada73cc4 |
|---|---|
| 1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
| 2 # coding: utf-8 | 2 # coding: utf-8 |
| 3 # Author : Victor Mataigne | 3 # Author : Victor Mataigne |
| 4 | 4 |
| 5 import string, os, sys, re, random, itertools, argparse, copy | 5 import string, os, sys, re, random, itertools, argparse, copy, math |
| 6 import pandas as pd | 6 import pandas as pd |
| 7 import numpy as np | 7 import numpy as np |
| 8 | 8 |
| 9 def buildDicts(list_codons, content, dict_genetic_code, dict_aa_classif): | 9 def buildDicts(list_codons, content, dict_genetic_code, dict_aa_classif): |
| 10 """ Build dictionaries with values to 0. These dictionaries are used as starting point for each sequence counting | 10 """ Build dictionaries with values to 0. These dictionaries are used as starting point for each sequence counting |
| 495 aatypes_transi_freqs[key][key2] = freq | 495 aatypes_transi_freqs[key][key2] = freq |
| 496 else : | 496 else : |
| 497 aatypes_transi_freqs[key][key2] = 0 | 497 aatypes_transi_freqs[key][key2] = 0 |
| 498 return aatypes_transi_freqs | 498 return aatypes_transi_freqs |
| 499 | 499 |
| 500 | |
| 501 | |
| 502 | |
| 500 # ------ The function ------ # | 503 # ------ The function ------ # |
| 501 | 504 |
| 502 codons_transitions = codons_transitions(seq1, seq2, dico_codons_transi) | 505 codons_transitions = codons_transitions(seq1, seq2, dico_codons_transi) |
| 503 codons_transitions_freqs = codons_transitions_freqs(codons_transitions) | 506 codons_transitions_freqs = codons_transitions_freqs(codons_transitions) |
| 504 aa_transitions = aa_transitions(codons_transitions, dico_aa_transi, reversecode) | 507 aa_transitions = aa_transitions(codons_transitions, dico_aa_transi, reversecode) |
| 505 aa_transitions_freqs = aa_transitions_freqs(aa_transitions) | 508 aa_transitions_freqs = aa_transitions_freqs(aa_transitions) |
| 506 aatypes_transitions = aatypes_transitions(aa_transitions, dico_aatypes_transi, reverseclassif) | 509 aatypes_transitions = aatypes_transitions(aa_transitions, dico_aatypes_transi, reverseclassif) |
| 507 aatypes_transitions_freqs = aatypes_transitions_freqs(aatypes_transitions) | 510 aatypes_transitions_freqs = aatypes_transitions_freqs(aatypes_transitions) |
| 508 | 511 |
| 509 return codons_transitions, codons_transitions_freqs, aa_transitions, aa_transitions_freqs, aatypes_transitions, aatypes_transitions_freqs | 512 return codons_transitions, codons_transitions_freqs, aa_transitions, aa_transitions_freqs, aatypes_transitions, aatypes_transitions_freqs |
| 513 | |
| 514 def all_sed(codons_c, aa_c, aat_c, codons_transitions, aa_transitions, aatypes_transitions, dico_codons_transi, dico_aa_transi, dico_aatypes_transi): | |
| 515 | |
| 516 def compute_sed(transi, counts, dico): | |
| 517 """ Compute the substitution exchangeability disequilibrium (SED) from one species A to another B between codons/aa//aatypes couples | |
| 518 | |
| 519 Args: | |
| 520 transi ; dict - dictionaries of all counts of transition from codon/aa/aatype X to Y from sp A to sp B | |
| 521 counts : dict - dictionaries of codons/aa/aatypes counts in species A | |
| 522 dico : dict - a dictionary (nested) with all values to 0 | |
| 523 | |
| 524 """ | |
| 525 dict_sed = copy.deepcopy(dico) | |
| 526 | |
| 527 for key in transi.keys(): | |
| 528 for key2 in transi.keys(): | |
| 529 if counts[key] != 0 and float(transi[key2][key])/counts[key] != 0.0: | |
| 530 x = (float(transi[key][key2])/counts[key]) / (float(transi[key2][key])/counts[key]) | |
| 531 dict_sed[key][key2] = - pow(2,1-x)+1 | |
| 532 else : | |
| 533 dict_sed[key][key2] = 'NA' | |
| 534 | |
| 535 return dict_sed | |
| 536 | |
| 537 codons_sed = compute_sed(codons_transitions, codons_c, dico_codons_transi) | |
| 538 aa_sed = compute_sed(aa_transitions, aa_c, dico_aa_transi) | |
| 539 aatypes_sed = compute_sed(aatypes_transitions, aat_c, dico_aatypes_transi) | |
| 540 | |
| 541 return codons_sed, aa_sed, aatypes_sed | |
| 510 | 542 |
| 511 # # # Function for random resampling -------------------------------------------------------------------------------------------- | 543 # # # Function for random resampling -------------------------------------------------------------------------------------------- |
| 512 | 544 |
| 513 def sampling (dict_seq, nb_iter, len_sample, list_codons, genetic_code, aa_classif, reversecode, reverseclassif): | 545 def sampling (dict_seq, nb_iter, len_sample, list_codons, genetic_code, aa_classif, reversecode, reverseclassif): |
| 514 """ Resample randomly codons from sequences (sort of bootsrap) | 546 """ Resample randomly codons from sequences (sort of bootsrap) |
| 582 for key2 in aatypes_transitions_freq_tmp[key].keys(): | 614 for key2 in aatypes_transitions_freq_tmp[key].keys(): |
| 583 classif_transitions_lst[key][key2].append(aatypes_transitions_freq_tmp[key][key2]) | 615 classif_transitions_lst[key][key2].append(aatypes_transitions_freq_tmp[key][key2]) |
| 584 | 616 |
| 585 return codons_lst, aa_lst, classif_lst, codons_transitions_lst, aa_transitions_lst, classif_transitions_lst | 617 return codons_lst, aa_lst, classif_lst, codons_transitions_lst, aa_transitions_lst, classif_transitions_lst |
| 586 | 618 |
| 587 def testPvalues(dict_counts, dict_resampling, nb_iter): | 619 def testPvalues(dict_counts, dict_resampling, nb_iter, method): |
| 588 """ Computes where the observed value is located in the expected counting distribution | 620 """ Computes where the observed value is located in the expected counting distribution |
| 589 | 621 |
| 590 Args : | 622 Args : |
| 591 dict_counts (dict) : observed frequencies obtained from the functions computeAllCountingsAndFreqs() or computeAllBiases() | 623 dict_counts (dict) : observed frequencies obtained from the functions computeAllCountingsAndFreqs() or computeAllBiases() |
| 592 dict_resampling (dict) : expected frequencies obtained from the functions computeAllCountingsAndFreqs() or computeAllBiases() within the sampling() function | 624 dict_resampling (dict) : expected frequencies obtained from the functions computeAllCountingsAndFreqs() or computeAllBiases() within the sampling() function |
| 593 | 625 |
| 594 Return : | 626 Return : |
| 595 pvalue (dict, dict of dicts) : the pvalues of all observed countings (dict) and transitions (dict of dicts) | 627 pvalue (dict, dict of dicts) : the pvalues of all observed countings (dict) and transitions (dict of dicts) |
| 628 | |
| 629 | |
| 630 pnorm computes the pvalue to have a value inferior to the observed value under a normal distribution | |
| 631 One sided to left tail : | |
| 632 p < 0.05 indicates significantly lower counts | |
| 633 p > 0.95 indicates significantly higher counts | |
| 596 """ | 634 """ |
| 635 | |
| 636 | |
| 637 def p_resampling(obs, values, nb_iter): | |
| 638 """ The pvalue is the proportion of bootsrapped values smaller than the observed value | |
| 639 If p = 0.025 : 2.5% of the bootstrapped values are smaller than the observed value | |
| 640 p < 0.025 : the obs value is most likely significantly lower. | |
| 641 If p = 0.975 : 97.5% of the bootstrapped values are smaller than the observed value | |
| 642 p > 0.975 the obs value is most likely significantly higher. | |
| 643 | |
| 644 Args : | |
| 645 obs : int or float - the observed value | |
| 646 values : list - values of resampling (int or floats) | |
| 647 nb_iter : int - the number of resampled values (=len(values)) | |
| 648 | |
| 649 Return : | |
| 650 pvalue (float) | |
| 651 """ | |
| 652 | |
| 653 num = len([x for x in values if x < obs]) | |
| 654 return float(num + 1) / (nb_iter+1) | |
| 597 | 655 |
| 598 def testPvalue(obs, exp, nb_iter): | 656 def testPvalue(obs, exp, nb_iter): |
| 599 """ Compute a pvalue | 657 """ Compute a pvalue |
| 600 | 658 |
| 601 Args : | 659 Args : |
| 629 | 687 |
| 630 pvalues = {} | 688 pvalues = {} |
| 631 | 689 |
| 632 for key in dict_resampling.keys(): | 690 for key in dict_resampling.keys(): |
| 633 if type(dict_resampling.values()[1]) is not dict : | 691 if type(dict_resampling.values()[1]) is not dict : |
| 634 pvalues[key] = testPvalue(dict_counts[key], dict_resampling[key], nb_iter) | 692 if method == 'origin': |
| 693 pvalues[key] = testPvalue(dict_counts[key], dict_resampling[key], nb_iter) | |
| 694 elif method == 'pnorm': | |
| 695 pvalues[key] = scipy.stats.norm.cdf(dict_counts[key], np.mean(dict_resampling[key]), np.std(dict_resampling[key])) | |
| 696 elif method == 'p_resampling': | |
| 697 pvalues[key] = p_resampling(dict_counts[key], dict_resampling[key], nb_iter) | |
| 635 else : | 698 else : |
| 636 pvalues[key] = {} | 699 pvalues[key] = {} |
| 637 for key2 in dict_resampling[key].keys(): | 700 for key2 in dict_resampling[key].keys(): |
| 638 pvalues[key][key2] = testPvalue(dict_counts[key][key2], dict_resampling[key][key2], nb_iter) | 701 if method == 'origin': |
| 702 pvalues[key][key2] = testPvalue(dict_counts[key][key2], dict_resampling[key][key2], nb_iter) | |
| 703 elif method == 'pnorm': | |
| 704 pvalues[key][key2] = scipy.stats.norm.cdf(dict_counts[key][key2], np.mean(dict_resampling[key][key2]), np.std(dict_resampling[key][key2])) | |
| 705 elif method == 'p_resampling': | |
| 706 pvalues[key][key2] = p_resampling(dict_counts[key][key2], dict_resampling[key][key2], nb_iter) | |
| 639 | 707 |
| 640 return pvalues | 708 return pvalues |
| 641 | 709 |
| 642 def main(): | 710 def main(): |
| 643 | 711 |
| 757 if p1 not in index: | 825 if p1 not in index: |
| 758 print "Countings on {}".format(p1) | 826 print "Countings on {}".format(p1) |
| 759 | 827 |
| 760 p1_codons_counts, p1_codons_freqs, p1_aa_counts, p1_aa_freqs, p1_aatypes_counts, p1_aatypes_freqs = computeAllCountingsAndFreqs(sequences_for_counts[p1], list_codons, init_dict_codons, init_dict_aa, init_dict_classif, dict_genetic_code, dict_aa_classif) | 828 p1_codons_counts, p1_codons_freqs, p1_aa_counts, p1_aa_freqs, p1_aatypes_counts, p1_aatypes_freqs = computeAllCountingsAndFreqs(sequences_for_counts[p1], list_codons, init_dict_codons, init_dict_aa, init_dict_classif, dict_genetic_code, dict_aa_classif) |
| 761 p1_GC3, p1_GC12, p1_IVYWREL, p1_EKQH, p1_PAYRESDGM, p1_purineload, p1_CvP = computeVarious(sequences_for_counts[p1], p1_aa_counts, p1_aatypes_freqs) | 829 p1_GC3, p1_GC12, p1_IVYWREL, p1_EKQH, p1_PAYRESDGM, p1_purineload, p1_CvP = computeVarious(sequences_for_counts[p1], p1_aa_counts, p1_aatypes_freqs) |
| 830 | |
| 762 | 831 |
| 763 p1_codons_pvalues = testPvalues(p1_codons_freqs, codons_boot, args.iteration) | 832 p1_codons_pvalues = testPvalues(p1_codons_freqs, codons_boot, args.iteration, 'p_resampling') |
| 764 p1_aa_pvalues = testPvalues(p1_aa_freqs, aa_boot, args.iteration) | 833 p1_aa_pvalues = testPvalues(p1_aa_freqs, aa_boot, args.iteration, 'p_resampling') |
| 765 p1_aatypes_pvalues = testPvalues(p1_aatypes_freqs, aatypes_boot, args.iteration) | 834 p1_aatypes_pvalues = testPvalues(p1_aatypes_freqs, aatypes_boot, args.iteration, 'p_resampling') |
| 766 | 835 |
| 767 all_codons[p1+"_obs_counts"] = p1_codons_counts | 836 all_codons[p1+"_obs_counts"] = p1_codons_counts |
| 768 all_codons[p1+"_obs_freqs"] = p1_codons_freqs | 837 all_codons[p1+"_obs_freqs"] = p1_codons_freqs |
| 769 all_codons[p1+"_pvalues"] = p1_codons_pvalues | 838 all_codons[p1+"_pvalues"] = p1_codons_pvalues |
| 770 all_aa[p1+"_obs_counts"] = p1_aa_counts | 839 all_aa[p1+"_obs_counts"] = p1_aa_counts |
| 781 print "Countings on {}".format(p2) | 850 print "Countings on {}".format(p2) |
| 782 | 851 |
| 783 p2_codons_counts, p2_codons_freqs, p2_aa_counts, p2_aa_freqs, p2_aatypes_counts, p2_aatypes_freqs = computeAllCountingsAndFreqs(sequences_for_counts[p2], list_codons, init_dict_codons, init_dict_aa, init_dict_classif, dict_genetic_code, dict_aa_classif) | 852 p2_codons_counts, p2_codons_freqs, p2_aa_counts, p2_aa_freqs, p2_aatypes_counts, p2_aatypes_freqs = computeAllCountingsAndFreqs(sequences_for_counts[p2], list_codons, init_dict_codons, init_dict_aa, init_dict_classif, dict_genetic_code, dict_aa_classif) |
| 784 p2_GC3, p2_GC12, p2_IVYWREL, p2_EKQH, p2_PAYRESDGM, p2_purineload, p2_CvP = computeVarious(sequences_for_counts[p2], p2_aa_counts, p2_aatypes_freqs) | 853 p2_GC3, p2_GC12, p2_IVYWREL, p2_EKQH, p2_PAYRESDGM, p2_purineload, p2_CvP = computeVarious(sequences_for_counts[p2], p2_aa_counts, p2_aatypes_freqs) |
| 785 | 854 |
| 786 p2_codons_pvalues = testPvalues(p2_codons_freqs, codons_boot, args.iteration) | 855 p2_codons_pvalues = testPvalues(p2_codons_freqs, codons_boot, args.iteration, 'p_resampling') |
| 787 p2_aa_pvalues = testPvalues(p2_aa_freqs, aa_boot, args.iteration) | 856 p2_aa_pvalues = testPvalues(p2_aa_freqs, aa_boot, args.iteration, 'p_resampling') |
| 788 p2_aatypes_pvalues = testPvalues(p2_aatypes_freqs, aatypes_boot, args.iteration) | 857 p2_aatypes_pvalues = testPvalues(p2_aatypes_freqs, aatypes_boot, args.iteration, 'p_resampling') |
| 789 | 858 |
| 790 all_codons[p2+"_obs_counts"] = p2_codons_counts | 859 all_codons[p2+"_obs_counts"] = p2_codons_counts |
| 791 all_codons[p2+"_obs_freqs"] = p2_codons_freqs | 860 all_codons[p2+"_obs_freqs"] = p2_codons_freqs |
| 792 all_codons[p2+"_pvalues"] = p2_codons_pvalues | 861 all_codons[p2+"_pvalues"] = p2_codons_pvalues |
| 793 all_aa[p2+"_obs_counts"] = p2_aa_counts | 862 all_aa[p2+"_obs_counts"] = p2_aa_counts |
| 801 index.append(p2) | 870 index.append(p2) |
| 802 | 871 |
| 803 if (p1, p2) not in index_transi and p1 in sequences_for_counts and p2 in sequences_for_counts: | 872 if (p1, p2) not in index_transi and p1 in sequences_for_counts and p2 in sequences_for_counts: |
| 804 print "Countings transitions between {} and {}".format(p1, p2) | 873 print "Countings transitions between {} and {}".format(p1, p2) |
| 805 codons_transitions, codons_transitions_freqs, aa_transitions, aa_transitions_freqs, aatypes_transitions, aatypes_transitions_freqs = computeAllBiases(sequences_for_counts[p1], sequences_for_counts[p2], dico_codons_transitions, dico_aa_transitions, dico_aatypes_transitions, reversecode, reverseclassif) | 874 codons_transitions, codons_transitions_freqs, aa_transitions, aa_transitions_freqs, aatypes_transitions, aatypes_transitions_freqs = computeAllBiases(sequences_for_counts[p1], sequences_for_counts[p2], dico_codons_transitions, dico_aa_transitions, dico_aatypes_transitions, reversecode, reverseclassif) |
| 875 | |
| 876 # Ajout | |
| 877 codons_sed, aa_sed, aatypes_sed = all_sed(p1_codons_counts, p1_aa_counts, p1_aatypes_counts, codons_transitions, aa_transitions, aatypes_transitions, dico_codons_transitions, dico_aa_transitions, dico_aatypes_transitions) | |
| 878 | |
| 806 index_transi.append((p1,p2)) | 879 index_transi.append((p1,p2)) |
| 807 | 880 |
| 808 p1p2_codons_pvalues = testPvalues(codons_transitions_freqs, codons_transi_boot, args.iteration) | 881 p1p2_codons_pvalues = testPvalues(codons_transitions_freqs, codons_transi_boot, args.iteration, 'p_resampling') |
| 809 p1p2_aa_pvalues = testPvalues(aa_transitions_freqs, aa_transi_boot, args.iteration) | 882 p1p2_aa_pvalues = testPvalues(aa_transitions_freqs, aa_transi_boot, args.iteration, 'p_resampling') |
| 810 p1p2_aatypes_pvalues = testPvalues(aatypes_transitions_freqs, aatypes_transi_boot, args.iteration) | 883 p1p2_aatypes_pvalues = testPvalues(aatypes_transitions_freqs, aatypes_transi_boot, args.iteration, 'p_resampling') |
| 811 | 884 |
| 812 all_codons_transitions[p1+">"+p2+"_obs_counts"] = codons_transitions | 885 all_codons_transitions[p1+">"+p2+"_obs_counts"] = codons_transitions |
| 813 all_codons_transitions[p1+">"+p2+"_obs_freqs"] = codons_transitions_freqs | 886 all_codons_transitions[p1+">"+p2+"_obs_freqs"] = codons_transitions_freqs |
| 814 all_codons_transitions[p1+">"+p2+"_pvalues"] = p1p2_codons_pvalues | 887 all_codons_transitions[p1+">"+p2+"_pvalues"] = p1p2_codons_pvalues |
| 815 all_aa_transitions[p1+">"+p2+"_obs_counts"] = aa_transitions | 888 all_aa_transitions[p1+">"+p2+"_obs_counts"] = aa_transitions |
| 817 all_aa_transitions[p1+">"+p2+"_pvalues"] = p1p2_aa_pvalues | 890 all_aa_transitions[p1+">"+p2+"_pvalues"] = p1p2_aa_pvalues |
| 818 all_aatypes_transitions[p1+">"+p2+"_obs_counts"] = aatypes_transitions | 891 all_aatypes_transitions[p1+">"+p2+"_obs_counts"] = aatypes_transitions |
| 819 all_aatypes_transitions[p1+">"+p2+"_obs_freqs"] = aatypes_transitions_freqs | 892 all_aatypes_transitions[p1+">"+p2+"_obs_freqs"] = aatypes_transitions_freqs |
| 820 all_aatypes_transitions[p1+">"+p2+"_pvalues"] = p1p2_aatypes_pvalues | 893 all_aatypes_transitions[p1+">"+p2+"_pvalues"] = p1p2_aatypes_pvalues |
| 821 | 894 |
| 895 all_codons_transitions[p1+">"+p2+"_sed"] = codons_sed | |
| 896 all_aa_transitions[p1+">"+p2+"_sed"] = aa_sed | |
| 897 all_aatypes_transitions[p1+">"+p2+"_sed"] = aatypes_sed | |
| 898 | |
| 822 index_transi.append((p1, p2)) | 899 index_transi.append((p1, p2)) |
| 823 | 900 |
| 824 print "\n Done.\n" | 901 print "\n Done.\n" |
| 825 | 902 |
| 826 print "Processing : creating dataframes ..." | 903 print "Processing : creating dataframes ..." |
| 844 frame_codons.index.name, frame_aa.index.name, frame_aatypes.index.name = "Species", "Species","Species" | 921 frame_codons.index.name, frame_aa.index.name, frame_aatypes.index.name = "Species", "Species","Species" |
| 845 frame_aa_transitions.index.name, frame_aatypes_transitions.index.name, frame_various.index.name = "Species","Species","Species" | 922 frame_aa_transitions.index.name, frame_aatypes_transitions.index.name, frame_various.index.name = "Species","Species","Species" |
| 846 | 923 |
| 847 print "Writing dataframes to output files ...\n" | 924 print "Writing dataframes to output files ...\n" |
| 848 | 925 |
| 849 frame_codons.to_csv("codons_freqs.csv", sep=",", encoding="utf-8") | 926 frame_codons.round(8).to_csv("codons_freqs.csv", sep=",", encoding="utf-8") |
| 850 frame_aa.to_csv("aa_freqs.csv", sep=",", encoding="utf-8") | 927 frame_aa.round(8).to_csv("aa_freqs.csv", sep=",", encoding="utf-8") |
| 851 frame_aatypes.astype('object').to_csv("aatypes_freqs.csv", sep=",", encoding="utf-8") | 928 frame_aatypes.astype('object').round(8).to_csv("aatypes_freqs.csv", sep=",", encoding="utf-8") |
| 852 frame_codons_transitions.to_csv("codons_transitions_freqs.csv", sep=",", encoding="utf-8") | 929 frame_codons_transitions.round(8).to_csv("codons_transitions_freqs.csv", sep=",", encoding="utf-8") |
| 853 frame_aa_transitions.to_csv("aa_transitions_freqs.csv", sep=",", encoding="utf-8") | 930 frame_aa_transitions.round(8).to_csv("aa_transitions_freqs.csv", sep=",", encoding="utf-8") |
| 854 frame_aatypes_transitions.to_csv("aatypes_transitions_freqs.csv", sep=",", encoding="utf-8") | 931 frame_aatypes_transitions.round(8).to_csv("aatypes_transitions_freqs.csv", sep=",", encoding="utf-8") |
| 855 frame_various.to_csv("gc_and_others_freqs.csv", sep=",", encoding="utf-8") | 932 frame_various.round(8).to_csv("gc_and_others_freqs.csv", sep=",", encoding="utf-8") |
| 856 | 933 |
| 857 print "Done." | 934 print "Done." |
| 858 | 935 |
| 859 if __name__ == "__main__": | 936 if __name__ == "__main__": |
| 860 main() | 937 main() |
