comparison scripts/S01a_codons_counting.py @ 9:04a9ada73cc4 draft

planemo upload for repository htpps://github.com/abims-sbr/adaptearch commit f1ba8d136e0129f3e8435b25a95f70f697d51464-dirty
author abims-sbr
date Tue, 03 Jul 2018 10:55:46 -0400
parents 0ba551449008
children
comparison
equal deleted inserted replaced
8:705a7bf4c311 9:04a9ada73cc4
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 # coding: utf-8 2 # coding: utf-8
3 # Author : Victor Mataigne 3 # Author : Victor Mataigne
4 4
5 import string, os, sys, re, random, itertools, argparse, copy 5 import string, os, sys, re, random, itertools, argparse, copy, math
6 import pandas as pd 6 import pandas as pd
7 import numpy as np 7 import numpy as np
8 8
9 def buildDicts(list_codons, content, dict_genetic_code, dict_aa_classif): 9 def buildDicts(list_codons, content, dict_genetic_code, dict_aa_classif):
10 """ Build dictionaries with values to 0. These dictionaries are used as starting point for each sequence counting 10 """ Build dictionaries with values to 0. These dictionaries are used as starting point for each sequence counting
495 aatypes_transi_freqs[key][key2] = freq 495 aatypes_transi_freqs[key][key2] = freq
496 else : 496 else :
497 aatypes_transi_freqs[key][key2] = 0 497 aatypes_transi_freqs[key][key2] = 0
498 return aatypes_transi_freqs 498 return aatypes_transi_freqs
499 499
500
501
502
500 # ------ The function ------ # 503 # ------ The function ------ #
501 504
502 codons_transitions = codons_transitions(seq1, seq2, dico_codons_transi) 505 codons_transitions = codons_transitions(seq1, seq2, dico_codons_transi)
503 codons_transitions_freqs = codons_transitions_freqs(codons_transitions) 506 codons_transitions_freqs = codons_transitions_freqs(codons_transitions)
504 aa_transitions = aa_transitions(codons_transitions, dico_aa_transi, reversecode) 507 aa_transitions = aa_transitions(codons_transitions, dico_aa_transi, reversecode)
505 aa_transitions_freqs = aa_transitions_freqs(aa_transitions) 508 aa_transitions_freqs = aa_transitions_freqs(aa_transitions)
506 aatypes_transitions = aatypes_transitions(aa_transitions, dico_aatypes_transi, reverseclassif) 509 aatypes_transitions = aatypes_transitions(aa_transitions, dico_aatypes_transi, reverseclassif)
507 aatypes_transitions_freqs = aatypes_transitions_freqs(aatypes_transitions) 510 aatypes_transitions_freqs = aatypes_transitions_freqs(aatypes_transitions)
508 511
509 return codons_transitions, codons_transitions_freqs, aa_transitions, aa_transitions_freqs, aatypes_transitions, aatypes_transitions_freqs 512 return codons_transitions, codons_transitions_freqs, aa_transitions, aa_transitions_freqs, aatypes_transitions, aatypes_transitions_freqs
513
514 def all_sed(codons_c, aa_c, aat_c, codons_transitions, aa_transitions, aatypes_transitions, dico_codons_transi, dico_aa_transi, dico_aatypes_transi):
515
516 def compute_sed(transi, counts, dico):
517 """ Compute the substitution exchangeability disequilibrium (SED) from one species A to another B between codons/aa//aatypes couples
518
519 Args:
520 transi ; dict - dictionaries of all counts of transition from codon/aa/aatype X to Y from sp A to sp B
521 counts : dict - dictionaries of codons/aa/aatypes counts in species A
522 dico : dict - a dictionary (nested) with all values to 0
523
524 """
525 dict_sed = copy.deepcopy(dico)
526
527 for key in transi.keys():
528 for key2 in transi.keys():
529 if counts[key] != 0 and float(transi[key2][key])/counts[key] != 0.0:
530 x = (float(transi[key][key2])/counts[key]) / (float(transi[key2][key])/counts[key])
531 dict_sed[key][key2] = - pow(2,1-x)+1
532 else :
533 dict_sed[key][key2] = 'NA'
534
535 return dict_sed
536
537 codons_sed = compute_sed(codons_transitions, codons_c, dico_codons_transi)
538 aa_sed = compute_sed(aa_transitions, aa_c, dico_aa_transi)
539 aatypes_sed = compute_sed(aatypes_transitions, aat_c, dico_aatypes_transi)
540
541 return codons_sed, aa_sed, aatypes_sed
510 542
511 # # # Function for random resampling -------------------------------------------------------------------------------------------- 543 # # # Function for random resampling --------------------------------------------------------------------------------------------
512 544
513 def sampling (dict_seq, nb_iter, len_sample, list_codons, genetic_code, aa_classif, reversecode, reverseclassif): 545 def sampling (dict_seq, nb_iter, len_sample, list_codons, genetic_code, aa_classif, reversecode, reverseclassif):
514 """ Resample randomly codons from sequences (sort of bootsrap) 546 """ Resample randomly codons from sequences (sort of bootsrap)
582 for key2 in aatypes_transitions_freq_tmp[key].keys(): 614 for key2 in aatypes_transitions_freq_tmp[key].keys():
583 classif_transitions_lst[key][key2].append(aatypes_transitions_freq_tmp[key][key2]) 615 classif_transitions_lst[key][key2].append(aatypes_transitions_freq_tmp[key][key2])
584 616
585 return codons_lst, aa_lst, classif_lst, codons_transitions_lst, aa_transitions_lst, classif_transitions_lst 617 return codons_lst, aa_lst, classif_lst, codons_transitions_lst, aa_transitions_lst, classif_transitions_lst
586 618
587 def testPvalues(dict_counts, dict_resampling, nb_iter): 619 def testPvalues(dict_counts, dict_resampling, nb_iter, method):
588 """ Computes where the observed value is located in the expected counting distribution 620 """ Computes where the observed value is located in the expected counting distribution
589 621
590 Args : 622 Args :
591 dict_counts (dict) : observed frequencies obtained from the functions computeAllCountingsAndFreqs() or computeAllBiases() 623 dict_counts (dict) : observed frequencies obtained from the functions computeAllCountingsAndFreqs() or computeAllBiases()
592 dict_resampling (dict) : expected frequencies obtained from the functions computeAllCountingsAndFreqs() or computeAllBiases() within the sampling() function 624 dict_resampling (dict) : expected frequencies obtained from the functions computeAllCountingsAndFreqs() or computeAllBiases() within the sampling() function
593 625
594 Return : 626 Return :
595 pvalue (dict, dict of dicts) : the pvalues of all observed countings (dict) and transitions (dict of dicts) 627 pvalue (dict, dict of dicts) : the pvalues of all observed countings (dict) and transitions (dict of dicts)
628
629
630 pnorm computes the pvalue to have a value inferior to the observed value under a normal distribution
631 One sided to left tail :
632 p < 0.05 indicates significantly lower counts
633 p > 0.95 indicates significantly higher counts
596 """ 634 """
635
636
637 def p_resampling(obs, values, nb_iter):
638 """ The pvalue is the proportion of bootsrapped values smaller than the observed value
639 If p = 0.025 : 2.5% of the bootstrapped values are smaller than the observed value
640 p < 0.025 : the obs value is most likely significantly lower.
641 If p = 0.975 : 97.5% of the bootstrapped values are smaller than the observed value
642 p > 0.975 the obs value is most likely significantly higher.
643
644 Args :
645 obs : int or float - the observed value
646 values : list - values of resampling (int or floats)
647 nb_iter : int - the number of resampled values (=len(values))
648
649 Return :
650 pvalue (float)
651 """
652
653 num = len([x for x in values if x < obs])
654 return float(num + 1) / (nb_iter+1)
597 655
598 def testPvalue(obs, exp, nb_iter): 656 def testPvalue(obs, exp, nb_iter):
599 """ Compute a pvalue 657 """ Compute a pvalue
600 658
601 Args : 659 Args :
629 687
630 pvalues = {} 688 pvalues = {}
631 689
632 for key in dict_resampling.keys(): 690 for key in dict_resampling.keys():
633 if type(dict_resampling.values()[1]) is not dict : 691 if type(dict_resampling.values()[1]) is not dict :
634 pvalues[key] = testPvalue(dict_counts[key], dict_resampling[key], nb_iter) 692 if method == 'origin':
693 pvalues[key] = testPvalue(dict_counts[key], dict_resampling[key], nb_iter)
694 elif method == 'pnorm':
695 pvalues[key] = scipy.stats.norm.cdf(dict_counts[key], np.mean(dict_resampling[key]), np.std(dict_resampling[key]))
696 elif method == 'p_resampling':
697 pvalues[key] = p_resampling(dict_counts[key], dict_resampling[key], nb_iter)
635 else : 698 else :
636 pvalues[key] = {} 699 pvalues[key] = {}
637 for key2 in dict_resampling[key].keys(): 700 for key2 in dict_resampling[key].keys():
638 pvalues[key][key2] = testPvalue(dict_counts[key][key2], dict_resampling[key][key2], nb_iter) 701 if method == 'origin':
702 pvalues[key][key2] = testPvalue(dict_counts[key][key2], dict_resampling[key][key2], nb_iter)
703 elif method == 'pnorm':
704 pvalues[key][key2] = scipy.stats.norm.cdf(dict_counts[key][key2], np.mean(dict_resampling[key][key2]), np.std(dict_resampling[key][key2]))
705 elif method == 'p_resampling':
706 pvalues[key][key2] = p_resampling(dict_counts[key][key2], dict_resampling[key][key2], nb_iter)
639 707
640 return pvalues 708 return pvalues
641 709
642 def main(): 710 def main():
643 711
757 if p1 not in index: 825 if p1 not in index:
758 print "Countings on {}".format(p1) 826 print "Countings on {}".format(p1)
759 827
760 p1_codons_counts, p1_codons_freqs, p1_aa_counts, p1_aa_freqs, p1_aatypes_counts, p1_aatypes_freqs = computeAllCountingsAndFreqs(sequences_for_counts[p1], list_codons, init_dict_codons, init_dict_aa, init_dict_classif, dict_genetic_code, dict_aa_classif) 828 p1_codons_counts, p1_codons_freqs, p1_aa_counts, p1_aa_freqs, p1_aatypes_counts, p1_aatypes_freqs = computeAllCountingsAndFreqs(sequences_for_counts[p1], list_codons, init_dict_codons, init_dict_aa, init_dict_classif, dict_genetic_code, dict_aa_classif)
761 p1_GC3, p1_GC12, p1_IVYWREL, p1_EKQH, p1_PAYRESDGM, p1_purineload, p1_CvP = computeVarious(sequences_for_counts[p1], p1_aa_counts, p1_aatypes_freqs) 829 p1_GC3, p1_GC12, p1_IVYWREL, p1_EKQH, p1_PAYRESDGM, p1_purineload, p1_CvP = computeVarious(sequences_for_counts[p1], p1_aa_counts, p1_aatypes_freqs)
830
762 831
763 p1_codons_pvalues = testPvalues(p1_codons_freqs, codons_boot, args.iteration) 832 p1_codons_pvalues = testPvalues(p1_codons_freqs, codons_boot, args.iteration, 'p_resampling')
764 p1_aa_pvalues = testPvalues(p1_aa_freqs, aa_boot, args.iteration) 833 p1_aa_pvalues = testPvalues(p1_aa_freqs, aa_boot, args.iteration, 'p_resampling')
765 p1_aatypes_pvalues = testPvalues(p1_aatypes_freqs, aatypes_boot, args.iteration) 834 p1_aatypes_pvalues = testPvalues(p1_aatypes_freqs, aatypes_boot, args.iteration, 'p_resampling')
766 835
767 all_codons[p1+"_obs_counts"] = p1_codons_counts 836 all_codons[p1+"_obs_counts"] = p1_codons_counts
768 all_codons[p1+"_obs_freqs"] = p1_codons_freqs 837 all_codons[p1+"_obs_freqs"] = p1_codons_freqs
769 all_codons[p1+"_pvalues"] = p1_codons_pvalues 838 all_codons[p1+"_pvalues"] = p1_codons_pvalues
770 all_aa[p1+"_obs_counts"] = p1_aa_counts 839 all_aa[p1+"_obs_counts"] = p1_aa_counts
781 print "Countings on {}".format(p2) 850 print "Countings on {}".format(p2)
782 851
783 p2_codons_counts, p2_codons_freqs, p2_aa_counts, p2_aa_freqs, p2_aatypes_counts, p2_aatypes_freqs = computeAllCountingsAndFreqs(sequences_for_counts[p2], list_codons, init_dict_codons, init_dict_aa, init_dict_classif, dict_genetic_code, dict_aa_classif) 852 p2_codons_counts, p2_codons_freqs, p2_aa_counts, p2_aa_freqs, p2_aatypes_counts, p2_aatypes_freqs = computeAllCountingsAndFreqs(sequences_for_counts[p2], list_codons, init_dict_codons, init_dict_aa, init_dict_classif, dict_genetic_code, dict_aa_classif)
784 p2_GC3, p2_GC12, p2_IVYWREL, p2_EKQH, p2_PAYRESDGM, p2_purineload, p2_CvP = computeVarious(sequences_for_counts[p2], p2_aa_counts, p2_aatypes_freqs) 853 p2_GC3, p2_GC12, p2_IVYWREL, p2_EKQH, p2_PAYRESDGM, p2_purineload, p2_CvP = computeVarious(sequences_for_counts[p2], p2_aa_counts, p2_aatypes_freqs)
785 854
786 p2_codons_pvalues = testPvalues(p2_codons_freqs, codons_boot, args.iteration) 855 p2_codons_pvalues = testPvalues(p2_codons_freqs, codons_boot, args.iteration, 'p_resampling')
787 p2_aa_pvalues = testPvalues(p2_aa_freqs, aa_boot, args.iteration) 856 p2_aa_pvalues = testPvalues(p2_aa_freqs, aa_boot, args.iteration, 'p_resampling')
788 p2_aatypes_pvalues = testPvalues(p2_aatypes_freqs, aatypes_boot, args.iteration) 857 p2_aatypes_pvalues = testPvalues(p2_aatypes_freqs, aatypes_boot, args.iteration, 'p_resampling')
789 858
790 all_codons[p2+"_obs_counts"] = p2_codons_counts 859 all_codons[p2+"_obs_counts"] = p2_codons_counts
791 all_codons[p2+"_obs_freqs"] = p2_codons_freqs 860 all_codons[p2+"_obs_freqs"] = p2_codons_freqs
792 all_codons[p2+"_pvalues"] = p2_codons_pvalues 861 all_codons[p2+"_pvalues"] = p2_codons_pvalues
793 all_aa[p2+"_obs_counts"] = p2_aa_counts 862 all_aa[p2+"_obs_counts"] = p2_aa_counts
801 index.append(p2) 870 index.append(p2)
802 871
803 if (p1, p2) not in index_transi and p1 in sequences_for_counts and p2 in sequences_for_counts: 872 if (p1, p2) not in index_transi and p1 in sequences_for_counts and p2 in sequences_for_counts:
804 print "Countings transitions between {} and {}".format(p1, p2) 873 print "Countings transitions between {} and {}".format(p1, p2)
805 codons_transitions, codons_transitions_freqs, aa_transitions, aa_transitions_freqs, aatypes_transitions, aatypes_transitions_freqs = computeAllBiases(sequences_for_counts[p1], sequences_for_counts[p2], dico_codons_transitions, dico_aa_transitions, dico_aatypes_transitions, reversecode, reverseclassif) 874 codons_transitions, codons_transitions_freqs, aa_transitions, aa_transitions_freqs, aatypes_transitions, aatypes_transitions_freqs = computeAllBiases(sequences_for_counts[p1], sequences_for_counts[p2], dico_codons_transitions, dico_aa_transitions, dico_aatypes_transitions, reversecode, reverseclassif)
875
876 # Ajout
877 codons_sed, aa_sed, aatypes_sed = all_sed(p1_codons_counts, p1_aa_counts, p1_aatypes_counts, codons_transitions, aa_transitions, aatypes_transitions, dico_codons_transitions, dico_aa_transitions, dico_aatypes_transitions)
878
806 index_transi.append((p1,p2)) 879 index_transi.append((p1,p2))
807 880
808 p1p2_codons_pvalues = testPvalues(codons_transitions_freqs, codons_transi_boot, args.iteration) 881 p1p2_codons_pvalues = testPvalues(codons_transitions_freqs, codons_transi_boot, args.iteration, 'p_resampling')
809 p1p2_aa_pvalues = testPvalues(aa_transitions_freqs, aa_transi_boot, args.iteration) 882 p1p2_aa_pvalues = testPvalues(aa_transitions_freqs, aa_transi_boot, args.iteration, 'p_resampling')
810 p1p2_aatypes_pvalues = testPvalues(aatypes_transitions_freqs, aatypes_transi_boot, args.iteration) 883 p1p2_aatypes_pvalues = testPvalues(aatypes_transitions_freqs, aatypes_transi_boot, args.iteration, 'p_resampling')
811 884
812 all_codons_transitions[p1+">"+p2+"_obs_counts"] = codons_transitions 885 all_codons_transitions[p1+">"+p2+"_obs_counts"] = codons_transitions
813 all_codons_transitions[p1+">"+p2+"_obs_freqs"] = codons_transitions_freqs 886 all_codons_transitions[p1+">"+p2+"_obs_freqs"] = codons_transitions_freqs
814 all_codons_transitions[p1+">"+p2+"_pvalues"] = p1p2_codons_pvalues 887 all_codons_transitions[p1+">"+p2+"_pvalues"] = p1p2_codons_pvalues
815 all_aa_transitions[p1+">"+p2+"_obs_counts"] = aa_transitions 888 all_aa_transitions[p1+">"+p2+"_obs_counts"] = aa_transitions
817 all_aa_transitions[p1+">"+p2+"_pvalues"] = p1p2_aa_pvalues 890 all_aa_transitions[p1+">"+p2+"_pvalues"] = p1p2_aa_pvalues
818 all_aatypes_transitions[p1+">"+p2+"_obs_counts"] = aatypes_transitions 891 all_aatypes_transitions[p1+">"+p2+"_obs_counts"] = aatypes_transitions
819 all_aatypes_transitions[p1+">"+p2+"_obs_freqs"] = aatypes_transitions_freqs 892 all_aatypes_transitions[p1+">"+p2+"_obs_freqs"] = aatypes_transitions_freqs
820 all_aatypes_transitions[p1+">"+p2+"_pvalues"] = p1p2_aatypes_pvalues 893 all_aatypes_transitions[p1+">"+p2+"_pvalues"] = p1p2_aatypes_pvalues
821 894
895 all_codons_transitions[p1+">"+p2+"_sed"] = codons_sed
896 all_aa_transitions[p1+">"+p2+"_sed"] = aa_sed
897 all_aatypes_transitions[p1+">"+p2+"_sed"] = aatypes_sed
898
822 index_transi.append((p1, p2)) 899 index_transi.append((p1, p2))
823 900
824 print "\n Done.\n" 901 print "\n Done.\n"
825 902
826 print "Processing : creating dataframes ..." 903 print "Processing : creating dataframes ..."
844 frame_codons.index.name, frame_aa.index.name, frame_aatypes.index.name = "Species", "Species","Species" 921 frame_codons.index.name, frame_aa.index.name, frame_aatypes.index.name = "Species", "Species","Species"
845 frame_aa_transitions.index.name, frame_aatypes_transitions.index.name, frame_various.index.name = "Species","Species","Species" 922 frame_aa_transitions.index.name, frame_aatypes_transitions.index.name, frame_various.index.name = "Species","Species","Species"
846 923
847 print "Writing dataframes to output files ...\n" 924 print "Writing dataframes to output files ...\n"
848 925
849 frame_codons.to_csv("codons_freqs.csv", sep=",", encoding="utf-8") 926 frame_codons.round(8).to_csv("codons_freqs.csv", sep=",", encoding="utf-8")
850 frame_aa.to_csv("aa_freqs.csv", sep=",", encoding="utf-8") 927 frame_aa.round(8).to_csv("aa_freqs.csv", sep=",", encoding="utf-8")
851 frame_aatypes.astype('object').to_csv("aatypes_freqs.csv", sep=",", encoding="utf-8") 928 frame_aatypes.astype('object').round(8).to_csv("aatypes_freqs.csv", sep=",", encoding="utf-8")
852 frame_codons_transitions.to_csv("codons_transitions_freqs.csv", sep=",", encoding="utf-8") 929 frame_codons_transitions.round(8).to_csv("codons_transitions_freqs.csv", sep=",", encoding="utf-8")
853 frame_aa_transitions.to_csv("aa_transitions_freqs.csv", sep=",", encoding="utf-8") 930 frame_aa_transitions.round(8).to_csv("aa_transitions_freqs.csv", sep=",", encoding="utf-8")
854 frame_aatypes_transitions.to_csv("aatypes_transitions_freqs.csv", sep=",", encoding="utf-8") 931 frame_aatypes_transitions.round(8).to_csv("aatypes_transitions_freqs.csv", sep=",", encoding="utf-8")
855 frame_various.to_csv("gc_and_others_freqs.csv", sep=",", encoding="utf-8") 932 frame_various.round(8).to_csv("gc_and_others_freqs.csv", sep=",", encoding="utf-8")
856 933
857 print "Done." 934 print "Done."
858 935
859 if __name__ == "__main__": 936 if __name__ == "__main__":
860 main() 937 main()