Mercurial > repos > abims-sbr > mutcount
view scripts/S02b_study_seq_composition_nuc.py @ 4:5766f80370e7 draft
planemo upload for repository htpps://github.com/abims-sbr/adaptearch commit 17acd02c547bd1f7661a846661aa99de9743efe9
| author | abims-sbr |
|---|---|
| date | Tue, 27 Feb 2018 08:43:50 -0500 |
| parents | 988467f963f0 |
| children | 0ba551449008 |
line wrap: on
line source
#!/usr/bin/env python ## Author: Eric FONTANILLAS ## Date: 21.12.10 ## Last Version : 12/2017 by Victor Mataigne ## Object: Test for compositional bias in genome and proteome as marker of thermal adaptation (comparison between 2 "hot" species: Ap and Ps and one "cold" species: Pg) import sys,os,shutil,subprocess, string from functions import simplify_fasta_name, dico ################## ###### DEF2 ###### ################## def base_composition(seq): count_A=string.count(seq, "A") + string.count(seq, "a") count_T=string.count(seq, "T") + string.count(seq, "t") count_C=string.count(seq, "C") + string.count(seq, "c") count_G=string.count(seq, "G") + string.count(seq, "g") ## 3 ## Nucleotide proportions ln = count_C+count_G+count_T+count_A if (ln!=0): CG = count_C+count_G AT = count_T+count_A AG = count_A+count_G TC = count_T+count_C ## 1 ## Search for compositional bias in genome as marker of thermal adaptation: CG vs AT ratio_CG_AT = float(CG)/float(AT) percent_CG = float(CG)/(float(AT) + float(CG))*100 ## 2 ## Search for compositional bias in genome as marker of thermal adaptation: AG vs TC ratio_purine_pyrimidine=float(AG)/float(TC) percent_purine=float(AG)/(float(AG)+float(TC))*100 prop_A = float(count_A)/float(ln) prop_T = float(count_T)/float(ln) prop_C = float(count_C)/float(ln) prop_G = float(count_G)/float(ln) else: percent_CG=0 percent_purine=0 prop_A=0 prop_T=0 prop_C=0 prop_G=0 return(percent_CG, percent_purine, prop_A, prop_T, prop_C, prop_G) ############################################## ################## ###### DEF3 ###### ################## def purine_loading(seq): count_A=string.count(seq, "A") + string.count(seq, "a") count_T=string.count(seq, "T") + string.count(seq, "t") count_C=string.count(seq, "C") + string.count(seq, "c") count_G=string.count(seq, "G") + string.count(seq, "g") ## 3 ## Nucleotide proportions TOTAL = count_C+count_G+count_T+count_A if (TOTAL!=0): ## PLI : Purine loading indice (Forsdyke et al.) # (G-C)/N * 1000 et (A-T)/N * 1000 DIFF_GC = count_G - count_C DIFF_AT = count_A - count_T # Per bp PLI_GC = float(DIFF_GC)/float(TOTAL) PLI_AT = float(DIFF_AT)/float(TOTAL) # Per 1000 bp PLI_GC_1000 = PLI_GC*1000 PLI_AT_1000 = PLI_AT*1000 else: DIFF_GC=0 DIFF_AT=0 PLI_GC=0 PLI_AT=0 PLI_GC_1000=0 PLI_AT_1000=0 return(TOTAL, DIFF_GC, DIFF_AT,PLI_GC,PLI_AT,PLI_GC_1000,PLI_AT_1000) ############################################## ################### ### RUN RUN RUN ### ################### ##Create specific folders Path_IN_loci_NUC = "./IN_NUC" outpath= "./OUT" os.makedirs(Path_IN_loci_NUC) os.makedirs(outpath) infiles = str.split(sys.argv[1], ",") for file in infiles: os.system("cp %s %s" %(file, Path_IN_loci_NUC)) ## 1 ## List taxa LT=[] cmd="grep '>' %s" % sys.argv[2] result = subprocess.check_output(cmd, shell=True) result=result.split('\n') for i in result: sp=i[1:] if sp !='': LT.append(sp) print LT #LT = ["Ap", "Ac", "Pg"] #os.system("grep '>' %s" %(sys.argv[1])) ## 2 ## PathIN # fileIN_properties = open("01_AminoAcid_Properties2.csv", "r") Lloci_NUC = os.listdir(Path_IN_loci_NUC) ## 3 ## PathOUT ## 3.1 ## NUC composition fileOUT_NUC=open("./OUT/nuc_compositions.csv","w") fileOUT_NUC.write("LOCUS,") for taxa in LT: fileOUT_NUC.write("%s_prop_A,%s_prop_T,%s_prop_C,%s_prop_G," %(taxa,taxa,taxa,taxa)) fileOUT_NUC.write("\n") ## 3.2 ## NUC percent_GC fileOUT_percent_GC=open("./OUT/percent_GC.csv","w") fileOUT_percent_GC.write("LOCUS,") for taxa in LT: fileOUT_percent_GC.write("%s_percent_GC," %(taxa)) fileOUT_percent_GC.write("\n") ## 3.3 ## NUC percent_purine fileOUT_percent_purine=open("./OUT/percent_purine.csv","w") fileOUT_percent_purine.write("LOCUS,") for taxa in LT: fileOUT_percent_purine.write("%s_percent_purine," %(taxa)) fileOUT_percent_purine.write("\n") ## 3.4 ## Purine Load fileOUT_Purine_Load=open("./OUT/Purine_Load_Indice.csv", "w") fileOUT_Purine_Load.write("LOCUS,") for taxa in LT: fileOUT_Purine_Load.write("%s_TOTAL,%s_DIFF_GC,%s_DIFF_AT,%s_PLI_GC1000,%s_PLI_AT1000," %(taxa,taxa,taxa,taxa,taxa)) fileOUT_Purine_Load.write("\n") ##################### ## 4 ## Process Loci ##################### for locus in Lloci_NUC: print locus path_locus = "%s/%s" %(Path_IN_loci_NUC, locus) bash = dico(path_locus,LT) fileOUT_NUC.write("%s," %locus) fileOUT_percent_GC.write("%s," %locus) fileOUT_percent_purine.write("%s," %locus) fileOUT_Purine_Load.write("%s," %locus) for taxa in LT: if taxa in bash.keys(): seq = bash[taxa] percent_GC, percent_purine,prop_A, prop_T, prop_C, prop_G = base_composition(seq) ### DEF2 ### TOTAL, DIFF_GC, DIFF_AT,PLI_GC,PLI_AT,PLI_GC_1000,PLI_AT_1000 = purine_loading(seq) ### DEF3 ### fileOUT_NUC.write("%.5f,%.5f,%.5f,%.5f," %(prop_A,prop_T,prop_C,prop_G)) fileOUT_percent_GC.write("%.5f," %percent_GC) fileOUT_percent_purine.write("%.5f," %percent_purine) fileOUT_Purine_Load.write("%d,%d,%d,%.5f,%.5f," %(TOTAL, DIFF_GC, DIFF_AT,PLI_GC_1000, PLI_AT_1000)) else: fileOUT_NUC.write("%s,%s,%s,%s," %("NA","NA","NA","NA")) fileOUT_percent_GC.write("%s," %"NA") fileOUT_percent_purine.write("%s," %"NA") fileOUT_Purine_Load.write("%s,%s,%s,%s,%s," %("NA","NA","NA","NA","NA")) fileOUT_NUC.write("\n") fileOUT_percent_GC.write("\n") fileOUT_percent_purine.write("\n") fileOUT_Purine_Load.write("\n") fileOUT_NUC.close() fileOUT_percent_GC.close() fileOUT_percent_purine.close() fileOUT_Purine_Load.close()
