Mercurial > repos > abims-sbr > mutcount
diff scripts/S02b_study_seq_composition_nuc.py @ 10:f62c76aab669 draft default tip
planemo upload for repository htpps://github.com/abims-sbr/adaptearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1
| author | lecorguille |
|---|---|
| date | Mon, 24 Sep 2018 04:34:39 -0400 |
| parents | 04a9ada73cc4 |
| children |
line wrap: on
line diff
--- a/scripts/S02b_study_seq_composition_nuc.py Tue Jul 03 10:55:46 2018 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,205 +0,0 @@ -#!/usr/bin/env python -## Author: Eric FONTANILLAS -## Date: 21.12.10 -## Last Version : 12/2017 by Victor Mataigne -## Object: Test for compositional bias in genome and proteome as marker of thermal adaptation (comparison between 2 "hot" species: Ap and Ps and one "cold" species: Pg) - -import sys,os,shutil,subprocess, string, itertools -from functions import simplify_fasta_name, dico - -################## -###### DEF2 ###### -################## -def base_composition(seq): - count_A=string.count(seq, "A") + string.count(seq, "a") - count_T=string.count(seq, "T") + string.count(seq, "t") - count_C=string.count(seq, "C") + string.count(seq, "c") - count_G=string.count(seq, "G") + string.count(seq, "g") - ## 3 ## Nucleotide proportions - ln = count_C+count_G+count_T+count_A - if (ln!=0): - - CG = count_C+count_G - AT = count_T+count_A - - AG = count_A+count_G - TC = count_T+count_C - - ## 1 ## Search for compositional bias in genome as marker of thermal adaptation: CG vs AT - ratio_CG_AT = float(CG)/float(AT) - percent_CG = float(CG)/(float(AT) + float(CG))*100 - - ## 2 ## Search for compositional bias in genome as marker of thermal adaptation: AG vs TC - ratio_purine_pyrimidine=float(AG)/float(TC) - percent_purine=float(AG)/(float(AG)+float(TC))*100 - - - prop_A = float(count_A)/float(ln) - prop_T = float(count_T)/float(ln) - prop_C = float(count_C)/float(ln) - prop_G = float(count_G)/float(ln) - else: - percent_CG=0 - percent_purine=0 - prop_A=0 - prop_T=0 - prop_C=0 - prop_G=0 - - return(percent_CG, percent_purine, prop_A, prop_T, prop_C, prop_G) -############################################## - -################## -###### DEF3 ###### -################## -def purine_loading(seq): - count_A=string.count(seq, "A") + string.count(seq, "a") - count_T=string.count(seq, "T") + string.count(seq, "t") - count_C=string.count(seq, "C") + string.count(seq, "c") - count_G=string.count(seq, "G") + string.count(seq, "g") - ## 3 ## Nucleotide proportions - TOTAL = count_C+count_G+count_T+count_A - if (TOTAL!=0): - - ## PLI : Purine loading indice (Forsdyke et al.) - # (G-C)/N * 1000 et (A-T)/N * 1000 - - DIFF_GC = count_G - count_C - DIFF_AT = count_A - count_T - - # Per bp - PLI_GC = float(DIFF_GC)/float(TOTAL) - PLI_AT = float(DIFF_AT)/float(TOTAL) - - # Per 1000 bp - PLI_GC_1000 = PLI_GC*1000 - PLI_AT_1000 = PLI_AT*1000 - else: - DIFF_GC=0 - DIFF_AT=0 - PLI_GC=0 - PLI_AT=0 - PLI_GC_1000=0 - PLI_AT_1000=0 - - return(TOTAL, DIFF_GC, DIFF_AT,PLI_GC,PLI_AT,PLI_GC_1000,PLI_AT_1000) -############################################## - -################### -### RUN RUN RUN ### -################### - -##Create specific folders -Path_IN_loci_NUC = "./IN_NUC" -outpath= "./OUT" -os.makedirs(Path_IN_loci_NUC) -os.makedirs(outpath) - -infiles = [] -with open(sys.argv[2], 'r') as f: - for line in f.readlines(): - infiles.append(line.strip('\n')) - -#infiles = str.split(sys.argv[1], ",") -for file in infiles: - os.system("cp %s %s" %(file, Path_IN_loci_NUC)) - -## 1 ## List taxa -LT=[] -cmd="grep '>' {}".format(sys.argv[1]) -result = subprocess.check_output(cmd, shell=True) -result=result.split('\n') -for i in result: - sp=i[1:] - if sp !='': - LT.append(sp) -print LT - -## 2 ## PathIN -Lloci_NUC = os.listdir(Path_IN_loci_NUC) - - -## 3 ## PathOUT -## 3.1 ## NUC composition -fileOUT_NUC=open("./OUT/nuc_compositions.csv","w") -fileOUT_NUC.write("LOCUS,") -for taxa in LT[0:-1]: - fileOUT_NUC.write("%s_prop_A,%s_prop_T,%s_prop_C,%s_prop_G," %(taxa,taxa,taxa,taxa)) -fileOUT_NUC.write("%s_prop_A,%s_prop_T,%s_prop_C,%s_prop_G" %(LT[-1],LT[-1],LT[-1],LT[-1])) -fileOUT_NUC.write("\n") - -## 3.2 ## NUC percent_GC -fileOUT_percent_GC=open("./OUT/percent_GC.csv","w") -fileOUT_percent_GC.write("LOCUS,") -for taxa in LT[0:-1]: - fileOUT_percent_GC.write("%s_percent_GC," %(taxa)) -fileOUT_percent_GC.write("%s_percentGC" %(LT[-1])) -fileOUT_percent_GC.write("\n") - -## 3.3 ## NUC percent_purine -fileOUT_percent_purine=open("./OUT/percent_purine.csv","w") -fileOUT_percent_purine.write("LOCUS,") -for taxa in LT[0:-1]: - fileOUT_percent_purine.write("%s_percent_purine," %(taxa)) -fileOUT_percent_purine.write("%s_percent_purine" %(LT[-1])) -fileOUT_percent_purine.write("\n") - -## 3.4 ## Purine Load -fileOUT_Purine_Load=open("./OUT/Purine_Load_Indice.csv", "w") -fileOUT_Purine_Load.write("LOCUS,") -for taxa in LT[0:-1]: - fileOUT_Purine_Load.write("%s_TOTAL,%s_DIFF_GC,%s_DIFF_AT,%s_PLI_GC1000,%s_PLI_AT1000," %(taxa,taxa,taxa,taxa,taxa)) -fileOUT_Purine_Load.write("%s_TOTAL,%s_DIFF_GC,%s_DIFF_AT,%s_PLI_GC1000,%s_PLI_AT1000" %(LT[-1],LT[-1],LT[-1],LT[-1],LT[-1])) -fileOUT_Purine_Load.write("\n") - -##################### -## 4 ## Process Loci -##################### -for locus in Lloci_NUC: - print locus - path_locus = "%s/%s" %(Path_IN_loci_NUC, locus) - bash = dico(path_locus,LT) - - fileOUT_NUC.write("%s," %locus) - fileOUT_percent_GC.write("%s," %locus) - fileOUT_percent_purine.write("%s," %locus) - fileOUT_Purine_Load.write("%s," %locus) - - for taxa in LT[0:-1]: - if taxa in bash.keys(): - seq = bash[taxa] - percent_GC, percent_purine,prop_A, prop_T, prop_C, prop_G = base_composition(seq) ### DEF2 ### - TOTAL, DIFF_GC, DIFF_AT,PLI_GC,PLI_AT,PLI_GC_1000,PLI_AT_1000 = purine_loading(seq) ### DEF3 ### - fileOUT_NUC.write("%.5f,%.5f,%.5f,%.5f," %(prop_A,prop_T,prop_C,prop_G)) - fileOUT_percent_GC.write("%.5f," %percent_GC) - fileOUT_percent_purine.write("%.5f," %percent_purine) - fileOUT_Purine_Load.write("%d,%d,%d,%.5f,%.5f," %(TOTAL, DIFF_GC, DIFF_AT,PLI_GC_1000, PLI_AT_1000)) - else: - fileOUT_NUC.write("%s,%s,%s,%s," %("NA","NA","NA","NA")) - fileOUT_percent_GC.write("%s," %"NA") - fileOUT_percent_purine.write("%s," %"NA") - fileOUT_Purine_Load.write("%s,%s,%s,%s,%s," %("NA","NA","NA","NA","NA")) - - if LT[-1] in bash.keys(): - seq = bash[LT[-1]] - percent_GC, percent_purine,prop_A, prop_T, prop_C, prop_G = base_composition(seq) ### DEF2 ### - TOTAL, DIFF_GC, DIFF_AT,PLI_GC,PLI_AT,PLI_GC_1000,PLI_AT_1000 = purine_loading(seq) ### DEF3 ### - fileOUT_NUC.write("%.5f,%.5f,%.5f,%.5f" %(prop_A,prop_T,prop_C,prop_G)) - fileOUT_percent_GC.write("%.5f" %percent_GC) - fileOUT_percent_purine.write("%.5f" %percent_purine) - fileOUT_Purine_Load.write("%d,%d,%d,%.5f,%.5f" %(TOTAL, DIFF_GC, DIFF_AT,PLI_GC_1000, PLI_AT_1000)) - else: - fileOUT_NUC.write("%s,%s,%s,%s" %("NA","NA","NA","NA")) - fileOUT_percent_GC.write("%s" %"NA") - fileOUT_percent_purine.write("%s" %"NA") - fileOUT_Purine_Load.write("%s,%s,%s,%s,%s" %("NA","NA","NA","NA","NA")) - - - fileOUT_NUC.write("\n") - fileOUT_percent_GC.write("\n") - fileOUT_percent_purine.write("\n") - fileOUT_Purine_Load.write("\n") -fileOUT_NUC.close() -fileOUT_percent_GC.close() -fileOUT_percent_purine.close() -fileOUT_Purine_Load.close()
