Mercurial > repos > abims-sbr > mutcount
diff scripts/S01b_study_seq_composition_aa.py @ 0:78dd6454f6f0 draft
planemo upload for repository https://github.com/abims-sbr/adaptearch commit 73670b26c75bb6c1a6332481920f3036314de364
| author | abims-sbr |
|---|---|
| date | Tue, 02 May 2017 04:20:51 -0400 |
| parents | |
| children | 8de21b6eb110 |
line wrap: on
line diff
#!/usr/bin/env python
# -*- coding: ascii -*-
## Author: Eric FONTANILLAS
## Date: 21.12.10
## Object: Test for compositional bias in genome and proteome as marker of thermal adaptation (comparison between 2 "hot" species: Ap and Ps and two "cold" species: Pg, Pp)
"""Per-locus, per-taxon amino-acid composition statistics.

Usage:
    S01b_study_seq_composition_aa.py <loci.zip | locus.fasta> <taxa.fasta>

The first argument is either a zip archive of protein FASTA files (one file
per locus) or a single FASTA file. The second argument is a FASTA-like file
whose ">" header lines give the taxon abbreviations.

The loci are unpacked/copied into ./IN_AA and thirteen CSV tables are written
into ./OUT (one row per locus, one group of columns per taxon).

The code is written to run under both Python 2.7 and Python 3.
"""
import os
import shutil
import string
import subprocess
import sys
import zipfile

script_path = os.path.dirname(sys.argv[0])

# Order in which the 20 amino acids are counted and reported everywhere.
AMINO_ACIDS = "KRAFILMVWNQSTHYCDEPG"

# Names of the values returned by aa_composition1(), in return order.
_COMPOSITION1_FIELDS = tuple("prop_%s" % aa for aa in AMINO_ACIDS)

# Names of the values returned by aa_composition2(), in return order.
_COMPOSITION2_FIELDS = (
    "count_IVYWREL", "prop_IVYWREL",
    "count_ERK", "prop_ERK",
    "count_DNQTSH", "prop_DNQTSH",
    "ratio_ERK_vs_DNQTSH",
    "count_EK", "prop_EK",
    "count_QH", "prop_QH",
    "ratio_EK_vs_QH",
    "count_FYMINK", "prop_FYMINK",
    "count_GARP", "prop_GARP",
    "count_AVLIMFYW", "prop_AVLIMFYW",
    "count_AVLIM", "prop_AVLIM",
    "count_FYW", "prop_FYW",
    "count_STNQ", "prop_STNQ",
    "count_MVGDS", "prop_MVGDS",
    "count_PAYRE", "prop_PAYRE",
    "count_AC", "prop_AC",
    "ratio_PAYRE_vs_MVGDS", "ratio_AC_vs_MVGDS",
    "count_RHKDE", "prop_RHKDE",
    "count_RHK", "prop_RHK",
    "count_DE", "prop_DE",
)

# Names of the values returned by sequence_properties_from_aa_properties(),
# in return order. The spellings are the ones used in the CSV headers.
_PROPERTY_FIELDS = (
    "Total_Residue_Weight",
    "Total_Residue_Volume",
    "Total_Partial_Specific_Volume",
    "Total_Hydratation",
)

# Output tables: (file name inside ./OUT, columns written for every taxon).
_OUTPUTS = (
    ("13_prot_compositions_All_AA.csv", _COMPOSITION1_FIELDS),
    ("14_IVYWREL.csv", ("count_IVYWREL", "prop_IVYWREL")),
    ("15_ERK_DNQTSH.csv", ("count_ERK", "prop_ERK", "count_DNQTSH",
                           "prop_DNQTSH", "ratio_ERK_vs_DNQTSH")),
    ("16_EK_QH.csv", ("count_EK", "prop_EK", "count_QH", "prop_QH",
                      "ratio_EK_vs_QH")),
    ("17_FYMINK_GARP.csv", ("count_FYMINK", "prop_FYMINK", "count_GARP",
                            "prop_GARP")),
    ("18_AVLIMFYW.csv", ("count_AVLIMFYW", "prop_AVLIMFYW", "count_AVLIM",
                         "prop_AVLIM", "count_FYW", "prop_FYW")),
    ("19_STNQ.csv", ("count_STNQ", "prop_STNQ")),
    ("20_RHKDE.csv", ("count_RHKDE", "prop_RHKDE", "count_RHK", "prop_RHK",
                      "count_DE", "prop_DE")),
    ("21_PAYRE-MVGDS.csv", ("count_PAYRE", "prop_PAYRE", "count_AC",
                            "prop_AC", "count_MVGDS", "prop_MVGDS",
                            "ratio_PAYRE_vs_MVGDS", "ratio_AC_vs_MVGDS")),
    ("22_TotalResidueWeight.csv", ("Total_Residue_Weight",)),
    ("23_TotalResidueVolume.csv", ("Total_Residue_Volume",)),
    ("24_TotalPartialSpecificVolume.csv", ("Total_Partial_Specific_Volume",)),
    ("25_TotalHydratation.csv", ("Total_Hydratation",)),
)


#############
### DEF 0 ###
#############
def simplify_fasta_name(fasta_name, LT):
    """Map a FASTA header onto the taxon abbreviation it contains.

    fasta_name -- header text without the leading ">".
    LT         -- list of taxon abbreviations.

    Returns the abbreviation found in the header. When several abbreviations
    occur in the header the last one in LT wins. When none occurs the header
    is returned unchanged, so the record simply matches no taxon later on.
    """
    new_fasta_name = fasta_name
    for abbreviation in LT:
        if abbreviation in fasta_name:
            new_fasta_name = abbreviation
    return new_fasta_name


###########
## DEF1 ##
###########
def dico(fasta_file, LT):
    """Read a FASTA file into a dict {simplified name: sequence}.

    fasta_file -- path of the FASTA file.
    LT         -- list of taxon abbreviations, see simplify_fasta_name().

    Sequences spread over several lines are concatenated and line endings
    ("\\n" or "\\r\\n") are stripped. Gap characters are kept as they are.
    When two records simplify to the same name, the first one is kept and
    the header of the ignored record is printed. An empty file gives {}.
    """
    bash1 = {}
    current = None  # key being filled; None while skipping a record
    with open(fasta_file, "r") as handle:
        for raw_line in handle:
            line = raw_line.rstrip("\r\n")
            if line.startswith(">"):
                fasta_name = line[1:]
                short_name = simplify_fasta_name(fasta_name, LT)  ### DEF 0 ###
                if short_name in bash1:
                    # Duplicate taxon in this locus: report it, keep the first.
                    print(fasta_name)
                    current = None
                else:
                    bash1[short_name] = ""
                    current = short_name
            elif current is not None:
                bash1[current] += line
    return bash1


def _ratio(numerator, denominator):
    """Return numerator/denominator as a float, or -1 if it is undefined.

    "-1" indicates that the ratio cannot be computed because the
    denominator is zero.
    """
    if denominator == 0:
        return -1
    return float(numerator) / float(denominator)


def _count_amino_acids(seq):
    """Return {amino acid: occurrences in seq} for the 20 upper-case codes."""
    return dict((aa, seq.count(aa)) for aa in AMINO_ACIDS)


##################
###### DEF2 ######
##################
def base_composition(seq):
    """Nucleotide composition of a sequence.

    Returns (ratio_CG_AT, ratio_purine_pyrimidine, prop_A, prop_T, prop_C,
    prop_G). The proportions are relative to len(seq). A ratio whose
    denominator is zero is reported as -1; an empty sequence gives
    proportions of 0.
    """
    count_A = seq.count("A")
    count_T = seq.count("T")
    count_C = seq.count("C")
    count_G = seq.count("G")

    ## 1 ## compositional bias: CG vs AT
    ratio_CG_AT = _ratio(count_C + count_G, count_T + count_A)
    ## 2 ## compositional bias: AG (purines) vs TC (pyrimidines)
    ratio_purine_pyrimidine = _ratio(count_A + count_G, count_T + count_C)

    ## 3 ## nucleotide proportions
    ln = len(seq)
    if ln == 0:
        return (ratio_CG_AT, ratio_purine_pyrimidine, 0, 0, 0, 0)
    prop_A = float(count_A) / float(ln)
    prop_T = float(count_T) / float(ln)
    prop_C = float(count_C) / float(ln)
    prop_G = float(count_G) / float(ln)
    return (ratio_CG_AT, ratio_purine_pyrimidine, prop_A, prop_T, prop_C, prop_G)


##################
###### DEF3 ######
##################
def aa_composition1(seq):
    """Relative proportion of each of the 20 amino acids in seq.

    Returns a 20-tuple in the order K,R,A,F,I,L,M,V,W,N,Q,S,T,H,Y,C,D,E,P,G.
    The denominator is the number of recognised amino acids, so gaps and
    any other character are ignored. A sequence without any recognised
    amino acid gives twenty zeros.
    """
    counts = _count_amino_acids(seq)
    total = sum(counts.values())
    if total == 0:
        return (0,) * len(AMINO_ACIDS)
    return tuple(float(counts[aa]) / float(total) for aa in AMINO_ACIDS)


##################
###### DEF4 ######
##################
def aa_composition2(seq):
    """Counts, proportions and ratios of amino-acid groups in seq.

    Returns a 38-tuple:
    (count_IVYWREL, prop_IVYWREL, count_ERK, prop_ERK, count_DNQTSH,
     prop_DNQTSH, ratio_ERK_vs_DNQTSH, count_EK, prop_EK, count_QH, prop_QH,
     ratio_EK_vs_QH, count_FYMINK, prop_FYMINK, count_GARP, prop_GARP,
     count_AVLIMFYW, prop_AVLIMFYW, count_AVLIM, prop_AVLIM, count_FYW,
     prop_FYW, count_STNQ, prop_STNQ, count_MVGDS, prop_MVGDS, count_PAYRE,
     prop_PAYRE, count_AC, prop_AC, ratio_PAYRE_vs_MVGDS, ratio_AC_vs_MVGDS,
     count_RHKDE, prop_RHKDE, count_RHK, prop_RHK, count_DE, prop_DE)

    Proportions are relative to the number of recognised amino acids.
    A ratio is -1 when its denominator is zero. A sequence without any
    recognised amino acid gives 38 zeros (ratios included).
    """
    counts = _count_amino_acids(seq)
    total = sum(counts.values())
    if total == 0:
        return (0,) * len(_COMPOSITION2_FIELDS)

    def group(letters):
        """Return (count, proportion) for a group of amino acids."""
        number = sum(counts[aa] for aa in letters)
        return number, float(number) / float(total)

    ## 3 ## Hyperthermophile prokaryotes criteria
    # 3.1. IVYWREL estimator
    count_IVYWREL, prop_IVYWREL = group("IVYWREL")
    # 3.2. ERK estimator (ERK vs DNQTSH)
    count_ERK, prop_ERK = group("ERK")
    count_DNQTSH, prop_DNQTSH = group("DNQTSH")
    ratio_ERK_vs_DNQTSH = _ratio(count_ERK, count_DNQTSH)
    # EK/QH estimator
    count_EK, prop_EK = group("EK")
    count_QH, prop_QH = group("QH")
    ratio_EK_vs_QH = _ratio(count_EK, count_QH)

    ## 4 ## Mutational bias hypothesis: AT rich favors FYMINK, GC rich favors GARP
    count_FYMINK, prop_FYMINK = group("FYMINK")
    count_GARP, prop_GARP = group("GARP")

    ## 5 ## Hydrophobicity hypothesis: all / non-aromatic only / aromatic only
    count_AVLIMFYW, prop_AVLIMFYW = group("AVLIMFYW")
    count_AVLIM, prop_AVLIM = group("AVLIM")
    count_FYW, prop_FYW = group("FYW")

    ## 6 ## Charged hypothesis: all charged / positive only / negative only
    count_RHKDE, prop_RHKDE = group("RHKDE")
    count_RHK, prop_RHK = group("RHK")
    count_DE, prop_DE = group("DE")

    ## 7 ## Neutral polar hypothesis
    count_STNQ, prop_STNQ = group("STNQ")

    ## 9 ## PAYRE vs MVGDS and AC vs MVGDS criteria
    count_PAYRE, prop_PAYRE = group("PAYRE")
    count_MVGDS, prop_MVGDS = group("MVGDS")
    ratio_PAYRE_vs_MVGDS = _ratio(count_PAYRE, count_MVGDS)
    count_AC, prop_AC = group("AC")
    ratio_AC_vs_MVGDS = _ratio(count_AC, count_MVGDS)

    return (count_IVYWREL, prop_IVYWREL, count_ERK, prop_ERK, count_DNQTSH,
            prop_DNQTSH, ratio_ERK_vs_DNQTSH, count_EK, prop_EK, count_QH,
            prop_QH, ratio_EK_vs_QH, count_FYMINK, prop_FYMINK, count_GARP,
            prop_GARP, count_AVLIMFYW, prop_AVLIMFYW, count_AVLIM, prop_AVLIM,
            count_FYW, prop_FYW, count_STNQ, prop_STNQ, count_MVGDS,
            prop_MVGDS, count_PAYRE, prop_PAYRE, count_AC, prop_AC,
            ratio_PAYRE_vs_MVGDS, ratio_AC_vs_MVGDS, count_RHKDE, prop_RHKDE,
            count_RHK, prop_RHK, count_DE, prop_DE)


##################
###### DEF5 ######
##################
def aa_properties(fileIN_aaProperties):
    """Parse the amino-acid property table.

    fileIN_aaProperties -- open text file object; its first line is a
                           header and is skipped.

    Every other non-blank line is split on ",". The one-letter code is read
    from the second field (text after "/", last character dropped) and the
    result maps it to the list of strings
    [frequencies, Residue_Weight, Residue_Volume, Partial_specific_volume,
     Hydration] taken from fields 2 (last character dropped), 5, 6, 7 and 8.
    """
    fileIN_aaProperties.readline()  ## JUMP HEADERS

    bash_aa_properties = {}
    while 1:
        line = fileIN_aaProperties.readline()
        if not line:
            break
        if not line.strip():
            continue  # tolerate blank lines, e.g. at the end of the file

        S1 = line.split(",")
        aa_code = S1[1].split("/")[1][:-1]
        frequencies = S1[2][:-1]
        Residue_Weight = S1[5]
        Residue_Volume = S1[6]
        Partial_specific_volume = S1[7]
        Hydration = S1[8]

        bash_aa_properties[aa_code] = [frequencies, Residue_Weight,
                                       Residue_Volume,
                                       Partial_specific_volume, Hydration]
    return bash_aa_properties


##################
###### DEF6 ######
##################
def sequence_properties_from_aa_properties(seq, bash_properties):
    """Sum per-residue properties over a whole sequence.

    seq             -- protein sequence.
    bash_properties -- table returned by aa_properties(); it must hold an
                       entry for each of the 20 amino acids (KeyError
                       otherwise).

    Returns (Total_Residue_Weight, Total_Residue_Volume,
    Total_Partial_specific_volume, Total_Hydration), each being the sum of
    count * property over the 20 amino acids, or four zeros when seq holds
    no recognised amino acid.
    """
    counts = _count_amino_acids(seq)
    if sum(counts.values()) == 0:
        return (0, 0, 0, 0)

    totals = []
    # Columns 1..4 of a table entry: weight, volume, partial specific
    # volume, hydration (column 0 is the frequency and is not used here).
    for column in (1, 2, 3, 4):
        totals.append(sum(counts[aa] * float(bash_properties[aa][column])
                          for aa in AMINO_ACIDS))
    return tuple(totals)


def _list_taxa(taxa_file):
    """Return the taxon names given by the ">" header lines of taxa_file."""
    taxa = []
    with open(taxa_file, "r") as handle:
        for line in handle:
            if line.startswith(">"):
                name = line[1:].strip()
                if name:
                    taxa.append(name)
    return taxa


def _sequence_statistics(seq, bash_aa_properties):
    """Return {column name: value} with every statistic computed for seq."""
    values = dict(zip(_COMPOSITION1_FIELDS, aa_composition1(seq)))  ### DEF3 ###
    values.update(zip(_COMPOSITION2_FIELDS, aa_composition2(seq)))  ### DEF4 ###
    values.update(zip(_PROPERTY_FIELDS,
                      sequence_properties_from_aa_properties(
                          seq, bash_aa_properties)))  ### DEF6 ###
    return values


###################
### RUN RUN RUN ###
###################
def main(argv):
    """Run the whole analysis; argv is sys.argv."""
    if len(argv) < 3:
        sys.exit("Usage: %s <loci.zip|locus.fasta> <taxa.fasta>" % argv[0])
    input_path = argv[1]
    taxa_path = argv[2]

    ## Create specific folders (kept if they already exist)
    Path_IN_loci_AA = "./IN_AA"
    outpath = "./OUT"
    for folder in (Path_IN_loci_AA, outpath):
        if not os.path.isdir(folder):
            os.makedirs(folder)

    ## Check if the input is a zip or a fasta file
    if zipfile.is_zipfile(input_path):
        archive = zipfile.ZipFile(input_path)
        try:
            archive.extractall(Path_IN_loci_AA)
        finally:
            archive.close()
    else:
        shutil.copy2(input_path, os.path.join(Path_IN_loci_AA, "input.fasta"))

    ## 1 ## List taxa
    LT = _list_taxa(taxa_path)
    print(LT)

    ## 2 ## PathIN
    # os.path.join copes with an empty script_path (script started from its
    # own directory), where "%s/..." would point at the filesystem root.
    properties_path = os.path.join(script_path, "01_AminoAcid_Properties2.csv")
    with open(properties_path, "r") as fileIN_properties:
        bash_aa_properties = aa_properties(fileIN_properties)  ### DEF5 ###

    # Regular files only, in a reproducible order.
    Lloci_AA = sorted(
        name for name in os.listdir(Path_IN_loci_AA)
        if os.path.isfile(os.path.join(Path_IN_loci_AA, name)))

    ## 3 ## PathOUT: open every table and write its header line
    outputs = []  # (file object, columns)
    try:
        for filename, columns in _OUTPUTS:
            handle = open(os.path.join(outpath, filename), "w")
            outputs.append((handle, columns))
            handle.write("LOCUS,")
            for taxa in LT:
                handle.write("".join("%s_%s," % (taxa, column)
                                     for column in columns))
            handle.write("\n")

        ## 4 ## Process loci
        for locus in Lloci_AA:
            print(locus)
            path_locus = "%s/%s" % (Path_IN_loci_AA, locus)
            bash = dico(path_locus, LT)  ### DEF1 ###

            for handle, columns in outputs:
                handle.write("%s," % locus)

            for taxa in LT:
                # NOTE(review): a taxon missing from a locus writes no cell
                # at all, so the following taxa shift to the left in that
                # row. Kept as in the original; confirm that every locus
                # holds every taxon or decide on a placeholder value.
                if taxa not in bash:
                    continue
                values = _sequence_statistics(bash[taxa], bash_aa_properties)
                for handle, columns in outputs:
                    handle.write("".join("%.5f," % values[column]
                                         for column in columns))

            ## END LINE
            for handle, columns in outputs:
                handle.write("\n")
    finally:
        for handle, columns in outputs:
            handle.close()


if __name__ == "__main__":
    main(sys.argv)
