comparison scripts/S01b_study_seq_composition_aa.py @ 10:f62c76aab669 draft default tip

planemo upload for repository https://github.com/abims-sbr/adaptearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1
author lecorguille
date Mon, 24 Sep 2018 04:34:39 -0400
parents 04a9ada73cc4
children
comparison
equal deleted inserted replaced
9:04a9ada73cc4 10:f62c76aab669
1 #!/usr/bin/env python
2 # -*- coding: ascii -*-
3 ## Author: Eric FONTANILLAS
4 ## Date: 21.12.10
5 ## Last Version : 12/2017 by Victor Mataigne
6 ## Object: Test for compositional bias in genome and proteome as marker of thermal adaptation (comparison between 2 "hot" species: Ap and Ps and two "cold" species: Pg, Pp)
7
8 import sys,os,shutil,subprocess,string, itertools
9 from functions import simplify_fasta_name, dico
10
11 script_path = os.path.dirname(sys.argv[0])
12
13 ##################
14 ###### DEF2 ######
15 ##################
def base_composition(seq):
    """Compute nucleotide composition statistics for a nucleotide sequence.

    Only the upper-case characters "A", "T", "C" and "G" are counted. The
    four proportions are relative to ``len(seq)``, so any other character
    present in ``seq`` still counts in the denominator.

    Returns a 6-tuple:
        (ratio_CG_AT, ratio_purine_pyrimidine, prop_A, prop_T, prop_C, prop_G)

    A ratio whose denominator is zero is reported as -1 (the convention
    aa_composition2 uses for ratios that cannot be computed) instead of
    raising ZeroDivisionError; the proportions of an empty sequence are 0.0.
    """
    # str.count replaces the deprecated string.count (removed in Python 3).
    count_A = seq.count("A")
    count_T = seq.count("T")
    count_C = seq.count("C")
    count_G = seq.count("G")

    CG = count_C + count_G
    AT = count_T + count_A

    AG = count_A + count_G
    TC = count_T + count_C

    ## 1 ## Search for compositional bias in genome as marker of thermal adaptation: CG vs AT
    if AT != 0:
        ratio_CG_AT = float(CG) / float(AT)
    else:
        ratio_CG_AT = -1  # cannot be computed: no A/T in the sequence

    ## 2 ## Search for compositional bias in genome as marker of thermal adaptation: AG vs TC
    if TC != 0:
        ratio_purine_pyrimidine = float(AG) / float(TC)
    else:
        ratio_purine_pyrimidine = -1  # cannot be computed: no T/C in the sequence

    ## 3 ## Nucleotide proportion
    ln = len(seq)
    if ln != 0:
        prop_A = float(count_A) / float(ln)
        prop_T = float(count_T) / float(ln)
        prop_C = float(count_C) / float(ln)
        prop_G = float(count_G) / float(ln)
    else:
        prop_A = prop_T = prop_C = prop_G = 0.0

    return (ratio_CG_AT, ratio_purine_pyrimidine, prop_A, prop_T, prop_C, prop_G)
44 ##############################################
45
46
47 ##################
48 ###### DEF3 ######
49 ##################
def aa_composition1(seq):
    """Return the relative proportion of each of the 20 amino acids in seq.

    Only the 20 upper-case one-letter codes listed below are counted; the
    denominator is the sum of those counts, so any other character in
    ``seq`` is ignored.

    Returns a 20-tuple of proportions in the order
    K, R, A, F, I, L, M, V, W, N, Q, S, T, H, Y, C, D, E, P, G.
    When none of the 20 residues occurs, every proportion is 0.
    """
    residues = "KRAFILMVWNQSTHYCDEPG"

    ## 1 ## count occurrences of each amino acid
    # str.count replaces the deprecated string.count (removed in Python 3).
    counts = [seq.count(aa) for aa in residues]

    ## 2 ## compute relative proportions
    total = sum(counts)
    if total == 0:
        return tuple([0] * len(residues))
    return tuple([float(c) / float(total) for c in counts])
127
128 ##################
129 ###### DEF4 ######
130 ##################
def aa_composition2(seq):
    """Compute grouped amino-acid counts, proportions and ratios for seq.

    Only the 20 upper-case one-letter amino-acid codes are counted; every
    proportion uses the sum of those 20 counts as denominator.

    Returns a 38-tuple, in this order:
        count_IVYWREL, prop_IVYWREL,
        count_ERK, prop_ERK, count_DNQTSH, prop_DNQTSH, ratio_ERK_vs_DNQTSH,
        count_EK, prop_EK, count_QH, prop_QH, ratio_EK_vs_QH,
        count_FYMINK, prop_FYMINK, count_GARP, prop_GARP,
        count_AVLIMFYW, prop_AVLIMFYW, count_AVLIM, prop_AVLIM,
        count_FYW, prop_FYW,
        count_STNQ, prop_STNQ,
        count_MVGDS, prop_MVGDS, count_PAYRE, prop_PAYRE, count_AC, prop_AC,
        ratio_PAYRE_vs_MVGDS, ratio_AC_vs_MVGDS,
        count_RHKDE, prop_RHKDE, count_RHK, prop_RHK, count_DE, prop_DE

    A ratio is -1 when its denominator is zero (it cannot be computed).
    When none of the 20 residues occurs at all, every value, ratios
    included, is 0.
    """
    ## 1 ## count occurrences of each amino acid
    # str.count replaces the deprecated string.count (removed in Python 3).
    count = dict((aa, seq.count(aa)) for aa in "KRAFILMVWNQSTHYCDEPG")

    ## 2 ## compute seq length (number of recognised residues)
    ln = sum(count.values())

    def group_count(residues):
        # Total number of occurrences of the residues of one group.
        return sum(count[aa] for aa in residues)

    def proportion(n):
        # Share of the recognised residues; 0 when there are none.
        if ln == 0:
            return 0
        return float(n) / float(ln)

    def ratio(numerator, denominator):
        # 0 for a sequence without residues, -1 when the denominator is zero
        # (the ratio cannot be computed), the plain quotient otherwise.
        if ln == 0:
            return 0
        if denominator == 0:
            return -1
        return float(numerator) / float(denominator)

    ## 3 ## Well-known hyperthermophile prokaryote criteria
    # 3.1. IVYWREL estimator => positively correlated with optimal growth temperature
    count_IVYWREL = group_count("IVYWREL")
    prop_IVYWREL = proportion(count_IVYWREL)

    # 3.2. ERK estimator (ERK vs DNQTSH) => positively correlated with optimal growth temperature
    count_ERK = group_count("ERK")
    prop_ERK = proportion(count_ERK)
    count_DNQTSH = group_count("DNQTSH")
    prop_DNQTSH = proportion(count_DNQTSH)
    ratio_ERK_vs_DNQTSH = ratio(count_ERK, count_DNQTSH)

    # 3.3. EK/QH estimator
    count_EK = group_count("EK")
    count_QH = group_count("QH")
    prop_EK = proportion(count_EK)
    prop_QH = proportion(count_QH)
    ratio_EK_vs_QH = ratio(count_EK, count_QH)

    ## 4 ## Mutational bias hypothesis => AT rich: favors FYMINK // GC rich: favors GARP
    ## The mutational bias model predicts a linear relationship between GARP and
    ## FYMINK, so outliers suggest an excess of GARP or FYMINK that the model
    ## does not explain (possibly selection).
    count_FYMINK = group_count("FYMINK")
    prop_FYMINK = proportion(count_FYMINK)
    count_GARP = group_count("GARP")
    prop_GARP = proportion(count_GARP)

    ## 5 ## Hydrophobicity hypothesis [should INCREASE with thermal adaptation]
    ## 5.1. All hydrophobic residues
    count_AVLIMFYW = group_count("AVLIMFYW")
    prop_AVLIMFYW = proportion(count_AVLIMFYW)
    ## 5.2. Only non-aromatic
    count_AVLIM = group_count("AVLIM")
    prop_AVLIM = proportion(count_AVLIM)
    ## 5.3. Only aromatic (the opposite hypothesis, based on residue volume,
    ## predicts a DECREASE of these residues)
    count_FYW = group_count("FYW")
    prop_FYW = proportion(count_FYW)

    ## 6 ## Charged hypothesis => positively correlated with optimal growth temperature
    # All charged
    count_RHKDE = group_count("RHKDE")
    prop_RHKDE = proportion(count_RHKDE)
    # Only positive
    count_RHK = group_count("RHK")
    prop_RHK = proportion(count_RHK)
    # Only negative
    count_DE = group_count("DE")
    prop_DE = proportion(count_DE)

    ## 7 ## Neutral polar hypothesis [should DECREASE with thermal adaptation]
    count_STNQ = group_count("STNQ")
    prop_STNQ = proportion(count_STNQ)

    ## 9 ## PAYRE vs MVGDS (FONTANILLAS CRITERIA)
    ## 9.1 ## Didier's criterion 1 = SMALL / BIG
    count_PAYRE = group_count("PAYRE")
    prop_PAYRE = proportion(count_PAYRE)
    count_MVGDS = group_count("MVGDS")
    prop_MVGDS = proportion(count_MVGDS)
    ratio_PAYRE_vs_MVGDS = ratio(count_PAYRE, count_MVGDS)

    ## 9.2 ## Didier's criterion 2 = VERY SMALL / BIG
    count_AC = group_count("AC")
    prop_AC = proportion(count_AC)
    ratio_AC_vs_MVGDS = ratio(count_AC, count_MVGDS)

    return (count_IVYWREL, prop_IVYWREL,
            count_ERK, prop_ERK, count_DNQTSH, prop_DNQTSH, ratio_ERK_vs_DNQTSH,
            count_EK, prop_EK, count_QH, prop_QH, ratio_EK_vs_QH,
            count_FYMINK, prop_FYMINK, count_GARP, prop_GARP,
            count_AVLIMFYW, prop_AVLIMFYW, count_AVLIM, prop_AVLIM,
            count_FYW, prop_FYW,
            count_STNQ, prop_STNQ,
            count_MVGDS, prop_MVGDS, count_PAYRE, prop_PAYRE, count_AC, prop_AC,
            ratio_PAYRE_vs_MVGDS, ratio_AC_vs_MVGDS,
            count_RHKDE, prop_RHKDE, count_RHK, prop_RHK, count_DE, prop_DE)
288 #####################
289
290
291 ##################
292 ###### DEF5 ######
293 ##################
def aa_properties(fileIN_aaProperties):
    """Parse the amino-acid property table into a dict keyed by residue code.

    ``fileIN_aaProperties`` is an open, comma-separated text file whose first
    line is a header. For every following non-blank line:
      * column 1 holds a "name/code" label; the key is the part after the "/"
        with its last character removed (presumably a closing quote - confirm
        against the CSV);
      * column 2 holds the frequency, stored without its last character
        (presumably a "%" sign - confirm against the CSV);
      * columns 5, 6, 7 and 8 hold residue weight, residue volume, partial
        specific volume and hydration.

    Returns {code: [frequencies, Residue_Weight, Residue_Volume,
    Partial_specific_volume, Hydration]}; all values are kept as the raw
    strings read from the file.
    """
    fileIN_aaProperties.readline()  ## JUMP HEADERS

    bash_aa_properties = {}

    for line in fileIN_aaProperties:
        # A blank line (e.g. a trailing newline) has no columns to index.
        if not line.strip():
            continue

        fields = line.split(",")

        aa_name = fields[1]
        aa_code = aa_name.split("/")[1][:-1]

        frequencies = fields[2][:-1]
        Residue_Weight = fields[5]
        Residue_Volume = fields[6]
        Partial_specific_volume = fields[7]
        Hydration = fields[8]

        bash_aa_properties[aa_code] = [frequencies, Residue_Weight, Residue_Volume, Partial_specific_volume, Hydration]

    return bash_aa_properties
319
320
321 ##################
322 ###### DEF6 ######
323 ##################
def sequence_properties_from_aa_properties(seq, bash_properties):
    """Sum per-residue physical properties over a whole protein sequence.

    ``bash_properties`` maps each of the 20 one-letter amino-acid codes to a
    list whose items 1 to 4 are the residue weight, residue volume, partial
    specific volume and hydration (anything ``float()`` accepts), as built by
    aa_properties. All 20 codes must be present whenever ``seq`` contains at
    least one of them.

    Returns (Total_Residue_Weight, Total_Residue_Volume,
    Total_Partial_specific_volume, Total_Hydration): each is the sum over the
    20 residues of occurrences * property. Characters other than the 20
    upper-case codes are ignored; if none of the 20 occurs, all four totals
    are 0.
    """
    residues = "KRAFILMVWNQSTHYCDEPG"

    ## 1 ## count occurrences of each amino acid
    # str.count replaces the deprecated string.count (removed in Python 3).
    counts = [(aa, seq.count(aa)) for aa in residues]

    if sum([n for _, n in counts]) == 0:
        return (0, 0, 0, 0)

    ## 2 ## Compute the four totals: weight, volume, partial specific volume, hydration
    totals = []
    for column in (1, 2, 3, 4):
        # Explicit left-to-right accumulation keeps the floating-point
        # summation order of the residue list above.
        total = 0.0
        for aa, n in counts:
            total = total + n * float(bash_properties[aa][column])
        totals.append(total)

    return tuple(totals)
366
367 ########################################################
368
369
370
371 ###################
372 ### RUN RUN RUN ###
373 ###################
374
375 ##Create specific folders
# Working folders: the input alignments are copied to ./IN_AA and the result
# tables are written to ./OUT. os.makedirs raises if a folder already exists.
Path_IN_loci_NUC = "./IN_AA"
outpath = "./OUT"
os.makedirs(Path_IN_loci_NUC)
os.makedirs(outpath)

# sys.argv[2] is a text file listing one input file path per line.
infiles = []
with open(sys.argv[2], 'r') as f:
    for line in f:
        infiles.append(line.strip('\n'))

# shutil.copy replaces os.system("cp ..."): no shell is involved, so paths
# containing spaces or shell metacharacters are copied correctly, and a
# failed copy raises instead of being silently ignored.
for infile in infiles:
    if infile.strip():  # skip blank lines of the list file
        shutil.copy(infile, Path_IN_loci_NUC)

## 1 ## List taxa
# sys.argv[1] is read directly instead of shelling out to grep: every line
# containing '>' names a taxon, and the name is the line minus its first
# character.
LT = []
with open(sys.argv[1], 'r') as taxa_file:
    for line in taxa_file:
        line = line.rstrip('\n')
        if '>' in line:
            sp = line[1:]
            if sp != '':
                LT.append(sp)
if not LT:
    sys.exit("No taxon name (line containing '>') found in %s" % sys.argv[1])
print(LT)
399
400
401 ## 2 ## PathIN
def _open_output_with_header(path, column_suffixes, taxa_list):
    """Open ``path`` for writing, write its CSV header line, return the handle.

    The header is "LOCUS" followed, for every taxon of ``taxa_list`` in order,
    by one "<taxon>_<suffix>" column per entry of ``column_suffixes``.
    """
    handle = open(path, "w")
    columns = ["%s_%s" % (taxon, suffix)
               for taxon in taxa_list
               for suffix in column_suffixes]
    handle.write("LOCUS," + ",".join(columns) + "\n")
    return handle


fileIN_properties = open("amino_acid_properties.csv", "r")
Path_IN_loci_AA = "./IN_AA"
Lloci_AA = os.listdir(Path_IN_loci_AA)

## 3 ## PathOUT

## 3.1 ## PROT composition
fileOUT_PROT_ALL = _open_output_with_header(
    "./OUT/prot_compositions_All_AA.csv",
    ["prop_%s" % aa for aa in "KRAFILMVWNQSTHYCDEPG"],
    LT)

## 3.2 ## PROT IVYWREL
fileOUT_IVYWREL = _open_output_with_header(
    "./OUT/IVYWREL.csv",
    ["count_IVYWREL", "prop_IVYWREL"],
    LT)

## 3.3 ## PROT ERK_DNQTSH
fileOUT_ERK_DNQTSH = _open_output_with_header(
    "./OUT/ERK_DNQTSH.csv",
    ["count_ERK", "prop_ERK", "count_DNQTSH", "prop_DNQTSH", "ratio_ERK_vs_DNQTSH"],
    LT)

## 3.4 ## PROT EK_QH
fileOUT_EK_QH = _open_output_with_header(
    "./OUT/EK_QH.csv",
    ["count_EK", "prop_EK", "count_QH", "prop_QH", "ratio_EK_vs_QH"],
    LT)

## 3.5 ## PROT FYMINK_GARP
fileOUT_FYMINK_GARP = _open_output_with_header(
    "./OUT/FYMINK_GARP.csv",
    ["count_FYMINK", "prop_FYMINK", "count_GARP", "prop_GARP"],
    LT)

## 3.6 ## PROT AVLIMFYW
fileOUT_AVLIMFYW = _open_output_with_header(
    "./OUT/AVLIMFYW.csv",
    ["count_AVLIMFYW", "prop_AVLIMFYW", "count_AVLIM", "prop_AVLIM", "count_FYW", "prop_FYW"],
    LT)

## 3.7 ## PROT STNQ
fileOUT_STNQ = _open_output_with_header(
    "./OUT/STNQ.csv",
    ["count_STNQ", "prop_STNQ"],
    LT)

## 3.8 ## PROT RHKDE
fileOUT_RHKDE = _open_output_with_header(
    "./OUT/RHKDE.csv",
    ["count_RHKDE", "prop_RHKDE", "count_RHK", "prop_RHK", "count_DE", "prop_DE"],
    LT)

## 3.9 ## PROT DIDIER CRITERIA
fileOUT_PAYRE = _open_output_with_header(
    "./OUT/PAYRE-MVGDS.csv",
    ["count_PAYRE", "prop_PAYRE", "count_AC", "prop_AC", "count_MVGDS", "prop_MVGDS",
     "ratio_PAYRE_vs_MVGDS", "ratio_AC_vs_MVGDS"],
    LT)

## 3.10 ## PROT Total residue weight
fileOUT_TotalResidueWeight = _open_output_with_header(
    "./OUT/TotalResidueWeight.csv",
    ["Total_Residue_Weight"],
    LT)

## 3.11 ## PROT Total residue volume
fileOUT_TotalResidueVolume = _open_output_with_header(
    "./OUT/TotalResidueVolume.csv",
    ["Total_Residue_Volume"],
    LT)

## 3.12 ## PROT Total partial specific volume
fileOUT_TotalPartialSpecificVolume = _open_output_with_header(
    "./OUT/TotalPartialSpecificVolume.csv",
    ["Total_Partial_Specific_Volume"],
    LT)

## 3.13 ## PROT Total hydratation
fileOUT_TotalHydratation = _open_output_with_header(
    "./OUT/TotalHydratation.csv",
    ["Total_Hydratation"],
    LT)
512
513 #####################
514 ## 4 ## Process Loci
515 #####################
bash_aa_properties = aa_properties(fileIN_properties)
fileIN_properties.close()  # the table is fully loaded; release the handle


def _values_per_output(seq, aa_props):
    """Return, for one sequence, the tuple of values destined to each output file.

    The tuples are ordered like the output files handled in the loop below.
    """
    comp1 = aa_composition1(seq)  ### DEF3 ###
    (count_IVYWREL, prop_IVYWREL,
     count_ERK, prop_ERK, count_DNQTSH, prop_DNQTSH, ratio_ERK_vs_DNQTSH,
     count_EK, prop_EK, count_QH, prop_QH, ratio_EK_vs_QH,
     count_FYMINK, prop_FYMINK, count_GARP, prop_GARP,
     count_AVLIMFYW, prop_AVLIMFYW, count_AVLIM, prop_AVLIM,
     count_FYW, prop_FYW,
     count_STNQ, prop_STNQ,
     count_MVGDS, prop_MVGDS, count_PAYRE, prop_PAYRE, count_AC, prop_AC,
     ratio_PAYRE_vs_MVGDS, ratio_AC_vs_MVGDS,
     count_RHKDE, prop_RHKDE, count_RHK, prop_RHK,
     count_DE, prop_DE) = aa_composition2(seq)  ### DEF4 ###
    (Total_Residue_Weight, Total_Residue_Volume,
     Total_Partial_Specific_Volume,
     Total_Hydration) = sequence_properties_from_aa_properties(seq, aa_props)  ### DEF6 ###

    return [
        tuple(comp1),
        (count_IVYWREL, prop_IVYWREL),
        (count_ERK, prop_ERK, count_DNQTSH, prop_DNQTSH, ratio_ERK_vs_DNQTSH),
        (count_EK, prop_EK, count_QH, prop_QH, ratio_EK_vs_QH),
        (count_FYMINK, prop_FYMINK, count_GARP, prop_GARP),
        (count_AVLIMFYW, prop_AVLIMFYW, count_AVLIM, prop_AVLIM, count_FYW, prop_FYW),
        (count_STNQ, prop_STNQ),
        (count_RHKDE, prop_RHKDE, count_RHK, prop_RHK, count_DE, prop_DE),
        (count_PAYRE, prop_PAYRE, count_AC, prop_AC, count_MVGDS, prop_MVGDS,
         ratio_PAYRE_vs_MVGDS, ratio_AC_vs_MVGDS),
        (Total_Residue_Weight,),
        (Total_Residue_Volume,),
        (Total_Partial_Specific_Volume,),
        (Total_Hydration,),
    ]


# (output file, number of columns written per taxon); same order as the list
# returned by _values_per_output.
_outputs = [
    (fileOUT_PROT_ALL, 20),
    (fileOUT_IVYWREL, 2),
    (fileOUT_ERK_DNQTSH, 5),
    (fileOUT_EK_QH, 5),
    (fileOUT_FYMINK_GARP, 4),
    (fileOUT_AVLIMFYW, 6),
    (fileOUT_STNQ, 2),
    (fileOUT_RHKDE, 6),
    (fileOUT_PAYRE, 8),
    (fileOUT_TotalResidueWeight, 1),
    (fileOUT_TotalResidueVolume, 1),
    (fileOUT_TotalPartialSpecificVolume, 1),
    (fileOUT_TotalHydratation, 1),
]

try:
    for locus in Lloci_AA:
        print(locus)
        path_locus = "%s/%s" % (Path_IN_loci_AA, locus)
        bash = dico(path_locus, LT)

        # One list of already formatted cells per output file.
        rows = [[] for _ in _outputs]

        for taxa in LT:
            if taxa in bash:
                groups = _values_per_output(bash[taxa], bash_aa_properties)
                for row, values in zip(rows, groups):
                    row.extend(["%.5f" % value for value in values])
            else:
                # Taxon absent from this locus: fill its columns with NA.
                for row, (_, n_columns) in zip(rows, _outputs):
                    row.extend(["NA"] * n_columns)

        # Joining the cells puts commas between columns only, so the last
        # taxon no longer needs a separate code path.
        for (handle, _), row in zip(_outputs, rows):
            handle.write("%s,%s\n" % (locus, ",".join(row)))
finally:
    # Close every output so buffered rows are flushed even if a locus fails.
    for handle, _ in _outputs:
        handle.close()
622