comparison scripts/S01b_study_seq_composition_aa.py @ 0:78dd6454f6f0 draft

planemo upload for repository htpps://github.com/abims-sbr/adaptearch commit 73670b26c75bb6c1a6332481920f3036314de364
author abims-sbr
date Tue, 02 May 2017 04:20:51 -0400
parents
children 8de21b6eb110
comparison
equal deleted inserted replaced
-1:000000000000 0:78dd6454f6f0
1 #!/usr/bin/env python
2 # -*- coding: ascii -*-
3 ## Author: Eric FONTANILLAS
4 ## Date: 21.12.10
5 ## Object: Test for compositional bias in genome and proteome as marker of thermal adaptation (comparison between 2 "hot" species: Ap and Ps and two "cold" species: Pg, Pp)
6 import sys, os
7 script_path = os.path.dirname(sys.argv[0])
8
9 #############
10 ### DEF 0 ###
11 #############
12 def simplify_fasta_name(fasta_name,LT):
13
14 for abbreviation in LT:
15 if abbreviation in fasta_name:
16 new_fasta_name = abbreviation
17
18 return(new_fasta_name)
19 ##########################################
20
21 ###########
22 ## DEF1 ##
23 ###########
24 ## Generates bash, with key = fasta name; value = sequence (WITH GAP, IF ANY, REMOVED IN THIS FUNCTION)
25
26 def dico(fasta_file,LT):
27
28 count_fastaName=0
29 F1 = open(fasta_file, "r")
30
31 bash1 = {}
32 while 1:
33 nextline = F1.readline()
34 #print nextline
35 if not nextline :
36 break
37
38 if nextline[0] == ">":
39 count_fastaName = count_fastaName + 1
40 fasta_name = nextline[1:-1]
41 nextline = F1.readline()
42 sequence = nextline[:-1]
43
44 if fasta_name not in bash1.keys():
45 fasta_name = simplify_fasta_name(fasta_name,LT) ### DEF 0 ###
46 bash1[fasta_name] = sequence
47 else:
48 print fasta_name
49
50 # Find alignment length
51 kk = bash1.keys()
52 key0 = kk[0]
53 seq0 = bash1[key0]
54 ln_seq = len(seq0)
55
56 F1.close()
57
58 return(bash1)
59 #####################################
60
61
62
63 ##################
64 ###### DEF2 ######
65 ##################
66 def base_composition(seq):
67 count_A=string.count(seq, "A")
68 count_T=string.count(seq, "T")
69 count_C=string.count(seq, "C")
70 count_G=string.count(seq, "G")
71
72
73 CG = count_C+count_G
74 AT = count_T+count_A
75
76 AG = count_A+count_G
77 TC = count_T+count_C
78
79 ## 1 ## Search for compositional bias in genome as marker of thermal adaptation: CG vs AT
80 ratio_CG_AT=float(CG)/float(AT)
81
82 ## 2 ## Search for compositional bias in genome as marker of thermal adaptation: AG vs TC
83 ratio_purine_pyrimidine=float(AG)/float(TC)
84
85 ## 3 ## Nucleotide proportion
86 ln = len(seq)
87 prop_A = float(count_A)/float(ln)
88 prop_T = float(count_T)/float(ln)
89 prop_C = float(count_C)/float(ln)
90 prop_G = float(count_G)/float(ln)
91
92
93 return(ratio_CG_AT, ratio_purine_pyrimidine, prop_A, prop_T, prop_C, prop_G)
94 ##############################################
95
96
97 ##################
98 ###### DEF3 ######
99 ##################
100 def aa_composition1(seq):
101
102 ## 1 ## count occurence of AA
103 count_K=string.count(seq,"K")
104 count_R=string.count(seq,"R")
105 count_A=string.count(seq,"A")
106 count_F=string.count(seq,"F")
107 count_I=string.count(seq,"I")
108 count_L=string.count(seq,"L")
109 count_M=string.count(seq,"M")
110 count_V=string.count(seq,"V")
111 count_W=string.count(seq,"W")
112 count_N=string.count(seq,"N")
113 count_Q=string.count(seq,"Q")
114 count_S=string.count(seq,"S")
115 count_T=string.count(seq,"T")
116 count_H=string.count(seq,"H")
117 count_Y=string.count(seq,"Y")
118 count_C=string.count(seq,"C")
119 count_D=string.count(seq,"D")
120 count_E=string.count(seq,"E")
121 count_P=string.count(seq,"P")
122 count_G=string.count(seq,"G")
123
124
125
126 ## 2 ## compute relative proportion
127 TOTAL=count_K+count_R+count_A+count_F+count_I+count_L+count_M+count_V+count_W+count_N+count_Q+count_S+count_T+count_H+count_Y+count_C+count_D+count_E+count_P+count_G
128 if (TOTAL!=0):
129 ln = TOTAL
130
131 prop_K=float(count_K)/float(ln)
132 prop_R=float(count_R)/float(ln)
133 prop_A=float(count_A)/float(ln)
134 prop_F=float(count_F)/float(ln)
135 prop_I=float(count_I)/float(ln)
136 prop_L=float(count_L)/float(ln)
137 prop_M=float(count_M)/float(ln)
138 prop_V=float(count_V)/float(ln)
139 prop_W=float(count_W)/float(ln)
140 prop_N=float(count_N)/float(ln)
141 prop_Q=float(count_Q)/float(ln)
142 prop_S=float(count_S)/float(ln)
143 prop_T=float(count_T)/float(ln)
144 prop_H=float(count_H)/float(ln)
145 prop_Y=float(count_Y)/float(ln)
146 prop_C=float(count_C)/float(ln)
147 prop_D=float(count_D)/float(ln)
148 prop_E=float(count_E)/float(ln)
149 prop_P=float(count_P)/float(ln)
150 prop_G=float(count_G)/float(ln)
151 else:
152 prop_K=0
153 prop_R=0
154 prop_A=0
155 prop_F=0
156 prop_I=0
157 prop_L=0
158 prop_M=0
159 prop_V=0
160 prop_W=0
161 prop_N=0
162 prop_Q=0
163 prop_S=0
164 prop_T=0
165 prop_H=0
166 prop_Y=0
167 prop_C=0
168 prop_D=0
169 prop_E=0
170 prop_P=0
171 prop_G=0
172
173
174
175
176 return(prop_K,prop_R,prop_A,prop_F,prop_I,prop_L,prop_M,prop_V,prop_W,prop_N,prop_Q,prop_S,prop_T,prop_H,prop_Y,prop_C,prop_D,prop_E,prop_P,prop_G)
177
178 ##################
179 ###### DEF4 ######
180 ##################
181 def aa_composition2(seq):
182
183 ## 1 ## count occurence of AA
184 count_K=string.count(seq,"K")
185 count_R=string.count(seq,"R")
186 count_A=string.count(seq,"A")
187 count_F=string.count(seq,"F")
188 count_I=string.count(seq,"I")
189 count_L=string.count(seq,"L")
190 count_M=string.count(seq,"M")
191 count_V=string.count(seq,"V")
192 count_W=string.count(seq,"W")
193 count_N=string.count(seq,"N")
194 count_Q=string.count(seq,"Q")
195 count_S=string.count(seq,"S")
196 count_T=string.count(seq,"T")
197 count_H=string.count(seq,"H")
198 count_Y=string.count(seq,"Y")
199 count_C=string.count(seq,"C")
200 count_D=string.count(seq,"D")
201 count_E=string.count(seq,"E")
202 count_P=string.count(seq,"P")
203 count_G=string.count(seq,"G")
204
205
206
207 ## 2 ## compute seq length
208 TOTAL=count_K+count_R+count_A+count_F+count_I+count_L+count_M+count_V+count_W+count_N+count_Q+count_S+count_T+count_H+count_Y+count_C+count_D+count_E+count_P+count_G
209 if (TOTAL!=0):
210
211 ln = TOTAL
212 ##3 Famous Hyperthermophile Prokaryotes criterias
213
214 # 3.1. IVYWREL estimator => positivelly correlated with otpimal growth
215 count_IVYWREL = count_I+count_V+count_Y+count_W+count_R+count_E+count_L
216 prop_IVYWREL = float(count_IVYWREL)/float(ln)
217
218 # 3.2. ERK estimator (i.e. ERK vs DNQTSHA) => positivelly correlated with optimal growth temperature
219 # ERK alone
220 count_ERK = count_E + count_R + count_K
221 prop_ERK = float(count_ERK)/float(ln)
222 # DNQTSHA alone
223 count_DNQTSH = count_D+count_N+count_Q+count_T+count_S+count_H
224 prop_DNQTSH=float(count_DNQTSH)/float(ln)
225 # ERK vs DNQTSH
226 if count_DNQTSH != 0:
227 ratio_ERK_vs_DNQTSH = float(count_ERK)/float(count_DNQTSH)
228 else:
229 ratio_ERK_vs_DNQTSH=-1
230 # EK/QH estimator
231 count_EK = count_E+count_K
232 count_QH = count_Q+count_H
233
234 prop_EK = float(count_EK)/float(ln)
235 prop_QH = float(count_QH)/float(ln)
236
237 if count_QH != 0:
238 ratio_EK_vs_QH = float(count_EK)/float(count_QH)
239 else:
240 ratio_EK_vs_QH=-1 ## "-1" will indicate the impossibility to compute the ratio (coz the numerator)
241
242 ## 4 ## Mutationnal bias hypothesis => AT rich: favor FYMINK // GC rich: favor GARP
243 ## The mutational bias model predict a linear relationship between GARP vs FYMINK ==> so if outliers to that, it means that the excess of GARP or FYMINK are not explained by the mutationnal bias model but by other thing ... selection!!???
244 count_FYMINK=count_F+count_Y+count_M+count_I+count_N+count_K
245 prop_FYMINK = float(count_FYMINK)/float(ln)
246
247 count_GARP=count_G+count_A+count_R+count_P
248 prop_GARP=float(count_GARP)/float(ln)
249
250 ## 5 ## Hydophobicity hypothesis [should INCREASE with thermal adaptation]
251 ## 5.1. AL
252 count_AVLIMFYW = count_A+count_V+count_L+count_I+count_F+count_Y+count_W+count_M
253 prop_AVLIMFYW=float(count_AVLIMFYW)/float(ln)
254 ## 5.2. Only non-aromatic
255 count_AVLIM = count_A+count_V+count_L+count_I+count_M
256 prop_AVLIM=float(count_AVLIM)/float(ln)
257 ## 5.3. Only aromatic (have they higher residus volume?? in such case opposite hypothesis based on residu volume, predict DECREASE for these aa in composition)
258 count_FYW = count_F+count_Y+count_W
259 prop_FYW=float(count_FYW)/float(ln)
260
261 ## 6 ## Charged hypothesis => positivelly correlated with optimal growth temperature
262 # All charged
263 count_RHKDE = count_R + count_H +count_K + count_D + count_E
264 prop_RHKDE = float(count_RHKDE)/float(ln)
265 # Only positive
266 count_RHK = count_R + count_H +count_K
267 prop_RHK = float(count_RHK)/float(ln)
268 # Only negative
269 count_DE = count_D + count_E
270 prop_DE = float(count_DE)/float(ln)
271
272 ## 7 ## Neutral polar hypothesis [should DECREASE with thermal adaptation]
273 count_STNQ = count_S+count_T+count_N+count_Q
274 prop_STNQ=float(count_STNQ)/float(ln)
275
276
277 ## 9 ## PAYRE VS MGDS (FONTANILLAS CRITERIA)
278 ## 9.1 ## Didier's criteria 1 = SMALL / BIG
279 count_PAYRE = count_A+count_Y+count_P+count_R+count_E
280 prop_PAYRE=float(count_PAYRE)/float(ln)
281 count_MVGDS = count_V+count_M+count_S+count_G+count_D
282 prop_MVGDS=float(count_MVGDS)/float(ln)
283 if count_MVGDS!= 0:
284 ratio_PAYRE_vs_MVGDS = float(count_PAYRE)/float(count_MVGDS)
285 else:
286 ratio_PAYRE_vs_MVGDS=-1 ## "-1" will indicate the impossibility to compute the ratio (coz the numerator)
287
288 ## 9.2 ## Didier's criteria 2 = VERY SMALL / BIG
289 count_AC = count_A+count_C
290 prop_AC=float(count_AC)/float(ln)
291
292 #count_VLIM = count_V+count_L+count_I+count_M
293 if count_MVGDS != 0:
294 ratio_AC_vs_MVGDS = float(count_AC)/float(count_MVGDS)
295 else:
296 ratio_AC_vs_MVGDS=-1 ## "-1" will indicate the impossibility to compute the ratio (coz the numerator)
297 else:
298 count_IVYWREL=0
299 prop_IVYWREL=0
300 count_ERK=0
301 prop_ERK=0
302 count_DNQTSH=0
303 prop_DNQTSH=0
304 ratio_ERK_vs_DNQTSH=0
305 count_EK=0
306 prop_EK=0
307 count_QH=0
308 prop_QH=0
309 ratio_EK_vs_QH=0
310 count_FYMINK=0
311 prop_FYMINK=0
312 count_GARP=0
313 prop_GARP=0
314 count_AVLIMFYW=0
315 prop_AVLIMFYW=0
316 count_AVLIM=0
317 prop_AVLIM=0
318 count_FYW=0
319 prop_FYW=0
320 count_STNQ=0
321 prop_STNQ=0
322 count_MVGDS=0
323 prop_MVGDS=0
324 count_PAYRE=0
325 prop_PAYRE=0
326 count_AC=0
327 prop_AC=0
328 ratio_PAYRE_vs_MVGDS=0
329 ratio_AC_vs_MVGDS=0
330 count_RHKDE=0
331 prop_RHKDE=0
332 count_RHK=0
333 prop_RHK=0
334 count_DE=0
335 prop_DE=0
336
337 return(count_IVYWREL,prop_IVYWREL,count_ERK,prop_ERK,count_DNQTSH,prop_DNQTSH,ratio_ERK_vs_DNQTSH,count_EK,prop_EK,count_QH,prop_QH,ratio_EK_vs_QH,count_FYMINK,prop_FYMINK,count_GARP,prop_GARP,count_AVLIMFYW, prop_AVLIMFYW,count_AVLIM,prop_AVLIM,count_FYW,prop_FYW,count_STNQ, prop_STNQ, count_MVGDS,prop_MVGDS, count_PAYRE,prop_PAYRE, count_AC,prop_AC, ratio_PAYRE_vs_MVGDS, ratio_AC_vs_MVGDS, count_RHKDE,prop_RHKDE,count_RHK,prop_RHK,count_DE,prop_DE)
338 #####################
339
340
341 ##################
342 ###### DEF5 ######
343 ##################
344 def aa_properties(fileIN_aaProperties):
345 next = fileIN_aaProperties.readline() ## JUMP HEADERS
346
347 bash_aa_properties={}
348
349 while 1:
350 next = fileIN_aaProperties.readline()
351 if not next:
352 break
353
354 S1 = string.split(next, ",")
355
356 aa_name = S1[1]
357 S2 = string.split(aa_name, "/")
358 aa_code = S2[1][:-1]
359
360 frequencies = S1[2][:-1]
361 Residue_Weight = S1[5]
362 Residue_Volume = S1[6]
363 Partial_specific_volume = S1[7]
364 Hydration = S1[8]
365
366 bash_aa_properties[aa_code] = [frequencies,Residue_Weight,Residue_Volume,Partial_specific_volume,Hydration]
367
368 return(bash_aa_properties)
369
370
371 ##################
372 ###### DEF6 ######
373 ##################
374 def sequence_properties_from_aa_properties(seq, bash_properties):
375
376 ## 1 ## count occurence of AA
377 count_K=string.count(seq,"K")
378 count_R=string.count(seq,"R")
379 count_A=string.count(seq,"A")
380 count_F=string.count(seq,"F")
381 count_I=string.count(seq,"I")
382 count_L=string.count(seq,"L")
383 count_M=string.count(seq,"M")
384 count_V=string.count(seq,"V")
385 count_W=string.count(seq,"W")
386 count_N=string.count(seq,"N")
387 count_Q=string.count(seq,"Q")
388 count_S=string.count(seq,"S")
389 count_T=string.count(seq,"T")
390 count_H=string.count(seq,"H")
391 count_Y=string.count(seq,"Y")
392 count_C=string.count(seq,"C")
393 count_D=string.count(seq,"D")
394 count_E=string.count(seq,"E")
395 count_P=string.count(seq,"P")
396 count_G=string.count(seq,"G")
397
398 TOTAL=count_K+count_R+count_A+count_F+count_I+count_L+count_M+count_V+count_W+count_N+count_Q+count_S+count_T+count_H+count_Y+count_C+count_D+count_E+count_P+count_G
399
400 if (TOTAL!=0):
401
402
403 ## 2 ## Compute properties 1: Residue Weight (Mr) (UNIT:Daltons):
404
405 Total_Residue_Weight = count_K*float(bash_properties["K"][1]) + count_R*float(bash_properties["R"][1]) + count_A*float(bash_properties["A"][1]) + count_F*float(bash_properties["F"][1]) + count_I*float(bash_properties["I"][1]) + count_L*float(bash_properties["L"][1]) + count_M*float(bash_properties["M"][1]) + count_V*float(bash_properties["V"][1]) + count_W*float(bash_properties["W"][1]) + count_N*float(bash_properties["N"][1]) + count_Q*float(bash_properties["Q"][1]) + count_S*float(bash_properties["S"][1]) + count_T*float(bash_properties["T"][1]) + count_H*float(bash_properties["H"][1]) + count_Y*float(bash_properties["Y"][1]) + count_C*float(bash_properties["C"][1]) + count_D*float(bash_properties["D"][1]) + count_E*float(bash_properties["E"][1]) + count_P*float(bash_properties["P"][1]) + count_G*float(bash_properties["G"][1])
406 Total_Residue_Volume = count_K*float(bash_properties["K"][2]) + count_R*float(bash_properties["R"][2]) + count_A*float(bash_properties["A"][2]) + count_F*float(bash_properties["F"][2]) + count_I*float(bash_properties["I"][2]) + count_L*float(bash_properties["L"][2]) + count_M*float(bash_properties["M"][2]) + count_V*float(bash_properties["V"][2]) + count_W*float(bash_properties["W"][2]) + count_N*float(bash_properties["N"][2]) + count_Q*float(bash_properties["Q"][2]) + count_S*float(bash_properties["S"][2]) + count_T*float(bash_properties["T"][2]) + count_H*float(bash_properties["H"][2]) + count_Y*float(bash_properties["Y"][2]) + count_C*float(bash_properties["C"][2]) + count_D*float(bash_properties["D"][2]) + count_E*float(bash_properties["E"][2]) + count_P*float(bash_properties["P"][2]) + count_G*float(bash_properties["G"][2])
407 Total_Partial_specific_volume = count_K*float(bash_properties["K"][3]) + count_R*float(bash_properties["R"][3]) + count_A*float(bash_properties["A"][3]) + count_F*float(bash_properties["F"][3]) + count_I*float(bash_properties["I"][3]) + count_L*float(bash_properties["L"][3]) + count_M*float(bash_properties["M"][3]) + count_V*float(bash_properties["V"][3]) + count_W*float(bash_properties["W"][3]) + count_N*float(bash_properties["N"][3]) + count_Q*float(bash_properties["Q"][3]) + count_S*float(bash_properties["S"][3]) + count_T*float(bash_properties["T"][3]) + count_H*float(bash_properties["H"][3]) + count_Y*float(bash_properties["Y"][3]) + count_C*float(bash_properties["C"][3]) + count_D*float(bash_properties["D"][3]) + count_E*float(bash_properties["E"][3]) + count_P*float(bash_properties["P"][3]) + count_G*float(bash_properties["G"][3])
408 Total_Hydration = count_K*float(bash_properties["K"][4]) + count_R*float(bash_properties["R"][4]) + count_A*float(bash_properties["A"][4]) + count_F*float(bash_properties["F"][4]) + count_I*float(bash_properties["I"][4]) + count_L*float(bash_properties["L"][4]) + count_M*float(bash_properties["M"][4]) + count_V*float(bash_properties["V"][4]) + count_W*float(bash_properties["W"][4]) + count_N*float(bash_properties["N"][4]) + count_Q*float(bash_properties["Q"][4]) + count_S*float(bash_properties["S"][4]) + count_T*float(bash_properties["T"][4]) + count_H*float(bash_properties["H"][4]) + count_Y*float(bash_properties["Y"][4]) + count_C*float(bash_properties["C"][4]) + count_D*float(bash_properties["D"][4]) + count_E*float(bash_properties["E"][4]) + count_P*float(bash_properties["P"][4]) + count_G*float(bash_properties["G"][4])
409 else:
410 Total_Residue_Weight=0
411 Total_Residue_Volume=0
412 Total_Partial_specific_volume=0
413 Total_Hydration=0
414
415 return(Total_Residue_Weight,Total_Residue_Volume,Total_Partial_specific_volume,Total_Hydration)
416
417 ########################################################
418
419
420
421 ###################
422 ### RUN RUN RUN ###
423 ###################
424 import sys,os,zipfile,shutil,subprocess,string
425
426 ##Create specific folders
427 Path_IN_loci_NUC = "./IN_AA"
428 outpath= "./OUT"
429 os.makedirs(Path_IN_loci_NUC)
430 os.makedirs(outpath)
431
432
433 #Check if the file is a zip or fasta file
434
435 the_zip_file = zipfile.ZipFile(sys.argv[1])
436 ret = the_zip_file.testzip()
437
438 if ret is not None:
439 shutil.copy2(sys.argv[1], './IN_AA/input.fasta')
440 else:
441 cmd="unzip %s -d ./IN_AA"%(sys.argv[1])
442 os.system(cmd)
443
444
445
446 ## 1 ## List taxa
447 LT=[]
448 cmd="grep '>' %s" % sys.argv[2]
449 result = subprocess.check_output(cmd, shell=True)
450 result=result.split('\n')
451 for i in result:
452 sp=i[1:]
453 if sp !='':
454 LT.append(sp)
455 print LT
456
457
458 ## 2 ## PathIN
459 fileIN_properties = open("%s/01_AminoAcid_Properties2.csv"%(script_path), "r")
460 Path_IN_loci_AA = "./IN_AA"
461 #Path_IN_loci_AA = "02_CDS_No_Missing_Data_aa_CDS_withM"
462 Lloci_AA = os.listdir(Path_IN_loci_AA)
463
464 ## 3 ## PathOUT
465
466 ## 3.1 ## PROT composition
467 fileOUT_PROT_ALL=open("./OUT/13_prot_compositions_All_AA.csv","w")
468 fileOUT_PROT_ALL.write("LOCUS,")
469 for taxa in LT:
470 fileOUT_PROT_ALL.write("%s_prop_K,%s_prop_R,%s_prop_A,%s_prop_F,%s_prop_I,%s_prop_L,%s_prop_M,%s_prop_V,%s_prop_W,%s_prop_N,%s_prop_Q,%s_prop_S,%s_prop_T,%s_prop_H,%s_prop_Y,%s_prop_C,%s_prop_D,%s_prop_E,%s_prop_P,%s_prop_G," %(taxa,taxa,taxa,taxa,taxa,taxa,taxa,taxa,taxa,taxa,taxa,taxa,taxa,taxa,taxa,taxa,taxa,taxa,taxa,taxa))
471 fileOUT_PROT_ALL.write("\n")
472
473 ## 3.2 ## PROT IVYWREL
474 fileOUT_IVYWREL=open("./OUT/14_IVYWREL.csv","w")
475 fileOUT_IVYWREL.write("LOCUS,")
476 for taxa in LT:
477 fileOUT_IVYWREL.write("%s_count_IVYWREL,%s_prop_IVYWREL," %(taxa,taxa))
478 fileOUT_IVYWREL.write("\n")
479
480
481 ## 3.3 ## PROT ERK_DNQTSHA
482 fileOUT_ERK_DNQTSH=open("./OUT/15_ERK_DNQTSH.csv","w")
483 fileOUT_ERK_DNQTSH.write("LOCUS,")
484 for taxa in LT:
485 fileOUT_ERK_DNQTSH.write("%s_count_ERK,%s_prop_ERK,%s_count_DNQTSH,%s_prop_DNQTSH,%s_ratio_ERK_vs_DNQTSH," %(taxa,taxa,taxa,taxa,taxa))
486 fileOUT_ERK_DNQTSH.write("\n")
487
488 ## 3.4 ## PROT EK_QH
489 fileOUT_EK_QH=open("./OUT/16_EK_QH.csv","w")
490 fileOUT_EK_QH.write("LOCUS,")
491 for taxa in LT:
492 fileOUT_EK_QH.write("%s_count_EK,%s_prop_EK,%s_count_QH,%s_prop_QH,%s_ratio_EK_vs_QH," %(taxa,taxa,taxa,taxa,taxa))
493 fileOUT_EK_QH.write("\n")
494
495
496 ## 3.5 ## PROT FYMINK_GARP
497 fileOUT_FYMINK_GARP=open("./OUT/17_FYMINK_GARP.csv","w")
498 fileOUT_FYMINK_GARP.write("LOCUS,")
499 for taxa in LT:
500 fileOUT_FYMINK_GARP.write("%s_count_FYMINK,%s_prop_FYMINK,%s_count_GARP,%s_prop_GARP," %(taxa,taxa,taxa,taxa))
501 fileOUT_FYMINK_GARP.write("\n")
502
503
504 ## 3.6 ## PROT AVLIMFYW
505 fileOUT_AVLIMFYW=open("./OUT/18_AVLIMFYW.csv","w")
506 fileOUT_AVLIMFYW.write("LOCUS,")
507 for taxa in LT:
508 fileOUT_AVLIMFYW.write("%s_count_AVLIMFYW,%s_prop_AVLIMFYW,%s_count_AVLIM,%s_prop_AVLIM,%s_count_FYW,%s_prop_FYW," %(taxa,taxa,taxa,taxa,taxa,taxa))
509 fileOUT_AVLIMFYW.write("\n")
510
511 ## 3.7 ## PROT STNQ
512 fileOUT_STNQ=open("./OUT/19_STNQ.csv","w")
513 fileOUT_STNQ.write("LOCUS,")
514 for taxa in LT:
515 fileOUT_STNQ.write("%s_count_STNQ,%s_prop_STNQ," %(taxa,taxa))
516 fileOUT_STNQ.write("\n")
517
518 ## 3.8 ## PROT RHKDE
519 fileOUT_RHKDE=open("./OUT/20_RHKDE.csv","w")
520 fileOUT_RHKDE.write("LOCUS,")
521 for taxa in LT:
522 fileOUT_RHKDE.write("%s_count_RHKDE,%s_prop_RHKDE,%s_count_RHK,%s_prop_RHK,%s_count_DE,%s_prop_DE," %(taxa,taxa,taxa,taxa,taxa,taxa))
523 fileOUT_RHKDE.write("\n")
524
525 ## 3.9 ## PROT DIDER CRITERIA
526 fileOUT_PAYRE=open("./OUT/21_PAYRE-MVGDS.csv","w")
527 fileOUT_PAYRE.write("LOCUS,")
528 for taxa in LT:
529 fileOUT_PAYRE.write("%s_count_PAYRE,%s_prop_PAYRE,%s_count_AC,%s_prop_AC,%s_count_MVGDS,%s_prop_MVGDS,%s_ratio_PAYRE_vs_MVGDS,%s_ratio_AC_vs_MVGDS," %(taxa,taxa,taxa,taxa,taxa,taxa,taxa,taxa))
530 fileOUT_PAYRE.write("\n")
531
532 ## 3.10 ## PROT Total residue weight
533 fileOUT_TotalResidueWeight=open("./OUT/22_TotalResidueWeight.csv","w")
534 fileOUT_TotalResidueWeight.write("LOCUS,")
535 for taxa in LT:
536 fileOUT_TotalResidueWeight.write("%s_Total_Residue_Weight," %taxa)
537 fileOUT_TotalResidueWeight.write("\n")
538
539 ## 3.11 ## PROT Total residue volume
540 fileOUT_TotalResidueVolume=open("./OUT/23_TotalResidueVolume.csv","w")
541 fileOUT_TotalResidueVolume.write("LOCUS,")
542 for taxa in LT:
543 fileOUT_TotalResidueVolume.write("%s_Total_Residue_Volume," %taxa)
544 fileOUT_TotalResidueVolume.write("\n")
545
546 ## 3.12 ## PROT Total partial specific volume
547 fileOUT_TotalPartialSpecificVolume=open("./OUT/24_TotalPartialSpecificVolume.csv","w")
548 fileOUT_TotalPartialSpecificVolume.write("LOCUS,")
549 for taxa in LT:
550 fileOUT_TotalPartialSpecificVolume.write("%s_Total_Partial_Specific_Volume," %taxa)
551 fileOUT_TotalPartialSpecificVolume.write("\n")
552
553 ## 3.13 ## PROT Total hydratation
554 fileOUT_TotalHydratation=open("./OUT/25_TotalHydratation.csv","w")
555 fileOUT_TotalHydratation.write("LOCUS,")
556 for taxa in LT:
557 fileOUT_TotalHydratation.write("%s_Total_Hydratation," %taxa)
558 fileOUT_TotalHydratation.write("\n")
559
560
561 #####################
562 ## 4 ## Process Loci
563 #####################
564 bash_aa_properties = aa_properties(fileIN_properties)
565
566 for locus in Lloci_AA:
567 print locus
568 path_locus = "%s/%s" %(Path_IN_loci_AA, locus)
569 bash = dico(path_locus,LT)
570
571 #print bash
572
573 fileOUT_PROT_ALL.write("%s," %locus)
574 fileOUT_IVYWREL.write("%s," %locus)
575 fileOUT_ERK_DNQTSH.write("%s," %locus)
576 fileOUT_EK_QH.write("%s," %locus)
577 fileOUT_FYMINK_GARP.write("%s," %locus)
578 fileOUT_AVLIMFYW.write("%s," %locus)
579 fileOUT_STNQ.write("%s," %locus)
580 fileOUT_RHKDE.write("%s," %locus)
581 fileOUT_PAYRE.write("%s," %locus)
582 fileOUT_TotalResidueWeight.write("%s," %locus)
583 fileOUT_TotalResidueVolume.write("%s," %locus)
584 fileOUT_TotalPartialSpecificVolume.write("%s," %locus)
585 fileOUT_TotalHydratation.write("%s," %locus)
586
587 for taxa in LT:
588 if taxa in bash.keys():
589 seq = bash[taxa]
590 prop_K,prop_R,prop_A,prop_F,prop_I,prop_L,prop_M,prop_V,prop_W,prop_N,prop_Q,prop_S,prop_T,prop_H,prop_Y,prop_C,prop_D,prop_E,prop_P,prop_G = aa_composition1(seq) ### DEF3 ###
591 count_IVYWREL,prop_IVYWREL,count_ERK,prop_ERK,count_DNQTSH,prop_DNQTSH,ratio_ERK_vs_DNQTSH,count_EK,prop_EK,count_QH,prop_QH,ratio_EK_vs_QH,count_FYMINK,prop_FYMINK,count_GARP,prop_GARP,count_AVLIMFYW,prop_AVLIMFYW,count_AVLIM,prop_AVLIM,count_FYW,prop_FYW,count_STNQ,prop_STNQ, count_MVGDS,prop_MVGDS, count_PAYRE,prop_PAYRE, count_AC,prop_AC, ratio_PAYRE_vs_MVGDS, ratio_AC_vs_MVGDS,count_RHKDE,prop_RHKDE,count_RHK,prop_RHK,count_DE,prop_DE = aa_composition2(seq) ### DEF4 ###
592 Total_Residue_Weight,Total_Residue_Volume,Total_Partial_Specific_Volume,Total_Hydration = sequence_properties_from_aa_properties(seq, bash_aa_properties) ### DEF6 ###
593
594 fileOUT_PROT_ALL.write("%.5f,%.5f,%.5f,%.5f,%.5f,%.5f,%.5f,%.5f,%.5f,%.5f,%.5f,%.5f,%.5f,%.5f,%.5f,%.5f,%.5f,%.5f,%.5f,%.5f," %(prop_K,prop_R,prop_A,prop_F,prop_I,prop_L,prop_M,prop_V,prop_W,prop_N,prop_Q,prop_S,prop_T,prop_H,prop_Y,prop_C,prop_D,prop_E,prop_P,prop_G))
595 fileOUT_IVYWREL.write("%.5f,%.5f," %(count_IVYWREL, prop_IVYWREL))
596 fileOUT_ERK_DNQTSH.write("%.5f,%.5f,%.5f,%.5f,%.5f," %(count_ERK,prop_ERK,count_DNQTSH,prop_DNQTSH,ratio_ERK_vs_DNQTSH))
597 fileOUT_EK_QH.write("%.5f,%.5f,%.5f,%.5f,%.5f," %(count_EK,prop_EK,count_QH,prop_QH,ratio_EK_vs_QH))
598 fileOUT_FYMINK_GARP.write("%.5f,%.5f,%.5f,%.5f," %(count_FYMINK,prop_FYMINK,count_GARP,prop_GARP))
599 fileOUT_AVLIMFYW.write("%.5f,%.5f,%.5f,%.5f,%.5f,%.5f," %(count_AVLIMFYW,prop_AVLIMFYW,count_AVLIM,prop_AVLIM,count_FYW,prop_FYW))
600 fileOUT_STNQ.write("%.5f,%.5f," %(count_STNQ,prop_STNQ))
601 fileOUT_RHKDE.write("%.5f,%.5f,%.5f,%.5f,%.5f,%.5f,"%(count_RHKDE,prop_RHKDE,count_RHK,prop_RHK,count_DE,prop_DE))
602 fileOUT_PAYRE.write("%.5f,%.5f,%.5f,%.5f,%.5f,%.5f,%.5f,%.5f," %(count_PAYRE,prop_PAYRE,count_AC,prop_AC,count_MVGDS,prop_MVGDS,ratio_PAYRE_vs_MVGDS,ratio_AC_vs_MVGDS))
603 fileOUT_TotalResidueWeight.write("%.5f," %Total_Residue_Weight)
604 fileOUT_TotalResidueVolume.write("%.5f," %Total_Residue_Volume)
605 fileOUT_TotalPartialSpecificVolume.write("%.5f," %(Total_Partial_Specific_Volume))
606 fileOUT_TotalHydratation.write("%.5f," % Total_Hydration)
607
608 ## END LINE
609 fileOUT_PROT_ALL.write("\n")
610 fileOUT_IVYWREL.write("\n")
611 fileOUT_ERK_DNQTSH.write("\n")
612 fileOUT_EK_QH.write("\n")
613 fileOUT_FYMINK_GARP.write("\n")
614 fileOUT_AVLIMFYW.write("\n")
615 fileOUT_STNQ.write("\n")
616 fileOUT_RHKDE.write("\n")
617 fileOUT_PAYRE.write("\n")
618 fileOUT_TotalResidueWeight.write("\n")
619 fileOUT_TotalResidueVolume.write("\n")
620 fileOUT_TotalPartialSpecificVolume.write("\n")
621 fileOUT_TotalHydratation.write("\n")
622
623
624
625