Mercurial > repos > abims-sbr > mutcount
comparison scripts/S01b_study_seq_composition_aa.py @ 0:78dd6454f6f0 draft
planemo upload for repository htpps://github.com/abims-sbr/adaptearch commit 73670b26c75bb6c1a6332481920f3036314de364
| author | abims-sbr |
|---|---|
| date | Tue, 02 May 2017 04:20:51 -0400 |
| parents | |
| children | 8de21b6eb110 |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:78dd6454f6f0 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 # -*- coding: ascii -*- | |
| 3 ## Author: Eric FONTANILLAS | |
| 4 ## Date: 21.12.10 | |
| 5 ## Object: Test for compositional bias in genome and proteome as marker of thermal adaptation (comparison between 2 "hot" species: Ap and Ps and two "cold" species: Pg, Pp) | |
| 6 import sys, os | |
| 7 script_path = os.path.dirname(sys.argv[0]) | |
| 8 | |
| 9 ############# | |
| 10 ### DEF 0 ### | |
| 11 ############# | |
| 12 def simplify_fasta_name(fasta_name,LT): | |
| 13 | |
| 14 for abbreviation in LT: | |
| 15 if abbreviation in fasta_name: | |
| 16 new_fasta_name = abbreviation | |
| 17 | |
| 18 return(new_fasta_name) | |
| 19 ########################################## | |
| 20 | |
| 21 ########### | |
| 22 ## DEF1 ## | |
| 23 ########### | |
| 24 ## Generates bash, with key = fasta name; value = sequence (WITH GAP, IF ANY, REMOVED IN THIS FUNCTION) | |
| 25 | |
| 26 def dico(fasta_file,LT): | |
| 27 | |
| 28 count_fastaName=0 | |
| 29 F1 = open(fasta_file, "r") | |
| 30 | |
| 31 bash1 = {} | |
| 32 while 1: | |
| 33 nextline = F1.readline() | |
| 34 #print nextline | |
| 35 if not nextline : | |
| 36 break | |
| 37 | |
| 38 if nextline[0] == ">": | |
| 39 count_fastaName = count_fastaName + 1 | |
| 40 fasta_name = nextline[1:-1] | |
| 41 nextline = F1.readline() | |
| 42 sequence = nextline[:-1] | |
| 43 | |
| 44 if fasta_name not in bash1.keys(): | |
| 45 fasta_name = simplify_fasta_name(fasta_name,LT) ### DEF 0 ### | |
| 46 bash1[fasta_name] = sequence | |
| 47 else: | |
| 48 print fasta_name | |
| 49 | |
| 50 # Find alignment length | |
| 51 kk = bash1.keys() | |
| 52 key0 = kk[0] | |
| 53 seq0 = bash1[key0] | |
| 54 ln_seq = len(seq0) | |
| 55 | |
| 56 F1.close() | |
| 57 | |
| 58 return(bash1) | |
| 59 ##################################### | |
| 60 | |
| 61 | |
| 62 | |
| 63 ################## | |
| 64 ###### DEF2 ###### | |
| 65 ################## | |
| 66 def base_composition(seq): | |
| 67 count_A=string.count(seq, "A") | |
| 68 count_T=string.count(seq, "T") | |
| 69 count_C=string.count(seq, "C") | |
| 70 count_G=string.count(seq, "G") | |
| 71 | |
| 72 | |
| 73 CG = count_C+count_G | |
| 74 AT = count_T+count_A | |
| 75 | |
| 76 AG = count_A+count_G | |
| 77 TC = count_T+count_C | |
| 78 | |
| 79 ## 1 ## Search for compositional bias in genome as marker of thermal adaptation: CG vs AT | |
| 80 ratio_CG_AT=float(CG)/float(AT) | |
| 81 | |
| 82 ## 2 ## Search for compositional bias in genome as marker of thermal adaptation: AG vs TC | |
| 83 ratio_purine_pyrimidine=float(AG)/float(TC) | |
| 84 | |
| 85 ## 3 ## Nucleotide proportion | |
| 86 ln = len(seq) | |
| 87 prop_A = float(count_A)/float(ln) | |
| 88 prop_T = float(count_T)/float(ln) | |
| 89 prop_C = float(count_C)/float(ln) | |
| 90 prop_G = float(count_G)/float(ln) | |
| 91 | |
| 92 | |
| 93 return(ratio_CG_AT, ratio_purine_pyrimidine, prop_A, prop_T, prop_C, prop_G) | |
| 94 ############################################## | |
| 95 | |
| 96 | |
| 97 ################## | |
| 98 ###### DEF3 ###### | |
| 99 ################## | |
| 100 def aa_composition1(seq): | |
| 101 | |
| 102 ## 1 ## count occurence of AA | |
| 103 count_K=string.count(seq,"K") | |
| 104 count_R=string.count(seq,"R") | |
| 105 count_A=string.count(seq,"A") | |
| 106 count_F=string.count(seq,"F") | |
| 107 count_I=string.count(seq,"I") | |
| 108 count_L=string.count(seq,"L") | |
| 109 count_M=string.count(seq,"M") | |
| 110 count_V=string.count(seq,"V") | |
| 111 count_W=string.count(seq,"W") | |
| 112 count_N=string.count(seq,"N") | |
| 113 count_Q=string.count(seq,"Q") | |
| 114 count_S=string.count(seq,"S") | |
| 115 count_T=string.count(seq,"T") | |
| 116 count_H=string.count(seq,"H") | |
| 117 count_Y=string.count(seq,"Y") | |
| 118 count_C=string.count(seq,"C") | |
| 119 count_D=string.count(seq,"D") | |
| 120 count_E=string.count(seq,"E") | |
| 121 count_P=string.count(seq,"P") | |
| 122 count_G=string.count(seq,"G") | |
| 123 | |
| 124 | |
| 125 | |
| 126 ## 2 ## compute relative proportion | |
| 127 TOTAL=count_K+count_R+count_A+count_F+count_I+count_L+count_M+count_V+count_W+count_N+count_Q+count_S+count_T+count_H+count_Y+count_C+count_D+count_E+count_P+count_G | |
| 128 if (TOTAL!=0): | |
| 129 ln = TOTAL | |
| 130 | |
| 131 prop_K=float(count_K)/float(ln) | |
| 132 prop_R=float(count_R)/float(ln) | |
| 133 prop_A=float(count_A)/float(ln) | |
| 134 prop_F=float(count_F)/float(ln) | |
| 135 prop_I=float(count_I)/float(ln) | |
| 136 prop_L=float(count_L)/float(ln) | |
| 137 prop_M=float(count_M)/float(ln) | |
| 138 prop_V=float(count_V)/float(ln) | |
| 139 prop_W=float(count_W)/float(ln) | |
| 140 prop_N=float(count_N)/float(ln) | |
| 141 prop_Q=float(count_Q)/float(ln) | |
| 142 prop_S=float(count_S)/float(ln) | |
| 143 prop_T=float(count_T)/float(ln) | |
| 144 prop_H=float(count_H)/float(ln) | |
| 145 prop_Y=float(count_Y)/float(ln) | |
| 146 prop_C=float(count_C)/float(ln) | |
| 147 prop_D=float(count_D)/float(ln) | |
| 148 prop_E=float(count_E)/float(ln) | |
| 149 prop_P=float(count_P)/float(ln) | |
| 150 prop_G=float(count_G)/float(ln) | |
| 151 else: | |
| 152 prop_K=0 | |
| 153 prop_R=0 | |
| 154 prop_A=0 | |
| 155 prop_F=0 | |
| 156 prop_I=0 | |
| 157 prop_L=0 | |
| 158 prop_M=0 | |
| 159 prop_V=0 | |
| 160 prop_W=0 | |
| 161 prop_N=0 | |
| 162 prop_Q=0 | |
| 163 prop_S=0 | |
| 164 prop_T=0 | |
| 165 prop_H=0 | |
| 166 prop_Y=0 | |
| 167 prop_C=0 | |
| 168 prop_D=0 | |
| 169 prop_E=0 | |
| 170 prop_P=0 | |
| 171 prop_G=0 | |
| 172 | |
| 173 | |
| 174 | |
| 175 | |
| 176 return(prop_K,prop_R,prop_A,prop_F,prop_I,prop_L,prop_M,prop_V,prop_W,prop_N,prop_Q,prop_S,prop_T,prop_H,prop_Y,prop_C,prop_D,prop_E,prop_P,prop_G) | |
| 177 | |
| 178 ################## | |
| 179 ###### DEF4 ###### | |
| 180 ################## | |
| 181 def aa_composition2(seq): | |
| 182 | |
| 183 ## 1 ## count occurence of AA | |
| 184 count_K=string.count(seq,"K") | |
| 185 count_R=string.count(seq,"R") | |
| 186 count_A=string.count(seq,"A") | |
| 187 count_F=string.count(seq,"F") | |
| 188 count_I=string.count(seq,"I") | |
| 189 count_L=string.count(seq,"L") | |
| 190 count_M=string.count(seq,"M") | |
| 191 count_V=string.count(seq,"V") | |
| 192 count_W=string.count(seq,"W") | |
| 193 count_N=string.count(seq,"N") | |
| 194 count_Q=string.count(seq,"Q") | |
| 195 count_S=string.count(seq,"S") | |
| 196 count_T=string.count(seq,"T") | |
| 197 count_H=string.count(seq,"H") | |
| 198 count_Y=string.count(seq,"Y") | |
| 199 count_C=string.count(seq,"C") | |
| 200 count_D=string.count(seq,"D") | |
| 201 count_E=string.count(seq,"E") | |
| 202 count_P=string.count(seq,"P") | |
| 203 count_G=string.count(seq,"G") | |
| 204 | |
| 205 | |
| 206 | |
| 207 ## 2 ## compute seq length | |
| 208 TOTAL=count_K+count_R+count_A+count_F+count_I+count_L+count_M+count_V+count_W+count_N+count_Q+count_S+count_T+count_H+count_Y+count_C+count_D+count_E+count_P+count_G | |
| 209 if (TOTAL!=0): | |
| 210 | |
| 211 ln = TOTAL | |
| 212 ##3 Famous Hyperthermophile Prokaryotes criterias | |
| 213 | |
| 214 # 3.1. IVYWREL estimator => positivelly correlated with otpimal growth | |
| 215 count_IVYWREL = count_I+count_V+count_Y+count_W+count_R+count_E+count_L | |
| 216 prop_IVYWREL = float(count_IVYWREL)/float(ln) | |
| 217 | |
| 218 # 3.2. ERK estimator (i.e. ERK vs DNQTSHA) => positivelly correlated with optimal growth temperature | |
| 219 # ERK alone | |
| 220 count_ERK = count_E + count_R + count_K | |
| 221 prop_ERK = float(count_ERK)/float(ln) | |
| 222 # DNQTSHA alone | |
| 223 count_DNQTSH = count_D+count_N+count_Q+count_T+count_S+count_H | |
| 224 prop_DNQTSH=float(count_DNQTSH)/float(ln) | |
| 225 # ERK vs DNQTSH | |
| 226 if count_DNQTSH != 0: | |
| 227 ratio_ERK_vs_DNQTSH = float(count_ERK)/float(count_DNQTSH) | |
| 228 else: | |
| 229 ratio_ERK_vs_DNQTSH=-1 | |
| 230 # EK/QH estimator | |
| 231 count_EK = count_E+count_K | |
| 232 count_QH = count_Q+count_H | |
| 233 | |
| 234 prop_EK = float(count_EK)/float(ln) | |
| 235 prop_QH = float(count_QH)/float(ln) | |
| 236 | |
| 237 if count_QH != 0: | |
| 238 ratio_EK_vs_QH = float(count_EK)/float(count_QH) | |
| 239 else: | |
| 240 ratio_EK_vs_QH=-1 ## "-1" will indicate the impossibility to compute the ratio (coz the numerator) | |
| 241 | |
| 242 ## 4 ## Mutationnal bias hypothesis => AT rich: favor FYMINK // GC rich: favor GARP | |
| 243 ## The mutational bias model predict a linear relationship between GARP vs FYMINK ==> so if outliers to that, it means that the excess of GARP or FYMINK are not explained by the mutationnal bias model but by other thing ... selection!!??? | |
| 244 count_FYMINK=count_F+count_Y+count_M+count_I+count_N+count_K | |
| 245 prop_FYMINK = float(count_FYMINK)/float(ln) | |
| 246 | |
| 247 count_GARP=count_G+count_A+count_R+count_P | |
| 248 prop_GARP=float(count_GARP)/float(ln) | |
| 249 | |
| 250 ## 5 ## Hydophobicity hypothesis [should INCREASE with thermal adaptation] | |
| 251 ## 5.1. AL | |
| 252 count_AVLIMFYW = count_A+count_V+count_L+count_I+count_F+count_Y+count_W+count_M | |
| 253 prop_AVLIMFYW=float(count_AVLIMFYW)/float(ln) | |
| 254 ## 5.2. Only non-aromatic | |
| 255 count_AVLIM = count_A+count_V+count_L+count_I+count_M | |
| 256 prop_AVLIM=float(count_AVLIM)/float(ln) | |
| 257 ## 5.3. Only aromatic (have they higher residus volume?? in such case opposite hypothesis based on residu volume, predict DECREASE for these aa in composition) | |
| 258 count_FYW = count_F+count_Y+count_W | |
| 259 prop_FYW=float(count_FYW)/float(ln) | |
| 260 | |
| 261 ## 6 ## Charged hypothesis => positivelly correlated with optimal growth temperature | |
| 262 # All charged | |
| 263 count_RHKDE = count_R + count_H +count_K + count_D + count_E | |
| 264 prop_RHKDE = float(count_RHKDE)/float(ln) | |
| 265 # Only positive | |
| 266 count_RHK = count_R + count_H +count_K | |
| 267 prop_RHK = float(count_RHK)/float(ln) | |
| 268 # Only negative | |
| 269 count_DE = count_D + count_E | |
| 270 prop_DE = float(count_DE)/float(ln) | |
| 271 | |
| 272 ## 7 ## Neutral polar hypothesis [should DECREASE with thermal adaptation] | |
| 273 count_STNQ = count_S+count_T+count_N+count_Q | |
| 274 prop_STNQ=float(count_STNQ)/float(ln) | |
| 275 | |
| 276 | |
| 277 ## 9 ## PAYRE VS MGDS (FONTANILLAS CRITERIA) | |
| 278 ## 9.1 ## Didier's criteria 1 = SMALL / BIG | |
| 279 count_PAYRE = count_A+count_Y+count_P+count_R+count_E | |
| 280 prop_PAYRE=float(count_PAYRE)/float(ln) | |
| 281 count_MVGDS = count_V+count_M+count_S+count_G+count_D | |
| 282 prop_MVGDS=float(count_MVGDS)/float(ln) | |
| 283 if count_MVGDS!= 0: | |
| 284 ratio_PAYRE_vs_MVGDS = float(count_PAYRE)/float(count_MVGDS) | |
| 285 else: | |
| 286 ratio_PAYRE_vs_MVGDS=-1 ## "-1" will indicate the impossibility to compute the ratio (coz the numerator) | |
| 287 | |
| 288 ## 9.2 ## Didier's criteria 2 = VERY SMALL / BIG | |
| 289 count_AC = count_A+count_C | |
| 290 prop_AC=float(count_AC)/float(ln) | |
| 291 | |
| 292 #count_VLIM = count_V+count_L+count_I+count_M | |
| 293 if count_MVGDS != 0: | |
| 294 ratio_AC_vs_MVGDS = float(count_AC)/float(count_MVGDS) | |
| 295 else: | |
| 296 ratio_AC_vs_MVGDS=-1 ## "-1" will indicate the impossibility to compute the ratio (coz the numerator) | |
| 297 else: | |
| 298 count_IVYWREL=0 | |
| 299 prop_IVYWREL=0 | |
| 300 count_ERK=0 | |
| 301 prop_ERK=0 | |
| 302 count_DNQTSH=0 | |
| 303 prop_DNQTSH=0 | |
| 304 ratio_ERK_vs_DNQTSH=0 | |
| 305 count_EK=0 | |
| 306 prop_EK=0 | |
| 307 count_QH=0 | |
| 308 prop_QH=0 | |
| 309 ratio_EK_vs_QH=0 | |
| 310 count_FYMINK=0 | |
| 311 prop_FYMINK=0 | |
| 312 count_GARP=0 | |
| 313 prop_GARP=0 | |
| 314 count_AVLIMFYW=0 | |
| 315 prop_AVLIMFYW=0 | |
| 316 count_AVLIM=0 | |
| 317 prop_AVLIM=0 | |
| 318 count_FYW=0 | |
| 319 prop_FYW=0 | |
| 320 count_STNQ=0 | |
| 321 prop_STNQ=0 | |
| 322 count_MVGDS=0 | |
| 323 prop_MVGDS=0 | |
| 324 count_PAYRE=0 | |
| 325 prop_PAYRE=0 | |
| 326 count_AC=0 | |
| 327 prop_AC=0 | |
| 328 ratio_PAYRE_vs_MVGDS=0 | |
| 329 ratio_AC_vs_MVGDS=0 | |
| 330 count_RHKDE=0 | |
| 331 prop_RHKDE=0 | |
| 332 count_RHK=0 | |
| 333 prop_RHK=0 | |
| 334 count_DE=0 | |
| 335 prop_DE=0 | |
| 336 | |
| 337 return(count_IVYWREL,prop_IVYWREL,count_ERK,prop_ERK,count_DNQTSH,prop_DNQTSH,ratio_ERK_vs_DNQTSH,count_EK,prop_EK,count_QH,prop_QH,ratio_EK_vs_QH,count_FYMINK,prop_FYMINK,count_GARP,prop_GARP,count_AVLIMFYW, prop_AVLIMFYW,count_AVLIM,prop_AVLIM,count_FYW,prop_FYW,count_STNQ, prop_STNQ, count_MVGDS,prop_MVGDS, count_PAYRE,prop_PAYRE, count_AC,prop_AC, ratio_PAYRE_vs_MVGDS, ratio_AC_vs_MVGDS, count_RHKDE,prop_RHKDE,count_RHK,prop_RHK,count_DE,prop_DE) | |
| 338 ##################### | |
| 339 | |
| 340 | |
| 341 ################## | |
| 342 ###### DEF5 ###### | |
| 343 ################## | |
| 344 def aa_properties(fileIN_aaProperties): | |
| 345 next = fileIN_aaProperties.readline() ## JUMP HEADERS | |
| 346 | |
| 347 bash_aa_properties={} | |
| 348 | |
| 349 while 1: | |
| 350 next = fileIN_aaProperties.readline() | |
| 351 if not next: | |
| 352 break | |
| 353 | |
| 354 S1 = string.split(next, ",") | |
| 355 | |
| 356 aa_name = S1[1] | |
| 357 S2 = string.split(aa_name, "/") | |
| 358 aa_code = S2[1][:-1] | |
| 359 | |
| 360 frequencies = S1[2][:-1] | |
| 361 Residue_Weight = S1[5] | |
| 362 Residue_Volume = S1[6] | |
| 363 Partial_specific_volume = S1[7] | |
| 364 Hydration = S1[8] | |
| 365 | |
| 366 bash_aa_properties[aa_code] = [frequencies,Residue_Weight,Residue_Volume,Partial_specific_volume,Hydration] | |
| 367 | |
| 368 return(bash_aa_properties) | |
| 369 | |
| 370 | |
| 371 ################## | |
| 372 ###### DEF6 ###### | |
| 373 ################## | |
| 374 def sequence_properties_from_aa_properties(seq, bash_properties): | |
| 375 | |
| 376 ## 1 ## count occurence of AA | |
| 377 count_K=string.count(seq,"K") | |
| 378 count_R=string.count(seq,"R") | |
| 379 count_A=string.count(seq,"A") | |
| 380 count_F=string.count(seq,"F") | |
| 381 count_I=string.count(seq,"I") | |
| 382 count_L=string.count(seq,"L") | |
| 383 count_M=string.count(seq,"M") | |
| 384 count_V=string.count(seq,"V") | |
| 385 count_W=string.count(seq,"W") | |
| 386 count_N=string.count(seq,"N") | |
| 387 count_Q=string.count(seq,"Q") | |
| 388 count_S=string.count(seq,"S") | |
| 389 count_T=string.count(seq,"T") | |
| 390 count_H=string.count(seq,"H") | |
| 391 count_Y=string.count(seq,"Y") | |
| 392 count_C=string.count(seq,"C") | |
| 393 count_D=string.count(seq,"D") | |
| 394 count_E=string.count(seq,"E") | |
| 395 count_P=string.count(seq,"P") | |
| 396 count_G=string.count(seq,"G") | |
| 397 | |
| 398 TOTAL=count_K+count_R+count_A+count_F+count_I+count_L+count_M+count_V+count_W+count_N+count_Q+count_S+count_T+count_H+count_Y+count_C+count_D+count_E+count_P+count_G | |
| 399 | |
| 400 if (TOTAL!=0): | |
| 401 | |
| 402 | |
| 403 ## 2 ## Compute properties 1: Residue Weight (Mr) (UNIT:Daltons): | |
| 404 | |
| 405 Total_Residue_Weight = count_K*float(bash_properties["K"][1]) + count_R*float(bash_properties["R"][1]) + count_A*float(bash_properties["A"][1]) + count_F*float(bash_properties["F"][1]) + count_I*float(bash_properties["I"][1]) + count_L*float(bash_properties["L"][1]) + count_M*float(bash_properties["M"][1]) + count_V*float(bash_properties["V"][1]) + count_W*float(bash_properties["W"][1]) + count_N*float(bash_properties["N"][1]) + count_Q*float(bash_properties["Q"][1]) + count_S*float(bash_properties["S"][1]) + count_T*float(bash_properties["T"][1]) + count_H*float(bash_properties["H"][1]) + count_Y*float(bash_properties["Y"][1]) + count_C*float(bash_properties["C"][1]) + count_D*float(bash_properties["D"][1]) + count_E*float(bash_properties["E"][1]) + count_P*float(bash_properties["P"][1]) + count_G*float(bash_properties["G"][1]) | |
| 406 Total_Residue_Volume = count_K*float(bash_properties["K"][2]) + count_R*float(bash_properties["R"][2]) + count_A*float(bash_properties["A"][2]) + count_F*float(bash_properties["F"][2]) + count_I*float(bash_properties["I"][2]) + count_L*float(bash_properties["L"][2]) + count_M*float(bash_properties["M"][2]) + count_V*float(bash_properties["V"][2]) + count_W*float(bash_properties["W"][2]) + count_N*float(bash_properties["N"][2]) + count_Q*float(bash_properties["Q"][2]) + count_S*float(bash_properties["S"][2]) + count_T*float(bash_properties["T"][2]) + count_H*float(bash_properties["H"][2]) + count_Y*float(bash_properties["Y"][2]) + count_C*float(bash_properties["C"][2]) + count_D*float(bash_properties["D"][2]) + count_E*float(bash_properties["E"][2]) + count_P*float(bash_properties["P"][2]) + count_G*float(bash_properties["G"][2]) | |
| 407 Total_Partial_specific_volume = count_K*float(bash_properties["K"][3]) + count_R*float(bash_properties["R"][3]) + count_A*float(bash_properties["A"][3]) + count_F*float(bash_properties["F"][3]) + count_I*float(bash_properties["I"][3]) + count_L*float(bash_properties["L"][3]) + count_M*float(bash_properties["M"][3]) + count_V*float(bash_properties["V"][3]) + count_W*float(bash_properties["W"][3]) + count_N*float(bash_properties["N"][3]) + count_Q*float(bash_properties["Q"][3]) + count_S*float(bash_properties["S"][3]) + count_T*float(bash_properties["T"][3]) + count_H*float(bash_properties["H"][3]) + count_Y*float(bash_properties["Y"][3]) + count_C*float(bash_properties["C"][3]) + count_D*float(bash_properties["D"][3]) + count_E*float(bash_properties["E"][3]) + count_P*float(bash_properties["P"][3]) + count_G*float(bash_properties["G"][3]) | |
| 408 Total_Hydration = count_K*float(bash_properties["K"][4]) + count_R*float(bash_properties["R"][4]) + count_A*float(bash_properties["A"][4]) + count_F*float(bash_properties["F"][4]) + count_I*float(bash_properties["I"][4]) + count_L*float(bash_properties["L"][4]) + count_M*float(bash_properties["M"][4]) + count_V*float(bash_properties["V"][4]) + count_W*float(bash_properties["W"][4]) + count_N*float(bash_properties["N"][4]) + count_Q*float(bash_properties["Q"][4]) + count_S*float(bash_properties["S"][4]) + count_T*float(bash_properties["T"][4]) + count_H*float(bash_properties["H"][4]) + count_Y*float(bash_properties["Y"][4]) + count_C*float(bash_properties["C"][4]) + count_D*float(bash_properties["D"][4]) + count_E*float(bash_properties["E"][4]) + count_P*float(bash_properties["P"][4]) + count_G*float(bash_properties["G"][4]) | |
| 409 else: | |
| 410 Total_Residue_Weight=0 | |
| 411 Total_Residue_Volume=0 | |
| 412 Total_Partial_specific_volume=0 | |
| 413 Total_Hydration=0 | |
| 414 | |
| 415 return(Total_Residue_Weight,Total_Residue_Volume,Total_Partial_specific_volume,Total_Hydration) | |
| 416 | |
| 417 ######################################################## | |
| 418 | |
| 419 | |
| 420 | |
| 421 ################### | |
| 422 ### RUN RUN RUN ### | |
| 423 ################### | |
| 424 import sys,os,zipfile,shutil,subprocess,string | |
| 425 | |
| 426 ##Create specific folders | |
| 427 Path_IN_loci_NUC = "./IN_AA" | |
| 428 outpath= "./OUT" | |
| 429 os.makedirs(Path_IN_loci_NUC) | |
| 430 os.makedirs(outpath) | |
| 431 | |
| 432 | |
| 433 #Check if the file is a zip or fasta file | |
| 434 | |
| 435 the_zip_file = zipfile.ZipFile(sys.argv[1]) | |
| 436 ret = the_zip_file.testzip() | |
| 437 | |
| 438 if ret is not None: | |
| 439 shutil.copy2(sys.argv[1], './IN_AA/input.fasta') | |
| 440 else: | |
| 441 cmd="unzip %s -d ./IN_AA"%(sys.argv[1]) | |
| 442 os.system(cmd) | |
| 443 | |
| 444 | |
| 445 | |
| 446 ## 1 ## List taxa | |
| 447 LT=[] | |
| 448 cmd="grep '>' %s" % sys.argv[2] | |
| 449 result = subprocess.check_output(cmd, shell=True) | |
| 450 result=result.split('\n') | |
| 451 for i in result: | |
| 452 sp=i[1:] | |
| 453 if sp !='': | |
| 454 LT.append(sp) | |
| 455 print LT | |
| 456 | |
| 457 | |
| 458 ## 2 ## PathIN | |
| 459 fileIN_properties = open("%s/01_AminoAcid_Properties2.csv"%(script_path), "r") | |
| 460 Path_IN_loci_AA = "./IN_AA" | |
| 461 #Path_IN_loci_AA = "02_CDS_No_Missing_Data_aa_CDS_withM" | |
| 462 Lloci_AA = os.listdir(Path_IN_loci_AA) | |
| 463 | |
| 464 ## 3 ## PathOUT | |
| 465 | |
| 466 ## 3.1 ## PROT composition | |
| 467 fileOUT_PROT_ALL=open("./OUT/13_prot_compositions_All_AA.csv","w") | |
| 468 fileOUT_PROT_ALL.write("LOCUS,") | |
| 469 for taxa in LT: | |
| 470 fileOUT_PROT_ALL.write("%s_prop_K,%s_prop_R,%s_prop_A,%s_prop_F,%s_prop_I,%s_prop_L,%s_prop_M,%s_prop_V,%s_prop_W,%s_prop_N,%s_prop_Q,%s_prop_S,%s_prop_T,%s_prop_H,%s_prop_Y,%s_prop_C,%s_prop_D,%s_prop_E,%s_prop_P,%s_prop_G," %(taxa,taxa,taxa,taxa,taxa,taxa,taxa,taxa,taxa,taxa,taxa,taxa,taxa,taxa,taxa,taxa,taxa,taxa,taxa,taxa)) | |
| 471 fileOUT_PROT_ALL.write("\n") | |
| 472 | |
| 473 ## 3.2 ## PROT IVYWREL | |
| 474 fileOUT_IVYWREL=open("./OUT/14_IVYWREL.csv","w") | |
| 475 fileOUT_IVYWREL.write("LOCUS,") | |
| 476 for taxa in LT: | |
| 477 fileOUT_IVYWREL.write("%s_count_IVYWREL,%s_prop_IVYWREL," %(taxa,taxa)) | |
| 478 fileOUT_IVYWREL.write("\n") | |
| 479 | |
| 480 | |
| 481 ## 3.3 ## PROT ERK_DNQTSHA | |
| 482 fileOUT_ERK_DNQTSH=open("./OUT/15_ERK_DNQTSH.csv","w") | |
| 483 fileOUT_ERK_DNQTSH.write("LOCUS,") | |
| 484 for taxa in LT: | |
| 485 fileOUT_ERK_DNQTSH.write("%s_count_ERK,%s_prop_ERK,%s_count_DNQTSH,%s_prop_DNQTSH,%s_ratio_ERK_vs_DNQTSH," %(taxa,taxa,taxa,taxa,taxa)) | |
| 486 fileOUT_ERK_DNQTSH.write("\n") | |
| 487 | |
| 488 ## 3.4 ## PROT EK_QH | |
| 489 fileOUT_EK_QH=open("./OUT/16_EK_QH.csv","w") | |
| 490 fileOUT_EK_QH.write("LOCUS,") | |
| 491 for taxa in LT: | |
| 492 fileOUT_EK_QH.write("%s_count_EK,%s_prop_EK,%s_count_QH,%s_prop_QH,%s_ratio_EK_vs_QH," %(taxa,taxa,taxa,taxa,taxa)) | |
| 493 fileOUT_EK_QH.write("\n") | |
| 494 | |
| 495 | |
| 496 ## 3.5 ## PROT FYMINK_GARP | |
| 497 fileOUT_FYMINK_GARP=open("./OUT/17_FYMINK_GARP.csv","w") | |
| 498 fileOUT_FYMINK_GARP.write("LOCUS,") | |
| 499 for taxa in LT: | |
| 500 fileOUT_FYMINK_GARP.write("%s_count_FYMINK,%s_prop_FYMINK,%s_count_GARP,%s_prop_GARP," %(taxa,taxa,taxa,taxa)) | |
| 501 fileOUT_FYMINK_GARP.write("\n") | |
| 502 | |
| 503 | |
| 504 ## 3.6 ## PROT AVLIMFYW | |
| 505 fileOUT_AVLIMFYW=open("./OUT/18_AVLIMFYW.csv","w") | |
| 506 fileOUT_AVLIMFYW.write("LOCUS,") | |
| 507 for taxa in LT: | |
| 508 fileOUT_AVLIMFYW.write("%s_count_AVLIMFYW,%s_prop_AVLIMFYW,%s_count_AVLIM,%s_prop_AVLIM,%s_count_FYW,%s_prop_FYW," %(taxa,taxa,taxa,taxa,taxa,taxa)) | |
| 509 fileOUT_AVLIMFYW.write("\n") | |
| 510 | |
| 511 ## 3.7 ## PROT STNQ | |
| 512 fileOUT_STNQ=open("./OUT/19_STNQ.csv","w") | |
| 513 fileOUT_STNQ.write("LOCUS,") | |
| 514 for taxa in LT: | |
| 515 fileOUT_STNQ.write("%s_count_STNQ,%s_prop_STNQ," %(taxa,taxa)) | |
| 516 fileOUT_STNQ.write("\n") | |
| 517 | |
| 518 ## 3.8 ## PROT RHKDE | |
| 519 fileOUT_RHKDE=open("./OUT/20_RHKDE.csv","w") | |
| 520 fileOUT_RHKDE.write("LOCUS,") | |
| 521 for taxa in LT: | |
| 522 fileOUT_RHKDE.write("%s_count_RHKDE,%s_prop_RHKDE,%s_count_RHK,%s_prop_RHK,%s_count_DE,%s_prop_DE," %(taxa,taxa,taxa,taxa,taxa,taxa)) | |
| 523 fileOUT_RHKDE.write("\n") | |
| 524 | |
| 525 ## 3.9 ## PROT DIDER CRITERIA | |
| 526 fileOUT_PAYRE=open("./OUT/21_PAYRE-MVGDS.csv","w") | |
| 527 fileOUT_PAYRE.write("LOCUS,") | |
| 528 for taxa in LT: | |
| 529 fileOUT_PAYRE.write("%s_count_PAYRE,%s_prop_PAYRE,%s_count_AC,%s_prop_AC,%s_count_MVGDS,%s_prop_MVGDS,%s_ratio_PAYRE_vs_MVGDS,%s_ratio_AC_vs_MVGDS," %(taxa,taxa,taxa,taxa,taxa,taxa,taxa,taxa)) | |
| 530 fileOUT_PAYRE.write("\n") | |
| 531 | |
| 532 ## 3.10 ## PROT Total residue weight | |
| 533 fileOUT_TotalResidueWeight=open("./OUT/22_TotalResidueWeight.csv","w") | |
| 534 fileOUT_TotalResidueWeight.write("LOCUS,") | |
| 535 for taxa in LT: | |
| 536 fileOUT_TotalResidueWeight.write("%s_Total_Residue_Weight," %taxa) | |
| 537 fileOUT_TotalResidueWeight.write("\n") | |
| 538 | |
| 539 ## 3.11 ## PROT Total residue volume | |
| 540 fileOUT_TotalResidueVolume=open("./OUT/23_TotalResidueVolume.csv","w") | |
| 541 fileOUT_TotalResidueVolume.write("LOCUS,") | |
| 542 for taxa in LT: | |
| 543 fileOUT_TotalResidueVolume.write("%s_Total_Residue_Volume," %taxa) | |
| 544 fileOUT_TotalResidueVolume.write("\n") | |
| 545 | |
| 546 ## 3.12 ## PROT Total partial specific volume | |
| 547 fileOUT_TotalPartialSpecificVolume=open("./OUT/24_TotalPartialSpecificVolume.csv","w") | |
| 548 fileOUT_TotalPartialSpecificVolume.write("LOCUS,") | |
| 549 for taxa in LT: | |
| 550 fileOUT_TotalPartialSpecificVolume.write("%s_Total_Partial_Specific_Volume," %taxa) | |
| 551 fileOUT_TotalPartialSpecificVolume.write("\n") | |
| 552 | |
| 553 ## 3.13 ## PROT Total hydratation | |
| 554 fileOUT_TotalHydratation=open("./OUT/25_TotalHydratation.csv","w") | |
| 555 fileOUT_TotalHydratation.write("LOCUS,") | |
| 556 for taxa in LT: | |
| 557 fileOUT_TotalHydratation.write("%s_Total_Hydratation," %taxa) | |
| 558 fileOUT_TotalHydratation.write("\n") | |
| 559 | |
| 560 | |
| 561 ##################### | |
| 562 ## 4 ## Process Loci | |
| 563 ##################### | |
| 564 bash_aa_properties = aa_properties(fileIN_properties) | |
| 565 | |
| 566 for locus in Lloci_AA: | |
| 567 print locus | |
| 568 path_locus = "%s/%s" %(Path_IN_loci_AA, locus) | |
| 569 bash = dico(path_locus,LT) | |
| 570 | |
| 571 #print bash | |
| 572 | |
| 573 fileOUT_PROT_ALL.write("%s," %locus) | |
| 574 fileOUT_IVYWREL.write("%s," %locus) | |
| 575 fileOUT_ERK_DNQTSH.write("%s," %locus) | |
| 576 fileOUT_EK_QH.write("%s," %locus) | |
| 577 fileOUT_FYMINK_GARP.write("%s," %locus) | |
| 578 fileOUT_AVLIMFYW.write("%s," %locus) | |
| 579 fileOUT_STNQ.write("%s," %locus) | |
| 580 fileOUT_RHKDE.write("%s," %locus) | |
| 581 fileOUT_PAYRE.write("%s," %locus) | |
| 582 fileOUT_TotalResidueWeight.write("%s," %locus) | |
| 583 fileOUT_TotalResidueVolume.write("%s," %locus) | |
| 584 fileOUT_TotalPartialSpecificVolume.write("%s," %locus) | |
| 585 fileOUT_TotalHydratation.write("%s," %locus) | |
| 586 | |
| 587 for taxa in LT: | |
| 588 if taxa in bash.keys(): | |
| 589 seq = bash[taxa] | |
| 590 prop_K,prop_R,prop_A,prop_F,prop_I,prop_L,prop_M,prop_V,prop_W,prop_N,prop_Q,prop_S,prop_T,prop_H,prop_Y,prop_C,prop_D,prop_E,prop_P,prop_G = aa_composition1(seq) ### DEF3 ### | |
| 591 count_IVYWREL,prop_IVYWREL,count_ERK,prop_ERK,count_DNQTSH,prop_DNQTSH,ratio_ERK_vs_DNQTSH,count_EK,prop_EK,count_QH,prop_QH,ratio_EK_vs_QH,count_FYMINK,prop_FYMINK,count_GARP,prop_GARP,count_AVLIMFYW,prop_AVLIMFYW,count_AVLIM,prop_AVLIM,count_FYW,prop_FYW,count_STNQ,prop_STNQ, count_MVGDS,prop_MVGDS, count_PAYRE,prop_PAYRE, count_AC,prop_AC, ratio_PAYRE_vs_MVGDS, ratio_AC_vs_MVGDS,count_RHKDE,prop_RHKDE,count_RHK,prop_RHK,count_DE,prop_DE = aa_composition2(seq) ### DEF4 ### | |
| 592 Total_Residue_Weight,Total_Residue_Volume,Total_Partial_Specific_Volume,Total_Hydration = sequence_properties_from_aa_properties(seq, bash_aa_properties) ### DEF6 ### | |
| 593 | |
| 594 fileOUT_PROT_ALL.write("%.5f,%.5f,%.5f,%.5f,%.5f,%.5f,%.5f,%.5f,%.5f,%.5f,%.5f,%.5f,%.5f,%.5f,%.5f,%.5f,%.5f,%.5f,%.5f,%.5f," %(prop_K,prop_R,prop_A,prop_F,prop_I,prop_L,prop_M,prop_V,prop_W,prop_N,prop_Q,prop_S,prop_T,prop_H,prop_Y,prop_C,prop_D,prop_E,prop_P,prop_G)) | |
| 595 fileOUT_IVYWREL.write("%.5f,%.5f," %(count_IVYWREL, prop_IVYWREL)) | |
| 596 fileOUT_ERK_DNQTSH.write("%.5f,%.5f,%.5f,%.5f,%.5f," %(count_ERK,prop_ERK,count_DNQTSH,prop_DNQTSH,ratio_ERK_vs_DNQTSH)) | |
| 597 fileOUT_EK_QH.write("%.5f,%.5f,%.5f,%.5f,%.5f," %(count_EK,prop_EK,count_QH,prop_QH,ratio_EK_vs_QH)) | |
| 598 fileOUT_FYMINK_GARP.write("%.5f,%.5f,%.5f,%.5f," %(count_FYMINK,prop_FYMINK,count_GARP,prop_GARP)) | |
| 599 fileOUT_AVLIMFYW.write("%.5f,%.5f,%.5f,%.5f,%.5f,%.5f," %(count_AVLIMFYW,prop_AVLIMFYW,count_AVLIM,prop_AVLIM,count_FYW,prop_FYW)) | |
| 600 fileOUT_STNQ.write("%.5f,%.5f," %(count_STNQ,prop_STNQ)) | |
| 601 fileOUT_RHKDE.write("%.5f,%.5f,%.5f,%.5f,%.5f,%.5f,"%(count_RHKDE,prop_RHKDE,count_RHK,prop_RHK,count_DE,prop_DE)) | |
| 602 fileOUT_PAYRE.write("%.5f,%.5f,%.5f,%.5f,%.5f,%.5f,%.5f,%.5f," %(count_PAYRE,prop_PAYRE,count_AC,prop_AC,count_MVGDS,prop_MVGDS,ratio_PAYRE_vs_MVGDS,ratio_AC_vs_MVGDS)) | |
| 603 fileOUT_TotalResidueWeight.write("%.5f," %Total_Residue_Weight) | |
| 604 fileOUT_TotalResidueVolume.write("%.5f," %Total_Residue_Volume) | |
| 605 fileOUT_TotalPartialSpecificVolume.write("%.5f," %(Total_Partial_Specific_Volume)) | |
| 606 fileOUT_TotalHydratation.write("%.5f," % Total_Hydration) | |
| 607 | |
| 608 ## END LINE | |
| 609 fileOUT_PROT_ALL.write("\n") | |
| 610 fileOUT_IVYWREL.write("\n") | |
| 611 fileOUT_ERK_DNQTSH.write("\n") | |
| 612 fileOUT_EK_QH.write("\n") | |
| 613 fileOUT_FYMINK_GARP.write("\n") | |
| 614 fileOUT_AVLIMFYW.write("\n") | |
| 615 fileOUT_STNQ.write("\n") | |
| 616 fileOUT_RHKDE.write("\n") | |
| 617 fileOUT_PAYRE.write("\n") | |
| 618 fileOUT_TotalResidueWeight.write("\n") | |
| 619 fileOUT_TotalResidueVolume.write("\n") | |
| 620 fileOUT_TotalPartialSpecificVolume.write("\n") | |
| 621 fileOUT_TotalHydratation.write("\n") | |
| 622 | |
| 623 | |
| 624 | |
| 625 |
