comparison scripts/S02b_study_seq_composition_nuc.py @ 2:988467f963f0 draft

planemo upload for repository https://github.com/abims-sbr/adaptearch commit cf1b9c905931ca2ca25faa4844d45c908756472f
author abims-sbr
date Wed, 17 Jan 2018 08:57:49 -0500
parents 8de21b6eb110
children 5766f80370e7
comparison
equal deleted inserted replaced
1:8de21b6eb110 2:988467f963f0
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 ## Author: Eric FONTANILLAS 2 ## Author: Eric FONTANILLAS
3 ## Date: 21.12.10 3 ## Date: 21.12.10
4 ## Last Version : 12/2017 by Victor Mataigne
4 ## Object: Test for compositional bias in genome and proteome as marker of thermal adaptation (comparison between 2 "hot" species: Ap and Ps and one "cold" species: Pg) 5 ## Object: Test for compositional bias in genome and proteome as marker of thermal adaptation (comparison between 2 "hot" species: Ap and Ps and one "cold" species: Pg)
5 6
6
7 import sys,os,shutil,subprocess, string 7 import sys,os,shutil,subprocess, string
8 8 from functions import simplify_fasta_name, dico
9 #############
10 ### DEF 0 ###
11
12 def simplify_fasta_name(fasta_name,LT):
13 for abbreviation in LT:
14 if abbreviation in fasta_name:
15 new_fasta_name = abbreviation
16
17 return(new_fasta_name)
18 ##########################################
19
20 ###########
21 ## DEF1 ##
22 ###########
23 ## Generates bash, with key = fasta name; value = sequence (WITH GAP, IF ANY, REMOVED IN THIS FUNCTION)
24
25 def dico(fasta_file,LT):
26
27 count_fastaName=0
28 F1 = open(fasta_file, "r")
29
30 bash1 = {}
31 while 1:
32 nextline = F1.readline()
33 #print nextline
34 if not nextline :
35 break
36
37 if nextline[0] == ">":
38 count_fastaName = count_fastaName + 1
39 fasta_name = nextline[1:-1]
40 nextline = F1.readline()
41 sequence = nextline[:-1]
42
43 if fasta_name not in bash1.keys():
44 fasta_name = simplify_fasta_name(fasta_name,LT) ### DEF 0 ###
45 bash1[fasta_name] = sequence
46 else:
47 print fasta_name
48
49 # Find alignment length
50 kk = bash1.keys()
51 key0 = kk[0]
52 seq0 = bash1[key0]
53 ln_seq = len(seq0)
54
55 F1.close()
56
57 return(bash1)
58 #####################################
59
60
61 9
62 ################## 10 ##################
63 ###### DEF2 ###### 11 ###### DEF2 ######
64 ################## 12 ##################
65 def base_composition(seq): 13 def base_composition(seq):
203 ## 4 ## Process Loci 151 ## 4 ## Process Loci
204 ##################### 152 #####################
205 for locus in Lloci_NUC: 153 for locus in Lloci_NUC:
206 print locus 154 print locus
207 path_locus = "%s/%s" %(Path_IN_loci_NUC, locus) 155 path_locus = "%s/%s" %(Path_IN_loci_NUC, locus)
208 bash = dico(path_locus,LT) 156 bash = dico(path_locus,LT)
209 157
210 fileOUT_NUC.write("%s," %locus) 158 fileOUT_NUC.write("%s," %locus)
211 fileOUT_percent_GC.write("%s," %locus) 159 fileOUT_percent_GC.write("%s," %locus)
212 fileOUT_percent_purine.write("%s," %locus) 160 fileOUT_percent_purine.write("%s," %locus)
213 fileOUT_Purine_Load.write("%s," %locus) 161 fileOUT_Purine_Load.write("%s," %locus)
214 162
215 if taxa in bash.keys(): 163 for taxa in LT:
216 seq = bash[taxa] 164 if taxa in bash.keys():
217 percent_GC, percent_purine,prop_A, prop_T, prop_C, prop_G = base_composition(seq) ### DEF2 ### 165 seq = bash[taxa]
218 TOTAL, DIFF_GC, DIFF_AT,PLI_GC,PLI_AT,PLI_GC_1000,PLI_AT_1000 = purine_loading(seq) ### DEF3 ### 166 percent_GC, percent_purine,prop_A, prop_T, prop_C, prop_G = base_composition(seq) ### DEF2 ###
219 fileOUT_NUC.write("%.5f,%.5f,%.5f,%.5f," %(prop_A,prop_T,prop_C,prop_G)) 167 TOTAL, DIFF_GC, DIFF_AT,PLI_GC,PLI_AT,PLI_GC_1000,PLI_AT_1000 = purine_loading(seq) ### DEF3 ###
220 fileOUT_percent_GC.write("%.5f," %percent_GC) 168 fileOUT_NUC.write("%.5f,%.5f,%.5f,%.5f," %(prop_A,prop_T,prop_C,prop_G))
221 fileOUT_percent_purine.write("%.5f," %percent_purine) 169 fileOUT_percent_GC.write("%.5f," %percent_GC)
222 fileOUT_Purine_Load.write("%d,%d,%d,%.5f,%.5f," %(TOTAL, DIFF_GC, DIFF_AT,PLI_GC_1000, PLI_AT_1000)) 170 fileOUT_percent_purine.write("%.5f," %percent_purine)
171 fileOUT_Purine_Load.write("%d,%d,%d,%.5f,%.5f," %(TOTAL, DIFF_GC, DIFF_AT,PLI_GC_1000, PLI_AT_1000))
172 else:
173 fileOUT_NUC.write("%s,%s,%s,%s," %("n.a","n.a","n.a","n.a"))
174 fileOUT_percent_GC.write("%s," %"n.a")
175 fileOUT_percent_purine.write("%s," %"n.a")
176 fileOUT_Purine_Load.write("%s,%s,%s,%s,%s," %("n.a","n.a","n.a","n.a","n.a"))
177
223 fileOUT_NUC.write("\n") 178 fileOUT_NUC.write("\n")
224 fileOUT_percent_GC.write("\n") 179 fileOUT_percent_GC.write("\n")
225 fileOUT_percent_purine.write("\n") 180 fileOUT_percent_purine.write("\n")
226 fileOUT_Purine_Load.write("\n") 181 fileOUT_Purine_Load.write("\n")
227 fileOUT_NUC.close() 182 fileOUT_NUC.close()
228 fileOUT_percent_GC.close() 183 fileOUT_percent_GC.close()
229 fileOUT_percent_purine.close() 184 fileOUT_percent_purine.close()
230 fileOUT_Purine_Load.close() 185 fileOUT_Purine_Load.close()
231