comparison scripts/S02b_study_seq_composition_nuc.py @ 0:78dd6454f6f0 draft

planemo upload for repository htpps://github.com/abims-sbr/adaptearch commit 73670b26c75bb6c1a6332481920f3036314de364
author abims-sbr
date Tue, 02 May 2017 04:20:51 -0400
parents
children 8de21b6eb110
comparison
equal deleted inserted replaced
-1:000000000000 0:78dd6454f6f0
1 #!/usr/bin/env python
2 ## Author: Eric FONTANILLAS
3 ## Date: 21.12.10
4 ## Object: Test for compositional bias in genome and proteome as marker of thermal adaptation (comparison between 2 "hot" species: Ap and Ps and one "cold" species: Pg)
5
6
7 import sys,os,zipfile,shutil,subprocess
8
9 #############
10 ### DEF 0 ###
11 #############import sys,os,zipfile
12 def simplify_fasta_name(fasta_name,LT):
13 for abbreviation in LT:
14 if abbreviation in fasta_name:
15 new_fasta_name = abbreviation
16
17 return(new_fasta_name)
18 ##########################################
19
20 ###########
21 ## DEF1 ##
22 ###########
23 ## Generates bash, with key = fasta name; value = sequence (WITH GAP, IF ANY, REMOVED IN THIS FUNCTION)
24
25 def dico(fasta_file,LT):
26
27 count_fastaName=0
28 F1 = open(fasta_file, "r")
29
30 bash1 = {}
31 while 1:
32 nextline = F1.readline()
33 #print nextline
34 if not nextline :
35 break
36
37 if nextline[0] == ">":
38 count_fastaName = count_fastaName + 1
39 fasta_name = nextline[1:-1]
40 nextline = F1.readline()
41 sequence = nextline[:-1]
42
43 if fasta_name not in bash1.keys():
44 fasta_name = simplify_fasta_name(fasta_name,LT) ### DEF 0 ###
45 bash1[fasta_name] = sequence
46 else:
47 print fasta_name
48
49 # Find alignment length
50 kk = bash1.keys()
51 key0 = kk[0]
52 seq0 = bash1[key0]
53 ln_seq = len(seq0)
54
55 F1.close()
56
57 return(bash1)
58 #####################################
59
60
61
62 ##################
63 ###### DEF2 ######
64 ##################
65 def base_composition(seq):
66 count_A=string.count(seq, "A") + string.count(seq, "a")
67 count_T=string.count(seq, "T") + string.count(seq, "t")
68 count_C=string.count(seq, "C") + string.count(seq, "c")
69 count_G=string.count(seq, "G") + string.count(seq, "g")
70 ## 3 ## Nucleotide proportions
71 ln = count_C+count_G+count_T+count_A
72 if (ln!=0):
73
74 CG = count_C+count_G
75 AT = count_T+count_A
76
77 AG = count_A+count_G
78 TC = count_T+count_C
79
80 ## 1 ## Search for compositional bias in genome as marker of thermal adaptation: CG vs AT
81 ratio_CG_AT = float(CG)/float(AT)
82 percent_CG = float(CG)/(float(AT) + float(CG))*100
83
84 ## 2 ## Search for compositional bias in genome as marker of thermal adaptation: AG vs TC
85 ratio_purine_pyrimidine=float(AG)/float(TC)
86 percent_purine=float(AG)/(float(AG)+float(TC))*100
87
88
89 prop_A = float(count_A)/float(ln)
90 prop_T = float(count_T)/float(ln)
91 prop_C = float(count_C)/float(ln)
92 prop_G = float(count_G)/float(ln)
93 else:
94 percent_CG=0
95 percent_purine=0
96 prop_A=0
97 prop_T=0
98 prop_C=0
99 prop_G=0
100
101 return(percent_CG, percent_purine, prop_A, prop_T, prop_C, prop_G)
102 ##############################################
103
104 ##################
105 ###### DEF3 ######
106 ##################
107 def purine_loading(seq):
108 count_A=string.count(seq, "A") + string.count(seq, "a")
109 count_T=string.count(seq, "T") + string.count(seq, "t")
110 count_C=string.count(seq, "C") + string.count(seq, "c")
111 count_G=string.count(seq, "G") + string.count(seq, "g")
112 ## 3 ## Nucleotide proportions
113 TOTAL = count_C+count_G+count_T+count_A
114 if (TOTAL!=0):
115
116 ## PLI : Purine loading indice (Forsdyke et al.)
117 # (G-C)/N * 1000 et (A-T)/N * 1000
118
119 DIFF_GC = count_G - count_C
120 DIFF_AT = count_A - count_T
121
122 # Per bp
123 PLI_GC = float(DIFF_GC)/float(TOTAL)
124 PLI_AT = float(DIFF_AT)/float(TOTAL)
125
126 # Per 1000 bp
127 PLI_GC_1000 = PLI_GC*1000
128 PLI_AT_1000 = PLI_AT*1000
129 else:
130 DIFF_GC=0
131 DIFF_AT=0
132 PLI_GC=0
133 PLI_AT=0
134 PLI_GC_1000=0
135 PLI_AT_1000=0
136
137 return(TOTAL, DIFF_GC, DIFF_AT,PLI_GC,PLI_AT,PLI_GC_1000,PLI_AT_1000)
138 ##############################################
139
140 ###################
141 ### RUN RUN RUN ###
142 ###################
143 import string, os,sys,zipfile
144
145
146
147
148
149 ##Create specific folders
150 Path_IN_loci_NUC = "./IN_NUC"
151 outpath= "./OUT"
152 os.makedirs(Path_IN_loci_NUC)
153 os.makedirs(outpath)
154
155
156
157
158 #Check if the file is a zip or fasta file
159
160 the_zip_file = zipfile.ZipFile(sys.argv[1])
161 ret = the_zip_file.testzip()
162
163 if ret is not None:
164 shutil.copy2(sys.argv[1], './IN_NUC/input.fasta')
165 else:
166 cmd="unzip %s -d ./IN_NUC"%(sys.argv[1])
167 os.system(cmd)
168
169
170 ## 1 ## List taxa
171 LT=[]
172 cmd="grep '>' %s" % sys.argv[2]
173 result = subprocess.check_output(cmd, shell=True)
174 result=result.split('\n')
175 for i in result:
176 sp=i[1:]
177 if sp !='':
178 LT.append(sp)
179 print LT
180
181 #LT = ["Ap", "Ac", "Pg"]
182 #os.system("grep '>' %s" %(sys.argv[1]))
183
184 ## 2 ## PathIN
185 # fileIN_properties = open("01_AminoAcid_Properties2.csv", "r")
186 Lloci_NUC = os.listdir(Path_IN_loci_NUC)
187
188
189 ## 3 ## PathOUT
190 ## 3.1 ## NUC composition
191 fileOUT_NUC=open("./OUT/10_nuc_compositions.csv","w")
192 fileOUT_NUC.write("LOCUS,")
193 for taxa in LT:
194 fileOUT_NUC.write("%s_prop_A,%s_prop_T,%s_prop_C,%s_prop_G," %(taxa,taxa,taxa,taxa))
195 fileOUT_NUC.write("\n")
196
197 ## 3.2 ## NUC percent_GC
198 fileOUT_percent_GC=open("./OUT/11_percent_GC.csv","w")
199 fileOUT_percent_GC.write("LOCUS,")
200 for taxa in LT:
201 fileOUT_percent_GC.write("%s_percent_GC," %(taxa))
202 fileOUT_percent_GC.write("\n")
203
204 ## 3.3 ## NUC percent_purine
205 fileOUT_percent_purine=open("./OUT/12_percent_purine.csv","w")
206 fileOUT_percent_purine.write("LOCUS,")
207 for taxa in LT:
208 fileOUT_percent_purine.write("%s_percent_purine," %(taxa))
209 fileOUT_percent_purine.write("\n")
210
211 ## 3.4 ## Purine Load
212 fileOUT_Purine_Load=open("./OUT/12_Purine_Load_Indice.csv", "w")
213 fileOUT_Purine_Load.write("LOCUS,")
214 for taxa in LT:
215 fileOUT_Purine_Load.write("%s_TOTAL,%s_DIFF_GC,%s_DIFF_AT,%s_PLI_GC1000,%s_PLI_AT1000," %(taxa,taxa,taxa,taxa,taxa))
216 fileOUT_Purine_Load.write("\n")
217
218 #####################
219 ## 4 ## Process Loci
220 #####################
221 for locus in Lloci_NUC:
222 print locus
223 path_locus = "%s/%s" %(Path_IN_loci_NUC, locus)
224 bash = dico(path_locus,LT)
225
226 fileOUT_NUC.write("%s," %locus)
227 fileOUT_percent_GC.write("%s," %locus)
228 fileOUT_percent_purine.write("%s," %locus)
229 fileOUT_Purine_Load.write("%s," %locus)
230 #print bash
231 for taxa in LT:
232 print taxa
233 if taxa in bash.keys():
234 seq = bash[taxa]
235 percent_GC, percent_purine,prop_A, prop_T, prop_C, prop_G = base_composition(seq) ### DEF2 ###
236 TOTAL, DIFF_GC, DIFF_AT,PLI_GC,PLI_AT,PLI_GC_1000,PLI_AT_1000 = purine_loading(seq) ### DEF3 ###
237 fileOUT_NUC.write("%.5f,%.5f,%.5f,%.5f," %(prop_A,prop_T,prop_C,prop_G))
238 fileOUT_percent_GC.write("%.5f," %percent_GC)
239 fileOUT_percent_purine.write("%.5f," %percent_purine)
240 fileOUT_Purine_Load.write("%d,%d,%d,%.5f,%.5f," %(TOTAL, DIFF_GC, DIFF_AT,PLI_GC_1000, PLI_AT_1000))
241 fileOUT_NUC.write("\n")
242 fileOUT_percent_GC.write("\n")
243 fileOUT_percent_purine.write("\n")
244 fileOUT_Purine_Load.write("\n")
245 fileOUT_NUC.close()
246 fileOUT_percent_GC.close()
247 fileOUT_percent_purine.close()
248 fileOUT_Purine_Load.close()