Mercurial > repos > abims-sbr > concatphyl
comparison scripts/S01_concatenate.py @ 1:f181bd945a6c draft
planemo upload for repository https://github.com/abims-sbr/adaptsearch commit ab76075e541dd7ece1090f6b55ca508ec0fde39d
author | lecorguille |
---|---|
date | Thu, 13 Apr 2017 09:48:47 -0400 |
parents | |
children | 1f8d039bd241 |
comparison
equal
deleted
inserted
replaced
0:6d930f037fea | 1:f181bd945a6c |
---|---|
1 #!/usr/bin/python | |
2 ## Author: Eric Fontanillas | |
3 ## Last modification: 17/06/2011 | |
4 ## Subject: find and remove indels | |
5 | |
6 | |
7 ############################### | |
8 ##### DEF 0 : Dico fasta ##### | |
9 ############################### | |
10 def dico(F2): | |
11 dicoco = {} | |
12 while 1: | |
13 next2 = F2.readline() | |
14 if not next2: | |
15 break | |
16 if next2[0] == ">": | |
17 fasta_name_query = next2[:-1] | |
18 Sn = string.split(fasta_name_query, "||") | |
19 fasta_name_query = Sn[0] | |
20 next3 = F2.readline() | |
21 fasta_seq_query = next3[:-1] | |
22 dicoco[fasta_name_query]=fasta_seq_query | |
23 return(dicoco) | |
24 ################################################################################### | |
25 | |
26 | |
27 #################### | |
28 ###### DEF 11 ###### | |
29 #################### | |
30 ## Concatenate sequences | |
31 ########################### | |
32 def concatenate(folder_with_loci, SPECIES_ID_LIST): | |
33 ## 4 ## Process files | |
34 ## 4.1 ## Create the bash and the fasta names entries (name of the species) | |
35 bash_concat = {} | |
36 | |
37 for species_ID in SPECIES_ID_LIST: | |
38 bash_concat[species_ID] = '' | |
39 | |
40 ln_concat = 0 | |
41 nb_locus = 0 | |
42 pos=1 | |
43 list_genes_position=[] | |
44 ## 4.2 ## Concatenate | |
45 for file in L_IN: | |
46 nb_locus=nb_locus+1 | |
47 | |
48 ## a ## Open alignments | |
49 file_IN = open("%s/%s" %(folder_with_loci, file), "r") | |
50 dico_seq = dico(file_IN) ### DEF 0 ### | |
51 file_IN.close() | |
52 ## b ## Get alignment length + genes positions for RAxML | |
53 key0 = dico_seq.keys()[0] | |
54 ln = len(dico_seq[key0]) | |
55 ln_concat = ln_concat + ln | |
56 | |
57 pos_start = pos | |
58 pos_end = pos+ln-1 | |
59 pos=pos_end+1 | |
60 position="%d-%d" %(pos_start, pos_end) | |
61 RAxML_name = file[:-6] | |
62 sublist = [RAxML_name, position] | |
63 list_genes_position.append(sublist) | |
64 | |
65 ## c ## Generate "empty" sequence with alignment length * "-" | |
66 empty_seq = "-" * ln | |
67 | |
68 ## d ## Concatenate | |
69 ## d.1 ## Detect missing species in this alignment | |
70 list_ID=[] | |
71 list_absent_ID=[] | |
72 bash_fastaName={} | |
73 for fasta_name in dico_seq: | |
74 ID = fasta_name[1:3] | |
75 list_ID.append(ID) | |
76 seq = dico_seq[fasta_name] | |
77 bash_fastaName[ID]=fasta_name | |
78 for sp_ID in SPECIES_ID_LIST: | |
79 if sp_ID not in list_ID: | |
80 list_absent_ID.append(sp_ID) | |
81 | |
82 for ID in SPECIES_ID_LIST: | |
83 if ID in list_absent_ID: | |
84 bash_concat[ID] = bash_concat[ID] + empty_seq | |
85 else: | |
86 fasta_name = bash_fastaName[ID] | |
87 seq = dico_seq[fasta_name] | |
88 bash_concat[ID] = bash_concat[ID] + seq | |
89 | |
90 return(bash_concat, ln_concat, nb_locus, list_genes_position) | |
91 #################################### | |
92 | |
93 | |
94 ######################################## | |
95 ##### DEF 12 : get codon position ##### | |
96 ######################################## | |
97 def get_codon_position(seq_inORF): | |
98 | |
99 ln = len(seq_inORF) | |
100 | |
101 i=0 | |
102 seq_pos1="" | |
103 seq_pos2="" | |
104 seq_pos12="" | |
105 seq_pos3="" | |
106 while i<ln: | |
107 pos1 = seq_inORF[i] | |
108 pos2 = seq_inORF[i+1] | |
109 pos3 = seq_inORF[i+2] | |
110 | |
111 seq_pos1 = seq_pos1 + pos1 | |
112 seq_pos2 = seq_pos2 + pos2 | |
113 seq_pos12 = seq_pos12 + pos1 + pos2 | |
114 seq_pos3 = seq_pos3 + pos3 | |
115 | |
116 i = i+3 | |
117 | |
118 return(seq_pos1, seq_pos2, seq_pos12, seq_pos3) | |
119 ############################################################################### | |
120 | |
121 | |
122 | |
123 ####################### | |
124 ##### RUN RUN RUN ##### | |
125 ####################### | |
126 import string, os, time, re, sys, zipfile | |
127 | |
128 list_species = [] | |
129 SPECIES_ID_LIST = [] | |
130 fasta = "^.*fasta$" | |
131 i=3 | |
132 | |
133 ## add file to list_species | |
134 zfile = zipfile.ZipFile(sys.argv[1]) | |
135 for name in zfile.namelist() : | |
136 list_species.append(name) | |
137 | |
138 ## in SPECIES_ID_LIST, only the 2 first letters of name of species | |
139 for name in list_species : | |
140 name = name[:2] | |
141 SPECIES_ID_LIST.append(name) | |
142 | |
143 | |
144 ### 1 ### Proteic | |
145 if sys.argv[2] == "proteic" : | |
146 os.mkdir("02_CDS_No_Missing_Data_aa") | |
147 zfile_nuc = zipfile.ZipFile(sys.argv[3]) | |
148 for name in zfile_nuc.namelist() : | |
149 zfile_nuc.extract(name, "./02_CDS_No_Missing_Data_aa") | |
150 path_IN = "./02_CDS_No_Missing_Data_aa" | |
151 L_IN = os.listdir(path_IN) | |
152 | |
153 OUT1 = open("02_Concatenation_aa.fas", "w") | |
154 OUT2 = open("02_Concatenation_aa.phy", "w") | |
155 OUT3 = open("02_Concatenation_aa.nex", "w") | |
156 OUT_PARTITION_gene_AA = open("06_partitions_gene_AA","w") | |
157 | |
158 | |
159 ## Get bash with concatenation | |
160 bash_concatenation, ln, nb_locus,list_genes_position= concatenate(path_IN, SPECIES_ID_LIST) ### DEF 11 ## | |
161 | |
162 ## Write gene AA partition file for RAxML | |
163 for sublist in list_genes_position: | |
164 name = sublist[0] | |
165 positions=sublist[1] | |
166 OUT_PARTITION_gene_AA.write("DNA,%s=%s\n"%(name,positions)) | |
167 OUT_PARTITION_gene_AA.close() | |
168 | |
169 ## Get "ntax" for NEXUS HEADER | |
170 nb_taxa = len(bash_concatenation.keys()) | |
171 | |
172 print "******************** CONCATENATION ********************\n" | |
173 print "Process amino-acid concatenation:" | |
174 print "\tNumber of taxa aligned = %d" %nb_taxa | |
175 print "\tNumber of loci concatenated = %d\n" %nb_locus | |
176 print "\tTotal length of the concatenated sequences = %d" %ln | |
177 | |
178 | |
179 ## Print NEXUS HEADER: | |
180 OUT3.write("#NEXUS\n\n") | |
181 OUT3.write("Begin data;\n") | |
182 OUT3.write("\tDimensions ntax=%d nchar=%d;\n" %(nb_taxa, ln)) | |
183 OUT3.write("\tFormat datatype=aa gap=-;\n") | |
184 OUT3.write("\tMatrix\n") | |
185 | |
186 ## Print PHYLIP HEADER: | |
187 OUT2.write(" %d %d\n" %(nb_taxa, ln)) | |
188 | |
189 ## 3.5 ## Print outputs | |
190 for seq_name in bash_concatenation.keys(): | |
191 seq = bash_concatenation[seq_name] | |
192 | |
193 ## Filtering the sequence in case of remaining "?" | |
194 seq = string.replace(seq, "?", "-") | |
195 | |
196 #print seq FASTA FORMAT | |
197 OUT1.write(">%s\n" %seq_name) | |
198 OUT1.write("%s\n" %seq) | |
199 | |
200 #print seq PHYLIP FORMAT | |
201 OUT2.write("%s\n" %seq_name) | |
202 OUT2.write("%s\n" %seq) | |
203 | |
204 #print seq NEXUS FORMAT | |
205 OUT3.write("%s" %seq_name) | |
206 OUT3.write(" %s\n" %seq) | |
207 | |
208 OUT3.write("\t;\n") | |
209 OUT3.write("End;\n") | |
210 OUT1.close() | |
211 OUT2.close() | |
212 OUT2.close() | |
213 | |
214 | |
215 ### 2 ### Nucleic | |
216 elif sys.argv[2] == "nucleic" : | |
217 os.mkdir("02_CDS_No_Missing_Data_nuc") | |
218 | |
219 zfile_nuc = zipfile.ZipFile(sys.argv[3]) | |
220 for name in zfile_nuc.namelist() : | |
221 zfile_nuc.extract(name, "./02_CDS_No_Missing_Data_nuc") | |
222 path_IN = "./02_CDS_No_Missing_Data_nuc" | |
223 L_IN = os.listdir(path_IN) | |
224 | |
225 OUT1 = open("03_Concatenation_nuc.fas", "w") | |
226 OUT2 = open("03_Concatenation_nuc.phy", "w") | |
227 OUT3 = open("03_Concatenation_nuc.nex", "w") | |
228 | |
229 OUT1_pos12 = open("03_Concatenation_pos12_nuc.fas", "w") | |
230 OUT2_pos12 = open("03_Concatenation_pos12_nuc.phy", "w") | |
231 OUT3_pos12 = open("03_Concatenation_pos12_nuc.nex", "w") | |
232 | |
233 OUT1_pos3 = open("03_Concatenation_pos3_nuc.fas", "w") | |
234 OUT2_pos3 = open("03_Concatenation_pos3_nuc.phy", "w") | |
235 OUT3_pos3 = open("03_Concatenation_pos3_nuc.nex", "w") | |
236 | |
237 OUT_PARTITION_codon_12_3 = open("05_partitions_codon12_3","w") | |
238 OUT_PARTITION_gene_NUC = open("05_partitions_gene_NUC","w") | |
239 OUT_PARTITION_gene_PLUS_codon_12_3 = open("05_partitions_gene_PLUS_codon12_3","w") | |
240 | |
241 ## Get bash with concatenation | |
242 bash_concatenation, ln, nb_locus, list_genes_position = concatenate(path_IN, SPECIES_ID_LIST) ### DEF 11 ## | |
243 ln_12 = ln/3*2 ### length of the alignment when only the 2 first codon position | |
244 ln_3 = ln/3 ### length of the alignment when only the third codon position | |
245 | |
246 ## Write partition files for RAxML subsequent runs | |
247 # a # Codon partition | |
248 OUT_PARTITION_codon_12_3.write("DNA, p1=1-%d\\3,2-%d\\3\n" %(ln, ln)) | |
249 OUT_PARTITION_codon_12_3.write("DNA, p2=3-%d\\3\n" %(ln)) | |
250 OUT_PARTITION_codon_12_3.close() | |
251 | |
252 # b # Gene partition | |
253 for sublist in list_genes_position: | |
254 name=sublist[0] | |
255 positions=sublist[1] | |
256 OUT_PARTITION_gene_NUC.write("DNA,%s=%s\n"%(name,positions)) | |
257 OUT_PARTITION_gene_NUC.close() | |
258 | |
259 # c # Mixed partition (codon + gene) | |
260 for sublist in list_genes_position: | |
261 name = sublist[0] | |
262 positions = sublist[1] | |
263 S1 = string.split(positions, "-") | |
264 pos_start1 = string.atoi(S1[0]) | |
265 pos_end = string.atoi(S1[1]) | |
266 pos_start2=pos_start1+1 | |
267 pos_start3=pos_start2+1 | |
268 partition1 = "DNA, %s_1=%d-%d\\3,%d-%d\\3\n" %(name,pos_start1, pos_end, pos_start2, pos_end) | |
269 partition2 = "DNA, %s_2=%d-%d\\3\n" %(name,pos_start3, pos_end) | |
270 OUT_PARTITION_gene_PLUS_codon_12_3.write(partition1) | |
271 OUT_PARTITION_gene_PLUS_codon_12_3.write(partition2) | |
272 | |
273 OUT_PARTITION_gene_PLUS_codon_12_3.close() | |
274 | |
275 | |
276 ## Get "ntax" for NEXUS HEADER | |
277 nb_taxa = len(bash_concatenation.keys()) | |
278 | |
279 print "******************** CONCATENATION ********************\n" | |
280 print "Process nucleotides concatenation:" | |
281 print "\tNumber of taxa aligned = %d" %nb_taxa | |
282 print "\tNumber of loci concatenated = %d\n" %nb_locus | |
283 print "\tTotal length of the concatenated sequences [All codon positions] = %d" %ln | |
284 print "\t\tTotal length of the concatenated sequences [Codon positions 1 & 2] = %d" %ln_12 | |
285 print "\t\tTotal length of the concatenated sequences [Codon position 3] = %d" %ln_3 | |
286 | |
287 | |
288 ## Print NEXUS HEADER: | |
289 OUT3.write("#NEXUS\n\n") | |
290 OUT3.write("Begin data;\n") | |
291 OUT3.write("\tDimensions ntax=%d nchar=%d;\n" %(nb_taxa, ln)) | |
292 OUT3.write("\tFormat datatype=dna gap=-;\n") | |
293 OUT3.write("\tMatrix\n") | |
294 | |
295 OUT3_pos12.write("#NEXUS\n\n") | |
296 OUT3_pos12.write("Begin data;\n") | |
297 OUT3_pos12.write("\tDimensions ntax=%d nchar=%d;\n" %(nb_taxa, ln_12)) | |
298 OUT3_pos12.write("\tFormat datatype=dna gap=-;\n") | |
299 OUT3_pos12.write("\tMatrix\n") | |
300 | |
301 OUT3_pos3.write("#NEXUS\n\n") | |
302 OUT3_pos3.write("Begin data;\n") | |
303 OUT3_pos3.write("\tDimensions ntax=%d nchar=%d;\n" %(nb_taxa, ln_3)) | |
304 OUT3_pos3.write("\tFormat datatype=dna gap=-;\n") | |
305 OUT3_pos3.write("\tMatrix\n") | |
306 | |
307 ## Print PHYLIP HEADER: | |
308 OUT2.write(" %d %d\n" %(nb_taxa, ln)) | |
309 OUT2_pos12.write(" %d %d\n" %(nb_taxa, ln_12)) | |
310 OUT2_pos3.write(" %d %d\n" %(nb_taxa, ln_3)) | |
311 | |
312 ## Print outputs | |
313 for seq_name in bash_concatenation.keys(): | |
314 seq = bash_concatenation[seq_name] | |
315 | |
316 ## Filtering the sequence in case of remaining "?" | |
317 seq = string.replace(seq, "?", "-") | |
318 | |
319 ## Get the differentes codons partitions | |
320 seq_pos1, seq_pos2, seq_pos12, seq_pos3 = get_codon_position(seq) ### DEF 12 ### | |
321 | |
322 #print seq FASTA FORMAT | |
323 OUT1.write(">%s\n" %seq_name) | |
324 OUT1.write("%s\n" %seq) | |
325 OUT1_pos12.write(">%s\n" %seq_name) | |
326 OUT1_pos12.write("%s\n" %seq_pos12) | |
327 OUT1_pos3.write(">%s\n" %seq_name) | |
328 OUT1_pos3.write("%s\n" %seq_pos3) | |
329 | |
330 #print seq PHYLIP FORMAT | |
331 OUT2.write("%s\n" %seq_name) | |
332 OUT2.write("%s\n" %seq) | |
333 OUT2_pos12.write("%s\n" %seq_name) | |
334 OUT2_pos12.write("%s\n" %seq_pos12) | |
335 OUT2_pos3.write("%s\n" %seq_name) | |
336 OUT2_pos3.write("%s\n" %seq_pos3) | |
337 | |
338 #print seq NEXUS FORMAT | |
339 OUT3.write("%s" %seq_name) | |
340 OUT3.write(" %s\n" %seq) | |
341 OUT3_pos12.write("%s" %seq_name) | |
342 OUT3_pos12.write(" %s\n" %seq_pos12) | |
343 OUT3_pos3.write("%s" %seq_name) | |
344 OUT3_pos3.write(" %s\n" %seq_pos3) | |
345 | |
346 | |
347 OUT3.write("\t;\n") | |
348 OUT3.write("End;\n") | |
349 OUT3_pos12.write("\t;\n") | |
350 OUT3_pos12.write("End;\n") | |
351 OUT3_pos3.write("\t;\n") | |
352 OUT3_pos3.write("End;\n") | |
353 | |
354 OUT1.close() | |
355 OUT2.close() | |
356 OUT3.close() | |
357 OUT1_pos12.close() | |
358 OUT2_pos12.close() | |
359 OUT3_pos12.close() | |
360 OUT1_pos3.close() | |
361 OUT2_pos3.close() | |
362 OUT3_pos3.close() | |
363 | |
364 print "\n\n\n******************** RAxML RUN ********************\n" |