comparison scripts/S01_concatenate.py @ 1:f181bd945a6c draft

planemo upload for repository https://github.com/abims-sbr/adaptsearch commit ab76075e541dd7ece1090f6b55ca508ec0fde39d
author lecorguille
date Thu, 13 Apr 2017 09:48:47 -0400
parents
children 1f8d039bd241
comparison
equal deleted inserted replaced
0:6d930f037fea 1:f181bd945a6c
1 #!/usr/bin/python
2 ## Author: Eric Fontanillas
3 ## Last modification: 17/06/2011
4 ## Subject: find and remove indels
5
6
7 ###############################
8 ##### DEF 0 : Dico fasta #####
9 ###############################
10 def dico(F2):
11 dicoco = {}
12 while 1:
13 next2 = F2.readline()
14 if not next2:
15 break
16 if next2[0] == ">":
17 fasta_name_query = next2[:-1]
18 Sn = string.split(fasta_name_query, "||")
19 fasta_name_query = Sn[0]
20 next3 = F2.readline()
21 fasta_seq_query = next3[:-1]
22 dicoco[fasta_name_query]=fasta_seq_query
23 return(dicoco)
24 ###################################################################################
25
26
27 ####################
28 ###### DEF 11 ######
29 ####################
30 ## Concatenate sequences
31 ###########################
32 def concatenate(folder_with_loci, SPECIES_ID_LIST):
33 ## 4 ## Process files
34 ## 4.1 ## Create the bash and the fasta names entries (name of the species)
35 bash_concat = {}
36
37 for species_ID in SPECIES_ID_LIST:
38 bash_concat[species_ID] = ''
39
40 ln_concat = 0
41 nb_locus = 0
42 pos=1
43 list_genes_position=[]
44 ## 4.2 ## Concatenate
45 for file in L_IN:
46 nb_locus=nb_locus+1
47
48 ## a ## Open alignments
49 file_IN = open("%s/%s" %(folder_with_loci, file), "r")
50 dico_seq = dico(file_IN) ### DEF 0 ###
51 file_IN.close()
52 ## b ## Get alignment length + genes positions for RAxML
53 key0 = dico_seq.keys()[0]
54 ln = len(dico_seq[key0])
55 ln_concat = ln_concat + ln
56
57 pos_start = pos
58 pos_end = pos+ln-1
59 pos=pos_end+1
60 position="%d-%d" %(pos_start, pos_end)
61 RAxML_name = file[:-6]
62 sublist = [RAxML_name, position]
63 list_genes_position.append(sublist)
64
65 ## c ## Generate "empty" sequence with alignment length * "-"
66 empty_seq = "-" * ln
67
68 ## d ## Concatenate
69 ## d.1 ## Detect missing species in this alignment
70 list_ID=[]
71 list_absent_ID=[]
72 bash_fastaName={}
73 for fasta_name in dico_seq:
74 ID = fasta_name[1:3]
75 list_ID.append(ID)
76 seq = dico_seq[fasta_name]
77 bash_fastaName[ID]=fasta_name
78 for sp_ID in SPECIES_ID_LIST:
79 if sp_ID not in list_ID:
80 list_absent_ID.append(sp_ID)
81
82 for ID in SPECIES_ID_LIST:
83 if ID in list_absent_ID:
84 bash_concat[ID] = bash_concat[ID] + empty_seq
85 else:
86 fasta_name = bash_fastaName[ID]
87 seq = dico_seq[fasta_name]
88 bash_concat[ID] = bash_concat[ID] + seq
89
90 return(bash_concat, ln_concat, nb_locus, list_genes_position)
91 ####################################
92
93
94 ########################################
95 ##### DEF 12 : get codon position #####
96 ########################################
97 def get_codon_position(seq_inORF):
98
99 ln = len(seq_inORF)
100
101 i=0
102 seq_pos1=""
103 seq_pos2=""
104 seq_pos12=""
105 seq_pos3=""
106 while i<ln:
107 pos1 = seq_inORF[i]
108 pos2 = seq_inORF[i+1]
109 pos3 = seq_inORF[i+2]
110
111 seq_pos1 = seq_pos1 + pos1
112 seq_pos2 = seq_pos2 + pos2
113 seq_pos12 = seq_pos12 + pos1 + pos2
114 seq_pos3 = seq_pos3 + pos3
115
116 i = i+3
117
118 return(seq_pos1, seq_pos2, seq_pos12, seq_pos3)
119 ###############################################################################
120
121
122
123 #######################
124 ##### RUN RUN RUN #####
125 #######################
126 import string, os, time, re, sys, zipfile
127
128 list_species = []
129 SPECIES_ID_LIST = []
130 fasta = "^.*fasta$"
131 i=3
132
133 ## add file to list_species
134 zfile = zipfile.ZipFile(sys.argv[1])
135 for name in zfile.namelist() :
136 list_species.append(name)
137
138 ## in SPECIES_ID_LIST, only the 2 first letters of name of species
139 for name in list_species :
140 name = name[:2]
141 SPECIES_ID_LIST.append(name)
142
143
144 ### 1 ### Proteic
145 if sys.argv[2] == "proteic" :
146 os.mkdir("02_CDS_No_Missing_Data_aa")
147 zfile_nuc = zipfile.ZipFile(sys.argv[3])
148 for name in zfile_nuc.namelist() :
149 zfile_nuc.extract(name, "./02_CDS_No_Missing_Data_aa")
150 path_IN = "./02_CDS_No_Missing_Data_aa"
151 L_IN = os.listdir(path_IN)
152
153 OUT1 = open("02_Concatenation_aa.fas", "w")
154 OUT2 = open("02_Concatenation_aa.phy", "w")
155 OUT3 = open("02_Concatenation_aa.nex", "w")
156 OUT_PARTITION_gene_AA = open("06_partitions_gene_AA","w")
157
158
159 ## Get bash with concatenation
160 bash_concatenation, ln, nb_locus,list_genes_position= concatenate(path_IN, SPECIES_ID_LIST) ### DEF 11 ##
161
162 ## Write gene AA partition file for RAxML
163 for sublist in list_genes_position:
164 name = sublist[0]
165 positions=sublist[1]
166 OUT_PARTITION_gene_AA.write("DNA,%s=%s\n"%(name,positions))
167 OUT_PARTITION_gene_AA.close()
168
169 ## Get "ntax" for NEXUS HEADER
170 nb_taxa = len(bash_concatenation.keys())
171
172 print "******************** CONCATENATION ********************\n"
173 print "Process amino-acid concatenation:"
174 print "\tNumber of taxa aligned = %d" %nb_taxa
175 print "\tNumber of loci concatenated = %d\n" %nb_locus
176 print "\tTotal length of the concatenated sequences = %d" %ln
177
178
179 ## Print NEXUS HEADER:
180 OUT3.write("#NEXUS\n\n")
181 OUT3.write("Begin data;\n")
182 OUT3.write("\tDimensions ntax=%d nchar=%d;\n" %(nb_taxa, ln))
183 OUT3.write("\tFormat datatype=aa gap=-;\n")
184 OUT3.write("\tMatrix\n")
185
186 ## Print PHYLIP HEADER:
187 OUT2.write(" %d %d\n" %(nb_taxa, ln))
188
189 ## 3.5 ## Print outputs
190 for seq_name in bash_concatenation.keys():
191 seq = bash_concatenation[seq_name]
192
193 ## Filtering the sequence in case of remaining "?"
194 seq = string.replace(seq, "?", "-")
195
196 #print seq FASTA FORMAT
197 OUT1.write(">%s\n" %seq_name)
198 OUT1.write("%s\n" %seq)
199
200 #print seq PHYLIP FORMAT
201 OUT2.write("%s\n" %seq_name)
202 OUT2.write("%s\n" %seq)
203
204 #print seq NEXUS FORMAT
205 OUT3.write("%s" %seq_name)
206 OUT3.write(" %s\n" %seq)
207
208 OUT3.write("\t;\n")
209 OUT3.write("End;\n")
210 OUT1.close()
211 OUT2.close()
212 OUT2.close()
213
214
215 ### 2 ### Nucleic
216 elif sys.argv[2] == "nucleic" :
217 os.mkdir("02_CDS_No_Missing_Data_nuc")
218
219 zfile_nuc = zipfile.ZipFile(sys.argv[3])
220 for name in zfile_nuc.namelist() :
221 zfile_nuc.extract(name, "./02_CDS_No_Missing_Data_nuc")
222 path_IN = "./02_CDS_No_Missing_Data_nuc"
223 L_IN = os.listdir(path_IN)
224
225 OUT1 = open("03_Concatenation_nuc.fas", "w")
226 OUT2 = open("03_Concatenation_nuc.phy", "w")
227 OUT3 = open("03_Concatenation_nuc.nex", "w")
228
229 OUT1_pos12 = open("03_Concatenation_pos12_nuc.fas", "w")
230 OUT2_pos12 = open("03_Concatenation_pos12_nuc.phy", "w")
231 OUT3_pos12 = open("03_Concatenation_pos12_nuc.nex", "w")
232
233 OUT1_pos3 = open("03_Concatenation_pos3_nuc.fas", "w")
234 OUT2_pos3 = open("03_Concatenation_pos3_nuc.phy", "w")
235 OUT3_pos3 = open("03_Concatenation_pos3_nuc.nex", "w")
236
237 OUT_PARTITION_codon_12_3 = open("05_partitions_codon12_3","w")
238 OUT_PARTITION_gene_NUC = open("05_partitions_gene_NUC","w")
239 OUT_PARTITION_gene_PLUS_codon_12_3 = open("05_partitions_gene_PLUS_codon12_3","w")
240
241 ## Get bash with concatenation
242 bash_concatenation, ln, nb_locus, list_genes_position = concatenate(path_IN, SPECIES_ID_LIST) ### DEF 11 ##
243 ln_12 = ln/3*2 ### length of the alignment when only the 2 first codon position
244 ln_3 = ln/3 ### length of the alignment when only the third codon position
245
246 ## Write partition files for RAxML subsequent runs
247 # a # Codon partition
248 OUT_PARTITION_codon_12_3.write("DNA, p1=1-%d\\3,2-%d\\3\n" %(ln, ln))
249 OUT_PARTITION_codon_12_3.write("DNA, p2=3-%d\\3\n" %(ln))
250 OUT_PARTITION_codon_12_3.close()
251
252 # b # Gene partition
253 for sublist in list_genes_position:
254 name=sublist[0]
255 positions=sublist[1]
256 OUT_PARTITION_gene_NUC.write("DNA,%s=%s\n"%(name,positions))
257 OUT_PARTITION_gene_NUC.close()
258
259 # c # Mixed partition (codon + gene)
260 for sublist in list_genes_position:
261 name = sublist[0]
262 positions = sublist[1]
263 S1 = string.split(positions, "-")
264 pos_start1 = string.atoi(S1[0])
265 pos_end = string.atoi(S1[1])
266 pos_start2=pos_start1+1
267 pos_start3=pos_start2+1
268 partition1 = "DNA, %s_1=%d-%d\\3,%d-%d\\3\n" %(name,pos_start1, pos_end, pos_start2, pos_end)
269 partition2 = "DNA, %s_2=%d-%d\\3\n" %(name,pos_start3, pos_end)
270 OUT_PARTITION_gene_PLUS_codon_12_3.write(partition1)
271 OUT_PARTITION_gene_PLUS_codon_12_3.write(partition2)
272
273 OUT_PARTITION_gene_PLUS_codon_12_3.close()
274
275
276 ## Get "ntax" for NEXUS HEADER
277 nb_taxa = len(bash_concatenation.keys())
278
279 print "******************** CONCATENATION ********************\n"
280 print "Process nucleotides concatenation:"
281 print "\tNumber of taxa aligned = %d" %nb_taxa
282 print "\tNumber of loci concatenated = %d\n" %nb_locus
283 print "\tTotal length of the concatenated sequences [All codon positions] = %d" %ln
284 print "\t\tTotal length of the concatenated sequences [Codon positions 1 & 2] = %d" %ln_12
285 print "\t\tTotal length of the concatenated sequences [Codon position 3] = %d" %ln_3
286
287
288 ## Print NEXUS HEADER:
289 OUT3.write("#NEXUS\n\n")
290 OUT3.write("Begin data;\n")
291 OUT3.write("\tDimensions ntax=%d nchar=%d;\n" %(nb_taxa, ln))
292 OUT3.write("\tFormat datatype=dna gap=-;\n")
293 OUT3.write("\tMatrix\n")
294
295 OUT3_pos12.write("#NEXUS\n\n")
296 OUT3_pos12.write("Begin data;\n")
297 OUT3_pos12.write("\tDimensions ntax=%d nchar=%d;\n" %(nb_taxa, ln_12))
298 OUT3_pos12.write("\tFormat datatype=dna gap=-;\n")
299 OUT3_pos12.write("\tMatrix\n")
300
301 OUT3_pos3.write("#NEXUS\n\n")
302 OUT3_pos3.write("Begin data;\n")
303 OUT3_pos3.write("\tDimensions ntax=%d nchar=%d;\n" %(nb_taxa, ln_3))
304 OUT3_pos3.write("\tFormat datatype=dna gap=-;\n")
305 OUT3_pos3.write("\tMatrix\n")
306
307 ## Print PHYLIP HEADER:
308 OUT2.write(" %d %d\n" %(nb_taxa, ln))
309 OUT2_pos12.write(" %d %d\n" %(nb_taxa, ln_12))
310 OUT2_pos3.write(" %d %d\n" %(nb_taxa, ln_3))
311
312 ## Print outputs
313 for seq_name in bash_concatenation.keys():
314 seq = bash_concatenation[seq_name]
315
316 ## Filtering the sequence in case of remaining "?"
317 seq = string.replace(seq, "?", "-")
318
319 ## Get the differentes codons partitions
320 seq_pos1, seq_pos2, seq_pos12, seq_pos3 = get_codon_position(seq) ### DEF 12 ###
321
322 #print seq FASTA FORMAT
323 OUT1.write(">%s\n" %seq_name)
324 OUT1.write("%s\n" %seq)
325 OUT1_pos12.write(">%s\n" %seq_name)
326 OUT1_pos12.write("%s\n" %seq_pos12)
327 OUT1_pos3.write(">%s\n" %seq_name)
328 OUT1_pos3.write("%s\n" %seq_pos3)
329
330 #print seq PHYLIP FORMAT
331 OUT2.write("%s\n" %seq_name)
332 OUT2.write("%s\n" %seq)
333 OUT2_pos12.write("%s\n" %seq_name)
334 OUT2_pos12.write("%s\n" %seq_pos12)
335 OUT2_pos3.write("%s\n" %seq_name)
336 OUT2_pos3.write("%s\n" %seq_pos3)
337
338 #print seq NEXUS FORMAT
339 OUT3.write("%s" %seq_name)
340 OUT3.write(" %s\n" %seq)
341 OUT3_pos12.write("%s" %seq_name)
342 OUT3_pos12.write(" %s\n" %seq_pos12)
343 OUT3_pos3.write("%s" %seq_name)
344 OUT3_pos3.write(" %s\n" %seq_pos3)
345
346
347 OUT3.write("\t;\n")
348 OUT3.write("End;\n")
349 OUT3_pos12.write("\t;\n")
350 OUT3_pos12.write("End;\n")
351 OUT3_pos3.write("\t;\n")
352 OUT3_pos3.write("End;\n")
353
354 OUT1.close()
355 OUT2.close()
356 OUT3.close()
357 OUT1_pos12.close()
358 OUT2_pos12.close()
359 OUT3_pos12.close()
360 OUT1_pos3.close()
361 OUT2_pos3.close()
362 OUT3_pos3.close()
363
364 print "\n\n\n******************** RAxML RUN ********************\n"