Mercurial > repos > abims-sbr > concatphyl
comparison scripts/S01_concatenate.py @ 2:1f8d039bd241 draft
planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 44a89d5eeb82789bfc643b33c11f391281b6374b
author | abims-sbr |
---|---|
date | Wed, 27 Sep 2017 10:03:45 -0400 |
parents | f181bd945a6c |
children | 59f4b9ffd92b |
comparison
equal
deleted
inserted
replaced
1:f181bd945a6c | 2:1f8d039bd241 |
---|---|
27 #################### | 27 #################### |
28 ###### DEF 11 ###### | 28 ###### DEF 11 ###### |
29 #################### | 29 #################### |
30 ## Concatenate sequences | 30 ## Concatenate sequences |
31 ########################### | 31 ########################### |
32 def concatenate(folder_with_loci, SPECIES_ID_LIST): | 32 def concatenate(L_IN, SPECIES_ID_LIST): |
33 ## 4 ## Process files | 33 ## 4 ## Process files |
34 ## 4.1 ## Create the bash and the fasta names entries (name of the species) | 34 ## 4.1 ## Create the bash and the fasta names entries (name of the species) |
35 bash_concat = {} | 35 bash_concat = {} |
36 | 36 |
37 for species_ID in SPECIES_ID_LIST: | 37 for species_ID in SPECIES_ID_LIST: |
38 bash_concat[species_ID] = '' | 38 bash_concat[species_ID] = '' |
39 | 39 |
40 ln_concat = 0 | 40 ln_concat = 0 |
41 nb_locus = 0 | 41 nb_locus = 0 |
42 pos=1 | 42 pos=1 |
43 list_genes_position=[] | 43 list_genes_position=[] |
44 ## 4.2 ## Concatenate | 44 ## 4.2 ## Concatenate |
45 for file in L_IN: | 45 for file in L_IN: |
46 nb_locus=nb_locus+1 | 46 nb_locus=nb_locus+1 |
47 | 47 |
48 ## a ## Open alignments | 48 ## a ## Open alignments |
49 file_IN = open("%s/%s" %(folder_with_loci, file), "r") | 49 file_IN = open(file, "r") |
50 dico_seq = dico(file_IN) ### DEF 0 ### | 50 dico_seq = dico(file_IN) ### DEF 0 ### |
51 file_IN.close() | 51 file_IN.close() |
52 ## b ## Get alignment length + genes positions for RAxML | 52 ## b ## Get alignment length + genes positions for RAxML |
53 key0 = dico_seq.keys()[0] | 53 key0 = dico_seq.keys()[0] |
54 ln = len(dico_seq[key0]) | 54 ln = len(dico_seq[key0]) |
59 pos=pos_end+1 | 59 pos=pos_end+1 |
60 position="%d-%d" %(pos_start, pos_end) | 60 position="%d-%d" %(pos_start, pos_end) |
61 RAxML_name = file[:-6] | 61 RAxML_name = file[:-6] |
62 sublist = [RAxML_name, position] | 62 sublist = [RAxML_name, position] |
63 list_genes_position.append(sublist) | 63 list_genes_position.append(sublist) |
64 | 64 |
65 ## c ## Generate "empty" sequence with alignment length * "-" | 65 ## c ## Generate "empty" sequence with alignment length * "-" |
66 empty_seq = "-" * ln | 66 empty_seq = "-" * ln |
67 | 67 |
68 ## d ## Concatenate | 68 ## d ## Concatenate |
69 ## d.1 ## Detect missing species in this alignment | 69 ## d.1 ## Detect missing species in this alignment |
70 list_ID=[] | 70 list_ID=[] |
71 list_absent_ID=[] | 71 list_absent_ID=[] |
72 bash_fastaName={} | 72 bash_fastaName={} |
83 if ID in list_absent_ID: | 83 if ID in list_absent_ID: |
84 bash_concat[ID] = bash_concat[ID] + empty_seq | 84 bash_concat[ID] = bash_concat[ID] + empty_seq |
85 else: | 85 else: |
86 fasta_name = bash_fastaName[ID] | 86 fasta_name = bash_fastaName[ID] |
87 seq = dico_seq[fasta_name] | 87 seq = dico_seq[fasta_name] |
88 bash_concat[ID] = bash_concat[ID] + seq | 88 bash_concat[ID] = bash_concat[ID] + seq |
89 | 89 |
90 return(bash_concat, ln_concat, nb_locus, list_genes_position) | 90 return(bash_concat, ln_concat, nb_locus, list_genes_position) |
91 #################################### | 91 #################################### |
92 | 92 |
93 | 93 |
105 seq_pos3="" | 105 seq_pos3="" |
106 while i<ln: | 106 while i<ln: |
107 pos1 = seq_inORF[i] | 107 pos1 = seq_inORF[i] |
108 pos2 = seq_inORF[i+1] | 108 pos2 = seq_inORF[i+1] |
109 pos3 = seq_inORF[i+2] | 109 pos3 = seq_inORF[i+2] |
110 | 110 |
111 seq_pos1 = seq_pos1 + pos1 | 111 seq_pos1 = seq_pos1 + pos1 |
112 seq_pos2 = seq_pos2 + pos2 | 112 seq_pos2 = seq_pos2 + pos2 |
113 seq_pos12 = seq_pos12 + pos1 + pos2 | 113 seq_pos12 = seq_pos12 + pos1 + pos2 |
114 seq_pos3 = seq_pos3 + pos3 | 114 seq_pos3 = seq_pos3 + pos3 |
115 | 115 |
128 list_species = [] | 128 list_species = [] |
129 SPECIES_ID_LIST = [] | 129 SPECIES_ID_LIST = [] |
130 fasta = "^.*fasta$" | 130 fasta = "^.*fasta$" |
131 i=3 | 131 i=3 |
132 | 132 |
133 ## Arguments | |
134 infiles_filter_assemblies = sys.argv[1] | |
135 format_run = sys.argv[2] | |
136 input_alignments = sys.argv[3] | |
137 | |
133 ## add file to list_species | 138 ## add file to list_species |
134 zfile = zipfile.ZipFile(sys.argv[1]) | 139 list_species = str.split(infiles_filter_assemblies,",") |
135 for name in zfile.namelist() : | |
136 list_species.append(name) | |
137 | 140 |
138 ## in SPECIES_ID_LIST, only the 2 first letters of name of species | 141 ## in SPECIES_ID_LIST, only the 2 first letters of name of species |
139 for name in list_species : | 142 for name in list_species : |
140 name = name[:2] | 143 name = name[:2] |
141 SPECIES_ID_LIST.append(name) | 144 SPECIES_ID_LIST.append(name) |
142 | 145 |
146 ## add alignment files to L_IN | |
147 L_IN = str.split(input_alignments,",") | |
148 print(L_IN) | |
143 | 149 |
144 ### 1 ### Proteic | 150 ### 1 ### Proteic |
145 if sys.argv[2] == "proteic" : | 151 if format_run == "proteic" : |
146 os.mkdir("02_CDS_No_Missing_Data_aa") | |
147 zfile_nuc = zipfile.ZipFile(sys.argv[3]) | |
148 for name in zfile_nuc.namelist() : | |
149 zfile_nuc.extract(name, "./02_CDS_No_Missing_Data_aa") | |
150 path_IN = "./02_CDS_No_Missing_Data_aa" | |
151 L_IN = os.listdir(path_IN) | |
152 | 152 |
153 OUT1 = open("02_Concatenation_aa.fas", "w") | 153 OUT1 = open("02_Concatenation_aa.fas", "w") |
154 OUT2 = open("02_Concatenation_aa.phy", "w") | 154 OUT2 = open("02_Concatenation_aa.phy", "w") |
155 OUT3 = open("02_Concatenation_aa.nex", "w") | 155 OUT3 = open("02_Concatenation_aa.nex", "w") |
156 OUT_PARTITION_gene_AA = open("06_partitions_gene_AA","w") | 156 OUT_PARTITION_gene_AA = open("06_partitions_gene_AA","w") |
157 | 157 |
158 | 158 |
159 ## Get bash with concatenation | 159 ## Get bash with concatenation |
160 bash_concatenation, ln, nb_locus,list_genes_position= concatenate(path_IN, SPECIES_ID_LIST) ### DEF 11 ## | 160 bash_concatenation, ln, nb_locus,list_genes_position= concatenate(L_IN, SPECIES_ID_LIST) ### DEF 11 ## |
161 | 161 |
162 ## Write gene AA partition file for RAxML | 162 ## Write gene AA partition file for RAxML |
163 for sublist in list_genes_position: | 163 for sublist in list_genes_position: |
164 name = sublist[0] | 164 name = sublist[0] |
165 positions=sublist[1] | 165 positions=sublist[1] |
187 OUT2.write(" %d %d\n" %(nb_taxa, ln)) | 187 OUT2.write(" %d %d\n" %(nb_taxa, ln)) |
188 | 188 |
189 ## 3.5 ## Print outputs | 189 ## 3.5 ## Print outputs |
190 for seq_name in bash_concatenation.keys(): | 190 for seq_name in bash_concatenation.keys(): |
191 seq = bash_concatenation[seq_name] | 191 seq = bash_concatenation[seq_name] |
192 | 192 |
193 ## Filtering the sequence in case of remaining "?" | 193 ## Filtering the sequence in case of remaining "?" |
194 seq = string.replace(seq, "?", "-") | 194 seq = string.replace(seq, "?", "-") |
195 | 195 |
196 #print seq FASTA FORMAT | 196 #print seq FASTA FORMAT |
197 OUT1.write(">%s\n" %seq_name) | 197 OUT1.write(">%s\n" %seq_name) |
198 OUT1.write("%s\n" %seq) | 198 OUT1.write("%s\n" %seq) |
199 | 199 |
200 #print seq PHYLIP FORMAT | 200 #print seq PHYLIP FORMAT |
201 OUT2.write("%s\n" %seq_name) | 201 OUT2.write("%s\n" %seq_name) |
202 OUT2.write("%s\n" %seq) | 202 OUT2.write("%s\n" %seq) |
203 | 203 |
204 #print seq NEXUS FORMAT | 204 #print seq NEXUS FORMAT |
211 OUT2.close() | 211 OUT2.close() |
212 OUT2.close() | 212 OUT2.close() |
213 | 213 |
214 | 214 |
215 ### 2 ### Nucleic | 215 ### 2 ### Nucleic |
216 elif sys.argv[2] == "nucleic" : | 216 elif format_run == "nucleic" : |
217 os.mkdir("02_CDS_No_Missing_Data_nuc") | |
218 | |
219 zfile_nuc = zipfile.ZipFile(sys.argv[3]) | |
220 for name in zfile_nuc.namelist() : | |
221 zfile_nuc.extract(name, "./02_CDS_No_Missing_Data_nuc") | |
222 path_IN = "./02_CDS_No_Missing_Data_nuc" | |
223 L_IN = os.listdir(path_IN) | |
224 | 217 |
225 OUT1 = open("03_Concatenation_nuc.fas", "w") | 218 OUT1 = open("03_Concatenation_nuc.fas", "w") |
226 OUT2 = open("03_Concatenation_nuc.phy", "w") | 219 OUT2 = open("03_Concatenation_nuc.phy", "w") |
227 OUT3 = open("03_Concatenation_nuc.nex", "w") | 220 OUT3 = open("03_Concatenation_nuc.nex", "w") |
228 | 221 |
237 OUT_PARTITION_codon_12_3 = open("05_partitions_codon12_3","w") | 230 OUT_PARTITION_codon_12_3 = open("05_partitions_codon12_3","w") |
238 OUT_PARTITION_gene_NUC = open("05_partitions_gene_NUC","w") | 231 OUT_PARTITION_gene_NUC = open("05_partitions_gene_NUC","w") |
239 OUT_PARTITION_gene_PLUS_codon_12_3 = open("05_partitions_gene_PLUS_codon12_3","w") | 232 OUT_PARTITION_gene_PLUS_codon_12_3 = open("05_partitions_gene_PLUS_codon12_3","w") |
240 | 233 |
241 ## Get bash with concatenation | 234 ## Get bash with concatenation |
242 bash_concatenation, ln, nb_locus, list_genes_position = concatenate(path_IN, SPECIES_ID_LIST) ### DEF 11 ## | 235 bash_concatenation, ln, nb_locus, list_genes_position = concatenate(L_IN, SPECIES_ID_LIST) ### DEF 11 ## |
243 ln_12 = ln/3*2 ### length of the alignment when only the 2 first codon position | 236 ln_12 = ln/3*2 ### length of the alignment when only the 2 first codon position |
244 ln_3 = ln/3 ### length of the alignment when only the third codon position | 237 ln_3 = ln/3 ### length of the alignment when only the third codon position |
245 | 238 |
246 ## Write partition files for RAxML subsequent runs | 239 ## Write partition files for RAxML subsequent runs |
247 # a # Codon partition | 240 # a # Codon partition |
267 pos_start3=pos_start2+1 | 260 pos_start3=pos_start2+1 |
268 partition1 = "DNA, %s_1=%d-%d\\3,%d-%d\\3\n" %(name,pos_start1, pos_end, pos_start2, pos_end) | 261 partition1 = "DNA, %s_1=%d-%d\\3,%d-%d\\3\n" %(name,pos_start1, pos_end, pos_start2, pos_end) |
269 partition2 = "DNA, %s_2=%d-%d\\3\n" %(name,pos_start3, pos_end) | 262 partition2 = "DNA, %s_2=%d-%d\\3\n" %(name,pos_start3, pos_end) |
270 OUT_PARTITION_gene_PLUS_codon_12_3.write(partition1) | 263 OUT_PARTITION_gene_PLUS_codon_12_3.write(partition1) |
271 OUT_PARTITION_gene_PLUS_codon_12_3.write(partition2) | 264 OUT_PARTITION_gene_PLUS_codon_12_3.write(partition2) |
272 | 265 |
273 OUT_PARTITION_gene_PLUS_codon_12_3.close() | 266 OUT_PARTITION_gene_PLUS_codon_12_3.close() |
274 | 267 |
275 | 268 |
276 ## Get "ntax" for NEXUS HEADER | 269 ## Get "ntax" for NEXUS HEADER |
277 nb_taxa = len(bash_concatenation.keys()) | 270 nb_taxa = len(bash_concatenation.keys()) |
310 OUT2_pos3.write(" %d %d\n" %(nb_taxa, ln_3)) | 303 OUT2_pos3.write(" %d %d\n" %(nb_taxa, ln_3)) |
311 | 304 |
312 ## Print outputs | 305 ## Print outputs |
313 for seq_name in bash_concatenation.keys(): | 306 for seq_name in bash_concatenation.keys(): |
314 seq = bash_concatenation[seq_name] | 307 seq = bash_concatenation[seq_name] |
315 | 308 |
316 ## Filtering the sequence in case of remaining "?" | 309 ## Filtering the sequence in case of remaining "?" |
317 seq = string.replace(seq, "?", "-") | 310 seq = string.replace(seq, "?", "-") |
318 | 311 |
319 ## Get the differentes codons partitions | 312 ## Get the differentes codons partitions |
320 seq_pos1, seq_pos2, seq_pos12, seq_pos3 = get_codon_position(seq) ### DEF 12 ### | 313 seq_pos1, seq_pos2, seq_pos12, seq_pos3 = get_codon_position(seq) ### DEF 12 ### |
321 | 314 |
322 #print seq FASTA FORMAT | 315 #print seq FASTA FORMAT |
323 OUT1.write(">%s\n" %seq_name) | 316 OUT1.write(">%s\n" %seq_name) |
324 OUT1.write("%s\n" %seq) | 317 OUT1.write("%s\n" %seq) |
325 OUT1_pos12.write(">%s\n" %seq_name) | 318 OUT1_pos12.write(">%s\n" %seq_name) |
326 OUT1_pos12.write("%s\n" %seq_pos12) | 319 OUT1_pos12.write("%s\n" %seq_pos12) |
327 OUT1_pos3.write(">%s\n" %seq_name) | 320 OUT1_pos3.write(">%s\n" %seq_name) |
328 OUT1_pos3.write("%s\n" %seq_pos3) | 321 OUT1_pos3.write("%s\n" %seq_pos3) |
329 | 322 |
330 #print seq PHYLIP FORMAT | 323 #print seq PHYLIP FORMAT |
331 OUT2.write("%s\n" %seq_name) | 324 OUT2.write("%s\n" %seq_name) |
332 OUT2.write("%s\n" %seq) | 325 OUT2.write("%s\n" %seq) |
333 OUT2_pos12.write("%s\n" %seq_name) | 326 OUT2_pos12.write("%s\n" %seq_name) |
334 OUT2_pos12.write("%s\n" %seq_pos12) | 327 OUT2_pos12.write("%s\n" %seq_pos12) |