comparison scripts/S01_concatenate.py @ 2:1f8d039bd241 draft

planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 44a89d5eeb82789bfc643b33c11f391281b6374b
author abims-sbr
date Wed, 27 Sep 2017 10:03:45 -0400
parents f181bd945a6c
children 59f4b9ffd92b
comparison
equal deleted inserted replaced
1:f181bd945a6c 2:1f8d039bd241
27 #################### 27 ####################
28 ###### DEF 11 ###### 28 ###### DEF 11 ######
29 #################### 29 ####################
30 ## Concatenate sequences 30 ## Concatenate sequences
31 ########################### 31 ###########################
32 def concatenate(folder_with_loci, SPECIES_ID_LIST): 32 def concatenate(L_IN, SPECIES_ID_LIST):
33 ## 4 ## Process files 33 ## 4 ## Process files
34 ## 4.1 ## Create the bash and the fasta names entries (name of the species) 34 ## 4.1 ## Create the bash and the fasta names entries (name of the species)
35 bash_concat = {} 35 bash_concat = {}
36 36
37 for species_ID in SPECIES_ID_LIST: 37 for species_ID in SPECIES_ID_LIST:
38 bash_concat[species_ID] = '' 38 bash_concat[species_ID] = ''
39 39
40 ln_concat = 0 40 ln_concat = 0
41 nb_locus = 0 41 nb_locus = 0
42 pos=1 42 pos=1
43 list_genes_position=[] 43 list_genes_position=[]
44 ## 4.2 ## Concatenate 44 ## 4.2 ## Concatenate
45 for file in L_IN: 45 for file in L_IN:
46 nb_locus=nb_locus+1 46 nb_locus=nb_locus+1
47 47
48 ## a ## Open alignments 48 ## a ## Open alignments
49 file_IN = open("%s/%s" %(folder_with_loci, file), "r") 49 file_IN = open(file, "r")
50 dico_seq = dico(file_IN) ### DEF 0 ### 50 dico_seq = dico(file_IN) ### DEF 0 ###
51 file_IN.close() 51 file_IN.close()
52 ## b ## Get alignment length + genes positions for RAxML 52 ## b ## Get alignment length + genes positions for RAxML
53 key0 = dico_seq.keys()[0] 53 key0 = dico_seq.keys()[0]
54 ln = len(dico_seq[key0]) 54 ln = len(dico_seq[key0])
59 pos=pos_end+1 59 pos=pos_end+1
60 position="%d-%d" %(pos_start, pos_end) 60 position="%d-%d" %(pos_start, pos_end)
61 RAxML_name = file[:-6] 61 RAxML_name = file[:-6]
62 sublist = [RAxML_name, position] 62 sublist = [RAxML_name, position]
63 list_genes_position.append(sublist) 63 list_genes_position.append(sublist)
64 64
65 ## c ## Generate "empty" sequence with alignment length * "-" 65 ## c ## Generate "empty" sequence with alignment length * "-"
66 empty_seq = "-" * ln 66 empty_seq = "-" * ln
67 67
68 ## d ## Concatenate 68 ## d ## Concatenate
69 ## d.1 ## Detect missing species in this alignment 69 ## d.1 ## Detect missing species in this alignment
70 list_ID=[] 70 list_ID=[]
71 list_absent_ID=[] 71 list_absent_ID=[]
72 bash_fastaName={} 72 bash_fastaName={}
83 if ID in list_absent_ID: 83 if ID in list_absent_ID:
84 bash_concat[ID] = bash_concat[ID] + empty_seq 84 bash_concat[ID] = bash_concat[ID] + empty_seq
85 else: 85 else:
86 fasta_name = bash_fastaName[ID] 86 fasta_name = bash_fastaName[ID]
87 seq = dico_seq[fasta_name] 87 seq = dico_seq[fasta_name]
88 bash_concat[ID] = bash_concat[ID] + seq 88 bash_concat[ID] = bash_concat[ID] + seq
89 89
90 return(bash_concat, ln_concat, nb_locus, list_genes_position) 90 return(bash_concat, ln_concat, nb_locus, list_genes_position)
91 #################################### 91 ####################################
92 92
93 93
105 seq_pos3="" 105 seq_pos3=""
106 while i<ln: 106 while i<ln:
107 pos1 = seq_inORF[i] 107 pos1 = seq_inORF[i]
108 pos2 = seq_inORF[i+1] 108 pos2 = seq_inORF[i+1]
109 pos3 = seq_inORF[i+2] 109 pos3 = seq_inORF[i+2]
110 110
111 seq_pos1 = seq_pos1 + pos1 111 seq_pos1 = seq_pos1 + pos1
112 seq_pos2 = seq_pos2 + pos2 112 seq_pos2 = seq_pos2 + pos2
113 seq_pos12 = seq_pos12 + pos1 + pos2 113 seq_pos12 = seq_pos12 + pos1 + pos2
114 seq_pos3 = seq_pos3 + pos3 114 seq_pos3 = seq_pos3 + pos3
115 115
128 list_species = [] 128 list_species = []
129 SPECIES_ID_LIST = [] 129 SPECIES_ID_LIST = []
130 fasta = "^.*fasta$" 130 fasta = "^.*fasta$"
131 i=3 131 i=3
132 132
133 ## Arguments
134 infiles_filter_assemblies = sys.argv[1]
135 format_run = sys.argv[2]
136 input_alignments = sys.argv[3]
137
133 ## add file to list_species 138 ## add file to list_species
134 zfile = zipfile.ZipFile(sys.argv[1]) 139 list_species = str.split(infiles_filter_assemblies,",")
135 for name in zfile.namelist() :
136 list_species.append(name)
137 140
138 ## in SPECIES_ID_LIST, only the 2 first letters of name of species 141 ## in SPECIES_ID_LIST, only the 2 first letters of name of species
139 for name in list_species : 142 for name in list_species :
140 name = name[:2] 143 name = name[:2]
141 SPECIES_ID_LIST.append(name) 144 SPECIES_ID_LIST.append(name)
142 145
146 ## add alignment files to L_IN
147 L_IN = str.split(input_alignments,",")
148 print(L_IN)
143 149
144 ### 1 ### Proteic 150 ### 1 ### Proteic
145 if sys.argv[2] == "proteic" : 151 if format_run == "proteic" :
146 os.mkdir("02_CDS_No_Missing_Data_aa")
147 zfile_nuc = zipfile.ZipFile(sys.argv[3])
148 for name in zfile_nuc.namelist() :
149 zfile_nuc.extract(name, "./02_CDS_No_Missing_Data_aa")
150 path_IN = "./02_CDS_No_Missing_Data_aa"
151 L_IN = os.listdir(path_IN)
152 152
153 OUT1 = open("02_Concatenation_aa.fas", "w") 153 OUT1 = open("02_Concatenation_aa.fas", "w")
154 OUT2 = open("02_Concatenation_aa.phy", "w") 154 OUT2 = open("02_Concatenation_aa.phy", "w")
155 OUT3 = open("02_Concatenation_aa.nex", "w") 155 OUT3 = open("02_Concatenation_aa.nex", "w")
156 OUT_PARTITION_gene_AA = open("06_partitions_gene_AA","w") 156 OUT_PARTITION_gene_AA = open("06_partitions_gene_AA","w")
157 157
158 158
159 ## Get bash with concatenation 159 ## Get bash with concatenation
160 bash_concatenation, ln, nb_locus,list_genes_position= concatenate(path_IN, SPECIES_ID_LIST) ### DEF 11 ## 160 bash_concatenation, ln, nb_locus,list_genes_position= concatenate(L_IN, SPECIES_ID_LIST) ### DEF 11 ##
161 161
162 ## Write gene AA partition file for RAxML 162 ## Write gene AA partition file for RAxML
163 for sublist in list_genes_position: 163 for sublist in list_genes_position:
164 name = sublist[0] 164 name = sublist[0]
165 positions=sublist[1] 165 positions=sublist[1]
187 OUT2.write(" %d %d\n" %(nb_taxa, ln)) 187 OUT2.write(" %d %d\n" %(nb_taxa, ln))
188 188
189 ## 3.5 ## Print outputs 189 ## 3.5 ## Print outputs
190 for seq_name in bash_concatenation.keys(): 190 for seq_name in bash_concatenation.keys():
191 seq = bash_concatenation[seq_name] 191 seq = bash_concatenation[seq_name]
192 192
193 ## Filtering the sequence in case of remaining "?" 193 ## Filtering the sequence in case of remaining "?"
194 seq = string.replace(seq, "?", "-") 194 seq = string.replace(seq, "?", "-")
195 195
196 #print seq FASTA FORMAT 196 #print seq FASTA FORMAT
197 OUT1.write(">%s\n" %seq_name) 197 OUT1.write(">%s\n" %seq_name)
198 OUT1.write("%s\n" %seq) 198 OUT1.write("%s\n" %seq)
199 199
200 #print seq PHYLIP FORMAT 200 #print seq PHYLIP FORMAT
201 OUT2.write("%s\n" %seq_name) 201 OUT2.write("%s\n" %seq_name)
202 OUT2.write("%s\n" %seq) 202 OUT2.write("%s\n" %seq)
203 203
204 #print seq NEXUS FORMAT 204 #print seq NEXUS FORMAT
211 OUT2.close() 211 OUT2.close()
212 OUT2.close() 212 OUT2.close()
213 213
214 214
215 ### 2 ### Nucleic 215 ### 2 ### Nucleic
216 elif sys.argv[2] == "nucleic" : 216 elif format_run == "nucleic" :
217 os.mkdir("02_CDS_No_Missing_Data_nuc")
218
219 zfile_nuc = zipfile.ZipFile(sys.argv[3])
220 for name in zfile_nuc.namelist() :
221 zfile_nuc.extract(name, "./02_CDS_No_Missing_Data_nuc")
222 path_IN = "./02_CDS_No_Missing_Data_nuc"
223 L_IN = os.listdir(path_IN)
224 217
225 OUT1 = open("03_Concatenation_nuc.fas", "w") 218 OUT1 = open("03_Concatenation_nuc.fas", "w")
226 OUT2 = open("03_Concatenation_nuc.phy", "w") 219 OUT2 = open("03_Concatenation_nuc.phy", "w")
227 OUT3 = open("03_Concatenation_nuc.nex", "w") 220 OUT3 = open("03_Concatenation_nuc.nex", "w")
228 221
237 OUT_PARTITION_codon_12_3 = open("05_partitions_codon12_3","w") 230 OUT_PARTITION_codon_12_3 = open("05_partitions_codon12_3","w")
238 OUT_PARTITION_gene_NUC = open("05_partitions_gene_NUC","w") 231 OUT_PARTITION_gene_NUC = open("05_partitions_gene_NUC","w")
239 OUT_PARTITION_gene_PLUS_codon_12_3 = open("05_partitions_gene_PLUS_codon12_3","w") 232 OUT_PARTITION_gene_PLUS_codon_12_3 = open("05_partitions_gene_PLUS_codon12_3","w")
240 233
241 ## Get bash with concatenation 234 ## Get bash with concatenation
242 bash_concatenation, ln, nb_locus, list_genes_position = concatenate(path_IN, SPECIES_ID_LIST) ### DEF 11 ## 235 bash_concatenation, ln, nb_locus, list_genes_position = concatenate(L_IN, SPECIES_ID_LIST) ### DEF 11 ##
243 ln_12 = ln/3*2 ### length of the alignment when only the 2 first codon position 236 ln_12 = ln/3*2 ### length of the alignment when only the 2 first codon position
244 ln_3 = ln/3 ### length of the alignment when only the third codon position 237 ln_3 = ln/3 ### length of the alignment when only the third codon position
245 238
246 ## Write partition files for RAxML subsequent runs 239 ## Write partition files for RAxML subsequent runs
247 # a # Codon partition 240 # a # Codon partition
267 pos_start3=pos_start2+1 260 pos_start3=pos_start2+1
268 partition1 = "DNA, %s_1=%d-%d\\3,%d-%d\\3\n" %(name,pos_start1, pos_end, pos_start2, pos_end) 261 partition1 = "DNA, %s_1=%d-%d\\3,%d-%d\\3\n" %(name,pos_start1, pos_end, pos_start2, pos_end)
269 partition2 = "DNA, %s_2=%d-%d\\3\n" %(name,pos_start3, pos_end) 262 partition2 = "DNA, %s_2=%d-%d\\3\n" %(name,pos_start3, pos_end)
270 OUT_PARTITION_gene_PLUS_codon_12_3.write(partition1) 263 OUT_PARTITION_gene_PLUS_codon_12_3.write(partition1)
271 OUT_PARTITION_gene_PLUS_codon_12_3.write(partition2) 264 OUT_PARTITION_gene_PLUS_codon_12_3.write(partition2)
272 265
273 OUT_PARTITION_gene_PLUS_codon_12_3.close() 266 OUT_PARTITION_gene_PLUS_codon_12_3.close()
274 267
275 268
276 ## Get "ntax" for NEXUS HEADER 269 ## Get "ntax" for NEXUS HEADER
277 nb_taxa = len(bash_concatenation.keys()) 270 nb_taxa = len(bash_concatenation.keys())
310 OUT2_pos3.write(" %d %d\n" %(nb_taxa, ln_3)) 303 OUT2_pos3.write(" %d %d\n" %(nb_taxa, ln_3))
311 304
312 ## Print outputs 305 ## Print outputs
313 for seq_name in bash_concatenation.keys(): 306 for seq_name in bash_concatenation.keys():
314 seq = bash_concatenation[seq_name] 307 seq = bash_concatenation[seq_name]
315 308
316 ## Filtering the sequence in case of remaining "?" 309 ## Filtering the sequence in case of remaining "?"
317 seq = string.replace(seq, "?", "-") 310 seq = string.replace(seq, "?", "-")
318 311
319 ## Get the differentes codons partitions 312 ## Get the differentes codons partitions
320 seq_pos1, seq_pos2, seq_pos12, seq_pos3 = get_codon_position(seq) ### DEF 12 ### 313 seq_pos1, seq_pos2, seq_pos12, seq_pos3 = get_codon_position(seq) ### DEF 12 ###
321 314
322 #print seq FASTA FORMAT 315 #print seq FASTA FORMAT
323 OUT1.write(">%s\n" %seq_name) 316 OUT1.write(">%s\n" %seq_name)
324 OUT1.write("%s\n" %seq) 317 OUT1.write("%s\n" %seq)
325 OUT1_pos12.write(">%s\n" %seq_name) 318 OUT1_pos12.write(">%s\n" %seq_name)
326 OUT1_pos12.write("%s\n" %seq_pos12) 319 OUT1_pos12.write("%s\n" %seq_pos12)
327 OUT1_pos3.write(">%s\n" %seq_name) 320 OUT1_pos3.write(">%s\n" %seq_name)
328 OUT1_pos3.write("%s\n" %seq_pos3) 321 OUT1_pos3.write("%s\n" %seq_pos3)
329 322
330 #print seq PHYLIP FORMAT 323 #print seq PHYLIP FORMAT
331 OUT2.write("%s\n" %seq_name) 324 OUT2.write("%s\n" %seq_name)
332 OUT2.write("%s\n" %seq) 325 OUT2.write("%s\n" %seq)
333 OUT2_pos12.write("%s\n" %seq_name) 326 OUT2_pos12.write("%s\n" %seq_name)
334 OUT2_pos12.write("%s\n" %seq_pos12) 327 OUT2_pos12.write("%s\n" %seq_pos12)