Mercurial > repos > abims-sbr > concatphyl
comparison scripts/S01_concatenate.py @ 1:f181bd945a6c draft
planemo upload for repository https://github.com/abims-sbr/adaptsearch commit ab76075e541dd7ece1090f6b55ca508ec0fde39d
| author | lecorguille |
|---|---|
| date | Thu, 13 Apr 2017 09:48:47 -0400 |
| parents | |
| children | 1f8d039bd241 |
comparison
equal
deleted
inserted
replaced
| 0:6d930f037fea | 1:f181bd945a6c |
|---|---|
| 1 #!/usr/bin/python | |
| 2 ## Author: Eric Fontanillas | |
| 3 ## Last modification: 17/06/2011 | |
| 4 ## Subject: find and remove indels | |
| 5 | |
| 6 | |
| 7 ############################### | |
| 8 ##### DEF 0 : Dico fasta ##### | |
| 9 ############################### | |
| 10 def dico(F2): | |
| 11 dicoco = {} | |
| 12 while 1: | |
| 13 next2 = F2.readline() | |
| 14 if not next2: | |
| 15 break | |
| 16 if next2[0] == ">": | |
| 17 fasta_name_query = next2[:-1] | |
| 18 Sn = string.split(fasta_name_query, "||") | |
| 19 fasta_name_query = Sn[0] | |
| 20 next3 = F2.readline() | |
| 21 fasta_seq_query = next3[:-1] | |
| 22 dicoco[fasta_name_query]=fasta_seq_query | |
| 23 return(dicoco) | |
| 24 ################################################################################### | |
| 25 | |
| 26 | |
| 27 #################### | |
| 28 ###### DEF 11 ###### | |
| 29 #################### | |
| 30 ## Concatenate sequences | |
| 31 ########################### | |
| 32 def concatenate(folder_with_loci, SPECIES_ID_LIST): | |
| 33 ## 4 ## Process files | |
| 34 ## 4.1 ## Create the bash and the fasta names entries (name of the species) | |
| 35 bash_concat = {} | |
| 36 | |
| 37 for species_ID in SPECIES_ID_LIST: | |
| 38 bash_concat[species_ID] = '' | |
| 39 | |
| 40 ln_concat = 0 | |
| 41 nb_locus = 0 | |
| 42 pos=1 | |
| 43 list_genes_position=[] | |
| 44 ## 4.2 ## Concatenate | |
| 45 for file in L_IN: | |
| 46 nb_locus=nb_locus+1 | |
| 47 | |
| 48 ## a ## Open alignments | |
| 49 file_IN = open("%s/%s" %(folder_with_loci, file), "r") | |
| 50 dico_seq = dico(file_IN) ### DEF 0 ### | |
| 51 file_IN.close() | |
| 52 ## b ## Get alignment length + genes positions for RAxML | |
| 53 key0 = dico_seq.keys()[0] | |
| 54 ln = len(dico_seq[key0]) | |
| 55 ln_concat = ln_concat + ln | |
| 56 | |
| 57 pos_start = pos | |
| 58 pos_end = pos+ln-1 | |
| 59 pos=pos_end+1 | |
| 60 position="%d-%d" %(pos_start, pos_end) | |
| 61 RAxML_name = file[:-6] | |
| 62 sublist = [RAxML_name, position] | |
| 63 list_genes_position.append(sublist) | |
| 64 | |
| 65 ## c ## Generate "empty" sequence with alignment length * "-" | |
| 66 empty_seq = "-" * ln | |
| 67 | |
| 68 ## d ## Concatenate | |
| 69 ## d.1 ## Detect missing species in this alignment | |
| 70 list_ID=[] | |
| 71 list_absent_ID=[] | |
| 72 bash_fastaName={} | |
| 73 for fasta_name in dico_seq: | |
| 74 ID = fasta_name[1:3] | |
| 75 list_ID.append(ID) | |
| 76 seq = dico_seq[fasta_name] | |
| 77 bash_fastaName[ID]=fasta_name | |
| 78 for sp_ID in SPECIES_ID_LIST: | |
| 79 if sp_ID not in list_ID: | |
| 80 list_absent_ID.append(sp_ID) | |
| 81 | |
| 82 for ID in SPECIES_ID_LIST: | |
| 83 if ID in list_absent_ID: | |
| 84 bash_concat[ID] = bash_concat[ID] + empty_seq | |
| 85 else: | |
| 86 fasta_name = bash_fastaName[ID] | |
| 87 seq = dico_seq[fasta_name] | |
| 88 bash_concat[ID] = bash_concat[ID] + seq | |
| 89 | |
| 90 return(bash_concat, ln_concat, nb_locus, list_genes_position) | |
| 91 #################################### | |
| 92 | |
| 93 | |
| 94 ######################################## | |
| 95 ##### DEF 12 : get codon position ##### | |
| 96 ######################################## | |
| 97 def get_codon_position(seq_inORF): | |
| 98 | |
| 99 ln = len(seq_inORF) | |
| 100 | |
| 101 i=0 | |
| 102 seq_pos1="" | |
| 103 seq_pos2="" | |
| 104 seq_pos12="" | |
| 105 seq_pos3="" | |
| 106 while i<ln: | |
| 107 pos1 = seq_inORF[i] | |
| 108 pos2 = seq_inORF[i+1] | |
| 109 pos3 = seq_inORF[i+2] | |
| 110 | |
| 111 seq_pos1 = seq_pos1 + pos1 | |
| 112 seq_pos2 = seq_pos2 + pos2 | |
| 113 seq_pos12 = seq_pos12 + pos1 + pos2 | |
| 114 seq_pos3 = seq_pos3 + pos3 | |
| 115 | |
| 116 i = i+3 | |
| 117 | |
| 118 return(seq_pos1, seq_pos2, seq_pos12, seq_pos3) | |
| 119 ############################################################################### | |
| 120 | |
| 121 | |
| 122 | |
| 123 ####################### | |
| 124 ##### RUN RUN RUN ##### | |
| 125 ####################### | |
| 126 import string, os, time, re, sys, zipfile | |
| 127 | |
| 128 list_species = [] | |
| 129 SPECIES_ID_LIST = [] | |
| 130 fasta = "^.*fasta$" | |
| 131 i=3 | |
| 132 | |
| 133 ## add file to list_species | |
| 134 zfile = zipfile.ZipFile(sys.argv[1]) | |
| 135 for name in zfile.namelist() : | |
| 136 list_species.append(name) | |
| 137 | |
| 138 ## in SPECIES_ID_LIST, only the 2 first letters of name of species | |
| 139 for name in list_species : | |
| 140 name = name[:2] | |
| 141 SPECIES_ID_LIST.append(name) | |
| 142 | |
| 143 | |
| 144 ### 1 ### Proteic | |
| 145 if sys.argv[2] == "proteic" : | |
| 146 os.mkdir("02_CDS_No_Missing_Data_aa") | |
| 147 zfile_nuc = zipfile.ZipFile(sys.argv[3]) | |
| 148 for name in zfile_nuc.namelist() : | |
| 149 zfile_nuc.extract(name, "./02_CDS_No_Missing_Data_aa") | |
| 150 path_IN = "./02_CDS_No_Missing_Data_aa" | |
| 151 L_IN = os.listdir(path_IN) | |
| 152 | |
| 153 OUT1 = open("02_Concatenation_aa.fas", "w") | |
| 154 OUT2 = open("02_Concatenation_aa.phy", "w") | |
| 155 OUT3 = open("02_Concatenation_aa.nex", "w") | |
| 156 OUT_PARTITION_gene_AA = open("06_partitions_gene_AA","w") | |
| 157 | |
| 158 | |
| 159 ## Get bash with concatenation | |
| 160 bash_concatenation, ln, nb_locus,list_genes_position= concatenate(path_IN, SPECIES_ID_LIST) ### DEF 11 ## | |
| 161 | |
| 162 ## Write gene AA partition file for RAxML | |
| 163 for sublist in list_genes_position: | |
| 164 name = sublist[0] | |
| 165 positions=sublist[1] | |
| 166 OUT_PARTITION_gene_AA.write("DNA,%s=%s\n"%(name,positions)) | |
| 167 OUT_PARTITION_gene_AA.close() | |
| 168 | |
| 169 ## Get "ntax" for NEXUS HEADER | |
| 170 nb_taxa = len(bash_concatenation.keys()) | |
| 171 | |
| 172 print "******************** CONCATENATION ********************\n" | |
| 173 print "Process amino-acid concatenation:" | |
| 174 print "\tNumber of taxa aligned = %d" %nb_taxa | |
| 175 print "\tNumber of loci concatenated = %d\n" %nb_locus | |
| 176 print "\tTotal length of the concatenated sequences = %d" %ln | |
| 177 | |
| 178 | |
| 179 ## Print NEXUS HEADER: | |
| 180 OUT3.write("#NEXUS\n\n") | |
| 181 OUT3.write("Begin data;\n") | |
| 182 OUT3.write("\tDimensions ntax=%d nchar=%d;\n" %(nb_taxa, ln)) | |
| 183 OUT3.write("\tFormat datatype=aa gap=-;\n") | |
| 184 OUT3.write("\tMatrix\n") | |
| 185 | |
| 186 ## Print PHYLIP HEADER: | |
| 187 OUT2.write(" %d %d\n" %(nb_taxa, ln)) | |
| 188 | |
| 189 ## 3.5 ## Print outputs | |
| 190 for seq_name in bash_concatenation.keys(): | |
| 191 seq = bash_concatenation[seq_name] | |
| 192 | |
| 193 ## Filtering the sequence in case of remaining "?" | |
| 194 seq = string.replace(seq, "?", "-") | |
| 195 | |
| 196 #print seq FASTA FORMAT | |
| 197 OUT1.write(">%s\n" %seq_name) | |
| 198 OUT1.write("%s\n" %seq) | |
| 199 | |
| 200 #print seq PHYLIP FORMAT | |
| 201 OUT2.write("%s\n" %seq_name) | |
| 202 OUT2.write("%s\n" %seq) | |
| 203 | |
| 204 #print seq NEXUS FORMAT | |
| 205 OUT3.write("%s" %seq_name) | |
| 206 OUT3.write(" %s\n" %seq) | |
| 207 | |
| 208 OUT3.write("\t;\n") | |
| 209 OUT3.write("End;\n") | |
| 210 OUT1.close() | |
| 211 OUT2.close() | |
| 212 OUT2.close() | |
| 213 | |
| 214 | |
| 215 ### 2 ### Nucleic | |
| 216 elif sys.argv[2] == "nucleic" : | |
| 217 os.mkdir("02_CDS_No_Missing_Data_nuc") | |
| 218 | |
| 219 zfile_nuc = zipfile.ZipFile(sys.argv[3]) | |
| 220 for name in zfile_nuc.namelist() : | |
| 221 zfile_nuc.extract(name, "./02_CDS_No_Missing_Data_nuc") | |
| 222 path_IN = "./02_CDS_No_Missing_Data_nuc" | |
| 223 L_IN = os.listdir(path_IN) | |
| 224 | |
| 225 OUT1 = open("03_Concatenation_nuc.fas", "w") | |
| 226 OUT2 = open("03_Concatenation_nuc.phy", "w") | |
| 227 OUT3 = open("03_Concatenation_nuc.nex", "w") | |
| 228 | |
| 229 OUT1_pos12 = open("03_Concatenation_pos12_nuc.fas", "w") | |
| 230 OUT2_pos12 = open("03_Concatenation_pos12_nuc.phy", "w") | |
| 231 OUT3_pos12 = open("03_Concatenation_pos12_nuc.nex", "w") | |
| 232 | |
| 233 OUT1_pos3 = open("03_Concatenation_pos3_nuc.fas", "w") | |
| 234 OUT2_pos3 = open("03_Concatenation_pos3_nuc.phy", "w") | |
| 235 OUT3_pos3 = open("03_Concatenation_pos3_nuc.nex", "w") | |
| 236 | |
| 237 OUT_PARTITION_codon_12_3 = open("05_partitions_codon12_3","w") | |
| 238 OUT_PARTITION_gene_NUC = open("05_partitions_gene_NUC","w") | |
| 239 OUT_PARTITION_gene_PLUS_codon_12_3 = open("05_partitions_gene_PLUS_codon12_3","w") | |
| 240 | |
| 241 ## Get bash with concatenation | |
| 242 bash_concatenation, ln, nb_locus, list_genes_position = concatenate(path_IN, SPECIES_ID_LIST) ### DEF 11 ## | |
| 243 ln_12 = ln/3*2 ### length of the alignment when only the 2 first codon position | |
| 244 ln_3 = ln/3 ### length of the alignment when only the third codon position | |
| 245 | |
| 246 ## Write partition files for RAxML subsequent runs | |
| 247 # a # Codon partition | |
| 248 OUT_PARTITION_codon_12_3.write("DNA, p1=1-%d\\3,2-%d\\3\n" %(ln, ln)) | |
| 249 OUT_PARTITION_codon_12_3.write("DNA, p2=3-%d\\3\n" %(ln)) | |
| 250 OUT_PARTITION_codon_12_3.close() | |
| 251 | |
| 252 # b # Gene partition | |
| 253 for sublist in list_genes_position: | |
| 254 name=sublist[0] | |
| 255 positions=sublist[1] | |
| 256 OUT_PARTITION_gene_NUC.write("DNA,%s=%s\n"%(name,positions)) | |
| 257 OUT_PARTITION_gene_NUC.close() | |
| 258 | |
| 259 # c # Mixed partition (codon + gene) | |
| 260 for sublist in list_genes_position: | |
| 261 name = sublist[0] | |
| 262 positions = sublist[1] | |
| 263 S1 = string.split(positions, "-") | |
| 264 pos_start1 = string.atoi(S1[0]) | |
| 265 pos_end = string.atoi(S1[1]) | |
| 266 pos_start2=pos_start1+1 | |
| 267 pos_start3=pos_start2+1 | |
| 268 partition1 = "DNA, %s_1=%d-%d\\3,%d-%d\\3\n" %(name,pos_start1, pos_end, pos_start2, pos_end) | |
| 269 partition2 = "DNA, %s_2=%d-%d\\3\n" %(name,pos_start3, pos_end) | |
| 270 OUT_PARTITION_gene_PLUS_codon_12_3.write(partition1) | |
| 271 OUT_PARTITION_gene_PLUS_codon_12_3.write(partition2) | |
| 272 | |
| 273 OUT_PARTITION_gene_PLUS_codon_12_3.close() | |
| 274 | |
| 275 | |
| 276 ## Get "ntax" for NEXUS HEADER | |
| 277 nb_taxa = len(bash_concatenation.keys()) | |
| 278 | |
| 279 print "******************** CONCATENATION ********************\n" | |
| 280 print "Process nucleotides concatenation:" | |
| 281 print "\tNumber of taxa aligned = %d" %nb_taxa | |
| 282 print "\tNumber of loci concatenated = %d\n" %nb_locus | |
| 283 print "\tTotal length of the concatenated sequences [All codon positions] = %d" %ln | |
| 284 print "\t\tTotal length of the concatenated sequences [Codon positions 1 & 2] = %d" %ln_12 | |
| 285 print "\t\tTotal length of the concatenated sequences [Codon position 3] = %d" %ln_3 | |
| 286 | |
| 287 | |
| 288 ## Print NEXUS HEADER: | |
| 289 OUT3.write("#NEXUS\n\n") | |
| 290 OUT3.write("Begin data;\n") | |
| 291 OUT3.write("\tDimensions ntax=%d nchar=%d;\n" %(nb_taxa, ln)) | |
| 292 OUT3.write("\tFormat datatype=dna gap=-;\n") | |
| 293 OUT3.write("\tMatrix\n") | |
| 294 | |
| 295 OUT3_pos12.write("#NEXUS\n\n") | |
| 296 OUT3_pos12.write("Begin data;\n") | |
| 297 OUT3_pos12.write("\tDimensions ntax=%d nchar=%d;\n" %(nb_taxa, ln_12)) | |
| 298 OUT3_pos12.write("\tFormat datatype=dna gap=-;\n") | |
| 299 OUT3_pos12.write("\tMatrix\n") | |
| 300 | |
| 301 OUT3_pos3.write("#NEXUS\n\n") | |
| 302 OUT3_pos3.write("Begin data;\n") | |
| 303 OUT3_pos3.write("\tDimensions ntax=%d nchar=%d;\n" %(nb_taxa, ln_3)) | |
| 304 OUT3_pos3.write("\tFormat datatype=dna gap=-;\n") | |
| 305 OUT3_pos3.write("\tMatrix\n") | |
| 306 | |
| 307 ## Print PHYLIP HEADER: | |
| 308 OUT2.write(" %d %d\n" %(nb_taxa, ln)) | |
| 309 OUT2_pos12.write(" %d %d\n" %(nb_taxa, ln_12)) | |
| 310 OUT2_pos3.write(" %d %d\n" %(nb_taxa, ln_3)) | |
| 311 | |
| 312 ## Print outputs | |
| 313 for seq_name in bash_concatenation.keys(): | |
| 314 seq = bash_concatenation[seq_name] | |
| 315 | |
| 316 ## Filtering the sequence in case of remaining "?" | |
| 317 seq = string.replace(seq, "?", "-") | |
| 318 | |
| 319 ## Get the differentes codons partitions | |
| 320 seq_pos1, seq_pos2, seq_pos12, seq_pos3 = get_codon_position(seq) ### DEF 12 ### | |
| 321 | |
| 322 #print seq FASTA FORMAT | |
| 323 OUT1.write(">%s\n" %seq_name) | |
| 324 OUT1.write("%s\n" %seq) | |
| 325 OUT1_pos12.write(">%s\n" %seq_name) | |
| 326 OUT1_pos12.write("%s\n" %seq_pos12) | |
| 327 OUT1_pos3.write(">%s\n" %seq_name) | |
| 328 OUT1_pos3.write("%s\n" %seq_pos3) | |
| 329 | |
| 330 #print seq PHYLIP FORMAT | |
| 331 OUT2.write("%s\n" %seq_name) | |
| 332 OUT2.write("%s\n" %seq) | |
| 333 OUT2_pos12.write("%s\n" %seq_name) | |
| 334 OUT2_pos12.write("%s\n" %seq_pos12) | |
| 335 OUT2_pos3.write("%s\n" %seq_name) | |
| 336 OUT2_pos3.write("%s\n" %seq_pos3) | |
| 337 | |
| 338 #print seq NEXUS FORMAT | |
| 339 OUT3.write("%s" %seq_name) | |
| 340 OUT3.write(" %s\n" %seq) | |
| 341 OUT3_pos12.write("%s" %seq_name) | |
| 342 OUT3_pos12.write(" %s\n" %seq_pos12) | |
| 343 OUT3_pos3.write("%s" %seq_name) | |
| 344 OUT3_pos3.write(" %s\n" %seq_pos3) | |
| 345 | |
| 346 | |
| 347 OUT3.write("\t;\n") | |
| 348 OUT3.write("End;\n") | |
| 349 OUT3_pos12.write("\t;\n") | |
| 350 OUT3_pos12.write("End;\n") | |
| 351 OUT3_pos3.write("\t;\n") | |
| 352 OUT3_pos3.write("End;\n") | |
| 353 | |
| 354 OUT1.close() | |
| 355 OUT2.close() | |
| 356 OUT3.close() | |
| 357 OUT1_pos12.close() | |
| 358 OUT2_pos12.close() | |
| 359 OUT3_pos12.close() | |
| 360 OUT1_pos3.close() | |
| 361 OUT2_pos3.close() | |
| 362 OUT3_pos3.close() | |
| 363 | |
| 364 print "\n\n\n******************** RAxML RUN ********************\n" |
