Mercurial > repos > abims-sbr > concatphyl
changeset 1:f181bd945a6c draft
planemo upload for repository https://github.com/abims-sbr/adaptsearch commit ab76075e541dd7ece1090f6b55ca508ec0fde39d
author | lecorguille |
---|---|
date | Thu, 13 Apr 2017 09:48:47 -0400 |
parents | 6d930f037fea |
children | 1f8d039bd241 |
files | CHANGELOG.md README.md scripts/S01_concatenate.py |
diffstat | 3 files changed, 371 insertions(+), 7 deletions(-) [+] |
line wrap: on
line diff
--- a/CHANGELOG.md Thu Apr 13 05:49:32 2017 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,7 +0,0 @@ -Changelog - -Version 1.0 - 13/04/2017 - - - Add funtional test with planemo - - Planemo test with conda dependencies for raxml and python - - Scripts renamed + symlinks to the directory 'scripts'
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/README.md Thu Apr 13 09:48:47 2017 -0400 @@ -0,0 +1,7 @@ +Changelog + +Version 1.0 - 13/04/2017 + + - Add funtional test with planemo + - Planemo test with conda dependencies for raxml and python + - Scripts renamed + symlinks to the directory 'scripts'
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/scripts/S01_concatenate.py Thu Apr 13 09:48:47 2017 -0400 @@ -0,0 +1,364 @@ +#!/usr/bin/python +## Author: Eric Fontanillas +## Last modification: 17/06/2011 +## Subject: find and remove indels + + +############################### +##### DEF 0 : Dico fasta ##### +############################### +def dico(F2): + dicoco = {} + while 1: + next2 = F2.readline() + if not next2: + break + if next2[0] == ">": + fasta_name_query = next2[:-1] + Sn = string.split(fasta_name_query, "||") + fasta_name_query = Sn[0] + next3 = F2.readline() + fasta_seq_query = next3[:-1] + dicoco[fasta_name_query]=fasta_seq_query + return(dicoco) +################################################################################### + + +#################### +###### DEF 11 ###### +#################### +## Concatenate sequences +########################### +def concatenate(folder_with_loci, SPECIES_ID_LIST): + ## 4 ## Process files + ## 4.1 ## Create the bash and the fasta names entries (name of the species) + bash_concat = {} + + for species_ID in SPECIES_ID_LIST: + bash_concat[species_ID] = '' + + ln_concat = 0 + nb_locus = 0 + pos=1 + list_genes_position=[] + ## 4.2 ## Concatenate + for file in L_IN: + nb_locus=nb_locus+1 + + ## a ## Open alignments + file_IN = open("%s/%s" %(folder_with_loci, file), "r") + dico_seq = dico(file_IN) ### DEF 0 ### + file_IN.close() + ## b ## Get alignment length + genes positions for RAxML + key0 = dico_seq.keys()[0] + ln = len(dico_seq[key0]) + ln_concat = ln_concat + ln + + pos_start = pos + pos_end = pos+ln-1 + pos=pos_end+1 + position="%d-%d" %(pos_start, pos_end) + RAxML_name = file[:-6] + sublist = [RAxML_name, position] + list_genes_position.append(sublist) + + ## c ## Generate "empty" sequence with alignment length * "-" + empty_seq = "-" * ln + + ## d ## Concatenate + ## d.1 ## Detect missing species in this alignment + list_ID=[] + list_absent_ID=[] + bash_fastaName={} + for fasta_name in dico_seq: + ID = fasta_name[1:3] + list_ID.append(ID) + seq = dico_seq[fasta_name] + bash_fastaName[ID]=fasta_name + for sp_ID in SPECIES_ID_LIST: + if sp_ID not in list_ID: + list_absent_ID.append(sp_ID) + + for ID in SPECIES_ID_LIST: + if ID in list_absent_ID: + bash_concat[ID] = bash_concat[ID] + empty_seq + else: + fasta_name = bash_fastaName[ID] + seq = dico_seq[fasta_name] + bash_concat[ID] = bash_concat[ID] + seq + + return(bash_concat, ln_concat, nb_locus, list_genes_position) +#################################### + + +######################################## +##### DEF 12 : get codon position ##### +######################################## +def get_codon_position(seq_inORF): + + ln = len(seq_inORF) + + i=0 + seq_pos1="" + seq_pos2="" + seq_pos12="" + seq_pos3="" + while i<ln: + pos1 = seq_inORF[i] + pos2 = seq_inORF[i+1] + pos3 = seq_inORF[i+2] + + seq_pos1 = seq_pos1 + pos1 + seq_pos2 = seq_pos2 + pos2 + seq_pos12 = seq_pos12 + pos1 + pos2 + seq_pos3 = seq_pos3 + pos3 + + i = i+3 + + return(seq_pos1, seq_pos2, seq_pos12, seq_pos3) +############################################################################### + + + +####################### +##### RUN RUN RUN ##### +####################### +import string, os, time, re, sys, zipfile + +list_species = [] +SPECIES_ID_LIST = [] +fasta = "^.*fasta$" +i=3 + +## add file to list_species +zfile = zipfile.ZipFile(sys.argv[1]) +for name in zfile.namelist() : + list_species.append(name) + +## in SPECIES_ID_LIST, only the 2 first letters of name of species +for name in list_species : + name = name[:2] + SPECIES_ID_LIST.append(name) + + +### 1 ### Proteic +if sys.argv[2] == "proteic" : + os.mkdir("02_CDS_No_Missing_Data_aa") + zfile_nuc = zipfile.ZipFile(sys.argv[3]) + for name in zfile_nuc.namelist() : + zfile_nuc.extract(name, "./02_CDS_No_Missing_Data_aa") + path_IN = "./02_CDS_No_Missing_Data_aa" + L_IN = os.listdir(path_IN) + + OUT1 = open("02_Concatenation_aa.fas", "w") + OUT2 = open("02_Concatenation_aa.phy", "w") + OUT3 = open("02_Concatenation_aa.nex", "w") + OUT_PARTITION_gene_AA = open("06_partitions_gene_AA","w") + + + ## Get bash with concatenation + bash_concatenation, ln, nb_locus,list_genes_position= concatenate(path_IN, SPECIES_ID_LIST) ### DEF 11 ## + + ## Write gene AA partition file for RAxML + for sublist in list_genes_position: + name = sublist[0] + positions=sublist[1] + OUT_PARTITION_gene_AA.write("DNA,%s=%s\n"%(name,positions)) + OUT_PARTITION_gene_AA.close() + + ## Get "ntax" for NEXUS HEADER + nb_taxa = len(bash_concatenation.keys()) + + print "******************** CONCATENATION ********************\n" + print "Process amino-acid concatenation:" + print "\tNumber of taxa aligned = %d" %nb_taxa + print "\tNumber of loci concatenated = %d\n" %nb_locus + print "\tTotal length of the concatenated sequences = %d" %ln + + + ## Print NEXUS HEADER: + OUT3.write("#NEXUS\n\n") + OUT3.write("Begin data;\n") + OUT3.write("\tDimensions ntax=%d nchar=%d;\n" %(nb_taxa, ln)) + OUT3.write("\tFormat datatype=aa gap=-;\n") + OUT3.write("\tMatrix\n") + + ## Print PHYLIP HEADER: + OUT2.write(" %d %d\n" %(nb_taxa, ln)) + + ## 3.5 ## Print outputs + for seq_name in bash_concatenation.keys(): + seq = bash_concatenation[seq_name] + + ## Filtering the sequence in case of remaining "?" + seq = string.replace(seq, "?", "-") + + #print seq FASTA FORMAT + OUT1.write(">%s\n" %seq_name) + OUT1.write("%s\n" %seq) + + #print seq PHYLIP FORMAT + OUT2.write("%s\n" %seq_name) + OUT2.write("%s\n" %seq) + + #print seq NEXUS FORMAT + OUT3.write("%s" %seq_name) + OUT3.write(" %s\n" %seq) + + OUT3.write("\t;\n") + OUT3.write("End;\n") + OUT1.close() + OUT2.close() + OUT2.close() + + +### 2 ### Nucleic +elif sys.argv[2] == "nucleic" : + os.mkdir("02_CDS_No_Missing_Data_nuc") + + zfile_nuc = zipfile.ZipFile(sys.argv[3]) + for name in zfile_nuc.namelist() : + zfile_nuc.extract(name, "./02_CDS_No_Missing_Data_nuc") + path_IN = "./02_CDS_No_Missing_Data_nuc" + L_IN = os.listdir(path_IN) + + OUT1 = open("03_Concatenation_nuc.fas", "w") + OUT2 = open("03_Concatenation_nuc.phy", "w") + OUT3 = open("03_Concatenation_nuc.nex", "w") + + OUT1_pos12 = open("03_Concatenation_pos12_nuc.fas", "w") + OUT2_pos12 = open("03_Concatenation_pos12_nuc.phy", "w") + OUT3_pos12 = open("03_Concatenation_pos12_nuc.nex", "w") + + OUT1_pos3 = open("03_Concatenation_pos3_nuc.fas", "w") + OUT2_pos3 = open("03_Concatenation_pos3_nuc.phy", "w") + OUT3_pos3 = open("03_Concatenation_pos3_nuc.nex", "w") + + OUT_PARTITION_codon_12_3 = open("05_partitions_codon12_3","w") + OUT_PARTITION_gene_NUC = open("05_partitions_gene_NUC","w") + OUT_PARTITION_gene_PLUS_codon_12_3 = open("05_partitions_gene_PLUS_codon12_3","w") + + ## Get bash with concatenation + bash_concatenation, ln, nb_locus, list_genes_position = concatenate(path_IN, SPECIES_ID_LIST) ### DEF 11 ## + ln_12 = ln/3*2 ### length of the alignment when only the 2 first codon position + ln_3 = ln/3 ### length of the alignment when only the third codon position + + ## Write partition files for RAxML subsequent runs + # a # Codon partition + OUT_PARTITION_codon_12_3.write("DNA, p1=1-%d\\3,2-%d\\3\n" %(ln, ln)) + OUT_PARTITION_codon_12_3.write("DNA, p2=3-%d\\3\n" %(ln)) + OUT_PARTITION_codon_12_3.close() + + # b # Gene partition + for sublist in list_genes_position: + name=sublist[0] + positions=sublist[1] + OUT_PARTITION_gene_NUC.write("DNA,%s=%s\n"%(name,positions)) + OUT_PARTITION_gene_NUC.close() + + # c # Mixed partition (codon + gene) + for sublist in list_genes_position: + name = sublist[0] + positions = sublist[1] + S1 = string.split(positions, "-") + pos_start1 = string.atoi(S1[0]) + pos_end = string.atoi(S1[1]) + pos_start2=pos_start1+1 + pos_start3=pos_start2+1 + partition1 = "DNA, %s_1=%d-%d\\3,%d-%d\\3\n" %(name,pos_start1, pos_end, pos_start2, pos_end) + partition2 = "DNA, %s_2=%d-%d\\3\n" %(name,pos_start3, pos_end) + OUT_PARTITION_gene_PLUS_codon_12_3.write(partition1) + OUT_PARTITION_gene_PLUS_codon_12_3.write(partition2) + + OUT_PARTITION_gene_PLUS_codon_12_3.close() + + + ## Get "ntax" for NEXUS HEADER + nb_taxa = len(bash_concatenation.keys()) + + print "******************** CONCATENATION ********************\n" + print "Process nucleotides concatenation:" + print "\tNumber of taxa aligned = %d" %nb_taxa + print "\tNumber of loci concatenated = %d\n" %nb_locus + print "\tTotal length of the concatenated sequences [All codon positions] = %d" %ln + print "\t\tTotal length of the concatenated sequences [Codon positions 1 & 2] = %d" %ln_12 + print "\t\tTotal length of the concatenated sequences [Codon position 3] = %d" %ln_3 + + + ## Print NEXUS HEADER: + OUT3.write("#NEXUS\n\n") + OUT3.write("Begin data;\n") + OUT3.write("\tDimensions ntax=%d nchar=%d;\n" %(nb_taxa, ln)) + OUT3.write("\tFormat datatype=dna gap=-;\n") + OUT3.write("\tMatrix\n") + + OUT3_pos12.write("#NEXUS\n\n") + OUT3_pos12.write("Begin data;\n") + OUT3_pos12.write("\tDimensions ntax=%d nchar=%d;\n" %(nb_taxa, ln_12)) + OUT3_pos12.write("\tFormat datatype=dna gap=-;\n") + OUT3_pos12.write("\tMatrix\n") + + OUT3_pos3.write("#NEXUS\n\n") + OUT3_pos3.write("Begin data;\n") + OUT3_pos3.write("\tDimensions ntax=%d nchar=%d;\n" %(nb_taxa, ln_3)) + OUT3_pos3.write("\tFormat datatype=dna gap=-;\n") + OUT3_pos3.write("\tMatrix\n") + + ## Print PHYLIP HEADER: + OUT2.write(" %d %d\n" %(nb_taxa, ln)) + OUT2_pos12.write(" %d %d\n" %(nb_taxa, ln_12)) + OUT2_pos3.write(" %d %d\n" %(nb_taxa, ln_3)) + + ## Print outputs + for seq_name in bash_concatenation.keys(): + seq = bash_concatenation[seq_name] + + ## Filtering the sequence in case of remaining "?" + seq = string.replace(seq, "?", "-") + + ## Get the differentes codons partitions + seq_pos1, seq_pos2, seq_pos12, seq_pos3 = get_codon_position(seq) ### DEF 12 ### + + #print seq FASTA FORMAT + OUT1.write(">%s\n" %seq_name) + OUT1.write("%s\n" %seq) + OUT1_pos12.write(">%s\n" %seq_name) + OUT1_pos12.write("%s\n" %seq_pos12) + OUT1_pos3.write(">%s\n" %seq_name) + OUT1_pos3.write("%s\n" %seq_pos3) + + #print seq PHYLIP FORMAT + OUT2.write("%s\n" %seq_name) + OUT2.write("%s\n" %seq) + OUT2_pos12.write("%s\n" %seq_name) + OUT2_pos12.write("%s\n" %seq_pos12) + OUT2_pos3.write("%s\n" %seq_name) + OUT2_pos3.write("%s\n" %seq_pos3) + + #print seq NEXUS FORMAT + OUT3.write("%s" %seq_name) + OUT3.write(" %s\n" %seq) + OUT3_pos12.write("%s" %seq_name) + OUT3_pos12.write(" %s\n" %seq_pos12) + OUT3_pos3.write("%s" %seq_name) + OUT3_pos3.write(" %s\n" %seq_pos3) + + + OUT3.write("\t;\n") + OUT3.write("End;\n") + OUT3_pos12.write("\t;\n") + OUT3_pos12.write("End;\n") + OUT3_pos3.write("\t;\n") + OUT3_pos3.write("End;\n") + + OUT1.close() + OUT2.close() + OUT3.close() + OUT1_pos12.close() + OUT2_pos12.close() + OUT3_pos12.close() + OUT1_pos3.close() + OUT2_pos3.close() + OUT3_pos3.close() + +print "\n\n\n******************** RAxML RUN ********************\n"