diff scripts/S01_concatenate.py @ 2:1f8d039bd241 draft

planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 44a89d5eeb82789bfc643b33c11f391281b6374b
author abims-sbr
date Wed, 27 Sep 2017 10:03:45 -0400
parents f181bd945a6c
children 59f4b9ffd92b
line wrap: on
line diff
--- a/scripts/S01_concatenate.py	Thu Apr 13 09:48:47 2017 -0400
+++ b/scripts/S01_concatenate.py	Wed Sep 27 10:03:45 2017 -0400
@@ -29,11 +29,11 @@
 ####################
 ## Concatenate sequences
 ###########################
-def concatenate(folder_with_loci, SPECIES_ID_LIST):
+def concatenate(L_IN, SPECIES_ID_LIST):
     ## 4 ## Process files
     ## 4.1 ## Create the bash and the fasta names entries (name of the species)
     bash_concat = {}
-    
+
     for species_ID in SPECIES_ID_LIST:
         bash_concat[species_ID] = ''
 
@@ -44,9 +44,9 @@
     ## 4.2 ## Concatenate
     for file in L_IN:
         nb_locus=nb_locus+1
-        
+
         ## a ## Open alignments
-        file_IN = open("%s/%s" %(folder_with_loci, file), "r")
+        file_IN = open(file, "r")
         dico_seq = dico(file_IN)   ### DEF 0 ###
         file_IN.close()
         ## b ## Get alignment length + genes positions for RAxML
@@ -61,10 +61,10 @@
         RAxML_name = file[:-6]
         sublist = [RAxML_name, position]
         list_genes_position.append(sublist)
-        
+
         ## c ## Generate "empty" sequence with alignment length * "-"
         empty_seq = "-" * ln
-        
+
         ## d ## Concatenate
         ## d.1 ## Detect missing species in this alignment
         list_ID=[]
@@ -85,7 +85,7 @@
             else:
                 fasta_name = bash_fastaName[ID]
                 seq = dico_seq[fasta_name]
-                bash_concat[ID] = bash_concat[ID] + seq      
+                bash_concat[ID] = bash_concat[ID] + seq
 
     return(bash_concat, ln_concat, nb_locus, list_genes_position)
 ####################################
@@ -107,7 +107,7 @@
        pos1 =  seq_inORF[i]
        pos2 =  seq_inORF[i+1]
        pos3 =  seq_inORF[i+2]
-       
+
        seq_pos1 = seq_pos1 + pos1
        seq_pos2 = seq_pos2 + pos2
        seq_pos12 = seq_pos12 + pos1 + pos2
@@ -130,25 +130,25 @@
 fasta = "^.*fasta$"
 i=3
 
+## Arguments
+infiles_filter_assemblies = sys.argv[1]
+format_run = sys.argv[2]
+input_alignments = sys.argv[3]
+
 ## add file to list_species
-zfile = zipfile.ZipFile(sys.argv[1])
-for name in zfile.namelist() :
-	list_species.append(name)
+list_species = str.split(infiles_filter_assemblies,",")
 
 ## in SPECIES_ID_LIST, only the 2 first letters of name of species
 for name in list_species :
     name = name[:2]
     SPECIES_ID_LIST.append(name)
 
+## add alignment files to L_IN
+L_IN = str.split(input_alignments,",")
+print(L_IN)
 
 ### 1 ### Proteic
-if sys.argv[2] == "proteic" :
-    os.mkdir("02_CDS_No_Missing_Data_aa")
-    zfile_nuc = zipfile.ZipFile(sys.argv[3])
-    for name in zfile_nuc.namelist() :
-        zfile_nuc.extract(name, "./02_CDS_No_Missing_Data_aa")
-    path_IN = "./02_CDS_No_Missing_Data_aa"
-    L_IN = os.listdir(path_IN)
+if format_run == "proteic" :
 
     OUT1 = open("02_Concatenation_aa.fas", "w")
     OUT2 = open("02_Concatenation_aa.phy", "w")
@@ -157,7 +157,7 @@
 
 
     ##  Get bash with concatenation
-    bash_concatenation, ln, nb_locus,list_genes_position= concatenate(path_IN, SPECIES_ID_LIST)    ### DEF 11 ##
+    bash_concatenation, ln, nb_locus,list_genes_position= concatenate(L_IN, SPECIES_ID_LIST)    ### DEF 11 ##
 
     ## Write gene AA partition file for RAxML
     for sublist in list_genes_position:
@@ -189,14 +189,14 @@
     ## 3.5 ## Print outputs
     for seq_name in bash_concatenation.keys():
         seq = bash_concatenation[seq_name]
-    
+
         ## Filtering the sequence in case of remaining "?"
-        seq = string.replace(seq, "?", "-")  
+        seq = string.replace(seq, "?", "-")
 
         #print seq FASTA FORMAT
         OUT1.write(">%s\n" %seq_name)
         OUT1.write("%s\n" %seq)
-    
+
         #print seq PHYLIP FORMAT
         OUT2.write("%s\n" %seq_name)
         OUT2.write("%s\n" %seq)
@@ -213,14 +213,7 @@
 
 
 ### 2 ### Nucleic
-elif sys.argv[2] == "nucleic" :
-    os.mkdir("02_CDS_No_Missing_Data_nuc")
-
-    zfile_nuc = zipfile.ZipFile(sys.argv[3])
-    for name in zfile_nuc.namelist() :
-        zfile_nuc.extract(name, "./02_CDS_No_Missing_Data_nuc")
-    path_IN = "./02_CDS_No_Missing_Data_nuc"
-    L_IN = os.listdir(path_IN)
+elif format_run == "nucleic" :
 
     OUT1 = open("03_Concatenation_nuc.fas", "w")
     OUT2 = open("03_Concatenation_nuc.phy", "w")
@@ -239,7 +232,7 @@
     OUT_PARTITION_gene_PLUS_codon_12_3 = open("05_partitions_gene_PLUS_codon12_3","w")
 
     ## Get bash with concatenation
-    bash_concatenation, ln, nb_locus, list_genes_position = concatenate(path_IN, SPECIES_ID_LIST)    ### DEF 11 ##
+    bash_concatenation, ln, nb_locus, list_genes_position = concatenate(L_IN, SPECIES_ID_LIST)    ### DEF 11 ##
     ln_12 = ln/3*2   ### length of the alignment when only the 2 first codon position
     ln_3 = ln/3      ### length of the alignment when only the third codon position
 
@@ -269,7 +262,7 @@
         partition2 = "DNA, %s_2=%d-%d\\3\n" %(name,pos_start3, pos_end)
         OUT_PARTITION_gene_PLUS_codon_12_3.write(partition1)
         OUT_PARTITION_gene_PLUS_codon_12_3.write(partition2)
-    
+
     OUT_PARTITION_gene_PLUS_codon_12_3.close()
 
 
@@ -312,13 +305,13 @@
     ## Print outputs
     for seq_name in bash_concatenation.keys():
         seq = bash_concatenation[seq_name]
-    
+
         ## Filtering the sequence in case of remaining "?"
-        seq = string.replace(seq, "?", "-")  
+        seq = string.replace(seq, "?", "-")
 
         ## Get the differentes codons partitions
         seq_pos1, seq_pos2, seq_pos12, seq_pos3 = get_codon_position(seq)    ### DEF 12 ###
-    
+
         #print seq FASTA FORMAT
         OUT1.write(">%s\n" %seq_name)
         OUT1.write("%s\n" %seq)
@@ -326,7 +319,7 @@
         OUT1_pos12.write("%s\n" %seq_pos12)
         OUT1_pos3.write(">%s\n" %seq_name)
         OUT1_pos3.write("%s\n" %seq_pos3)
-    
+
         #print seq PHYLIP FORMAT
         OUT2.write("%s\n" %seq_name)
         OUT2.write("%s\n" %seq)