changeset 11:06a28df198b6 draft default tip

planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1
author lecorguille
date Mon, 24 Sep 2018 03:58:34 -0400
parents 3d00be2d05f3
children
files CDS_search.xml macros.xml scripts/S01_find_orf_on_multiple_alignment.py scripts/S02_remove_too_short_bit_or_whole_sequence.py scripts/S03_remove_site_with_not_enough_species_represented.py test-data/outputs_ORF_Search_04_Best_ORF_aa/test1/orthogroup_14_with_2_species.fasta test-data/outputs_ORF_Search_04_Best_ORF_aa/test1/orthogroup_6_with_2_species.fasta test-data/outputs_ORF_Search_04_Best_ORF_nuc/test1/orthogroup_14_with_2_species.fasta test-data/outputs_ORF_Search_04_Best_ORF_nuc/test1/orthogroup_6_with_2_species.fasta test-data/outputs_ORF_Search_05_CDS_aa/test1/orthogroup_14_with_2_species.fasta test-data/outputs_ORF_Search_05_CDS_aa/test1/orthogroup_6_with_2_species.fasta test-data/outputs_ORF_Search_05_CDS_nuc/test1/orthogroup_14_with_2_species.fasta test-data/outputs_ORF_Search_05_CDS_nuc/test1/orthogroup_6_with_2_species.fasta test-data/outputs_ORF_Search_08_CDS_without_indel_aa/test1/old_orthogroup_7_with_3_species.fasta test-data/outputs_ORF_Search_08_CDS_without_indel_aa/test1/orthogroup_14_with_2_species.fasta test-data/outputs_ORF_Search_08_CDS_without_indel_aa/test1/orthogroup_1_with_2_species.fasta test-data/outputs_ORF_Search_08_CDS_without_indel_aa/test1/orthogroup_6_with_2_species.fasta test-data/outputs_ORF_Search_08_CDS_without_indel_aa/test1/orthogroup_7_with_3_species.fasta test-data/outputs_ORF_Search_08_CDS_without_indel_nuc/test1/old_orthogroup_7_with_3_species.fasta test-data/outputs_ORF_Search_08_CDS_without_indel_nuc/test1/orthogroup_14_with_2_species.fasta test-data/outputs_ORF_Search_08_CDS_without_indel_nuc/test1/orthogroup_1_with_2_species.fasta test-data/outputs_ORF_Search_08_CDS_without_indel_nuc/test1/orthogroup_6_with_2_species.fasta test-data/outputs_ORF_Search_08_CDS_without_indel_nuc/test1/orthogroup_7_with_3_species.fasta
diffstat 23 files changed, 214 insertions(+), 401 deletions(-) [+]
line wrap: on
line diff
--- a/CDS_search.xml	Fri Jul 06 02:52:15 2018 -0400
+++ b/CDS_search.xml	Mon Sep 24 03:58:34 2018 -0400
@@ -1,4 +1,4 @@
-<tool name="CDS_search" id="cds_search" version="2.0.2">
+<tool name="CDS_search" id="cds_search" version="2.1.2">
 
 	<description>
 		ORF and CDS search
@@ -23,6 +23,7 @@
         python $__tool_directory__/scripts/S01_find_orf_on_multiple_alignment.py
         $__tool_directory__/scripts/code_universel_modified.txt
         $length.min_length_seq
+        $nb_species_keep
         list_files
         > '$log' &&
 
@@ -120,11 +121,10 @@
 
 	<tests>
 
-        <!-- tests with name "orthogroup_x_witn_y_sequences.fasta" -->
 		<test>
             <param name="inputs" ftype="fasta" value="inputs/orthogroup_1_with_4_sequences.fasta,inputs/orthogroup_6_with_4_sequences.fasta,inputs/orthogroup_7_with_3_sequences.fasta,inputs/orthogroup_8_with_4_sequences.fasta,inputs/orthogroup_12_with_5_sequences.fasta,inputs/orthogroup_14_with_4_sequences.fasta" />
-			<param name="nb_species_keep" value="2" />
-			<param name="methionine" value="no" />
+			<param name="nb_species_keep" value="3" />
+			<param name="methionine" value="non" />
             <section name="length">
                 <param name="min_length_seq" value="50" />
                 <param name="min_length_subseq" value="15" />
@@ -133,51 +133,34 @@
 			<param name="out_BESTORF" value="both" />
 			<param name="out_CDS" value="both" />
 			<param name="out_CDS_filter" value="both" />
-            <output_collection name="output_BESTORF_aa" type="list">
+            <output_collection name="output_BESTORF_aa" type="list" count="2">
                 <element name="orthogroup_1_with_3_species" value="outputs_ORF_Search_04_Best_ORF_aa/test1/orthogroup_1_with_3_species.fasta" />
-                <element name="orthogroup_6_with_2_species" value="outputs_ORF_Search_04_Best_ORF_aa/test1/orthogroup_6_with_2_species.fasta" />
                 <element name="orthogroup_7_with_3_species" value="outputs_ORF_Search_04_Best_ORF_aa/test1/orthogroup_7_with_3_species.fasta" />
-                <element name="orthogroup_14_with_2_species" value="outputs_ORF_Search_04_Best_ORF_aa/test1/orthogroup_14_with_2_species.fasta" />
             </output_collection>
-            <output_collection name="output_BESTORF_nuc" type="list">
+            <output_collection name="output_BESTORF_nuc" type="list" count="2">
                 <element name="orthogroup_1_with_3_species" value="outputs_ORF_Search_04_Best_ORF_nuc/test1/orthogroup_1_with_3_species.fasta" />
-                <element name="orthogroup_6_with_2_species" value="outputs_ORF_Search_04_Best_ORF_nuc/test1/orthogroup_6_with_2_species.fasta" />
                 <element name="orthogroup_7_with_3_species" value="outputs_ORF_Search_04_Best_ORF_nuc/test1/orthogroup_7_with_3_species.fasta" />
-                <element name="orthogroup_14_with_2_species" value="outputs_ORF_Search_04_Best_ORF_nuc/test1/orthogroup_14_with_2_species.fasta" />
             </output_collection>
-            <output_collection name="output_CDS_aa" type="list">
+            <output_collection name="output_CDS_aa" type="list" count="2">
                 <element name="orthogroup_1_with_3_species" value="outputs_ORF_Search_05_CDS_aa/test1/orthogroup_1_with_3_species.fasta" />
-                <element name="orthogroup_6_with_2_species" value="outputs_ORF_Search_05_CDS_aa/test1/orthogroup_6_with_2_species.fasta" />
                 <element name="orthogroup_7_with_3_species" value="outputs_ORF_Search_05_CDS_aa/test1/orthogroup_7_with_3_species.fasta" />
-                <element name="orthogroup_14_with_2_species" value="outputs_ORF_Search_05_CDS_aa/test1/orthogroup_14_with_2_species.fasta" />
             </output_collection>
-            <output_collection name="output_CDS_nuc" type="list">
+            <output_collection name="output_CDS_nuc" type="list" count="2">
                 <element name="orthogroup_1_with_3_species" value="outputs_ORF_Search_05_CDS_nuc/test1/orthogroup_1_with_3_species.fasta" />
-                <element name="orthogroup_6_with_2_species" value="outputs_ORF_Search_05_CDS_nuc/test1/orthogroup_6_with_2_species.fasta" />
                 <element name="orthogroup_7_with_3_species" value="outputs_ORF_Search_05_CDS_nuc/test1/orthogroup_7_with_3_species.fasta" />
-                <element name="orthogroup_14_with_2_species" value="outputs_ORF_Search_05_CDS_nuc/test1/orthogroup_14_with_2_species.fasta" />
             </output_collection>
-           
-            <output_collection name="output_filter_aa" type="list">
-                <element name="orthogroup_1_with_2_species" value="outputs_ORF_Search_08_CDS_without_indel_aa/test1/orthogroup_1_with_2_species.fasta" />
-                <element name="orthogroup_6_with_2_species" value="outputs_ORF_Search_08_CDS_without_indel_aa/test1/orthogroup_6_with_2_species.fasta" />
+            <output_collection name="output_filter_aa" type="list" count="1">
                 <element name="orthogroup_7_with_3_species" value="outputs_ORF_Search_08_CDS_without_indel_aa/test1/orthogroup_7_with_3_species.fasta" />
-                <element name="orthogroup_14_with_2_species" value="outputs_ORF_Search_08_CDS_without_indel_aa/test1/orthogroup_14_with_2_species.fasta" />
             </output_collection>
-            <output_collection name="output_filter_nuc" type="list">
-                <element name="orthogroup_1_with_2_species" value="outputs_ORF_Search_08_CDS_without_indel_nuc/test1/orthogroup_1_with_2_species.fasta" />
-                <element name="orthogroup_6_with_2_species" value="outputs_ORF_Search_08_CDS_without_indel_nuc/test1/orthogroup_6_with_2_species.fasta" />
+            <output_collection name="output_filter_nuc" type="list" count="1">
                 <element name="orthogroup_7_with_3_species" value="outputs_ORF_Search_08_CDS_without_indel_nuc/test1/orthogroup_7_with_3_species.fasta" />
-                <element name="orthogroup_14_with_2_species" value="outputs_ORF_Search_08_CDS_without_indel_nuc/test1/orthogroup_14_with_2_species.fasta" />
             </output_collection>
-            <output_collection name="output_filter_aa" type="list" count="4"/>
-            <output_collection name="output_filter_nuc" type="list" count="4"/>
 		</test>
-        
+
         <test>
             <param name="inputs" ftype="fasta" value="inputs/orthogroup_1_with_4_sequences.fasta,inputs/orthogroup_6_with_4_sequences.fasta,inputs/orthogroup_7_with_3_sequences.fasta,inputs/orthogroup_8_with_4_sequences.fasta,inputs/orthogroup_12_with_5_sequences.fasta,inputs/orthogroup_14_with_4_sequences.fasta" />
             <param name="nb_species_keep" value="2" />
-            <param name="methionine" value="no" />
+            <param name="methionine" value="oui" />
             <section name="length">
                 <param name="min_length_seq" value="50" />
                 <param name="min_length_subseq" value="15" />
@@ -186,25 +169,26 @@
             <param name="out_BESTORF" value="both" />
             <param name="out_CDS" value="both" />
             <param name="out_CDS_filter" value="both" />			
-            <output_collection name="output_BESTORF_aa" type="list">
+            <output_collection name="output_BESTORF_aa" type="list" count="4">
                 <element name="orthogroup_1_with_3_species" value="outputs_ORF_Search_04_Best_ORF_aa/test2/orthogroup_1_with_3_species.fasta" />
                 <element name="orthogroup_6_with_2_species" value="outputs_ORF_Search_04_Best_ORF_aa/test2/orthogroup_6_with_2_species.fasta" />
                 <element name="orthogroup_7_with_3_species" value="outputs_ORF_Search_04_Best_ORF_aa/test2/orthogroup_7_with_3_species.fasta" />
                 <element name="orthogroup_14_with_2_species" value="outputs_ORF_Search_04_Best_ORF_aa/test2/orthogroup_14_with_2_species.fasta" />
             </output_collection>
-            <output_collection name="output_BESTORF_nuc" type="list">
+            <output_collection name="output_BESTORF_nuc" type="list" count="4">
                 <element name="orthogroup_1_with_3_species" value="outputs_ORF_Search_04_Best_ORF_nuc/test2/orthogroup_1_with_3_species.fasta" />
                 <element name="orthogroup_6_with_2_species" value="outputs_ORF_Search_04_Best_ORF_nuc/test2/orthogroup_6_with_2_species.fasta" />
                 <element name="orthogroup_7_with_3_species" value="outputs_ORF_Search_04_Best_ORF_nuc/test2/orthogroup_7_with_3_species.fasta" />
                 <element name="orthogroup_14_with_2_species" value="outputs_ORF_Search_04_Best_ORF_nuc/test2/orthogroup_14_with_2_species.fasta" />
             </output_collection>            
-            <output_collection name="output_filter_aa" type="list">
-                <element name="orthogroup_14_with_2_species" value="outputs_ORF_Search_08_CDS_without_indel_aa/test1/orthogroup_14_with_2_species.fasta" />
+            <output_collection name="output_filter_aa" type="list" count="1">
+                <element name="orthogroup_14_with_2_species" value="outputs_ORF_Search_08_CDS_without_indel_aa/test2/orthogroup_14_with_2_species.fasta" />
             </output_collection>
-            <output_collection name="output_filter_nuc" type="list">
-                <element name="orthogroup_14_with_2_species" value="outputs_ORF_Search_08_CDS_without_indel_nuc/test1/orthogroup_14_with_2_species.fasta" />
+            <output_collection name="output_filter_nuc" type="list" count="1">
+                <element name="orthogroup_14_with_2_species" value="outputs_ORF_Search_08_CDS_without_indel_nuc/test2/orthogroup_14_with_2_species.fasta" />
             </output_collection>
 		</test>
+
 	</tests>
 	<help>
 
@@ -315,6 +299,8 @@
 
 	</help>
 
-	<expand macro="citations" />
+    <citations>
+
+    </citations>
 
 </tool>
--- a/macros.xml	Fri Jul 06 02:52:15 2018 -0400
+++ b/macros.xml	Mon Sep 24 03:58:34 2018 -0400
@@ -7,13 +7,14 @@
     <token name="@HELP_AUTHORS@">
 .. class:: infomark
 
-**Authors**  Eric Fontanillas creates the scripts of this pipeline.
+**Authors**  Eric Fontanillas created the version 1 of this pipeline. Victor Mataigne developped version 2.
 
 .. class:: infomark
 
-**Galaxy integration** ABiMS TEAM
+**Galaxy integration** Julie Baffard and ABiMS TEAM, Roscoff Marine Station
 
  | Contact support.abims@sb-roscoff.fr for any questions or concerns about the Galaxy implementation of this tool.
+ | Credits : Gildas le Corguillé, Misharl Monsoor
 
 ---------------------------------------------------
 
--- a/scripts/S01_find_orf_on_multiple_alignment.py	Fri Jul 06 02:52:15 2018 -0400
+++ b/scripts/S01_find_orf_on_multiple_alignment.py	Mon Sep 24 03:58:34 2018 -0400
@@ -1,24 +1,23 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 # coding: utf8
-## Author: Eric Fontanillas
-## Modification: 03/09/14 by Julie BAFFARD
-## Last modification : 05/03/18 by Victor Mataigne
+# Author: Eric Fontanillas
+# Modification: 03/09/14 by Julie BAFFARD
+# Last modification : 25/07/18 by Victor Mataigne
 
-## Description: Predict potential ORF on the basis of 2 criteria + 1 optional criteria
-                ## CRITERIA 1 ## Longest part of the alignment of sequence without codon stop "*", tested in the 3 potential ORF
-                ## CRITERIA 2 ## This longest part should be > 150nc or 50aa
-                ## CRITERIA 3 ## [OPTIONNAL] A codon start "M" should be present in this longuest part, before the last 50 aa
-                                 ## OUTPUTs "05_CDS_aa" & "05_CDS_nuc" => NOT INCLUDE THIS CRITERIA
-                                 ## OUTPUTs "06_CDS_with_M_aa" & "06_CDS_with_M_nuc" => INCLUDE THIS CRITERIA
+# Description: Predict potential ORF on the basis of 2 criteria + 1 optional criteria
+                # CRITERIA 1 - Longest part of the alignment of sequence without codon stop "*", tested in the 3 potential ORF
+                # CRITERIA 2 - This longest part should be > 150nc or 50aa
+                # CRITERIA 3 - [OPTIONNAL] A codon start "M" should be present in this longuest part, before the last 50 aa
+                                 # OUTPUTs "05_CDS_aa" & "05_CDS_nuc" => NOT INCLUDE THIS CRITERIA
+                                 # OUTPUTs "06_CDS_with_M_aa" & "06_CDS_with_M_nuc" => INCLUDE THIS CRITERIA
 
-####################################################
-###### DEF 2 : Create bash for genetic code ########
-####################################################
-###       KEY = codon
-###       VALUE = Amino Acid
+import string, os, time, re, zipfile, sys, argparse
+from dico import dico
 
 def code_universel(F1):
+    """ Creates bash for genetic code (key : codon ; value : amino-acid) """
     bash_codeUniversel = {}
+
     with open(F1, "r") as file:
         for line in file.readlines():
             L1 = string.split(line, " ")
@@ -31,110 +30,85 @@
                 key = L1[0]
                 value = L1[2]
                 bash_codeUniversel[key] = value
-    return(bash_codeUniversel)
-###########################################################
-
 
-######################################################################################################################
-##### DEF 3 : Test if the sequence is a multiple of 3, and if not correct the sequence to become a multiple of 3 #####
-######################################################################################################################
-### WEAKNESS OF THAT APPROACH = I remove extra base(s) at the end of the sequence ==> I can lost a codon, when I test ORF (as I will decay the ORF)
+    return(bash_codeUniversel)
+
 def multiple3(seq):
-    leng = len(seq)
-    modulo = leng%3
-    if modulo == 0:   # the results of dividing leng per 3 is an integer
-        new_seq = seq
-    elif modulo == 1:    # means 1 extra nc (nucleotid) needs to be removed (the remaining of modulo indicate the part which is non-dividable per 3)
-        new_seq = seq[:-1]  # remove the last nc
-    elif modulo == 2:  # means 2 extra nc (nucleotid) needs to be removed (the remaining of modulo indicate the part which is non-dividable per 3)
-        new_seq = seq[:-2]  # remove the 2 last nc
-    len1 = len(new_seq)
-    return(new_seq, modulo)
-##########################################################
+    """ Tests if the sequence is a multiple of 3, and if not removes extra-bases 
+        !! Possible to lost a codon, when I test ORF (as I will decay the ORF) """
+    
+    m = len(seq)%3
+    if m != 0 :
+        return seq[:-m], m
+    else :
+        return seq, m
 
+def detect_Methionine(seq_aa, Ortho, minimal_cds_length):
+    """ Detects if methionin in the aa sequence """
 
-#############################
-###### DEF 4 : GET ORF ######
-#############################
-##- MULTIPLE SEQUENCE BASED : Based on ALIGNMENT of several sequences
-##- CRITERIA1: Get the segment in the alignment with no codon stop
+    ln = len(seq_aa)
+    CUTOFF_Last_50aa = ln - minimal_cds_length
 
+    # Find all indices of occurances of "M" in a string of aa   
+    list_indices = [pos for pos, char in enumerate(seq_aa) if char == "M"]
 
-###### DEF 4 - Part 1 - ######
-##############################
-def simply_get_ORF(seq_dna, bash_codeUniversel):
-    seq_aa = ""
-    i = 0
-    len1 = len(seq_dna)
-    while i < len1:
-        base1 = seq_dna[i]
-        base1 = string.capitalize(base1)
-        base2 = seq_dna[i+1]
-        base2 = string.capitalize(base2)
-        base3 = seq_dna[i+2]
-        base3 = string.capitalize(base3)
+    # If some "M" are present, find whether the first "M" found is not in the 50 last aa (indice < CUTOFF_Last_50aa) ==> in this case: maybenot a CDS
+    if list_indices != []:
+        first_M = list_indices[0]
+        if first_M < CUTOFF_Last_50aa:
+            Ortho = 1  # means orthologs found
 
-        codon = base1+base2+base3
-        codon = string.replace(codon, "T", "U")
+    return(Ortho)
+
+def ReverseComplement2(seq):
+    """ Reverse complement DNA sequence """
+    seq1 = 'ATCGN-TAGCN-atcgn-tagcn-'
+    seq_dict = { seq1[i]:seq1[i+6] for i in range(24) if i < 6 or 12<=i<=16 }
 
-        if codon in bash_codeUniversel.keys():
-            aa = bash_codeUniversel[codon]
-            seq_aa = seq_aa + aa
-        else:
-            seq_aa = seq_aa +"?"    ### Take account for gap "-" and "N"
-        i = i + 3
+    return "".join([seq_dict[base] for base in reversed(seq)])
+
+def simply_get_ORF(seq_dna, gen_code):
+    seq_by_codons = [seq_dna.upper().replace('T', 'U')[i:i+3] for i in range(0, len(seq_dna), 3)]
+    seq_by_aa = [gen_code[codon] if codon in gen_code.keys() else '?' for codon in seq_by_codons]
 
-    return(seq_aa)
-##########################################################
-
+    return ''.join(seq_by_aa)
 
-###### DEF 4 - Part 2 - ######
-##############################
-def find_good_ORF_criteria_3(bash_aligned_nc_seq, bash_codeUniversel):
+def find_good_ORF_criteria_3(bash_aligned_nc_seq, bash_codeUniversel, minimal_cds_length, min_spec):
+    # Multiple sequence based : Based on the alignment of several sequences (orthogroup)
+    # Criteria 1 : Get the segment in the alignment with no codon stop
 
-    ## 1 ## Get the list of aligned aa seq for the 3 ORF:
+    # 1 - Get the list of aligned aa seq for the 3 ORF:
     bash_of_aligned_aa_seq_3ORF = {}
     bash_of_aligned_nuc_seq_3ORF = {}
     BEST_LONGUEST_SUBSEQUENCE_LIST_POSITION = []
+    
     for fasta_name in bash_aligned_nc_seq.keys():
-        ## 1.1. ## Get the raw sequence
+        # Get sequence, chek if multiple 3, then get 6 orfs
         sequence_nc = bash_aligned_nc_seq[fasta_name]
-
-        ## 1.2. ## Check whether the sequence is multiple of 3, and correct it if not:
-        new_sequence_nc, modulo = multiple3(sequence_nc)                                  ### DEF 3 ###
-
-        ## 1.3. ## Get the 3 ORFs (nuc) for each sequence
-        seq_nuc_ORF1 = new_sequence_nc
-        seq_nuc_ORF2 = new_sequence_nc[1:-2]
-        seq_nuc_ORF3 = new_sequence_nc[2:-1]
-        seq_reversed=ReverseComplement2(seq_nuc_ORF1)
-        seq_nuc_ORF4=seq_reversed
-        seq_nuc_ORF5=seq_reversed[1:-2]
-        seq_nuc_ORF6=seq_reversed[2:-1]
+        new_sequence_nc, modulo = multiple3(sequence_nc)
+        new_sequence_rev = ReverseComplement2(new_sequence_nc)        
+        # For each seq of the multialignment => give the 6 ORFs (in nuc)
+        bash_of_aligned_nuc_seq_3ORF[fasta_name] = [new_sequence_nc, new_sequence_nc[1:-2], new_sequence_nc[2:-1], new_sequence_rev, new_sequence_rev[1:-2], new_sequence_rev[2:-1]]
 
-        LIST_6_ORF_nuc = [seq_nuc_ORF1, seq_nuc_ORF2, seq_nuc_ORF3,seq_nuc_ORF4,seq_nuc_ORF5,seq_nuc_ORF6]
-        bash_of_aligned_nuc_seq_3ORF[fasta_name] = LIST_6_ORF_nuc     ### For each seq of the multialignment => give the 6 ORFs (in nuc)
+        seq_prot_ORF1 = simply_get_ORF(new_sequence_nc, bash_codeUniversel)
+        seq_prot_ORF2 = simply_get_ORF(new_sequence_nc[1:-2], bash_codeUniversel)
+        seq_prot_ORF3 = simply_get_ORF(new_sequence_nc[2:-1], bash_codeUniversel)
+        seq_prot_ORF4 = simply_get_ORF(new_sequence_rev, bash_codeUniversel)
+        seq_prot_ORF5 = simply_get_ORF(new_sequence_rev[1:-2], bash_codeUniversel)
+        seq_prot_ORF6 = simply_get_ORF(new_sequence_rev[2:-1], bash_codeUniversel)
+        
+        # For each seq of the multialignment => give the 6 ORFs (in aa)
+        bash_of_aligned_aa_seq_3ORF[fasta_name] = [seq_prot_ORF1, seq_prot_ORF2, seq_prot_ORF3, seq_prot_ORF4, seq_prot_ORF5, seq_prot_ORF6]
 
-        ## 1.4. ## Get the 3 ORFs (aa) for each sequence
-        seq_prot_ORF1 = simply_get_ORF(seq_nuc_ORF1,bash_codeUniversel)                ### DEF 4 - Part 1 - ##
-        seq_prot_ORF2 = simply_get_ORF(seq_nuc_ORF2,bash_codeUniversel)                ### DEF 4 - Part 1 - ##
-        seq_prot_ORF3 = simply_get_ORF(seq_nuc_ORF3,bash_codeUniversel)                ### DEF 4 - Part 1 - ##
-        seq_prot_ORF4 = simply_get_ORF(seq_nuc_ORF4,bash_codeUniversel)                ### DEF 4 - Part 1 - ##
-        seq_prot_ORF5 = simply_get_ORF(seq_nuc_ORF5,bash_codeUniversel)                ### DEF 4 - Part 1 - ##
-        seq_prot_ORF6 = simply_get_ORF(seq_nuc_ORF6,bash_codeUniversel)                ### DEF 4 - Part 1 - ##
+    # 2 - Test for the best ORF (Get the longuest segment in the alignment with no codon stop ... for each ORF ... the longuest should give the ORF)
+    BEST_MAX = 0
 
-        LIST_6_ORF_aa = [seq_prot_ORF1, seq_prot_ORF2, seq_prot_ORF3,seq_prot_ORF4,seq_prot_ORF5,seq_prot_ORF6]
-        bash_of_aligned_aa_seq_3ORF[fasta_name] = LIST_6_ORF_aa     ### For each seq of the multialignment => give the 6 ORFs (in aa)
-
-    ## 2 ## Test for the best ORF (Get the longuest segment in the alignment with no codon stop ... for each ORF ... the longuest should give the ORF)
-    BEST_MAX = 0
-    for i in [0,1,2,3,4,5]:   ### Test the 6 ORFs
+    for i in [0,1,2,3,4,5]: # Test the 6 ORFs
         ORF_Aligned_aa = []
         ORF_Aligned_nuc = []
 
-
-        ## 2.1 ## Get the alignment of sequence for a given ORF
-        ## Compare the 1rst ORF between all sequence => list them in ORF_Aligned_aa // them do the same for the second ORF, and them the 3rd
+        # 2.1 - Get the alignment of sequence for a given ORF
+        # Compare the 1rst ORF between all sequence => list them in ORF_Aligned_aa // them do the same for the second ORF, and them the 3rd
         for fasta_name in bash_of_aligned_aa_seq_3ORF.keys():
             ORFsequence = bash_of_aligned_aa_seq_3ORF[fasta_name][i]
             aa_length = len(ORFsequence)
@@ -145,12 +119,12 @@
         for fasta_name in bash_of_aligned_nuc_seq_3ORF.keys():
             ORFsequence = bash_of_aligned_nuc_seq_3ORF[fasta_name][i]
             nuc_length = len(ORFsequence)
-            ORF_Aligned_nuc.append(ORFsequence)   ### List of all sequences in the ORF nb "i" =
+            ORF_Aligned_nuc.append(ORFsequence) # List of all sequences in the ORF nb "i" =
 
-        ## 2.2 ## Get the list of sublist of positions whithout codon stop in the alignment
-        ## For each ORF, now we have the list of sequences available (i.e. THE ALIGNMENT IN A GIVEN ORF)
-        ## Next step is to get the longuest subsequence whithout stop
-        ## We will explore the presence of stop "*" in each column of the alignment, and get the positions of the segments between the positions with "*"
+        # 2.2 - Get the list of sublist of positions whithout codon stop in the alignment
+        # For each ORF, now we have the list of sequences available (i.e. THE ALIGNMENT IN A GIVEN ORF)
+        # Next step is to get the longuest subsequence whithout stop
+        # We will explore the presence of stop "*" in each column of the alignment, and get the positions of the segments between the positions with "*"
         MAX_LENGTH = 0
         LONGUEST_SEGMENT_UNSTOPPED = ""
         j = 0 # Start from first position in alignment
@@ -162,20 +136,20 @@
                     column.append(seq[j])
                 j = j+1
                 if "*" in column:
-                    List_of_List_subsequences.append(List_positions_subsequence) ## Add previous list of positions
-                    List_positions_subsequence = []                              ## Re-initialyse list of positions
+                    List_of_List_subsequences.append(List_positions_subsequence) # Add previous list of positions
+                    List_positions_subsequence = []                              # Re-initialyse list of positions
                 else:
                     List_positions_subsequence.append(j)
 
-        ## 2.3 ## Among all the sublists (separated by column with codon stop "*"), get the longuest one (BETTER SEGMENT for a given ORF)
+        # 2.3 - Among all the sublists (separated by column with codon stop "*"), get the longuest one (BETTER SEGMENT for a given ORF)
         LONGUEST_SUBSEQUENCE_LIST_POSITION = []
         MAX=0
         for sublist in List_of_List_subsequences:
-            if len(sublist) > MAX and len(sublist) > MINIMAL_CDS_LENGTH:
+            if len(sublist) > MAX and len(sublist) > minimal_cds_length:
                 MAX = len(sublist)
                 LONGUEST_SUBSEQUENCE_LIST_POSITION = sublist
 
-        ## 2.4. ## Test if the longuest subsequence start exactly at the beginning of the original sequence (i.e. means the ORF maybe truncated)
+        # 2.4. - Test if the longuest subsequence start exactly at the beginning of the original sequence (i.e. means the ORF maybe truncated)
         if LONGUEST_SUBSEQUENCE_LIST_POSITION != []:
             if LONGUEST_SUBSEQUENCE_LIST_POSITION[0] == 0:
                 CDS_maybe_truncated = 1
@@ -185,17 +159,17 @@
             CDS_maybe_truncated = 0
 
 
-        ## 2.5 ## Test if this BETTER SEGMENT for a given ORF, is the better than the one for the other ORF (GET THE BEST ORF)
-        ## Test whether it is the better ORF
+        # 2.5 - Test if this BETTER SEGMENT for a given ORF, is the better than the one for the other ORF (GET THE BEST ORF)
+        # Test whether it is the better ORF
         if MAX > BEST_MAX:
             BEST_MAX = MAX
             BEST_ORF = i+1
             BEST_LONGUEST_SUBSEQUENCE_LIST_POSITION = LONGUEST_SUBSEQUENCE_LIST_POSITION
 
 
-    ## 3 ## ONCE we have this better segment (BEST CODING SEGMENT)
-    ## ==> GET THE STARTING and ENDING POSITIONS (in aa position and in nuc position)
-    ## And get the INDEX of the best ORF [0, 1, or 2]
+    # 3 - ONCE we have this better segment (BEST CODING SEGMENT)
+    # ==> GET THE STARTING and ENDING POSITIONS (in aa position and in nuc position)
+    # And get the INDEX of the best ORF [0, 1, or 2]
     if BEST_LONGUEST_SUBSEQUENCE_LIST_POSITION != []:
         pos_MIN_aa = BEST_LONGUEST_SUBSEQUENCE_LIST_POSITION[0]
         pos_MIN_aa = pos_MIN_aa - 1
@@ -205,13 +179,13 @@
         BESTORF_bash_of_aligned_aa_seq = {}
         BESTORF_bash_of_aligned_aa_seq_CODING = {}
         for fasta_name in bash_of_aligned_aa_seq_3ORF.keys():
-            index_BEST_ORF = BEST_ORF-1  ### cause list going from 0 to 2 in LIST_3_ORF, while the ORF nb is indexed from 1 to 3
+            index_BEST_ORF = BEST_ORF-1  # cause list going from 0 to 2 in LIST_3_ORF, while the ORF nb is indexed from 1 to 3
             seq = bash_of_aligned_aa_seq_3ORF[fasta_name][index_BEST_ORF]
             seq_coding = seq[pos_MIN_aa:pos_MAX_aa]
             BESTORF_bash_of_aligned_aa_seq[fasta_name] = seq
             BESTORF_bash_of_aligned_aa_seq_CODING[fasta_name] = seq_coding
 
-        ## 4 ## Get the corresponding position (START/END of BEST CODING SEGMENT) for nucleotides alignment
+        # 4 - Get the corresponding position (START/END of BEST CODING SEGMENT) for nucleotides alignment
         pos_MIN_nuc = pos_MIN_aa * 3
         pos_MAX_nuc = pos_MAX_aa * 3
 
@@ -223,224 +197,122 @@
             BESTORF_bash_aligned_nc_seq[fasta_name] = seq
             BESTORF_bash_aligned_nc_seq_CODING[fasta_name] = seq_coding
 
-    else: ### no CDS found ###
+    else: # no CDS found
         BESTORF_bash_aligned_nc_seq = {}
         BESTORF_bash_aligned_nc_seq_CODING = {}
         BESTORF_bash_of_aligned_aa_seq = {}
         BESTORF_bash_of_aligned_aa_seq_CODING ={}
 
-
-
-    ### Check whether their is a "M" or not, and if at least 1 "M" is present, that it is not in the last 50 aa
-    ###########################################################################################################
-
+    # Check whether their is a "M" or not, and if at least 1 "M" is present, that it is not in the last 50 aa
+    
     BESTORF_bash_of_aligned_aa_seq_CDS_with_M = {}
     BESTORF_bash_of_aligned_nuc_seq_CDS_with_M = {}
 
     Ortho = 0
     for fasta_name in BESTORF_bash_of_aligned_aa_seq_CODING.keys():
         seq_aa = BESTORF_bash_of_aligned_aa_seq_CODING[fasta_name]
-        Ortho = detect_Methionine(seq_aa, Ortho)                                ### DEF6 ###
+        Ortho = detect_Methionine(seq_aa, Ortho, minimal_cds_length)   ### DEF6 ###
 
-    ## CASE 1: A "M" is present and correctly localized (not in last 50 aa)
+    # CASE 1: A "M" is present and correctly localized (not in last 50 aa)
     if Ortho == 1:
         BESTORF_bash_of_aligned_aa_seq_CDS_with_M = BESTORF_bash_of_aligned_aa_seq_CODING
         BESTORF_bash_of_aligned_nuc_seq_CDS_with_M = BESTORF_bash_aligned_nc_seq_CODING
 
-    ## CASE 2: in case the CDS is truncated, so the "M" is maybe missing:
+    # CASE 2: in case the CDS is truncated, so the "M" is maybe missing:
     if Ortho == 0 and CDS_maybe_truncated == 1:
         BESTORF_bash_of_aligned_aa_seq_CDS_with_M = BESTORF_bash_of_aligned_aa_seq_CODING
         BESTORF_bash_of_aligned_nuc_seq_CDS_with_M = BESTORF_bash_aligned_nc_seq_CODING
 
-    ## CASE 3: CDS not truncated AND no "M" found in good position (i.e. before the last 50 aa):
+    # CASE 3: CDS not truncated AND no "M" found in good position (i.e. before the last 50 aa):
         ## => the 2 bash "CDS_with_M" are left empty ("{}")
 
     return(BESTORF_bash_aligned_nc_seq,  BESTORF_bash_aligned_nc_seq_CODING, BESTORF_bash_of_aligned_nuc_seq_CDS_with_M, BESTORF_bash_of_aligned_aa_seq, BESTORF_bash_of_aligned_aa_seq_CODING, BESTORF_bash_of_aligned_aa_seq_CDS_with_M)
-##########################################################
 
-
-##################################################################################################
-###### DEF 5 : Detect all indices corresponding to all occurance of a substring in a string ######
-##################################################################################################
-def allindices(string, sub):
-    listindex=[]
-    offset=0
-    i = string.find(sub, offset)
-    while i >= 0:
-        listindex.append(i)
-        i = string.find(sub, i + 1)
-    return listindex
-######################################################
-
+def write_output_file(results_dict, name_elems, path_out):
+    if results_dict != {}:
+        name_elems[3] = str(len(results_dict.keys()))
+        new_name = "_".join(name_elems)
 
-############################################################
-###### DEF 6 : Detect if methionin in the aa sequence ######
-############################################################
-def detect_Methionine(seq_aa, Ortho):
-
-    ln = len(seq_aa)
-    nbre = sys.argv[2]
-    CUTOFF_Last_50aa = ln - MINIMAL_CDS_LENGTH
-    #Ortho = 0  ## means orthologs not found
+        out1 = open("%s/%s" %(path_out,new_name), "w")
+        for fasta_name in results_dict.keys():
+            seq = results_dict[fasta_name]
+            out1.write("%s\n" %fasta_name)
+            out1.write("%s\n" %seq)
+        out1.close()
 
-    ## Find all indices of occurances of "M" in a string of aa
-    list_indices = allindices(seq_aa, "M")                  ### DEF5 ###
-
-    ## If some "M" are present, find whether the first "M" found is not in the 50 last aa (indice < CUTOFF_Last_50aa) ==> in this case: maybenot a CDS
-    if list_indices != []:
-        first_M = list_indices[0]
-        if first_M < CUTOFF_Last_50aa:
-            Ortho = 1  ## means orthologs found
-
-    return(Ortho)
-###################################
-
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("codeUniversel", help="File describing the genetic code (code_universel_modified.txt")
+    parser.add_argument("min_cds_len", help="Minmal length of a CDS (in amino-acids)", type=int)    
+    parser.add_argument("min_spec", help="Minimal number of species per alignment")
+    parser.add_argument("list_files", help="File with all input files names")
+    args = parser.parse_args()
 
-############################################################
-###### DEF 7 : Reverse complement DNA sequence ######
-###### Reference: http://crazyhottommy.blogspot.fr/2013/10/python-code-for-getting-reverse.html
-############################################################
-def ReverseComplement2(seq):
-    # too lazy to construct the dictionary manually, use a dict comprehension
-    seq1 = 'ATCGN-TAGCN-atcgn-tagcn-'
-    seq_dict = { seq1[i]:seq1[i+6] for i in range(24) if i < 6 or 12<=i<=16 }
-    return "".join([seq_dict[base] for base in reversed(seq)])
-############################
-
-
-#######################
-##### RUN RUN RUN #####
-#######################
-import string, os, time, re, zipfile, sys
-from dico import dico
-
-MINIMAL_CDS_LENGTH = int(sys.argv[2])  ## in aa number
-
-### Get Universal Code
-bash_codeUniversel = code_universel(sys.argv[1])  ### DEF2 ###
+    minimal_cds_length = int(args.min_cds_len)  # in aa number
+    bash_codeUniversel = code_universel(args.codeUniversel)
+    minimum_species = int(args.min_spec)
+    
+    # Inputs from file containing list of species
+    list_files = []
+    with open(args.list_files, 'r') as f:
+        for line in f.readlines():
+            list_files.append(line.strip('\n'))
 
-## INPUT from file containing list of species
-list_files = []
-with open(sys.argv[3], 'r') as f:
-    for line in f.readlines():
-        list_files.append(line.strip('\n'))
-
-os.mkdir("04_BEST_ORF_nuc")
-Path_OUT1 = "04_BEST_ORF_nuc"
-os.mkdir("04_BEST_ORF_aa")
-Path_OUT2 = "04_BEST_ORF_aa"
+    # Directories for results
+    dirs = ["04_BEST_ORF_nuc", "04_BEST_ORF_aa", "05_CDS_nuc", "05_CDS_aa", "06_CDS_with_M_nuc", "06_CDS_with_M_aa"]
+    for directory in dirs:
+        os.mkdir(directory)
 
-os.mkdir("05_CDS_nuc")
-Path_OUT3 = "05_CDS_nuc"
-os.mkdir("05_CDS_aa")
-Path_OUT4 = "05_CDS_aa"
+    count_file_processed, count_file_with_CDS, count_file_without_CDS, count_file_with_CDS_plus_M = 0, 0, 0, 0
+    count_file_with_cds_and_enought_species, count_file_with_cds_M_and_enought_species = 0, 0
 
-os.mkdir("06_CDS_with_M_nuc")
-Path_OUT5 = "06_CDS_with_M_nuc"
-os.mkdir("06_CDS_with_M_aa")
-Path_OUT6 = "06_CDS_with_M_aa"
-
-### Get the Bash corresponding to an alignment file in fasta format
-count_file_processed = 0
-count_file_with_CDS = 0
-count_file_without_CDS = 0
-count_file_with_CDS_plus_M = 0
-
-# ! : Currently, files are named "Orthogroup_x_y_sequences.fasta, where x is the number of the orthogroup (not important, juste here to make a distinct name),
-# and y is the number of sequences/species in the group. These files are outputs of blastalign, where species can be removed. y is then modified.
+    # ! : Currently, files are named "Orthogroup_x_y_sequences.fasta, where x is the number of the orthogroup (not important, juste here to make a distinct name),
+    # and y is the number of sequences/species in the group. These files are outputs of blastalign, where species can be removed. y is then modified.
+    name_elems = ["orthogroup", "0", "with", "0", "species.fasta"]
 
-name_elems = ["orthogroup", "0", "with", "0", "species.fasta"]
-
-# by fixing the counter here, there will be some "holes" in the outputs directories (missing numbers), but the groups between directories will correspond
-#n0 = 0
-
-for file in list_files:
-    #n0 += 1
+    # by fixing the counter here, there will be some "holes" in the outputs directories (missing numbers), but the groups between directories will correspond
+    #n0 = 0
 
-    count_file_processed = count_file_processed + 1
-    nb_gp = file.split('_')[1] # Keep trace of the orthogroup number
-    fasta_file_path = "./%s" %file    
-    bash_fasta = dico(fasta_file_path)   ### DEF 1 ###    
-    BESTORF_nuc, BESTORF_nuc_CODING, BESTORF_nuc_CDS_with_M, BESTORF_aa, BESTORF_aa_CODING, BESTORF_aa_CDS_with_M  = find_good_ORF_criteria_3(bash_fasta, bash_codeUniversel)   ### DEF 4 - PART 2 - ###
-    
-    name_elems[1] = nb_gp
+    for file in list_files:
+        #n0 += 1
 
-    ## a ## OUTPUT BESTORF_nuc
-    if BESTORF_nuc != {}:
-        name_elems[3] = str(len(BESTORF_nuc.keys()))
-        new_name = "_".join(name_elems)
-        count_file_with_CDS = count_file_with_CDS +1
-        OUT1 = open("%s/%s" %(Path_OUT1,new_name), "w")
-        for fasta_name in BESTORF_nuc.keys():
-            seq = BESTORF_nuc[fasta_name]
-            OUT1.write("%s\n" %fasta_name)
-            OUT1.write("%s\n" %seq)
-        OUT1.close()
-    else:
-        count_file_without_CDS = count_file_without_CDS + 1
-
-    ## b ## OUTPUT BESTORF_nuc_CODING  ===> THE MOST INTERESTING!!!
-    if BESTORF_aa != {}:
-        name_elems[3] = str(len(BESTORF_aa.keys()))
-        new_name = "_".join(name_elems)
-        OUT2 = open("%s/%s" %(Path_OUT2,new_name), "w")
-        for fasta_name in BESTORF_aa.keys():
-            seq = BESTORF_aa[fasta_name]
-            OUT2.write("%s\n" %fasta_name)
-            OUT2.write("%s\n" %seq)
-        OUT2.close()
+        count_file_processed = count_file_processed + 1
+        nb_gp = file.split('_')[1] # Keep trace of the orthogroup number
+        fasta_file_path = "./%s" %file    
+        bash_fasta = dico(fasta_file_path)
+        BESTORF_nuc, BESTORF_nuc_CODING, BESTORF_nuc_CDS_with_M, BESTORF_aa, BESTORF_aa_CODING, BESTORF_aa_CDS_with_M  = find_good_ORF_criteria_3(bash_fasta, bash_codeUniversel, minimal_cds_length, minimum_species)
+        
+        name_elems[1] = nb_gp
 
-    ## c ## OUTPUT BESTORF_aa
-    if BESTORF_nuc_CODING != {}:
-        name_elems[3] = str(len(BESTORF_nuc_CODING.keys()))
-        new_name = "_".join(name_elems)
-        OUT3 = open("%s/%s" %(Path_OUT3,new_name), "w")
-        for fasta_name in BESTORF_nuc_CODING.keys():
-            seq = BESTORF_nuc_CODING[fasta_name]
-            OUT3.write("%s\n" %fasta_name)
-            OUT3.write("%s\n" %seq)
-        OUT3.close()
+        # Update counts and write group in corresponding output directory
+        if BESTORF_nuc != {}: 
+            count_file_with_CDS += 1
+            if len(BESTORF_nuc.keys()) >= minimum_species :
+                count_file_with_cds_and_enought_species += 1
+                write_output_file(BESTORF_nuc, name_elems, dirs[0]) # OUTPUT BESTORF_nuc
+                write_output_file(BESTORF_aa, name_elems, dirs[1]) # The most interesting
+        else:
+            count_file_without_CDS += 1
 
-    ## d ## OUTPUT BESTORF_aa_CODING
-    if BESTORF_aa_CODING != {}:
-        name_elems[3] = str(len(BESTORF_aa_CODING.keys()))
-        new_name = "_".join(name_elems)
-        OUT4 = open("%s/%s" %(Path_OUT4,new_name), "w")
-        for fasta_name in BESTORF_aa_CODING.keys():
-            seq = BESTORF_aa_CODING[fasta_name]
-            OUT4.write("%s\n" %fasta_name)
-            OUT4.write("%s\n" %seq)
-        OUT4.close()
+        if BESTORF_nuc_CODING != {} and len(BESTORF_nuc_CODING.keys()) >= minimum_species:
+            write_output_file(BESTORF_nuc_CODING, name_elems, dirs[2])
+            write_output_file(BESTORF_aa_CODING, name_elems, dirs[3])
 
-    ## e ## OUTPUT BESTORF_nuc_CDS_with_M
-    if BESTORF_nuc_CDS_with_M != {}:
-        name_elems[3] = str(len(BESTORF_nuc_CDS_with_M.keys()))
-        new_name = "_".join(name_elems)
-        count_file_with_CDS_plus_M = count_file_with_CDS_plus_M + 1
-        OUT5 = open("%s/%s" %(Path_OUT5,new_name), "w")
-        for fasta_name in BESTORF_nuc_CDS_with_M.keys():
-            seq = BESTORF_nuc_CDS_with_M[fasta_name]
-            OUT5.write("%s\n" %fasta_name)
-            OUT5.write("%s\n" %seq)
-        OUT5.close()
+        if BESTORF_nuc_CDS_with_M != {}:
+            count_file_with_CDS_plus_M += 1
+            if len(BESTORF_nuc_CDS_with_M.keys()) >= minimum_species :
+                count_file_with_cds_M_and_enought_species += 1
+                write_output_file(BESTORF_nuc_CDS_with_M, name_elems, dirs[4])
+                write_output_file(BESTORF_aa_CDS_with_M, name_elems, dirs[5])
 
-    ## f ## OUTPUT BESTORF_aa_CDS_with_M
-    if BESTORF_aa_CDS_with_M != {}:
-        name_elems[3] = str(len(BESTORF_aa_CDS_with_M.keys()))
-        new_name = "_".join(name_elems)
-        OUT6 = open("%s/%s" %(Path_OUT6,new_name), "w")
-        for fasta_name in BESTORF_aa_CDS_with_M.keys():
-            seq = BESTORF_aa_CDS_with_M[fasta_name]
-            OUT6.write("%s\n" %fasta_name)
-            OUT6.write("%s\n" %seq)
-        OUT6.close()
+    print "*************** CDS detection ***************"
+    print "\nFiles processed: %d" %count_file_processed
+    print "\tFiles with CDS: %d" %count_file_with_CDS
+    print "\tFiles wth CDS and more than %s species: %d" %(minimum_species, count_file_with_cds_and_enought_species)
+    print "\t\tFiles with CDS plus M (codon start): %d" %count_file_with_CDS_plus_M
+    print "\t\tFiles with CDS plus M (codon start) and more than %s species: %d" %(minimum_species,count_file_with_cds_M_and_enought_species) 
+    print "\tFiles without CDS: %d \n" %count_file_without_CDS
+    print ""
 
-    #os.system("rm -rf %s" %file)
-
-## Print
-print "*************** CDS detection ***************"
-print "\nFiles processed: %d" %count_file_processed
-print "\tFiles with CDS: %d" %count_file_with_CDS
-print "\t\tFiles with CDS plus M (codon start): %d" %count_file_with_CDS_plus_M
-print "\tFiles without CDS: %d \n" %count_file_without_CDS
-print ""
+if __name__ == '__main__':
+    main()
--- a/scripts/S02_remove_too_short_bit_or_whole_sequence.py	Fri Jul 06 02:52:15 2018 -0400
+++ b/scripts/S02_remove_too_short_bit_or_whole_sequence.py	Mon Sep 24 03:58:34 2018 -0400
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 # coding: utf8
 ## Author: Eric Fontanillas
 ## Modification: 03/09/14 by Julie BAFFARD
@@ -48,7 +48,6 @@
 MAX_LENGTH_SMALL_INDEL = 2      ## in aa
 MAX_LENGTH_SMALL_INDEL_nuc = 6  ## in nuc
 MIN_SPECIES_NB = int(sys.argv[1])
-MAX_sp = MIN_SPECIES_NB
 dicoco = {}
 dico_dico = {}
 list_new_file = []
--- a/scripts/S03_remove_site_with_not_enough_species_represented.py	Fri Jul 06 02:52:15 2018 -0400
+++ b/scripts/S03_remove_site_with_not_enough_species_represented.py	Mon Sep 24 03:58:34 2018 -0400
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 # coding: utf8
 ## Author: Eric Fontanillas
 ## Modification: 03/09/14 by Julie BAFFARD
@@ -100,7 +100,6 @@
 
 ### 0 ### PARAMETERS
 MIN_SPECIES_NB = int(sys.argv[1])
-MAX_sp = MIN_SPECIES_NB
 MIN_LENGTH_FINAL_ALIGNMENT_NUC = int(sys.argv[2])
 n0 = 0
 bad = 0
--- a/test-data/outputs_ORF_Search_04_Best_ORF_aa/test1/orthogroup_14_with_2_species.fasta	Fri Jul 06 02:52:15 2018 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,4 +0,0 @@
->Ac6688_1/1_1.000_963
-RVAAMDTCQTRIRLSVYPANSVWPSADHARDIHWGGSALALVLITSGRNSSTTTLPSRSQIFMLGPEAAQSQYLLGLKHSELMTSPPSNVYRCLPSFKSHNIAWPSLPPEAHKDPSGDTVTQFR*PECPKWFVFSLQFVRFQTLTSLSQPQDTMMGF*VFGEKRTHDTHSVCPSS*MVYLHTPRVFHNLMVLSREPDTIWRLSAEKATLSTSFVCPTNLRVV*PVPRSHKRRVPSQDPDRANCPSDDITTSDTKCPCPRKALRGKP*RDSSRVSCHRMSDLSLEADKIISGNCGVVAI*VTHPPCPWRVPRSVICSVMFL
->Ap1491_1/1_1.000_963
-RVAAIDTCHTRIRLSVYPANSVWPSADQARDIH*GGSALALVLITSGRSSSTTTLPSRSQIFMLGPEAAQSQYLLGLKHSELMTSPPSNVYRCLPSFKSHNIAWPSFPPEAHKDPSGDTVTQFR*PECPKWFVFSLQFVRFHTLTSLSQPQDTMMGFWVFGENRTHDTHSVCPSS*MVYLHTPRVFHNLMVLSREPDTI*RLSAEKATLSTSFVCPTNLRVV*PVPRSHKRRVPSQDPDRANCPSDDMTTSDTKCPCPRKALRGKP*RDSSRVSCHRMSDLSLEADKMISGNCGVVAIWVTHPPCPWRVPRSVICSVMFL
--- a/test-data/outputs_ORF_Search_04_Best_ORF_aa/test1/orthogroup_6_with_2_species.fasta	Fri Jul 06 02:52:15 2018 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,4 +0,0 @@
->Ap5072_1/1_1.000_437
-PNRCHCSRS*HFAAS*RRSGIRCS*SVRCGPCRFRGRSERARLD*GRCSSPGRWR*SRRCPSFRLCFCPPARKPAAWRPLCGRHPGLDPSRSGILEGIFSCRRLYQTV*IGP*AVCP*PCSPWPSGPFSAP*RS
->Ac1013_1/1_1.000_525
-PSRCRCSQS*HFAAS*RRSRIRRS*SVRCGPCRWRGRS*RARLDQGRCSSLGKWR*PRRCPSFRLCFCPPARKPAAWLPLCGRHPGLGPSRSSILEGLFSYRHLYQTV*IEP*VVCP*PCSLWP*GLFSAP*RS
--- a/test-data/outputs_ORF_Search_04_Best_ORF_nuc/test1/orthogroup_14_with_2_species.fasta	Fri Jul 06 02:52:15 2018 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,4 +0,0 @@
->Ac6688_1/1_1.000_963
-cgggtagctgccatggacacctgccagacacggatcaggttgtctgtgtatcctgcaaacagcgtctggccatcagctgaccatgccagggatatacactggggtggttcagcgctggcactggtactgatcacttctggacgcaactcatccacaacaaccttgccttccagatcccagatctttatgcttggtccagaggcagcacaaagccaatatctgttggggctgaagcacagtgagttgatgacatcaccaccatccaatgtgtacagatgcttgccttcattcaaatcccacaacattgcctggccatccttaccaccagaagcacacaaagatccatcaggggacacggtgacacagttcaggtaacctgagtgtccaaagtggtttgtttttagtttgcagtttgtcagattccaaaccttaaccagtttgtcccagccacaagacacaatgatgggattctgagtgtttggtgagaagcgaacacatgatacccactctgtgtgtccatcttcctgaatggtgtacttgcatacaccgagagtgttccacaacttgatggtcttgtcacgtgaacctgacacaatctggcggttatcagctgagaaggccacgcttagcacatcttttgtgtgtccgacaaacctacgagttgtctgaccagtgccaagatcccacaaacgaagggttccatcccaggatccagacagagcgaactgtccatctgatgacataacaacgtcagacacgaaatgtccatgtccacgcaaggccttgcgagggaaaccgtagcgcgattcctcacgagtcagctgccacaggatgagcgatttgtctcttgaagccgacaaaataatatcgggaaattgtggcgttgtagcaatttgagttacccatcctccgtgcccttggagggtaccgcgaagcgtcatttgctccgtcatgtttctt
->Ap1491_1/1_1.000_963
-cgggtagctgccatagatacctgccacacacgaatcaggttgtctgtgtatccagcaaacagtgtctggccatcagctgaccaagccagggatatacactgaggtggctcggcactggcactggtgctgatcacttctggacgcagctcatccacaacaaccttgccttccagatcccagatctttatgcttggtccagaagcagcacaaagccagtatctgttggggctgaagcacagtgagttgatgacatcaccaccatccaatgtgtacagatgcttgccttcattcaaatcccataacattgcctggccatcttttccaccagaagcgcacaaagatccatcaggggacacagtgacacagttcagataacctgagtgtccgaagtggtttgtttttagcttgcagtttgtcagattccacaccttaaccagtttgtcccagccacaggacacaatgatgggattctgggtgtttggtgagaatcgaacacatgatacccactctgtgtgcccatcttcctgaatggtatacttgcacaccccaagagtgttccacaacttgatggtcttgtcacgtgaacctgacacaatctgacggttatcagctgagaaagccacacttagcacgtccttcgtgtgtccaacaaacctacgagttgtctgaccagtgccaagatcccacaaacgaagggttccatcccaggatccagacagggcgaactgtccatctgatgacatgacgacgtcagacacgaagtgtccatgtccgcgcaaggccttgcgagggaagccgtaacgcgattcctcgcgagtcagctgccacagaatgagcgatttgtctctagaagccgacaaaatgatatcaggaaattgtggcgttgtagcaatttgggttacccatcctccgtgcccttggagggtaccgcgaagcgtcatttgctccgtcatgtttctt
--- a/test-data/outputs_ORF_Search_04_Best_ORF_nuc/test1/orthogroup_6_with_2_species.fasta	Fri Jul 06 02:52:15 2018 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,4 +0,0 @@
->Ap5072_1/1_1.000_437
-cccaatcgctgtcactgctctcgaagttgacattttgctgcttcatgaagaaggtcaggaattcgttgtagttgatccgtccgctgtggtccttgtcgatttcgtggacgatctgaaagagctcgtctggattgaggaaggtgttcatctcccggaagatggcgttgatctcgccgatgtccaagcttccgtctttgtttttgtccgcccgctcgaaagcctgccgcatggcggcctctttgtggtagacatccgggacttgacccatcgcgatcaggtattcttgaagggatattttcttgtcgccgtctttatcaaactgtttgaattgggccttgagctgtttgcccttgaccttgtagcccttggccttcagggcctttttcagctccttgaaggtca
->Ac1013_1/1_1.000_525
-cccagtcgctgtcgctgctctcaaagttgacattttgctgcttcatgaagaaggtcaagaattcgtcgtagttgatccgtccgctgtggtccttgtcgatggcgtggacgatcttgaagagctcgtctggatcaaggaaggtgttcatctcttggaaaatggcgttaacctcgccgatgtccaagcttccgtctttgtttttgtccgcccgctcgaaagcctgccgcatggctgcctctttgtggtaggcatccgggacttggcccatcgcgatcaagtattcttgaagggttattttcttatcgccatctttatcaaactgtttgaattgagccttgagttgtttgcccttgaccttgtagcctttggccttgagggcttttttcagctccttgaaggtca
--- a/test-data/outputs_ORF_Search_05_CDS_aa/test1/orthogroup_14_with_2_species.fasta	Fri Jul 06 02:52:15 2018 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,4 +0,0 @@
->Ac6688_1/1_1.000_963
-GGSALALVLITSGRNSSTTTLPSRSQIFMLGPEAAQSQYLLGLKHSELMTSPPSNVYRCLPSFKSHNIAWPSLPPEAHKDPSGDTVTQFR
->Ap1491_1/1_1.000_963
-GGSALALVLITSGRSSSTTTLPSRSQIFMLGPEAAQSQYLLGLKHSELMTSPPSNVYRCLPSFKSHNIAWPSFPPEAHKDPSGDTVTQFR
--- a/test-data/outputs_ORF_Search_05_CDS_aa/test1/orthogroup_6_with_2_species.fasta	Fri Jul 06 02:52:15 2018 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,4 +0,0 @@
->Ap5072_1/1_1.000_437
-SRRCPSFRLCFCPPARKPAAWRPLCGRHPGLDPSRSGILEGIFSCRRLYQTV
->Ac1013_1/1_1.000_525
-PRRCPSFRLCFCPPARKPAAWLPLCGRHPGLGPSRSSILEGLFSYRHLYQTV
--- a/test-data/outputs_ORF_Search_05_CDS_nuc/test1/orthogroup_14_with_2_species.fasta	Fri Jul 06 02:52:15 2018 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,4 +0,0 @@
->Ac6688_1/1_1.000_963
-ggtggttcagcgctggcactggtactgatcacttctggacgcaactcatccacaacaaccttgccttccagatcccagatctttatgcttggtccagaggcagcacaaagccaatatctgttggggctgaagcacagtgagttgatgacatcaccaccatccaatgtgtacagatgcttgccttcattcaaatcccacaacattgcctggccatccttaccaccagaagcacacaaagatccatcaggggacacggtgacacagttcagg
->Ap1491_1/1_1.000_963
-ggtggctcggcactggcactggtgctgatcacttctggacgcagctcatccacaacaaccttgccttccagatcccagatctttatgcttggtccagaagcagcacaaagccagtatctgttggggctgaagcacagtgagttgatgacatcaccaccatccaatgtgtacagatgcttgccttcattcaaatcccataacattgcctggccatcttttccaccagaagcgcacaaagatccatcaggggacacagtgacacagttcaga
--- a/test-data/outputs_ORF_Search_05_CDS_nuc/test1/orthogroup_6_with_2_species.fasta	Fri Jul 06 02:52:15 2018 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,4 +0,0 @@
->Ap5072_1/1_1.000_437
-tctcgccgatgtccaagcttccgtctttgtttttgtccgcccgctcgaaagcctgccgcatggcggcctctttgtggtagacatccgggacttgacccatcgcgatcaggtattcttgaagggatattttcttgtcgccgtctttatcaaactgtt
->Ac1013_1/1_1.000_525
-cctcgccgatgtccaagcttccgtctttgtttttgtccgcccgctcgaaagcctgccgcatggctgcctctttgtggtaggcatccgggacttggcccatcgcgatcaagtattcttgaagggttattttcttatcgccatctttatcaaactgtt
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/outputs_ORF_Search_08_CDS_without_indel_aa/test1/old_orthogroup_7_with_3_species.fasta	Mon Sep 24 03:58:34 2018 -0400
@@ -0,0 +1,6 @@
+>Am3527_1/1_1.000_270
+-------------------VSVVDVKWHVRSRYSKLLPFLALSDHTCGKTT----------------
+>Ap5050_1/1_1.000_243
+EIWHSQARTKKEDEPRHQFVSVVHVERHVRSGKTPLLPFFTPSNHTCGKSTSKLIKTKSSLQHFHSF
+>Ac2173_1/1_1.000_330
+EIWHSQARTKKEDEPSHQLVSVVHIERHVRSGKTPLLPFFTSPNHTCGKSTSKLVKTKSSLQHFQRF
--- a/test-data/outputs_ORF_Search_08_CDS_without_indel_aa/test1/orthogroup_14_with_2_species.fasta	Fri Jul 06 02:52:15 2018 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,4 +0,0 @@
->Ac6688_1/1_1.000_963
-GGSALALVLITSGRNSSTTTLPSRSQIFMLGPEAAQSQYLLGLKHSELMTSPPSNVYRCLPSFKSHNIAWPSLPPEAHKDPSGDTVTQFR
->Ap1491_1/1_1.000_963
-GGSALALVLITSGRSSSTTTLPSRSQIFMLGPEAAQSQYLLGLKHSELMTSPPSNVYRCLPSFKSHNIAWPSFPPEAHKDPSGDTVTQFR
--- a/test-data/outputs_ORF_Search_08_CDS_without_indel_aa/test1/orthogroup_1_with_2_species.fasta	Fri Jul 06 02:52:15 2018 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,4 +0,0 @@
->Ap2303_1/1_1.000_424
-FNLRYKIIWICVHFSTVYCLFIWIHSLLLDNFLINRISEFHFSVKWVLLVILN
->Ac3644_1/1_1.000_1626
-FNLRYKIIWICVHFSTVNCLFIWIHSLLLNNFLINSISEFHFSVKWVLLVIFN
--- a/test-data/outputs_ORF_Search_08_CDS_without_indel_aa/test1/orthogroup_6_with_2_species.fasta	Fri Jul 06 02:52:15 2018 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,4 +0,0 @@
->Ap5072_1/1_1.000_437
-SRRCPSFRLCFCPPARKPAAWRPLCGRHPGLDPSRSGILEGIFSCRRLYQTV
->Ac1013_1/1_1.000_525
-PRRCPSFRLCFCPPARKPAAWLPLCGRHPGLGPSRSSILEGLFSYRHLYQTV
--- a/test-data/outputs_ORF_Search_08_CDS_without_indel_aa/test1/orthogroup_7_with_3_species.fasta	Fri Jul 06 02:52:15 2018 -0400
+++ b/test-data/outputs_ORF_Search_08_CDS_without_indel_aa/test1/orthogroup_7_with_3_species.fasta	Mon Sep 24 03:58:34 2018 -0400
@@ -1,6 +1,6 @@
 >Am3527_1/1_1.000_270
--------------------VSVVDVKWHVRSRYSKLLPFLALSDHTCGKTT----------------
+VSVVDVKWHVRSRYSKLLPFLALSDHTCGKTT
 >Ap5050_1/1_1.000_243
-EIWHSQARTKKEDEPRHQFVSVVHVERHVRSGKTPLLPFFTPSNHTCGKSTSKLIKTKSSLQHFHSF
+VSVVHVERHVRSGKTPLLPFFTPSNHTCGKST
 >Ac2173_1/1_1.000_330
-EIWHSQARTKKEDEPSHQLVSVVHIERHVRSGKTPLLPFFTSPNHTCGKSTSKLVKTKSSLQHFQRF
+VSVVHIERHVRSGKTPLLPFFTSPNHTCGKST
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/outputs_ORF_Search_08_CDS_without_indel_nuc/test1/old_orthogroup_7_with_3_species.fasta	Mon Sep 24 03:58:34 2018 -0400
@@ -0,0 +1,6 @@
+>Am3527_1/1_1.000_270
+---------------------------------------------------------gtatctgttgttgatgtcaaatggcatgttcgatccaggtactccaagctgttgccattcttggctttgtctgaccacacttgtggaaaaacgacg------------------------------------------------
+>Ap5050_1/1_1.000_243
+gagatatggcactcccaagccagaaccaaaaaagaggatgaaccacgccatcaatttgtatcggttgttcatgtcgaaaggcatgttagatccgggaagacccctctgttgccattcttcactccgtcgaaccacacttgtggaaaatcgacgagtaagttgattaagacgaagagctctctgcaacattttcacagcttc
+>Ac2173_1/1_1.000_330
+gagatatggcactcccaagccagaaccaaaaaagaggatgaaccaagccatcaacttgtatcggttgttcatatcgaaaggcatgttagatccgggaagacccctttgttgccattcttcacttcgccgaaccacacttgtggaaaatcgacgagtaagctggttaagacgaagagctctctgcaacattttcaacgcttc
--- a/test-data/outputs_ORF_Search_08_CDS_without_indel_nuc/test1/orthogroup_14_with_2_species.fasta	Fri Jul 06 02:52:15 2018 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,4 +0,0 @@
->Ac6688_1/1_1.000_963
-ggtggttcagcgctggcactggtactgatcacttctggacgcaactcatccacaacaaccttgccttccagatcccagatctttatgcttggtccagaggcagcacaaagccaatatctgttggggctgaagcacagtgagttgatgacatcaccaccatccaatgtgtacagatgcttgccttcattcaaatcccacaacattgcctggccatccttaccaccagaagcacacaaagatccatcaggggacacggtgacacagttcagg
->Ap1491_1/1_1.000_963
-ggtggctcggcactggcactggtgctgatcacttctggacgcagctcatccacaacaaccttgccttccagatcccagatctttatgcttggtccagaagcagcacaaagccagtatctgttggggctgaagcacagtgagttgatgacatcaccaccatccaatgtgtacagatgcttgccttcattcaaatcccataacattgcctggccatcttttccaccagaagcgcacaaagatccatcaggggacacagtgacacagttcaga
--- a/test-data/outputs_ORF_Search_08_CDS_without_indel_nuc/test1/orthogroup_1_with_2_species.fasta	Fri Jul 06 02:52:15 2018 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,4 +0,0 @@
->Ap2303_1/1_1.000_424
-tttaatcttagatacaaaatcatttggatttgtgtacatttctccactgtatattgcctgttcatctggatccattccttgttgctggacaacttcctcattaaccgcatatctgagtttcatttttctgtaaagtgggttcttcttgtaatacttaac
->Ac3644_1/1_1.000_1626
-tttaatcttagatacaaaatcatttggatttgtgtacatttctccactgtaaattgcctgttcatctggatccattccttgttgttgaacaacttcctcattaacagcatatctgagtttcatttttctgtaaagtgggttcttcttgtaatatttaac
--- a/test-data/outputs_ORF_Search_08_CDS_without_indel_nuc/test1/orthogroup_6_with_2_species.fasta	Fri Jul 06 02:52:15 2018 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,4 +0,0 @@
->Ap5072_1/1_1.000_437
-tctcgccgatgtccaagcttccgtctttgtttttgtccgcccgctcgaaagcctgccgcatggcggcctctttgtggtagacatccgggacttgacccatcgcgatcaggtattcttgaagggatattttcttgtcgccgtctttatcaaactgtt
->Ac1013_1/1_1.000_525
-cctcgccgatgtccaagcttccgtctttgtttttgtccgcccgctcgaaagcctgccgcatggctgcctctttgtggtaggcatccgggacttggcccatcgcgatcaagtattcttgaagggttattttcttatcgccatctttatcaaactgtt
--- a/test-data/outputs_ORF_Search_08_CDS_without_indel_nuc/test1/orthogroup_7_with_3_species.fasta	Fri Jul 06 02:52:15 2018 -0400
+++ b/test-data/outputs_ORF_Search_08_CDS_without_indel_nuc/test1/orthogroup_7_with_3_species.fasta	Mon Sep 24 03:58:34 2018 -0400
@@ -1,6 +1,6 @@
 >Am3527_1/1_1.000_270
----------------------------------------------------------gtatctgttgttgatgtcaaatggcatgttcgatccaggtactccaagctgttgccattcttggctttgtctgaccacacttgtggaaaaacgacg------------------------------------------------
+gtatctgttgttgatgtcaaatggcatgttcgatccaggtactccaagctgttgccattcttggctttgtctgaccacacttgtggaaaaacgacg
 >Ap5050_1/1_1.000_243
-gagatatggcactcccaagccagaaccaaaaaagaggatgaaccacgccatcaatttgtatcggttgttcatgtcgaaaggcatgttagatccgggaagacccctctgttgccattcttcactccgtcgaaccacacttgtggaaaatcgacgagtaagttgattaagacgaagagctctctgcaacattttcacagcttc
+gtatcggttgttcatgtcgaaaggcatgttagatccgggaagacccctctgttgccattcttcactccgtcgaaccacacttgtggaaaatcgacg
 >Ac2173_1/1_1.000_330
-gagatatggcactcccaagccagaaccaaaaaagaggatgaaccaagccatcaacttgtatcggttgttcatatcgaaaggcatgttagatccgggaagacccctttgttgccattcttcacttcgccgaaccacacttgtggaaaatcgacgagtaagctggttaagacgaagagctctctgcaacattttcaacgcttc
+gtatcggttgttcatatcgaaaggcatgttagatccgggaagacccctttgttgccattcttcacttcgccgaaccacacttgtggaaaatcgacg