changeset 3:ff98ed7849fa draft

planemo upload for repository https://github.com/abims-sbr/adaptsearch commit cf1b9c905931ca2ca25faa4844d45c908756472f
author abims-sbr
date Wed, 17 Jan 2018 08:55:29 -0500
parents 0d2f72caea10
children 0450307b2ffb
files CDS_search.xml scripts/S01_find_orf_on_multiple_alignment.py scripts/S02_remove_too_short_bit_or_whole_sequence.py scripts/S03_remove_site_with_not_enough_species_represented.py scripts/dico.py
diffstat 5 files changed, 87 insertions(+), 386 deletions(-) [+]
line wrap: on
line diff
--- a/CDS_search.xml	Wed Sep 27 10:03:05 2017 -0400
+++ b/CDS_search.xml	Wed Jan 17 08:55:29 2018 -0500
@@ -19,6 +19,8 @@
             #set $infiles = $infiles + $input.element_identifier + ","
         #end for
         #set $infiles = $infiles[:-1]
+        
+        ln -s $__tool_directory__/scripts/dico.py . &&
 
         python $__tool_directory__/scripts/S01_find_orf_on_multiple_alignment.py
         $infiles
@@ -184,353 +186,97 @@
 	</tests>
 	<help>
 
-@HELP_AUTHORS@
-
-============
-What it does
-============
-
-| This tool takes **'dataset collection list' containing fasta files with nucleic aligned sequences** and search the ORF and the CDS
-|
-
---------
-
-==========
-Parameters
-==========
-
-The choice of several parameters is possible.
-
-**min_length_seq**
-	| minimal length of the sequence in the proteic format
-	| when the removal of the indel is done, the minimal length equals :previous length less20
-	| for example if you choose 50 for the minimal length, the actual length equals 30
-	|
-
-**min_length_subseq**
-	| minimal length of the subsequence in the proteic format
-	| subsequence means the part of the original sequence between 2 sets of indels
-	| an indel set is composed by more than 2 indels, if not the set is considered as unknown amino acid
-	|
+@HELPAUTHORS@
 
-**min_length_nuc**
-	| Minimal length of the sequence in the nucleic format
-	|
-
---------
-
-======
-Inputs
-======
-
-option **universal code** :
-
-| the input must have the extension .txt
-| It's the file which will serve for the translation of nucleotides to amino acids
-| if there are "U"s in this file, they will be automatically transform into "T"s
-| for example :
-| UUU Phe F
-| UCU Ser S
-| UAU Tyr Y
-| UGU Cys C
-| UUC Phe F
-| UCC Ser S
-| UAC Tyr Y
-| UGC Cys C
-| UUA Leu L
-| UCA Ser S
-| UAA Stop *
-| ...
-
+<![CDATA[
 
 --------
 
-=======
-Outputs
-=======
-
-This tool, produces the following files :
-
-**ORF_Search**
-	| is the output with important informations (mainly statistics about the tools).
-	|
-
-**ORF_Search_Best_ORF_aa**
-	| is the output with the best ORF in the proteic format.
-	|
-
-**ORF_Search_Best_ORF_nuc**
-	| is the output with the best ORF in the nucleic format.
-	|
-
-**ORF_Search_CDS_aa**
-	| is the output with the CDS (regardless the Methionine) in the proteic format.
-	|
+**Description**
 
-**ORF_Search_CDS_nuc**
-	| is the output with the CDS (regardless the Methionine) in the nucleic format.
-	|
-
-**ORF_Search_CDS_with_M_aa**
-	| is the output with the CDS (considering the Methionine) in proteic format.
-	| the rule : they must have a methionine before the minimale length of the sequence.
-	| for example before the 30 last amino acid.
-	|
-
-**ORF_Search_CDS_with_M_nuc**
-	| is the output with the CDS (considering the Methionine) in nucleic format.
-	| the rule : they must have a methionine before the minimale length of the sequence.
-	| for example before the 30 last amino acid.
-	|
-
-**ORF_Search_CDS_without_indel_aa**
-	| is the output with the CDS without indel in proteic format.
-	| considering the Methionine or not : according to the option chosen.
-	|
-
-**ORF_Search_CDS_without_indel_nuc**
-	| is the output with the CDS without indel in proteic format.
-	| considering the Methionine or not : according to the option chosen.
-
+This tool takes files containing aligned nucleic sequences and searches for the ORF and the CDS.
 
 --------
 
-===============
-Working Example
-===============
-
-------------------------------
-The input file and its options
-------------------------------
-
-**ORF_Search**
-
-| a 'dataset collection list' containing 47 files with 1, 2 or 3 sequences inside
-| for example the file : locus5_sp3.fasta which contains 3 species :
-|
-
-| &gt;Ac533/40375
+**Inputs**
 
-----------gccccctagcgagtgacgacaaactcg----------------------------------------aacatgttggctgctgggctaagaactctaaaaagccttgcccctcggggttgcgtagcgtggtcgtgtacttcggtgcatgccaagcataccctaccagacttaccatatgattacaatgccctggagccacacatcagtgctgaaatcatgctgctgcatca
-caccaagcatcaccagacgtatgtcaacaacctgaatgttgcagaggagaagtttcatgaggctacagagaaaggtgatgtaaccacagcagtatcactgatgccagccctaagatttaatggtggtggacacatcaaccatactatattttggaagaacatgtcaccaaatggtggtggagagccatctggcgaactgatggaggccatcaaacgtgac
-tttggctcatttgaaaacatgaagaacatgttgagtacatcaaccactgcagtgcaaggctctggctggggatggcttgggtacaataaaaagatgaagaaactagaaattgccacctgtgccaaccaagatccattagagggcactacaggtctcgtcccgttgtttggcatagatgtctgggaacatgcctattacttgcagtacaagaatgttcgtccagact
-atgtaaaggctatttggaatgtggccaactgggatgacatcatggaacgttacaacaatgccagaaaataaactgttaaacaaataattaatatattaatgtgttgcaatt-tttgtcaattggtacatacacaattttgttcataaagaaaat--tgtgattactttctg-gcaactagttcccagtgaatacagcaattgttctgccaaggatgcatttggattgagaaggc
-acccagaatgggttatccgtgaatgtctattggaatgtggcaccatacgatgtttactgtattagttacaattaaaa-------------------
+Input files : (multiple) fasta files with nucleic aligned sequences.
 
-&gt;Ap401/11000
-ctgatgtgtggccccctagcgagtgaagacaaactcgatagcacccagacagttctgttggttagataaaagggagaaacatgctggctgctgggctaagaactctaaaaagccttgctcctcgtggtggtctagcttggtcttgtacctcggtacatgccaaacacacactgccagacttgccgtatgattataatgctctggagccacacatcagtgctgaaa
-tcatgttgctgcatcacacaaaacatcaccagacgtatgtgaacaacctgaatattgcagaggagaagtttcatgaggctaccgagaaaggcgatgtgaccacagcagtatcactgatgccagccctaagatttaatggtggtggacatatcaaccatactatattttggaagaacatgtcaccaaatggtggtggagaaccatctggcgaactgatggag
-gccatcaaacgtgactttggttcatttgaaaacatgaagaacatgctgagtacagcaaccactgcagtacaaggctctggctggggatggcttgggtacaataaaaagatgaagaaactagaaattgccacctgtgccaaccaagatccattagagggcactacaggtcttgtcccactgtttggtatcgatgtctgggaacatgcctattatttacagtacaa
-gaatgttcgtccagactatgtaaaggctatttggaatgtggctaactgggatgatatcatggagcgttacaacaatgccagaaaataaac--ttaaatacatcattatttagttaatgtgcgacttttgtttgttaatcagttcacacaccattctattcacaaagaaaatggtgtatttgctttctgttcaactggttcccggtgaatacagcagttgttctgccaaggatgtattt
-ggattgagaaggcaccaagaatgggctgtcactgaatgtcaattggaatgtagcctca----atgtttactgtattacctacaattaaaatgattatgatataaccaag
-
-| &gt;Pf2011/1100
-
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------tgccagatttaccatatgattataatgccttggaaccacacatcagtgctgaaataatgctccttcatcatacaaagcatcaccagacatatgtgaacaatctgaatgt
-agctgaagaaaagtttcatgaagccacggagaaaggtgatgtcactacagctgtgtcgctaatgccagcactaagatttaatggcggaggacacatcaatcacaccattttctggaagaacatgtctcctaatggcggaggagagccttctggcgagttgatggaagccattaaacgtgattttggttcatttgagaatatgaaaaacatgttaagtacagcta
-caacagctgtccaaggatctggctggggatggcttggttataacaaaaagatgaaaaagctcgagatagccacttgtgccaaccaggatccactggaaggaacaacaggattaattccactgtttggtattgacgtctgggagcatgcttactatctgcaatataaaaatgtacgtccagattatgttaaagctatctggaatgtggccaactgggatgatatta
-cagagcgctacaacaatgc-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+--------
 
 **Parameters**
 
-| option : Methionine is considerated
-| no option for the CDS's length. So, by default it's min_length_seq : 50, min_length_subseq : 15, min_length_nuc : 50
+    - methionine : choose to consider the methionine in the search of CDS.
+        yes/no.
 
-----------------
-The output files
-----------------
-
-
-**ORF_Search_Best_ORF_aa**
+    - 'Minimal number of species in each locus'        
+        Default : 10 (integer).
 
-|
-| *************** CDS detection ***************
-|
-| Files processed: 47
-|
-| Files with CDS: 32
-| Files with CDS plus M (codon start): 20
-| Files without CDS: 15
-|
-| In locus with CDS considering Methionine :
-|
-| *************** 1st filter : selection of the locus ***************
-|
-
-Total number of locus recorded  = 20
-
-| Number of locus with 1 species : 1
-| Number of locus with 2 species : 16
-| Number of locus with 3 species : 3
-|
-| Number of locus excluded (exclude if not at least 3 species in the alignment)= 17
-| 
-| *************** 2nd Filter : removal of the indel ***************
-| 
-| Total number of locus recorded  = 3
-|
-
-Total number of locus with no indels (SAVED) = 3
-
-Total number of locus with indels (EXCLUDED) = 0
-
-|
-
-**ORF_Search_Best_ORF_aa**
-
-| &gt;Pf2011/1100
+    - 'min_length_seq' :
+        minimal length of the sequence (in amino acids).        
+        when the removal of the indel is done, the minimal length equals : previous length - 20.
+        for example if you choose 50 for the minimal length, the actual length equals 30.
+        Default : 50 (integer).
 
-?????????????????????????????????????????????????????????PDLPYDYNALEPHISAEIMLLHHTKHHQTYVNNLNVAEEKFHEATEKGDVTTAVSLMPALRFNGGGHINHTIFWKNMSPNGGGEPSGELMEAIKRDFGSFENMKNMLSTATTAVQ
-GSGWGWLGYNKKMKKLEIATCANQDPLEGTTGLIPLFGIDVWEHAYYLQYKNVRPDYVKAIWNVANWDDITERYNN??????????????????????????????????????????????????????????????????????????????????????????
-
-| &gt;Ap401/11000
-
-DVWPPSE*RQTR*HPDSSVG*IKGRNMLAAGLRTLKSLAPRGGLAWSCTSVHAKHTLPDLPYDYNALEPHISAEIMLLHHTKHHQTYVNNLNIAEEKFHEATEKGDVTTAVSLMPALRFNGGGHINHTIFWKNMSPNGGGEPSGELMEAIKRDFGSFENMKNMLS
-TATTAVQGSGWGWLGYNKKMKKLEIATCANQDPLEGTTGLVPLFGIDVWEHAYYLQYKNVRPDYVKAIWNVANWDDIMERYNNARK*??KYIII*LMCDFCLLISSHTILFTKKMVYLLSVQLVPGEYSSCSAKDVFGLRRHQEWAVTECQLECSL??CLLYYLQLK*
-L*YNQ
-
-| &gt;Ac533/40375
+    - 'min_length_subseq' :
+        minimal length of the subsequence (in amino acids).
+        subsequence means the part of the original sequence between 2 sets of indels.
+        an indel set is composed of more than 2 indels; otherwise the set is considered as an unknown amino acid.
+        Default : 15 (integer).
 
-???PPSE*RQT??????????????NMLAAGLRTLKSLAPRGCVAWSCTSVHAKHTLPDLPYDYNALEPHISAEIMLLHHTKHHQTYVNNLNVAEEKFHEATEKGDVTTAVSLMPALRFNGGGHINHTIFWKNMSPNGGGEPSGELMEAIKRDFGSFENMKNMLSTS
-TTAVQGSGWGWLGYNKKMKKLEIATCANQDPLEGTTGLVPLFGIDVWEHAYYLQYKNVRPDYVKAIWNVANWDDIMERYNNARK*TVKQIINILMCCN?LSIGTYTILFIKK??*LLS?QLVPSEYSNCSAKDAFGLRRHPEWVIRECLLECGTIRCLLY*LQLK??????
-
-.. class:: infomark
-
-| for example : locus5_sp3.fasta
-| 
-
-**ORF_Search_Best_ORF_nuc**
-
-| &gt;Pf2011/1100
+    - 'min_length_nuc' :
+        Minimal length of the sequence in the nucleic format, without indels.
+        Default : 50 (integer).
 
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------tgccagatttaccatatgattataatgccttggaaccacacatcagtgctgaaataatgctccttcatcatacaaagcatcaccagacatatgtgaacaatctgaatgta
-gctgaagaaaagtttcatgaagccacggagaaaggtgatgtcactacagctgtgtcgctaatgccagcactaagatttaatggcggaggacacatcaatcacaccattttctggaagaacatgtctcctaatggcggaggagagccttctggcgagttgatggaagccattaaacgtgattttggttcatttgagaatatgaaaaacatgttaagtacagctac
-aacagctgtccaaggatctggctggggatggcttggttataacaaaaagatgaaaaagctcgagatagccacttgtgccaaccaggatccactggaaggaacaacaggattaattccactgtttggtattgacgtctgggagcatgcttactatctgcaatataaaaatgtacgtccagattatgttaaagctatctggaatgtggccaactgggatgatattac
-agagcgctacaacaatgc----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
-
-| &gt;Ap401/11000
-
-gatgtgtggccccctagcgagtgaagacaaactcgatagcacccagacagttctgttggttagataaaagggagaaacatgctggctgctgggctaagaactctaaaaagccttgctcctcgtggtggtctagcttggtcttgtacctcggtacatgccaaacacacactgccagacttgccgtatgattataatgctctggagccacacatcagtgctgaaatc
-atgttgctgcatcacacaaaacatcaccagacgtatgtgaacaacctgaatattgcagaggagaagtttcatgaggctaccgagaaaggcgatgtgaccacagcagtatcactgatgccagccctaagatttaatggtggtggacatatcaaccatactatattttggaagaacatgtcaccaaatggtggtggagaaccatctggcgaactgatggaggc
-catcaaacgtgactttggttcatttgaaaacatgaagaacatgctgagtacagcaaccactgcagtacaaggctctggctggggatggcttgggtacaataaaaagatgaagaaactagaaattgccacctgtgccaaccaagatccattagagggcactacaggtcttgtcccactgtttggtatcgatgtctgggaacatgcctattatttacagtacaagaa
-tgttcgtccagactatgtaaaggctatttggaatgtggctaactgggatgatatcatggagcgttacaacaatgccagaaaataaac--ttaaatacatcattatttagttaatgtgcgacttttgtttgttaatcagttcacacaccattctattcacaaagaaaatggtgtatttgctttctgttcaactggttcccggtgaatacagcagttgttctgccaaggatgtatttggat
-tgagaaggcaccaagaatgggctgtcactgaatgtcaattggaatgtagcctca----atgtttactgtattacctacaattaaaatgattatgatataaccaa
+    - other parameters let you choose which outputs you want :
+        - outputs with best ORFs.
+        - outputs with CDS, with or without indels.
+        - in proteic or nucleic format.
 
-| &gt;Ac533/40375
+--------
 
---------gccccctagcgagtgacgacaaactcg----------------------------------------aacatgttggctgctgggctaagaactctaaaaagccttgcccctcggggttgcgtagcgtggtcgtgtacttcggtgcatgccaagcataccctaccagacttaccatatgattacaatgccctggagccacacatcagtgctgaaatcatgctgctgcatcac
-accaagcatcaccagacgtatgtcaacaacctgaatgttgcagaggagaagtttcatgaggctacagagaaaggtgatgtaaccacagcagtatcactgatgccagccctaagatttaatggtggtggacacatcaaccatactatattttggaagaacatgtcaccaaatggtggtggagagccatctggcgaactgatggaggccatcaaacgtgact
-ttggctcatttgaaaacatgaagaacatgttgagtacatcaaccactgcagtgcaaggctctggctggggatggcttgggtacaataaaaagatgaagaaactagaaattgccacctgtgccaaccaagatccattagagggcactacaggtctcgtcccgttgtttggcatagatgtctgggaacatgcctattacttgcagtacaagaatgttcgtccagacta
-tgtaaaggctatttggaatgtggccaactgggatgacatcatggaacgttacaacaatgccagaaaataaactgttaaacaaataattaatatattaatgtgttgcaatt-tttgtcaattggtacatacacaattttgttcataaagaaaat--tgtgattactttctg-gcaactagttcccagtgaatacagcaattgttctgccaaggatgcatttggattgagaaggca
-cccagaatgggttatccgtgaatgtctattggaatgtggcaccatacgatgtttactgtattagttacaattaaaa------------------
+**Outputs**
 
-.. class:: infomark
-
-| for example : locus5_sp3.fasta
-|
-
-**ORF_Search_CDS_with_M_aa**
+    - ORF_Search
+        the log file (mainly statistics about the tool).
 
-| &gt;Pf2011/1100
-
-????????????????????????????????????PDLPYDYNALEPHISAEIMLLHHTKHHQTYVNNLNVAEEKFHEATEKGDVTTAVSLMPALRFNGGGHINHTIFWKNMSPNGGGEPSGELMEAIKRDFGSFENMKNMLSTATTAVQGSGWGWLGYNKKMKK
-LEIATCANQDPLEGTTGLIPLFGIDVWEHAYYLQYKNVRPDYVKAIWNVANWDDITERYNN???
-
-| &gt;Ap401/11000
-
-IKGRNMLAAGLRTLKSLAPRGGLAWSCTSVHAKHTLPDLPYDYNALEPHISAEIMLLHHTKHHQTYVNNLNIAEEKFHEATEKGDVTTAVSLMPALRFNGGGHINHTIFWKNMSPNGGGEPSGELMEAIKRDFGSFENMKNMLSTATTAVQGSGWGWLGYNK
-KMKKLEIATCANQDPLEGTTGLVPLFGIDVWEHAYYLQYKNVRPDYVKAIWNVANWDDIMERYNNARK
+    - ORF_Search_Best_ORF_aa
+        the output with the best ORF in the proteic format.
 
-| &gt;Ac533/40375
-
-????NMLAAGLRTLKSLAPRGCVAWSCTSVHAKHTLPDLPYDYNALEPHISAEIMLLHHTKHHQTYVNNLNVAEEKFHEATEKGDVTTAVSLMPALRFNGGGHINHTIFWKNMSPNGGGEPSGELMEAIKRDFGSFENMKNMLSTSTTAVQGSGWGWLGYNK
-KMKKLEIATCANQDPLEGTTGLVPLFGIDVWEHAYYLQYKNVRPDYVKAIWNVANWDDIMERYNNARK
-
-.. class:: infomark
-
-| for example : locus5_sp3.fasta
-
-| It's the same for the option : regardless Methionine
-|
+    - ORF_Search_Best_ORF_nuc
+        the output with the best ORF in the nucleic format.
 
-**ORF_Search_CDS_with_M_nuc**
-
-| &gt;Pf2011/1100
-
-----------------------------------------------------------------------------------------------------------tgccagatttaccatatgattataatgccttggaaccacacatcagtgctgaaataatgctccttcatcatacaaagcatcaccagacatatgtgaacaatctgaatgtagctgaagaaaagtttcatgaagccacggagaaaggtgatg
-tcactacagctgtgtcgctaatgccagcactaagatttaatggcggaggacacatcaatcacaccattttctggaagaacatgtctcctaatggcggaggagagccttctggcgagttgatggaagccattaaacgtgattttggttcatttgagaatatgaaaaacatgttaagtacagctacaacagctgtccaaggatctggctggggatggcttggttataa
-caaaaagatgaaaaagctcgagatagccacttgtgccaaccaggatccactggaaggaacaacaggattaattccactgtttggtattgacgtctgggagcatgcttactatctgcaatataaaaatgtacgtccagattatgttaaagctatctggaatgtggccaactgggatgatattacagagcgctacaacaatgc-------
-
-| &gt;Ap401/11000
+    - ORF_Search_CDS_aa
+        the output with the CDS (regardless of the Methionine) in the proteic format.
 
-ataaaagggagaaacatgctggctgctgggctaagaactctaaaaagccttgctcctcgtggtggtctagcttggtcttgtacctcggtacatgccaaacacacactgccagacttgccgtatgattataatgctctggagccacacatcagtgctgaaatcatgttgctgcatcacacaaaacatcaccagacgtatgtgaacaacctgaatattgcagagga
-gaagtttcatgaggctaccgagaaaggcgatgtgaccacagcagtatcactgatgccagccctaagatttaatggtggtggacatatcaaccatactatattttggaagaacatgtcaccaaatggtggtggagaaccatctggcgaactgatggaggccatcaaacgtgactttggttcatttgaaaacatgaagaacatgctgagtacagcaaccactg
-cagtacaaggctctggctggggatggcttgggtacaataaaaagatgaagaaactagaaattgccacctgtgccaaccaagatccattagagggcactacaggtcttgtcccactgtttggtatcgatgtctgggaacatgcctattatttacagtacaagaatgttcgtccagactatgtaaaggctatttggaatgtggctaactgggatgatatcatggagcg
-ttacaacaatgccagaaaa
-
-| &gt;Ac533/40375
+    - ORF_Search_CDS_nuc
+        the output with the CDS (regardless of the Methionine) in the nucleic format.
 
-------------aacatgttggctgctgggctaagaactctaaaaagccttgcccctcggggttgcgtagcgtggtcgtgtacttcggtgcatgccaagcataccctaccagacttaccatatgattacaatgccctggagccacacatcagtgctgaaatcatgctgctgcatcacaccaagcatcaccagacgtatgtcaacaacctgaatgttgcagaggagaa
-gtttcatgaggctacagagaaaggtgatgtaaccacagcagtatcactgatgccagccctaagatttaatggtggtggacacatcaaccatactatattttggaagaacatgtcaccaaatggtggtggagagccatctggcgaactgatggaggccatcaaacgtgactttggctcatttgaaaacatgaagaacatgttgagtacatcaaccactgcagt
-gcaaggctctggctggggatggcttgggtacaataaaaagatgaagaaactagaaattgccacctgtgccaaccaagatccattagagggcactacaggtctcgtcccgttgtttggcatagatgtctgggaacatgcctattacttgcagtacaagaatgttcgtccagactatgtaaaggctatttggaatgtggccaactgggatgacatcatggaacgtt
-acaacaatgccagaaaa
-
-.. class:: infomark
+    - ORF_Search_CDS_with_M_aa
+        the output with the CDS (considering the Methionine) in proteic format.
+        the rule : they must have a methionine before the minimal length of the sequence.
+        for example before the last 30 amino acids.
 
-| for example : locus5_sp3.fasta
-|

-| It's the same for the option : regardless Methionine
-|
-
-**ORF_Search_CDS_without_indel_aa**
-
-| &gt;Pf2011/1100
+    - ORF_Search_CDS_with_M_nuc
+        the output with the CDS (considering the Methionine) in nucleic format.
+        the rule : they must have a methionine before the minimal length of the sequence.
+        for example before the last 30 amino acids.
 
-PDLPYDYNALEPHISAEIMLLHHTKHHQTYVNNLNVAEEKFHEATEKGDVTTAVSLMPALRFNGGGHINHTIFWKNMSPNGGGEPSGELMEAIKRDFGSFENMKNMLSTATTAVQGSGWGWLGYNKKMKKLEIATCANQDPLEGTTGLIPLFGIDVWEHAYYLQ
-YKNVRPDYVKAIWNVANWDDITERYNN
-
-| &gt;Ap401/11000
-
-PDLPYDYNALEPHISAEIMLLHHTKHHQTYVNNLNIAEEKFHEATEKGDVTTAVSLMPALRFNGGGHINHTIFWKNMSPNGGGEPSGELMEAIKRDFGSFENMKNMLSTATTAVQGSGWGWLGYNKKMKKLEIATCANQDPLEGTTGLVPLFGIDVWEHAYYLQ
-YKNVRPDYVKAIWNVANWDDIMERYNN
-
-| &gt;Ac533/40375
-
-PDLPYDYNALEPHISAEIMLLHHTKHHQTYVNNLNVAEEKFHEATEKGDVTTAVSLMPALRFNGGGHINHTIFWKNMSPNGGGEPSGELMEAIKRDFGSFENMKNMLSTSTTAVQGSGWGWLGYNKKMKKLEIATCANQDPLEGTTGLVPLFGIDVWEHAYYLQ
-YKNVRPDYVKAIWNVANWDDIMERYNN
-
-.. class:: infomark
+    - ORF_Search_CDS_without_indel_aa
+        is the output with the CDS without indel in proteic format.
+        considering the Methionine or not : according to the option chosen.
 
-| for example locus5_sp3_sp3.fasta
-||
-
-**ORF_Search_CDS_without_indel_nuc**
+    - ORF_Search_CDS_without_indel_nuc
+        is the output with the CDS without indel in nucleic format.
+        considering the Methionine or not : according to the option chosen.
 
-| &gt;Pf2011/1100
-
-ccagatttaccatatgattataatgccttggaaccacacatcagtgctgaaataatgctccttcatcatacaaagcatcaccagacatatgtgaacaatctgaatgtagctgaagaaaagtttcatgaagccacggagaaaggtgatgtcactacagctgtgtcgctaatgccagcactaagatttaatggcggaggacacatcaatcacaccattttctggaa
-gaacatgtctcctaatggcggaggagagccttctggcgagttgatggaagccattaaacgtgattttggttcatttgagaatatgaaaaacatgttaagtacagctacaacagctgtccaaggatctggctggggatggcttggttataacaaaaagatgaaaaagctcgagatagccacttgtgccaaccaggatccactggaaggaacaacaggatta
-attccactgtttggtattgacgtctgggagcatgcttactatctgcaatataaaaatgtacgtccagattatgttaaagctatctggaatgtggccaactgggatgatattacagagcgctacaacaat
-
-| &gt;Ap401/11000
+---------
 
-ccagacttgccgtatgattataatgctctggagccacacatcagtgctgaaatcatgttgctgcatcacacaaaacatcaccagacgtatgtgaacaacctgaatattgcagaggagaagtttcatgaggctaccgagaaaggcgatgtgaccacagcagtatcactgatgccagccctaagatttaatggtggtggacatatcaaccatactatattttgga
-agaacatgtcaccaaatggtggtggagaaccatctggcgaactgatggaggccatcaaacgtgactttggttcatttgaaaacatgaagaacatgctgagtacagcaaccactgcagtacaaggctctggctggggatggcttgggtacaataaaaagatgaagaaactagaaattgccacctgtgccaaccaagatccattagagggcactacag
-gtcttgtcccactgtttggtatcgatgtctgggaacatgcctattatttacagtacaagaatgttcgtccagactatgtaaaggctatttggaatgtggctaactgggatgatatcatggagcgttacaacaat
-
-| &gt;Ac533/40375
+**The AdaptSearch Pipeline**
 
-ccagacttaccatatgattacaatgccctggagccacacatcagtgctgaaatcatgctgctgcatcacaccaagcatcaccagacgtatgtcaacaacctgaatgttgcagaggagaagtttcatgaggctacagagaaaggtgatgtaaccacagcagtatcactgatgccagccctaagatttaatggtggtggacacatcaaccatactatattttgg
-aagaacatgtcaccaaatggtggtggagagccatctggcgaactgatggaggccatcaaacgtgactttggctcatttgaaaacatgaagaacatgttgagtacatcaaccactgcagtgcaaggctctggctggggatggcttgggtacaataaaaagatgaagaaactagaaattgccacctgtgccaaccaagatccattagagggcactaca
-ggtctcgtcccgttgtttggcatagatgtctgggaacatgcctattacttgcagtacaagaatgttcgtccagactatgtaaaggctatttggaatgtggccaactgggatgacatcatggaacgttacaacaat
+.. image:: ../../adaptsearch_picture_helps.png :height: 593 :width: 852
 
-.. class:: infomark
-
-| for example : locus5_sp3_sp3.fasta
-|
-
----------------------------------------------------
+---------
 
 Changelog
 ---------
@@ -539,15 +285,13 @@
 
  - NEW: Replace the zip between tools by Dataset Collection
 
-
 **Version 1.0 - 13/04/2017**
 
- - Add funtional test with planemo
-
+ - Added functional test with planemo
  - planemo test with conda dependency for python
-
  - Scripts renamed + symlinks to the directory 'scripts'
 
+    ]]>
 
 	</help>
 
--- a/scripts/S01_find_orf_on_multiple_alignment.py	Wed Sep 27 10:03:05 2017 -0400
+++ b/scripts/S01_find_orf_on_multiple_alignment.py	Wed Jan 17 08:55:29 2018 -0500
@@ -9,29 +9,6 @@
                                  ## OUTPUTs "05_CDS_aa" & "05_CDS_nuc" => NOT INCLUDE THIS CRITERIA
                                  ## OUTPUTs "06_CDS_with_M_aa" & "06_CDS_with_M_nuc" => INCLUDE THIS CRITERIA
 
-
-###############################
-##### DEF 1 : Dico fasta  #####
-###############################
-def dico(fasta_file_path):
-    F2 = open(fasta_file_path, "r")
-    dicoco = {}
-    while 1:
-        next2 = F2.readline()
-        if not next2:
-            break
-        if next2[0] == ">":
-            fasta_name_query = next2[:-1]
-            Sn = string.split(fasta_name_query, "||")
-            fasta_name_query = Sn[0]
-            next3 = F2.readline()
-            fasta_seq_query = next3[:-1]
-            dicoco[fasta_name_query]=fasta_seq_query
-    F2.close()
-    return(dicoco)
-############################################################
-
-
 ####################################################
 ###### DEF 2 : Create bash for genetic code ########
 ####################################################
@@ -343,6 +320,7 @@
 ##### RUN RUN RUN #####
 #######################
 import string, os, time, re, zipfile, sys
+from dico import dico
 
 infiles = sys.argv[1]
 MINIMAL_CDS_LENGTH = int(sys.argv[3])  ## in aa number
@@ -382,7 +360,9 @@
 for file in list_file:
     count_file_processed = count_file_processed + 1
     fasta_file_path = "./%s" %file
-    bash_fasta = dico(fasta_file_path)   ### DEF 1 ###
+    fasta_file = open(fasta_file_path, "r")
+    bash_fasta = dico(fasta_file)   ### DEF 1 ###
+    fasta_file.close()
     BESTORF_nuc, BESTORF_nuc_CODING, BESTORF_nuc_CDS_with_M, BESTORF_aa, BESTORF_aa_CODING, BESTORF_aa_CDS_with_M  = find_good_ORF_criteria_3(bash_fasta, bash_codeUniversel)   ### DEF 4 - PART 2 - ###
 
     ## a ## OUTPUT BESTORF_nuc
--- a/scripts/S02_remove_too_short_bit_or_whole_sequence.py	Wed Sep 27 10:03:05 2017 -0400
+++ b/scripts/S02_remove_too_short_bit_or_whole_sequence.py	Wed Jan 17 08:55:29 2018 -0500
@@ -5,28 +5,6 @@
 ## Description : find and remove indels
 
 
-###############################
-##### DEF 0 : Dico fasta  #####
-###############################
-def dico(F2):
-    #F2 = open(fasta_file_path, "r")
-    dicoco = {}
-    while 1:
-        next2 = F2.readline()
-        if not next2:
-            break
-        if next2[0] == ">":
-            fasta_name_query = next2[:-1]
-            Sn = string.split(fasta_name_query, "||")
-            fasta_name_query = Sn[0]
-            next3 = F2.readline()
-            fasta_seq_query = next3[:-1]
-            dicoco[fasta_name_query]=fasta_seq_query
-    #F2.close()
-    return(dicoco)
-###################################################################################
-
-
 ###################
 ###### DEF 9 ######
 ###################
@@ -63,6 +41,7 @@
 ##### RUN RUN RUN #####
 #######################
 import string, os, time, re, sys
+from dico import dico
 
 ### 0 ### PARAMETERS
 MIN_LENGTH_ALL_aa = int(sys.argv[3])-20
--- a/scripts/S03_remove_site_with_not_enough_species_represented.py	Wed Sep 27 10:03:05 2017 -0400
+++ b/scripts/S03_remove_site_with_not_enough_species_represented.py	Wed Jan 17 08:55:29 2018 -0500
@@ -4,28 +4,6 @@
 
 ## Description : find and remove indels
 
-
-###############################
-##### DEF 1 : Dico fasta  #####
-###############################
-def dico(F2):
-    dicoco = {}
-    while 1:
-        next2 = F2.readline()
-        if not next2:
-            break
-        if next2[0] == ">":
-            fasta_name_query = next2[:-1]
-            Sn = string.split(fasta_name_query, "||")
-            fasta_name_query = Sn[0]
-            next3 = F2.readline()
-            fasta_seq_query = next3[:-1]
-            dicoco[fasta_name_query]=fasta_seq_query
-    #F2.close()
-    return(dicoco)
-###################################################################################
-
-
 ####################
 ###### DEF 2 #######
 ####################
@@ -116,6 +94,7 @@
 ##### RUN RUN RUN #####
 #######################
 import string, os, time, re, sys, zipfile
+from dico import dico
 
 ### 0 ### PARAMETERS
 MIN_SPECIES_NB = int(sys.argv[1])
@@ -149,6 +128,9 @@
 
     dico_aa = dico(file_INaa)   ### DEF 1 ###
     dico_nuc = dico(file_INnuc)   ### DEF 1 ###
+    
+    file_INaa.close()
+    file_INnuc.close()
 
     if len(dico_aa) < MIN_SPECIES_NB :
 	list_file.append(file)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/scripts/dico.py	Wed Jan 17 08:55:29 2018 -0500
@@ -0,0 +1,16 @@
+import string
+
+def dico(F1):  # Build a {header: sequence} dict from F1, an open file-like object read via readline().
+    dicoco = {}  # maps each header (leading ">" kept, text after "||" dropped) to its sequence line
+    while 1:  # loop until readline() returns an empty string (end of input)
+        next2 = F1.readline()
+        if not next2:
+            break
+        if next2[0] == ">":  # header line; any other line read at this point is skipped
+            fasta_name_query = next2[:-1]  # drop the last character (presumably the trailing newline)
+            Sn = string.split(fasta_name_query, "||")  # NOTE(review): string.split() is a Python 2 `string` module function, absent in Python 3
+            fasta_name_query = Sn[0]  # keep only the part before the first "||"
+            next3 = F1.readline()  # only the single line right after the header is taken as the sequence
+            fasta_seq_query = next3[:-1]  # again drops the last character (presumably the trailing newline)
+            dicoco[fasta_name_query]=fasta_seq_query  # a repeated header overwrites the earlier entry
+    return(dicoco)