Mercurial > repos > abims-sbr > cds_search
changeset 3:ff98ed7849fa draft
planemo upload for repository https://github.com/abims-sbr/adaptsearch commit cf1b9c905931ca2ca25faa4844d45c908756472f
author | abims-sbr |
---|---|
date | Wed, 17 Jan 2018 08:55:29 -0500 |
parents | 0d2f72caea10 |
children | 0450307b2ffb |
files | CDS_search.xml scripts/S01_find_orf_on_multiple_alignment.py scripts/S02_remove_too_short_bit_or_whole_sequence.py scripts/S03_remove_site_with_not_enough_species_represented.py scripts/dico.py |
diffstat | 5 files changed, 87 insertions(+), 386 deletions(-) [+] |
line wrap: on
line diff
--- a/CDS_search.xml Wed Sep 27 10:03:05 2017 -0400 +++ b/CDS_search.xml Wed Jan 17 08:55:29 2018 -0500 @@ -19,6 +19,8 @@ #set $infiles = $infiles + $input.element_identifier + "," #end for #set $infiles = $infiles[:-1] + + ln -s $__tool_directory__/scripts/dico.py . && python $__tool_directory__/scripts/S01_find_orf_on_multiple_alignment.py $infiles @@ -184,353 +186,97 @@ </tests> <help> -@HELP_AUTHORS@ - -============ -What it does -============ - -| This tool takes **'dataset collection list' containing fasta files with nucleic aligned sequences** and search the ORF and the CDS -| - --------- - -========== -Parameters -========== - -The choice of several parameters is possible. - -**min_length_seq** - | minimal length of the sequence in the proteic format - | when the removal of the indel is done, the minimal length equals :previous length less20 - | for example if you choose 50 for the minimal length, the actual length equals 30 - | - -**min_length_subseq** - | minimal length of the subsequence in the proteic format - | subsequence means the part of the original sequence between 2 sets of indels - | an indel set is composed by more than 2 indels, if not the set is considered as unknown amino acid - | +@HELPAUTHORS@ -**min_length_nuc** - | Minimal length of the sequence in the nucleic format - | - --------- - -====== -Inputs -====== - -option **universal code** : - -| the input must have the extension .txt -| It's the file which will serve for the translation of nucleotides to amino acids -| if there are "U"s in this file, they will be automatically transform into "T"s -| for example : -| UUU Phe F -| UCU Ser S -| UAU Tyr Y -| UGU Cys C -| UUC Phe F -| UCC Ser S -| UAC Tyr Y -| UGC Cys C -| UUA Leu L -| UCA Ser S -| UAA Stop * -| ... - +<![CDATA[ -------- -======= -Outputs -======= - -This tool, produces the following files : - -**ORF_Search** - | is the output with important informations (mainly statistics about the tools). 
- | - -**ORF_Search_Best_ORF_aa** - | is the output with the best ORF in the proteic format. - | - -**ORF_Search_Best_ORF_nuc** - | is the output with the best ORF in the nucleic format. - | - -**ORF_Search_CDS_aa** - | is the output with the CDS (regardless the Methionine) in the proteic format. - | +**Description** -**ORF_Search_CDS_nuc** - | is the output with the CDS (regardless the Methionine) in the nucleic format. - | - -**ORF_Search_CDS_with_M_aa** - | is the output with the CDS (considering the Methionine) in proteic format. - | the rule : they must have a methionine before the minimale length of the sequence. - | for example before the 30 last amino acid. - | - -**ORF_Search_CDS_with_M_nuc** - | is the output with the CDS (considering the Methionine) in nucleic format. - | the rule : they must have a methionine before the minimale length of the sequence. - | for example before the 30 last amino acid. - | - -**ORF_Search_CDS_without_indel_aa** - | is the output with the CDS without indel in proteic format. - | considering the Methionine or not : according to the option chosen. - | - -**ORF_Search_CDS_without_indel_nuc** - | is the output with the CDS without indel in proteic format. - | considering the Methionine or not : according to the option chosen. - +This tool takes files containing nucleic aligned sequences and searches for the ORF and the CDS. 
-------- -=============== -Working Example -=============== - ------------------------------- -The input file and its options ------------------------------- - -**ORF_Search** - -| a 'dataset collection list' containing 47 files with 1, 2 or 3 sequences inside -| for example the file : locus5_sp3.fasta which contains 3 species : -| - -| >Ac533/40375 +**Inputs** -----------gccccctagcgagtgacgacaaactcg----------------------------------------aacatgttggctgctgggctaagaactctaaaaagccttgcccctcggggttgcgtagcgtggtcgtgtacttcggtgcatgccaagcataccctaccagacttaccatatgattacaatgccctggagccacacatcagtgctgaaatcatgctgctgcatca -caccaagcatcaccagacgtatgtcaacaacctgaatgttgcagaggagaagtttcatgaggctacagagaaaggtgatgtaaccacagcagtatcactgatgccagccctaagatttaatggtggtggacacatcaaccatactatattttggaagaacatgtcaccaaatggtggtggagagccatctggcgaactgatggaggccatcaaacgtgac -tttggctcatttgaaaacatgaagaacatgttgagtacatcaaccactgcagtgcaaggctctggctggggatggcttgggtacaataaaaagatgaagaaactagaaattgccacctgtgccaaccaagatccattagagggcactacaggtctcgtcccgttgtttggcatagatgtctgggaacatgcctattacttgcagtacaagaatgttcgtccagact -atgtaaaggctatttggaatgtggccaactgggatgacatcatggaacgttacaacaatgccagaaaataaactgttaaacaaataattaatatattaatgtgttgcaatt-tttgtcaattggtacatacacaattttgttcataaagaaaat--tgtgattactttctg-gcaactagttcccagtgaatacagcaattgttctgccaaggatgcatttggattgagaaggc -acccagaatgggttatccgtgaatgtctattggaatgtggcaccatacgatgtttactgtattagttacaattaaaa------------------- +Input files : (multiple) fasta files with nucleic aligned sequences. 
->Ap401/11000 -ctgatgtgtggccccctagcgagtgaagacaaactcgatagcacccagacagttctgttggttagataaaagggagaaacatgctggctgctgggctaagaactctaaaaagccttgctcctcgtggtggtctagcttggtcttgtacctcggtacatgccaaacacacactgccagacttgccgtatgattataatgctctggagccacacatcagtgctgaaa -tcatgttgctgcatcacacaaaacatcaccagacgtatgtgaacaacctgaatattgcagaggagaagtttcatgaggctaccgagaaaggcgatgtgaccacagcagtatcactgatgccagccctaagatttaatggtggtggacatatcaaccatactatattttggaagaacatgtcaccaaatggtggtggagaaccatctggcgaactgatggag -gccatcaaacgtgactttggttcatttgaaaacatgaagaacatgctgagtacagcaaccactgcagtacaaggctctggctggggatggcttgggtacaataaaaagatgaagaaactagaaattgccacctgtgccaaccaagatccattagagggcactacaggtcttgtcccactgtttggtatcgatgtctgggaacatgcctattatttacagtacaa -gaatgttcgtccagactatgtaaaggctatttggaatgtggctaactgggatgatatcatggagcgttacaacaatgccagaaaataaac--ttaaatacatcattatttagttaatgtgcgacttttgtttgttaatcagttcacacaccattctattcacaaagaaaatggtgtatttgctttctgttcaactggttcccggtgaatacagcagttgttctgccaaggatgtattt -ggattgagaaggcaccaagaatgggctgtcactgaatgtcaattggaatgtagcctca----atgtttactgtattacctacaattaaaatgattatgatataaccaag - -| >Pf2011/1100 - ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------tgccagatttaccatatgattataatgccttggaaccacacatcagtgctgaaataatgctccttcatcatacaaagcatcaccagacatatgtgaacaatctgaatgt -agctgaagaaaagtttcatgaagccacggagaaaggtgatgtcactacagctgtgtcgctaatgccagcactaagatttaatggcggaggacacatcaatcacaccattttctggaagaacatgtctcctaatggcggaggagagccttctggcgagttgatggaagccattaaacgtgattttggttcatttgagaatatgaaaaacatgttaagtacagcta -caacagctgtccaaggatctggctggggatggcttggttataacaaaaagatgaaaaagctcgagatagccacttgtgccaaccaggatccactggaaggaacaacaggattaattccactgtttggtattgacgtctgggagcatgcttactatctgcaatataaaaatgtacgtccagattatgttaaagctatctggaatgtggccaactgggatgatatta 
-cagagcgctacaacaatgc----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +-------- **Parameters** -| option : Methionine is considerated -| no option for the CDS's length. So, by default it's min_length_seq : 50, min_length_subseq : 15, min_length_nuc : 50 + - methionine : choose to consider the methionine in the search of CDS. + yes/no. ----------------- -The output files ----------------- - - -**ORF_Search_Best_ORF_aa** + - 'Minimal number of species in each locus' + Default : 10 (integer). -| -| *************** CDS detection *************** -| -| Files processed: 47 -| -| Files with CDS: 32 -| Files with CDS plus M (codon start): 20 -| Files without CDS: 15 -| -| In locus with CDS considering Methionine : -| -| *************** 1st filter : selection of the locus *************** -| - -Total number of locus recorded = 20 - -| Number of locus with 1 species : 1 -| Number of locus with 2 species : 16 -| Number of locus with 3 species : 3 -| -| Number of locus excluded (exclude if not at least 3 species in the alignment)= 17 -| -| *************** 2nd Filter : removal of the indel *************** -| -| Total number of locus recorded = 3 -| - -Total number of locus with no indels (SAVED) = 3 - -Total number of locus with indels (EXCLUDED) = 0 - -| - -**ORF_Search_Best_ORF_aa** - -| >Pf2011/1100 + - 'min_length_seq' : + minimal length of the sequence (in amino acids). + when the removal of the indel is done, the minimal length equals : previous length - 20. + for example if you choose 50 for the minimal length, the actual length equals 30. + Default : 50 (integer). 
-?????????????????????????????????????????????????????????PDLPYDYNALEPHISAEIMLLHHTKHHQTYVNNLNVAEEKFHEATEKGDVTTAVSLMPALRFNGGGHINHTIFWKNMSPNGGGEPSGELMEAIKRDFGSFENMKNMLSTATTAVQ -GSGWGWLGYNKKMKKLEIATCANQDPLEGTTGLIPLFGIDVWEHAYYLQYKNVRPDYVKAIWNVANWDDITERYNN?????????????????????????????????????????????????????????????????????????????????????????? - -| >Ap401/11000 - -DVWPPSE*RQTR*HPDSSVG*IKGRNMLAAGLRTLKSLAPRGGLAWSCTSVHAKHTLPDLPYDYNALEPHISAEIMLLHHTKHHQTYVNNLNIAEEKFHEATEKGDVTTAVSLMPALRFNGGGHINHTIFWKNMSPNGGGEPSGELMEAIKRDFGSFENMKNMLS -TATTAVQGSGWGWLGYNKKMKKLEIATCANQDPLEGTTGLVPLFGIDVWEHAYYLQYKNVRPDYVKAIWNVANWDDIMERYNNARK*??KYIII*LMCDFCLLISSHTILFTKKMVYLLSVQLVPGEYSSCSAKDVFGLRRHQEWAVTECQLECSL??CLLYYLQLK* -L*YNQ - -| >Ac533/40375 + - 'min_length_subseq' : + minimal length of the subsequence (in amino acids). + subsequence means the part of the original sequence between 2 sets of indels. + an indel set is composed by more than 2 indels, if not the set is considered as unknown amino acid. + Default : 15 (integer). -???PPSE*RQT??????????????NMLAAGLRTLKSLAPRGCVAWSCTSVHAKHTLPDLPYDYNALEPHISAEIMLLHHTKHHQTYVNNLNVAEEKFHEATEKGDVTTAVSLMPALRFNGGGHINHTIFWKNMSPNGGGEPSGELMEAIKRDFGSFENMKNMLSTS -TTAVQGSGWGWLGYNKKMKKLEIATCANQDPLEGTTGLVPLFGIDVWEHAYYLQYKNVRPDYVKAIWNVANWDDIMERYNNARK*TVKQIINILMCCN?LSIGTYTILFIKK??*LLS?QLVPSEYSNCSAKDAFGLRRHPEWVIRECLLECGTIRCLLY*LQLK?????? - -.. class:: infomark - -| for example : locus5_sp3.fasta -| - -**ORF_Search_Best_ORF_nuc** - -| >Pf2011/1100 + - 'min_length_nuc' : + Minimal length of the sequence in the nucleic format, without indels. + Default : 50 (integer). 
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------tgccagatttaccatatgattataatgccttggaaccacacatcagtgctgaaataatgctccttcatcatacaaagcatcaccagacatatgtgaacaatctgaatgta -gctgaagaaaagtttcatgaagccacggagaaaggtgatgtcactacagctgtgtcgctaatgccagcactaagatttaatggcggaggacacatcaatcacaccattttctggaagaacatgtctcctaatggcggaggagagccttctggcgagttgatggaagccattaaacgtgattttggttcatttgagaatatgaaaaacatgttaagtacagctac -aacagctgtccaaggatctggctggggatggcttggttataacaaaaagatgaaaaagctcgagatagccacttgtgccaaccaggatccactggaaggaacaacaggattaattccactgtttggtattgacgtctgggagcatgcttactatctgcaatataaaaatgtacgtccagattatgttaaagctatctggaatgtggccaactgggatgatattac -agagcgctacaacaatgc---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- - -| >Ap401/11000 - -gatgtgtggccccctagcgagtgaagacaaactcgatagcacccagacagttctgttggttagataaaagggagaaacatgctggctgctgggctaagaactctaaaaagccttgctcctcgtggtggtctagcttggtcttgtacctcggtacatgccaaacacacactgccagacttgccgtatgattataatgctctggagccacacatcagtgctgaaatc -atgttgctgcatcacacaaaacatcaccagacgtatgtgaacaacctgaatattgcagaggagaagtttcatgaggctaccgagaaaggcgatgtgaccacagcagtatcactgatgccagccctaagatttaatggtggtggacatatcaaccatactatattttggaagaacatgtcaccaaatggtggtggagaaccatctggcgaactgatggaggc -catcaaacgtgactttggttcatttgaaaacatgaagaacatgctgagtacagcaaccactgcagtacaaggctctggctggggatggcttgggtacaataaaaagatgaagaaactagaaattgccacctgtgccaaccaagatccattagagggcactacaggtcttgtcccactgtttggtatcgatgtctgggaacatgcctattatttacagtacaagaa -tgttcgtccagactatgtaaaggctatttggaatgtggctaactgggatgatatcatggagcgttacaacaatgccagaaaataaac--ttaaatacatcattatttagttaatgtgcgacttttgtttgttaatcagttcacacaccattctattcacaaagaaaatggtgtatttgctttctgttcaactggttcccggtgaatacagcagttgttctgccaaggatgtatttggat 
-tgagaaggcaccaagaatgggctgtcactgaatgtcaattggaatgtagcctca----atgtttactgtattacctacaattaaaatgattatgatataaccaa + - others parameters allowing to choose which outputs you desire : + - outputs with best ORFs. + - outputs with CDS, with or without indels. + - in proteic or nucleic format. -| >Ac533/40375 +-------- ---------gccccctagcgagtgacgacaaactcg----------------------------------------aacatgttggctgctgggctaagaactctaaaaagccttgcccctcggggttgcgtagcgtggtcgtgtacttcggtgcatgccaagcataccctaccagacttaccatatgattacaatgccctggagccacacatcagtgctgaaatcatgctgctgcatcac -accaagcatcaccagacgtatgtcaacaacctgaatgttgcagaggagaagtttcatgaggctacagagaaaggtgatgtaaccacagcagtatcactgatgccagccctaagatttaatggtggtggacacatcaaccatactatattttggaagaacatgtcaccaaatggtggtggagagccatctggcgaactgatggaggccatcaaacgtgact -ttggctcatttgaaaacatgaagaacatgttgagtacatcaaccactgcagtgcaaggctctggctggggatggcttgggtacaataaaaagatgaagaaactagaaattgccacctgtgccaaccaagatccattagagggcactacaggtctcgtcccgttgtttggcatagatgtctgggaacatgcctattacttgcagtacaagaatgttcgtccagacta -tgtaaaggctatttggaatgtggccaactgggatgacatcatggaacgttacaacaatgccagaaaataaactgttaaacaaataattaatatattaatgtgttgcaatt-tttgtcaattggtacatacacaattttgttcataaagaaaat--tgtgattactttctg-gcaactagttcccagtgaatacagcaattgttctgccaaggatgcatttggattgagaaggca -cccagaatgggttatccgtgaatgtctattggaatgtggcaccatacgatgtttactgtattagttacaattaaaa------------------ +**Outputs** -.. class:: infomark - -| for example : locus5_sp3.fasta -| - -**ORF_Search_CDS_with_M_aa** + - ORF_Search + the log file (mainly statistics about the tool). -| >Pf2011/1100 - -????????????????????????????????????PDLPYDYNALEPHISAEIMLLHHTKHHQTYVNNLNVAEEKFHEATEKGDVTTAVSLMPALRFNGGGHINHTIFWKNMSPNGGGEPSGELMEAIKRDFGSFENMKNMLSTATTAVQGSGWGWLGYNKKMKK -LEIATCANQDPLEGTTGLIPLFGIDVWEHAYYLQYKNVRPDYVKAIWNVANWDDITERYNN??? 
- -| >Ap401/11000 - -IKGRNMLAAGLRTLKSLAPRGGLAWSCTSVHAKHTLPDLPYDYNALEPHISAEIMLLHHTKHHQTYVNNLNIAEEKFHEATEKGDVTTAVSLMPALRFNGGGHINHTIFWKNMSPNGGGEPSGELMEAIKRDFGSFENMKNMLSTATTAVQGSGWGWLGYNK -KMKKLEIATCANQDPLEGTTGLVPLFGIDVWEHAYYLQYKNVRPDYVKAIWNVANWDDIMERYNNARK + - ORF_Search_Best_ORF_aa + the output with the best ORF in the proteic format. -| >Ac533/40375 - -????NMLAAGLRTLKSLAPRGCVAWSCTSVHAKHTLPDLPYDYNALEPHISAEIMLLHHTKHHQTYVNNLNVAEEKFHEATEKGDVTTAVSLMPALRFNGGGHINHTIFWKNMSPNGGGEPSGELMEAIKRDFGSFENMKNMLSTSTTAVQGSGWGWLGYNK -KMKKLEIATCANQDPLEGTTGLVPLFGIDVWEHAYYLQYKNVRPDYVKAIWNVANWDDIMERYNNARK - -.. class:: infomark - -| for example : locus5_sp3.fasta - -| It's the same for the option : regardless Methionine -| + - ORF_Search_Best_ORF_nuc + the output with the best ORF in the nucleic format. -**ORF_Search_CDS_with_M_nuc** - -| >Pf2011/1100 - -----------------------------------------------------------------------------------------------------------tgccagatttaccatatgattataatgccttggaaccacacatcagtgctgaaataatgctccttcatcatacaaagcatcaccagacatatgtgaacaatctgaatgtagctgaagaaaagtttcatgaagccacggagaaaggtgatg -tcactacagctgtgtcgctaatgccagcactaagatttaatggcggaggacacatcaatcacaccattttctggaagaacatgtctcctaatggcggaggagagccttctggcgagttgatggaagccattaaacgtgattttggttcatttgagaatatgaaaaacatgttaagtacagctacaacagctgtccaaggatctggctggggatggcttggttataa -caaaaagatgaaaaagctcgagatagccacttgtgccaaccaggatccactggaaggaacaacaggattaattccactgtttggtattgacgtctgggagcatgcttactatctgcaatataaaaatgtacgtccagattatgttaaagctatctggaatgtggccaactgggatgatattacagagcgctacaacaatgc------- - -| >Ap401/11000 + - ORF_Search_CDS_aa + the output with the CDS (regardless the Methionine) in the proteic format. 
-ataaaagggagaaacatgctggctgctgggctaagaactctaaaaagccttgctcctcgtggtggtctagcttggtcttgtacctcggtacatgccaaacacacactgccagacttgccgtatgattataatgctctggagccacacatcagtgctgaaatcatgttgctgcatcacacaaaacatcaccagacgtatgtgaacaacctgaatattgcagagga -gaagtttcatgaggctaccgagaaaggcgatgtgaccacagcagtatcactgatgccagccctaagatttaatggtggtggacatatcaaccatactatattttggaagaacatgtcaccaaatggtggtggagaaccatctggcgaactgatggaggccatcaaacgtgactttggttcatttgaaaacatgaagaacatgctgagtacagcaaccactg -cagtacaaggctctggctggggatggcttgggtacaataaaaagatgaagaaactagaaattgccacctgtgccaaccaagatccattagagggcactacaggtcttgtcccactgtttggtatcgatgtctgggaacatgcctattatttacagtacaagaatgttcgtccagactatgtaaaggctatttggaatgtggctaactgggatgatatcatggagcg -ttacaacaatgccagaaaa - -| >Ac533/40375 + - ORF_Search_CDS_nuc + the output with the CDS (regardless the Methionine) in the nucleic format. -------------aacatgttggctgctgggctaagaactctaaaaagccttgcccctcggggttgcgtagcgtggtcgtgtacttcggtgcatgccaagcataccctaccagacttaccatatgattacaatgccctggagccacacatcagtgctgaaatcatgctgctgcatcacaccaagcatcaccagacgtatgtcaacaacctgaatgttgcagaggagaa -gtttcatgaggctacagagaaaggtgatgtaaccacagcagtatcactgatgccagccctaagatttaatggtggtggacacatcaaccatactatattttggaagaacatgtcaccaaatggtggtggagagccatctggcgaactgatggaggccatcaaacgtgactttggctcatttgaaaacatgaagaacatgttgagtacatcaaccactgcagt -gcaaggctctggctggggatggcttgggtacaataaaaagatgaagaaactagaaattgccacctgtgccaaccaagatccattagagggcactacaggtctcgtcccgttgtttggcatagatgtctgggaacatgcctattacttgcagtacaagaatgttcgtccagactatgtaaaggctatttggaatgtggccaactgggatgacatcatggaacgtt -acaacaatgccagaaaa - -.. class:: infomark + - ORF_Search_CDS_with_M_aa + the output with the CDS (considering the Methionine) in proteic format. + the rule : they must have a methionine before the minimal length of the sequence. + for example before the 30 last amino acid. 
-| for example : locus5_sp3.fasta -| - -| It's the same for the option : regardless Methionine -| - -**ORF_Search_CDS_without_indel_aa** - -| >Pf2011/1100 + - ORF_Search_CDS_with_M_nuc + the output with the CDS (considering the Methionine) in nucleic format. + the rule : they must have a methionine before the minimal length of the sequence. + for example before the 30 last amino acid. -PDLPYDYNALEPHISAEIMLLHHTKHHQTYVNNLNVAEEKFHEATEKGDVTTAVSLMPALRFNGGGHINHTIFWKNMSPNGGGEPSGELMEAIKRDFGSFENMKNMLSTATTAVQGSGWGWLGYNKKMKKLEIATCANQDPLEGTTGLIPLFGIDVWEHAYYLQ -YKNVRPDYVKAIWNVANWDDITERYNN - -| >Ap401/11000 - -PDLPYDYNALEPHISAEIMLLHHTKHHQTYVNNLNIAEEKFHEATEKGDVTTAVSLMPALRFNGGGHINHTIFWKNMSPNGGGEPSGELMEAIKRDFGSFENMKNMLSTATTAVQGSGWGWLGYNKKMKKLEIATCANQDPLEGTTGLVPLFGIDVWEHAYYLQ -YKNVRPDYVKAIWNVANWDDIMERYNN - -| >Ac533/40375 - -PDLPYDYNALEPHISAEIMLLHHTKHHQTYVNNLNVAEEKFHEATEKGDVTTAVSLMPALRFNGGGHINHTIFWKNMSPNGGGEPSGELMEAIKRDFGSFENMKNMLSTSTTAVQGSGWGWLGYNKKMKKLEIATCANQDPLEGTTGLVPLFGIDVWEHAYYLQ -YKNVRPDYVKAIWNVANWDDIMERYNN - -.. class:: infomark + - ORF_Search_CDS_without_indel_aa + is the output with the CDS without indel in proteic format. + considering the Methionine or not : according to the option chosen. -| for example locus5_sp3_sp3.fasta -|| - -**ORF_Search_CDS_without_indel_nuc** + - ORF_Search_CDS_without_indel_nuc + is the output with the CDS without indel in nucleic format. + considering the Methionine or not : according to the option chosen. 
-| >Pf2011/1100 - -ccagatttaccatatgattataatgccttggaaccacacatcagtgctgaaataatgctccttcatcatacaaagcatcaccagacatatgtgaacaatctgaatgtagctgaagaaaagtttcatgaagccacggagaaaggtgatgtcactacagctgtgtcgctaatgccagcactaagatttaatggcggaggacacatcaatcacaccattttctggaa -gaacatgtctcctaatggcggaggagagccttctggcgagttgatggaagccattaaacgtgattttggttcatttgagaatatgaaaaacatgttaagtacagctacaacagctgtccaaggatctggctggggatggcttggttataacaaaaagatgaaaaagctcgagatagccacttgtgccaaccaggatccactggaaggaacaacaggatta -attccactgtttggtattgacgtctgggagcatgcttactatctgcaatataaaaatgtacgtccagattatgttaaagctatctggaatgtggccaactgggatgatattacagagcgctacaacaat - -| >Ap401/11000 +--------- -ccagacttgccgtatgattataatgctctggagccacacatcagtgctgaaatcatgttgctgcatcacacaaaacatcaccagacgtatgtgaacaacctgaatattgcagaggagaagtttcatgaggctaccgagaaaggcgatgtgaccacagcagtatcactgatgccagccctaagatttaatggtggtggacatatcaaccatactatattttgga -agaacatgtcaccaaatggtggtggagaaccatctggcgaactgatggaggccatcaaacgtgactttggttcatttgaaaacatgaagaacatgctgagtacagcaaccactgcagtacaaggctctggctggggatggcttgggtacaataaaaagatgaagaaactagaaattgccacctgtgccaaccaagatccattagagggcactacag -gtcttgtcccactgtttggtatcgatgtctgggaacatgcctattatttacagtacaagaatgttcgtccagactatgtaaaggctatttggaatgtggctaactgggatgatatcatggagcgttacaacaat - -| >Ac533/40375 +**The AdaptSearch Pipeline** -ccagacttaccatatgattacaatgccctggagccacacatcagtgctgaaatcatgctgctgcatcacaccaagcatcaccagacgtatgtcaacaacctgaatgttgcagaggagaagtttcatgaggctacagagaaaggtgatgtaaccacagcagtatcactgatgccagccctaagatttaatggtggtggacacatcaaccatactatattttgg -aagaacatgtcaccaaatggtggtggagagccatctggcgaactgatggaggccatcaaacgtgactttggctcatttgaaaacatgaagaacatgttgagtacatcaaccactgcagtgcaaggctctggctggggatggcttgggtacaataaaaagatgaagaaactagaaattgccacctgtgccaaccaagatccattagagggcactaca -ggtctcgtcccgttgtttggcatagatgtctgggaacatgcctattacttgcagtacaagaatgttcgtccagactatgtaaaggctatttggaatgtggccaactgggatgacatcatggaacgttacaacaat +.. image:: ../../adaptsearch_picture_helps.png :height: 593 :width: 852 -.. 
class:: infomark - -| for example : locus5_sp3_sp3.fasta -| - ---------------------------------------------------- +--------- Changelog --------- @@ -539,15 +285,13 @@ - NEW: Replace the zip between tools by Dataset Collection - **Version 1.0 - 13/04/2017** - - Add funtional test with planemo - + - Added functional test with planemo - planemo test with conda dependency for python - - Scripts renamed + symlinks to the directory 'scripts' + ]]> </help>
--- a/scripts/S01_find_orf_on_multiple_alignment.py Wed Sep 27 10:03:05 2017 -0400 +++ b/scripts/S01_find_orf_on_multiple_alignment.py Wed Jan 17 08:55:29 2018 -0500 @@ -9,29 +9,6 @@ ## OUTPUTs "05_CDS_aa" & "05_CDS_nuc" => NOT INCLUDE THIS CRITERIA ## OUTPUTs "06_CDS_with_M_aa" & "06_CDS_with_M_nuc" => INCLUDE THIS CRITERIA - -############################### -##### DEF 1 : Dico fasta ##### -############################### -def dico(fasta_file_path): - F2 = open(fasta_file_path, "r") - dicoco = {} - while 1: - next2 = F2.readline() - if not next2: - break - if next2[0] == ">": - fasta_name_query = next2[:-1] - Sn = string.split(fasta_name_query, "||") - fasta_name_query = Sn[0] - next3 = F2.readline() - fasta_seq_query = next3[:-1] - dicoco[fasta_name_query]=fasta_seq_query - F2.close() - return(dicoco) -############################################################ - - #################################################### ###### DEF 2 : Create bash for genetic code ######## #################################################### @@ -343,6 +320,7 @@ ##### RUN RUN RUN ##### ####################### import string, os, time, re, zipfile, sys +from dico import dico infiles = sys.argv[1] MINIMAL_CDS_LENGTH = int(sys.argv[3]) ## in aa number @@ -382,7 +360,9 @@ for file in list_file: count_file_processed = count_file_processed + 1 fasta_file_path = "./%s" %file - bash_fasta = dico(fasta_file_path) ### DEF 1 ### + fasta_file = open(fasta_file_path, "r") + bash_fasta = dico(fasta_file) ### DEF 1 ### + fasta_file.close() BESTORF_nuc, BESTORF_nuc_CODING, BESTORF_nuc_CDS_with_M, BESTORF_aa, BESTORF_aa_CODING, BESTORF_aa_CDS_with_M = find_good_ORF_criteria_3(bash_fasta, bash_codeUniversel) ### DEF 4 - PART 2 - ### ## a ## OUTPUT BESTORF_nuc
--- a/scripts/S02_remove_too_short_bit_or_whole_sequence.py Wed Sep 27 10:03:05 2017 -0400 +++ b/scripts/S02_remove_too_short_bit_or_whole_sequence.py Wed Jan 17 08:55:29 2018 -0500 @@ -5,28 +5,6 @@ ## Description : find and remove indels -############################### -##### DEF 0 : Dico fasta ##### -############################### -def dico(F2): - #F2 = open(fasta_file_path, "r") - dicoco = {} - while 1: - next2 = F2.readline() - if not next2: - break - if next2[0] == ">": - fasta_name_query = next2[:-1] - Sn = string.split(fasta_name_query, "||") - fasta_name_query = Sn[0] - next3 = F2.readline() - fasta_seq_query = next3[:-1] - dicoco[fasta_name_query]=fasta_seq_query - #F2.close() - return(dicoco) -################################################################################### - - ################### ###### DEF 9 ###### ################### @@ -63,6 +41,7 @@ ##### RUN RUN RUN ##### ####################### import string, os, time, re, sys +from dico import dico ### 0 ### PARAMETERS MIN_LENGTH_ALL_aa = int(sys.argv[3])-20
--- a/scripts/S03_remove_site_with_not_enough_species_represented.py Wed Sep 27 10:03:05 2017 -0400 +++ b/scripts/S03_remove_site_with_not_enough_species_represented.py Wed Jan 17 08:55:29 2018 -0500 @@ -4,28 +4,6 @@ ## Description : find and remove indels - -############################### -##### DEF 1 : Dico fasta ##### -############################### -def dico(F2): - dicoco = {} - while 1: - next2 = F2.readline() - if not next2: - break - if next2[0] == ">": - fasta_name_query = next2[:-1] - Sn = string.split(fasta_name_query, "||") - fasta_name_query = Sn[0] - next3 = F2.readline() - fasta_seq_query = next3[:-1] - dicoco[fasta_name_query]=fasta_seq_query - #F2.close() - return(dicoco) -################################################################################### - - #################### ###### DEF 2 ####### #################### @@ -116,6 +94,7 @@ ##### RUN RUN RUN ##### ####################### import string, os, time, re, sys, zipfile +from dico import dico ### 0 ### PARAMETERS MIN_SPECIES_NB = int(sys.argv[1]) @@ -149,6 +128,9 @@ dico_aa = dico(file_INaa) ### DEF 1 ### dico_nuc = dico(file_INnuc) ### DEF 1 ### + + file_INaa.close() + file_INnuc.close() if len(dico_aa) < MIN_SPECIES_NB : list_file.append(file)
# scripts/dico.py -- new file in changeset 3:ff98ed7849fa (previously /dev/null).
import string  # no longer used by dico(); retained so the module namespace is unchanged


def dico(F1):
    """Read a FASTA-like stream into a ``{header: sequence}`` dictionary.

    The stream is consumed with ``readline()``.  Every line starting with
    ``>`` is treated as a header; the line immediately following it is taken
    as that record's sequence (one line per sequence -- wrapped/multi-line
    sequences are NOT concatenated).  Any other line is ignored.

    Args:
        F1: an open, readable text file-like object exposing ``readline()``.
            It is neither opened nor closed here; that is the caller's job.

    Returns:
        dict mapping each header to its sequence.  The key keeps its leading
        ``>`` and is cut at the first ``||`` if one is present.  When the same
        header occurs several times, the last occurrence wins.  A header that
        is the final line of the stream maps to an empty string.
    """
    dicoco = {}
    while True:
        header_line = F1.readline()
        if not header_line:
            # readline() returns an empty string only at end of file.
            break
        if not header_line.startswith(">"):
            continue
        # Strip only line terminators: slicing off the last character would
        # eat a real character when the final line has no trailing newline,
        # and would leave a stray "\r" on CRLF files.
        # str.split replaces string.split(), which does not exist on Python 3.
        fasta_name_query = header_line.rstrip("\r\n").split("||")[0]
        fasta_seq_query = F1.readline().rstrip("\r\n")
        dicoco[fasta_name_query] = fasta_seq_query
    return dicoco