Mercurial > repos > abims-sbr > mutcount

<?xml version="1.0"?>

<tool name="MutCount" id="mutcount" version="2.0">
    <description>
        Counts codons and amino acids for each species of a set of species, then runs permutation tests.
    </description>

    <!-- Import the macro definitions from macros.xml so they can be expanded below. -->
    <macros>
        <import>macros.xml</import>
    </macros>

    <!-- Dependencies are supplied by the "python_required" macro
         (presumably defined in the imported macros.xml, which is not visible here). -->
    <requirements>
        <expand macro="python_required" />
    </requirements>

    <!-- Cheetah template that builds the shell command line.
         All steps are chained with "&&" so the job stops at the first failing step.
         Every path and free-text value is single-quoted so that spaces or shell
         metacharacters cannot split or alter the command. -->
    <command>
    <![CDATA[

        ## Link functions.py into the job working directory
        ## (presumably imported by the scripts called below).
        ln -s '$__tool_directory__/scripts/functions.py' . &&

        ## Concatenated method: two scripts run one after the other.
        #if str($method.method_run) == "concat" :
            python '$__tool_directory__/scripts/S01a_mutcount_pairs.py' $method.num_sampled $method.num_iter '$method.list_species'
            &&
            python '$__tool_directory__/scripts/S02a_codon_counting.py' '${method.concat_nuc}'
        #end if

        ## Separated method: each input is linked under its element identifier and
        ## the identifiers are passed to the script as one comma-separated argument.
        #if str($method.method_run) == "separated" :
            #set $infiles = ""
            #for $input in $method.sep_file
                ln -s '$input' '$input.element_identifier' &&
                #set $infiles = $infiles + $input.element_identifier + ","
            #end for
            ## Drop the trailing comma left by the loop.
            #set $infiles = $infiles[:-1]

            #if str($method.format_run)== "nucleic" :
                python '$__tool_directory__/scripts/S02b_study_seq_composition_nuc.py' '$infiles' '${method.concat_phy}'
            #end if

            #if str($method.format_run)== "proteic" :
                cp '$__tool_directory__/scripts/amino_acid_properties.csv' .
                &&
                python '$__tool_directory__/scripts/S01b_study_seq_composition_aa.py' '$infiles' '${method.concat_phy}'
            #end if
        #end if
    ]]>
    </command>

    <!-- Tool form. The "method" conditional selects which set of parameters is shown;
         the command template branches on the same method_run / format_run values. -->
    <inputs>
        <conditional name="method">
            <param name="method_run" type="select" label="Which method do you want to use for this tool? ">
                <option value="concat">Concatenated genes in DNA (concatenation from RAxML run)</option>
                <option value="separated">Set of separated genes (from ORF_Search output "output zip containing files with CDS without indel")</option>
            </param>

            <!-- Parameters of the concatenated method. -->
            <when value="concat">
                <param name="concat_nuc" type="data" format="fasta"
                       label="Choose your fasta file in nucleic format"
                       help="It must contain the concatenated file in NUCLEIC format from Phylogeny tool" />
                <!-- NOTE(review): the labels of the next two parameters look swapped with respect to
                     their names (num_sampled is labelled "iterations", num_iter "sampled codons").
                     Names and labels are left untouched because the argument order expected by the
                     script is not visible here; confirm against S01a_mutcount_pairs.py. -->
                <param name="num_sampled" type="integer" value="100" min="0" label="Number of iterations"/>
                <param name="num_iter" type="integer" value="100" min="0" label="Number of sampled codons"/>
                <param name="list_species" type="text"
                       label="List of species"
                       help="List the species separated with commas, without spaces (e.g. Ap,As,Ct,Gt,Yu)" />
            </when>

            <!-- Parameters of the separated method. -->
            <when value="separated">
                <param name="format_run" type="select" label="Which format do you want to use for this tool (concatenation and RAxML run) ? ">
                    <option value="nucleic">Nucleic format</option>
                    <option value="proteic">Proteic format</option>
                </param>
                <param name="sep_file" type="data" format="fasta" multiple="true"
                       label="Choose fasta files"
                       help="Concatenated files from ORF_search tool ; in nucleic or proteic, according to the format chosen above" />
                <param name="concat_phy" type="data" format="fasta"
                       label="Concatenated file from Phylogeny step"
                       help="This file is used to retrieve the species names" />
            </when>
        </conditional>
    </inputs>

    <!-- Every output is collected from a file in the job working directory and is
         only created when its filter matches the method / format chosen in the form. -->
    <outputs>
        <!-- Concatenated method -->
        <!-- Disabled outputs, kept here for reference:
        <data name="output1" format="txt" label="counts.txt" from_work_dir="counts.txt">
            <filter>method['method_run'] == 'concat'</filter>
        </data>
        <data name="output2" format="txt" label="biases.txt" from_work_dir="biases.txt">
            <filter>method['method_run'] == 'concat'</filter>
        </data>
        -->
        <data name="codons_counts" format="csv" label="codons_counts.csv" from_work_dir="codons_counts.csv">
            <filter>method['method_run'] == 'concat'</filter>
        </data>
        <data name="aa_counts" format="csv" label="aa_counts.csv" from_work_dir="aa_counts.csv">
            <filter>method['method_run'] == 'concat'</filter>
        </data>
        <data name="aatypes_counts" format="csv" label="aatypes_counts.csv" from_work_dir="aatypes_counts.csv">
            <filter>method['method_run'] == 'concat'</filter>
        </data>
        <data name="gc_counts" format="csv" label="gc_counts.csv" from_work_dir="gc_counts.csv">
            <filter>method['method_run'] == 'concat'</filter>
        </data>
        <data name="aa_transitions" format="csv" label="aa_transitions.csv" from_work_dir="aa_transitions.csv">
            <filter>method['method_run'] == 'concat'</filter>
        </data>
        <data name="aatypes_transitions" format="csv" label="aatypes_transitions.csv" from_work_dir="aatypes_transitions.csv">
            <filter>method['method_run'] == 'concat'</filter>
        </data>

        <!-- Separated method, nucleic format -->
        <data name="nuc_comp" format="csv" label="nuc_compositions.csv" from_work_dir="OUT/nuc_compositions.csv">
            <filter>method['method_run'] == 'separated' and method['format_run'] == 'nucleic'</filter>
        </data>
        <data name="percent_gc" format="csv" label="percent_GC.csv" from_work_dir="OUT/percent_GC.csv">
            <filter>method['method_run'] == 'separated' and method['format_run'] == 'nucleic'</filter>
        </data>
        <data name="percent_pur" format="csv" label="percent_purine.csv" from_work_dir="OUT/percent_purine.csv">
            <filter>method['method_run'] == 'separated' and method['format_run'] == 'nucleic'</filter>
        </data>
        <data name="purine_load" format="csv" label="Purine_Load_Indice.csv" from_work_dir="OUT/Purine_Load_Indice.csv">
            <filter>method['method_run'] == 'separated' and method['format_run'] == 'nucleic'</filter>
        </data>

        <!-- Separated method, proteic format -->
        <data name="prot_comp" format="csv" label="prot_compositions_All_AA.csv" from_work_dir="OUT/prot_compositions_All_AA.csv">
            <filter>method['method_run'] == 'separated' and method['format_run'] == 'proteic'</filter>
        </data>
        <data name="ivywrel" format="csv" label="IVYWREL.csv" from_work_dir="OUT/IVYWREL.csv">
            <filter>method['method_run'] == 'separated' and method['format_run'] == 'proteic'</filter>
        </data>
        <data name="erk_dnqtsh" format="csv" label="ERK_DNQTSH.csv" from_work_dir="OUT/ERK_DNQTSH.csv">
            <filter>method['method_run'] == 'separated' and method['format_run'] == 'proteic'</filter>
        </data>
        <data name="ek_qh" format="csv" label="EK_QH.csv" from_work_dir="OUT/EK_QH.csv">
            <filter>method['method_run'] == 'separated' and method['format_run'] == 'proteic'</filter>
        </data>
        <data name="fymink_garp" format="csv" label="FYMINK_GARP.csv" from_work_dir="OUT/FYMINK_GARP.csv">
            <filter>method['method_run'] == 'separated' and method['format_run'] == 'proteic'</filter>
        </data>
        <data name="avlimfyw" format="csv" label="AVLIMFYW.csv" from_work_dir="OUT/AVLIMFYW.csv">
            <filter>method['method_run'] == 'separated' and method['format_run'] == 'proteic'</filter>
        </data>
        <data name="stnq" format="csv" label="STNQ.csv" from_work_dir="OUT/STNQ.csv">
            <filter>method['method_run'] == 'separated' and method['format_run'] == 'proteic'</filter>
        </data>
        <data name="rhkde" format="csv" label="RHKDE.csv" from_work_dir="OUT/RHKDE.csv">
            <filter>method['method_run'] == 'separated' and method['format_run'] == 'proteic'</filter>
        </data>
        <data name="payre_mvgds" format="csv" label="PAYRE-MVGDS.csv" from_work_dir="OUT/PAYRE-MVGDS.csv">
            <filter>method['method_run'] == 'separated' and method['format_run'] == 'proteic'</filter>
        </data>
        <data name="res_weigth" format="csv" label="TotalResidueWeight.csv" from_work_dir="OUT/TotalResidueWeight.csv">
            <filter>method['method_run'] == 'separated' and method['format_run'] == 'proteic'</filter>
        </data>
        <data name="res_vol" format="csv" label="TotalResidueVolume.csv" from_work_dir="OUT/TotalResidueVolume.csv">
            <filter>method['method_run'] == 'separated' and method['format_run'] == 'proteic'</filter>
        </data>
        <data name="spec_vol" format="csv" label="TotalPartialSpecificVolume.csv" from_work_dir="OUT/TotalPartialSpecificVolume.csv">
            <filter>method['method_run'] == 'separated' and method['format_run'] == 'proteic'</filter>
        </data>
        <data name="hydrat" format="csv" label="TotalHydratation.csv" from_work_dir="OUT/TotalHydratation.csv">
            <filter>method['method_run'] == 'separated' and method['format_run'] == 'proteic'</filter>
        </data>

    </outputs>

    <tests>
        <!-- Concatenated method. The six outputs enabled by the 'concat' filter are compared
             with reference files; lines_diff tolerates a few differing lines (presumably
             because of the random resampling; confirm). expect_num_outputs additionally
             checks that the output filters yield exactly these six datasets. -->
        <test expect_num_outputs="6">
            <conditional name="method" >
                <param name="method_run" value="concat" />
                <param name="concat_nuc" ftype="fasta" value="test_07_output_phylogeny_concatenation.fasta" />
                <param name="num_sampled" value="100" />
                <param name="num_iter" value="100" />
                <!-- list_species is a text parameter, so it takes no ftype attribute. -->
                <param name="list_species" value="Ac,Am,Ap,Pu" />
            </conditional>
            <output name="codons_counts" value="OUT_concat/codons_counts.csv" lines_diff="8"/>
            <output name="aa_counts" value="OUT_concat/aa_counts.csv" lines_diff="8"/>
            <output name="aatypes_counts" value="OUT_concat/aatypes_counts.csv" lines_diff="8"/>
            <output name="gc_counts" value="OUT_concat/gc_counts.csv"/>
            <output name="aa_transitions" value="OUT_concat/aa_transitions.csv" lines_diff="14"/>
            <output name="aatypes_transitions" value="OUT_concat/aatypes_transitions.csv" lines_diff="14"/>
        </test>

        <!-- Separated method, nucleic format. Each of the four outputs enabled by the
             'separated' + 'nucleic' filter is checked for one expected line;
             expect_num_outputs checks that exactly these four datasets are produced. -->
        <test expect_num_outputs="4">
            <conditional name="method" >
                <param name="method_run" value="separated" />
                <param name="format_run" value="nucleic" />
                <param name="sep_file" ftype="fasta" value="sep_nuc/locus1_sp6_sp6.fasta,sep_nuc/locus1_sp8_sp8.fasta,sep_nuc/locus2_sp6_sp6.fasta" />
                <param name="concat_phy" ftype="fasta" value="phylogeny_concat.fasta" />
            </conditional>
            <output name="nuc_comp">
                <assert_contents>
                    <has_line line="locus2_sp6_sp6.fasta,0.30208,0.23958,0.19792,0.26042,0.29688,0.27604,0.18229,0.24479,NA,NA,NA,NA,0.30208,0.24479,0.19792,0.25521,NA,NA,NA,NA,0.31250,0.26042,0.17188,0.25521,0.32292,0.21875,0.20312,0.25521,NA,NA,NA,NA,NA,NA,NA,NA,0.31771,0.25521,0.17708,0.25000"/>
                </assert_contents>
            </output>
            <output name="percent_gc">
                <assert_contents>
                    <has_line line="locus2_sp6_sp6.fasta,45.83333,42.70833,NA,45.31250,NA,42.70833,45.83333,NA,NA,42.70833" />
                </assert_contents>
            </output>
            <output name="percent_pur">
                <assert_contents>
                    <has_line line="locus2_sp6_sp6.fasta,56.25000,54.16667,NA,55.72917,NA,56.77083,57.81250,NA,NA,56.77083" />
                </assert_contents>
            </output>
            <output name="purine_load">
                <assert_contents>
                    <has_line line="locus2_sp6_sp6.fasta,192,12,12,62.50000,62.50000,192,12,4,62.50000,20.83333,NA,NA,NA,NA,NA,192,11,11,57.29167,57.29167,NA,NA,NA,NA,NA,192,16,10,83.33333,52.08333,192,10,20,52.08333,104.16667,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,192,14,12,72.91667,62.50000" />
                </assert_contents>
            </output>
        </test>

        <!-- Separated method, proteic format. Only four of the thirteen outputs enabled by
             the 'separated' + 'proteic' filter are content-checked here;
             expect_num_outputs checks that all thirteen datasets are produced. -->
        <test expect_num_outputs="13">
            <conditional name="method" >
                <param name="method_run" value="separated" />
                <param name="format_run" value="proteic" />
                <param name="sep_file" ftype="fasta" value="sep_aa/locus1_sp6_sp6.fasta,sep_aa/locus1_sp8_sp8.fasta,sep_aa/locus2_sp6_sp6.fasta" />
                <param name="concat_phy" ftype="fasta" value="phylogeny_concat.fasta" />
            </conditional>
            <output name="ivywrel">
                <assert_contents>
                    <has_line line="locus2_sp6_sp6.fasta,21.00000,0.32812,23.00000,0.35938,NA,NA,23.00000,0.35938,NA,NA,22.00000,0.34375,23.00000,0.35938,NA,NA,NA,NA,22.00000,0.34375" />
                </assert_contents>
            </output>
            <output name="rhkde">
                <assert_contents>
                    <has_line line="locus1_sp8_sp8.fasta,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,14.00000,0.18182,4.00000,0.05195,10.00000,0.12987,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,16.00000,0.20779,5.00000,0.06494,11.00000,0.14286,15.00000,0.19481,4.00000,0.05195,11.00000,0.14286,14.00000,0.18182,2.00000,0.02597,12.00000,0.15584,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,13.00000,0.19697,2.00000,0.03030,11.00000,0.16667,15.00000,0.19481,4.00000,0.05195,11.00000,0.14286"/>
                </assert_contents>
            </output>
            <output name="payre_mvgds">
                <assert_contents>
                    <has_line line="locus2_sp6_sp6.fasta,18.00000,0.28125,6.00000,0.09375,20.00000,0.31250,0.90000,0.30000,18.00000,0.28125,6.00000,0.09375,21.00000,0.32812,0.85714,0.28571,NA,NA,NA,NA,NA,NA,NA,NA,18.00000,0.28125,6.00000,0.09375,20.00000,0.31250,0.90000,0.30000,NA,NA,NA,NA,NA,NA,NA,NA,17.00000,0.26562,6.00000,0.09375,20.00000,0.31250,0.85000,0.30000,20.00000,0.31250,8.00000,0.12500,19.00000,0.29688,1.05263,0.42105,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,17.00000,0.26562,6.00000,0.09375,20.00000,0.31250,0.85000,0.30000"/>
                </assert_contents>
            </output>
            <output name="avlimfyw">
                <assert_contents>
                    <has_line line="locus2_sp6_sp6.fasta,27.00000,0.42188,21.00000,0.32812,6.00000,0.09375,28.00000,0.43750,22.00000,0.34375,6.00000,0.09375,NA,NA,NA,NA,NA,NA,28.00000,0.43750,22.00000,0.34375,6.00000,0.09375,NA,NA,NA,NA,NA,NA,28.00000,0.43750,22.00000,0.34375,6.00000,0.09375,30.00000,0.46875,24.00000,0.37500,6.00000,0.09375,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,28.00000,0.43750,22.00000,0.34375,6.00000,0.09375"/>
                </assert_contents>
            </output>
        </test>
    </tests>

    <help>

@HELP_AUTHORS@

<![CDATA[

**Last Version** : Victor Mataigne and Gildas Le Corguillé

--------

**Description**

This script counts the number of codons, amino acids, and types of amino acids in sequences, as well as the mutation bias from one item to another between 2 sequences. Counting is then compared to empirical p-values, obtained from bootstrapped sequences obtained from a subset of sequences.

In the output files, the pvalues indicate the position of the observed data in a distribution of empirical countings obtained from a resample of the data. Values above 0.95 indicate a significantly higher counting, values under 0.05 a significantly lower counting.

The script resamples random pairs of aligned codons to determine what countings can be expected under the hypothesis of a homogeneous dataset.
Countings are performed on each generated random alignment; thousands of alignments make it possible to draw a Gaussian distribution of the countings.
Then the script simply checks whether the observed data are within the 5% lowest or 5% highest values of the distribution.

--------

.. class:: infomark

**Input files**

If you choose the concatenated method, the input file is the concatenated genes fasta file (in nucleic format) from a previous run of the tool ConcatPhyl.

If you choose the separated method, there are two input files :

- A dataset collection containing output files from the CDS_Search tool, the one without indels. These files must be in nucleic or proteic format according to the format chosen along with the method.
- The concatenated genes fasta file from ConcatPhyl, only used here to retrieve species name.

--------

**Parameters**

There are parameters only for the "Concatenated" method :

- The number of iterations : the number of alignments that will be generated (effect on the resolution of the gaussian distribution). Shouldn't be lower than 1000 to have a relatively smooth gaussian distribution.

- The number of sampled codons : the number of pairs of codons in each generated alignment (effect on the robustness of the countings performed on this alignment). Shouldn't be lower than 1000 to detect codons with relatively low occurrence (<1%).

- The list of species, separated by commas and without space (e.g : sp1,sp2,sp3,sp4). You can run the tool on subgroup of species, not only on the total number of species present in the previous tools. You can also write 'all' to include every species.

--------

**Outputs**

Many outputs in .csv format, varying according to the chosen method and format (separated, nucleic ...) :

- When method = concat : 6 .csv outputs : countings of codons, amino acids, amino acid types and GC, and transitions from amino acid to amino acid and from amino acid type to amino acid type.
- When method = separated and format = nucleic : 4 .csv outputs : nucleotide composition, GC percent, purine percent, purine load index.
- When method = separated and format = proteic : 13 .csv outputs : protein composition, several files counting various AA combinations, results on residues, hydration, partial specific volume.

---------

**The AdaptSearch Pipeline**

.. image:: adaptsearch_picture_helps.png
    :height: 593
    :width: 852

---------

Changelog
---------

**Version 2.1 - 10/01/2017**

- Split the output of the concatenated method into several csv files.
- Fixed a bug in the output files of the separated method.

**Version 2.0 - 12/07/2017**

- NEW: Replaced the zip between tools by Dataset Collection
- More functional tests

**Version 1.0 - 14/04/2017**

- Added the tools to the suite
- Added a functional test with planemo
- Planemo test using conda dependencies for python
- Scripts renamed + symlinks to the directory 'scripts'

    ]]>

    </help>

    <!-- Citation entries are expanded from the "citations" macro
         (presumably defined in the imported macros.xml, which is not visible here). -->
    <expand macro="citations" />

</tool>
author	abims-sbr
date	Tue, 27 Feb 2018 08:43:50 -0500
parents	263caa68d7bb
children	0ba551449008