mutcount: MutCount.xml comparison

comparison MutCount.xml @ 1:8de21b6eb110 draft

planemo upload for repository https://github.com/abims-sbr/adaptearch commit 44a89d5eeb82789bfc643b33c11f391281b6374b

author	abims-sbr
date	Wed, 27 Sep 2017 10:04:08 -0400
parents	78dd6454f6f0
children	988467f963f0

comparison

equal deleted inserted replaced

-:78dd6454f6f0
+:8de21b6eb110
 <?xml version="1.0"?>
-<tool name="MutCount" id="mutcount" version="1.0">
+<tool name="MutCount" id="mutcount" version="2.0">
-	<description>
+<description>
-		This tool proceeds to count codons, amino acids on each species of a set of species, and then proceeds to permutation tests.
+This tool proceeds to count codons, amino acids on each species of a set of species, and then proceeds to permutation tests.
-	</description>
+</description>
-	<macros>
+<macros>
-		<import>macros.xml</import>
+<import>macros.xml</import>
-	</macros>
+</macros>
-	<requirements>
+<requirements>
-		<expand macro="python_required" />
+<expand macro="python_required" />
-	</requirements>
+</requirements>
-	<command>
+<command>
-	<![CDATA[
+<![CDATA[
-		#if str($method.method_run) == "concat" :
+#if str($method.method_run) == "concat" :
-			python $__tool_directory__/scripts/S01a_mutcount_pairs.py $method.num_sampled $method.num_iter $method.list_species
+python '$__tool_directory__/scripts/S01a_mutcount_pairs.py' $method.num_sampled $method.num_iter $method.list_species
-			&&
+&&
-			python $__tool_directory__/scripts/S02a_codon_counting.py ${method.zip_nuc}
+python '$__tool_directory__/scripts/S02a_codon_counting.py' ${method.concat_nuc}
-		#end if
+#end if
-		#if str($method.method_run) == "separated" :
+#if str($method.method_run) == "separated" :
+#set $infiles = ""
-			#if str($method.format.format_run)== "nucleic" :
+#for $input in $method.sep_file
-				python $__tool_directory__/scripts/S01b_study_seq_composition_nuc.py ${method.format.zip_nuc}  ${method.format.concat_nuc}
+ln -s '$input' '$input.element_identifier';
-				&&
+#set $infiles = $infiles + $input.element_identifier + ","
-				zip -r multigenes_nucleic.zip OUT/
+#end for
-			#end if
+#set $infiles = $infiles[:-1]
-			#if str($method.format.format_run)== "proteic" :
+#if str($method.format_run)== "nucleic" :
-				cp $__tool_directory__/scripts/amino_acid_properties.csv .
+python '$__tool_directory__/scripts/S02b_study_seq_composition_nuc.py' '$infiles' ${method.concat_phy}
-				&&
+#end if
-python $__tool_directory__/scripts/S02b_study_seq_composition_aa.py ${method.format.zip_aa} ${method.format.concat_prot}
-				&&
+#if str($method.format_run)== "proteic" :
-				zip -r multigenes_proteic.zip OUT/
+cp '$__tool_directory__/scripts/amino_acid_properties.csv' .
-			#end if
+&&
-		#end if
+python '$__tool_directory__/scripts/S01b_study_seq_composition_aa.py' '$infiles' ${method.concat_phy}
+#end if
+#end if
 ]]>
-	</command>
+</command>
 <inputs>
-	    <conditional name="method">
+<conditional name="method">
-		    <param name="method_run" type="select" label="Which method do you want to use for this tool? ">
+<param name="method_run" type="select" label="Which method do you want to use for this tool? ">
-			    <option value="concat">Concatenated genes in DNA (concatenation from RAxML run)</option>
+<option value="concat">Concatenated genes in DNA (concatenation from RAxML run)</option>
-			    <option value="separated">Set of separated genes (from ORF_Search output "output zip containing files with CDS without indel")</option>
+<option value="separated">Set of separated genes (from ORF_Search output "output zip containing files with CDS without indel")</option>
-		    </param>
+</param>
-		    <when value="concat">
+<when value="concat">
-			    <param name="zip_nuc" type="data" format="fasta" label="Choose your fasta file in nucleic format" help="It must contain the concatenated file in NUCLEIC format from Phylogeny tool" />
+<param name="concat_nuc" type="data" format="fasta" label="Choose your fasta file in nucleic format" help="It must contain the concatenated file in NUCLEIC format from Phylogeny tool" />
-			    <param name="num_sampled" type="integer" value="100" min="0" label="Number of iterations"/>
+<param name="num_sampled" type="integer" value="100" min="0" label="Number of iterations"/>
-			    <param name="num_iter" type="integer" value="100" min="0" label="Number of sampled codons"/>
+<param name="num_iter" type="integer" value="100" min="0" label="Number of sampled codons"/>
-			    <param name="list_species" type="text" size="100" label="List of species" help="List the species separated with a comma (for e.g Ap,As,Ct,Gt,Yu)" />
+<param name="list_species" type="text" size="100" label="List of species" help="List the species separated with a comma (for e.g Ap,As,Ct,Gt,Yu)" />
-		    </when>
+</when>
-		    <when value="separated">
+<when value="separated">
-			    <conditional name="format">
+<param name="format_run" type="select" label="Which format do you want to use for this tool (concatenation and RAxML run) ? ">
-				    <param name="format_run" type="select" label="Which format do you want to use for this tool (concatenation and RAxML run) ? ">
+<option value="nucleic">Nucleic format</option>
-					    <option value="nucleic">Nucleic format</option>
+<option value="proteic">Proteic format</option>
-					    <option value="proteic">Proteic format</option>
+</param>
-				    </param>
+<param name="sep_file" type="data" format="fasta" multiple="true" label="Choose fasta files" help="Concatenated files from ORF_search tool ; in nucleic or proteic, according to the format chosen above" />
+<param name="concat_phy" type="data" format="fasta" label="Concatenated file from Phylogeny step" help="This file is used to retrieve the species names" />
-				    <when value="nucleic">
+</when>
-					    <param name="zip_nuc" type="data" format="fasta,no_unzip.zip" label="Choose your ZIP file" help="It must contain the concatenated file in NUCLEIC format from ORF_search tool" />
+</conditional>
-					    <param name="concat_nuc" type="data" format="fasta" label="Concatenated file from Phylogeny step" help="This file is used to retrieve the species names" />
-				    </when>
-				    <when value="proteic">
-					    <param name="zip_aa" type="data" format="fasta,no_unzip.zip" label="Choose your ZIP file" help="It must contain the concatenated file in PROTEIC format from ORF_search tool" />
-					    <param name="concat_prot" type="data" format="fasta" label="concatenated file from Phylogeny step" help="This file is used to retrieve the species names" />
-				    </when>
-			    </conditional>
-		    </when>
-	    </conditional>
 </inputs>
-	<outputs>
+<outputs>
-		<data format="txt" name="output1" label="concatenated_results.txt" from_work_dir="codoncounting_results.txt" >
+<!-- output concat -->
-			<filter>(method['method_run']=='concat')</filter>
+<data format="txt" name="output1" label="concatenated_results.txt" from_work_dir="codoncounting_results.txt" >
-		</data>
+<filter>(method['method_run']=='concat')</filter>
-		<data format="no_unzip.zip" name="output2" label="multigenes_nucleic.zip" from_work_dir="multigenes_nucleic.zip" >
+</data>
-			<filter>(method['method_run']=='separated' and method['format']['format_run']== 'nucleic')</filter>
-		</data>
+<!-- outputs separated - nucleic -->
-		<data format="no_unzip.zip" name="output3" label="multigenes_proteic.zip" from_work_dir="multigenes_proteic.zip"  >
+<data format="csv" name="nuc_comp" label="nuc_compositions.csv" from_work_dir="OUT/nuc_compositions.csv" >
-			<filter>(method['method_run']=='separated' and method['format']['format_run']== 'proteic')</filter>
+<filter>(method['method_run']=='separated' and method['format_run']== 'nucleic')</filter>
-		</data>
+</data>
-	</outputs>
+<data format="csv" name="percent_gc" label="percent_GC.csv" from_work_dir="OUT/percent_GC.csv" >
+<filter>(method['method_run']=='separated' and method['format_run']== 'nucleic')</filter>
-	<tests>
+</data>
-	    <test>
+<data format="csv" name="percent_pur" label="percent_purine.csv" from_work_dir="OUT/percent_purine.csv" >
-	        <conditional name="method" >
+<filter>(method['method_run']=='separated' and method['format_run']== 'nucleic')</filter>
-	            <param name="method_run" value="concat" />
+</data>
-	            <param name="zip_nuc" ftype="fasta" value="test_07_output_phylogeny_concatenation.fasta" />
+<data format="csv" name="purine_load" label="Purine_Load_Indice.csv" from_work_dir="OUT/Purine_Load_Indice.csv" >
-	            <param name="num_sampled" value="100" />
+<filter>(method['method_run']=='separated' and method['format_run']== 'nucleic')</filter>
-	            <param name="num_iter" value="100" />
+</data>
-	            <param name="list_species" ftype="text" value="Ac,Pu,Am,Ap,Pf,Pg,Th,Ph,Te" />
-	        </conditional>
+<!-- outputs separated - proteic -->
+<data format="csv" name="prot_comp" label="prot_compositions_All_AA.csv" from_work_dir="OUT/prot_compositions_All_AA.csv" >
+<filter>(method['method_run']=='separated' and method['format_run']== 'proteic')</filter>
+</data>
+<data format="csv" name="ivywrel" label="IVYWREL.csv" from_work_dir="OUT/IVYWREL.csv" >
+<filter>(method['method_run']=='separated' and method['format_run']== 'proteic')</filter>
+</data>
+<data format="csv" name="erk_dnqtsh" label="ERK_DNQTSH.csv" from_work_dir="OUT/ERK_DNQTSH.csv" >
+<filter>(method['method_run']=='separated' and method['format_run']== 'proteic')</filter>
+</data>
+<data format="csv" name="ek_qh" label="EK_QH.csv" from_work_dir="OUT/EK_QH.csv" >
+<filter>(method['method_run']=='separated' and method['format_run']== 'proteic')</filter>
+</data>
+<data format="csv" name="fymink_garp" label="FYMINK_GARP.csv" from_work_dir="OUT/FYMINK_GARP.csv" >
+<filter>(method['method_run']=='separated' and method['format_run']== 'proteic')</filter>
+</data>
+<data format="csv" name="avlimfyw" label="AVLIMFYW.csv" from_work_dir="OUT/AVLIMFYW.csv" >
+<filter>(method['method_run']=='separated' and method['format_run']== 'proteic')</filter>
+</data>
+<data format="csv" name="stnq" label="STNQ.csv" from_work_dir="OUT/STNQ.csv" >
+<filter>(method['method_run']=='separated' and method['format_run']== 'proteic')</filter>
+</data>
+<data format="csv" name="rhkde" label="RHKDE.csv" from_work_dir="OUT/RHKDE.csv" >
+<filter>(method['method_run']=='separated' and method['format_run']== 'proteic')</filter>
+</data>
+<data format="csv" name="payre_mvgds" label="PAYRE-MVGDS.csv" from_work_dir="OUT/PAYRE-MVGDS.csv" >
+<filter>(method['method_run']=='separated' and method['format_run']== 'proteic')</filter>
+</data>
+<data format="csv" name="res_weigth" label="TotalResidueWeight.csv" from_work_dir="OUT/TotalResidueWeight.csv" >
+<filter>(method['method_run']=='separated' and method['format_run']== 'proteic')</filter>
+</data>
+<data format="csv" name="res_vol" label="TotalResidueVolume.csv" from_work_dir="OUT/TotalResidueVolume.csv" >
+<filter>(method['method_run']=='separated' and method['format_run']== 'proteic')</filter>
+</data>
+<data format="csv" name="spec_vol" label="TotalPartialSpecificVolume.csv" from_work_dir="OUT/TotalPartialSpecificVolume.csv" >
+<filter>(method['method_run']=='separated' and method['format_run']== 'proteic')</filter>
+</data>
+<data format="csv" name="hydrat" label="TotalHydratation.csv" from_work_dir="OUT/TotalHydratation.csv" >
+<filter>(method['method_run']=='separated' and method['format_run']== 'proteic')</filter>
+</data>
+</outputs>
+<tests>
+<test>
+<conditional name="method" >
+<param name="method_run" value="concat" />
+<param name="concat_nuc" ftype="fasta" value="test_07_output_phylogeny_concatenation.fasta" />
+<param name="num_sampled" value="100" />
+<param name="num_iter" value="100" />
+<param name="list_species" ftype="text" value="Ac,Pu,Am,Ap,Pf,Pg,Th,Ph,Te" />
+</conditional>
 <output name="output1">
 <assert_contents>
 <has_text text="counting of Ac"/>
 <has_text text="counting of Pu"/>
 <has_text text="counting of Am"/>
 <has_text text="counting of Pg"/>
 <has_text text="counting of Th"/>
 <has_text text="counting of Ph"/>
 </assert_contents>
 </output>
 </test>
-	</tests>
+<test>
-	<help>
+<conditional name="method" >
+<param name="method_run" value="separated" />
+<param name="format_run" value="nucleic" />
+<param name="sep_file" ftype="fasta" value="sep_nuc/locus1_sp6_sp6.fasta,sep_nuc/locus1_sp8_sp8.fasta,sep_nuc/locus2_sp6_sp6.fasta" />
+<param name="concat_phy" ftype="fasta" value="phylogeny_concat.fasta" />
+</conditional>
+<output name="nuc_comp" >
+<assert_contents>
+<has_line line="locus1_sp8_sp8.fasta,0.29870,0.25541,0.19481,0.25108," />
+</assert_contents>
+</output>
+<output name="percent_gc">
+<assert_contents>
+<has_line line="locus2_sp6_sp6.fasta,42.70833," />
+</assert_contents>
+</output>
+<output name="percent_pur" >
+<assert_contents>
+<has_line line="locus2_sp6_sp6.fasta,56.77083," />
+</assert_contents>
+</output>
+<output name="purine_load" >
+<assert_contents>
+<has_line line="locus2_sp6_sp6.fasta,192,14,12,72.91667,62.50000," />
+</assert_contents>
+</output>
+</test>
+<test>
+<conditional name="method" >
+<param name="method_run" value="separated" />
+<param name="format_run" value="proteic" />
+<param name="sep_file" ftype="fasta" value="sep_aa/locus1_sp6_sp6.fasta,sep_aa/locus1_sp8_sp8.fasta,sep_aa/locus2_sp6_sp6.fasta" />
+<param name="concat_phy" ftype="fasta" value="phylogeny_concat.fasta" />
+</conditional>
+<output name="prot_comp" >
+<assert_contents>
+<has_line line="locus2_sp6_sp6.fasta,0.12500,0.00000,0.09375,0.04688,0.03125,0.09375,0.03125,0.07812,0.00000,0.04688,0.01562,0.03125,0.03125,0.01562,0.04688,0.00000,0.07812,0.07812,0.06250,0.09375,0.12500,0.00000,0.09375,0.04688,0.03125,0.09375,0.01562,0.10938,0.00000,0.04688,0.01562,0.04688,0.01562,0.01562,0.04688,0.00000,0.07812,0.07812,0.06250,0.07812,0.12500,0.00000,0.09375,0.04688,0.04688,0.09375,0.01562,0.09375,0.00000,0.04688,0.01562,0.03125,0.01562,0.01562,0.04688,0.00000,0.07812,0.07812,0.06250,0.09375,0.14062,0.00000,0.09375,0.06250,0.04688,0.09375,0.01562,0.09375,0.00000,0.03125,0.01562,0.04688,0.01562,0.01562,0.03125,0.00000,0.07812,0.07812,0.06250,0.07812,0.12500,0.00000,0.12500,0.04688,0.03125,0.09375,0.01562,0.10938,0.00000,0.04688,0.01562,0.04688,0.01562,0.01562,0.04688,0.00000,0.07812,0.07812,0.06250,0.04688,0.14062,0.00000,0.09375,0.06250,0.04688,0.09375,0.01562,0.09375,0.00000,0.03125,0.01562,0.04688,0.01562,0.01562,0.03125,0.00000,0.07812,0.07812,0.06250,0.07812," />
+</assert_contents>
+</output>
+<output name="ivywrel">
+<assert_contents>
+<has_line line="locus2_sp6_sp6.fasta,21.00000,0.32812,23.00000,0.35938,23.00000,0.35938,22.00000,0.34375,23.00000,0.35938,22.00000,0.34375," />
+</assert_contents>
+</output>
+<output name="res_vol" >
+<assert_contents>
+<has_line line="locus2_sp6_sp6.fasta,6575.00000,6593.00000,6587.00000,6645.00000,6631.00000,6645.00000," />
+</assert_contents>
+</output>
+<output name="hydrat" >
+<assert_contents>
+<has_line line="locus2_sp6_sp6.fasta,171.50000,171.50000,170.50000,171.00000,171.50000,171.00000," />
+</assert_contents>
+</output>
+</test>
+</tests>
+<help>
 .. class:: infomark
 **Authors**  Eric Fontanillas and Pierre-Guillaume Brun created the scripts of this pipeline.
 ========
 Mutcount
 ========
 -----------
-Description (temporary - need to be to re-writed)
+Description
 -----------
-This script counts the number of codons, amino acids, and types of amino acids in sequences, as well as the mutation bias from one item to another between 2 sequences
+| This script counts the number of codons, amino acids, and types of amino acids in sequences, as well as the mutation bias from one item to another between 2 sequences. Counting is then compared to empirical p-values, obtained from bootstrapped sequences drawn from a subset of sequences.
-counting is then compared to empirical p-values, obtained from bootstrapped sequences obtained from a subset of sequences
-In the output files, the pvalues indicate the position of the observed data in a distribution of empirical countings obtained from a resample of the data. Values above 0.95 indicate a significantly higher counting, values under 0.05 a significantly lower counting
+| In the output files, the pvalues indicate the position of the observed data in a distribution of empirical countings obtained from a resample of the data. Values above 0.95 indicate a significantly higher counting, values under 0.05 a significantly lower counting
-the script automatically reads the sequences to compare from a file that must be called pairs.txt and located with the .fasta file
+| The script automatically reads the sequences to compare from a file that must be called pairs.txt (pre-computed by the tool itself) and located with the .fasta file. In the pairs.txt file, sequence pairs (let's assume sequences X, Y, Z, U, V) must be written as 'X Y\nU V\nZ V'. In this case, codoncounting will count the occurrence of codons, amino acids, and types of amino acids in X, U, Z, and count the mutation bias from Y to X, V to U and V to Z. X, Y, Z, U, V must be character strings contained in the sequence names in the .fasta file (and be specific to each of them). In pairs.txt, you must write how the bootstrapped resampling of sequences should be built. This must be formatted as: 'X Y\nbackground: length iterration plusminus listofspecies\nU V\nZ V' (explanation below). Backgrounds must be explicitly written in the pairs.txt file (the script still integrates default parameters). This implies that the first line of pairs.txt should be a background line. By default, once the background has been determined, it will be applied to each subsequent analysis until another background is written, e.g. 'background: length1 iterration1 plusminus1 listofspecies1\nU V\nZ V\nbackground: length2 iterration2 plusminus2 listofspecies2\nX Y': the first background is applied to U V and Z V and the 2nd background to X Y.
-in the pairs.txt file, sequences (let's assume X, Y, Z, U, V) pairs must be written as 'X Y\nU V\nZ V'
-in this case, codoncounting will count the occurence of codons, amino acids, and types of amino acids in X, U, Z, and count the mutation bias from Y to X, V to U and V to Z
-you can add comments in the pairs.txt file inbetween lines, beginning with '#'. E.G. 'X Y\n#This is my comment\nU V\nZ V'
+| The script resamples random pairs of aligned codons to determine what countings can be expected under the hypothesis of a homogeneous dataset.
-X, Y, Z, U, V must be character strings contained in the sequences names in the .fasta file (and be specific to each of them)
+| Countings are performed on each generated random alignment; thousands of alignments make it possible to draw a gaussian distribution of the countings.
-in pairs.txt, you must write how should be built the bootstrapped resampling of sequences. This must be formated as:'X Y\nbackground: length iterration plusminus listofspecies\nU V\nZ V', explanation below
+| Then the script simply checks whether the observed data are within the 5% lowest or 5% highest values of the distribution
-backgrounds must be excplicitely written in the pairs.txt file (the script still integers default parameters). This implies that the first line of pairs.txt should be a background line
-by default, once the background has been determined, it will be applied to each subsequent analysis until another background is written
+| - length is the number of pairs of codons in each generated alignment (effect on the robustness of the countings performed on this alignment)
-e.g. 'background: length1 iterration1 plusminus1 listofspecies1\nU V\nZ V\nbackground: length2 iterration2 plusminus2 listofspecies2\nX Y' the first background is applied to U V and Z V and the 2nd background to X Y
+| - iterration is the number of alignments that will be generated (effect on the resolution of the gaussian distribution)
+| - plusminus can be either '+' or '-', '+' indicates that the following species only must be resampled, '-' that the following species must be excluded from the resampling
+| - listofspecies is the list of species (names contained in the sequences names from the fasta file) that must be included or excluded from the sampling. You can also write 'all' to include every species (in this case, plusminus parameter is ignored)
-#the script resamples random pairs of aligned codon to determine what countings can be expected under the hypothesis of an homogenous dataset
-#countings are performed on each generated random alignement, thousands of alignments allow to draw a gaussian distribution of the countings
+| Iteration shouldn't be lower than 1000 to have a relatively smooth gaussian distribution, length shouldn't be lower than 1000 to detect codons with relatively low occurrence (&lt;1%). For the list of species, you can try to form subgroups depending on the studied parameter (e.g. comparing a terrestrial species with a background composed of marine species)
-#then the script simply checks whether the observed data are within the 5% lowest or 5% highest values of the distribution
-in background: length iterration plusminus listofspecies
--&gt; length is the number of pairs of codons in each generated alignments (effect on the robustness on the countings performed on this alignement)
--&gt; iterration is the number of alignments that will be generated (effect on the resolution of the gaussian distribution)
--&gt; plusminus can be either '+' or '-', '+' indicates that the following species only must be resampled, '-' that the following species must be excluded from the resampling
--&gt; listofspecies is the list of species (names contained in the sequences names from the fasta file) that must be included or excluded from the sampling. You can also write 'all' to include every species (in this case, plusminus parameter is ignored)
-#full example: background 5000 10000 + melanogaster elegans sapiens
-iterration shouldn't be lower that 1000 to have a relatively smooth gaussian distribution, length shouldn't be lower as 1000 to detect codons with relatively low occurence (&lt;1%)
-for the list of species, you can try to form subgroups depending on the studied parameter (e.g. comparing a terrestrial species with a background composed of marine species)
 .. class:: infomark
 **Important part of this tool (the inputs format)**
 --------
-============
+-----------
-Input format
+Input files
-============
+-----------
-The script takes as input the DNA alignment (fasta format): python codoncounting.py file_path.fasta
+| If you choose the concatenated method, the input file is the concatenated genes fasta file (in nucleic format) from a previous run of the tool ConcatPhyl.
-example.
+| If you choose the separated method, there are two input files :
+| - A dataset collection containing output files from the CDS_Search tool, the one without indels. These files must be in nucleic or proteic format according to the format chosen along with the method.
-	</help>
+| - The concatenated genes fasta file from ConcatPhyl, only used here to retrieve species name.
-	<expand macro="citations" />
+----------
+Parameters
+----------
+| There are parameters only for the "Concatenated" method :
+| - The number of iterations
+| - The number of sampled codons
+| - The list of species, separated by commas and without spaces (e.g : sp1,sp2,sp3,sp4). You can run the tool on a subgroup of species, not only on the total number of species present in the previous tools.
+---------
+Changelog
+---------
+**Version 2.0 - 12/07/2017**
+- NEW: Replaced the zip between tools by Dataset Collection
+- More functional tests
+**Version 1.0 - 14/04/2017**
+- Added the tools to the suite
+- Added a functional test with planemo
+- Planemo test using conda dependencies for python
+- Scripts renamed + symlinks to the directory 'scripts'
+</help>
+<expand macro="citations" />
 </tool>

Mercurial > repos > abims-sbr > mutcount

comparison MutCount.xml @ 1:8de21b6eb110 draft