comparison MutCount.xml @ 2:988467f963f0 draft

planemo upload for repository https://github.com/abims-sbr/adaptsearch commit cf1b9c905931ca2ca25faa4844d45c908756472f
author abims-sbr
date Wed, 17 Jan 2018 08:57:49 -0500
parents 8de21b6eb110
children 263caa68d7bb
comparison
equal deleted inserted replaced
1:8de21b6eb110 2:988467f963f0
13 <expand macro="python_required" /> 13 <expand macro="python_required" />
14 </requirements> 14 </requirements>
15 15
16 <command> 16 <command>
17 <![CDATA[ 17 <![CDATA[
18
19 ln -s $__tool_directory__/scripts/functions.py . &&
20
18 #if str($method.method_run) == "concat" : 21 #if str($method.method_run) == "concat" :
19 python '$__tool_directory__/scripts/S01a_mutcount_pairs.py' $method.num_sampled $method.num_iter $method.list_species 22 python '$__tool_directory__/scripts/S01a_mutcount_pairs.py' $method.num_sampled $method.num_iter $method.list_species
20 && 23 &&
21 python '$__tool_directory__/scripts/S02a_codon_counting.py' ${method.concat_nuc} 24 python '$__tool_directory__/scripts/S02a_codon_counting.py' ${method.concat_nuc}
22 #end if 25 #end if
23 26
24 #if str($method.method_run) == "separated" : 27 #if str($method.method_run) == "separated" :
25 #set $infiles = "" 28 #set $infiles = ""
26 #for $input in $method.sep_file 29 #for $input in $method.sep_file
27 ln -s '$input' '$input.element_identifier'; 30 ln -s '$input' '$input.element_identifier';
28 #set $infiles = $infiles + $input.element_identifier + "," 31 #set $infiles = $infiles + $input.element_identifier + ","
29 #end for 32 #end for
30 #set $infiles = $infiles[:-1] 33 #set $infiles = $infiles[:-1]
31 34
32 #if str($method.format_run)== "nucleic" : 35 #if str($method.format_run)== "nucleic" :
33 python '$__tool_directory__/scripts/S02b_study_seq_composition_nuc.py' '$infiles' ${method.concat_phy} 36 python '$__tool_directory__/scripts/S02b_study_seq_composition_nuc.py' '$infiles' ${method.concat_phy}
34 #end if 37 #end if
35 38
36 #if str($method.format_run)== "proteic" : 39 #if str($method.format_run)== "proteic" :
37 cp '$__tool_directory__/scripts/amino_acid_properties.csv' . 40 cp '$__tool_directory__/scripts/amino_acid_properties.csv' .
38 && 41 &&
39 python '$__tool_directory__/scripts/S01b_study_seq_composition_aa.py' '$infiles' ${method.concat_phy} 42 python '$__tool_directory__/scripts/S01b_study_seq_composition_aa.py' '$infiles' ${method.concat_phy}
40 #end if 43 #end if
41 #end if 44 #end if
42 ]]> 45 ]]>
43 </command> 46 </command>
44 47
45 <inputs> 48 <inputs>
46 <conditional name="method"> 49 <conditional name="method">
47 <param name="method_run" type="select" label="Which method do you want to use for this tool? "> 50 <param name="method_run" type="select" label="Which method do you want to use for this tool? ">
48 <option value="concat">Concatenated genes in DNA (concatenation from RAxML run)</option> 51 <option value="concat">Concatenated genes in DNA (concatenation from RAxML run)</option>
67 </conditional> 70 </conditional>
68 </inputs> 71 </inputs>
69 72
70 <outputs> 73 <outputs>
71 <!-- output concat --> 74 <!-- output concat -->
72 <data format="txt" name="output1" label="concatenated_results.txt" from_work_dir="codoncounting_results.txt" > 75 <!--
76 <data format="txt" name="output1" label="counts.txt" from_work_dir="counts.txt" >
77 <filter>(method['method_run']=='concat')</filter>
78 </data>
79 <data format="txt" name="output2" label="biases.txt" from_work_dir="biases.txt" >
80 <filter>(method['method_run']=='concat')</filter>
81 </data>
82 -->
83 <data format="csv" name="codons_counts" label="codons_counts.csv" from_work_dir="codons_counts.csv" >
84 <filter>(method['method_run']=='concat')</filter>
85 </data>
86 <data format="csv" name="aa_counts" label="aa_counts.csv" from_work_dir="aa_counts.csv" >
87 <filter>(method['method_run']=='concat')</filter>
88 </data>
89 <data format="csv" name="aatypes_counts" label="aatypes_counts.csv" from_work_dir="aatypes_counts.csv" >
90 <filter>(method['method_run']=='concat')</filter>
91 </data>
92 <data format="csv" name="gc_counts" label="codons_counts.csv" from_work_dir="gc_counts.csv" >
93 <filter>(method['method_run']=='concat')</filter>
94 </data>
95 <data format="csv" name="aa_transitions" label="aa_transitions.csv" from_work_dir="aa_transitions.csv" >
96 <filter>(method['method_run']=='concat')</filter>
97 </data>
98 <data format="csv" name="aatypes_transitions" label="aatypes_transitions.csv" from_work_dir="aatypes_transitions.csv" >
73 <filter>(method['method_run']=='concat')</filter> 99 <filter>(method['method_run']=='concat')</filter>
74 </data> 100 </data>
75 101
76 <!-- outputs separated - nucleic --> 102 <!-- outputs separated - nucleic -->
77 <data format="csv" name="nuc_comp" label="nuc_compositions.csv" from_work_dir="OUT/nuc_compositions.csv" > 103 <data format="csv" name="nuc_comp" label="nuc_compositions.csv" from_work_dir="OUT/nuc_compositions.csv" >
135 <conditional name="method" > 161 <conditional name="method" >
136 <param name="method_run" value="concat" /> 162 <param name="method_run" value="concat" />
137 <param name="concat_nuc" ftype="fasta" value="test_07_output_phylogeny_concatenation.fasta" /> 163 <param name="concat_nuc" ftype="fasta" value="test_07_output_phylogeny_concatenation.fasta" />
138 <param name="num_sampled" value="100" /> 164 <param name="num_sampled" value="100" />
139 <param name="num_iter" value="100" /> 165 <param name="num_iter" value="100" />
140 <param name="list_species" ftype="text" value="Ac,Pu,Am,Ap,Pf,Pg,Th,Ph,Te" /> 166 <param name="list_species" ftype="text" value="Ac,Am,Ap,Pu" />
141 </conditional> 167 </conditional>
142 <output name="output1"> 168 <output name="codons_counts" value="OUT_concat/codons_counts.csv" lines_diff="8"/>
143 <assert_contents> 169 <output name="aa_counts" value="OUT_concat/aa_counts.csv" lines_diff="8"/>
144 <has_text text="counting of Ac"/> 170 <output name="aatypes_counts" value="OUT_concat/aatypes_counts.csv" lines_diff="8"/>
145 <has_text text="counting of Pu"/> 171 <output name="gc_counts" value="OUT_concat/gc_counts.csv"/>
146 <has_text text="counting of Am"/> 172 <output name="aa_transitions" value="OUT_concat/aa_transitions.csv" lines_diff="14"/>
147 <has_text text="counting of Ap"/> 173 <output name="aatypes_transitions" value="OUT_concat/aatypes_transitions.csv" lines_diff="14"/>
148 <has_text text="counting of Pf"/>
149 <has_text text="counting of Pg"/>
150 <has_text text="counting of Th"/>
151 <has_text text="counting of Ph"/>
152 </assert_contents>
153 </output>
154 </test> 174 </test>
155 175
156 <test> 176 <test>
157 <conditional name="method" > 177 <conditional name="method" >
158 <param name="method_run" value="separated" /> 178 <param name="method_run" value="separated" />
159 <param name="format_run" value="nucleic" /> 179 <param name="format_run" value="nucleic" />
160 <param name="sep_file" ftype="fasta" value="sep_nuc/locus1_sp6_sp6.fasta,sep_nuc/locus1_sp8_sp8.fasta,sep_nuc/locus2_sp6_sp6.fasta" /> 180 <param name="sep_file" ftype="fasta" value="sep_nuc/locus1_sp6_sp6.fasta,sep_nuc/locus1_sp8_sp8.fasta,sep_nuc/locus2_sp6_sp6.fasta" />
161 <param name="concat_phy" ftype="fasta" value="phylogeny_concat.fasta" /> 181 <param name="concat_phy" ftype="fasta" value="phylogeny_concat.fasta" />
162 </conditional> 182 </conditional>
163 <output name="nuc_comp" > 183 <output name="nuc_comp" value="OUT_nuc/nuc_compositions.csv" lines_diff="2"/>
164 <assert_contents> 184 <output name="percent_gc" value="OUT_nuc/percent_GC.csv" lines_diff="2"/>
165 <has_line line="locus1_sp8_sp8.fasta,0.29870,0.25541,0.19481,0.25108," /> 185 <output name="percent_pur" value="OUT_nuc/percent_purine.csv" lines_diff="2"/>
166 </assert_contents> 186 <output name="purine_load" value="OUT_nuc/Purine_Load_Indice.csv" lines_diff="2"/>
167 </output>
168 <output name="percent_gc">
169 <assert_contents>
170 <has_line line="locus2_sp6_sp6.fasta,42.70833," />
171 </assert_contents>
172 </output>
173 <output name="percent_pur" >
174 <assert_contents>
175 <has_line line="locus2_sp6_sp6.fasta,56.77083," />
176 </assert_contents>
177 </output>
178 <output name="purine_load" >
179 <assert_contents>
180 <has_line line="locus2_sp6_sp6.fasta,192,14,12,72.91667,62.50000," />
181 </assert_contents>
182 </output>
183 </test> 187 </test>
184 188
185 <test> 189 <test>
186 <conditional name="method" > 190 <conditional name="method" >
187 <param name="method_run" value="separated" /> 191 <param name="method_run" value="separated" />
188 <param name="format_run" value="proteic" /> 192 <param name="format_run" value="proteic" />
189 <param name="sep_file" ftype="fasta" value="sep_aa/locus1_sp6_sp6.fasta,sep_aa/locus1_sp8_sp8.fasta,sep_aa/locus2_sp6_sp6.fasta" /> 193 <param name="sep_file" ftype="fasta" value="sep_aa/locus1_sp6_sp6.fasta,sep_aa/locus1_sp8_sp8.fasta,sep_aa/locus2_sp6_sp6.fasta" />
190 <param name="concat_phy" ftype="fasta" value="phylogeny_concat.fasta" /> 194 <param name="concat_phy" ftype="fasta" value="phylogeny_concat.fasta" />
191 </conditional> 195 </conditional>
192 <output name="prot_comp" > 196 <output name="ivywrel" value="OUT_aa/IVYWREL.csv" lines_diff="2"/>
193 <assert_contents> 197 <output name="rhkde" value="OUT_aa/RHKDE.csv" lines_diff="2"/>
194 <has_line line="locus2_sp6_sp6.fasta,0.12500,0.00000,0.09375,0.04688,0.03125,0.09375,0.03125,0.07812,0.00000,0.04688,0.01562,0.03125,0.03125,0.01562,0.04688,0.00000,0.07812,0.07812,0.06250,0.09375,0.12500,0.00000,0.09375,0.04688,0.03125,0.09375,0.01562,0.10938,0.00000,0.04688,0.01562,0.04688,0.01562,0.01562,0.04688,0.00000,0.07812,0.07812,0.06250,0.07812,0.12500,0.00000,0.09375,0.04688,0.04688,0.09375,0.01562,0.09375,0.00000,0.04688,0.01562,0.03125,0.01562,0.01562,0.04688,0.00000,0.07812,0.07812,0.06250,0.09375,0.14062,0.00000,0.09375,0.06250,0.04688,0.09375,0.01562,0.09375,0.00000,0.03125,0.01562,0.04688,0.01562,0.01562,0.03125,0.00000,0.07812,0.07812,0.06250,0.07812,0.12500,0.00000,0.12500,0.04688,0.03125,0.09375,0.01562,0.10938,0.00000,0.04688,0.01562,0.04688,0.01562,0.01562,0.04688,0.00000,0.07812,0.07812,0.06250,0.04688,0.14062,0.00000,0.09375,0.06250,0.04688,0.09375,0.01562,0.09375,0.00000,0.03125,0.01562,0.04688,0.01562,0.01562,0.03125,0.00000,0.07812,0.07812,0.06250,0.07812," /> 198 <output name="payre_mvgds" value="OUT_aa/PAYRE-MVGDS.csv" lines_diff="2"/>
195 </assert_contents> 199 <output name="avlimfyw" value="OUT_aa/AVLIMFYW.csv" lines_diff="2"/>
196 </output>
197 <output name="ivywrel">
198 <assert_contents>
199 <has_line line="locus2_sp6_sp6.fasta,21.00000,0.32812,23.00000,0.35938,23.00000,0.35938,22.00000,0.34375,23.00000,0.35938,22.00000,0.34375," />
200 </assert_contents>
201 </output>
202 <output name="res_vol" >
203 <assert_contents>
204 <has_line line="locus2_sp6_sp6.fasta,6575.00000,6593.00000,6587.00000,6645.00000,6631.00000,6645.00000," />
205 </assert_contents>
206 </output>
207 <output name="hydrat" >
208 <assert_contents>
209 <has_line line="locus2_sp6_sp6.fasta,171.50000,171.50000,170.50000,171.00000,171.50000,171.00000," />
210 </assert_contents>
211 </output>
212 </test> 200 </test>
213 </tests> 201 </tests>
214 202
215 <help> 203 <help>
216 204
205 @HELP_AUTHORS@
206
207 <![CDATA[
208
209 **Last Version** : Victor Mataigne and Gildas Le Corguillé
210 --------
211
212 **Description**
213
214 This script counts the number of codons, amino acids, and types of amino acids in sequences, as well as the mutation bias from one item to another between 2 sequences. Counting is then compared to empirical p-values, obtained from bootstrapped sequences obtained from a subset of sequences.
215
216 In the output files, the pvalues indicate the position of the observed data in a distribution of empirical countings obtained from a resample of the data. Values above 0.95 indicate a significantly higher counting, values under 0.05 a significantly lower counting.
217
218 The script resamples random pairs of aligned codons to determine what countings can be expected under the hypothesis of a homogeneous dataset.
219 Countings are performed on each generated random alignment; thousands of alignments make it possible to draw a Gaussian distribution of the countings.
220 Then the script simply checks whether the observed data are within the 5% lowest or 5% highest values of the distribution.
221
222 --------
223
217 .. class:: infomark 224 .. class:: infomark
218 225
219 **Authors** Eric Fontanillas and Pierre-Guillaume Brun creates the scripts of this pipeline. 226 **Input files**
220 227
221 .. class:: infomark 228 If you choose the concatenated method, the input file is the concatenated genes fasta file (in nucleic format) from a previous run of the tool ConcatPhyl.
222 229
223 **Galaxy integration** Julie Baffard and ABIMS TEAM 230 If you choose the separated method, there are two input files :
224 231 - A dataset collection containing output files from the CDS_Search tool, the one without indels. These files must be in nucleic or proteic format according to the format chosen along with the method.
225 | Contact support.abims@sb-roscoff.fr for any questions or concerns about the Galaxy implementation of this tool. 232 - The concatenated genes fasta file from ConcatPhyl, only used here to retrieve species name.
226 233
227 --------------------------------------------------- 234 --------
228 235
229 236 **Parameters**
230 ======== 237
231 Mutcount 238 There are parameters only for the "Concatenated" method :
232 ======== 239
233 240 - The number of iterations : the number of alignments that will be generated (effect on the resolution of the gaussian distribution). Shouldn't be lower than 1000 to have a relatively smooth gaussian distribution.
234 ----------- 241
235 Description 242 - The number of sampled codons : the number of pairs of codons in each generated alignment (effect on the robustness of the countings performed on this alignment). Shouldn't be lower than 1000 to detect codons with relatively low occurrence (<1%).
236 ----------- 243
237 244 - The list of species, separated by commas and without spaces (e.g : sp1,sp2,sp3,sp4). You can run the tool on a subgroup of species, not only on the total number of species present in the previous tools. You can also write 'all' to include every species.
238 245
239 | This script counts the number of codons, amino acids, and types of amino acids in sequences, as well as the mutation bias from one item to another between 2 sequences. Counting is then compared to empirical p-values, obtained from bootstrapped sequences obtained from a subset of sequences 246 --------
240 247
241 | In the output files, the pvalues indicate the position of the observed data in a distribution of empirical countings obtained from a resample of the data. Values above 0.95 indicate a significantly higher counting, values under 0.05 a significantly lower counting 248 **Outputs**
242 249
243 | The script automatically reads the sequences to compare from a file that must be called pairs.txt (pre-computed by the tool itself) and located with the .fasta file in the pairs.txt file, sequences (let's assume X, Y, Z, U, V) pairs must be written as 'X Y\nU V\nZ V' in this case, codoncounting will count the occurence of codons, amino acids, and types of amino acids in X, U, Z, and count the mutation bias from Y to X, V to U and V to Z X, Y, Z, U, V must be character strings contained in the sequences names in the .fasta file (and be specific to each of them). In pairs.txt, you must write how should be built the bootstrapped resampling of sequences. This must be formated as:'X Y\nbackground: length iterration plusminus listofspecies\nU V\nZ V', explanation below backgrounds must be excplicitely written in the pairs.txt file (the script still integers default parameters). This implies that the first line of pairs.txt should be a background line by default, once the background has been determined, it will be applied to each subsequent analysis until another background is written e.g. 'background: length1 iterration1 plusminus1 listofspecies1\nU V\nZ V\nbackground: length2 iterration2 plusminus2 listofspecies2\nX Y' the first background is applied to U V and Z V and the 2nd background to X Y 250 Many outputs in .csv format , varying according to the chosen method and format (separated, nucleic ...)
244 251 - When method = concat : 6 .csv outputs : countings of codons, amino acids, amino acid types and GC, and transitions from amino acid to amino acid and from amino acid type to amino acid type.
245 252 - When method = separated and format = nucleic : 4 .csv outputs : nucleotide composition, GC percent, purine percent, purine load indice.
246 | The script resamples random pairs of aligned codon to determine what countings can be expected under the hypothesis of an homogenous dataset. 253 - When method = separated and format = proteic : 13 .csv outputs : protein composition, several files of countings various AA combinations, results on residues, hydratation, partial specific volume.
247 | Countings are performed on each generated random alignement, thousands of alignments allow to draw a gaussian distribution of the countings.
248 | Then the script simply checks whether the observed data are within the 5% lowest or 5% highest values of the distribution
249
250 | - length is the number of pairs of codons in each generated alignments (effect on the robustness on the countings performed on this alignement)
251 | - iterration is the number of alignments that will be generated (effect on the resolution of the gaussian distribution)
252 | - plusminus can be either '+' or '-', '+' indicates that the following species only must be resampled, '-' that the following species must be excluded from the resampling
253 | - listofspecies is the list of species (names contained in the sequences names from the fasta file) that must be included or excluded from the sampling. You can also write 'all' to include every species (in this case, plusminus parameter is ignored)
254
255 | Iteration shouldn't be lower that 1000 to have a relatively smooth gaussian distribution, length shouldn't be lower as 1000 to detect codons with relatively low occurence (&lt;1%). For the list of species, you can try to form subgroups depending on the studied parameter (e.g. comparing a terrestrial species with a background composed of marine species)
256
257
258
259 .. class:: infomark
260
261
262 **Important part of this tool (the inputs format)**
263
264 --------
265
266 -----------
267 Input files
268 -----------
269
270 | If you choose the concatenated method, the input file is the concatenated genes fasta file (in nucleic format) from a previous run of the toolConcatPhyl.
271
272 | If you choose the separated method, there are two input files :
273 | - A dataset collection containing output files from the CDS_Search tool, the one without indels. These files must be in nucleic or proteic format according to the format chosen along with the method.
274 | - The concatenated genes fasta file from ConcatPhyl, only used here to retrieve species name.
275
276 ----------
277 Parameters
278 ----------
279
280 | There are parameters only for the "Concatenated" method :
281 | - The number of iterations
282 | - The number of sampled codons
283 | - The list of species, separated by commas and without space (e.g : sp1,sp2,sp3,sp4). You can run the toll on subgroup of species, not only on the total number of species present in the previous tools.
284 254
285 --------- 255 ---------
256
257 **The AdaptSearch Pipeline**
258
259 .. image:: ../../adaptsearch_picture_helps.png :height: 593 :width: 852
260
261 ---------
262
286 Changelog 263 Changelog
287 --------- 264 ---------
288 265
266 **Version 2.1 - 10/01/2018**
267
268 - Split the output of the concatenated method into several csv files.
269 - Bug corrected in output files of separated method.
270
289 **Version 2.0 - 12/07/2017** 271 **Version 2.0 - 12/07/2017**
290 272
291 - NEW: Replaced the zip between tools by Dataset Collection 273 - NEW: Replaced the zip between tools by Dataset Collection
292 - More functional tests 274 - More functional tests
293 275
294 **Version 1.0 - 14/04/2017** 276 **Version 1.0 - 14/04/2017**
295 277
296 - Added the tools to the suite 278 - Added the tools to the suite
297 - Added a functional test with planemo 279 - Added a functional test with planemo
298 - Planemo test using conda dependencies for python 280 - Planemo test using conda dependencies for python
299 - Scripts renamed + symlinks to the directory 'scripts' 281 - Scripts renamed + symlinks to the directory 'scripts'
282
283 ]]>
300 284
301 </help> 285 </help>
302 286
303 <expand macro="citations" /> 287 <expand macro="citations" />
304 288