comparison preprocess.xml @ 7:76dcbe930b1d draft

"planemo upload for repository https://github.com/geraldinepascal/FROGS-wrappers/ commit 0a8dfe386b79711c479cf8a2bc8e9677e521b9e5-dirty"
author oinizan
date Wed, 18 Aug 2021 15:43:00 +0000
parents 192cac570229
children 7bf54edaba24
comparison
equal deleted inserted replaced
6:192cac570229 7:76dcbe930b1d
13 # GNU General Public License for more details. 13 # GNU General Public License for more details.
14 # 14 #
15 # You should have received a copy of the GNU General Public License 15 # You should have received a copy of the GNU General Public License
16 # along with this program. If not, see <http://www.gnu.org/licenses/>. 16 # along with this program. If not, see <http://www.gnu.org/licenses/>.
17 --> 17 -->
18 <tool id="FROGS_preprocess" name="FROGS Pre-process" version="3.2.3.1"> 18 <tool id="FROGS_preprocess" name="FROGS Pre-process" version="@TOOL_VERSION@+galaxy2">
19 <description>merging, denoising and dereplication.</description> 19 <description>merging, denoising and dereplication.</description>
20 <requirements> 20
21 <requirement type="package" version="3.2.3">frogs</requirement> 21 <macros>
22 <requirement type="package" version="2.17.0">vsearch</requirement> 22 <import>macros.xml</import>
23 <requirement type="package" version="1.2.11">flash</requirement> 23 </macros>
24 <requirement type="package" version="2.10">cutadapt</requirement> 24
25 </requirements> 25 <expand macro="requirements" >
26 <requirement type="package" version="2.17.0">vsearch</requirement>
27 <requirement type="package" version="1.2.11">flash</requirement>
28 <requirement type="package" version="2.10">cutadapt</requirement>
29 </expand>
30
26 <stdio> 31 <stdio>
27 <exit_code range="1:" /> 32 <exit_code range="1:" />
28 <exit_code range=":-1" /> 33 <exit_code range=":-1" />
29 </stdio> 34 </stdio>
30 <command> 35 <command>
31 preprocess.py $sequencer_type.sequencer_selected 36 preprocess.py '$sequencer_type.sequencer_selected'
32 --output-dereplicated $dereplicated_file --output-count $count_file --summary $summary_file 37 --output-dereplicated '$dereplicated_file' --output-count '$count_file' --summary '$summary_file'
33 --nb-cpus \${GALAXY_SLOTS:-1} 38 --nb-cpus \${GALAXY_SLOTS:-1}
34 --min-amplicon-size $sequencer_type.min_amplicon_size --max-amplicon-size $sequencer_type.max_amplicon_size 39 --min-amplicon-size $sequencer_type.min_amplicon_size --max-amplicon-size $sequencer_type.max_amplicon_size
35 40
36 #if $sequencer_type.sequencer_selected == "illumina" 41 #if $sequencer_type.sequencer_selected == "illumina"
37 #if $sequencer_type.sequencing_protocol.sequencing_protocol_selected == "standard" 42 #if $sequencer_type.sequencing_protocol.sequencing_protocol_selected == "standard"
38 --five-prim-primer $sequencer_type.sequencing_protocol.five_prim_primer --three-prim-primer $sequencer_type.sequencing_protocol.three_prim_primer 43 --five-prim-primer '$sequencer_type.sequencing_protocol.five_prim_primer' --three-prim-primer '$sequencer_type.sequencing_protocol.three_prim_primer'
39 #else 44 #else
40 --without-primers 45 --without-primers
41 #end if 46 #end if
42 #else 47 #else
43 --five-prim-primer $sequencer_type.five_prim_primer --three-prim-primer $sequencer_type.three_prim_primer 48 --five-prim-primer '$sequencer_type.five_prim_primer' --three-prim-primer '$sequencer_type.three_prim_primer'
44 #end if 49 #end if
45 50
46 #if $sequencer_type.input_type.input_type_selected == "archive" 51 #if $sequencer_type.input_type.input_type_selected == "archive"
47 --input-archive $sequencer_type.input_type.archive_file 52 --input-archive '$sequencer_type.input_type.archive_file'
48 #if $sequencer_type.sequencer_selected == "illumina" and $sequencer_type.input_type.archive_type.archive_type_selected == "already_merged" 53 #if $sequencer_type.sequencer_selected == "illumina" and $sequencer_type.input_type.archive_type.archive_type_selected == "already_merged"
49 --already-contiged 54 --already-contiged
50 #elif $sequencer_type.sequencer_selected == "illumina" 55 #elif $sequencer_type.sequencer_selected == "illumina"
51 --R1-size $sequencer_type.input_type.archive_type.R1_size --R2-size $sequencer_type.input_type.archive_type.R2_size 56 --R1-size $sequencer_type.input_type.archive_type.R1_size --R2-size $sequencer_type.input_type.archive_type.R2_size
52 --mismatch-rate $sequencer_type.input_type.archive_type.mm_rate 57 --mismatch-rate $sequencer_type.input_type.archive_type.mm_rate
53 --merge-software $sequencer_type.input_type.archive_type.merge_software_type.merge_software_selected 58 --merge-software '$sequencer_type.input_type.archive_type.merge_software_type.merge_software_selected'
54 #if $sequencer_type.input_type.archive_type.merge_software_type.merge_software_selected == "flash" 59 #if $sequencer_type.input_type.archive_type.merge_software_type.merge_software_selected == "flash"
55 --expected-amplicon-size $sequencer_type.input_type.archive_type.merge_software_type.expected_amplicon_size 60 --expected-amplicon-size $sequencer_type.input_type.archive_type.merge_software_type.expected_amplicon_size
56 #end if 61 #end if
57 #if $sequencer_type.input_type.archive_type.keep_unmerged 62 #if $sequencer_type.input_type.archive_type.keep_unmerged
58 --keep-unmerged 63 --keep-unmerged
59 #end if 64 #end if
60 #end if 65 #end if
61 #else 66 #else
62 #set $sep = ' ' 67 #set $sep = ' '
63 #if $sequencer_type.sequencer_selected == "illumina" 68 #if $sequencer_type.sequencer_selected == "illumina"
64 --samples-names 69 --samples-names
65 #for $current in $sequencer_type.input_type.files_by_samples_type.samples 70 #for $current in $sequencer_type.input_type.files_by_samples_type.samples
66 $sep'${current.name.strip()}' 71 $sep'${current.name.strip()}'
67 #end for 72 #end for
68 --input-R1 73 --input-R1
69 #for $current in $sequencer_type.input_type.files_by_samples_type.samples 74 #for $current in $sequencer_type.input_type.files_by_samples_type.samples
70 $sep${current.R1_file} 75 $sep'${current.R1_file}'
71 #end for 76 #end for
72 #if $sequencer_type.input_type.files_by_samples_type.files_by_samples_type_selected == "already_merged" 77 #if $sequencer_type.input_type.files_by_samples_type.files_by_samples_type_selected == "already_merged"
73 --already-contiged 78 --already-contiged
74 #else 79 #else
75 --input-R2 80 --input-R2
76 #for $current in $sequencer_type.input_type.files_by_samples_type.samples 81 #for $current in $sequencer_type.input_type.files_by_samples_type.samples
77 $sep${current.R2_file} 82 $sep'${current.R2_file}'
78 #end for 83 #end for
79 --R1-size $sequencer_type.input_type.files_by_samples_type.R1_size --R2-size $sequencer_type.input_type.files_by_samples_type.R2_size 84 --R1-size $sequencer_type.input_type.files_by_samples_type.R1_size --R2-size $sequencer_type.input_type.files_by_samples_type.R2_size
80 --mismatch-rate $sequencer_type.input_type.files_by_samples_type.mm_rate 85 --mismatch-rate $sequencer_type.input_type.files_by_samples_type.mm_rate
81 --merge-software $sequencer_type.input_type.files_by_samples_type.merge_software_type.merge_software_selected 86 --merge-software $sequencer_type.input_type.files_by_samples_type.merge_software_type.merge_software_selected
82 #if $sequencer_type.input_type.files_by_samples_type.merge_software_type.merge_software_selected == "flash" 87 #if $sequencer_type.input_type.files_by_samples_type.merge_software_type.merge_software_selected == "flash"
83 --expected-amplicon-size $sequencer_type.input_type.files_by_samples_type.merge_software_type.expected_amplicon_size 88 --expected-amplicon-size $sequencer_type.input_type.files_by_samples_type.merge_software_type.expected_amplicon_size
84 #end if 89 #end if
85 #if $sequencer_type.input_type.files_by_samples_type.keep_unmerged 90 #if $sequencer_type.input_type.files_by_samples_type.keep_unmerged
86 --keep-unmerged 91 --keep-unmerged
87 #end if 92 #end if
88 #end if 93 #end if
89 #else 94 #else
90 --input-R1 95 --input-R1
91 #for $current in $sequencer_type.input_type.samples 96 #for $current in $sequencer_type.input_type.samples
92 $sep${current.R1_file} 97 $sep'${current.R1_file}'
93 #end for 98 #end for
94 --samples-names 99 --samples-names
95 #for $current in $sequencer_type.input_type.samples 100 #for $current in $sequencer_type.input_type.samples
96 $sep'${current.name.strip()}' 101 $sep'${current.name.strip()}'
97 #end for 102 #end for
98 #end if 103 #end if
99 #end if 104 #end if
100 </command> 105 </command>
121 <!-- $sequencer_type.input_type.archive_type.archive_type_selected == "already_merged" --> 126 <!-- $sequencer_type.input_type.archive_type.archive_type_selected == "already_merged" -->
122 <when value="paired"> 127 <when value="paired">
123 <!-- Reads size --> 128 <!-- Reads size -->
124 <param name="R1_size" type="integer" label="Reads 1 size" help="The maximum read1 size." value="" optional="false" /> 129 <param name="R1_size" type="integer" label="Reads 1 size" help="The maximum read1 size." value="" optional="false" />
125 <param name="R2_size" type="integer" label="Reads 2 size" help="The maximum read2 size." value="" optional="false" /> 130 <param name="R2_size" type="integer" label="Reads 2 size" help="The maximum read2 size." value="" optional="false" />
126 <param name="mm_rate" type="float" label="Mismatch rate." help="The maximum rate of mismatch in the overlap region" value="0.1" optional="false" /> 131 <param name="mm_rate" type="float" label="Mismatch rate." help="The maximum rate of mismatch in the overlap region" value="0.1" optional="false" />
127 <conditional name="merge_software_type"> 132 <conditional name="merge_software_type">
128 <param name="merge_software_selected" type="select" label="Merge software" help="Select the software to merge paired-end reads."> 133 <param name="merge_software_selected" type="select" label="Merge software" help="Select the software to merge paired-end reads.">
129 <option value="vsearch" selected="true">Vsearch</option> 134 <option value="vsearch" selected="true">Vsearch</option>
130 <option value="flash">Flash</option> 135 <option value="flash">Flash</option>
131 </param> 136 </param>
199 </param> 204 </param>
200 </when> 205 </when>
201 <when value="without_primers"></when> 206 <when value="without_primers"></when>
202 </conditional> 207 </conditional>
203 </when> 208 </when>
204 209
205 <when value="454"> 210 <when value="454">
206 <!-- Samples --> 211 <!-- Samples -->
207 <conditional name="input_type"> 212 <conditional name="input_type">
208 <param name="input_type_selected" type="select" label="Input type" help="Samples files can be provided in single archive or with one file by sample."> 213 <param name="input_type_selected" type="select" label="Input type" help="Samples files can be provided in single archive or with one file by sample.">
209 <option value="files_by_samples" selected="true">One file by sample</option> 214 <option value="files_by_samples" selected="true">One file by sample</option>
294 </conditional> 299 </conditional>
295 </conditional> 300 </conditional>
296 <output name="dereplicated_file" file="references/01-prepro-vsearch.fasta" compare="diff" lines_diff="0" /> 301 <output name="dereplicated_file" file="references/01-prepro-vsearch.fasta" compare="diff" lines_diff="0" />
297 <output name="count_file" file="references/01-prepro-vsearch.tsv" compare="diff" lines_diff="0" /> 302 <output name="count_file" file="references/01-prepro-vsearch.tsv" compare="diff" lines_diff="0" />
298 <output name="summary_file" file="references/01-prepro-vsearch.html" compare="sim_size" delta="0"/> 303 <output name="summary_file" file="references/01-prepro-vsearch.html" compare="sim_size" delta="0"/>
299 </test> 304 </test>
300 </tests> 305 </tests>
301 <help> 306 <help>
302 307
303 .. image:: static/images/FROGS_logo.png 308 @HELP_LOGO@
304 :height: 144
305 :width: 110
306
307 309
308 .. class:: infomark page-header h2 310 .. class:: infomark page-header h2
309 311
310 What it does 312 What it does
311 313
359 This file contains the count of all unique sequences in each sample (format `TSV &lt;https://en.wikipedia.org/wiki/Tab-separated_values&gt;`_). 361 This file contains the count of all unique sequences in each sample (format `TSV &lt;https://en.wikipedia.org/wiki/Tab-separated_values&gt;`_).
360 362
361 **Summary file** (report.html): 363 **Summary file** (report.html):
362 364
363 This file reports the number of remaining sequences after each filter (format `HTML &lt;https://en.wikipedia.org/wiki/HTML&gt;`_). Depending on the tool configuration, there will be more or fewer filtering steps, and so more or fewer bars in the barplot. 365 This file reports the number of remaining sequences after each filter (format `HTML &lt;https://en.wikipedia.org/wiki/HTML&gt;`_). Depending on the tool configuration, there will be more or fewer filtering steps, and so more or fewer bars in the barplot.
364 366
365 .. image:: static/images/FROGS_preprocess_summary_v3.png 367 .. image:: static/images/FROGS_preprocess_summary_v3.png
366 :height: 850 368 :height: 850
367 :width: 831 369 :width: 831
368 370
369 It also presents the length distribution of the full amplicon sequences after merging step and after filtering steps. 371 It also presents the length distribution of the full amplicon sequences after merging step and after filtering steps.
370 372
371 .. image:: static/images/FROGS_preprocess_lengthsSamples_v3.png 373 .. image:: static/images/FROGS_preprocess_lengthsSamples_v3.png
372 :height: 379 374 :height: 379
373 :width: 364 375 :width: 364
374 376
375 .. class:: infomark page-header h2 377 .. class:: infomark page-header h2
376 378
377 How it works 379 How it works
378 380
379 .. csv-table:: 381 .. csv-table::
380 :header: "Steps", "Illumina", "454" 382 :header: "Steps", "Illumina", "454"
381 :widths: 5, 150, 150 383 :widths: 5, 150, 150
382 :class: table table-striped 384 :class: table table-striped
383 385
384 "1", "For un-merged data: Merges R1 and R2 with a maximum of M% mismatch in the overlapped region (`VSEARCH &lt;https://github.com/torognes/vsearch/&gt;`_ or `FLASH &lt;https://ccb.jhu.edu/software/FLASH/&gt;`_ or optionally `PEAR &lt;https://sco.h-its.org/exelixis/web/software/pear/&gt;`_) with a minimum of 10 bp in the overlap region. Resulting un-merged reads may optionally be artificially combined by adding 100 N between the reads", "/" 386 "1", "For un-merged data: Merges R1 and R2 with a maximum of M% mismatch in the overlapped region (`VSEARCH &lt;https://github.com/torognes/vsearch/&gt;`_ or `FLASH &lt;http://ccb.jhu.edu/software/FLASH/&gt;`_ or optionally `PEAR &lt;https://sco.h-its.org/exelixis/web/software/pear/&gt;`_) with a minimum of 10 bp in the overlap region. Resulting un-merged reads may optionally be artificially combined by adding 100 N between the reads", "/"
385 "2", "If sequencing protocol is the illumina standard protocol : Removes sequences where the two primers are not present and removes primers in the remaining sequence (`cutadapt &lt;http://cutadapt.readthedocs.org/en/latest/guide.html&gt;`_). The primer search accepts 10% of differences", "Removes sequences where the two primers are not present, removes primers sequence from amplicon sequence and reverse complement the sequences on strand - (`cutadapt &lt;http://cutadapt.readthedocs.org/en/latest/guide.html&gt;`_). The primer search accepts 10% of differences" 387 "2", "If sequencing protocol is the illumina standard protocol : Removes sequences where the two primers are not present and removes primers in the remaining sequence (`cutadapt &lt;http://cutadapt.readthedocs.org/en/latest/guide.html&gt;`_). The primer search accepts 10% of differences", "Removes sequences where the two primers are not present, removes primers sequence from amplicon sequence and reverse complement the sequences on strand - (`cutadapt &lt;http://cutadapt.readthedocs.org/en/latest/guide.html&gt;`_). The primer search accepts 10% of differences"
386 "3", "Filters sequences with ambiguous nucleotides and for merged sequences filters on their length which must range between 'Minimum amplicon size - primer length' and 'Maximum amplicon size - primer length'", "Removes sequences with at least one homopolymer with more than seven nucleotides and with a distance of less than or equal to 10 nucleotides between two poor quality positions, i.e. with a Phred quality score lower than 10" 388 "3", "Filters sequences with ambiguous nucleotides and for merged sequences filters on their length which must range between 'Minimum amplicon size - primer length' and 'Maximum amplicon size - primer length'", "Removes sequences with at least one homopolymer with more than seven nucleotides and with a distance of less than or equal to 10 nucleotides between two poor quality positions, i.e. with a Phred quality score lower than 10"
387 "4", "Dereplicates sequences", "Dereplicates sequences" 389 "4", "Dereplicates sequences", "Dereplicates sequences"
388 390
389 391
390 .. class:: infomark page-header h2 392 .. class:: infomark page-header h2
391 393
433 435
434 Primers parameters 436 Primers parameters
435 437
436 The (`Kozich et al. 2013 &lt;http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3753973/&gt;`_ ) protocol uses custom sequencing primers which are also the PCR primers. In this case the reads do not contain the PCR primers. 438 The (`Kozich et al. 2013 &lt;http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3753973/&gt;`_ ) protocol uses custom sequencing primers which are also the PCR primers. In this case the reads do not contain the PCR primers.
437 439
438 In case of Illumina standard protocol, the primers must be provided in 5' to 3' orientation. 440 In case of Illumina standard protocol, the primers must be provided in 5' to 3' orientation.
439 441
440 .. role:: alert-info 442 .. role:: alert-info
441 443
442 Example: 444 Example:
443 445
444 5' :alert-info:`ATGCCC` GTCGTCGTAAAATGC :alert-info:`ATTTCAG` 3' 446 5' :alert-info:`ATGCCC` GTCGTCGTAAAATGC :alert-info:`ATTTCAG` 3'
445 447
446 Value for parameter 5' primer: ATGCCC 448 Value for parameter 5' primer: ATGCCC
447 449
448 Value for parameter 3' primer: ATTTCAG 450 Value for parameter 3' primer: ATTTCAG
449 451
450 .. class:: h3 452 .. class:: h3
451 453
452 FLASH : Amplicons sizes parameters 454 FLASH : Amplicons sizes parameters
458 The following two images show two examples of perfect values for the size parameters. 460 The following two images show two examples of perfect values for the size parameters.
459 461
460 .. image:: static/images/FROGS_preprocess_ampliconSize_unimodal_v3.png 462 .. image:: static/images/FROGS_preprocess_ampliconSize_unimodal_v3.png
461 :height: 415 463 :height: 415
462 :width: 676 464 :width: 676
463 465
464 .. image:: static/images/FROGS_preprocess_ampliconSize_multimodal_v3.png 466 .. image:: static/images/FROGS_preprocess_ampliconSize_multimodal_v3.png
465 :height: 415 467 :height: 415
466 :width: 676 468 :width: 676
467 469
468 Don't worry: the "Expected amplicon size" does not need to be very accurate, and it is only necessary for merging sequences with FLASH. 470 Don't worry: the "Expected amplicon size" does not need to be very accurate, and it is only necessary for merging sequences with FLASH.
470 .. class:: h3 472 .. class:: h3
471 473
472 If the 'merged' filter drastically reduces the number of sequences: 474 If the 'merged' filter drastically reduces the number of sequences:
473 475
474 For un-merged Illumina data with a targeted amplicon size in the range of R1+R2-10, the reduction of the dataset by the merged filter is typically less than 20%. A loss of more than 20% in all samples can indicate a quality problem. 476 For un-merged Illumina data with a targeted amplicon size in the range of R1+R2-10, the reduction of the dataset by the merged filter is typically less than 20%. A loss of more than 20% in all samples can indicate a quality problem.
475 477
476 If the overlap between R1 and R2 is greater than 50 nucleotides and the quality of the end of the sequences is poor (see `FastQC &lt;http://www.bioinformatics.babraham.ac.uk/projects/fastqc/&gt;`_), you can try to cut the end of your sequences and relaunch the preprocess tool. Alternatively, you can raise the mismatch percentage in the overlapped region, but not too much! 478 If the overlap between R1 and R2 is greater than 50 nucleotides and the quality of the end of the sequences is poor (see `FastQC &lt;http://www.bioinformatics.babraham.ac.uk/projects/fastqc/&gt;`_), you can try to cut the end of your sequences and relaunch the preprocess tool. Alternatively, you can raise the mismatch percentage in the overlapped region, but not too much!
477 479
478 ---- 480
479 481 @HELP_CONTACT@
480 **Contact**
481
482 Contacts: frogs-support@inrae.fr
483
484 Repositories: https://github.com/geraldinepascal/FROGS, https://github.com/geraldinepascal/FROGS-wrappers
485
486 Website: http://frogs.toulouse.inrae.fr/
487
488 Please cite the **FROGS article**: `Escudie F., et al. Bioinformatics, 2018. FROGS: Find, Rapidly, OTUs with Galaxy Solution. &lt;https://doi.org/10.1093/bioinformatics/btx791&gt;`_
489 482
490 </help> 483 </help>
484
485 <citations>
486 <expand macro="citations" />
487 </citations>
488
491 </tool> 489 </tool>