comparison Amplicon_analysis-galaxy-update-to-Amplicon_analysis_pipeline-1.3/amplicon_analysis_pipeline.xml @ 41:7b9786a43a16 draft

Uploaded test version 1.3.5.0.
author pjbriggs
date Thu, 05 Dec 2019 11:44:03 +0000
parents
children
comparison
equal deleted inserted replaced
40:5ef333d1c303 41:7b9786a43a16
1 <tool id="amplicon_analysis_pipeline" name="Amplicon Analysis Pipeline" version="1.3.5.0">
2 <description>analyse 16S rRNA data from Illumina Miseq paired-end reads</description>
3 <requirements>
4 <requirement type="package" version="1.3.5">amplicon_analysis_pipeline</requirement>
5 </requirements>
6 <stdio>
7 <exit_code range="1:" /> <!-- open-ended range: matches any exit code of 1 or higher -->
8 </stdio>
9 <command><![CDATA[
10
11 ## Convenience variable for pipeline name (value of the "pipeline_name" select in the "pipeline" conditional: Vsearch or DADA2)
12 #set $pipeline_name = $pipeline.pipeline_name
13
14 ## Set the reference database name used to build the RESULTS/<pipeline>_<database> paths below: DADA2 always maps to "silva"; for Vsearch the selected flag maps -S to "silva", -H to "homd" and anything else (the empty GreenGenes value) to "gg"
15 #if str( $pipeline_name ) == "DADA2"
16 #set reference_database_name = "silva"
17 #else
18 #set reference_database = $pipeline.reference_database
19 #if $reference_database == "-S"
20 #set reference_database_name = "silva"
21 #else if $reference_database == "-H"
22 #set reference_database_name = "homd"
23 #else
24 #set reference_database_name = "gg"
25 #end if
26 #end if
27
28 ## Run the amplicon analysis pipeline wrapper script shipped in the tool directory; everything up to the first "&&" is its argument list
29 python $__tool_directory__/amplicon_analysis_pipeline.py
30 ## Set options: each optional flag is emitted only when its parameter is non-empty; -P names the pipeline; -r is resolved by the shell (the "\$" keeps Cheetah from expanding it) to AMPLICON_ANALYSIS_REF_DATA_PATH, or to "ReferenceData" when that variable is unset; the database flag and -c are skipped for DADA2 / when no categories file is given
31 #if str( $forward_pcr_primer ) != ""
32 -g "$forward_pcr_primer"
33 #end if
34 #if str( $reverse_pcr_primer ) != ""
35 -G "$reverse_pcr_primer"
36 #end if
37 #if str( $trimming_threshold ) != ""
38 -q $trimming_threshold
39 #end if
40 #if str( $sliding_window_length ) != ""
41 -l $sliding_window_length
42 #end if
43 #if str( $minimum_overlap ) != ""
44 -O $minimum_overlap
45 #end if
46 #if str( $minimum_length ) != ""
47 -L $minimum_length
48 #end if
49 -P $pipeline_name
50 -r \${AMPLICON_ANALYSIS_REF_DATA_PATH-ReferenceData}
51 #if str( $pipeline_name ) != "DADA2"
52 ${reference_database}
53 #end if
54 #if str($categories_file_in) != 'None'
55 -c "${categories_file_in}"
56 #end if
57 ## Input files: the metatable, then one name/R1/R2 triplet per FASTQ pair (collection elements expose .forward/.reverse, whereas the "fastq_pairs" repeat declares its datasets as fastq_r1/fastq_r2, so each input type needs its own loop)
58 "${metatable_file_in}"
59 #if str($input_type.pairs_or_collection) == "collection"
60 #for $fq_pair in $input_type.fastq_collection
61 "${fq_pair.name}" "${fq_pair.forward}" "${fq_pair.reverse}"
62 #end for
63 #else
64 #for $fq_pair in $input_type.fastq_pairs
65 "${fq_pair.name}" "${fq_pair.fastq_r1}" "${fq_pair.fastq_r2}"
66 #end for
67 #end if
68 &&
69
70 ## Collect outputs (notes inside this template must use the Cheetah "##" form: a single "#" followed by text is not a Cheetah comment, so it is emitted into the shell command where it starts a shell comment)
71 cp Metatable_log/Metatable_mod.txt "${metatable_mod}" &&
72 #if str( $pipeline_name ) == "Vsearch"
73 ## Vsearch-specific
74 cp ${pipeline_name}_OTU_tables/multiplexed_linearized_dereplicated_mc2_repset_nonchimeras_tax_OTU_table.biom "${tax_otu_table_biom_file}" &&
75 cp Multiplexed_files/${pipeline_name}_pipeline/multiplexed_linearized_dereplicated_mc2_repset_nonchimeras_OTUs.fasta "${dereplicated_nonchimera_otus_fasta}" &&
76 cp QUALITY_CONTROL/Reads_count.txt "$read_counts_out" &&
77 #else
78 ## DADA2-specific
79 cp ${pipeline_name}_OTU_tables/DADA2_tax_OTU_table.biom "${tax_otu_table_biom_file}" &&
80 cp ${pipeline_name}_OTU_tables/seqs.fa "${dereplicated_nonchimera_otus_fasta}" &&
81 #end if
82 cp ${pipeline_name}_OTU_tables/otus.tre "${otus_tre_file}" &&
83 cp RESULTS/${pipeline_name}_${reference_database_name}/OTUs_count.txt "${otus_count_file}" &&
84 cp RESULTS/${pipeline_name}_${reference_database_name}/table_summary.txt "${table_summary_file}" &&
85 cp fastqc_quality_boxplots.html "${fastqc_quality_boxplots_html}" &&
86
87 ## OTU table heatmap: copy the PDF from the pipeline/database-specific RESULTS directory to its output dataset (destination quoted exactly once)
88 cp RESULTS/${pipeline_name}_${reference_database_name}/Heatmap.pdf "${heatmap_otu_table_pdf}" &&
89
90 ## HTML outputs: for each one the files_path directory of the output dataset is created, supporting files are copied into it, and the main HTML page is copied to the dataset itself
91
92 ## Phylum genus barcharts (charts/ and raw_data/ directories plus bar_charts.html)
93 mkdir $phylum_genus_dist_barcharts_html.files_path &&
94 cp -r RESULTS/${pipeline_name}_${reference_database_name}/phylum_genus_charts/charts $phylum_genus_dist_barcharts_html.files_path &&
95 cp -r RESULTS/${pipeline_name}_${reference_database_name}/phylum_genus_charts/raw_data $phylum_genus_dist_barcharts_html.files_path &&
96 cp RESULTS/${pipeline_name}_${reference_database_name}/phylum_genus_charts/bar_charts.html "${phylum_genus_dist_barcharts_html}" &&
97
98 ## Beta diversity weighted 2d plots (whole weighted_2d_plot directory contents plus the PCoA HTML page)
99 mkdir $beta_div_even_weighted_2d_plots.files_path &&
100 cp -r RESULTS/${pipeline_name}_${reference_database_name}/beta_div_even/weighted_2d_plot/* $beta_div_even_weighted_2d_plots.files_path &&
101 cp RESULTS/${pipeline_name}_${reference_database_name}/beta_div_even/weighted_2d_plot/weighted_unifrac_pc_2D_PCoA_plots.html "${beta_div_even_weighted_2d_plots}" &&
102
103 ## Beta diversity unweighted 2d plots (whole unweighted_2d_plot directory contents plus the PCoA HTML page)
104 mkdir $beta_div_even_unweighted_2d_plots.files_path &&
105 cp -r RESULTS/${pipeline_name}_${reference_database_name}/beta_div_even/unweighted_2d_plot/* $beta_div_even_unweighted_2d_plots.files_path &&
106 cp RESULTS/${pipeline_name}_${reference_database_name}/beta_div_even/unweighted_2d_plot/unweighted_unifrac_pc_2D_PCoA_plots.html "${beta_div_even_unweighted_2d_plots}" &&
107
108 ## Alpha diversity rarefaction plots (rarefaction_plots.html plus the average_plots directory)
109 mkdir $alpha_div_rarefaction_plots.files_path &&
110 cp RESULTS/${pipeline_name}_${reference_database_name}/Alpha_diversity/rarefaction_curves/rarefaction_plots.html $alpha_div_rarefaction_plots &&
111 cp -r RESULTS/${pipeline_name}_${reference_database_name}/Alpha_diversity/rarefaction_curves/average_plots $alpha_div_rarefaction_plots.files_path &&
112
113 ## DADA2 error rate plots (only collected when the DADA2 pipeline was selected)
114 #if str($pipeline_name) == "DADA2"
115 mkdir $dada2_error_rate_plots.files_path &&
116 cp DADA2_OTU_tables/Error_rate_plots/error_rate_plots.html $dada2_error_rate_plots &&
117 cp -r DADA2_OTU_tables/Error_rate_plots/*.pdf $dada2_error_rate_plots.files_path &&
118 #end if
119
120 ## Categories data (only collected when a categories file was supplied)
121 #if str($categories_file_in) != 'None'
122 ## Alpha diversity boxplots (HTML page plus the Categories_shannon PDFs)
123 mkdir $alpha_div_boxplots.files_path &&
124 cp alpha_diversity_boxplots.html "$alpha_div_boxplots" &&
125 cp RESULTS/${pipeline_name}_${reference_database_name}/Alpha_diversity/Alpha_diversity_boxplot/Categories_shannon/*.pdf $alpha_div_boxplots.files_path &&
126 #end if
127
128 ## Pipeline outputs (log files etc): the logs go into the files_path of the "log_files" dataset and pipeline_outputs.html becomes the dataset itself; this is the last command, so it carries no trailing "&&"
129 mkdir $log_files.files_path &&
130 cp Amplicon_analysis_pipeline.log $log_files.files_path &&
131 cp pipeline.log $log_files.files_path &&
132 cp Pipeline_outputs.txt $log_files.files_path &&
133 cp Metatable_log/Metatable.html $log_files.files_path &&
134 cp pipeline_outputs.html "$log_files"
135 ]]></command>
136 <inputs> <!-- tool parameters; the names declared here are the variables referenced by the command template and by the output labels/filters -->
137 <param name="title" type="text" value="test" size="25"
138 label="Title" help="Optional text that will be added to the output dataset names" />
139 <param type="data" name="metatable_file_in" format="tabular"
140 label="Input Metatable.txt file" />
141 <param type="data" name="categories_file_in" format="txt"
142 label="Input Categories.txt file" optional="true"
143 help="(optional)" />
144 <conditional name="input_type"> <!-- FASTQ input as a list:paired collection (default) or as manually entered pairs -->
145 <param name="pairs_or_collection" type="select"
146 label="Input FASTQ type">
147 <option value="pairs_of_files">Pairs of datasets</option>
148 <option value="collection" selected="true">Dataset pairs in a collection</option>
149 </param>
150 <when value="collection">
151 <param name="fastq_collection" type="data_collection"
152 format="fastqsanger,fastq" collection_type="list:paired"
153 label="Collection of FASTQ forward and reverse (R1/R2) pairs"
154 help="Each FASTQ pair will be treated as one sample; the name of each sample will be taken from the first column of the Metatable file " />
155 </when>
156 <when value="pairs_of_files">
157 <repeat name="fastq_pairs" title="Input fastq pairs" min="1"> <!-- at least one pair; each entry provides name, fastq_r1 and fastq_r2 -->
158 <param type="text" name="name" value=""
159 label="Final name for FASTQ pair" />
160 <param type="data" name="fastq_r1" format="fastqsanger,fastq"
161 label="FASTQ with forward reads (R1)" />
162 <param type="data" name="fastq_r2" format="fastqsanger,fastq"
163 label="FASTQ with reverse reads (R2)" />
164 </repeat>
165 </when>
166 </conditional>
167 <param type="text" name="forward_pcr_primer" value=""
168 label="Forward PCR primer sequence"
169 help="Optional; must not include barcode or adapter sequence (-g)" />
170 <param type="text" name="reverse_pcr_primer" value=""
171 label="Reverse PCR primer sequence"
172 help="Optional; must not include barcode or adapter sequence (-G)" />
173 <param type="integer" name="trimming_threshold" value="20"
174 label="Threshold quality below which read will be trimmed"
175 help="Phred score; default is 20 (-q)" />
176 <param type="integer" name="minimum_overlap" value="10"
177 label="Minimum overlap in bp between forward and reverse reads"
178 help="Default is 10 (-O)" />
179 <param type="integer" name="minimum_length" value="200"
180 label="Minimum length in bp to keep sequence after overlapping"
181 help="Default is 200 (-L)" />
182 <param type="integer" name="sliding_window_length" value="10"
183 label="Minimum length in bp to retain a read after trimming"
184 help="Supplied to Sickle; default is 10 (-l)" />
185 <conditional name="pipeline"> <!-- the reference database choice is only offered for the Vsearch pipeline -->
186 <param type="select" name="pipeline_name"
187 label="Pipeline to use for analysis">
188 <option value="Vsearch" selected="true" >Vsearch</option>
189 <option value="DADA2">DADA2</option>
190 </param>
191 <when value="Vsearch">
192 <param type="select" name="reference_database"
193 label="Reference database">
194 <option value="" selected="true">GreenGenes</option> <!-- empty value: no database flag is emitted; the command template maps this case to "gg" -->
195 <option value="-S">Silva</option>
196 <option value="-H">Human Oral Microbiome Database (HOMD)</option>
197 </param>
198 </when>
199 <when value="DADA2"> <!-- no parameters: the command template always uses "silva" for DADA2 -->
200 </when>
201 </conditional>
202 </inputs>
203 <outputs> <!-- one dataset per file copied by the command template; labels embed the tool name, the selected pipeline and the user-supplied title -->
204 <data format="tabular" name="metatable_mod"
205 label="${tool.name}:${title} Metatable_mod.txt" />
206 <data format="tabular" name="read_counts_out"
207 label="${tool.name} (${pipeline.pipeline_name}):${title} read counts">
208 <filter>pipeline['pipeline_name'] == 'Vsearch'</filter> <!-- only created for the Vsearch pipeline -->
209 </data>
210 <data format="biom" name="tax_otu_table_biom_file"
211 label="${tool.name} (${pipeline.pipeline_name}):${title} tax OTU table (biom format)" />
212 <data format="tabular" name="otus_tre_file"
213 label="${tool.name} (${pipeline.pipeline_name}):${title} otus.tre" />
214 <data format="html" name="phylum_genus_dist_barcharts_html"
215 label="${tool.name} (${pipeline.pipeline_name}):${title} phylum genus dist barcharts HTML" />
216 <data format="tabular" name="otus_count_file"
217 label="${tool.name} (${pipeline.pipeline_name}):${title} OTUs count file" />
218 <data format="tabular" name="table_summary_file"
219 label="${tool.name} (${pipeline.pipeline_name}):${title} table summary file" />
220 <data format="fasta" name="dereplicated_nonchimera_otus_fasta"
221 label="${tool.name} (${pipeline.pipeline_name}):${title} multiplexed linearized dereplicated mc2 repset nonchimeras OTUs FASTA" />
222 <data format="html" name="fastqc_quality_boxplots_html"
223 label="${tool.name} (${pipeline.pipeline_name}):${title} FastQC per-base quality boxplots HTML" />
224 <data format="pdf" name="heatmap_otu_table_pdf"
225 label="${tool.name} (${pipeline.pipeline_name}):${title} heatmap OTU table PDF" />
226 <data format="html" name="beta_div_even_weighted_2d_plots"
227 label="${tool.name} (${pipeline.pipeline_name}):${title} beta diversity weighted 2D plots HTML" />
228 <data format="html" name="beta_div_even_unweighted_2d_plots"
229 label="${tool.name} (${pipeline.pipeline_name}):${title} beta diversity unweighted 2D plots HTML" />
230 <data format="html" name="alpha_div_rarefaction_plots"
231 label="${tool.name} (${pipeline.pipeline_name}):${title} alpha diversity rarefaction plots HTML" />
232 <data format="html" name="dada2_error_rate_plots"
233 label="${tool.name} (${pipeline.pipeline_name}):${title} DADA2 error rate plots">
234 <filter>pipeline['pipeline_name'] == 'DADA2'</filter> <!-- only created for the DADA2 pipeline -->
235 </data>
236 <data format="html" name="alpha_div_boxplots"
237 label="${tool.name} (${pipeline.pipeline_name}):${title} alpha diversity boxplots">
238 <filter>categories_file_in is not None</filter> <!-- only created when a categories file was supplied -->
239 </data>
240 <data format="html" name="log_files"
241 label="${tool.name} (${pipeline.pipeline_name}):${title} log files" />
242 </outputs>
243 <tests>
244 </tests>
245 <help><![CDATA[
246
247 What it does
248 ------------
249
250 This pipeline has been designed for the analysis of 16S rRNA data from
251 Illumina Miseq (Casava >= 1.8) paired-end reads.
252
253 Usage
254 -----
255
256 1. Preparation of the mapping file and format of unique sample id
257 *****************************************************************
258
259 Before using the amplicon analysis pipeline it is necessary to
260 follow the steps below to avoid analysis failures and ensure samples
261 are labelled appropriately. Sample names for the labelling are derived
262 from the fastq files names that are generated from the sequencing. The
263 labels will include everything between the beginning of the name and
264 the sample number (from C11 to S19 in Fig. 1)
265
266 .. image:: Pipeline_description_Fig1.png
267 :height: 46
268 :width: 382
269
270 **Figure 1**
271
272 If analysing 16S data from multiple runs:
273
274 The samples from different runs may have identical IDs. For example,
275 when sequencing the same samples twice, by chance, these could be at
276 the same position in both the runs. This would cause the fastq files
277 to have exactly the same IDs (Fig. 2).
278
279 .. image:: Pipeline_description_Fig2.png
280 :height: 100
281 :width: 463
282
283 **Figure 2**
284
285 In case of identical sample IDs the pipeline will fail to run and
286 generate an error at the beginning of the analysis.
287
288 To avoid having to change the file names, before uploading the files,
289 ensure that the samples IDs are not repeated.
290
291 2. To upload the file
292 *********************
293
294 Click on **Get Data/Upload File** from the Galaxy tool panel on the
295 left hand side.
296
297 From the pop-up window, choose how to upload the file. The
298 **Choose local file** option can be used for files up to 4Gb. Fastq files
299 from Illumina MiSeq will rarely be bigger than 4Gb and this option is
300 recommended.
301
302 After choosing the files click **Start** to begin the upload. The window can
303 now be closed and the files will be uploaded onto the Galaxy server. You
304 will see the progress on the ``HISTORY`` panel on the right
305 side of the screen. The colour will change from grey (queuing), to yellow
306 (uploading) and finally green (uploaded).
307
308 Once all the files are uploaded, click on the operations on multiple
309 datasets icon and select the fastq files that need to be analysed.
310 Click on the tab **For all selected...** and on the option
311 **Build List of Dataset pairs** (Fig. 3).
312
313 .. image:: Pipeline_description_Fig3.png
314 :height: 247
315 :width: 586
316
317 **Figure 3**
318
319 Change the filter parameter ``_1`` and ``_2`` to be ``_R1`` and ``_R2``.
320 The fastq files forward R1 and reverse R2 should now appear in the
321 corresponding columns.
322
323 Select **Autopair**. This creates a collection of paired fastq files for
324 the forward and reverse reads for each sample. The name of the pairs will
325 be the ones used by the pipeline. You are free to change the names at this
326 point as long as they are the same used in the Metatable file
327 (see section 3).
328
329 Name the collection and click on **create list**. This reduces the time
330 required to input the forward and reverse reads for each individual sample.
331
332 3. Create the Metatable files
333 *****************************
334
335 Metatable.txt
336 ~~~~~~~~~~~~~
337
338 Click on the list of pairs you just created to see the name of the single
339 pairs. The name of the pairs will be the ones used by the pipeline,
340 therefore, these are the names that need to be used in the Metatable file.
341
342 The Metatable file has to be in QIIME format. You can find a description
343 of it on QIIME website http://qiime.org/documentation/file_formats.html
344
345 EXAMPLE::
346
347 #SampleID BarcodeSequence LinkerPrimerSequence Disease Gender Description
348 Mock-RUN1 TAAGGCGAGCGTAAGA PsA Male Control
349 Mock-RUN2 CGTACTAGGCGTAAGA PsA Male Control
350 Mock-RUN3 AGGCAGAAGCGTAAGA PsC Female Control
351
352 Briefly: the column ``LinkerPrimerSequence`` is empty but it cannot be
353 deleted. The header is very important. ``#SampleID``, ``BarcodeSequence``,
354 ``LinkerPrimerSequence`` and ``Description`` are mandatory. Between
355 ``LinkerPrimerSequence`` and ``Description`` you can add as many columns
356 as you want. For every column a PCoA plot will be created (see
357 **Results** section). You can create this file in Excel and it will have
358 to be saved as ``Text(Tab delimited)``.
359
360 During the analysis the Metatable.txt will be checked to ensure that the
361 file has the correct format. If necessary, this will be modified and will
362 be available as ``Metatable_mod.txt`` in the history panel. If you are
363 going to use the metatable file for any other statistical analyses,
364 remember to use the ``Metatable_mod.txt`` one, otherwise the sample
365 names might not match!
366
367 Categories.txt (optional)
368 ~~~~~~~~~~~~~~~~~~~~~~~~~
369
370 This file is required if you want to get box plots for comparison of
371 alpha diversity indices (see **Results** section). The file is a list
372 (without header and IN ONE COLUMN) of categories present in the
373 Metatable.txt file. THE NAMES YOU ARE USING HAVE TO BE THE SAME AS THE
374 ONES USED IN THE METATABLE.TXT. You can create this file in Excel and
375 it will have to be saved as ``Text(Tab delimited)``.
376
377 EXAMPLE::
378
379 Disease
380 Gender
381
382 Metatable and categories files can be uploaded using Get Data as done
383 with the fastq files.
384
385 4. Analysis
386 ***********
387
388 Under **Amplicon_Analysis_Pipeline**
389
390 * **Title** Name to distinguish between the runs. It will be shown at
391 the beginning of each output file name.
392
393 * **Input Metatable.txt file** Select the Metatable.txt file related to
394 this analysis
395
396 * **Input Categories.txt file (Optional)** Select the Categories.txt file
397 related to this analysis
398
399 * **Input FASTQ type** select *Dataset pairs in a collection* and, then,
400 the collection of pairs you created earlier.
401
402 * **Forward/Reverse PCR primer sequence** if the PCR primer sequences
403 have not been removed from the MiSeq during the fastq creation, they
404 have to be removed before the analysis. Insert the PCR primer sequence
405 in the corresponding field. DO NOT include any barcode or adapter
406 sequence. If the PCR primers have been already trimmed by the MiSeq,
407 and you include the sequence in this field, this would lead to an error.
408 Only include the sequences if still present in the fastq files.
409
410 * **Threshold quality below which reads will be trimmed** Choose the
411 Phred score used by Sickle to trim the reads at the 3’ end.
412
413 * **Minimum length to retain a read after trimming** If the read length
414 after trimming is shorter than a user defined length, the read, along
415 with the corresponding read pair, will be discarded.
416
417 * **Minimum overlap in bp between forward and reverse reads** Choose the
418 minimum basepair overlap used by Pandaseq to assemble the reads.
419 Default is 10.
420
421 * **Minimum length in bp to keep a sequence after overlapping** Choose the
422 minimum sequence length used by Pandaseq to keep a sequence after the
423 overlapping. This depends on the expected amplicon length. Default is
424 200 (380 is used for V3-V4 16S sequencing; expected length ~440bp)
425
426 * **Pipeline to use for analysis** Choose the pipeline to use for OTU
427 clustering and chimera removal. The Galaxy tool supports the ``Vsearch``
428 and ``DADA2`` pipelines.
429
430 * **Reference database** Choose between ``GreenGenes``, ``Silva`` or
431 ``HOMD`` (Human Oral Microbiome Database) for taxa assignment.
432
433 Click on **Execute** to start the analysis.
434
435 5. Results
436 **********
437
438 Results are entirely generated using QIIME scripts. The results will
439 appear in the History panel when the analysis is completed.
440
441 The following outputs are captured:
442
443 * **Vsearch_tax_OTU_table.biom|DADA2_tax_OTU_table.biom (biom format)**
444 The OTU table in BIOM format (http://biom-format.org/)
445
446 * **otus.tre** Phylogenetic tree constructed using ``make_phylogeny.py``
447 (fasttree) QIIME script (http://qiime.org/scripts/make_phylogeny.html)
448
449 * **Phylum_genus_dist_barcharts_HTML** HTML file with bar charts at
450 Phylum, Genus and Species level
451 (http://qiime.org/scripts/summarize_taxa.html and
452 http://qiime.org/scripts/plot_taxa_summary.html)
453
454 * **OTUs_count_file** Summary of OTU counts per sample
455 (http://biom-format.org/documentation/summarizing_biom_tables.html)
456
457 * **Table_summary_file** Summary of sequences counts per sample
458 (http://biom-format.org/documentation/summarizing_biom_tables.html)
459
460 * **multiplexed_linearized_dereplicated_mc2_repset_nonchimeras_OTUs.fasta|seqs.fa**
461 Fasta file with OTU sequences (Vsearch|DADA2)
462
463 * **Heatmap_PDF** OTU heatmap in PDF format
464 (http://qiime.org/1.8.0/scripts/make_otu_heatmap_html.html )
465
466 * **Vsearch_beta_diversity_weighted_2D_plots_HTML** PCoA plots in HTML
467 format using weighted Unifrac distance measure. Samples are grouped
468 by the column names present in the Metatable file. The samples are
469 firstly rarefied to the minimum sequencing depth
470 (http://qiime.org/scripts/beta_diversity_through_plots.html )
471
472 * **Vsearch_beta_diversity_unweighted_2D_plots_HTML** PCoA plots in HTML
473 format using Unweighted Unifrac distance measure. Samples are grouped
474 by the column names present in the Metatable file. The samples are
475 firstly rarefied to the minimum sequencing depth
476 (http://qiime.org/scripts/beta_diversity_through_plots.html )
477
478 Code availability
479 -----------------
480
481 **Code is available at** https://github.com/MTutino/Amplicon_analysis
482
483 Credits
484 -------
485
486 Pipeline author: Mauro Tutino
487
488 Galaxy tool: Peter Briggs
489
490 ]]></help>
491 <citations>
492 <citation type="bibtex">
493 @misc{githubAmplicon_analysis,
494 author = {Tutino, Mauro},
495 year = {2017},
496 title = {Amplicon Analysis Pipeline},
497 publisher = {GitHub},
498 journal = {GitHub repository},
499 url = {https://github.com/MTutino/Amplicon_analysis},
500 }</citation>
501 </citations>
502 </tool>