changeset 9:d908015e5889 draft

planemo upload for repository https://github.com/bernt-matthias/mb-galaxy-tools/tree/topic/dada2/tools/dada2 commit a54770771e567c7ad8a9dd75cc4689c3935ef11c
author matthias
date Tue, 28 May 2019 12:15:38 -0400
parents 7970dfbedde3
children 279014cdd101
files README.md dada2_plotQualityProfile.xml macros.xml test-data/dada2_species.loc test-data/dada2_taxonomy.loc test-data/qualityProfile.pdf test-data/qualityProfileMultiple.pdf test-data/qualityProfileMultiple_rev.pdf test-data/qualityProfileSmallSample.pdf test-data/qualityProfile_rev.pdf tool_data_table_conf.xml.test
diffstat 11 files changed, 158 insertions(+), 109 deletions(-) [+]
line wrap: on
line diff
--- a/README.md	Mon May 27 13:23:01 2019 -0400
+++ b/README.md	Tue May 28 12:15:38 2019 -0400
@@ -11,7 +11,7 @@
 Datatypes
 =========
 
-The dada2 Galaxy wrappers use a few extra data types to ensure that only inputs of the correct type can be used. 
+The dada2 Galaxy wrappers use a few extra data types to ensure that only inputs of the correct type can be used. These datatypes are available from Galaxy release 19.05; for earlier releases they need to be added manually.
 
 For the outputs of derep, dada, learnErrors, and mergePairs the following datatypes are used that derive from  Rdata (which contains the named list that is returned from the corresponding dada function):
 
@@ -35,5 +35,4 @@
 TODOs 
 =====
 
-- implememt getUniques tool to view intermediate results?
-- implement tests for cached reference data
+- implement getUniques tool to view intermediate results?
--- a/dada2_plotQualityProfile.xml	Mon May 27 13:23:01 2019 -0400
+++ b/dada2_plotQualityProfile.xml	Tue May 28 12:15:38 2019 -0400
@@ -8,121 +8,98 @@
     <command detect_errors="exit_code"><![CDATA[
 ##name files by linking
 #import re
-#if "batch" in str($paired_cond.paired_select)
-  #set elid = re.sub('[^\w\-\.]', '_', str($paired_cond.fl.element_identifier))
-  #if "single" in str($paired_cond.paired_select)
-    ln -s '$paired_cond.fl' '$elid' &&
-  #else
-    ln -s '$paired_cond.fl.forward' '$elid'_forward &&
-    ln -s '$paired_cond.fl.reverse' '$elid'_reverse &&
-  #end if
+mkdir forward &&
+#if $batch_cond.paired_cond.paired_select != "single"
+    mkdir reverse &&
+#end if
+
+#if $batch_cond.batch_select == "batch":
+    #set elid = re.sub('[^\w\-\.]', '_', str($batch_cond.paired_cond.reads.element_identifier))
+    #if $batch_cond.paired_cond.paired_select != "paired"
+        ln -s '$batch_cond.paired_cond.reads' forward/'$elid' &&
+    #else
+        ln -s '$batch_cond.paired_cond.reads.forward' forward/'$elid' &&
+        ln -s '$batch_cond.paired_cond.reads.reverse' reverse/'$elid' &&
+    #end if
+    #if $batch_cond.paired_cond.paired_select == "separate"
+        ln -s '$batch_cond.paired_cond.sdaer' reverse/'$elid' &&
+    #end if
 #else
-  #for $read in $paired_cond.fl:
-    #set elid = re.sub('[^\w\-\.]', '_', str($read.element_identifier))
-    #if "single" in str($paired_cond.paired_select)
-      ln -s '$read' '$elid' &&
-    #else
-      ln -s '$read.forward' '$elid'_forward &&
-      ln -s '$read.reverse' '$elid'_reverse &&
+    #for $read in $batch_cond.paired_cond.reads:
+        #set elid = re.sub('[^\w\-\.]', '_', str($read.element_identifier))
+        #if $batch_cond.paired_cond.paired_select != "paired"
+            ln -s '$read' forward/'$elid' &&
+        #else
+            ln -s '$read.forward' forward/'$elid' &&
+            ln -s '$read.reverse' reverse/'$elid' &&
+        #end if
+    #end for
+    #if $batch_cond.paired_cond.paired_select == "separate"
+        #for $read in $batch_cond.paired_cond.sdaer:
+            #set elid = re.sub('[^\w\-\.]', '_', str($read.element_identifier))
+            ln -s '$read' reverse/'$elid' &&
+        #end for
     #end if
-  #end for
 #end if
-	
-	Rscript --slave '$dada2_script'
+
+    Rscript --slave '$dada2_script'
     ]]></command>
     <configfiles>
         <configfile name="dada2_script"><![CDATA[
 #import re
-fwd_files = c()
-rev_files = c()
-#if "batch" in str($paired_cond.paired_select)
-  #set elid = re.sub('[^\w\-\.]', '_', str($paired_cond.fl.element_identifier))
-  #if "single" in str($paired_cond.paired_select)
-    fwd_files = c(fwd_files, '$elid')
-  #else
-    fwd_files = c(fwd_files, paste('$elid', 'forward', sep = "_"))
-    rev_files = c(rev_files, paste('$elid', 'reverse', sep = "_"))
-  #end if
-#else
-  #for $read in $paired_cond.fl:
-    #set elid = re.sub('[^\w\-\.]', '_', str($read.element_identifier))
-    #if "single" in str($paired_cond.paired_select)
-      fwd_files = c(fwd_files, '$elid')
-    #else
-      fwd_files = c(fwd_files, paste('$elid', 'forward', sep = "_"))
-      rev_files = c(rev_files, paste('$elid', 'reverse', sep = "_"))
-    #end if
-  #end for
-#end if
+library(ggplot2, quietly=T)
+library(dada2, quietly=T)
 
-#if not "batch" in str($paired_cond.paired_select)
-agg = $paired_cond.aggregate
+#if $batch_cond.batch_select != "batch"
+agg = $batch_cond.aggregate
 #else
 agg = FALSE
 #end if
 
-library(ggplot2, quietly=T)
-library(dada2, quietly=T)
-
-qp <- plotQualityProfile(fwd_files,
-#if str($n) != ""
-    n=$n,
-#end if
-    aggregate = agg)
+fwd_files = list.files("forward", full.names=T)
+qp <- plotQualityProfile(fwd_files, n=$n, aggregate = agg)
 ggsave('output.pdf', qp, width = 20,height = 15,units = c("cm"))
 
-#if "paired" in str($paired_cond.paired_select)
-qp <- plotQualityProfile(rev_files,
-#if str($n) != ""
-    n=$n,
-#end if
-    aggregate = agg)
+#if $batch_cond.paired_cond.paired_select != "single"
+rev_files = list.files("reverse", full.names=T)
+qp <- plotQualityProfile(rev_files, n=$n, aggregate = agg)
 ggsave('output_rev.pdf', qp, width = 20,height = 15,units = c("cm"))
 #end if
     ]]></configfile>
     </configfiles>
     <inputs>
-        <conditional name="paired_cond">
-            <param name="paired_select" type="select" label="Input data organisation and processing mode" help="Select if data is organized in a paired collection or not (note that the pairing of the data sets is not used by the tool); batch will create a separate pdf for each input data set or data set pair; non-batch will create one pdf containing a plot for each data set">
-                <option value="paired">paired - non batch</option>
-                <option value="single">single - non batch</option>
-                <option value="paired_batch">paired - batch</option>
-                <option value="single_batch">single - batch</option>
+        <conditional name="batch_cond">
+            <param name="batch_select" type="select" label="Processing mode" help="Joint processing processes all reads at once in a single job creating a single output (two in the case of paired data). Batch processes the samples in separate jobs and creates separate output for each">
+                <option value="joint">Joint</option>
+                <option value="batch">Batch</option>
             </param>
-            <when value="paired">
-                <param argument="fl" type="data_collection" collection_type="list:paired" format="fastq,fastq.gz" label="Short read data"/>
+            <when value="joint">
+                <expand macro="fastq_input" multiple="True" collection_type="list:paired" argument_fwd="fl" argument_rev="fl"/>
                 <param argument="aggregate" type="boolean" label="Aggregate data" checked="True" truevalue="TRUE" falsevalue="FALSE" help="Create a single plot for all data sets (default) or a separate plot for each data set"/>
             </when>
-            <when value="single">
-                <param argument="fl" type="data" multiple="true" format="fastq,fastq.gz" label="Short read data"/>
-                <param argument="aggregate" type="boolean" label="Aggregate data" checked="True" truevalue="TRUE" falsevalue="FALSE" help="Create a single plot for all data sets (default) or a separate plot for each data set"/>
-            </when>
-            <when value="paired_batch">
-                <param argument="fl" type="data_collection" collection_type="paired" format="fastq,fastq.gz" label="Short read data"/>
-            </when>
-            <when value="single_batch">
-                <param argument="fl" type="data" format="fastq,fastq.gz" label="Short read data"/>
+            <when value="batch">
+                <expand macro="fastq_input" multiple="False" collection_type="paired" argument_fwd="fl" argument_rev="fl"/>
             </when>
         </conditional>
         <param argument="n" type="integer" value="500000" label="sample number" help="number of records to sample from the fastq file"/>
     </inputs>
     <outputs>
         <data name="output" format="pdf" from_work_dir="output.pdf">
-            <filter>"single" in paired_cond['paired_select']</filter>
-		</data>
-		<data name="output_fwd" format="pdf" from_work_dir="output.pdf" label="${tool.name} on ${on_string}: forward reads">
-            <filter>"paired" in paired_cond['paired_select']</filter>
+            <filter>batch_cond['paired_cond']['paired_select'] == "single"</filter>
         </data>
-		<data name="output_rev" format="pdf" from_work_dir="output_rev.pdf" label="${tool.name} on ${on_string}: reverse reads">
-            <filter>"paired" in paired_cond['paired_select']</filter>
+        <data name="output_fwd" format="pdf" from_work_dir="output.pdf" label="${tool.name} on ${on_string}: forward reads">
+            <filter>batch_cond['paired_cond']['paired_select'] != "single"</filter>
+        </data>
+        <data name="output_rev" format="pdf" from_work_dir="output_rev.pdf" label="${tool.name} on ${on_string}: reverse reads">
+            <filter>batch_cond['paired_cond']['paired_select'] != "single"</filter>
         </data>
     </outputs>
     <tests>
-        <!-- paired non-batch, aggregate -->
-        <test>
-            <param name="paired_cond|paired_select" value="paired"/>
-            <param name="paired_cond|aggregate" value="TRUE"/>
-            <param name="paired_cond|fl">
+        <!-- paired joint, no-aggregate -->
+        <test expect_num_outputs="2">
+            <param name="batch_cond|batch_select" value="joint"/>
+            <param name="batch_cond|paired_cond|paired_select" value="paired"/>
+            <param name="batch_cond|paired_cond|reads">
                 <collection type="list:paired">
                     <element name="F3D0_S188_L001">
                         <collection type="paired">
@@ -132,13 +109,35 @@
                     </element>
                 </collection>
             </param>
+            <param name="batch_cond|aggregate" value="FALSE"/>
             <output name="output_fwd" value="qualityProfileMultiple.pdf" ftype="pdf"/>
             <output name="output_rev" value="qualityProfileMultiple_rev.pdf" ftype="pdf"/>
         </test>
-        <!-- paired, batch, no aggregate-->
-        <test>
-            <param name="paired_cond|paired_select" value="paired_batch"/>
-            <param name="paired_cond|fl">
+        <!-- paired-separate joint, no-aggregate (sim_size because element ids differ) -->
+        <test expect_num_outputs="2">
+            <param name="batch_cond|batch_select" value="joint"/>
+            <param name="batch_cond|paired_cond|paired_select" value="separate"/>
+            <param name="batch_cond|paired_cond|reads" value="F3D0_S188_L001_R1_001.fastq.gz" ftype="fastqsanger.gz"/>
+            <param name="batch_cond|paired_cond|sdaer" value="F3D0_S188_L001_R2_001.fastq.gz" ftype="fastqsanger.gz"/>
+            <param name="batch_cond|aggregate" value="FALSE"/>
+            <output name="output_fwd" value="qualityProfileMultiple.pdf" ftype="pdf" compare="sim_size"/>
+            <output name="output_rev" value="qualityProfileMultiple_rev.pdf" ftype="pdf" compare="sim_size"/>
+        </test>
+        <!-- single, joint, aggregate, small sample -->
+        <test expect_num_outputs="1">
+            <param name="batch_cond|batch_select" value="joint"/>
+            <param name="batch_cond|paired_cond|paired_select" value="single"/>
+            <param name="batch_cond|paired_cond|reads" value="F3D0_S188_L001_R1_001.fastq.gz,F3D0_S188_L001_R2_001.fastq.gz" ftype="fastqsanger.gz"/>
+            <param name="n" value="10000"/>
+            <param name="batch_cond|aggregate" value="TRUE"/>
+            <output name="output" value="qualityProfileSmallSample.pdf" ftype="pdf"/>
+        </test>
+
+        <!-- paired, batch -->
+        <test expect_num_outputs="2">
+            <param name="batch_cond|batch_select" value="batch"/>
+            <param name="batch_cond|paired_cond|paired_select" value="paired"/>
+            <param name="batch_cond|paired_cond|reads">
                 <collection type="paired">
                     <element name="forward" value="F3D0_S188_L001_R1_001.fastq.gz" ftype="fastqsanger.gz"/>
                     <element name="reverse" value="F3D0_S188_L001_R2_001.fastq.gz" ftype="fastqsanger.gz"/>
@@ -147,22 +146,24 @@
             <output name="output_fwd" value="qualityProfile.pdf" ftype="pdf"/>
             <output name="output_rev" value="qualityProfile_rev.pdf" ftype="pdf"/>
         </test>
-        <!-- single, non-batch, aggregate -->
-        <test>
-            <param name="paired_cond|paired_select" value="single"/>
-            <param name="paired_cond|aggregate" value="TRUE"/>
-            <param name="paired_cond|fl" value="F3D0_S188_L001_R1_001.fastq.gz,F3D0_S188_L001_R2_001.fastq.gz" ftype="fastqsanger.gz"/>
-            <param name="n" value="10000"/>
-            <output name="output" value="qualityProfileSmallSample.pdf" ftype="pdf"/>
+        <!-- paired-separate, batch (sim_size because element ids differ) -->
+        <test expect_num_outputs="2">
+            <param name="batch_cond|batch_select" value="batch"/>
+            <param name="batch_cond|paired_cond|paired_select" value="separate"/>
+            <param name="batch_cond|paired_cond|reads" value="F3D0_S188_L001_R1_001.fastq.gz" ftype="fastqsanger.gz"/>
+            <param name="batch_cond|paired_cond|sdaer" value="F3D0_S188_L001_R2_001.fastq.gz" ftype="fastqsanger.gz"/>
+            <output name="output_fwd" value="qualityProfile.pdf" ftype="pdf" compare="sim_size"/>
+            <output name="output_rev" value="qualityProfile_rev.pdf" ftype="pdf" compare="sim_size"/>
         </test>
-        <!-- single, batch, no aggregate -->
-        <test>
-            <param name="aggregate" value="FALSE"/>
-            <param name="paired_cond|paired_select" value="single_batch"/>
-            <param name="paired_cond|fl" value="F3D0_S188_L001_R1_001.fastq.gz" ftype="fastqsanger.gz"/>
+        <!-- single, batch -->
+        <test expect_num_outputs="1">
+            <param name="batch_cond|batch_select" value="batch"/>
+            <param name="batch_cond|paired_cond|paired_select" value="single"/>
+            <param name="batch_cond|paired_cond|reads" value="F3D0_S188_L001_R1_001.fastq.gz" ftype="fastqsanger.gz"/>
             <param name="n" value="10000"/>
             <output name="output" value="qualityProfileSmallSample.pdf" ftype="pdf" compare="sim_size"/>
-        </test>    </tests>
+        </test>
+    </tests>
     <help><![CDATA[
 Summary
 .......
@@ -174,8 +175,6 @@
 
 The distribution of quality scores at each position is shown as a grey-scale heat map, with dark colors corresponding to higher frequency. The plotted lines show positional summary statistics: green is the mean, orange is the median, and the dashed orange lines are the 25th and 75th quantiles. If the sequences vary in length, a red line will be plotted showing the percentage of reads that extend
 to at least that position.
-
-Note this tool ignores the pairing of the reads, but the data is just processed as list.
     ]]></help>
     <expand macro="citations"/>
 </tool>
--- a/macros.xml	Mon May 27 13:23:01 2019 -0400
+++ b/macros.xml	Tue May 28 12:15:38 2019 -0400
@@ -26,7 +26,7 @@
     <token name="@DADA_UNIQUES@">dada2_derep,dada2_dada,dada2_mergepairs</token>
 
     <!-- function to read dada2 data types
-         - derep, dada, and mergepairs are simply read as RDS 
+         - derep, dada, and mergepairs are simply read as RDS
          - sequence_table is a named integer matrix (rows=samples, columns=ASVs)
          - uniques is a named integer vector (columns=ASVs, only one rows)-->
     <token name="@READ_FOO@"><![CDATA[
@@ -48,7 +48,7 @@
     #end def
     ]]></token>
     <!-- function to write dada2 data types (the content or the R variable 'out' is written)
-         - derep, dada, and mergepairs are written as RDS 
+         - derep, dada, and mergepairs are written as RDS
          - sequence_table is a named integer matrix (rows=samples, columns=ASVs)
          - uniques is a named integer vector (columns=ASVs, only one rows)-->
     <token name="@WRITE_FOO@"><![CDATA[
@@ -61,7 +61,27 @@
         saveRDS(data, file=fname)
     }
 }
-    ]]></token> 
+    ]]></token>
+
+    <xml name="fastq_input" token_multiple="" token_collection_type="" token_argument_fwd="" token_argument_rev="">
+        <conditional name="paired_cond">
+            <param name="paired_select" type="select" label="Paired reads">
+                <option value="paired">paired - in a data set pair</option>
+                <option value="separate">paired - in two separate data sets</option>
+                <option value="single">single</option>
+            </param>
+            <when value="paired">
+                <param name="reads" argument="@ARGUMENT_FWD@/@ARGUMENT_REV@" type="data_collection" collection_type="@COLLECTION_TYPE@" format="fastq,fastq.gz" label="Paired short read data"/>
+            </when>
+            <when value="separate">
+                <param name="reads" argument="@ARGUMENT_FWD@" type="data" format="fastq,fastq.gz" multiple="@MULTIPLE@" label="Forward read data"/>
+                <param name="sdaer" argument="@ARGUMENT_REV@" type="data" format="fastq,fastq.gz" multiple="@MULTIPLE@" label="Reverse read data"/>
+            </when>
+            <when value="single">
+                <param name="reads" argument="@ARGUMENT_FWD@" type="data" format="fastq,fastq.gz" multiple="@MULTIPLE@" label="Short read data"/>
+            </when>
+        </conditional>
+    </xml>
 
     <!-- for filterAndTrim -->
     <xml name="trimmers">
@@ -69,7 +89,7 @@
             <param argument="truncQ" type="integer" value="2" min="0" label="Truncate reads at quality threshold" help="Truncate reads at the first instance of a quality score less than or equal to this threshold"/>
             <param argument="trimLeft" type="integer" value="0" min="0" label="Trim start of each read" help="The number of nucleotides to remove from the start of each read."/>
             <param argument="trimRight" type="integer" value="0" min="0" label="Trim end of each read" help="The number of nucleotides to remove from the end of each read"/>
-			<param argument="truncLen" type="integer" value="0" min="0" label="Truncate read length" help="Truncate reads after this amount of bases. Reads shorter than this are discarded. (default 0: no truncation)"/>
+            <param argument="truncLen" type="integer" value="0" min="0" label="Truncate read length" help="Truncate reads after this amount of bases. Reads shorter than this are discarded. (default 0: no truncation)"/>
         </section>
     </xml>
     <xml name="filters">
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/dada2_species.loc	Tue May 28 12:15:38 2019 -0400
@@ -0,0 +1,10 @@
+# This is a sample file distributed with Galaxy that is used to define a
+# list of dada2 reference data sets for species assignment, using three
+# tab separated columns:
+#
+# <unique_build_id>	<display_name>	<fasta_file_path>
+#
+# Datasets can be retrieved from http://busco.ezlab.org/frame_wget.html
+#
+# Datasets can be retrieved from https://benjjneb.github.io/dada2/training.html
+test_buildid	test_displayname	${__HERE__}/reference_species.fa
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/dada2_taxonomy.loc	Tue May 28 12:15:38 2019 -0400
@@ -0,0 +1,10 @@
+# This is a sample file distributed with Galaxy that is used to define a
+# list of dada2 reference data sets for taxonomy assignment, using four
+# tab separated columns:
+#
+# <unique_build_id>	<display_name>	<fasta_file_path>	<taxlevels>
+#
+# Datasets can be retrieved from https://benjjneb.github.io/dada2/training.html
+# 
+# taxlevels is a comma separated list of taxonomy levels
+test_buildid	test_displayname	${__HERE__}/reference.fa	Level1,Level2,Level3,Level4,Level5
Binary file test-data/qualityProfile.pdf has changed
Binary file test-data/qualityProfileMultiple.pdf has changed
Binary file test-data/qualityProfileMultiple_rev.pdf has changed
Binary file test-data/qualityProfileSmallSample.pdf has changed
Binary file test-data/qualityProfile_rev.pdf has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.test	Tue May 28 12:15:38 2019 -0400
@@ -0,0 +1,11 @@
+<?xml version="1.0"?>
+<tables>
+    <table name="dada2_species" comment_char="#">
+        <columns>value, name, path</columns>
+        <file path="${__HERE__}/test-data/dada2_species.loc" />
+    </table>
+    <table name="dada2_taxonomy" comment_char="#">
+        <columns>value, name, path, taxlevels</columns>
+        <file path="${__HERE__}/test-data/dada2_taxonomy.loc" />
+    </table>
+</tables>