Mercurial > repos > matthias > dada2_dada
changeset 5:4a770a261b16 draft
planemo upload for repository https://github.com/bernt-matthias/mb-galaxy-tools/tree/topic/dada2/tools/dada2 commit 990192685955e9cda0282e348c28ef6462d88a38
| author | matthias |
|---|---|
| date | Sun, 05 May 2019 12:42:02 -0400 |
| parents | a544d3f273e8 |
| children | c144b65682ad |
| files | README.md dada2_dada.xml macros.xml test-data/.reference.fa.swp test-data/.reference_species.fa.swp test-data/assignTaxonomyAddspecies_F3D0_boot.tab test-data/filterAndTrim_F3D0.tab test-data/makeSequenceTable_F3D0.pdf test-data/makeSequenceTable_F3D0.tab test-data/mergePairs_F3D0_nondefault.Rdata test-data/qualityProfile.pdf test-data/removeBimeraDenovo_F3D0.tab test-data/removeBimeraDenovo_F3D0_dada_uniques.tab test-data/removeBimeraDenovo_F3D0_derep_uniques.tab test-data/removeBimeraDenovo_F3D0_mergepairs.Rdata test-data/seqCounts_F3D0_dadaF.tab test-data/seqCounts_F3D0_derepF.tab test-data/seqCounts_F3D0_filter.tab test-data/seqCounts_F3D0_merge.tab test-data/seqCounts_F3D0_nochim.tab test-data/seqCounts_F3D0_seqtab.tab todo.txt |
| diffstat | 22 files changed, 2707 insertions(+), 32 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/README.md Sun May 05 12:42:02 2019 -0400 @@ -0,0 +1,39 @@ +Wrappers for the core functionality of the dada2 package https://benjjneb.github.io/dada2/index.html. + +- filterAndTrim +- derep +- learnErrors +- dada +- mergePairs +- makeSequenceTable +- removeBimeraDenovo + +Datatypes +========= + +The dada2 Galaxy wrappers use a few extra data types to ensure that only inputs of the correct type can be used. + +For the outputs of derep, dada, learnErrors, and mergePairs the following datatypes are used that derive from Rdata (which contains the named list that is returned from the corresponding dada function): + +- dada2_derep (Rdata: named list, see docs for derep-class) +- dada2_dada (Rdata: named list, see docs for dada-class) +- dada2_errorrates (Rdata: named list, see docs for learnErrors) +- dada2_mergepairs (Rdata: named list, see docs for mergePairs) + +For the outputs of makeSequenceTable and removeBimeraDenovo the following data types are used which derive from tabular: + +- dada2_uniques +-- in R a named integer vector (names are the unique sequences) +-- in Galaxy written as a table (each row corresponding to a unique sequence, column 1: the sequence, column 2: the count) +- dada2_sequencetable +-- in R a named integer matrix (rows = samples, columns = unique sequences) +-- in Galaxy written as a table (rows = unique sequences, columns = samples) + +Note the difference between the R and Galaxy representations! The main motivation is that the dada2_sequencetable is analogous to OTU tables as produced for instance by qiime (and it seemed natural to extend this to the uniques, which are essentially sequencetables of single samples). + + +TODOs +===== + +- implement getUniques tool to view intermediate results? +- implement tests for cached reference data
--- a/dada2_dada.xml Mon Apr 29 09:53:39 2019 -0400 +++ b/dada2_dada.xml Sun May 05 12:42:02 2019 -0400 @@ -28,7 +28,7 @@ derep <- readRDS('$batch_cond.derep') #end if -err <- readRDS('$errorrates') +err <- readRDS('$err') #if $batch_cond.batch_select == "yes": pool <- F @@ -45,7 +45,7 @@ ## not needed for end user: errorEstimationFunction = $errfoo, selfConsist = $selfconsist, pool = pool, multithread = nthreads) - #if $batch_cond.batch_select == "no": +#if $batch_cond.batch_select == "no": #if len($batch_cond.derep) > 1: for( id in names(dada_result) ){ saveRDS(dada_result[[id]], file=file.path("output" ,paste(id, "dada2_dada", sep="."))) @@ -71,15 +71,15 @@ </configfiles> <inputs> <conditional name="batch_cond"> - <param name="batch_select" type="select" label="Process samples in batches"> + <param name="batch_select" type="select" label="Process samples in batches" help="process samples jointly (default) or in independent jobs (see also below)"> <option value="no">no</option> <option value="yes">yes</option> </param> <when value="yes"> - <param name="derep" type="data" format="dada2_derep" label="Dereplicated reads"/> + <param argument="derep" type="data" format="dada2_derep" label="Dereplicated reads"/> </when> <when value="no"> - <param name="derep" type="data" multiple="true" format="dada2_derep" label="Dereplicated reads"/> + <param argument="derep" type="data" multiple="true" format="dada2_derep" label="Dereplicated reads"/> <param argument="pool" type="select" label="Pool samples"> <option value="FALSE">process samples individually</option> <option value="TRUE">pool samples</option> @@ -87,7 +87,7 @@ </param> </when> </conditional> - <param name="errorrates" type="data" format="dada2_errorrates" label="Error rates"/> + <param argument="err" type="data" format="dada2_errorrates" label="Error rates"/> <!-- not needed for end user I guess <expand macro="errorEstimationFunction"/> <param name="selfconsist" type="boolean" checked="false" truevalue="TRUE" 
falsevalue="FALSE" label="Alternate between sample inference and error rate estimation until convergence"/>--> @@ -97,27 +97,46 @@ <filter>batch_cond['batch_select']=="yes"</filter> </data> <collection name="data_collection" type="list"> - <discover_datasets pattern="__name_and_ext__" directory="output" /> + <discover_datasets pattern="(?P<name>.+)\.dada2_dada" format="dada2_dada" directory="output" /> <filter>batch_cond['batch_select']=="no"</filter> </collection> </outputs> <tests> + <!-- default, non batch --> <test> <param name="batch_cond|batch_select" value="no"/> <param name="batch_cond|derep" value="derepFastq_F3D0_R1.Rdata" ftype="dada2_derep" /> - <param name="errorrates" value="learnErrors_F3D0_R1.Rdata" ftype="dada2_errorrates" /> + <param name="err" value="learnErrors_F3D0_R1.Rdata" ftype="dada2_errorrates" /> <output_collection name="data_collection" type="list"> <element name="derepFastq_F3D0_R1.Rdata" file="dada_F3D0_R1.Rdata" ftype="dada2_dada"/> </output_collection> </test> + <!-- default, batch --> + <test> + <param name="batch_cond|batch_select" value="yes"/> + <param name="batch_cond|derep" value="derepFastq_F3D0_R1.Rdata" ftype="dada2_derep" /> + <param name="err" value="learnErrors_F3D0_R1.Rdata" ftype="dada2_errorrates" /> + <output name="dada" value="dada_F3D0_R1.Rdata" ftype="dada2_dada" /> + </test> + <!-- test for creating input for dada results for reverse, not needed for testing --> <test> <param name="batch_cond|batch_select" value="no"/> <param name="batch_cond|derep" value="derepFastq_F3D0_R2.Rdata" ftype="dada2_derep" /> - <param name="errorrates" value="learnErrors_F3D0_R2.Rdata" ftype="dada2_errorrates" /> + <param name="err" value="learnErrors_F3D0_R2.Rdata" ftype="dada2_errorrates" /> <output_collection name="data_collection" type="list"> <element name="derepFastq_F3D0_R2.Rdata" file="dada_F3D0_R2.Rdata" ftype="dada2_dada"/> </output_collection> </test> + <!-- test non-default options --> + <test> + <param 
name="batch_cond|batch_select" value="no"/> + <param name="batch_cond|derep" value="derepFastq_F3D0_R1.Rdata" ftype="dada2_derep" /> + <param name="batch_cond|pool" value="pseudo"/> + <param name="err" value="learnErrors_F3D0_R1.Rdata" ftype="dada2_errorrates" /> + <output_collection name="data_collection" type="list"> + <element name="derepFastq_F3D0_R1.Rdata" file="dada_F3D0_R1.Rdata" ftype="dada2_dada"/> + </output_collection> + </test> </tests> <help><![CDATA[ Description @@ -140,7 +159,7 @@ - Jointly (Process "samples in batches"=no): A single Galaxy job is started that processes all derep data sets jointly. You may choose different pooling strategies: if the started dada job processes the samples individually, pooled, or pseudo pooled. - In batches (Process "samples in batches"=yes): A separate Galaxy job is started for each derep data set. This is equivalent to joint processing and choosing to process samples individually. -While the single dada job (in case of joint processing) can use multiple cores on one compute node, batched processing distributes the work on a number of jobs (equal to the number of input derep data sets) where each can use multiple cores. Hence, if you intend to or need to process the data sets individually, batched processing is more efficient -- in particular if Galaxy has access to a larger number of compute ressources. +While the single dada job (in case of joint processing) can use multiple cores on one compute node, batched processing distributes the work on a number of jobs (equal to the number of input derep data sets) where each can use multiple cores. Hence, if you intend to or need to process the data sets individually, batched processing is more efficient -- in particular if Galaxy has access to a larger number of compute resources. A typical use case of individual processing of the samples is large data sets for which the pooled strategy needs too much time or memory.
--- a/macros.xml Mon Apr 29 09:53:39 2019 -0400 +++ b/macros.xml Sun May 05 12:42:02 2019 -0400 @@ -25,23 +25,44 @@ <token name="@DADA_UNIQUES@">dada2_derep,dada2_dada,dada2_mergepairs</token> + <!-- function to read dada2 data types + - derep, dada, and mergepairs are simply read as RDS + - sequence_table is a named integer matrix (rows=samples, columns=ASVs) + - uniques is a named integer vector (columns=ASVs, only one row)--> <token name="@READ_FOO@"><![CDATA[ + read.uniques <- function ( fname ) { + p <- read.table(fname, header=F, sep="\t") + n <- p[,2] + names(n) <- p[,1] + n + } #def read_data($dataset) - #if $dataset.is_of_type('dada2_derep') - readRDS('$dataset) - #else if $dataset.is_of_type('dada2_dada') + #if $dataset.is_of_type('dada2_sequencetable') + t(as.matrix( read.table('$dataset', header=T, sep="\t", row.names=1) )) + #else if $dataset.is_of_type('dada2_uniques') + read.uniques('$dataset') + #else if $dataset.is_of_type('tabular') + read.table('$dataset', header=T, sep="\t", row.names=1) + #else readRDS('$dataset') - #else if $dataset.is_of_type('dada2_sequencetable') - as.matrix( read.table('$dataset', header=T, sep="\t", row.names=1) ) - #else if $dataset.is_of_type('dada2_mergepairs') - readRDS('$dataset') - #else if $dataset.is_of_type('tabular') - read.table('$dataset', header=T, sep="\t", row.names=1 ) - #else - #raise Exception("error: unknown input type") #end if #end def ]]></token> + <!-- function to write dada2 data types (the content of the R variable 'out' is written) + - derep, dada, and mergepairs are written as RDS + - sequence_table is a named integer matrix (rows=samples, columns=ASVs) + - uniques is a named integer vector (columns=ASVs, only one row)--> + <token name="@WRITE_FOO@"><![CDATA[ +write.data <- function( data, fname, type ){ + if( type == 'dada2_uniques'){ + write.table(data, file = fname, quote = F, sep = "\t", row.names = T, col.names = F) + }else if( type== 'dada2_sequencetable'){ + write.table(t(data), file=fname, 
quote=F, sep="\t", row.names = T, col.names = NA) + }else{ + saveRDS(data, file=fname) + } +} + ]]></token> + <!-- for filterAndTrim --> <xml name="trimmers"> <section name="trim" title="Trimming parameters"> @@ -63,9 +84,9 @@ <xml name="errorEstimationFunction"> <param name="errfoo" argument="errorEstimationFunction" type="select" label="Error function"> - <option value="loessErrfun">loess</option> - <option value="noqualErrfun">noqual</option> - <option value="PacBioErrfun">PacBio</option> + <option value="loessErrfun">loess: Use a loess fit to estimate error rates from transition counts</option> + <option value="noqualErrfun">noqual: Estimate error rates for each type of transition while ignoring quality scores.</option> + <option value="PacBioErrfun">PacBio: Estimate error rates from transition counts in PacBio CCS data.</option> </param> </xml> <token name="@HELP_OVERVIEW@"><