comparison dada2_learnErrors.xml @ 5:9aeea74a1fc9 draft

planemo upload for repository https://github.com/bernt-matthias/mb-galaxy-tools/tree/topic/dada2/tools/dada2 commit 990192685955e9cda0282e348c28ef6462d88a38
author matthias
date Sun, 05 May 2019 12:22:22 -0400
parents 10141f4eaae9
children 382900945187
comparison
equal deleted inserted replaced
4:9f888de151d1 5:9aeea74a1fc9
15 15
16 args <- commandArgs(trailingOnly = TRUE) 16 args <- commandArgs(trailingOnly = TRUE)
17 nthreads <- as.integer(args[1]) 17 nthreads <- as.integer(args[1])
18 18
19 files <- c() 19 files <- c()
20 #for $read in $reads: 20 #for $read in $fls:
21 files <- c(files, '$read') 21 files <- c(files, '$read')
22 #end for 22 #end for
23 23
24 err <- learnErrors(files, nbases = 10**$nbases, 24 err <- learnErrors(files, nbases = 10**$nbases,
25 errorEstimationFunction = $advanced.errfoo, multithread = nthreads, 25 errorEstimationFunction = $advanced.errfoo, multithread = nthreads,
27 27
28 ## write.table(err\$err_out, file = '$errors', quote = F, sep = "\t", row.names = T, col.names = F) 28 ## write.table(err\$err_out, file = '$errors', quote = F, sep = "\t", row.names = T, col.names = F)
29 saveRDS(err, file='$errors') 29 saveRDS(err, file='$errors')
30 30
31 ## generate error plots 31 ## generate error plots
32 plot <- plotErrors(err, obs = $plotopt.obs, err_out = $plotopt.errout, err_in = $plotopt.errin, nominalQ = $plotopt.nominalQ) 32 plot <- plotErrors(err, obs = $plotopt.obs, err_out = $plotopt.err_out, err_in = $plotopt.err_in, nominalQ = $plotopt.nominalQ)
33 ggsave('plot.pdf', plot, width = 20,height = 15,units = c("cm")) 33 ggsave('plot.pdf', plot, width = 20,height = 15,units = c("cm"))
34 ]]></configfile> 34 ]]></configfile>
35 </configfiles> 35 </configfiles>
36 <inputs> 36 <inputs>
37 <param name="reads" type="data" multiple="true" format="fastqsanger,fastqsanger.gz" label="Short read data" help="forward or reverse reads should be processed separately"/> 37 <param argument="fls" type="data" multiple="true" format="fastqsanger,fastqsanger.gz" label="Short read data" help="forward or reverse reads should be processed separately"/>
38 <param argument="nbases" type="integer" value="8" min="0" label="Magnitide of number of bases to use for learning"/> 38 <param argument="nbases" type="integer" value="8" min="0" label="Magnitide of number of bases to use for learning"/>
39 <section name="advanced" title="Advanced Option"> 39 <section name="advanced" title="Advanced Option">
40 <expand macro="errorEstimationFunction"/> 40 <expand macro="errorEstimationFunction"/>
41 <param argument="randomize" type="boolean" checked="false" truevalue="TRUE" falsevalue="FALSE" label="Randomize samples"/> 41 <param argument="randomize" type="boolean" checked="false" truevalue="TRUE" falsevalue="FALSE" label="Randomize samples" help="Pick samples at random, otherwise samples are read in the provided order until enough reads are obtained (default)."/>
42 <param name="maxconsist" argument="MAX_CONSIST" type="integer" value="10" min="0" label="Maximum number of times to step through the selfconsistency loop" help=""/> 42 <param name="maxconsist" argument="MAX_CONSIST" type="integer" value="10" min="0" label="Maximum number of times to step through the selfconsistency loop" help="If convergence was not reached in MAX_CONSIST steps, the estimated error rates in the last step are returned."/>
43 <param name="omegac" argument="OMEGA_C" type="integer" value="0" min="0" label="Threshold at which unique sequences inferred to contain errors are corrected" help=""/> 43 <param name="omegac" argument="OMEGA_C" type="float" value="0" min="0" label="Threshold at which unique sequences inferred to contain errors are corrected" help="For reasons of convergence, and because it is
44 more conservative, it is recommended to set this value to 0, which means that
45 all reads are counted and contribute to estimating the error rates."/>
44 </section> 46 </section>
45 <section name="plotopt" title="Plotting Option"> 47 <section name="plotopt" title="Plotting Option">
46 <param name="obs" type="boolean" checked="true" truevalue="TRUE" falsevalue="FALSE" label="Plot observed error rates"/> 48 <param argument="obs" type="boolean" checked="true" truevalue="TRUE" falsevalue="FALSE" label="Plot observed error rates"/>
47 <param name="errout" type="boolean" checked="true" truevalue="TRUE" falsevalue="FALSE" label="Plot output error rates"/> 49 <param argument="err_out" type="boolean" checked="true" truevalue="TRUE" falsevalue="FALSE" label="Plot output error rates"/>
48 <param name="errin" type="boolean" checked="false" truevalue="TRUE" falsevalue="FALSE" label="Plot input error rates"/> 50 <param argument="err_in" type="boolean" checked="false" truevalue="TRUE" falsevalue="FALSE" label="Plot input error rates"/>
49 <param name="nominalQ" type="boolean" checked="true" truevalue="TRUE" falsevalue="FALSE" label="Plot expected error rates"/> 51 <param argument="nominalQ" type="boolean" checked="true" truevalue="TRUE" falsevalue="FALSE" label="Plot expected error rates"/>
50 </section> 52 </section>
51 </inputs> 53 </inputs>
52 <outputs> 54 <outputs>
53 <data name="errors" format="dada2_errorrates" label="${tool.name} on ${on_string}"/> 55 <data name="errors" format="dada2_errorrates" label="${tool.name} on ${on_string}"/>
54 <data name="plot" format="pdf" from_work_dir="plot.pdf" label="${tool.name} on ${on_string}: error rates plot"/> 56 <data name="plot" format="pdf" from_work_dir="plot.pdf" label="${tool.name} on ${on_string}: error rates plot"/>
55 </outputs> 57 </outputs>
56 <tests> 58 <tests>
57 <test> 59 <test>
58 <param name="reads" value="filterAndTrim_F3D0_R1.fq.gz" ftype="fastqsanger.gz"/> 60 <param name="fls" value="filterAndTrim_F3D0_R1.fq.gz" ftype="fastqsanger.gz"/>
59 <output name="errors" value="learnErrors_F3D0_R1.Rdata" ftype="dada2_errorrates"/> 61 <output name="errors" value="learnErrors_F3D0_R1.Rdata" ftype="dada2_errorrates"/>
60 <output name="plot" value="learnErrors_F3D0_R1.pdf" ftype="pdf" /> 62 <output name="plot" value="learnErrors_F3D0_R1.pdf" ftype="pdf" />
61 </test> 63 </test>
62 <!-- test for creating input for dada results for reverse, not needed for testing --> 64 <!-- test for creating input for dada results for reverse, not needed for testing -->
63 <test> 65 <test>
64 <param name="reads" value="filterAndTrim_F3D0_R2.fq.gz" ftype="fastqsanger.gz"/> 66 <param name="fls" value="filterAndTrim_F3D0_R2.fq.gz" ftype="fastqsanger.gz"/>
65 <output name="errors" value="learnErrors_F3D0_R2.Rdata" ftype="dada2_errorrates"/> 67 <output name="errors" value="learnErrors_F3D0_R2.Rdata" ftype="dada2_errorrates"/>
66 <output name="plot" value="learnErrors_F3D0_R2.pdf" ftype="pdf" /> 68 <output name="plot" value="learnErrors_F3D0_R2.pdf" ftype="pdf" />
67 </test> 69 </test>
68 <!-- test w non-default parameters --> 70 <!-- test w non-default parameters -->
69 <test> 71 <test>
70 <param name="reads" value="filterAndTrim_F3D0_R1.fq.gz" ftype="fastqsanger.gz"/> 72 <param name="fls" value="filterAndTrim_F3D0_R1.fq.gz" ftype="fastqsanger.gz"/>
73 <param name="nbases" value="6" />
74 <param name="advanced|errfoo" value="noqualErrfun" />
75 <param name="advanced|randomize" value="TRUE" />
76 <param name="advanced|maxconsist" value="5" />
77 <param name="advanced|omegac" value="1e-10" />
71 <param name="plotopt|obs" value="FALSE" /> 78 <param name="plotopt|obs" value="FALSE" />
72 <param name="plotopt|errout" value="FALSE" /> 79 <param name="plotopt|err_out" value="FALSE" />
73 <param name="plotopt|errin" value="TRUE" /> 80 <param name="plotopt|err_in" value="TRUE" />
74 <param name="plotopt|nominalQ" value="FALSE"/> 81 <param name="plotopt|nominalQ" value="FALSE"/>
75 <output name="errors" value="learnErrors_F3D0_R1.Rdata" ftype="dada2_errorrates" /> 82 <output name="errors" value="learnErrors_F3D0_R1.Rdata" ftype="dada2_errorrates" compare="sim_size" delta="14000"/>
76 <output name="output" value="learnErrors_F3D0_R1.pdf" ftype="pdf" compare="sim_size" /> 83 <output name="plot" value="learnErrors_F3D0_R1.pdf" ftype="pdf" compare="sim_size" />
77 </test> 84 </test>
78 <!-- TODO test w multiple inputs --> 85 <!-- TODO test w multiple inputs -->
79 </tests> 86 </tests>
80 <help><![CDATA[ 87 <help><![CDATA[
81 Description 88 Description
82 ........... 89 ...........
83 90
84 Error rates are learned by alternating between sample inference and error rate estimation until convergence. Additionally a plot is generated that shows the observed frequency of each transition (eg. A->C) as a function of the associated quality score, the final estimated error rates (if they exist), the initial input rates, and the expected error rates under the nominal definition of quality scores. 91 Error rates are learned by alternating between sample inference and error rate estimation until convergence. Additionally a plot is generated that shows the observed frequency of each transition (eg. A->C) as a function of the associated quality score, the final estimated error rates (if they exist), the initial input rates, and the expected error rates under the nominal definition of quality scores.
92
93 In addition a plot is generated (with plotErrors) that shows the observed frequency of each transition (eg. A->C) as a function of the associated quality score. Also the final estimated error rates (if they exist) are shown. Optionally also the initial input rates and the expected error rates under the nominal definition of quality scores can be added to the plot.
85 94
86 Usage 95 Usage
87 ..... 96 .....
88 97
89 **Input** is one or more FASTQ datasets containing the filtered and trimmed reads of the samples. 98 **Input** is one or more FASTQ datasets containing the filtered and trimmed reads of the samples.
101 110
102 The learnErrors method learns a parametric error model from the data, by alternating estimation of the error rates and inference of sample composition until they converge on a jointly consistent solution. As in many machine-learning problems, the algorithm must begin with an initial guess, for which the maximum possible error rates in this data are used (the error rates if only the most abundant sequence is correct and all the rest are errors). 111 The learnErrors method learns a parametric error model from the data, by alternating estimation of the error rates and inference of sample composition until they converge on a jointly consistent solution. As in many machine-learning problems, the algorithm must begin with an initial guess, for which the maximum possible error rates in this data are used (the error rates if only the most abundant sequence is correct and all the rest are errors).
103 112
104 It is expected that the estimated error rates (black lines in the plot) are a good fit to the observed rates (points in the plot), and that the error rates drop with increased quality. Try to increase the **number of bases to use for learning** if this is not the case. 113 It is expected that the estimated error rates (black lines in the plot) are a good fit to the observed rates (points in the plot), and that the error rates drop with increased quality. Try to increase the **number of bases to use for learning** if this is not the case.
105 114
115 Error functions:
116
117 - loessErrfun: accepts a matrix of observed transitions, with each transition corresponding to a row (eg. row 2 = A->C) and each column to a quality score (eg. col 31 = Q30). It returns a matrix of estimated error rates of the same shape. Error rates are estimated by a loess fit of the observed rates of each transition as a function of the quality score. Self-transitions (i.e. A->A) are taken to be the left-over probability.
118 - noqualErrfun: accepts a matrix of observed transitions, groups together all observed transitions regardless of quality scores, and estimates the error rate for that transition as the observed fraction of those transitions. The effect is that quality scores will be effectively ignored.
119 - PacBioErrfun: This function accepts a matrix of observed transitions from PacBio CCS amplicon sequencing data, with each transition corresponding to a row (eg. row 2 = A->C) and each column to a quality score (eg. col 31 = Q30). It returns a matrix of estimated error rates of the same shape. Error rates are estimated by loessErrfun for quality scores 0-92, and individually by the maximum likelihood estimate for the maximum quality score of 93.
120
106 @HELP_OVERVIEW@ 121 @HELP_OVERVIEW@
107 ]]></help> 122 ]]></help>
108 <expand macro="citations"/> 123 <expand macro="citations"/>
109 </tool> 124 </tool>