comparison BC/batch_correction.xml @ 3:2e3a23dd6c24 draft default tip

Uploaded
author melpetera
date Thu, 28 Feb 2019 05:12:34 -0500
parents
children
comparison
equal deleted inserted replaced
2:57edfd3943ab 3:2e3a23dd6c24
1 <tool id="Batch_correction" name="Batch_correction" version="2.2.4">
2 <description>Corrects intensities for signal drift and batch-effects</description>
3 <!-- Package dependencies of the R wrappers, each pinned to an exact version. -->
4 <requirements>
5 <requirement type="package" version="1.1_4">r-batch</requirement>
6 <requirement type="package" version="1.7_8">r-ade4</requirement>
7 <requirement type="package" version="1.70.0">bioconductor-pcamethods</requirement>
8 <requirement type="package" version="1.10.0">bioconductor-ropls</requirement>
9 </requirements>
10 <!-- Error detection: any exit code of 1 or above marks the job as a fatal failure. -->
11 <stdio>
12 <exit_code range="1:" level="fatal" />
13 </stdio>
14 <!-- Command line: the two "all_loess_*" methods run batch_correction_all_loess_wrapper.R, every other method runs batch_correction_wrapper.R (with span forced to "none" for "linear"). The script path and every interpolated value are single-quoted so that paths with spaces and user-supplied text reach Rscript as literal arguments. -->
15 <command><![CDATA[
16 #if str($span_condition.method) == 'all_loess_pool':
17 Rscript '$__tool_directory__/batch_correction_all_loess_wrapper.R'
18 dataMatrix '$dataMatrix'
19 sampleMetadata '$sampleMetadata'
20 variableMetadata '$variableMetadata'
21 method 'all_loess_pool'
22 span '${span_condition.span}'
23
24 #elif str($span_condition.method) == 'all_loess_sample':
25 Rscript '$__tool_directory__/batch_correction_all_loess_wrapper.R'
26 dataMatrix '$dataMatrix'
27 sampleMetadata '$sampleMetadata'
28 variableMetadata '$variableMetadata'
29 method 'all_loess_sample'
30 span '${span_condition.span}'
31 #else:
32 Rscript '$__tool_directory__/batch_correction_wrapper.R'
33 analyse 'batch_correction'
34 dataMatrix '$dataMatrix'
35 sampleMetadata '$sampleMetadata'
36 variableMetadata '$variableMetadata'
37 method '${span_condition.method}'
38 #if str($span_condition.method) == 'linear':
39 span 'none'
40 #else:
41 span '${span_condition.span}'
42 #end if
43 valnull '${span_condition.valnull}'
44 ref_factor '${span_condition.ref_factor}'
45 detail '${span_condition.detail}'
46 #end if
47 dataMatrix_out '$dataMatrix_out' variableMetadata_out '$variableMetadata_out'
48 graph_output '$graph_output' rdata_output '$rdata_output'
49 batch_col_name '$batch_col_name' injection_order_col_name '$injection_order_col_name'
50 sample_type_col_name '$sample_type_col_name'
51 sample_type_tags 'blank=$sampleTypeTagBlank,pool=$sampleTypeTagPool,sample=$sampleTypeTagSample'
52 ]]></command>
53 <!-- Tool form: the three input tables, the sample metadata column names and sample-type tags, then the regression method with its method-specific options. The span fields carry min/max bounds so out-of-range values are rejected in the form; min is inclusive, so a span of exactly 0 is still accepted by the form even where the help text requires a strictly positive value. -->
54 <inputs>
55 <param name="dataMatrix" label="Data Matrix file " format="tabular" type="data" />
56 <param name="sampleMetadata" label="Sample metadata file " format="tabular" type="data" help="must contain at least the three following columns: 'batch' + 'injectionOrder' + 'sampleType'"/>
57 <param name="variableMetadata" label="Variable metadata file " format="tabular" type="data" />
58 <!-- Column names and sample-type tags; all six values are forwarded to the wrapper on the command line. -->
59 <param name="batch_col_name" label="Batch column name" type="text" size="64" value="batch" help="The name of the column containing the batch values."/>
60 <param name="injection_order_col_name" label="Injection order column name" type="text" size="64" value="injectionOrder" help="The name of the column containing the injection order values."/>
61 <param name="sample_type_col_name" label="Sample type column name" type="text" size="64" value="sampleType" help="The name of the column containing the sample type values."/>
62 <param name="sampleTypeTagPool" type="text" size="64" value="pool" label="Set the name used to tag samples as pool."/>
63 <param name="sampleTypeTagBlank" type="text" size="64" value="blank" label="Set the name used to tag samples as blank."/>
64 <param name="sampleTypeTagSample" type="text" size="64" value="sample" label="Set the name used to tag samples as real sample."/>
65 <!-- Method selector: "linear" has no span; "lowess" and "loess" add a span; the "all_loess_*" methods expose a span only. -->
66 <conditional name="span_condition">
67 <param name="method" label="Type of regression model " type="select" help="To select between linear or non-linear (lowess or loess) methods to be used in Van der Kloet algorithm ; when using loess, you can choose to use pools or samples to model batch effect.">
68 <option value="linear">linear</option>
69 <option value="lowess">lowess</option>
70 <option value="loess">loess</option>
71 <option value="all_loess_pool">all loess pool</option>
72 <option value="all_loess_sample">all loess sample</option>
73 </param>
74 <when value="linear">
75 <param name="valnull" label="Null values" type="select" display="radio" help="What to do with generated negative or infinite values">
76 <option value="0">consider it as a null intensity</option>
77 <option value="NA">consider it as a missing value</option>
78 </param>
79 <param name="ref_factor" label="Factor of interest " type="text" value="batch" help="column name of factor of interest (often a biological factor); if none, leave 'batch'" />
80 <param name="detail" label="Level of details for plots " type="select" help="Amount of plots in the pdf file output. See Help section for more details.">
81 <option value="no">basic</option>
82 <option value="plot">standard</option>
83 <option value="reg">complete</option>
84 </param>
85 </when>
86 <when value="lowess">
87 <param name="span" type="float" value="0.85" min="0" max="1" label="span" help="This is an advanced option. Must be less than or equal to 1"/>
88 <param name="valnull" label="Null values" type="select" display="radio" help="What to do with generated negative or infinite values">
89 <option value="0">consider it as a null intensity</option>
90 <option value="NA">consider it as a missing value</option>
91 </param>
92 <param name="ref_factor" label="Factor of interest " type="text" value="batch" help="column name of factor of interest (often a biological factor); if none, leave 'batch'" />
93 <param name="detail" label="Level of details for plots " type="select" help="Amount of plots in the pdf file output. See Help section for more details.">
94 <option value="no">basic</option>
95 <option value="plot">standard</option>
96 <option value="reg">complete</option>
97 </param>
98 </when>
99 <when value="loess">
100 <param name="span" type="float" value="1" min="0" label="span" help="This is an advanced option. Must be strictly greater than 0"/>
101 <param name="valnull" label="Null values" type="select" display="radio" help="What to do with generated negative or infinite values">
102 <option value="0">consider it as a null intensity</option>
103 <option value="NA">consider it as a missing value</option>
104 </param>
105 <param name="ref_factor" label="Factor of interest " type="text" value="batch" help="column name of factor of interest (often a biological factor); if none, leave 'batch'" />
106 <param name="detail" label="Level of details for plots " type="select" help="Amount of plots in the pdf file output. See Help section for more details.">
107 <option value="no">basic</option>
108 <option value="plot">standard</option>
109 <option value="reg">complete</option>
110 </param>
111 </when>
112 <when value="all_loess_pool">
113 <param name="span" type="float" value="1" min="0" label="span" help="smoothing parameter; must be > 0"/>
114 </when>
115 <when value="all_loess_sample">
116 <param name="span" type="float" value="1" min="0" label="span" help="smoothing parameter; must be > 0"/>
117 </when>
118 </conditional>
119 </inputs>
120 <!-- Result datasets: corrected data matrix, variable metadata, PDF figure and RData dump. Every label embeds the tool name and the selected regression method. -->
121 <outputs>
122 <data name="dataMatrix_out" format="tabular" label="${tool.name}_${span_condition.method}_${dataMatrix.name}"/>
123 <data name="variableMetadata_out" format="tabular" label="${tool.name}_${span_condition.method}_${variableMetadata.name}"/>
124 <data name="graph_output" format="pdf" label="${tool.name}_${span_condition.method}_graph"/>
125 <data name="rdata_output" format="rdata" label="${tool.name}_${span_condition.method}_rdata"/>
126 </outputs>
127 <!-- Functional tests for the "all_loess_pool" method: default column names and tags, then custom sample-type column and tags; both expect the same corrected data matrix. Parameters nested in the span_condition conditional are addressed with the "span_condition|name" syntax so they resolve unambiguously. -->
128 <tests>
129 <test>
130 <param name="dataMatrix" value="input-batchcorrection-dataMatrix.tsv"/>
131 <param name="sampleMetadata" value="input-batchcorrection-sampleMetadata.tsv"/>
132 <param name="variableMetadata" value="input-batchcorrection-variableMetadata.tsv"/>
133 <param name="span_condition|method" value="all_loess_pool"/>
134 <param name="span_condition|span" value="1"/>
135 <output name="dataMatrix_out" file="output-batchcorrection-dataMatrix.tsv"/>
136 </test>
137 <test>
138 <param name="dataMatrix" value="input-batchcorrection-dataMatrix.tsv"/>
139 <param name="sampleMetadata" value="input-batchcorrection-sampleMetadata-customSampleType.tsv"/>
140 <param name="variableMetadata" value="input-batchcorrection-variableMetadata.tsv"/>
141 <param name="span_condition|method" value="all_loess_pool"/>
142 <param name="span_condition|span" value="1"/>
143 <param name="sample_type_col_name" value="MySampType"/>
144 <param name="sampleTypeTagPool" value="lot"/>
145 <param name="sampleTypeTagBlank" value="blanc"/>
146 <param name="sampleTypeTagSample" value="echant"/>
147 <output name="dataMatrix_out" file="output-batchcorrection-dataMatrix.tsv"/>
148 </test>
149 </tests>
150
151
152 <help>
153
154 .. class:: infomark
155
156 **Authors**
157 | Jean-Francois Martin - PF MetaToul-AXIOM ; INRA ; MetaboHUB (for original version of this tool and overall development of the R script)
158 | Melanie Petera - PFEM ; INRA ; MetaboHUB (for R wrapper and R script improvement)
159 | Marion Landi - FLAME ; PFEM ; INRA ; MetaboHUB (for xml interface and R wrapper)
160 | Franck Giacomoni - PFEM ; INRA ; MetaboHUB (for xml interface and R wrapper)
161 | Etienne Thevenot - LIST/LADIS ; CEA ; MetaboHUB (for R script and wrapper regarding "all loess pool" and "all loess sample" methods)
162
163 ---------------------------------------------------
164
165 .. class:: infomark
166
167 **Please cite** If you use this tool, please cite:
168
169 when using the **linear**, **lowess** or **loess** methods:
170 | `F.M. Van Der Kloet, I. Bobeldijk, E.R. Verheij, R.H. Jellema. (2009). "Analytical error reduction using single point calibration for accurate and precise metabolomic phenotyping." Journal of Proteome Research p5132-5141 &lt;http://www.ncbi.nlm.nih.gov/pubmed/19754161&gt;`_
171
172 when using the **all loess pool** or **all loess sample** method:
173 | `Dunn et al (2011). Procedures for large-scale metabolic profiling of serum and plasma using gas chromatography and liquid chromatography coupled to mass spectrometry. Nature Protocols, 6:1060-1083 &lt;http://dx.doi.org/10.1038/nprot.2011.335&gt;`_
174 | Cleveland et al (1997). In Statistical Models in S; Chambers JM. and Hastie TJ. Ed.; Chapman et Hall: London; pp. 309-376
175 | Etienne A. Thevenot, Aurelie Roux, Ying Xu, Eric Ezan, and Christophe Junot (2015). Analysis of the human adult urinary metabolome variations with age, body mass index and gender by implementing a comprehensive workflow for univariate and OPLS statistical analyses. *Journal of Proteome Research*, **14**:3322-3335 (http://dx.doi.org/10.1021/acs.jproteome.5b00354).
176
177 ---------------------------------------------------
178
179 .. class:: infomark
180
181 **Tool updates**
182
183 See the **NEWS** section at the bottom of this page
184
185 ---------------------------------------------------
186
187 ================
188 Batch_correction
189 ================
190
191 -----------
192 Description
193 -----------
194
195 | **Instrumental drift** and **offset differences** between batches have been described in **LC-MS** experiments when the number of samples is large and/or multiple batches of acquisition are needed.
196 | Recently a normalization strategy relying on the measurements of a **pooled** (or QC) sample injected periodically has been described: for each variable, a **regression model** is fitted to the values of the **pool** and subsequently used to adjust the intensities of the samples of interest (van der Kloet et al, 2009; Dunn et al, 2011).
197 |
198 | The current tool implements **two strategies** which differ in the way the regression model is applied to the variables (either depending on variable quality metrics, or 'loess' model for all variables) and also in the generated figure.
199 |
200
201
202 -----------------
203 Workflow position
204 -----------------
205
206 .. image:: batch_correction.png
207 :width: 800
208
209
210 -----------
211 Input files
212 -----------
213
214 +----------------------------+------------+
215 | Parameter : num + label | Format |
216 +============================+============+
217 | 1 : Data Matrix file | tabular |
218 +----------------------------+------------+
219 | 2 : Sample metadata file | tabular |
220 +----------------------------+------------+
221 | 3 : Variable metadata file | tabular |
222 +----------------------------+------------+
223
224
225 Data Matrix file must contain the intensity values of variables.
226 | First line must contain all the samples' names
227 | First column must contain all the variables' ID
228 |
229
230 Sample metadata file must contain at least the three following columns:
231 | "batch" to identify the batches of analyses
232 | "injectionOrder" (integers) defining the injection order of all samples (QC-pools as well as analysed samples)
233 | "sampleType" indicating whether each observation is a sample ("sample") or a QC-pool ("pool"); each batch needs
234 | at least 3 QC-pools for intra-batch linear adjustment and 8 for lo(w)ess adjustment (5 for **all loess** methods)
235
236
237 .. class:: warningmark
238
239 MISSING DATA are allowed only with the **all loess** methods
240
241
242 ----------
243 Parameters
244 ----------
245
246 Type of regression model
247 | To choose between *linear*, *lowess*, *loess*, *all loess pool*, and *all loess sample* strategies
248 | **- Option 1** (**linear**, **lowess**, and **loess** methods): before the normalisation of each variable, some quality metrics are computed (see the "Determine Batch Correction" module); depending on the result, the variable can be normalized or not, with either the **linear**, **lowess** or **loess** model.
249 | **- Option 2** (**all loess pool** and **all loess sample**): each variable is normalized by using the 'loess' model;
250 | in the case **all loess pool** is chosen and the number of pool observations is below 5, the linear method is used (for all variables) and a warning is generated;
251 | if the pool intensities are not representative of the samples (which can be viewed on the figure where both trends are shown), the case **all loess sample** enables using the sample intensities (instead of the pool intensities) as the reference for the loess curve.
252 | In all "option 2" cases: the **median intensity of the reference observations** (either 'pool' or 'sample') is used as the scaling factor after the initial intensities have been divided by the loess predictions.
253 |
254
255 Span
256 | Smoothing parameter, advanced option for *lo(w)ess* and *all loess* methods
257 | In case of a loess fit, the **span** parameter (between 0 and 1) controls the smoothing
258 | (the higher the smoother; higher values are preferred to avoid overfitting; Cleveland et al, 1997).
259 |
260
261 Null values
262 | available for regression model *linear*, *lowess* and *loess*
263 | Controls what is done regarding negative or infinite values that can be generated during regression estimation.
264 | *consider it as a missing value* will switch concerned intensities to NA;
265 | this option implies that concerned ions will not be considered in PCA display.
266 | *consider it as a null intensity* will switch concerned intensities to 0 for lo(w)ess
267 | or correct them by the batch mean instead of regression estimate for linear.
268 |
269
270 Factor of interest
271 | available for regression model *linear*, *lowess* and *loess*
272 | Name of the factor (column header) in Sample metadata file that will be used as a categorical variable for plots and PCA.
273 | (often a biological factor ; if none, leave "batch")
274 | This factor does not affect correction calculation.
275 |
276
277 Level of details for plots
278 | available for regression model *linear*, *lowess* and *loess*
279 | *basic*: PCA + CV boxplot (before and after correction)
280 | *standard*: 'basic' plots + before/after-correction plots of intensities over injection order, and design effects for each ion
281 | *complete*: 'standard' plots + QC-pool regression plots per batch with samples' intensities over injection order
282 | This parameter is not used by the *all loess* methods, for which a single figure is generated showing the sum of intensities along injection order and the first 4 PCA scores.
283 |
284
285
286 ------------
287 Output files
288 ------------
289
290 Batch_correction_$method_rdata.rdata
291 | binary data
292 | Download, open R and use the 'load' function; objects are in the 'res' list
293 |
294
295 Batch_correction_$method_graph.pdf
296 | graphical output
297 | For the *linear* and *lo(w)ess* methods, content depends on level of details chosen
298 |
299
300 Batch_correction_$method_variableMetadata.tabular
301 | tsv output
302 | Identical to the Variable metadata input file, with x more columns (where x is the number of batches) in case of *linear*, *lowess* and *loess* methods
303 |
304
305 Batch_correction_$method_dataMatrix.tabular
306 | tsv output (tabulated)
307 | Same formatting as Data Matrix file; contains corrected intensities
308 |
309
310
311 ---------------------------------------------------
312
313 ---------------
314 Working example
315 ---------------
316
317 .. class:: infomark
318
319 Refer to the corresponding "W4M HowTo" page:
320 | `MS data processing - Filters and normalisation &lt;http://workflow4metabolomics.org/sites/workflow4metabolomics.org/files/files/w4e-2016-data_processing.pdf&gt;`_
321 |
322 |
323
324 See also the reference history:
325 | `W4M00001_Sacurine-statistics (DOI:10.15454/1.4811121736910142E12) &lt;http://dx.doi.org/10.15454/1.4811121736910142E12&gt;`_
326 |
327
328 ---------------------------------------------------
329
330 ----
331 NEWS
332 ----
333
334 CHANGES IN VERSION 2.2.4
335 ========================
336
337 INTERNAL MODIFICATIONS
338
339 Fixed bug for pool selection ("all_loess" methods)
340
341 CHANGES IN VERSION 2.2.2
342 ========================
343
344 INTERNAL MODIFICATIONS
345
346 Fixed bug for color plot ("all_loess" methods)
347
348 CHANGES IN VERSION 2.2.0
349 ========================
350
351 NEW FEATURE
352
353 Specific names for the 'sampleType', 'injectionOrder', and 'batch' from sampleMetadata can be selected by the user (for compatibility with the MTBLS downloader)
354
355 CHANGES IN VERSION 2.1.2
356 ========================
357
358 INTERNAL MODIFICATIONS
359
360 Minor modifications in config file
361
362 CHANGES IN VERSION 2.1.0
363 ========================
364
365 INTERNAL MODIFICATIONS
366
367 For PCA figure display only (**all_loess** options): missing values are set to the minimum value before PCA computation is performed (with svd)
368
369 Additional running and installation tests added with planemo, conda, and travis
370
371 BUG FIX
372
373 Variables with NA or 0 values in all reference samples are discarded before applying the **all_loess** normalization
374
375 INTERNAL MODIFICATIONS
376
377 Modifications of the **all_loess_wrapper** file to handle the recent **ropls** package versions (i.e. 1.3.15 and above) which use S4 classes
378
379 </help>
380 <!-- [RECOMMENDED] All citations associated to this tool (main citation given above and other references). Can be extracted from the history panel -->
381 <citations>
382 <!-- [HELP] As DOI or BibTex entry; the Cleveland reference is a book chapter, hence an @INCOLLECTION entry with the book title in "booktitle" -->
383 <citation type="doi">10.1021/pr900499r</citation>
384 <citation type="doi">10.1038/nprot.2011.335</citation>
385 <citation type="bibtex">@INCOLLECTION{Cleveland91,
386 author = {Cleveland, W. S. and Grosse, E. and Shyu, W. M.},
387 year = {1991},
388 booktitle = {Statistical Models in S},
389 title = {Local Regression Models},
390 pages = {309-376},
391 editor = {Chambers, J. M. and Hastie, T. J.},
392 publisher = {Chapman et Hall: London},
393 chapter = {8}
394 }</citation>
395 <citation type="doi">10.1021/acs.jproteome.5b00354</citation>
396 </citations>
397
398
399 </tool>