Mercurial > repos > recetox > waveica
changeset 4:8b55efc7d117 draft
"planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/waveica commit aa8206a01efc1813f2586584782f28a73a17fe86"
| author | recetox |
|---|---|
| date | Mon, 10 Jan 2022 16:01:35 +0000 |
| parents | e3726251a055 |
| children | fba892edb9d9 |
| files | test-data/input_data_nobatch.csv test-data/normalized_data_nobatch.tsv waveica.xml waveica_macros.xml waveica_wrapper.R |
| diffstat | 5 files changed, 222 insertions(+), 79 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/input_data_nobatch.csv Mon Jan 10 16:01:35 2022 +0000 @@ -0,0 +1,6 @@ +sampleName,class,sampleType,injectionOrder,M85T34,M86T41,M86T518,M86T539 +VT_160120_002,sample,sample,1,228520.06430737,35646729.21543971,2386896.97966461,1026645.83653468 +VT_160120_004,sample,sample,2,90217.384387202,35735702.457215995,2456290.69621518,1089246.46040563 +VT_160120_006,sample,sample,3,235656.75288383896,37021134.452711605,8873450.40260241,837856.449608585 +VT_160120_008,sample,sample,4,16622.9351783435,44302499.262606,2466946.89667101,994979.069689685 +VT_160120_010,sample,sample,5,62385.0742465736,44639738.0735709,2389372.85729467,954938.131337246
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/normalized_data_nobatch.tsv Mon Jan 10 16:01:35 2022 +0000 @@ -0,0 +1,6 @@ +sampleName class sampleType injectionOrder M85T34 M86T41 M86T518 M86T539 +VT_160120_002 sample sample 1 -9795801.68327503 29546678.5668331 -6207890.55898611 -8941748.93596051 +VT_160120_004 sample sample 2 -9798910.7423992 29543569.507709 -6210999.61811028 -8944857.99508468 +VT_160120_006 sample sample 3 -9797307.93142165 29545172.3186865 -6209396.80713273 -8943255.18410713 +VT_160120_008 sample sample 4 -9793706.69204905 29548773.5580591 -6205795.56776013 -8939653.94473453 +VT_160120_010 sample sample 5 -9800711.45464488 29541768.7954633 -6212800.33035596 -8946658.70733036
--- a/waveica.xml Fri Nov 12 23:22:14 2021 +0000 +++ b/waveica.xml Mon Jan 10 16:01:35 2022 +0000 @@ -1,96 +1,67 @@ -<tool id="waveica" name="WaveICA" version="0.1.0+galaxy3" python_template_version="3.5"> - +<tool id="waveica" name="WaveICA" version="@TOOL_VERSION@+galaxy0" python_template_version="3.5"> <description>removal of batch effects for untargeted metabolomics data</description> + <macros> + <import>waveica_macros.xml</import> + </macros> + <expand macro="creator" /> <requirements> - <requirement type="package" version="0.1.0">r-recetox-waveica</requirement> + <requirement type="package" version="@TOOL_VERSION@">r-recetox-waveica</requirement> </requirements> - <command detect_errors="aggressive"><![CDATA[ Rscript -e 'source("${__tool_directory__}/waveica_wrapper.R")' + #if $batch_correction.mode == "batchwise": -e 'normalized_data <- waveica( data = "$data", wavelet_filter = "$wf.wavelet_filter", wavelet_length = "$wf.wavelet_length", k = $k, - t = $t, - t2 = $t2, + t = $batch_correction.t, + t2 = $batch_correction.t2, alpha = $alpha, exclude_blanks = $exclude_blanks )' + #else if $batch_correction.mode == "single_batch": + -e 'normalized_data <- waveica_singlebatch( + data = "$data", + wavelet_filter = "$wf.wavelet_filter", + wavelet_length = "$wf.wavelet_length", + k = $k, + alpha = $alpha, + cutoff = $batch_correction.cutoff, + exclude_blanks = $exclude_blanks + )' + #end if -e 'store_data(normalized_data,"$normalized_data")' ]]></command> <inputs> - <param type="data" name="data" label="Feature table" format="csv" help=""/> - <conditional name="wf"> - <param type="select" name="wavelet_filter" label="Wavelet transform filter" help="wavelet function and filter length [1] (see footnotes for more details)"> - <option value="d" selected="True">Daubechies</option> - <option value="la" >Least Asymetric</option> - <option value="bl" >Best Localized</option> - <option value="c" >Coiflet</option> + <expand macro="general_parameters" /> + <expand macro="wf" /> + <conditional name="batch_correction"> + <param name="mode" type="select" label="Batch correction mode" help="'multiple batches' takes into account inter- and intrabatch intensity drift; 'single batch' relies only on the injection order of the samples and requires no batch information [2]"> + <option value="batchwise" selected="true">Multiple batches</option> + <option value="single_batch">Single batch (or no batch information)</option> </param> - <when value="d"> - <param name="wavelet_length" type="select" label="filter length"> - <option value="2" selected="True">2</option> - <option value="4">4</option> - <option value="6">6</option> - <option value="8">8</option> - <option value="10">10</option> - <option value="12">12</option> - <option value="14">14</option> - <option value="16">16</option> - <option value="18">18</option> - <option value="20">20</option> - </param> - </when> - <when value="la"> - <param name="wavelet_length" type="select" label="filter length"> - <option value="8">8</option> - <option value="10">10</option> - <option value="12">12</option> - <option value="14">14</option> - <option value="16">16</option> - <option value="18">18</option> - <option value="20">20</option> - </param> - </when> - <when value="bl"> - <param name="wavelet_length" type="select" label="filter length"> - <option value="14">14</option> - <option value="18">18</option> - <option value="20">20</option> - </param> - </when> - <when value="c"> - <param name="wavelet_length" type="select" label="filter length"> - <option value="6">6</option> - <option value="12">12</option> - <option value="18">18</option> - <option value="24">24</option> - <option value="30">30</option> - </param> + <when value="batchwise"> + <expand macro="batchwise_parameters" /> + </when> + <when value="single_batch"> + <expand macro="singlebatch_parameters" /> </when> </conditional> - <param type="integer" value="20" name="k" label="Number of components to decompose" help="maximal component that ICA decomposes"/> - <param type="float" value="0.05" name="t" label="Batch-association threshold" help="threshold to consider a component associate with the batch, - should be between 0 and 1"/> - <param type="float" value="0.05" name="t2" label="Group-association threshold" help="threshold to consider a component associate with the group, - should be between 0 and 1"/> - <param type="float" value="0" name="alpha" label="Alpha" help="trade-off value between the independence of samples (temporal ICA) and variables (spatial ICA), should be between 0 and 1"/> - <param name="exclude_blanks" type="boolean" checked="false" truevalue="TRUE" falsevalue="FALSE" label="Remove blanks" help="Excludes blank samples from the output" /> + <expand macro="exclude_blanks" /> </inputs> - <outputs> - <data name="normalized_data" format="tsv" /> - </outputs> + <expand macro="outputs" /> <tests> <test> <param name="data" value="input_data.csv" ftype="csv" /> + <param name="mode" value="batchwise" /> <param name="wavelet_filter" value="d" /> <param name="filter_length" value="2" /> <param name="k" value="20" /> @@ -99,6 +70,16 @@ <param name="alpha" value="0" /> <output name="normalized_data" file="normalized_data.tsv" /> </test> + <test> + <param name="data" value="input_data_nobatch.csv" ftype="csv" /> + <param name="mode" value="single_batch" /> + <param name="wavelet_filter" value="d" /> + <param name="filter_length" value="2" /> + <param name="k" value="20" /> + <param name="alpha" value="0" /> + <param name="cutoff" value="0" /> + <output name="normalized_data" file="normalized_data_nobatch.tsv" /> + </test> <test expect_failure="true"> <param name="data" value="na_data.csv" ftype="csv" /> </test> @@ -110,7 +91,8 @@ <help><![CDATA[ **Description** - Removal of batch effects for large-scale untargeted metabolomics data based on wavelet analysis. The WaveICA R package provides a new algorithm to removing batch effects for metabolomics data. + Removal of batch effects for large-scale untargeted metabolomics data based on wavelet analysis and independent component analysis. + The WaveICA method uses the time trend of samples over the injection order, decomposes the original data into new multi-scale features, extracts and removes the batch effect resulting in normalized intensities across samples. The input is an intensity-by-feature table with metadata in the following format: @@ -125,18 +107,24 @@ +---------------+--------+------------+----------------+-------+------------+--------------+-------------+-------------+-----+ + + The required columns are **sampleName**, **class**, **sampleType**, **injectionOrder**, and the **features** that you want to normalize. + + The **batch** column is required if batch correction mode is **Multiple batches** and optional otherwise. + + The presence of any additional columns (except features) will result in incorrect batch correction or job failure. + the input table must not contain missing values. Missing intensities must be filled with 0. + **sampleType** column accepts three possible values: [QC, sample, blank] (case insensitive). + **class** column is used to denote a biological group of a sample (e.g., positive/negative species). The column accepts any values. + the **output** is the same table with corrected feature intensities. .. rubric:: **Footnotes** - .. [1] for details on wavelet filter parameters refer to R `wavelets::wt.filter <https://www.rdocumentation.org/packages/wavelets/versions/0.3-0.2/topics/wt.filter>`_; + .. [1] for details on wavelet-filter parameters refer to R `wavelets::wt.filter <https://www.rdocumentation.org/packages/wavelets/versions/0.3-0.2/topics/wt.filter>`_; + .. [2] when using 'Multiple batches', please cite the WaveICA (2019) paper; else, cite WaveICA 2.0 (2021) paper; + ]]></help> <citations> <citation type="doi">10.1016/j.aca.2019.02.010</citation> + <citation type="doi">10.1007/s11306-021-01839-7</citation> </citations> </tool> \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/waveica_macros.xml Mon Jan 10 16:01:35 2022 +0000 @@ -0,0 +1,85 @@ +<macros> + <token name="@TOOL_VERSION@">0.2.0</token> + + <xml name="creator"> + <creator> + <organization + url="https://www.recetox.muni.cz" + name="RECETOX MUNI" /> + </creator> + </xml> + <xml name="general_parameters"> + <param type="data" name="data" label="Feature table" format="csv" help=""/> + <param type="integer" value="20" name="k" label="Number of components to decompose" help="maximal component that ICA decomposes"/> + <param type="float" value="0" name="alpha" label="Alpha" help="trade-off value between the independence of samples (temporal ICA) and variables (spatial ICA), should be between 0 and 1"/> + </xml> + <xml name="batchwise_parameters"> + <param type="float" value="0.05" name="t" label="Batch-association threshold" help="threshold to consider a component associate with the batch, + should be between 0 and 1"/> + <param type="float" value="0.05" name="t2" label="Group-association threshold" help="threshold to consider a component associate with the group, + should be between 0 and 1"/> + </xml> + <xml name="singlebatch_parameters"> + <param type="float" value="0" name="cutoff" label="Cutoff" help="threshold of the variation explained by the injection order for independent components, should be between 0 and 1"/> + </xml> + <xml name="exclude_blanks"> + <param name="exclude_blanks" type="boolean" checked="false" truevalue="TRUE" falsevalue="FALSE" label="Remove blanks" help="Excludes blank samples from the output" /> + </xml> + <xml name="wf"> + <conditional name="wf"> + <param type="select" name="wavelet_filter" label="Wavelet transform filter" help="wavelet function and filter length [1] (see footnotes for more details)"> + <option value="d" selected="True">Daubechies</option> + <option value="la" >Least Asymetric</option> + <option value="bl" >Best Localized</option> + <option value="c" >Coiflet</option> + </param> + <when value="d"> + <param name="wavelet_length" type="select" label="filter length"> + <option value="2" selected="True">2</option> + <option value="4">4</option> + <option value="6">6</option> + <option value="8">8</option> + <option value="10">10</option> + <option value="12">12</option> + <option value="14">14</option> + <option value="16">16</option> + <option value="18">18</option> + <option value="20">20</option> + </param> + </when> + <when value="la"> + <param name="wavelet_length" type="select" label="filter length"> + <option value="8">8</option> + <option value="10">10</option> + <option value="12">12</option> + <option value="14">14</option> + <option value="16">16</option> + <option value="18">18</option> + <option value="20">20</option> + </param> + </when> + <when value="bl"> + <param name="wavelet_length" type="select" label="filter length"> + <option value="14">14</option> + <option value="18">18</option> + <option value="20">20</option> + </param> + </when> + <when value="c"> + <param name="wavelet_length" type="select" label="filter length"> + <option value="6">6</option> + <option value="12">12</option> + <option value="18">18</option> + <option value="24">24</option> + <option value="30">30</option> + </param> + </when> + </conditional> + </xml> + + <xml name="outputs"> + <outputs> + <data name="normalized_data" format="tsv" /> + </outputs> + </xml> +</macros> \ No newline at end of file
--- a/waveica_wrapper.R Fri Nov 12 23:22:14 2021 +0000 +++ b/waveica_wrapper.R Mon Jan 10 16:01:35 2022 +0000 @@ -11,19 +11,9 @@ data <- read.csv(data, header = TRUE) required_columns <- c("sampleName", "class", "sampleType", "injectionOrder", "batch") - if (anyNA(data)) { - stop("Error: dataframe cannot contain NULL values! -Make sure that your dataframe does not contain empty cells") - } else if (!all(required_columns %in% colnames(data))) { - stop("Error: missing metadata! -Make sure that the following columns are present in your dataframe: [sampleName, class, sampleType, injectionOrder, batch]") - } + verify_input_dataframe(data, required_columns) - # sort data by injection order - data <- data[order(data[, "batch"], - data[, "injectionOrder"], - decreasing = FALSE - ), ] + data <- sort_by_injection_order(data) # separate data into features, batch and group feature_columns <- colnames(data)[!colnames(data) %in% required_columns] @@ -32,7 +22,7 @@ batch <- data$batch # run WaveICA - features <- WaveICA::WaveICA( + features <- recetox.waveica::waveica( data = features, wf = get_wf(wavelet_filter, wavelet_length), batch = batch, @@ -43,7 +33,7 @@ alpha = alpha ) - data[, feature_columns] <- features$data_wave + data[, feature_columns] <- features # remove blanks from dataset if (exclude_blanks) { @@ -54,6 +44,74 @@ } +waveica_singlebatch <- function(data, + wavelet_filter, + wavelet_length, + k, + alpha, + cutoff, + exclude_blanks) { + + # get input from the Galaxy, preprocess data + data <- read.csv(data, header = TRUE) + + required_columns <- c("sampleName", "class", "sampleType", "injectionOrder") + optional_columns <- c("batch") + verify_input_dataframe(data, required_columns) + + data <- sort_by_injection_order(data) + + feature_columns <- colnames(data)[!colnames(data) %in% c(required_columns, optional_columns)] + features <- data[, feature_columns] + injection_order <- data$injectionOrder + + # run WaveICA + features <- recetox.waveica::waveica_nonbatchwise( + data = features, + wf = get_wf(wavelet_filter, wavelet_length), + injection_order = injection_order, + K = k, + alpha = alpha, + cutoff = cutoff + ) + + data[, feature_columns] <- features + + # remove blanks from dataset + if (exclude_blanks) { + data <- exclude_group(data, group) + } + + return(data) +} + + +sort_by_injection_order <- function(data) { + if ("batch" %in% colnames(data)) { + data <- data[order(data[, "batch"], + data[, "injectionOrder"], + decreasing = FALSE + ), ] + } else { + data <- data[order(data[, "injectionOrder"], + decreasing = FALSE + ), ] + } + return(data) +} + + +verify_input_dataframe <- function(data, required_columns) { + if (anyNA(data)) { + stop("Error: dataframe cannot contain NULL values! +Make sure that your dataframe does not contain empty cells") + } else if (!all(required_columns %in% colnames(data))) { + stop("Error: missing metadata! +Make sure that the following columns are present in your dataframe: ", paste(required_columns, collapse = ", ")) + } +} + + # Match group labels with [blank/sample/qc] and enumerate them enumerate_groups <- function(group) { group[grepl("blank", tolower(group))] <- 0
