Mercurial > repos > recetox > waveica
changeset 2:d08deef1eb44 draft
"planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/waveica commit e33ef984e78721ed37d825c6672795a539a461e1"
| author | recetox |
|---|---|
| date | Fri, 12 Nov 2021 09:14:04 +0000 |
| parents | 2bcfd5b450bb |
| children | e3726251a055 |
| files | test-data/features-normalized.tsv test-data/features-test.csv test-data/incomplete_metadata_data.csv test-data/input_data.csv test-data/na_data.csv test-data/normalized_data.tsv waveica.xml waveica_wrapper.R |
| diffstat | 8 files changed, 184 insertions(+), 155 deletions(-) [+] |
line wrap: on
line diff
--- a/test-data/features-normalized.tsv Wed Jul 28 11:58:20 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,6 +0,0 @@ -"" "data_wave.M85T34" "data_wave.M86T41" "data_wave.M86T518" "data_wave.M86T539" -"VT_160120_002" 355200.506508035 75115889.9077485 6101488.54615418 2007379.02604984 -"VT_160120_004" 216897.826587868 75204863.1495248 6170882.26270475 2069979.64992079 -"VT_160120_006" 362337.195084504 76490295.1450204 12588041.969092 1818589.63912375 -"VT_160120_008" 143303.377379009 83771659.9549148 6181538.46316058 1975712.25920485 -"VT_160120_010" 189065.516447239 84108898.7658797 6103964.42378424 1935671.32085241
--- a/test-data/features-test.csv Wed Jul 28 11:58:20 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,6 +0,0 @@ -sampleName,class,sampleType,injectionOrder,batch,M85T34,M86T41,M86T518,M86T539 -VT_160120_002,sample,sample,1,1,228520.06430737,35646729.21543971,2386896.97966461,1026645.83653468 -VT_160120_004,sample,sample,2,1,90217.384387202,35735702.457215995,2456290.69621518,1089246.46040563 -VT_160120_006,sample,sample,3,1,235656.75288383896,37021134.452711605,8873450.40260241,837856.449608585 -VT_160120_008,sample,sample,4,1,16622.9351783435,44302499.262606,2466946.89667101,994979.069689685 -VT_160120_010,sample,sample,5,1,62385.0742465736,44639738.0735709,2389372.85729467,954938.131337246
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/incomplete_metadata_data.csv Fri Nov 12 09:14:04 2021 +0000 @@ -0,0 +1,6 @@ +sampleName,class,sampleType,batch,M85T34,M86T41,M86T518,M86T539 +VT_160120_002,sample,sample,1,228520.06430737,35646729.2154397,2386896.97966461,1026645.83653468 +VT_160120_004,sample,sample,1,90217.384387202,35735702.457216,2456290.69621518,1089246.46040563 +VT_160120_006,sample,sample,1,235656.752883839,37021134.4527116,8873450.40260241,837856.449608585 +VT_160120_008,sample,sample,1,16622.9351783435,44302499.262606,2466946.89667101,994979.069689685 +VT_160120_010,sample,sample,1,62385.0742465736,44639738.0735709,2389372.85729467,954938.131337246
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/input_data.csv Fri Nov 12 09:14:04 2021 +0000 @@ -0,0 +1,6 @@ +sampleName,class,sampleType,injectionOrder,batch,M85T34,M86T41,M86T518,M86T539 +VT_160120_002,sample,sample,1,1,228520.06430737,35646729.21543971,2386896.97966461,1026645.83653468 +VT_160120_004,sample,sample,2,1,90217.384387202,35735702.457215995,2456290.69621518,1089246.46040563 +VT_160120_006,sample,sample,3,1,235656.75288383896,37021134.452711605,8873450.40260241,837856.449608585 +VT_160120_008,sample,sample,4,1,16622.9351783435,44302499.262606,2466946.89667101,994979.069689685 +VT_160120_010,sample,sample,5,1,62385.0742465736,44639738.0735709,2389372.85729467,954938.131337246
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/na_data.csv Fri Nov 12 09:14:04 2021 +0000 @@ -0,0 +1,6 @@ +sampleName,class,sampleType,injectionOrder,batch,M85T34,M86T41,M86T518,M86T539 +VT_160120_002,sample,sample,1,1,NA,35646729.2154397,2386896.97966461,1026645.83653468 +VT_160120_004,sample,sample,2,1,90217.384387202,35735702.457216,2456290.69621518,1089246.46040563 +VT_160120_006,sample,sample,3,1,235656.752883839,37021134.4527116,8873450.40260241,837856.449608585 +VT_160120_008,sample,sample,4,1,16622.9351783435,44302499.262606,2466946.89667101,994979.069689685 +VT_160120_010,sample,sample,5,1,62385.0742465736,44639738.0735709,2389372.85729467,954938.131337246
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/normalized_data.tsv Fri Nov 12 09:14:04 2021 +0000 @@ -0,0 +1,6 @@ +sampleName class sampleType injectionOrder batch M85T34 M86T41 M86T518 M86T539 +VT_160120_002 sample sample 1 1 355200.506508035 75115889.9077485 6101488.54615418 2007379.02604984 +VT_160120_004 sample sample 2 1 216897.826587868 75204863.1495248 6170882.26270475 2069979.64992079 +VT_160120_006 sample sample 3 1 362337.195084504 76490295.1450204 12588041.969092 1818589.63912375 +VT_160120_008 sample sample 4 1 143303.377379009 83771659.9549148 6181538.46316058 1975712.25920485 +VT_160120_010 sample sample 5 1 189065.516447239 84108898.7658797 6103964.42378424 1935671.32085241
--- a/waveica.xml Wed Jul 28 11:58:20 2021 +0000 +++ b/waveica.xml Fri Nov 12 09:14:04 2021 +0000 @@ -1,4 +1,4 @@ -<tool id="waveica" name="WaveICA" version="0.1.0+galaxy1" python_template_version="3.5"> +<tool id="waveica" name="WaveICA" version="0.1.0+galaxy2" python_template_version="3.5"> <description>removal of batch effects for untargeted metabolomics data</description> @@ -12,96 +12,98 @@ -e 'normalized_data <- waveica( data = "$data", - wavelet_filter = "$parameters.wf.wavelet_filter", - wavelet_length = "$parameters.wf.wavelet_length", - k = $parameters.k, - t = $parameters.t, - t2 = $parameters.t2, - alpha = $parameters.alpha, - exclude_blanks = $optional_parameters.exclude_blanks + wavelet_filter = "$wf.wavelet_filter", + wavelet_length = "$wf.wavelet_length", + k = $k, + t = $t, + t2 = $t2, + alpha = $alpha, + exclude_blanks = $exclude_blanks )' - -e 'store_data(normalized_data,"$normalized_features")' + -e 'store_data(normalized_data,"$normalized_data")' ]]></command> <inputs> - <param type="data" name="data" label="Sample-by-matrix data" format="csv" help=""/> - <section name="parameters" title="Normalization Parameters" expanded="True"> - <conditional name="wf"> - <param type="select" name="wavelet_filter" label="Wavelet transform filter" help="selecting wavelet function and filter length"> - <option value="d" selected="True">Daubechies</option> - <option value="la" >Least Asymetric</option> - <option value="bl" >Best Localized</option> - <option value="c" >Coiflet</option> + <param type="data" name="data" label="Feature table" format="csv" help=""/> + <conditional name="wf"> + <param type="select" name="wavelet_filter" label="Wavelet transform filter" help="wavelet function and filter length [1] (see footnotes for more details)"> + <option value="d" selected="True">Daubechies</option> + <option value="la" >Least Asymetric</option> + <option value="bl" >Best Localized</option> + <option value="c" >Coiflet</option> + </param> + <when value="d"> + <param name="wavelet_length" type="select" label="filter length"> + <option value="2" selected="True">2</option> + <option value="4">4</option> + <option value="6">6</option> + <option value="8">8</option> + <option value="10">10</option> + <option value="12">12</option> + <option value="14">14</option> + <option value="16">16</option> + <option value="18">18</option> + <option value="20">20</option> </param> - <when value="d"> - <param name="wavelet_length" type="select" label="filter length"> - <option value="2" selected="True">2</option> - <option value="4">4</option> - <option value="6">6</option> - <option value="8">8</option> - <option value="10">10</option> - <option value="12">12</option> - <option value="14">14</option> - <option value="16">16</option> - <option value="18">18</option> - <option value="20">20</option> - </param> - </when> - <when value="la"> - <param name="wavelet_length" type="select" label="filter length"> - <option value="8">8</option> - <option value="10">10</option> - <option value="12">12</option> - <option value="14">14</option> - <option value="16">16</option> - <option value="18">18</option> - <option value="20">20</option> - </param> - </when> - <when value="bl"> - <param name="wavelet_length" type="select" label="filter length"> - <option value="14">14</option> - <option value="18">18</option> - <option value="20">20</option> - </param> - </when> - <when value="c"> - <param name="wavelet_length" type="select" label="filter length"> - <option value="6">6</option> - <option value="12">12</option> - <option value="18">18</option> - <option value="24">24</option> - <option value="30">30</option> - </param> - </when> - </conditional> - <param type="integer" value="20" name="k" label="Number of components to decompose" help="the maximal component that ICA decomposes"/> - <param type="float" value="0.05" name="t" label="Batch-assosiation threshold" help="the threshold to consider a component associate with the batch, + </when> + <when value="la"> + <param name="wavelet_length" type="select" label="filter length"> + <option value="8">8</option> + <option value="10">10</option> + <option value="12">12</option> + <option value="14">14</option> + <option value="16">16</option> + <option value="18">18</option> + <option value="20">20</option> + </param> + </when> + <when value="bl"> + <param name="wavelet_length" type="select" label="filter length"> + <option value="14">14</option> + <option value="18">18</option> + <option value="20">20</option> + </param> + </when> + <when value="c"> + <param name="wavelet_length" type="select" label="filter length"> + <option value="6">6</option> + <option value="12">12</option> + <option value="18">18</option> + <option value="24">24</option> + <option value="30">30</option> + </param> + </when> + </conditional> + <param type="integer" value="20" name="k" label="Number of components to decompose" help="maximal component that ICA decomposes"/> + <param type="float" value="0.05" name="t" label="Batch-association threshold" help="threshold to consider a component associate with the batch, should be between 0 and 1"/> - <param type="float" value="0.05" name="t2" label="Group-assosiation threshold" help="the threshold to consider a component associate with the group, + <param type="float" value="0.05" name="t2" label="Group-association threshold" help="threshold to consider a component associate with the group, should be between 0 and 1"/> - <param type="float" value="0" name="alpha" label="Alpha" help="the trade-off value between the independence of samples and those of variables and should be between 0 and 1"/> - </section> - <section name="optional_parameters" expanded="true" title="Optional Parameters"> - <param name="exclude_blanks" type="boolean" checked="true" truevalue="TRUE" falsevalue="FALSE" label="Remove blanks" help="Excludes blank samples from the output" /> - </section> + <param type="float" value="0" name="alpha" label="Alpha" help="trade-off value between the independence of samples (temporal ICA) and variables (spatial ICA), should be between 0 and 1"/> + <param name="exclude_blanks" type="boolean" checked="false" truevalue="TRUE" falsevalue="FALSE" label="Remove blanks" help="Excludes blank samples from the output" /> </inputs> <outputs> - <data name="normalized_features" format="tsv" /> + <data name="normalized_data" format="tsv" /> </outputs> <tests> <test> - <param name="data" value="features-test.csv" ftype="csv" /> + <param name="data" value="input_data.csv" ftype="csv" /> <param name="wavelet_filter" value="d" /> <param name="filter_length" value="2" /> <param name="k" value="20" /> <param name="t" value="0.05" /> <param name="t2" value="0.05" /> <param name="alpha" value="0" /> - <output name="normalized_features" file="features-normalized.tsv" /> + <output name="normalized_data" file="normalized_data.tsv" /> + </test> + <test expect_failure="true"> + <param name="data" value="na_data.csv" ftype="csv" /> + </test> + <test expect_failure="true"> + <param name="data" value="incomplete_metadata_data.csv" ftype="csv" /> </test> </tests> @@ -110,19 +112,26 @@ Removal of batch effects for large-scale untargeted metabolomics data based on wavelet analysis. The WaveICA R package provides a new algorithm to removing batch effects for metabolomics data. - The input is Sample-by-matrix table which must include: - - 1. Injection order of samples - - 2. Types of the samples denoted as "blank", "sample" or "QC" + The input is an intensity-by-feature table with metadata in the following format: - 3. Batch numbers - - 4. Features data + +---------------+--------+------------+----------------+-------+------------+--------------+-------------+-------------+-----+ + | sampleName | class | sampleType | injectionOrder | batch | M85T34 | M86T41 | M86T518 | M86T539 | ... | + +===============+========+============+================+=======+============+==============+=============+=============+=====+ + | VT_160120_002 | sample | sample | 1 | 1 | 228520.064 | 35646729.215 | 2386896.979 | 1026645.836 | ... | + +---------------+--------+------------+----------------+-------+------------+--------------+-------------+-------------+-----+ + | QC1 | sample | QC | 2 | 1 | 90217.384 | 35735702.457 | 2456290.696 | 1089246.460 | ... | + +---------------+--------+------------+----------------+-------+------------+--------------+-------------+-------------+-----+ + | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | + +---------------+--------+------------+----------------+-------+------------+--------------+-------------+-------------+-----+ - **Documentation** - See original repository for further information: https://github.com/dengkuistat/WaveICA + + the input table must not contain missing values. Missing intensities must be filled with 0. + + **sampleType** column accepts three possible values: [QC, sample, blank] (case insensitive). + + **class** column is used to denote a biological group of a sample (e.g., positive/negative species). The column accepts any values. + + the **output** is the same table with corrected feature intensities. + + .. rubric:: **Footnotes** + .. [1] for details on wavelet filter parameters refer to R `wavelets::wt.filter <https://www.rdocumentation.org/packages/wavelets/versions/0.3-0.2/topics/wt.filter>`_; ]]></help>
--- a/waveica_wrapper.R Wed Jul 28 11:58:20 2021 +0000 +++ b/waveica_wrapper.R Fri Nov 12 09:14:04 2021 +0000 @@ -1,89 +1,97 @@ -waveica <- function( - data, - wavelet_filter, - wavelet_length, - k, - t, - t2, - alpha, - exclude_blanks -) { +waveica <- function(data, + wavelet_filter, + wavelet_length, + k, + t, + t2, + alpha, + exclude_blanks) { + + # get input from the Galaxy, preprocess data + data <- read.csv(data, header = TRUE) - # get input from the Galaxy, preprocess data - data <- read.csv(data, header = TRUE, row.names = "sampleName") + required_columns <- c("sampleName", "class", "sampleType", "injectionOrder", "batch") + if (anyNA(data)) { + stop("Error: dataframe cannot contain NULL values! +Make sure that your dataframe does not contain empty cells") + } else if (!all(required_columns %in% colnames(data))) { + stop("Error: missing metadata! +Make sure that the following columns are present in your dataframe: [sampleName, class, sampleType, injectionOrder, batch]") + } - # sort data by injection order - data <- data[order(data$injectionOrder, decreasing = FALSE), ] - - data <- enumerate_groups(data) + # sort data by injection order + data <- data[order(data[, "batch"], + data[, "injectionOrder"], + decreasing = FALSE + ), ] - # remove blanks from dataset - if (exclude_blanks) { - data <- exclude_group(data) - } - - # separate data into features, batch and group - features <- data[, -c(1:4)] - group <- as.numeric(data$class) - batch <- data$batch + # separate data into features, batch and group + feature_columns <- colnames(data)[!colnames(data) %in% required_columns] + features <- data[, feature_columns] + group <- enumerate_groups(as.character(data$sampleType)) + batch <- data$batch - # run WaveICA - normalized_data <- WaveICA::WaveICA( - data = features, - wf = get_wf(wavelet_filter, wavelet_length), - batch = batch, - group = group, - K = k, - t = t, - t2 = t2, - alpha = alpha - ) + # run WaveICA + features <- WaveICA::WaveICA( + data = features, + wf = get_wf(wavelet_filter, wavelet_length), + batch = batch, + group = group, + K = k, + t = t, + t2 = t2, + alpha = alpha + ) - return(normalized_data) + data[, feature_columns] <- features$data_wave + + # remove blanks from dataset + if (exclude_blanks) { + data <- exclude_group(data, group) + } + + return(data) } # Match group labels with [blank/sample/qc] and enumerate them -enumerate_groups <- function(data) { +enumerate_groups <- function(group) { + group[grepl("blank", tolower(group))] <- 0 + group[grepl("sample", tolower(group))] <- 1 + group[grepl("qc", tolower(group))] <- 2 - data$sampleType[grepl("blank", tolower(data$sampleType))] <- 0 - data$sampleType[grepl("sample", tolower(data$sampleType))] <- 1 - data$sampleType[grepl("qc", tolower(data$sampleType))] <- 2 - - return(data) + return(group) } # Create appropriate input for R wavelets function get_wf <- function(wavelet_filter, wavelet_length) { - wf <- paste(wavelet_filter, wavelet_length, sep = "") + wf <- paste(wavelet_filter, wavelet_length, sep = "") - # exception to the wavelet function - if (wf == "d2") { - wf <- "haar" - } + # exception to the wavelet function + if (wf == "d2") { + wf <- "haar" + } - return(wf) + return(wf) } # Exclude blanks from a dataframe -exclude_group <- function(data) { - row_idx_to_exclude <- which(data$class %in% 0) - if (length(row_idx_to_exclude) > 0) { - data_without_blanks <- data[-c(row_idx_to_exclude), ] - msg <- paste("Blank samples have been excluded from the dataframe.\n") - cat(msg) - return(data_without_blanks) - } - else { - return(data) - } +exclude_group <- function(data, group) { + row_idx_to_exclude <- which(group %in% 0) + if (length(row_idx_to_exclude) > 0) { + data_without_blanks <- data[-c(row_idx_to_exclude), ] + cat("Blank samples have been excluded from the dataframe.\n") + return(data_without_blanks) + } else { + return(data) + } } # Store output of WaveICA in a tsv file -store_data <- function(normalized_data, output) { - write.table(normalized_data, file = output, sep = "\t", col.names = NA) - cat("Normalization has been completed.\n") +store_data <- function(data, output) { + write.table(data, file = output, sep = "\t", row.names = FALSE, quote = FALSE) + cat("Normalization has been completed.\n") }
