changeset 2:d08deef1eb44 draft

"planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/waveica commit e33ef984e78721ed37d825c6672795a539a461e1"
author recetox
date Fri, 12 Nov 2021 09:14:04 +0000
parents 2bcfd5b450bb
children e3726251a055
files test-data/features-normalized.tsv test-data/features-test.csv test-data/incomplete_metadata_data.csv test-data/input_data.csv test-data/na_data.csv test-data/normalized_data.tsv waveica.xml waveica_wrapper.R
diffstat 8 files changed, 184 insertions(+), 155 deletions(-) [+]
line wrap: on
line diff
--- a/test-data/features-normalized.tsv	Wed Jul 28 11:58:20 2021 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,6 +0,0 @@
-""	"data_wave.M85T34"	"data_wave.M86T41"	"data_wave.M86T518"	"data_wave.M86T539"
-"VT_160120_002"	355200.506508035	75115889.9077485	6101488.54615418	2007379.02604984
-"VT_160120_004"	216897.826587868	75204863.1495248	6170882.26270475	2069979.64992079
-"VT_160120_006"	362337.195084504	76490295.1450204	12588041.969092	1818589.63912375
-"VT_160120_008"	143303.377379009	83771659.9549148	6181538.46316058	1975712.25920485
-"VT_160120_010"	189065.516447239	84108898.7658797	6103964.42378424	1935671.32085241
--- a/test-data/features-test.csv	Wed Jul 28 11:58:20 2021 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,6 +0,0 @@
-sampleName,class,sampleType,injectionOrder,batch,M85T34,M86T41,M86T518,M86T539
-VT_160120_002,sample,sample,1,1,228520.06430737,35646729.21543971,2386896.97966461,1026645.83653468
-VT_160120_004,sample,sample,2,1,90217.384387202,35735702.457215995,2456290.69621518,1089246.46040563
-VT_160120_006,sample,sample,3,1,235656.75288383896,37021134.452711605,8873450.40260241,837856.449608585
-VT_160120_008,sample,sample,4,1,16622.9351783435,44302499.262606,2466946.89667101,994979.069689685
-VT_160120_010,sample,sample,5,1,62385.0742465736,44639738.0735709,2389372.85729467,954938.131337246
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/incomplete_metadata_data.csv	Fri Nov 12 09:14:04 2021 +0000
@@ -0,0 +1,6 @@
+sampleName,class,sampleType,batch,M85T34,M86T41,M86T518,M86T539
+VT_160120_002,sample,sample,1,228520.06430737,35646729.2154397,2386896.97966461,1026645.83653468
+VT_160120_004,sample,sample,1,90217.384387202,35735702.457216,2456290.69621518,1089246.46040563
+VT_160120_006,sample,sample,1,235656.752883839,37021134.4527116,8873450.40260241,837856.449608585
+VT_160120_008,sample,sample,1,16622.9351783435,44302499.262606,2466946.89667101,994979.069689685
+VT_160120_010,sample,sample,1,62385.0742465736,44639738.0735709,2389372.85729467,954938.131337246
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/input_data.csv	Fri Nov 12 09:14:04 2021 +0000
@@ -0,0 +1,6 @@
+sampleName,class,sampleType,injectionOrder,batch,M85T34,M86T41,M86T518,M86T539
+VT_160120_002,sample,sample,1,1,228520.06430737,35646729.21543971,2386896.97966461,1026645.83653468
+VT_160120_004,sample,sample,2,1,90217.384387202,35735702.457215995,2456290.69621518,1089246.46040563
+VT_160120_006,sample,sample,3,1,235656.75288383896,37021134.452711605,8873450.40260241,837856.449608585
+VT_160120_008,sample,sample,4,1,16622.9351783435,44302499.262606,2466946.89667101,994979.069689685
+VT_160120_010,sample,sample,5,1,62385.0742465736,44639738.0735709,2389372.85729467,954938.131337246
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/na_data.csv	Fri Nov 12 09:14:04 2021 +0000
@@ -0,0 +1,6 @@
+sampleName,class,sampleType,injectionOrder,batch,M85T34,M86T41,M86T518,M86T539
+VT_160120_002,sample,sample,1,1,NA,35646729.2154397,2386896.97966461,1026645.83653468
+VT_160120_004,sample,sample,2,1,90217.384387202,35735702.457216,2456290.69621518,1089246.46040563
+VT_160120_006,sample,sample,3,1,235656.752883839,37021134.4527116,8873450.40260241,837856.449608585
+VT_160120_008,sample,sample,4,1,16622.9351783435,44302499.262606,2466946.89667101,994979.069689685
+VT_160120_010,sample,sample,5,1,62385.0742465736,44639738.0735709,2389372.85729467,954938.131337246
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/normalized_data.tsv	Fri Nov 12 09:14:04 2021 +0000
@@ -0,0 +1,6 @@
+sampleName	class	sampleType	injectionOrder	batch	M85T34	M86T41	M86T518	M86T539
+VT_160120_002	sample	sample	1	1	355200.506508035	75115889.9077485	6101488.54615418	2007379.02604984
+VT_160120_004	sample	sample	2	1	216897.826587868	75204863.1495248	6170882.26270475	2069979.64992079
+VT_160120_006	sample	sample	3	1	362337.195084504	76490295.1450204	12588041.969092	1818589.63912375
+VT_160120_008	sample	sample	4	1	143303.377379009	83771659.9549148	6181538.46316058	1975712.25920485
+VT_160120_010	sample	sample	5	1	189065.516447239	84108898.7658797	6103964.42378424	1935671.32085241
--- a/waveica.xml	Wed Jul 28 11:58:20 2021 +0000
+++ b/waveica.xml	Fri Nov 12 09:14:04 2021 +0000
@@ -1,4 +1,4 @@
-<tool id="waveica" name="WaveICA" version="0.1.0+galaxy1" python_template_version="3.5">
+<tool id="waveica" name="WaveICA" version="0.1.0+galaxy2" python_template_version="3.5">
 
     <description>removal of batch effects for untargeted metabolomics data</description>
     
@@ -12,96 +12,98 @@
 
             -e 'normalized_data <- waveica(
                 data = "$data",
-                wavelet_filter = "$parameters.wf.wavelet_filter",
-                wavelet_length = "$parameters.wf.wavelet_length",
-                k = $parameters.k,
-                t = $parameters.t,
-                t2 = $parameters.t2,
-                alpha = $parameters.alpha,
-                exclude_blanks = $optional_parameters.exclude_blanks
+                wavelet_filter = "$wf.wavelet_filter",
+                wavelet_length = "$wf.wavelet_length",
+                k = $k,
+                t = $t,
+                t2 = $t2,
+                alpha = $alpha,
+                exclude_blanks = $exclude_blanks
             )'
 
-            -e 'store_data(normalized_data,"$normalized_features")'
+            -e 'store_data(normalized_data,"$normalized_data")'
     ]]></command>
 
     <inputs>
-        <param type="data" name="data" label="Sample-by-matrix data" format="csv" help=""/>
-        <section name="parameters" title="Normalization Parameters" expanded="True">
-            <conditional name="wf">
-                <param type="select" name="wavelet_filter" label="Wavelet transform filter" help="selecting wavelet function and filter length">
-                    <option value="d" selected="True">Daubechies</option>
-                    <option value="la" >Least Asymetric</option>
-                    <option value="bl" >Best Localized</option>
-                    <option value="c" >Coiflet</option>
+        <param type="data" name="data" label="Feature table" format="csv" help=""/>
+        <conditional name="wf">
+            <param type="select" name="wavelet_filter" label="Wavelet transform filter" help="wavelet function and filter length [1] (see footnotes for more details)">
+                <option value="d" selected="True">Daubechies</option>
+                <option value="la" >Least Asymmetric</option>
+                <option value="bl" >Best Localized</option>
+                <option value="c" >Coiflet</option>
+            </param>
+            <when value="d">
+                <param name="wavelet_length" type="select" label="filter length">
+                    <option value="2" selected="True">2</option>
+                    <option value="4">4</option>
+                    <option value="6">6</option>
+                    <option value="8">8</option>
+                    <option value="10">10</option>
+                    <option value="12">12</option>
+                    <option value="14">14</option>
+                    <option value="16">16</option>
+                    <option value="18">18</option>
+                    <option value="20">20</option>
                 </param>
-                <when value="d">
-                    <param name="wavelet_length" type="select" label="filter length">
-                        <option value="2" selected="True">2</option>
-                        <option value="4">4</option>
-                        <option value="6">6</option>
-                        <option value="8">8</option>
-                        <option value="10">10</option>
-                        <option value="12">12</option>
-                        <option value="14">14</option>
-                        <option value="16">16</option>
-                        <option value="18">18</option>
-                        <option value="20">20</option>
-                    </param>
-                </when>
-                <when value="la">
-                    <param name="wavelet_length" type="select" label="filter length">
-                        <option value="8">8</option>
-                        <option value="10">10</option>
-                        <option value="12">12</option>
-                        <option value="14">14</option>
-                        <option value="16">16</option>
-                        <option value="18">18</option>
-                        <option value="20">20</option>
-                    </param>
-                </when>
-                <when value="bl">
-                    <param name="wavelet_length" type="select" label="filter length">
-                        <option value="14">14</option>
-                        <option value="18">18</option>
-                        <option value="20">20</option>
-                    </param>
-                </when>
-                <when value="c">
-                    <param name="wavelet_length" type="select" label="filter length">
-                        <option value="6">6</option>
-                        <option value="12">12</option>
-                        <option value="18">18</option>
-                        <option value="24">24</option>
-                        <option value="30">30</option>
-                    </param>
-                </when>
-            </conditional>
-            <param type="integer" value="20" name="k" label="Number of components to decompose" help="the maximal component that ICA decomposes"/>
-            <param type="float" value="0.05" name="t" label="Batch-assosiation threshold" help="the threshold to consider a component associate with the batch,
+            </when>
+            <when value="la">
+                <param name="wavelet_length" type="select" label="filter length">
+                    <option value="8">8</option>
+                    <option value="10">10</option>
+                    <option value="12">12</option>
+                    <option value="14">14</option>
+                    <option value="16">16</option>
+                    <option value="18">18</option>
+                    <option value="20">20</option>
+                </param>
+            </when>
+            <when value="bl">
+                <param name="wavelet_length" type="select" label="filter length">
+                    <option value="14">14</option>
+                    <option value="18">18</option>
+                    <option value="20">20</option>
+                </param>
+            </when>
+            <when value="c">
+                <param name="wavelet_length" type="select" label="filter length">
+                    <option value="6">6</option>
+                    <option value="12">12</option>
+                    <option value="18">18</option>
+                    <option value="24">24</option>
+                    <option value="30">30</option>
+                </param>
+            </when>
+        </conditional>
+        <param type="integer" value="20" name="k" label="Number of components to decompose" help="maximal component that ICA decomposes"/>
+        <param type="float" value="0.05" name="t" label="Batch-association threshold" help="threshold to consider a component associated with the batch,
  should be between 0 and 1"/>
-            <param type="float" value="0.05" name="t2" label="Group-assosiation threshold" help="the threshold to consider a component associate with the group,
+        <param type="float" value="0.05" name="t2" label="Group-association threshold" help="threshold to consider a component associated with the group,
  should be between 0 and 1"/>
-            <param type="float" value="0" name="alpha" label="Alpha" help="the trade-off value between the independence of samples and those of variables and should be between 0 and 1"/>
-        </section>
-        <section name="optional_parameters" expanded="true" title="Optional Parameters">
-            <param name="exclude_blanks" type="boolean" checked="true" truevalue="TRUE" falsevalue="FALSE" label="Remove blanks" help="Excludes blank samples from the output" />
-        </section>
+        <param type="float" value="0" name="alpha" label="Alpha" help="trade-off value between the independence of samples (temporal ICA) and variables (spatial ICA), should be between 0 and 1"/>
+        <param name="exclude_blanks" type="boolean" checked="false" truevalue="TRUE" falsevalue="FALSE" label="Remove blanks" help="Excludes blank samples from the output" />
     </inputs>
 
     <outputs>
-        <data name="normalized_features" format="tsv" />
+        <data name="normalized_data" format="tsv" />
     </outputs>
 
     <tests>
         <test>
-            <param name="data" value="features-test.csv" ftype="csv" />
+            <param name="data" value="input_data.csv" ftype="csv" />
             <param name="wavelet_filter" value="d" />
             <param name="filter_length" value="2" />
             <param name="k" value="20" />
             <param name="t" value="0.05" />
             <param name="t2" value="0.05" />
             <param name="alpha" value="0" />
-            <output name="normalized_features" file="features-normalized.tsv" /> 
+            <output name="normalized_data" file="normalized_data.tsv" /> 
+        </test>
+        <test expect_failure="true">
+            <param name="data" value="na_data.csv" ftype="csv" />
+        </test>
+        <test expect_failure="true">
+            <param name="data" value="incomplete_metadata_data.csv" ftype="csv" />
         </test>
     </tests>
 
@@ -110,19 +112,26 @@
 
         Removal of batch effects for large-scale untargeted metabolomics data based on wavelet analysis. The WaveICA R package provides a new algorithm to removing batch effects for metabolomics data.
         
-        The input is Sample-by-matrix table which must include: 
-
-        1. Injection order of samples
-
-        2. Types of the samples denoted as "blank", "sample" or "QC"
+        The input is an intensity-by-feature table with metadata in the following format: 
 
-        3. Batch numbers
-
-        4. Features data
+        +---------------+--------+------------+----------------+-------+------------+--------------+-------------+-------------+-----+
+        | sampleName    | class  | sampleType | injectionOrder | batch | M85T34     | M86T41       | M86T518     | M86T539     | ... |
+        +===============+========+============+================+=======+============+==============+=============+=============+=====+
+        | VT_160120_002 | sample | sample     | 1              | 1     | 228520.064 | 35646729.215 | 2386896.979 | 1026645.836 | ... |
+        +---------------+--------+------------+----------------+-------+------------+--------------+-------------+-------------+-----+
+        | QC1           | sample | QC         | 2              | 1     | 90217.384  | 35735702.457 | 2456290.696 | 1089246.460 | ... |
+        +---------------+--------+------------+----------------+-------+------------+--------------+-------------+-------------+-----+
+        | ...           | ...    | ...        | ...            | ...   | ...        | ...          | ...         | ...         | ... |
+        +---------------+--------+------------+----------------+-------+------------+--------------+-------------+-------------+-----+
 
-        **Documentation**
 
-        See original repository for further information: https://github.com/dengkuistat/WaveICA
+        + the input table must not contain missing values. Missing intensities must be filled with 0.
+        + **sampleType** column accepts three possible values: [QC, sample, blank] (case insensitive).
+        + **class** column is used to denote a biological group of a sample (e.g., positive/negative species). The column accepts any values.
+        + the **output** is the same table with corrected feature intensities.
+
+        .. rubric:: **Footnotes**
+        .. [1] for details on wavelet filter parameters refer to R `wavelets::wt.filter <https://www.rdocumentation.org/packages/wavelets/versions/0.3-0.2/topics/wt.filter>`_;
 
     ]]></help>
 
--- a/waveica_wrapper.R	Wed Jul 28 11:58:20 2021 +0000
+++ b/waveica_wrapper.R	Fri Nov 12 09:14:04 2021 +0000
@@ -1,89 +1,97 @@
-waveica <- function(
-    data,
-    wavelet_filter,
-    wavelet_length,
-    k,
-    t,
-    t2,
-    alpha,
-    exclude_blanks
-) {
+waveica <- function(data,
+                    wavelet_filter,
+                    wavelet_length,
+                    k,
+                    t,
+                    t2,
+                    alpha,
+                    exclude_blanks) {
+
+  # get input from the Galaxy, preprocess data
+  data <- read.csv(data, header = TRUE)
 
-    # get input from the Galaxy, preprocess data
-    data <- read.csv(data, header = TRUE, row.names = "sampleName")
+  required_columns <- c("sampleName", "class", "sampleType", "injectionOrder", "batch")
+  if (anyNA(data)) {
+    stop("Error: dataframe cannot contain NULL values!
+Make sure that your dataframe does not contain empty cells")
+  } else if (!all(required_columns %in% colnames(data))) {
+    stop("Error: missing metadata!
+Make sure that the following columns are present in your dataframe: [sampleName, class, sampleType, injectionOrder, batch]")
+  }
 
-    # sort data by injection order
-    data <- data[order(data$injectionOrder, decreasing = FALSE), ]
-
-    data <- enumerate_groups(data)
+  # sort data by batch, then by injection order within each batch
+  data <- data[order(data[, "batch"],
+    data[, "injectionOrder"],
+    decreasing = FALSE
+  ), ]
 
-    # remove blanks from dataset
-    if (exclude_blanks) {
-        data <- exclude_group(data)
-    }
-
-    # separate data into features, batch and group
-    features <- data[, -c(1:4)]
-    group <- as.numeric(data$class)
-    batch <- data$batch
+  # separate data into features, batch and group
+  feature_columns <- colnames(data)[!colnames(data) %in% required_columns]
+  features <- data[, feature_columns]
+  group <- enumerate_groups(as.character(data$sampleType))
+  batch <- data$batch
 
-    # run WaveICA
-    normalized_data <- WaveICA::WaveICA(
-        data = features,
-        wf = get_wf(wavelet_filter, wavelet_length),
-        batch = batch,
-        group = group,
-        K = k,
-        t = t,
-        t2 = t2,
-        alpha = alpha
-        )
+  # run WaveICA
+  features <- WaveICA::WaveICA(
+    data = features,
+    wf = get_wf(wavelet_filter, wavelet_length),
+    batch = batch,
+    group = group,
+    K = k,
+    t = t,
+    t2 = t2,
+    alpha = alpha
+  )
 
-    return(normalized_data)
+  data[, feature_columns] <- features$data_wave
+
+  # remove blanks from dataset
+  if (exclude_blanks) {
+    data <- exclude_group(data, group)
+  }
+
+  return(data)
 }
 
 
 # Match group labels with [blank/sample/qc] and enumerate them
-enumerate_groups <- function(data) {
+enumerate_groups <- function(group) {
+  group[grepl("blank", tolower(group))] <- 0
+  group[grepl("sample", tolower(group))] <- 1
+  group[grepl("qc", tolower(group))] <- 2
 
-    data$sampleType[grepl("blank", tolower(data$sampleType))] <- 0
-    data$sampleType[grepl("sample", tolower(data$sampleType))] <- 1
-    data$sampleType[grepl("qc", tolower(data$sampleType))] <- 2
-
-    return(data)
+  return(group)
 }
 
 
 # Create appropriate input for R wavelets function
 get_wf <- function(wavelet_filter, wavelet_length) {
-    wf <- paste(wavelet_filter, wavelet_length, sep = "")
+  wf <- paste(wavelet_filter, wavelet_length, sep = "")
 
-    # exception to the wavelet function
-    if (wf == "d2") {
-        wf <- "haar"
-        }
+  # exception to the wavelet function
+  if (wf == "d2") {
+    wf <- "haar"
+  }
 
-    return(wf)
+  return(wf)
 }
 
 
 # Exclude blanks from a dataframe
-exclude_group <- function(data) {
-    row_idx_to_exclude <- which(data$class %in% 0)
-    if (length(row_idx_to_exclude) > 0) {
-        data_without_blanks <- data[-c(row_idx_to_exclude), ]
-        msg <- paste("Blank samples have been excluded from the dataframe.\n")
-        cat(msg)
-        return(data_without_blanks)
-        }
-    else {
-        return(data)
-    }
+exclude_group <- function(data, group) {
+  row_idx_to_exclude <- which(group %in% 0)
+  if (length(row_idx_to_exclude) > 0) {
+    data_without_blanks <- data[-c(row_idx_to_exclude), ]
+    cat("Blank samples have been excluded from the dataframe.\n")
+    return(data_without_blanks)
+  } else {
+    return(data)
+  }
 }
 
 
 # Store output of WaveICA in a tsv file
-store_data <- function(normalized_data, output) {
-    write.table(normalized_data, file = output, sep = "\t", col.names = NA)
-    cat("Normalization has been completed.\n")
+store_data <- function(data, output) {
+  write.table(data, file = output, sep = "\t", row.names = FALSE, quote = FALSE)
+  cat("Normalization has been completed.\n")
 }