diff test-data/decontam_Rscript.Rmd @ 0:86da5c894956 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontam commit 9e0ce6d02ee71d5b974ef615b1c5286bc45d8e6b
author iuc
date Tue, 15 Oct 2024 13:36:46 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/decontam_Rscript.Rmd	Tue Oct 15 13:36:46 2024 +0000
@@ -0,0 +1,176 @@
+---
+title: "decontam_docs"
+output: html_document
+date: "2024-09-10"
+---
+
+# This R markdown generates the test data for the wrapper and can be used to test the functions used in the configfile
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE)
+```
+
+## Install test env and run studio in this env
+
+```{bash install}
+mamba create --name decontam bioconductor-decontam bioconductor-phyloseq r-tidyverse
+mamba activate decontam
+rstudio
+```
+
+### Check correct R home
+
+```{r home}
+R.home()
+```
+
+## Get test data
+
+Create test data for wrapper, it should be able to use a matrix and vector as well as phyloseq object as input.
+
+```{r store test data}
+R.home()
+library(phyloseq)
+packageVersion("phyloseq")
+library(ggplot2)
+packageVersion("ggplot2")
+library(decontam)
+packageVersion("decontam")
+
+ps <- readRDS(system.file("extdata", "MUClite.rds", package = "decontam"))
+
+# Sample from a physeq object with a sampling function.
+#   ps: physeq object to be sampled
+#   fun: function to use for sampling (default `sample`)
+#   ...: parameters to be passed to fun,
+# see `help(sample)` for default parameters
+sample_ps <- function(ps, fun = sample, ...) {
+    ids <- sample_names(ps)
+    sampled_ids <- fun(ids, ...)
+    ps <- prune_samples(sampled_ids, ps)
+    return(ps)
+}
+
+# the initial object is to big for the test case so we subsample
+ps <- sample_ps(ps, size = 200)
+
+## ps
+# get otu table
+otu <- as(otu_table(ps), "matrix")
+head(otu[, 1:10], 10)
+
+# add control column to sample data
+sample_data(ps)$control <- sample_data(ps)$Sample_or_Control == "Control Sample"
+# store as 0 and 1
+sample_data(ps)$control <- as.integer(sample_data(ps)$control)
+head(sample_data(ps), 1000)
+
+metadata <- as(sample_data(ps), "matrix")
+head(metadata, 1000)
+
+# store test data
+# stores the row names as column,
+# see https://stackoverflow.com/questions/2478352
+# /write-table-writes-unwanted-leading-empty-column-to-header-when-has-rownames
+write.table(data.frame("SampleID" = rownames(otu), otu),
+    file = file.path(getwd(), "otu_input.tsv"),
+    sep = "\t",
+    row.names = FALSE,
+    quote = FALSE
+)
+write.table(data.frame("SampleID" = rownames(metadata), metadata),
+    file = file.path(getwd(), "metadata_input.tsv"),
+    sep = "\t",
+    row.names = FALSE,
+    quote = FALSE
+)
+
+saveRDS(ps, file.path(getwd(), "phyloseq_input.rds"))
+```
+
+## Load test data
+
+```{r load test data}
+library(tidyverse)
+
+# get OTU table (first column is the OTU/ASV ID)
+otu <- read_tsv(file.path(getwd(), "otu_input.tsv"))
+# use first column as colname
+otu2 <- otu %>% tibble::column_to_rownames(colnames(otu)[1])
+otu <- otu_table(otu2, taxa_are_rows = FALSE)
+
+# get metadata table must have matching OTU/ASV ID in first column
+meta <- read_tsv(file.path(getwd(), "metadata_input.tsv"))
+# use first column as colname
+
+meta2 <- meta %>% tibble::column_to_rownames(colnames(meta)[1])
+control_column <- "control"
+
+# convert 0/1 to bool for the control column and store in control column
+meta2$control <- as.logical(meta2[[control_column]])
+sampledata <- sample_data(meta2)
+
+# create dummy tax table (actually not needed,
+# but nice to learn how to load phyloseq objects)
+taxmat <- as.data.frame(matrix(sample(letters, 10, replace = TRUE),
+    nrow = ncol(otu2), ncol = 7
+))
+rownames(taxmat) <- colnames(otu2)
+tax <- tax_table(as.matrix(taxmat))
+
+ps <- phyloseq(otu, tax, sampledata)
+```
+
+# plot 1
+
+```{r plot library size vs control}
+# Put sample_data into a ggplot-friendly data.frame
+df <- as.data.frame(sample_data(ps))
+df$LibrarySize <- sample_sums(ps)
+df <- df[order(df$LibrarySize), ]
+df$Index <- seq_len(nrow(df))
+ggplot(data = df, aes(x = Index, y = LibrarySize, color = control)) +
+    geom_point()
+```
+
+# plot 2
+
+```{r plot prevalence}
+contamdf_prev <- isContaminant(ps,
+    method = "prevalence",
+    neg = "control",
+    threshold = 0.5
+)
+table(contamdf_prev$contaminant)
+
+ps_pa <- transform_sample_counts(ps, function(abund) 1 * (abund > 0))
+ps_pa_neg <- prune_samples(sample_data(ps_pa)$control == TRUE, ps_pa)
+ps_pa_pos <- prune_samples(sample_data(ps_pa)$control == FALSE, ps_pa)
+# Make data_frame of prevalence in positive and negative samples
+df_pa <- data.frame(
+    pa_pos = taxa_sums(ps_pa_pos), pa_neg = taxa_sums(ps_pa_neg),
+    contaminant = contamdf_prev$contaminant
+)
+ggplot(data = df_pa, aes(x = pa_neg, y = pa_pos, color = contaminant)) +
+    geom_point() +
+    xlab("Prevalence (Negative Controls)") +
+    ylab("Prevalence (True Samples)")
+```
+
+# generate output
+
+```{r remove contams}
+id_name <- colnames(otu)[1]
+
+ps_noncontam <- prune_taxa(!contamdf_prev$contaminant, ps)
+
+otu_table(ps_noncontam) %>%
+    as.data.frame() %>%
+    rownames_to_column(id_name) <- otu
+
+write.table(otu,
+    file = file.path(getwd(), "otu_output.tsv"),
+    sep = "\t",
+    row.names = FALSE,
+)
+```