Mercurial > repos > eschen42 > mqppep_anova

---
title: "MaxQuant Phospho-Proteomic Enrichment Pipeline ANOVA"
author: "Larry Cheng; Art Eschenlauer"
date: "May 28, 2018; Mar 16, 2022"
output:
  pdf_document:
    toc: true
  latex_document:
    toc: true
params:
  alphaFile: "test-data/alpha_levels.tabular"
  inputFile: "test-data/UT_Phospho_ST_Sites.preproc.tabular"
  firstDataColumn: "^Intensity[^_]"
  imputationMethod: !r c("group-median", "median", "mean", "random")[4]
  meanPercentile: 1
  sdPercentile: 1.0
  regexSampleNames: "\\.\\d+[A-Z]$"
  regexSampleGrouping: "\\d+"
  imputedDataFilename: "test-data/limbo/imputedDataFilename.txt"
  imputedQNLTDataFile: "test-data/limbo/imputedQNLTDataFile.txt"
  show_toc: true
---
<!--
  alphaFile: "test-data/alpha_levels.tabular"
  inputFile: "test-data/test_input_for_anova.tabular"
  inputFile: "test-data/UT_Phospho_ST_Sites.preproc.tabular"
  inputFile: "test-data/density_failure.preproc_tab.tabular"
  latex_document: default
-->
```{r setup, include = FALSE}
# ref for parameterizing Rmd document: https://stackoverflow.com/a/37940285
knitr::opts_chunk$set(echo = FALSE, fig.dim = c(9, 10))

# freeze the random number generator so the same results will be produced
#  from run to run
set.seed(28571)

### CONSTANTS

const_parfin <- par("fin")
const_boxplot_fill <- "grey94"
const_stripchart_cex <- 0.5
const_stripsmall_cex <-
  sqrt(const_stripchart_cex * const_stripchart_cex / 2)
const_stripchart_jitter <- 0.3
const_write_debug_files <- FALSE
const_table_anchor <- "tbp"

### FUNCTIONS

#ANOVA filter function
anova_func <- function(x, grouping_factor) {
  x_aov <- aov(as.numeric(x) ~ grouping_factor)
  pvalue <- summary(x_aov)[[1]][["Pr(>F)"]][1]
  pvalue
}

write_debug_file <- function(s) {
  if (const_write_debug_files) {
    s_path <- sprintf("test-data/%s.txt", deparse(substitute(s)))
    write.table(
      s,
      file = s_path,
      sep = "\t",
      col.names = TRUE,
      row.names = TRUE,
      quote = FALSE
    )
  }
}

latex_collapsed_vector <- function(collapse_string, v) {
  cat(
    paste0(
      gsub("_", "\\\\_", v),
      collapse = collapse_string
      )
    )
}

latex_itemized_collapsed <- function(collapse_string, v) {
  cat("\\begin{itemize}\n\\item ")
  latex_collapsed_vector(collapse_string, v)
  cat("\n\\end{itemize}\n")
}

latex_itemized_list <- function(v) {
  latex_itemized_collapsed("\n\\item ", v)
}

latex_enumerated_collapsed <- function(collapse_string, v) {
  cat("\\begin{enumerate}\n\\item ")
  latex_collapsed_vector(collapse_string, v)
  cat("\n\\end{enumerate}\n")
}

latex_enumerated_list <- function(v) {
  latex_enumerated_collapsed("\n\\item ", v)
}

latex_table_row <- function(v) {
  latex_collapsed_vector(" & ", v)
  cat(" \\\\\n")
}

# Use this like print.data.frame, from which it is adapted:
data_frame_latex <-
  function(
    x,
    ...,
    # digits to pass to format.data.frame
    digits = NULL,
    # TRUE -> right-justify columns; FALSE -> left-justify
    right = TRUE,
    # maximumn number of rows to print
    max = NULL,
    # string with justification of each column
    justification = NULL,
    # TRUE to center on page
    centered = TRUE,
    # optional capttion
    caption = NULL,
    # h(inline); b(bottom); t (top) or p (separate page)
    anchor = "h"
  ) {
    if (is.null(justification))
      justification <-
        Reduce(
          f = paste,
          x = rep_len(if (right) "r" else "l", length(colnames(x)))
          )
    n <- length(rownames(x))
    if (length(x) == 0L) {
      cat(
        sprintf(
          # if n is one, use singular 'row', else use plural 'rows'
          ngettext(
            n,
            "data frame with 0 columns and %d row",
            "data frame with 0 columns and %d rows"
            ),
          n
          ),
        "\n",
        sep = ""
        )
    }
    else if (n == 0L) {
      cat("0 rows for:\n")
      latex_itemized_list(names(x))
    }
    else {
      if (is.null(max))
        max <- getOption("max.print", 99999L)
      if (!is.finite(max))
        stop("invalid 'max' / getOption(\"max.print\"): ",
          max)
      omit <- (n0 <- max %/% length(x)) < n
      m <- as.matrix(
        format.data.frame(
          if (omit) x[seq_len(n0), , drop = FALSE] else x,
          digits = digits,
          na.encode = FALSE
          )
        )
      cat(
        # h(inline); b(bottom); t (top) or p (separate page)
        paste0("\\begin{table}[", anchor, "]\n")
        )
      if (!is.null(caption))
        cat(paste0(" \\caption{", caption, "}"))
      if (centered) cat("\\centering\n")
      cat(
        paste(
          " \\begin{tabular}{",
          justification,
          "}\n",
          sep = ""
          )
        )
      if (!is.null(caption))
        cat(" \\hline\\hline\n")
      latex_table_row(colnames(m))
      cat("\\hline\n")
      for (i in seq_len(length(m[, 1]))) {
        latex_table_row(m[i, ])
      }
      cat(
        paste(
          " \\end{tabular}",
          "\\end{table}",
          sep = "\n"
          )
        )
      if (omit)
        cat(" [ reached 'max' / getOption(\"max.print\") -- omitted",
          n - n0, "rows ]\n")
    }
    invisible(x)
  }

```

## Purpose

Perform imputation of missing values, quantile normalization, and ANOVA.

```{r include = FALSE}
# Input Filename
input_file <- params$inputFile

# First data column - ideally, this could be detected via regexSampleNames,
#   but for now leave it as is.
first_data_column <- params$firstDataColumn
fdc_is_integer <- is.integer(first_data_column)
if (fdc_is_integer) {
  first_data_column <- as.integer(params$firstDataColumn)
}

# False discovery rate adjustment for ANOVA
#  Since pY abundance is low, set to 0.10 and 0.20 in addition to 0.05
val_fdr <-
  read.table(file = params$alphaFile, sep = "\t", header = F, quote = "")

if (
  ncol(val_fdr) != 1 ||
  sum(!is.numeric(val_fdr[, 1])) ||
  sum(val_fdr[, 1] < 0) ||
  sum(val_fdr[, 1] > 1)
) {
  stop("alphaFile should be one column of numbers within the range [0.0,1.0]")
}
val_fdr <- val_fdr[, 1]

#Imputed Data filename
imputed_data_filename <- params$imputedDataFilename
imp_qn_lt_data_filenm <- params$imputedQNLTDataFile

#ANOVA data filename
```

```{r echo = FALSE}
# Imputation method, should be one of
#   "random", "group-median", "median", or "mean"
imputation_method <- params$imputationMethod

# Selection of percentile of logvalue data to set the mean for random number
#   generation when using random imputation
mean_percentile <- params$meanPercentile / 100.0

# deviation adjustment-factor for random values; real number.
sd_percentile <- params$sdPercentile

# Regular expression of Sample Names, e.g., "\\.(\\d+)[A-Z]$"
regex_sample_names <- params$regexSampleNames

# Regular expression to extract Sample Grouping from Sample Name;
#   if error occurs, compare sample_factor_levels and sample_name_matches
#   to see if groupings/pairs line up
#   e.g., "(\\d+)"
regex_sample_grouping <- params$regexSampleGrouping

```

```{r echo = FALSE}
### READ DATA

library(data.table)

# read.table reads a file in table format and creates a data frame from it.
#   - note that `quote = ""` means that quotation marks are treated literally.
full_data <- read.table(
  file = input_file,
  sep = "\t",
  header = T,
  quote = "",
  check.names = FALSE
  )
```

## Extract Sample Names and Factor Levels

Column names parsed from input file are shown in Table 1; sample names and factor levels, in Table 2.

```{r echo = FALSE, results = 'asis'}

data_column_indices <- grep(first_data_column, names(full_data), perl = TRUE)

if (!fdc_is_integer) {
  if (length(data_column_indices) > 0) {
    first_data_column <- data_column_indices[1]
  } else {
    stop(paste("failed to convert firstDataColumn:", first_data_column))
  }
}

cat(
  sprintf(
    paste(
      "\n\nPeptide-intensity data for each sample is",
      "in one of columns %d through %d.\n\n"
      ),
    min(data_column_indices),
    max(data_column_indices)
    )
  )

# Write column names as a LaTeX enumerated list.
column_name_df <- data.frame(
  column = seq_len(length(colnames(full_data))),
  name = colnames(full_data)
  )
data_frame_latex(
  x = column_name_df,
  justification = "l l",
  centered = TRUE,
  caption = "Input data column name",
  anchor = const_table_anchor
  )

```

```{r echo = FALSE, results = 'asis'}
#```{r echo = FALSE, results = 'asis'}
quant_data <- full_data[first_data_column:length(full_data)]
quant_data[quant_data == 0] <- NA
rownames(quant_data) <- full_data$Phosphopeptide
# Get factors -> group replicates (as indicated by terminal letter)
#   by the preceding digits;
# Assuming that regex_sample_names <- "\\.(\\d+)[A-Z]$"
#   get factors ->
#      group runs (samples) by ignoring terminal [A-Z] in sample names
#   e.g.
#     group .1A .1B .1C into group 1;
#     group .2A .2B .2C, into group 2;
#     etc.
m <- regexpr(regex_sample_names, colnames(quant_data), perl = TRUE)
sample_name_matches <- regmatches(names(quant_data), m)
colnames(quant_data) <- sample_name_matches

write_debug_file(quant_data)

m2 <- regexpr(regex_sample_grouping, sample_name_matches, perl = TRUE)
sample_factor_levels <- as.factor(regmatches(sample_name_matches, m2))
number_of_samples <- length(sample_name_matches)
sample_factor_df <- data.frame(
  sample = sample_name_matches,
  level = sample_factor_levels
  )
data_frame_latex(
  x = sample_factor_df,
  justification = "c c",
  centered = TRUE,
  caption = "Factor level",
  anchor = const_table_anchor
  )
```
```{r echo = FALSE, results = 'asis'}
cat("\\newpage\n")
```

### Are the log-transformed sample distributions similar?

```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'}

quant_data[quant_data == 0] <- NA  #replace 0 with NA
quant_data_log <- log10(quant_data)

rownames(quant_data_log) <- rownames(quant_data)
colnames(quant_data_log) <- sample_name_matches

write_debug_file(quant_data_log)

# data visualization
old_par <- par(
  mai = par("mai") + c(0.5, 0, 0, 0)
)
# ref: https://r-charts.com/distribution/add-points-boxplot/
# Vertical plot
boxplot(
  quant_data_log
, las = 1
, col = const_boxplot_fill
, ylab = latex2exp::TeX("$log_{10}$(peptide intensity)")
, xlab = "Sample"
)
par(old_par)


cat("\n\n\n")
cat("\n\n\n")

```

```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 4), results = 'asis'}
library(ggplot2)
if (nrow(quant_data_log) > 1) {
  quant_data_log_stack <- stack(quant_data_log)
  ggplot(
    quant_data_log_stack,
    aes(x = values)
    ) + xlab(latex2exp::TeX("$log_{10}$(peptide intensity)")) +
    ylab("Probability density") +
    geom_density(
      aes(group = ind, colour = ind),
      na.rm = TRUE
    )
} else {
  cat("No density plot because there are too few peptides.\n\n")
}
```

### Globally, are peptide intensities are approximately unimodal?

<!--
# bquote could be used as an alternative to latex2exp::TeX below particularly
#   and when plotting math expressions generally, at the expense of mastering
#   another syntax, which hardly seems worthwhile when I need to use TeX
#   elsewhere; here's an introduction to bquote:
#   https://www.r-bloggers.com/2018/03/math-notation-for-r-plot-titles-expression-and-bquote/
-->
```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 5), results = 'asis'}

# identify the location of missing values
fin <- is.finite(as.numeric(as.matrix(quant_data_log)))

logvalues <- as.numeric(as.matrix(quant_data_log))[fin]
logvalues_density <- density(logvalues)
plot(
  x = logvalues_density,
  main = latex2exp::TeX(
    "Smoothed estimated probability density vs. $log_{10}$(peptide intensity)"
    ),
  xlab = latex2exp::TeX("$log_{10}$(peptide intensity)"),
  ylab = "Probability density"
  )
hist(
  x = as.numeric(as.matrix(quant_data_log)),
  xlim = c(min(logvalues_density$x), max(logvalues_density$x)),
  breaks = 100,
  main = latex2exp::TeX("Frequency vs. $log_{10}$(peptide intensity)"),
  xlab = latex2exp::TeX("$log_{10}$(peptide intensity)")
)
```

### Distribution of standard deviations of $log_{10}(\text{intensity})$, ignoring missing values:

```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 5), results = 'asis'}
# determine quantile
q1 <- quantile(logvalues, probs = mean_percentile)[1]

# determine standard deviation of quantile to impute
sd_finite <- function(x) {
  ok <- is.finite(x)
  sd(x[ok])
}
# 1 = row of matrix (ie, phosphopeptide)
sds <- apply(quant_data_log, 1, sd_finite)
if (sum(!is.na(sds)) > 2) {
  plot(
    density(sds, na.rm = T)
  , main = "Smoothed estimated probability density vs. std. deviation"
  , sub = "(probability estimation made with Gaussian smoothing)"
  , ylab = "Probability density"
  )
} else {
  cat(
    "At least two non-missing values are required to plot",
    "probability density.\n\n"
    )
}

```

```{r echo = FALSE}
# Determine number of cells to impute
temp <- quant_data[is.na(quant_data)]

# Determine number of values to impute
number_to_impute <- length(temp)

# Determine percent of missing values
pct_missing_values <-
  round(length(temp) / (length(logvalues) + length(temp)) * 100)
```

```{r echo = FALSE}

# prep for trt-median based imputation

```
## Impute Missing Values

```{r echo = FALSE}

imp_smry_potential_before <- length(logvalues)
imp_smry_missing_before   <- number_to_impute
imp_smry_pct_missing      <- pct_missing_values

```

```{r echo = FALSE}
#Determine number of cells to impute

```
```{r echo = FALSE}

#Impute data
quant_data_imp <- quant_data

# Identify which values are missing and need to be imputed
ind <- which(is.na(quant_data_imp), arr.ind = TRUE)

```
```{r echo = FALSE, results = 'asis'}

# Apply imputation
switch(
  imputation_method
, "group-median" = {
    imputation_method_description <-
      paste("Substitute missing value with",
        "median peptide intensity for sample group.\n"
        )
    sample_level_integers <- as.integer(sample_factor_levels)
    for (i in seq_len(length(levels(sample_factor_levels)))) {
      level_cols <- i == sample_level_integers
      ind <- which(is.na(quant_data_imp[, level_cols]), arr.ind = TRUE)
      quant_data_imp[ind, level_cols] <-
        apply(quant_data_imp[, level_cols], 1, median, na.rm = T)[ind[, 1]]
    }
    good_rows <- !is.na(rowMeans(quant_data_imp))
  }
, "median" = {
    imputation_method_description <-
      paste("Substitute missing value with",
        "median peptide intensity across all sample classes.\n"
        )
    quant_data_imp[ind] <- apply(quant_data_imp, 1, median, na.rm = T)[ind[, 1]]
    good_rows <- !is.na(rowMeans(quant_data_imp))
  }
, "mean" = {
    imputation_method_description <-
      paste("Substitute missing value with",
        "mean peptide intensity across all sample classes.\n"
        )
    quant_data_imp[ind] <- apply(quant_data_imp, 1, mean, na.rm = T)[ind[, 1]]
    good_rows <- !is.na(rowMeans(quant_data_imp))
  }
, "random" = {
    m1 <- median(sds, na.rm = T) * sd_percentile #sd to be used is the median sd
    # If you want results to be reproducible, you will want to call
    #   base::set.seed before calling stats::rnorm
    imputation_method_description <-
      paste("Substitute each missing value with random intensity",
        sprintf(
          "random intensity $N \\sim (%0.2f, %0.2f)$.\n",
          q1, m1
          )
        )
    cat(sprintf("mean_percentile (from input parameter) is %2.0f\n\n",
      100 * mean_percentile))
    cat(sprintf("sd_percentile (from input parameter) is %0.2f\n\n",
      sd_percentile))
    quant_data_imp[ind] <-
      10 ^ rnorm(number_to_impute, mean = q1, sd = m1)
    good_rows <- !is.na(rowMeans(quant_data_imp))
  }
)

if (length(good_rows) < 1) {
  print("ERROR: Cannot impute data; there are no good rows!")
  return(-1)
  }
```

```{r echo = FALSE, results = 'asis'}
cat("\\quad\n\nImputation method:\n\n\n", imputation_method_description)
```

```{r echo = FALSE}

imp_smry_potential_after <- sum(good_rows)
imp_smry_rejected_after  <- sum(!good_rows)
imp_smry_missing_after   <- sum(is.na(quant_data_imp[good_rows, ]))
```
```{r echo = FALSE, results = 'asis'}
# ref: http://www1.maths.leeds.ac.uk/latex/TableHelp1.pdf
tabular_lines <- paste(
  "\\begin{table}[hb]", # h(inline); b(bottom); t (top) or p (separate page)
  " \\caption{Imputation Results}",
  " \\centering", # \centering centers the table on the page
  " \\begin{tabular}{l c c c}",
  "  \\hline\\hline",
  "  \\  & potential peptides   & missing values & rejected",
  "    peptides \\\\ [0.5ex]",
  "  \\hline",
  "  before imputation & %d     & %d (%d\\%s)    &    \\\\",
  "  after imputation  & %d     & %d             & %d \\\\ [1ex]",
  "  \\hline",
  " \\end{tabular}",
  #" \\label{table:nonlin}", # may be used to refer this table in the text
  "\\end{table}",
  sep = "\n"
  )
tabular_lines <-
  sprintf(
    tabular_lines,
    imp_smry_potential_before,
    imp_smry_missing_before,
    imp_smry_pct_missing,
    "%",
    imp_smry_potential_after,
    imp_smry_missing_after,
    imp_smry_rejected_after
    )
cat(tabular_lines)
```
```{r echo = FALSE}


# Zap rows where imputation was ineffective
full_data         <- full_data        [good_rows, ]
quant_data        <- quant_data       [good_rows, ]

write_debug_file(quant_data_imp)

quant_data_imp <- quant_data_imp[good_rows, ]
quant_data_imp_good_rows <- quant_data_imp

write_debug_file(quant_data_imp_good_rows)
```

```{r echo = FALSE, results = 'asis'}

can_plot_before_after_imp <- TRUE
d_combined <-
  as.numeric(
    as.matrix(
      log10(quant_data_imp)
      )
    )
d_original <-
  as.numeric(
    as.matrix(
      log10(quant_data_imp[!is.na(quant_data)])
      )
    )

if (sum(!is.na(d_combined)) > 2 && sum(!is.na(d_original)) > 2) {
  d_combined <- density(d_combined)
  d_original <- density(d_original)
} else {
  can_plot_before_after_imp <- FALSE
}

if (sum(is.na(quant_data)) > 0) {
  # There ARE missing values
  d_imputed <-
    as.numeric(
      as.matrix(
        log10(quant_data_imp[is.na(quant_data)])
        )
      )
  if (sum(!is.na(d_combined)) > 2) {
    d_imputed <- (density(d_imputed))
  } else {
    can_plot_before_after_imp <- FALSE
  }
} else {
  # There are NO missing values
  d_imputed <- d_combined
}

```

```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'}
if (sum(is.na(quant_data)) > 0) {
  cat("\\leavevmode\\newpage\n")
  # data visualization
  old_par <- par(
    mai = par("mai") + c(0.5, 0, 0, 0)
  )
  x <- quant_data
  x <- blue_dots <- x / x
  blue_dots <- log10(blue_dots * quant_data)
  x[is.na(x)] <- 0
  x[x == 1] <- NA
  x[x == 0] <- 1
  quant_data_imp_log10 <-
    log10(quant_data_imp)

  write_debug_file(quant_data_imp_log10)

  red_dots <- quant_data_imp_log10 * x
  count_red <- sum(!is.na(red_dots))
  count_blue <- sum(!is.na(blue_dots))
  ylim_save <- ylim <- c(
    min(red_dots, blue_dots, na.rm = TRUE),
    max(red_dots, blue_dots, na.rm = TRUE)
    )
  show_stripchart <-
    50 > (count_red + count_blue) / length(sample_name_matches)
  if (show_stripchart) {
    boxplot_sub <- "Light blue = data before imputation; Red = imputed data"
  } else {
    boxplot_sub <- ""
  }

  # Vertical plot
  colnames(blue_dots) <- sample_name_matches
  boxplot(
      blue_dots
    , las = 1 # "always horizontal"
    , col = const_boxplot_fill
    , ylim = ylim
    , main = "Peptide intensities before and after imputation"
    , sub = boxplot_sub
    , xlab = "Sample"
    , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)")
    )

  if (show_stripchart) {
    # Points
    # ref: https://r-charts.com/distribution/add-points-boxplot/
    # NA values are not plotted
    stripchart(
      blue_dots,                 # Data
      method = "jitter",          # Random noise
      jitter = const_stripchart_jitter,
      pch = 19,                   # Pch symbols
      cex = const_stripsmall_cex, # Size of symbols reduced
      col = "lightblue",          # Color of the symbol
      vertical = TRUE,            # Vertical mode
      add = TRUE                  # Add it over
      )
    stripchart(
      red_dots,                   # Data
      method = "jitter",          # Random noise
      jitter = const_stripchart_jitter,
      pch = 19,                   # Pch symbols
      cex = const_stripsmall_cex, # Size of symbols reduced
      col = "red",                # Color of the symbol
      vertical = TRUE,            # Vertical mode
      add = TRUE                  # Add it over
      )

  } else {
    # violin plot
    cat("\\leavevmode\n\\quad\n\n\\quad\n\n")
    vioplot::vioplot(
      x = lapply(blue_dots, function(x) x[!is.na(x)]),
      col = "lightblue1",
      side = "left",
      plotCentre = "line",
      ylim = ylim_save,
      main = "Distributions of observed and imputed data",
      sub = "Light blue = observed data; Pink = imputed data",
      xlab = "Sample",
      ylab = latex2exp::TeX("$log_{10}$(peptide intensity)")
      )
    vioplot::vioplot(
      x = lapply(red_dots, function(x) x[!is.na(x)]),
      col = "lightpink1",
      side = "right",
      plotCentre = "line",
      add = T
      )
  }

  par(old_par)

  # density plot
  cat("\\leavevmode\n\n\n\n\n\n\n")
  if (can_plot_before_after_imp) {
    ylim <- c(0, max(d_combined$y, d_original$y, d_imputed$y))
    plot(
      d_combined,
      ylim = ylim,
      sub =
        paste(
          "Blue = data before imputation; Red = imputed data;",
          "Black = combined"
          ),
      main = "Density of peptide intensity before and after imputation",
      xlab = latex2exp::TeX("$log_{10}$(peptide intensity)"),
      ylab = "Probability density"
    )
    lines(d_original, col = "blue")
    lines(d_imputed, col = "red")
  } else {
    cat(
      "There are too few points to plot the density of peptide intensity",
      "before and after imputation."
      )
  }
  cat("\\leavevmode\\newpage\n")
}
```

## Perform Quantile Normalization

<!--
# Apply quantile normalization using preprocessCore::normalize.quantiles
# ---
# tool repository: http://bioconductor.org/packages/release/bioc/html/preprocessCore.html
#   except this: https://support.bioconductor.org/p/122925/#9135989
#   says to install it like this:
#     ```
#     BiocManager::install("preprocessCore", configure.args="--disable-threading", force = TRUE, lib=.libPaths()[1])
#     ```
# conda installation (necessary because of a bug in recent openblas):
#   conda install bioconductor-preprocesscore openblas=0.3.3
# ...
# ---
# normalize.quantiles {preprocessCore}	--  Quantile Normalization
#
# Description:
#   Using a normalization based upon quantiles, this function normalizes a matrix of probe level intensities.
#
# Usage:
#   normalize.quantiles(x, copy = TRUE, keep.names = FALSE)
#
# Arguments:
#
#   - x: A matrix of intensities where each column corresponds to a chip and each row is a probe.
#
#   - copy: Make a copy of matrix before normalizing. Usually safer to work with a copy,
#       but in certain situations not making a copy of the matrix, but instead normalizing
#       it in place will be more memory friendly.
#
#   - keep.names: Boolean option to preserve matrix row and column names in output.
#
# Details:
#   This method is based upon the concept of a quantile-quantile plot extended to n dimensions.
#     No special allowances are made for outliers. If you make use of quantile normalization
#     please cite Bolstad et al, Bioinformatics (2003).
#
#   This functions will handle missing data (ie NA values), based on
#     the assumption that the data is missing at random.
#
#   Note that the current implementation optimizes for better memory usage
#     at the cost of some additional run-time.
#
# Value: A normalized matrix.
#
# Author: Ben Bolstad, bmbolstad.com
#
# References
#
#   - Bolstad, B (2001) Probe Level Quantile Normalization of High Density Oligonucleotide
#       Array Data. Unpublished manuscript http://bmbolstad.com/stuff/qnorm.pdf
#
#   - Bolstad, B. M., Irizarry R. A., Astrand, M, and Speed, T. P. (2003) A Comparison of
#       Normalization Methods for High Density Oligonucleotide Array Data Based on Bias
#       and Variance. Bioinformatics 19(2), pp 185-193. DOI 10.1093/bioinformatics/19.2.185
#       http://bmbolstad.com/misc/normalize/normalize.html
# ...
-->
```{r echo = FALSE, results = 'asis'}
library(preprocessCore)

if (nrow(quant_data_imp) > 0) {
  quant_data_imp_qn <- normalize.quantiles(as.matrix(quant_data_imp))
} else {
  quant_data_imp_qn <- as.matrix(quant_data_imp)
}

quant_data_imp_qn <- as.data.frame(quant_data_imp_qn)
names(quant_data_imp_qn) <- names(quant_data_imp)

write_debug_file(quant_data_imp_qn)

quant_data_imp_qn_log <- log10(quant_data_imp_qn)
rownames(quant_data_imp_qn_log) <- rownames(quant_data_imp)

write_debug_file(quant_data_imp_qn_log)

quant_data_imp_qn_ls <- t(scale(t(log10(quant_data_imp_qn))))

any_nan <- function(x) {
  !any(x == "NaN")
}
sel <- apply(quant_data_imp_qn_ls, 1, any_nan)
quant_data_imp_qn_ls2 <- quant_data_imp_qn_ls[which(sel), ]
quant_data_imp_qn_ls2 <- as.data.frame(quant_data_imp_qn_ls2)
rownames(quant_data_imp_qn_ls2) <- rownames(quant_data_imp)[sel]

quant_data_imp_qn_ls <- as.data.frame(quant_data_imp_qn_ls)
rownames(quant_data_imp_qn_ls) <- rownames(quant_data_imp)

write_debug_file(quant_data_imp_qn_ls)
write_debug_file(quant_data_imp_qn_ls2)

#output quantile normalized data
data_table_imp_qn_lt <- cbind(full_data[1:9], quant_data_imp_qn_log)
write.table(
  data_table_imp_qn_lt,
  file = imp_qn_lt_data_filenm,
  sep = "\t",
  col.names = TRUE,
  row.names = FALSE,
  quote = FALSE
)

```

<!-- ACE insertion begin -->
### Checking that normalized, imputed, log-transformed sample distributions are similar:

```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'}

# Save unimputed quant_data_log for plotting below
unimputed_quant_data_log <- quant_data_log

# log10 transform (after preparing for zero values,
#   which should never happen...)
quant_data_imp_qn[quant_data_imp_qn == 0] <- .000000001
quant_data_log <- log10(quant_data_imp_qn)

# Output with imputed, un-normalized data

data_table_imputed <- cbind(full_data[1:9], quant_data_imp)
write.table(
    data_table_imputed
  , file = imputed_data_filename
  , sep = "\t"
  , col.names = TRUE
  , row.names = FALSE
  , quote = FALSE
  )

how_many_peptides <- nrow(quant_data_log)

if ((how_many_peptides) > 0) {
  cat(
    sprintf(
      "Intensities for %d peptides:\n\n\n",
      how_many_peptides
      )
    )
  cat("\n\n\n")


  # data visualization
  old_par <- par(
    mai = par("mai") + c(0.5, 0, 0, 0)
  , oma = par("oma") + c(0.5, 0, 0, 0)
  )
  # ref: https://r-charts.com/distribution/add-points-boxplot/
  # Vertical plot
  colnames(quant_data_log) <- sample_name_matches
  boxplot(
    quant_data_log
  , las = 1
  , col = const_boxplot_fill
  , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)")
  , xlab = "Sample"
  )
  par(old_par)
} else {
  cat("There are no peptides to plot\n")
}

cat("\n\n\n")

```

```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 4), results = 'asis'}
if (nrow(quant_data_log) > 1) {
  quant_data_log_stack <- stack(quant_data_log)
  ggplot(
    quant_data_log_stack,
    aes(x = values)
    ) + xlab(latex2exp::TeX("$log_{10}$(peptide intensity)")) +
    ylab("Probability density") +
    geom_density(
      aes(group = ind, colour = ind),
      na.rm = TRUE
      )
} else {
  cat("No density plot because there are fewer than two peptides to plot.\n\n")
}
```
```{r echo = FALSE, results = 'asis'}
cat("\\leavevmode\\newpage\n")
```

## Perform ANOVA Filters

```{r, echo = FALSE}
# Make new data frame containing only Phosphopeptides
#   to connect preANOVA to ANOVA (connect_df)
connect_df <- data.frame(
    data_table_imp_qn_lt$Phosphopeptide
  , data_table_imp_qn_lt[, first_data_column]
  )
colnames(connect_df) <- c("Phosphopeptide", "Intensity")
```

```{r echo = FALSE, fig.dim = c(9, 10), results = 'asis'}
count_of_factor_levels <- length(levels(sample_factor_levels))
if (count_of_factor_levels < 2) {
  nuke_control_sequences <-
    function(s) {
      s <- gsub("[\\]", "xyzzy_plugh", s)
      s <- gsub("[$]", "\\\\$", s)
      s <- gsub("xyzzy_plugh", "$\\\\backslash$", s)
      return(s)
    }
  cat(
    "ERROR!!!! Cannot perform ANOVA analysis",
    "(see next page)\\newpage\n"
  )
  cat(
    "ERROR: ANOVA analysis",
    "requires two or more factor levels!\n\n\n"
  )

  cat("\n\n\n\n\n")
  cat("Unparsed sample names are:\n\n\n",
    "\\begin{quote}\n",
    paste(names(quant_data_imp_qn_log), collapse = "\n\n\n"),
    "\n\\end{quote}\n\n")

  regex_sample_names <- nuke_control_sequences(regex_sample_names)

  cat("\\leavevmode\n\n\n")
  cat("Parsing rule for SampleNames is",
    "\n\n\n",
    "\\text{'",
    regex_sample_names,
    "'}\n\n\n",
    sep = ""
    )

  cat("\nParsed sample names are:\n",
    "\\begin{quote}\n",
    paste(sample_name_matches, collapse = "\n\n\n"),
    "\n\\end{quote}\n\n")

  regex_sample_grouping <- nuke_control_sequences(regex_sample_grouping)

  cat("\\leavevmode\n\n\n")
  cat("Parsing rule for SampleGrouping is",
    "\n\n\n",
    "\\text{'",
    regex_sample_grouping,
    "'}\n\n\n",
    sep = ""
    )

  cat("\n\n\n")
  cat("Sample group assignments are:\n",
    "\\begin{quote}\n",
    paste(regmatches(sample_name_matches, m2), collapse = "\n\n\n"),
    "\n\\end{quote}\n\n")

} else {
  p_value_data_anova_ps <-
    apply(
      quant_data_imp_qn_log,
      1,
      anova_func,
      grouping_factor = sample_factor_levels
      )

  p_value_data_anova_ps_fdr <-
    p.adjust(p_value_data_anova_ps, method = "fdr")
  p_value_data <- data.frame(
    phosphopeptide = full_data[, 1]
    ,
    raw_anova_p = p_value_data_anova_ps
    ,
    fdr_adjusted_anova_p = p_value_data_anova_ps_fdr
  )

  # output ANOVA file to constructed filename,
  #   e.g.    "Outputfile_pST_ANOVA_STEP5.txt"
  #   becomes "Outpufile_pST_ANOVA_STEP5_FDR0.05.txt"

  # Re-output datasets to include p-values
  write.table(
    cbind(full_data[1:9], p_value_data[, 2:3], quant_data_imp),
    file = imputed_data_filename,
    sep = "\t",
    col.names = TRUE,
    row.names = FALSE,
    quote = FALSE
    )

  write.table(
    cbind(full_data[1:9], p_value_data[, 2:3], quant_data_imp_qn_log),
    file = imp_qn_lt_data_filenm,
    sep = "\t",
    col.names = TRUE,
    row.names = FALSE,
    quote = FALSE
    )


  p_value_data <-
    p_value_data[order(p_value_data$fdr_adjusted_anova_p), ]

  first_page_suppress <- 1
  number_of_peptides_found <- 0
  cutoff <- val_fdr[1]
  for (cutoff in val_fdr) {
    if (number_of_peptides_found > 49) {
      cat("\\leavevmode\n\n\n")
      break
      }

    #loop through FDR cutoffs

    filtered_p <-
      p_value_data[
        which(p_value_data$fdr_adjusted_anova_p < cutoff),
        , drop = FALSE
        ]
    filtered_data_filtered <-
      quant_data_imp_qn_log[
        rownames(filtered_p),
        , drop = FALSE
        ]
    filtered_data_filtered <-
      filtered_data_filtered[
        order(filtered_p$fdr_adjusted_anova_p),
        , drop = FALSE
        ]

    # <!-- ACE insertion start -->

    if (nrow(filtered_p) && nrow(filtered_data_filtered) > 0) {
      if (first_page_suppress == 1) {
        first_page_suppress <- 0
        }
        else {
          cat("\\newpage\n")
        }
      cat(sprintf(
        "Intensities for %d peptides whose adjusted p-value < %0.2f:\n",
        nrow(filtered_data_filtered),
        cutoff
      ))
      cat("\n\n\n")
      cat("\n\n\n")

      old_oma <- par("oma")
      old_par <- par(
        mai = (par("mai") + c(0.7, 0, 0, 0)) * c(1, 1, 0.3, 1),
        oma = old_oma * c(1, 1, 0.3, 1),
        cex.main = 0.9,
        cex.axis = 0.7,
        fin = c(9, 7.25)
        )
      # ref: https://r-charts.com/distribution/add-points-boxplot/
      # Vertical plot
      colnames(filtered_data_filtered) <- sample_name_matches
      boxplot(
        filtered_data_filtered,
        main = "Imputed, normalized intensities", # no line plot
        las = 1,
        col = const_boxplot_fill,
        ylab = latex2exp::TeX("$log_{10}$(peptide intensity)")
      )
      par(old_par)
    } else {
      cat(sprintf(
        "%s < %0.2f\n\n\n\n\n",
        "No peptides were found to have cutoff adjusted p-value",
        cutoff
      ))
    }

    if (nrow(filtered_data_filtered) > 0) {
      #Add Phosphopeptide column to anova_filtered table
      anova_filtered_merge <- merge(
        x = connect_df,
        y = filtered_data_filtered,
        by.x = "Intensity",
        by.y = 1
      )
      anova_filtered_merge_order <- rownames(filtered_p)

      anova_filtered_merge_format <- sapply(
        X = filtered_p$fdr_adjusted_anova_p
        ,
        FUN = function(x) {
          if (x > 0.0001)
            paste0("(%0.", 1 + ceiling(-log10(x)), "f) %s")
          else
            paste0("(%0.4e) %s")
        }
      )

      anova_filtered <- data.table(
        anova_filtered_merge$Phosphopeptide
        ,
        anova_filtered_merge$Intensity
        ,
        anova_filtered_merge[, 2:number_of_samples + 1]
      )
      colnames(anova_filtered) <-
        c("Phosphopeptide", colnames(filtered_data_filtered))

      # Merge qualitative columns into the ANOVA data
      output_table <- data.frame(anova_filtered$Phosphopeptide)
      output_table <- merge(
        x = output_table,
        y = data_table_imp_qn_lt,
        by.x = "anova_filtered.Phosphopeptide",
        by.y = "Phosphopeptide"
      )

      # Produce heatmap to visualize significance and the effect of imputation
      m <-
        as.matrix(unimputed_quant_data_log[anova_filtered_merge_order, ])
      m_nan_rows <- rowSums(
        matrix(
          as.integer(is.na(m)),
          nrow = nrow(m)
          )
        )
      m <- m[!m_nan_rows, , drop = FALSE]
      if (nrow(m) > 0) {
        rownames_m <- rownames(m)
        rownames(m) <- sapply(
          X = seq_len(nrow(m))
          ,
          FUN = function(i) {
            sprintf(
              anova_filtered_merge_format[i],
              filtered_p$fdr_adjusted_anova_p[i],
              rownames_m[i]
            )
          }
        )
        how_many_peptides <- min(50, nrow(m))
        number_of_peptides_found <- how_many_peptides
        if (nrow(m) > 1) {
          m_margin <- m[how_many_peptides:1, ]
          margins <-
            c(max(nchar(colnames(m_margin))) * 10 / 16 # col
              , max(nchar(rownames(m_margin))) * 5 / 16 # row
              )
          }

        cat("\\newpage\n")
        if (nrow(m) > 50) {
          cat("Heatmap for the 50 most-significant peptides",
            sprintf(
              "whose adjusted p-value < %0.2f\n",
              cutoff)
          )
        } else {
          if (nrow(m) == 1) {
            next
          } else {
            cat(
              sprintf("Heatmap for %d usable peptides whose", nrow(m)),
              sprintf("adjusted p-value < %0.2f\n", cutoff)
            )
          }
        }
        cat("\n\n\n")
        cat("\n\n\n")
        try(
          if (nrow(m) > 1) {
            old_oma <- par("oma")
            par(cex.main = 0.6)
            heatmap(
              m[how_many_peptides:1, ],
              Rowv = NA,
              Colv = NA,
              cexRow = 0.7,
              cexCol = 0.8,
              scale = "row",
              margins = margins,
              main =
                "Unimputed, unnormalized intensities",
              xlab = "",
              las = 1 #, fin = c(9, 5.5)
              )
          }
        )
      }
    }
  }
}
cat("\\leavevmode\n\n\n")
```
author	eschen42
date	Wed, 13 Apr 2022 19:48:32 +0000
parents	2c5f1a2fe16a
children	61adb8801b73