Mercurial > repos > eschen42 > mqppep_anova

diff mqppep_anova_script.Rmd @ 13:b41a077af3aa draft
"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit 040e4945da00a279cb60daae799fce9489f99c50"
author: eschen42
date: Tue, 22 Mar 2022 20:47:40 +0000
parents: 4deacfee76ef
children: 6679616d0c18
--- a/mqppep_anova_script.Rmd	Tue Mar 15 18:17:55 2022 +0000
+++ b/mqppep_anova_script.Rmd	Tue Mar 22 20:47:40 2022 +0000
@@ -1,24 +1,50 @@
 ---
 title: "MaxQuant Phospho-Proteomic Enrichment Pipeline ANOVA"
 author: "Larry Cheng; Art Eschenlauer"
-date: "May 28, 2018; Nov 16, 2021"
+date: "May 28, 2018; Mar 16, 2022"
 output:
-  pdf_document: default
+  pdf_document:
+    toc: true
+  latex_document:
+    toc: true
 params:
   inputFile: "test-data/test_input_for_anova.tabular"
   alphaFile: "test-data/alpha_levels.tabular"
   firstDataColumn: "Intensity"
   imputationMethod: !r c("group-median", "median", "mean", "random")[1]
   meanPercentile: 1
-  sdPercentile: 0.2
+  sdPercentile: 1.0
   regexSampleNames: "\\.\\d+[A-Z]$"
   regexSampleGrouping: "\\d+"
-  imputedDataFilename: "Upstream_Map_pST_outputfile_STEP4_QN_LT.txt"
+  imputedDataFilename: "test-data/imputedDataFilename.txt"
+  imputedQNLTDataFile: "test-data/imputedQNLTDataFile.txt"
+  show_toc: true
 ---
+<!--
+  latex_document: default
+  inputFile: "test-data/test_input_for_anova.tabular"
+  inputFile: "test-data/density_failure.preproc_tab.tabular"
+  inputFile: "test-data/UT_Phospho_STY_Sites.preproc_tab"
+date: "May 28, 2018; Mar 16, 2022"
+-->
 ```{r setup, include = FALSE}
 # ref for parameterizing Rmd document: https://stackoverflow.com/a/37940285
 knitr::opts_chunk$set(echo = FALSE, fig.dim = c(9, 10))
 
+# freeze the random number generator so the same results will be produced
+#  from run to run
+set.seed(28571)
+
+### CONSTANTS
+
+const_parfin <- par("fin")
+const_boxplot_fill <- "grey94"
+const_stripchart_cex <- 0.5
+const_stripsmall_cex <-
+  sqrt(const_stripchart_cex * const_stripchart_cex / 2)
+const_stripchart_jitter <- 0.3
+const_write_debug_files <- FALSE
+
 ### FUNCTIONS
 
 #ANOVA filter function
@@ -27,9 +53,155 @@
   pvalue <- summary(x_aov)[[1]][["Pr(>F)"]][1]
   pvalue
 }
+
+write_debug_file <- function(s) {
+  if (const_write_debug_files) {
+    s_path <- sprintf("test-data/%s.txt", deparse(substitute(s)))
+    write.table(
+      s,
+      file = s_path,
+      sep = "\t",
+      col.names = TRUE,
+      row.names = TRUE,
+      quote = FALSE
+    )
+  }
+}
+
+latex_collapsed_vector <- function(collapse_string, v) {
+  cat(
+    paste0(
+      gsub("_", "\\\\_", v),
+      collapse = collapse_string
+      )
+    )
+}
+
+latex_itemized_collapsed <- function(collapse_string, v) {
+  cat("\\begin{itemize}\n\\item ")
+  latex_collapsed_vector(collapse_string, v)
+  cat("\n\\end{itemize}\n")
+}
+
+latex_itemized_list <- function(v) {
+  latex_itemized_collapsed("\n\\item ", v)
+}
+
+latex_enumerated_collapsed <- function(collapse_string, v) {
+  cat("\\begin{enumerate}\n\\item ")
+  latex_collapsed_vector(collapse_string, v)
+  cat("\n\\end{enumerate}\n")
+}
+
+latex_enumerated_list <- function(v) {
+  latex_enumerated_collapsed("\n\\item ", v)
+}
+
+latex_table_row <- function(v) {
+  latex_collapsed_vector(" & ", v)
+  cat(" \\\\\n")
+}
+
+# Use this like print.data.frame, from which it is adapted:
+print_data_frame_latex <-
+  function(
+    x,
+    ...,
+    # digits to pass to format.data.frame
+    digits = NULL,
+    # TRUE -> right-justify columns; FALSE -> left-justify
+    right = TRUE,
+    # maximumn number of rows to print
+    max = NULL,
+    # string with justification of each column
+    justification = NULL,
+    # TRUE to center on page
+    centered = FALSE,
+    # optional capttion
+    caption = NULL,
+    # h(inline); b(bottom); t (top) or p (separate page)
+    anchor = "h"
+  ) {
+    if (is.null(justification))
+      justification <-
+        Reduce(
+          f = paste,
+          x = rep_len(if (right) "r" else "l", length(colnames(x)))
+          )
+    n <- length(rownames(x))
+    if (length(x) == 0L) {
+      cat(
+        sprintf(
+          # if n is one, use singular 'row', else use plural 'rows'
+          ngettext(
+            n,
+            "data frame with 0 columns and %d row",
+            "data frame with 0 columns and %d rows"
+            ),
+          n
+          ),
+        "\n",
+        sep = ""
+        )
+    }
+    else if (n == 0L) {
+      cat("0 rows for:\n")
+      latex_itemized_list(names(x))
+    }
+    else {
+      if (is.null(max))
+        max <- getOption("max.print", 99999L)
+      if (!is.finite(max))
+        stop("invalid 'max' / getOption(\"max.print\"): ",
+          max)
+      omit <- (n0 <- max %/% length(x)) < n
+      m <- as.matrix(
+        format.data.frame(
+          if (omit) x[seq_len(n0), , drop = FALSE] else x,
+          digits = digits,
+          na.encode = FALSE
+          )
+        )
+      cat(
+        # h(inline); b(bottom); t (top) or p (separate page)
+        paste0("\\begin{table}[", anchor, "]\n")
+        )
+      if (!is.null(caption))
+        cat(paste0(" \\caption{", caption, "}"))
+      if (centered) cat("\\centering\n")
+      cat(
+        paste(
+          " \\begin{tabular}{",
+          justification,
+          "}\n",
+          sep = ""
+          )
+        )
+      if (!is.null(caption))
+        cat(" \\hline\\hline\n")
+      latex_table_row(colnames(m))
+      cat("\\hline\n")
+      for (i in seq_len(length(m[, 1]))) {
+        latex_table_row(m[i, ])
+      }
+      cat(
+        paste(
+          " \\end{tabular}",
+          "\\end{table}",
+          sep = "\n"
+          )
+        )
+      if (omit)
+        cat(" [ reached 'max' / getOption(\"max.print\") -- omitted",
+          n - n0, "rows ]\n")
+    }
+    invisible(x)
+  }
+
 ```
 
 ## Purpose:
+
 Perform imputation of missing values, quantile normalization, and ANOVA.
 
 <!--
@@ -58,6 +230,7 @@
 
 #Imputed Data filename
 imputed_data_filename <- params$imputedDataFilename
+imp_qn_lt_data_filenm <- params$imputedQNLTDataFile
 
 #ANOVA data filename
 ```
@@ -78,7 +251,7 @@
 regex_sample_names <- params$regexSampleNames
 
 # Regular expression to extract Sample Grouping from Sample Name;
-#   if error occurs, compare sample_factor_levels and temp_matches
+#   if error occurs, compare sample_factor_levels and sample_name_matches
 #   to see if groupings/pairs line up
 #   e.g., "(\\d+)"
 regex_sample_grouping <- params$regexSampleGrouping
@@ -101,25 +274,32 @@
   )
 ```
 
-### Column names from input file
-
-```{r echo = FALSE, results = 'markup'}
-print(colnames(full_data))
-data_column_indices <- grep(first_data_column, names(full_data), perl = TRUE)
-cat(sprintf("First data column:  %d\n", min(data_column_indices)))
-cat(sprintf("Last data column:   %d\n", max(data_column_indices)))
-```
+### Parse column names, sample names, and factor levels from input file
 
 ```{r echo = FALSE, results = 'asis'}
-cat("\\newpage\n")
-```
+# Write column naames as an enumerated list.
+column_name_df <- data.frame(
+  column = seq_len(length(colnames(full_data))),
+  name = colnames(full_data)
+  )
+print_data_frame_latex(
+  x = column_name_df,
+  justification = "l l",
+  centered = TRUE,
+  caption = "Input data column name",
+  anchor = "h"
+  )
 
-### Checking that log-transformed sample distributions are similar:
-
-```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'}
+data_column_indices <- grep(first_data_column, names(full_data), perl = TRUE)
+cat(
+  sprintf(
+    "\n\nData columns: [%d,%d]\n\n",
+    min(data_column_indices),
+    max(data_column_indices)
+    )
+  )
 
 if (FALSE == fdc_is_integer) {
-
   if (length(data_column_indices) > 0) {
     first_data_column <- data_column_indices[1]
   } else {
@@ -127,45 +307,114 @@
   }
 }
 
-quant_data0 <- full_data[first_data_column:length(full_data)]
+```
+
+```{r echo = FALSE, results = 'asis'}
+#```{r echo = FALSE, results = 'asis'}
 quant_data <- full_data[first_data_column:length(full_data)]
+quant_data[quant_data == 0] <- NA
+rownames(quant_data) <- full_data$Phosphopeptide
+# Get factors -> group replicates (as indicated by terminal letter)
+#   by the preceding digits;
+# Assuming that regex_sample_names <- "\\.(\\d+)[A-Z]$"
+#   get factors ->
+#      group runs (samples) by ignoring terminal [A-Z] in sample names
+#   e.g.
+#     group .1A .1B .1C into group 1;
+#     group .2A .2B .2C, into group 2;
+#     etc.
+m <- regexpr(regex_sample_names, colnames(quant_data), perl = TRUE)
+sample_name_matches <- regmatches(names(quant_data), m)
+colnames(quant_data) <- sample_name_matches
+
+write_debug_file(quant_data)
+
+m2 <- regexpr(regex_sample_grouping, sample_name_matches, perl = TRUE)
+sample_factor_levels <- as.factor(regmatches(sample_name_matches, m2))
+number_of_samples <- length(sample_name_matches)
+sample_factor_df <- data.frame(
+  sample = sample_name_matches,
+  level = sample_factor_levels
+  )
+print_data_frame_latex(
+  x = sample_factor_df,
+  justification = "c c",
+  centered = TRUE,
+  caption = "Factor level",
+  anchor = "h"
+  )
+```
+```{r echo = FALSE, results = 'asis'}
+cat("\\newpage\n")
+```
+
+### Are the log-transformed sample distributions similar?
+
+```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'}
+
 quant_data[quant_data == 0] <- NA  #replace 0 with NA
 quant_data_log <- log10(quant_data)
 
-rownames(quant_data_log) <- full_data$Phosphopeptide
+rownames(quant_data_log) <- rownames(quant_data)
+colnames(quant_data_log) <- sample_name_matches
+
+write_debug_file(quant_data_log)
 
 # data visualization
 old_par <- par(
   mai = par("mai") + c(0.5, 0, 0, 0)
 )
+# ref: https://r-charts.com/distribution/add-points-boxplot/
+# Vertical plot
 boxplot(
   quant_data_log
-, las = 2
+, las = 1
+, col = const_boxplot_fill
 )
+# Points
+stripchart(
+  quant_data_log, # Data
+  method = "jitter",          # Random noise
+  jitter = const_stripchart_jitter,
+  pch = 19,                   # Pch symbols
+  cex = const_stripchart_cex, # Size of symbols reduced
+  col = "goldenrod",          # Color of the symbol
+  vertical = TRUE,            # Vertical mode
+  add = TRUE                  # Add it over
+  )
 par(old_par)
 
 
 
-cat("\\newline\n")
-cat("\\newline\n")
+cat("\n\n\n")
+cat("\n\n\n")
 
 ```
 
-```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 4), warning = FALSE}
-quant_data_log_stack <- stack(quant_data_log)
+```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 4), results = 'asis'}
 library(ggplot2)
-ggplot(
-  quant_data_log_stack,
-  aes(x = values)) + geom_density(aes(group = ind, colour = ind))
+if (nrow(quant_data_log) > 1) {
+  quant_data_log_stack <- stack(quant_data_log)
+  ggplot(
+    quant_data_log_stack,
+    aes(x = values)
+    ) +
+    geom_density(
+      aes(group = ind, colour = ind),
+      na.rm = TRUE
+    )
+} else {
+  cat("No density plot because there are too few peptides.\n\n")
+}
 ```
 
-### Globally, are phosphopeptide intensities are approximately unimodal?
+### Globally, are peptide intensities are approximately unimodal?
 
 <!--
 # ref for bquote below particularly and plotting math expressions generally:
 #   https://www.r-bloggers.com/2018/03/math-notation-for-r-plot-titles-expression-and-bquote/
 -->
-```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 5)}
+```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 5), results = 'asis'}
 
 # identify the location of missing values
 fin <- is.finite(as.numeric(as.matrix(quant_data_log)))
@@ -185,92 +434,63 @@
 )
 ```
 
-### Distribution of standard deviations of phosphopeptides, ignoring missing values:
+### Distribution of standard deviations of $log_{10}(\text{intensity})$, ignoring missing values:
 
-```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 5)}
+```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 5), results = 'asis'}
 # determine quantile
 q1 <- quantile(logvalues, probs = mean_percentile)[1]
 
 # determine standard deviation of quantile to impute
 sd_finite <- function(x) {
   ok <- is.finite(x)
-  sd(x[ok]) * sd_percentile
+  sd(x[ok])
 }
 # 1 = row of matrix (ie, phosphopeptide)
 sds <- apply(quant_data_log, 1, sd_finite)
-plot(
-  density(sds, na.rm = T)
-, main = "Smoothed estimated probability density vs. std. deviation"
-, sub = "(probability estimation made with Gaussian smoothing)"
-)
-
-m1 <- median(sds, na.rm = T) #sd to be used is the median sd
+if (sum(!is.na(sds)) > 2) {
+  plot(
+    density(sds, na.rm = T)
+  , main = "Smoothed estimated probability density vs. std. deviation"
+  , sub = "(probability estimation made with Gaussian smoothing)"
+  )
+} else {
+  cat(
+    "At least two non-missing values are required to plot",
+    "probability density.\n\n"
+    )
+}
 
 ```
 
-
-
-<!--
-The number of missing values are:
--->
 ```{r echo = FALSE}
-#Determine number of cells to impute
+# Determine number of cells to impute
 temp <- quant_data[is.na(quant_data)]
 
-#Determine number of values to impute
+# Determine number of values to impute
 number_to_impute <- length(temp)
-```
-
-<!--
-% of values that are missing:
--->
-```{r echo = FALSE}
-pct_missing_values <- length(temp) / (length(logvalues) + length(temp)) * 100
-```
 
-<!--
-First few rows of data before imputation:
--->
-```{r echo = FALSE, results = 'asis'}
-cat("\\newpage\n")
+# Determine percent of missing values
+pct_missing_values <-
+  round(length(temp) / (length(logvalues) + length(temp)) * 100)
 ```
 
-## Parse sample names
-
-Parse the names of the samples to deduce the factor level for each sample:
-
 ```{r echo = FALSE}
 
 # prep for trt-median based imputation
 
-# Assuming that regex_sample_names <- "\\.(\\d+)[A-Z]$"
-#   get factors ->
-#      group runs (samples) by ignoring terminal [A-Z] in sample names
-
-m <- regexpr(regex_sample_names, names(quant_data), perl = TRUE)
-temp_matches <- regmatches(names(quant_data), m)
-print("Extracted sample names")
-print(temp_matches)
-m2 <- regexpr(regex_sample_grouping, temp_matches, perl = TRUE)
-sample_factor_levels <- as.factor(regmatches(temp_matches, m2))
-print("Factor levels")
-print(sample_factor_levels)
-
 ```
-## Impute missing values
+## Impute Missing Values
 
 ```{r echo = FALSE}
 
+imp_smry_potential_before <- length(logvalues)
+imp_smry_missing_before   <- number_to_impute
+imp_smry_pct_missing      <- pct_missing_values
+
+```
+
+```{r echo = FALSE}
 #Determine number of cells to impute
-cat("Before imputation,",
-  sprintf(
-    "there are:\n  %d peptides\n  %d missing values (%2.0f%s)",
-    sum(rep.int(TRUE, nrow(quant_data))),
-    sum(is.na(quant_data)),
-    pct_missing_values,
-    "%"
-    )
-)
 
 ```
 ```{r echo = FALSE}
@@ -282,15 +502,16 @@
 ind <- which(is.na(quant_data_imp), arr.ind = TRUE)
 
 ```
-```{r echo = FALSE}
+```{r echo = FALSE, results = 'asis'}
 
 # Apply imputation
 switch(
   imputation_method
 , "group-median" = {
-    cat("Imputation method:\n   substitute missing value",
-      "with median peptide-intensity for sample-group\n")
-
+    imputation_method_description <-
+      paste("Substitute missing value with",
+        "median peptide intensity for sample group\n"
+        )
     sample_level_integers <- as.integer(sample_factor_levels)
     for (i in seq_len(length(levels(sample_factor_levels)))) {
       level_cols <- i == sample_level_integers
@@ -301,47 +522,90 @@
     good_rows <- !is.na(rowMeans(quant_data_imp))
   }
 , "median" = {
-    cat("Imputation method:\n   substitute missing value with",
-      "median peptide-intensity across all sample classes\n")
+    imputation_method_description <-
+      paste("Substitute missing value with",
+        "median peptide intensity across all sample classes\n"
+        )
     quant_data_imp[ind] <- apply(quant_data_imp, 1, median, na.rm = T)[ind[, 1]]
     good_rows <- !is.na(rowMeans(quant_data_imp))
   }
 , "mean" = {
-    cat("Imputation method:\n   substitute missing value with",
-      "mean peptide-intensity across all sample classes\n")
+    imputation_method_description <-
+      paste("Substitute missing value with",
+        "mean peptide intensity across all sample classes\n"
+        )
     quant_data_imp[ind] <- apply(quant_data_imp, 1, mean, na.rm = T)[ind[, 1]]
     good_rows <- !is.na(rowMeans(quant_data_imp))
   }
 , "random" = {
-    cat(
-      "Imputation method:\n   substitute missing value with\n  ",
-      sprintf(
-        "random intensity N ~ (%0.2f, %0.2f)\n"
-      , q1, m1
-      )
-    )
-    quant_data_imp[is.na(quant_data_imp)] <-
+    m1 <- median(sds, na.rm = T) * sd_percentile #sd to be used is the median sd
+    # If you want results to be reproducible, you will want to call
+    #   base::set.seed before calling stats::rnorm
+    imputation_method_description <-
+      paste("Substitute each missing value with random intensity",
+        sprintf(
+          "random intensity $N \\sim (%0.2f, %0.2f)$\n",
+          q1, m1
+          )
+        )
+    cat(sprintf("mean_percentile (from input parameter) is %2.0f\n\n",
+      100 * mean_percentile))
+    cat(sprintf("sd_percentile (from input parameter) is %0.2f\n\n",
+      sd_percentile))
+    #ACE cat(sprintf("sd for rnorm is %0.4f\n\n", m1))
+    quant_data_imp[ind] <-
       10 ^ rnorm(number_to_impute, mean = q1, sd = m1)
     good_rows <- !is.na(rowMeans(quant_data_imp))
   }
 )
 
+if (length(good_rows) < 1) {
+  print("ERROR: Cannot impute data; there are no good rows!")
+  return(-1)
+  }
 ```
+
+```{r echo = FALSE, results = 'asis'}
+cat("\\quad\n\nImputation method:\n\n\n", imputation_method_description)
+```
+
 ```{r echo = FALSE}
 
-#Determine number of cells to impute
-temp <- quant_data_imp[is.na(quant_data_imp)]
-cat("After imputation, there are:",
+imp_smry_potential_after <- sum(good_rows)
+imp_smry_rejected_after  <- sum(!good_rows)
+imp_smry_missing_after   <- sum(is.na(quant_data_imp[good_rows, ]))
+```
+```{r echo = FALSE, results = 'asis'}
+# ref: http://www1.maths.leeds.ac.uk/latex/TableHelp1.pdf
+tabular_lines <- paste(
+  "\\begin{table}[hb]", # h(inline); b(bottom); t (top) or p (separate page)
+  " \\caption{Imputation Results}",
+  " \\centering", # \centering centers the table on the page
+  " \\begin{tabular}{l c c c}",
+  "  \\hline\\hline",
+  "  \\  & potential peptides   & missing values & rejected",
+  "    peptides \\\\ [0.5ex]",
+  "  \\hline",
+  "  before imputation & %d     & %d (%d\\%s)    &    \\\\",
+  "  after imputation  & %d     & %d             & %d \\\\ [1ex]",
+  "  \\hline",
+  " \\end{tabular}",
+  #" \\label{table:nonlin}", # may be used to refer this table in the text
+  "\\end{table}",
+  sep = "\n"
+  )
+tabular_lines <-
   sprintf(
-    "\n  %d missing values\n  %d usable peptides analysis"
-  , sum(is.na(quant_data_imp[good_rows, ]))
-  , sum(good_rows)
-  ),
-  sprintf(
-    "\n  %d peptides with too many missing values for further analysis"
-  , sum(!good_rows)
-  )
-)
+    tabular_lines,
+    imp_smry_potential_before,
+    imp_smry_missing_before,
+    imp_smry_pct_missing,
+    "%",
+    imp_smry_potential_after,
+    imp_smry_missing_after,
+    imp_smry_rejected_after
+    )
+cat(tabular_lines)
 ```
 ```{r echo = FALSE}
 
@@ -349,27 +613,51 @@
 # Zap rows where imputation was ineffective
 full_data         <- full_data        [good_rows, ]
 quant_data        <- quant_data       [good_rows, ]
-quant_data_imp <- quant_data_imp[good_rows, ]
+
+write_debug_file(quant_data_imp)
 
+quant_data_imp <- quant_data_imp[good_rows, ]
+quant_data_imp_good_rows <- quant_data_imp
+
+write_debug_file(quant_data_imp_good_rows)
 ```
-```{r echo = FALSE}
+
+```{r echo = FALSE, results = 'asis'}
 
-d_combined <- (density(as.numeric(as.matrix(
-  log10(quant_data_imp)
-))))
+can_plot_before_after_imp <- TRUE
+d_combined <-
+  as.numeric(
+    as.matrix(
+      log10(quant_data_imp)
+      )
+    )
 d_original <-
-  density(as.numeric(as.matrix(
-    log10(quant_data_imp[!is.na(quant_data)]))))
+  as.numeric(
+    as.matrix(
+      log10(quant_data_imp[!is.na(quant_data)])
+      )
+    )
 
-```
-```{r echo = FALSE}
+if (sum(!is.na(d_combined)) > 2 && sum(!is.na(d_original)) > 2) {
+  d_combined <- density(d_combined)
+  d_original <- density(d_original)
+} else {
+  can_plot_before_after_imp <- FALSE
+}
 
 if (sum(is.na(quant_data)) > 0) {
   # There ARE missing values
   d_imputed <-
-    (density(as.numeric(as.matrix(
-      log10(quant_data_imp[is.na(quant_data)])
-    ))))
+    as.numeric(
+      as.matrix(
+        log10(quant_data_imp[is.na(quant_data)])
+        )
+      )
+  if (sum(!is.na(d_combined)) > 2) {
+    d_imputed <- (density(d_imputed))
+  } else {
+    can_plot_before_after_imp <- FALSE
+  }
 } else {
   # There are NO missing values
   d_imputed <- d_combined
@@ -377,16 +665,92 @@
 
 ```
 
-```{r echo = FALSE, fig.dim = c(9, 5)}
-ylim <- c(0, max(d_combined$y, d_original$y, d_imputed$y))
-plot(
-  d_combined,
-  ylim = ylim,
-  sub = "Blue = data before imputation; Red = imputed data",
-  main = "Density vs. log10(intensity) before and after imputation"
-)
-lines(d_original, col = "blue")
-lines(d_imputed, col = "red")
+```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'}
+if (sum(is.na(quant_data)) > 0) {
+  cat("\\leavevmode\\newpage\n")
+  # data visualization
+  old_par <- par(
+    mai = par("mai") + c(0.5, 0, 0, 0)
+  )
+  x <- quant_data
+  x <- blue_dots <- x / x
+  blue_dots <- log10(blue_dots * quant_data)
+  x[is.na(x)] <- 0
+  x[x == 1] <- NA
+  x[x == 0] <- 1
+  quant_data_imp_log10 <-
+    log10(quant_data_imp)
+
+  write_debug_file(quant_data_imp_log10)
+
+  red_dots <- quant_data_imp_log10 * x
+  ylim <- c(
+    min(red_dots, blue_dots, na.rm = TRUE),
+    max(red_dots, blue_dots, na.rm = TRUE)
+    )
+  # ref: https://r-charts.com/distribution/add-points-boxplot/
+  # Vertical plot
+  colnames(blue_dots) <- sample_name_matches
+  boxplot(
+      blue_dots
+    , las = 1 # "always horizontal"
+    , col = const_boxplot_fill
+    , ylim = ylim
+    , main = "Peptide intensities before and after imputation"
+    , sub = "Light blue = data before imputation; Red = imputed data"
+    , xlab = "Sample"
+    , ylab = "log10(peptide intensity)"
+    )
+  # Points
+  # NA values are not plotted
+  stripchart(
+    blue_dots,                 # Data
+    method = "jitter",          # Random noise
+    jitter = const_stripchart_jitter,
+    pch = 19,                   # Pch symbols
+    cex = const_stripsmall_cex, # Size of symbols reduced
+    col = "lightblue",          # Color of the symbol
+    vertical = TRUE,            # Vertical mode
+    add = TRUE                  # Add it over
+    )
+  stripchart(
+    red_dots,                   # Data
+    method = "jitter",          # Random noise
+    jitter = const_stripchart_jitter,
+    pch = 19,                   # Pch symbols
+    cex = const_stripsmall_cex, # Size of symbols reduced
+    col = "red",                # Color of the symbol
+    vertical = TRUE,            # Vertical mode
+    add = TRUE                  # Add it over
+    )
+  par(old_par)
+
+  # density plot
+  cat("\\leavevmode\n\n\n\n\n\n\n")
+  if (can_plot_before_after_imp) {
+    ylim <- c(0, max(d_combined$y, d_original$y, d_imputed$y))
+    plot(
+      d_combined,
+      ylim = ylim,
+      sub =
+        paste(
+          "Blue = data before imputation; Red = imputed data;",
+          "Black = combined"
+          ),
+      main = "Density of peptide intensity before and after imputation",
+      xlab = "log10(peptide intensity)",
+      ylab = "Probability density"
+    )
+    lines(d_original, col = "blue")
+    lines(d_imputed, col = "red")
+  } else {
+    cat(
+      "There are too few points to plot the density of peptide intensity",
+      "before and after imputation."
+      )
+  }
+  cat("\\leavevmode\\newpage\n")
+}
 ```
 
 ## Perform Quantile Normalization
@@ -448,10 +812,10 @@
 #       http://bmbolstad.com/misc/normalize/normalize.html
 # ...
 -->
-```{r echo = FALSE}
+```{r echo = FALSE, results = 'asis'}
 library(preprocessCore)
 
-if (TRUE) {
+if (nrow(quant_data_imp) > 0) {
   quant_data_imp_qn <- normalize.quantiles(as.matrix(quant_data_imp))
 } else {
   quant_data_imp_qn <- as.matrix(quant_data_imp)
@@ -459,28 +823,39 @@
 
 quant_data_imp_qn <- as.data.frame(quant_data_imp_qn)
 names(quant_data_imp_qn) <- names(quant_data_imp)
-quant_data_imp_qn_log <- log10(quant_data_imp_qn)
+
+write_debug_file(quant_data_imp_qn)
 
-rownames(quant_data_imp_qn_log) <- full_data[, 1]
+quant_data_imp_qn_log <- log10(quant_data_imp_qn)
+rownames(quant_data_imp_qn_log) <- rownames(quant_data_imp)
+
+write_debug_file(quant_data_imp_qn_log)
 
 quant_data_imp_qn_ls <- t(scale(t(log10(quant_data_imp_qn))))
+
 any_nan <- function(x) {
   !any(x == "NaN")
 }
 sel <- apply(quant_data_imp_qn_ls, 1, any_nan)
 quant_data_imp_qn_ls2 <- quant_data_imp_qn_ls[which(sel), ]
 quant_data_imp_qn_ls2 <- as.data.frame(quant_data_imp_qn_ls2)
+rownames(quant_data_imp_qn_ls2) <- rownames(quant_data_imp)[sel]
+
+quant_data_imp_qn_ls <- as.data.frame(quant_data_imp_qn_ls)
+rownames(quant_data_imp_qn_ls) <- rownames(quant_data_imp)
+
+write_debug_file(quant_data_imp_qn_ls)
+write_debug_file(quant_data_imp_qn_ls2)
 
 #output quantile normalized data
 data_table_imp_qn_lt <- cbind(full_data[1:9], quant_data_imp_qn_log)
 write.table(
   data_table_imp_qn_lt,
-  file = paste(paste(
-    strsplit(imputed_data_filename, ".txt"), "QN_LT", sep = "_"
-  ), ".txt", sep = ""),
+  file = imp_qn_lt_data_filenm,
   sep = "\t",
   col.names = TRUE,
-  row.names = FALSE
+  row.names = FALSE,
+  quote = FALSE
 )
 
 ```
@@ -490,7 +865,6 @@
 
 ```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'}
 
-
 # Save unimputed quant_data_log for plotting below
 unimputed_quant_data_log <- quant_data_log
 
@@ -499,10 +873,9 @@
 quant_data_imp_qn[quant_data_imp_qn == 0] <- .000000001
 quant_data_log <- log10(quant_data_imp_qn)
 
-# Output quantile-normalized log-transformed dataset
-#   with imputed, normalized data
+# Output with imputed, un-normalized data
 
-data_table_imputed <- cbind(full_data[1:9], quant_data_log)
+data_table_imputed <- cbind(full_data[1:9], quant_data_imp)
 write.table(
     data_table_imputed
   , file = imputed_data_filename
@@ -512,38 +885,72 @@
   , quote = FALSE
   )
 
+how_many_peptides <- nrow(quant_data_log)
+
+if ((how_many_peptides) > 0) {
+  cat(
+    sprintf(
+      "Intensities for %d peptides:\n\n\n",
+      how_many_peptides
+      )
+    )
+  cat("\n\n\n")
 
 
-# data visualization
-old_par <- par(
-  mai = par("mai") + c(0.5, 0, 0, 0)
-, oma = par("oma") + c(0.5, 0, 0, 0)
-)
-boxplot(
-  quant_data_log
-, las = 2
-)
-par(old_par)
+  # data visualization
+  old_par <- par(
+    mai = par("mai") + c(0.5, 0, 0, 0)
+  , oma = par("oma") + c(0.5, 0, 0, 0)
+  )
+  # ref: https://r-charts.com/distribution/add-points-boxplot/
+  # Vertical plot
+  colnames(quant_data_log) <- sample_name_matches
+  boxplot(
+    quant_data_log
+  , las = 1
+  , col = const_boxplot_fill
+  )
+  # Points
+  stripchart(
+    quant_data_log, # Data
+    method = "jitter",          # Random noise
+    jitter = const_stripchart_jitter,
+    pch = 19,                   # Pch symbols
+    cex = const_stripchart_cex, # Size of symbols reduced
+    col = "goldenrod",          # Color of the symbol
+    vertical = TRUE,            # Vertical mode
+    add = TRUE                  # Add it over
+    )
+  par(old_par)
+} else {
+  cat("There are no peptides to plot\n")
+}
 
-
-
-cat("\\newline\n")
-cat("\\newline\n")
+cat("\n\n\n")
 
 ```
 
-```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 4)}
-quant_data_log_stack <- stack(quant_data_log)
-ggplot(
-  quant_data_log_stack,
-  aes(x = values)
-  ) + geom_density(aes(group = ind, colour = ind))
+```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 4), results = 'asis'}
+if (nrow(quant_data_log) > 1) {
+  quant_data_log_stack <- stack(quant_data_log)
+  ggplot(
+    quant_data_log_stack,
+    aes(x = values)
+    ) +
+    geom_density(
+      aes(group = ind, colour = ind),
+      na.rm = TRUE
+      )
+} else {
+  cat("No density plot because there are fewer than two peptides to plot.\n\n")
+}
+```
+```{r echo = FALSE, results = 'asis'}
+cat("\\leavevmode\\newpage\n")
 ```
 
 ## Perform ANOVA filters
 
-(see following pages)
-
 ```{r, echo = FALSE}
 # Make new data frame containing only Phosphopeptides
 #   to connect preANOVA to ANOVA (connect_df)
@@ -555,22 +962,8 @@
 ```
 
 ```{r echo = FALSE, fig.dim = c(9, 10), results = 'asis'}
-# Get factors -> group replicates (as indicated by terminal letter)
-#   by the preceding digits;
-#   e.g., group .1A .1B .1C into group 1; .2A .2B .2C, into group 2; etc..
-m <-
-  regexpr(regex_sample_names, names(quant_data_imp_qn_log), perl = TRUE)
-
-temp_matches <- regmatches(names(quant_data_imp_qn_log), m)
-
-number_of_samples <- length(temp_matches)
-
-m2 <- regexpr(regex_sample_grouping, temp_matches, perl = TRUE)
-
-
-sample_factor_levels <- as.factor(regmatches(temp_matches, m2))
-
-if (length(levels(sample_factor_levels)) < 2) {
+count_of_factor_levels <- length(levels(sample_factor_levels))
+if (count_of_factor_levels < 2) {
   nuke_control_sequences <-
     function(s) {
       s <- gsub("[\\]", "xyzzy_plugh", s)
@@ -584,46 +977,46 @@
   )
   cat(
     "ERROR: ANOVA analysis",
-    "requires two or more factor levels!\\newline\n"
+    "requires two or more factor levels!\n\n\n"
   )
 
-  cat("\\newline\\newline\n")
-  cat("Unparsed sample names are:\\newline\n",
+  cat("\n\n\n\n\n")
+  cat("Unparsed sample names are:\n\n\n",
     "\\begin{quote}\n",
-    paste(names(quant_data_imp_qn_log), collapse = "\\newline\n"),
+    paste(names(quant_data_imp_qn_log), collapse = "\n\n\n"),
     "\n\\end{quote}\n\n")
 
   regex_sample_names <- nuke_control_sequences(regex_sample_names)
 
-  cat("\\leavevmode\\newline\n")
+  cat("\\leavevmode\n\n\n")
   cat("Parsing rule for SampleNames is",
-    "\\newline\n",
+    "\n\n\n",
     "\\text{'",
     regex_sample_names,
-    "'}\\newline\n",
+    "'}\n\n\n",
     sep = ""
     )
 
   cat("\nParsed sample names are:\n",
     "\\begin{quote}\n",
-    paste(temp_matches, collapse = "\\newline\n"),
+    paste(sample_name_matches, collapse = "\n\n\n"),
     "\n\\end{quote}\n\n")
 
   regex_sample_grouping <- nuke_control_sequences(regex_sample_grouping)
 
-  cat("\\leavevmode\\newline\n")
+  cat("\\leavevmode\n\n\n")
   cat("Parsing rule for SampleGrouping is",
-    "\\newline\n",
+    "\n\n\n",
     "\\text{'",
     regex_sample_grouping,
-    "'}\\newline\n",
+    "'}\n\n\n",
     sep = ""
     )
 
-  cat("\\newline\n")
+  cat("\n\n\n")
   cat("Sample group assignments are:\n",
     "\\begin{quote}\n",
-    paste(regmatches(temp_matches, m2), collapse = "\\newline\n"),
+    paste(regmatches(sample_name_matches, m2), collapse = "\n\n\n"),
     "\n\\end{quote}\n\n")
 
 } else {
@@ -649,14 +1042,19 @@
   #   e.g.    "Outputfile_pST_ANOVA_STEP5.txt"
   #   becomes "Outpufile_pST_ANOVA_STEP5_FDR0.05.txt"
 
-  # Re-output quantile-normalized log-transformed dataset
-  #   with imputed, normalized data to include p-values
+  # Re-output datasets to include p-values
+  write.table(
+    cbind(full_data[1:9], p_value_data[, 2:3], quant_data_imp),
+    file = imputed_data_filename,
+    sep = "\t",
+    col.names = TRUE,
+    row.names = FALSE,
+    quote = FALSE
+    )
 
-  data_table_imputed <-
-    cbind(full_data[1:9], p_value_data[, 2:3], quant_data_log)
   write.table(
-    data_table_imputed,
-    file = imputed_data_filename,
+    cbind(full_data[1:9], p_value_data[, 2:3], quant_data_imp_qn_log),
+    file = imp_qn_lt_data_filenm,
     sep = "\t",
     col.names = TRUE,
     row.names = FALSE,
@@ -667,70 +1065,94 @@
   p_value_data <-
     p_value_data[order(p_value_data$fdr_adjusted_anova_p), ]
 
+  first_page_suppress <- 1
+  number_of_peptides_found <- 0
   cutoff <- val_fdr[1]
   for (cutoff in val_fdr) {
+    if (number_of_peptides_found > 49) {
+      cat("\\leavevmode\n\n\n")
+      break
+      }
+
     #loop through FDR cutoffs
 
     filtered_p <-
       p_value_data[
         which(p_value_data$fdr_adjusted_anova_p < cutoff),
-        ,
-        drop = FALSE
+        , drop = FALSE
         ]
     filtered_data_filtered <-
       quant_data_imp_qn_log[
         rownames(filtered_p),
-        ,
-        drop = FALSE
+        , drop = FALSE
         ]
     filtered_data_filtered <-
       filtered_data_filtered[
         order(filtered_p$fdr_adjusted_anova_p),
-        ,
-        drop = FALSE
+        , drop = FALSE
         ]
 
     # <!-- ACE insertion start -->
-    old_oma <- par("oma")
-    old_par <- par(
-      mai = (par("mai") + c(0.7, 0, 0, 0)) * c(1, 1, 0.3, 1),
-      oma = old_oma * c(1, 1, 0.3, 1),
-      cex.main = 0.9,
-      cex.axis = 0.7
-      )
 
-    cat("\\newpage\n")
-    if (nrow(filtered_data_filtered) > 0) {
+    if (nrow(filtered_p) && nrow(filtered_data_filtered) > 0) {
+      if (first_page_suppress == 1) {
+        first_page_suppress <- 0
+        }
+        else {
+          cat("\\newpage\n")
+        }
       cat(sprintf(
-        "Intensities for peptides whose adjusted p-value < %0.2f\n",
+        "Intensities for %d peptides whose adjusted p-value < %0.2f:\n",
+        nrow(filtered_data_filtered),
         cutoff
       ))
-      cat("\\newline\n")
-      cat("\\newline\n")
+      cat("\n\n\n")
+      cat("\n\n\n")
 
+      old_oma <- par("oma")
+      old_par <- par(
+        mai = (par("mai") + c(0.7, 0, 0, 0)) * c(1, 1, 0.3, 1),
+        oma = old_oma * c(1, 1, 0.3, 1),
+        cex.main = 0.9,
+        cex.axis = 0.7,
+        fin = c(9, 7.25)
+        )
+      # ref: https://r-charts.com/distribution/add-points-boxplot/
+      # Vertical plot
+      colnames(filtered_data_filtered) <- sample_name_matches
       boxplot(
         filtered_data_filtered,
         main = "Imputed, normalized intensities", # no line plot
-        las = 2,
+        las = 1,
+        col = const_boxplot_fill,
         ylab = expression(log[10](intensity))
       )
+      # Points
+      stripchart(
+        filtered_data_filtered,     # Data
+        method = "jitter",          # Random noise
+        jitter = const_stripchart_jitter,
+        pch = 19,                   # Pch symbols
+        cex = const_stripchart_cex, # Size of symbols reduced
+        col = "goldenrod",          # Color of the symbol
+        vertical = TRUE,            # Vertical mode
+        add = TRUE                  # Add it over
+        )
+      par(old_par)
     } else {
       cat(sprintf(
-        "No peptides were found to have cutoff adjusted p-value < %0.2f\n",
+        "%s < %0.2f\n\n\n\n\n",
+        "No peptides were found to have cutoff adjusted p-value <",
         cutoff
       ))
     }
-    par(old_par)
 
     if (nrow(filtered_data_filtered) > 0) {
       #Add Phosphopeptide column to anova_filtered table
       anova_filtered_merge <- merge(
-        x = connect_df
-        ,
-        y = filtered_data_filtered
-        ,
-        by.x = "Intensity"
-        ,
+        x = connect_df,
+        y = filtered_data_filtered,
+        by.x = "Intensity",
         by.y = 1
       )
       anova_filtered_merge_order <- rownames(filtered_p)
@@ -759,12 +1181,9 @@
       # Merge qualitative columns into the ANOVA data
       output_table <- data.frame(anova_filtered$Phosphopeptide)
       output_table <- merge(
-        x = output_table
-        ,
-        y = data_table_imp_qn_lt
-        ,
-        by.x = "anova_filtered.Phosphopeptide"
-        ,
+        x = output_table,
+        y = data_table_imp_qn_lt,
+        by.x = "anova_filtered.Phosphopeptide",
         by.y = "Phosphopeptide"
       )
 
@@ -777,7 +1196,7 @@
           nrow = nrow(m)
           )
         )
-      m <- m[!m_nan_rows, ]
+      m <- m[!m_nan_rows, , drop = FALSE]
       if (nrow(m) > 0) {
         rownames_m <- rownames(m)
         rownames(m) <- sapply(
@@ -791,11 +1210,15 @@
             )
           }
         )
-        margins <-
-          c(max(nchar(colnames(m))) * 10 / 16 # col
-            , max(nchar(rownames(m))) * 5 / 16 # row
-            )
         how_many_peptides <- min(50, nrow(m))
+        number_of_peptides_found <- how_many_peptides
+        if (nrow(m) > 1) {
+          m_margin <- m[how_many_peptides:1, ]
+          margins <-
+            c(max(nchar(colnames(m_margin))) * 10 / 16 # col
+              , max(nchar(rownames(m_margin))) * 5 / 16 # row
+              )
+          }
 
         cat("\\newpage\n")
         if (nrow(m) > 50) {
@@ -805,16 +1228,19 @@
               cutoff)
           )
         } else {
-          cat("Heatmap for peptides whose",
-            sprintf("adjusted p-value < %0.2f\n",
-            cutoff)
-          )
+          if (nrow(m) == 1) {
+            cat(
+              sprintf("Heatmap for %d usable peptides whose", nrow(m)),
+              sprintf("adjusted p-value < %0.2f\n", cutoff)
+            )
+            next
+          }
         }
-        cat("\\newline\n")
-        cat("\\newline\n")
-        op <- par("cex.main")
+        cat("\n\n\n")
+        cat("\n\n\n")
         try(
           if (nrow(m) > 1) {
+            old_oma <- par("oma")
             par(cex.main = 0.6)
             heatmap(
               m[how_many_peptides:1, ],
@@ -823,19 +1249,19 @@
               cexRow = 0.7,
               cexCol = 0.8,
               scale = "row",
-              #ACE scale = "none",
               margins = margins,
               main =
-                "Heatmap of unimputed, unnormalized intensities",
-              xlab = ""
+                "Unimputed, unnormalized intensities",
+              xlab = "",
+              las = 1 #, fin = c(9, 5.5)
               )
           }
         )
-        par(op)
       }
     }
   }
 }
+cat("\\leavevmode\n\n\n")
 ```
 
 <!--
author	eschen42
date	Tue, 22 Mar 2022 20:47:40 +0000
parents	4deacfee76ef
children	6679616d0c18