mqppep_anova: mqppep_anova_script.Rmd comparison

comparison mqppep_anova_script.Rmd @ 26:5b8e15b2a67c draft

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit e0b80550743f634282b4b4348b75e6f172dc1488

author	eschen42
date	Wed, 26 Oct 2022 23:48:51 +0000
parents	f9cd87ac8006
children	42b207aaa527

comparison

equal deleted inserted replaced

-:f9cd87ac8006
+:5b8e15b2a67c
 - "Larry Cheng^[ORCiD 0000-0002-6922-6433, Rutgers School of Graduate Studies: New Brunswick, NJ, US]"
 - "Art Eschenlauer^[ORCiD 0000-0002-2882-0508, University of Minnesota: Minneapolis, Minnesota, US]"
 date:
 - "May 28, 2018"
 - "; revised June 23, 2022"
+lot: true
 output:
 pdf_document:
 toc: true
-toc_depth: 3
+toc_depth: 2
 keep_tex: true
-header-includes:
+dev: pdf
-- \usepackage{longtable}
+includes:
-- \newcommand\T{\rule{0pt}{2.6ex}}       % Top strut
+in_header: mqppep_anova_preamble.tex
-- \newcommand\B{\rule[-1.2ex]{0pt}{0pt}} % Bottom strut
+latex_macros: false
+raw_tex: true
+urlcolor: blue
 params:
 alphaFile:            "test-data/alpha_levels.tabular"
 inputFile:            "test-data/test_input_for_anova.tabular"
 preprocDb:            "test-data/test_input_for_anova.sqlite"
 kseaAppPrepDb:        !r c(":memory:", "test-data/mqppep.sqlite")[2]
 regexSampleNames:     "\\.\\d+[A-Z]$"
 regexSampleGrouping:  "\\d+"
-show_toc:             true
+groupFilterPatterns:  ".+"
+groupFilter:    !r c("none", "exclude", "include")[1]
+imputationMethod:     !r c("group-median", "median", "mean", "random")[4]
+kseaCutoffThreshold:  !r c(0.05, 0.1, 0.25, 0.5, 0.9)[5]
+#imputationMethod:     !r c("group-median", "median", "mean", "random")[1]
+# how should sample groups be interpreted?
+#  - "f": fixed patterns (like `grep -F`)
+#  - "p": PERL-compatible (like `grep -P`)
+#  - "r": extended grep patterns (like `grep -E`)
+# use what case sensitivity?
+#  - "i": case insensitive matching (like `grep -i`)
+groupFilterMode: !r c("r", "ri", "p", "pi", "f", "fi")[1]
+# what pattern should be used for the first column
+#   (extended grep pattern, case sensitive)
 firstDataColumn:      "^Intensity[^_]"
-imputationMethod:     !r c("group-median", "median", "mean", "random")[1]
+# for small random value imputation, what percentile should be center?
-meanPercentile:       1
+meanPercentile:       50
+#meanPercentile:       1
+# for small random value imputation, what should `s / mean(x)` ratio be?
 sdPercentile:         1.0
+# output path for imputed data file
 imputedDataFilename:  "test-data/limbo/imputedDataFilename.txt"
+# output path for imputed/quantile-normalized/log-transformed data file
 imputedQNLTDataFile:  "test-data/limbo/imputedQNLTDataFile.txt"
+# output path for contents of `stats_metadata_v` table
 anovaKseaMetadata:    "test-data/limbo/anovaKseaMetadata.txt"
+# how to test one variable with > 2 categories (e.g., aov or kruskal.test)
 oneWayManyCategories: !r c("aov", "kruskal.test", "oneway.test")[1]
+# how to test one variable with 2 categories (e.g., oneway.test)
 oneWayTwoCategories:  !r c("aov", "kruskal.test", "oneway.test")[3]
-kseaCutoffStatistic:  !r c("p.value", "FDR")[2]
+# what should be the minimum quality for consideration in both
-kseaCutoffThreshold:  !r c( 0.1, 0.05)[2]
+minQuality:           0
-kseaMinKinaseCount:   1
+# correct KSEA with FDR (recommended) or raw p-value
-intensityHeatmapRows: 75
+kseaCutoffStatistic:  !r c("FDR", "p.value")[1]
+# correct KSEA threshold 0.05 (conventional) or higher (perhaps better)
+#   "perhaps better" meaning that KSEA is an hypothesis-generator, not -test
+#kseaCutoffThreshold:  !r c(0.05, 0.1, 0.25, 0.5)[1]
+# minimum number of substrates required for a kinase to be considered in KSEA
+kseaMinSubstrateCount: 1
+# Should KSEA be performed aggregating signed log2FC or absolute?
+# FALSE use raw log2FC for KSEA as for KSEAapp::KSEA.Scores
+# TRUE  use abs(log2FC) for KSEA as Justin Drake requested; this is a
+#         justifiable deviation from the KSEAapp::KSEA.Scores algorithm.
+kseaUseAbsoluteLog2FC: TRUE
+#kseaUseAbsoluteLog2FC: FALSE
+# minimum number of observed values per sample-group
+intensityMinValuesPerGroup: 1
+# maximum number of heatmap rows (results are poor when > 50)
+intensityHeatmapRows: 50
+# what should be the primary criterion to eliminate excessive heatmap rows
+intensityHeatmapCriteria: !r c("quality", "na_count", "p_value")[1]
+# should correlation among substrates be used (rather than covariance)
+correlateSubstrates:  TRUE
+# only show covariance among variables having variance > 1
+filterCovVarGT1:      TRUE
+# maximum number of residues to display for ppeps in row names or column names
+ppepTruncN:           10
+# maximum number of characters of subgenes to display in row names or column names
+subgeneTruncN:        10
+# maximum number of characters for paste(subgene, ppep) for enrichment plots
+substTruncN:          20
+# should boxplots use variable-width boxes to reflect # of samples
+boxPlotVarWidth:      TRUE
+# should boxplots use notched boxes to reflect difference between samples
+boxPlotNotch:         TRUE
+# look-up tables for kinase descriptions
+kinaseNameUprtLutBz2: "./kinase_name_uniprot_lut.tabular.bz2"
+kinaseUprtDescLutBz2: "./kinase_uniprot_description_lut.tabular.bz2"
+# should enriched substrates be shown? (per the parameter name; TODO confirm)
+showEnrichedSubstrates: FALSE
+# should debugging nb/nbe messages be printed?
+printNBMsgs:          FALSE
+# should debugging trace messages be printed?
+printTraceMsgs:       FALSE
+# when debugging files are needed, set debugFileBasePath to the path
+#   to the directory where they should be written
+debugFileBasePath:    !r if (TRUE) NULL else "test-data"
 ---
 <!--
+alphaFile:            "test-data/alpha_levels.tabular"
+inputFile:            "test-data/PDX_pST_by_trt.ppep_intensities.ppep_map.preproc_tab.tabular"
+preprocDb:            "test-data/PDX_pST_by_trt.ppep_intensities.ppep_map.preproc_sqlite.sqlite"
+kseaAppPrepDb:        !r c(":memory:", "test-data/mqppep.sqlite")[2]
+regexSampleNames:     "\\.\\w+\\.\\d+[A-Z]$"
+regexSampleGrouping:  "\\w+"
+groupFilterPatterns:  ".+"
+groupFilter:    !r c("none", "exclude", "include")[3]
+imputationMethod:     !r c("group-median", "median", "mean", "random")[1]
+kseaCutoffThreshold:  !r c(0.05, 0.1, 0.20, 0.35, 0.4, 0.5, 0.999)[1]
+ut_alphaFile:            "test-data/alpha_levels.tabular"
+ut_inputFile:            "test-data/UT_phospho_ST_sites.preproc.tabular"
+ut_preprocDb:            "test-data/UT_phospho_ST_sites.preproc.sqlite"
+ut_kseaAppPrepDb:        !r c(":memory:", "test-data/UT_phospho_ST_sites.ksea.sqlite")[2]
+ut_regexSampleNames:     "\\.\\d+[A-Z]$"
+ut_regexSampleGrouping:  "\\d+"
+ut_groupFilterPatterns:  ".+,.*"
+ut_groupFilter:    !r c("none", "exclude", "include")[1]
+ut_imputationMethod:     !r c("group-median", "median", "mean", "random")[4]
+ut_kseaCutoffThreshold:  !r c(0.05, 0.1, 0.25, 0.5, 0.9)[1]
+tst_alphaFile:            "test-data/alpha_levels.tabular"
+tst_inputFile:            "test-data/test_input_for_anova.tabular"
+tst_preprocDb:            "test-data/test_input_for_anova.sqlite"
+tst_kseaAppPrepDb:        !r c(":memory:", "test-data/mqppep.sqlite")[2]
+tst_regexSampleNames:     "\\.\\d+[A-Z]$"
+tst_regexSampleGrouping:  "\\d+"
+tst_groupFilterPatterns:  ".+"
+tst_groupFilter:    !r c("none", "exclude", "include")[1]
+tst_imputationMethod:     !r c("group-median", "median", "mean", "random")[4]
+tst_kseaCutoffThreshold:  !r c(0.05, 0.1, 0.25, 0.5, 0.9)[5]
+tst_alphaFile:            "test-data/alpha_levels.tabular"
+tst_inputFile:            "test-data/UT_phospho_ST_sites.preproc.tabular"
+tst_preprocDb:            "test-data/UT_phospho_ST_sites.preproc.sqlite"
+tst_kseaAppPrepDb:        !r c(":memory:", "test-data/UT_phospho_ST_sites.ksea.sqlite")[2]
+tst_regexSampleNames:     "\\.\\d+[A-Z]$"
+tst_regexSampleGrouping:  "\\d+"
+tst_groupFilterPatterns:  ".+,.*"
+tst_groupFilter:    !r c("none", "exclude", "include")[1]
+tst_imputationMethod:     !r c("group-median", "median", "mean", "random")[4]
+tst_kseaCutoffThreshold:  !r c(0.05, 0.1, 0.20, 0.35, 0.4, 0.5, 0.999)[5]
+px_alphaFile:            "test-data/alpha_levels.tabular"
+px_inputFile:            "test-data/PDX_pST_by_trt.ppep_intensities.ppep_map.preproc_tab.tabular"
+px_preprocDb:            "test-data/PDX_pST_by_trt.ppep_intensities.ppep_map.preproc_sqlite.sqlite"
+px_kseaAppPrepDb:        !r c(":memory:", "test-data/mqppep.sqlite")[2]
+px_regexSampleNames:     "\\.\\w+\\.\\d+[A-Z]$"
+px_regexSampleGrouping:  "\\w+"
+px_groupFilterPatterns:  ".+"
+px_groupFilter:    !r c("none", "exclude", "include")[3]
+px_imputationMethod:     !r c("group-median", "median", "mean", "random")[4]
+px_kseaCutoffThreshold:  !r c(0.05, 0.1, 0.20, 0.35, 0.4, 0.5, 0.999)[5]
+pdx_alphaFile:            "test-data/alpha_levels.tabular"
+pdx_inputFile:            "test-data/PDX012970_pST.preproc_tab.tabular"
+pdx_preprocDb:            "test-data/PDX012970_pST.preproc.sqlite"
+pdx_kseaAppPrepDb:        !r c(":memory:", "test-data/PDX012970.sqlite")[2]
+pdx_regexSampleNames:     "\\.\\w+\\.\\w+\\.\\d+[A-Z]$"
+pdx_regexSampleGrouping:  "\\.\\w+\\K\\.\\w+"
+pdx_groupFilterPatterns:  "AdCa,AVPC"
+pdx_groupFilter:    !r c("none", "exclude", "include")[3]
+pdx_imputationMethod:     !r c("group-median", "median", "mean", "random")[4]
+pdx_kseaCutoffThreshold:  !r c(0.05, 0.1, 0.25, 0.5, 0.9)[1]
+tst_alphaFile:            "test-data/alpha_levels.tabular"
+tst_inputFile:            "test-data/test_input_for_anova.tabular"
+tst_preprocDb:            "test-data/test_input_for_anova.sqlite"
+tst_kseaAppPrepDb:        !r c(":memory:", "test-data/mqppep.sqlite")[2]
+tst_regexSampleNames:     "\\.\\d+[A-Z]$"
+tst_regexSampleGrouping:  "\\d+"
+tst_groupFilterPatterns:  ".+"
+tst_groupFilter:    !r c("none", "exclude", "include")[1]
+tst_kseaCutoffThreshold:  !r c(0.05, 0.1, 0.25, 0.5, 0.9)[5]
+tst_imputationMethod:     !r c("group-median", "median", "mean", "random")[1]
+ut_alphaFile:            "test-data/alpha_levels.tabular"
+ut_inputFile:            "test-data/UT_phospho_ST_sites.preproc.tabular"
+ut_preprocDb:            "test-data/UT_phospho_ST_sites.preproc.sqlite"
+ut_kseaAppPrepDb:        !r c(":memory:", "test-data/UT_phospho_ST_sites.ksea.sqlite")[2]
+ut_regexSampleNames:     "\\.\\d+[A-Z]$"
+ut_regexSampleGrouping:  "\\d+"
+ut_groupFilterPatterns:  ".+,.*"
+ut_groupFilter:    !r c("none", "exclude", "include")[1]
+ut_imputationMethod:     !r c("group-median", "median", "mean", "random")[4]
 alphaFile:            "test-data/alpha_levels.tabular"
 inputFile:            "test-data/test_input_for_anova.tabular"
 preprocDb:            "test-data/test_input_for_anova.sqlite"
 kseaAppPrepDb:        !r c(":memory:", "test-data/mqppep.sqlite")[2]
 regexSampleNames:     "\\.\\d+[A-Z]$"
 regexSampleGrouping:  "\\d+"
+groupFilterPatterns:  ".+,.*"
+groupFilter:    !r c("none", "exclude", "include")[1]
+imputationMethod:     !r c("group-median", "median", "mean", "random")[4]
+nd_alphaFile:            "test-data/alpha_levels.tabular"
+nd_inputFile:            "test-data/pST_Sites_NancyDu.txt.preproc.tabular"
+nd_preprocDb:            "test-data/pST_Sites_NancyDu.txt.preproc.sqlite"
+nd_kseaAppPrepDb:        !r c(":memory:", "test-data/pST_Sites_NancyDu.ksea.sqlite")[2]
+nd_regexSampleNames:     "\\.\\d+[A-Z]$"
+nd_regexSampleGrouping:  "\\d+"
+nd_groupFilterPatterns:     ".+,.*"
+nd_groupFilter:    !r c("none", "exclude", "include")[1]
+nd_imputationMethod:     !r c("group-median", "median", "mean", "random")[4]
+pxd_alphaFile:            "test-data/alpha_levels.tabular"
+pxd_inputFile:            "test-data/PDX_pST_by_trt.ppep_intensities.ppep_map.preproc_tab.tabular"
+pxd_preprocDb:            "test-data/PDX_pST_by_trt.ppep_intensities.ppep_map.preproc_sqlite.sqlite"
+pxd_kseaAppPrepDb:        !r c(":memory:", "test-data/mqppep.sqlite")[2]
+pxd_regexSampleNames:     "\\.\\w+\\.\\d+[A-Z]$"
+pxd_regexSampleGrouping:  "\\w+"
+pxd_groupFilterPatterns:     ".+"
+pxd_groupFilter:    !r c("none", "exclude", "include")[3]
+pxd_imputationMethod:     !r c("group-median", "median", "mean", "random")[4]
+alphaFile:            "test-data/alpha_levels.tabular"
+inputFile:            "test-data/test_input_for_anova.tabular"
+preprocDb:            "test-data/test_input_for_anova.sqlite"
+kseaAppPrepDb:        !r c(":memory:", "test-data/mqppep.sqlite")[2]
+regexSampleNames:     "\\.\\d+[A-Z]$"
+regexSampleGrouping:  "\\d+"
+groupFilterPatterns:  ".+,.*"
+groupFilter:    !r c("none", "exclude", "include")[1]
+alphaFile:            "test-data/alpha_levels.tabular"
+inputFile:            "test-data/PDX012970_pST.preproc_tab.tabular"
+preprocDb:            "test-data/PDX012970_pST.preproc.sqlite"
+kseaAppPrepDb:        !r c(":memory:", "test-data/PDX012970.sqlite")[2]
+regexSampleNames:     "\\.\\w+\\.\\w+\\.\\d+[A-Z]$"
+regexSampleGrouping:  "\\.\\w+\\K\\.\\w+"
+groupFilterPatterns:  "AdCa,AVPC"
+groupFilter:    !r c("none", "exclude", "include")[3]
 alphaFile:            "test-data/alpha_levels.tabular"
 inputFile:            "test-data/PDX_pST_by_trt.ppep_intensities.ppep_map.preproc_tab.tabular"
 preprocDb:            "test-data/PDX_pST_by_trt.ppep_intensities.ppep_map.preproc_sqlite.sqlite"
 kseaAppPrepDb:        !r c(":memory:", "test-data/mqppep.sqlite")[2]
 regexSampleNames:     "\\.\\w+\\.\\d+[A-Z]$"
 regexSampleGrouping:  "\\w+"
+groupFilterPatterns:     ".+,.*"
+groupFilter:    !r c("none", "exclude", "include")[3]
 kseaCutoffStatistic:  !r c("p.value", "FDR")[2]
 kseaCutoffThreshold:  !r c(0.05, 0.1)[1]
 alphaFile:            "test-data/alpha_levels.tabular"
 inputFile:            "test-data/pY_Sites_NancyDu.txt.ppep_intensities.ppep_map.preproc.tabular"
 preprocDb:            "test-data/pY_Sites_NancyDu.txt.ppep_intensities.ppep_map.preproc.sqlite"
 kseaAppPrepDb:        !r c(":memory:", "test-data/pY_Sites_NancyDu.ksea.sqlite")[2]
 regexSampleNames:     "\\.\\d+[A-Z]$"
 regexSampleGrouping:  "\\d+"
+groupFilterPatterns:  ".+,.*"
+groupFilter:    !r c("none", "exclude", "include")[3]
 alphaFile:            "test-data/alpha_levels.tabular"
 inputFile:            "test-data/pST_Sites_NancyDu.txt.preproc.tabular"
 preprocDb:            "test-data/pST_Sites_NancyDu.txt.preproc.sqlite"
 kseaAppPrepDb:        !r c(":memory:", "test-data/pST_Sites_NancyDu.ksea.sqlite")[2]
 regexSampleNames:     "\\.\\d+[A-Z]$"
 regexSampleGrouping:  "\\d+"
+groupFilterPatterns:  ".+,.*"
-inputFile:            "test-data/density_failure.preproc_tab.tabular"
+groupFilter:    !r c("none", "exclude", "include")[1]
-kseaAppPrepDb:        !r c(":memory:", "mqppep.sqlite")[2]
-latex_document: default
 -->
-```{r setup, include = FALSE}
+```{r setup, include = FALSE, results = 'asis'}
+# simple debug messaging
+print_nb_messages <- params$printNBMsgs
# Debug-note helpers; both are silent no-ops unless `print_nb_messages` is TRUE.
#   nb(..., f)        writes a LaTeX-math marker followed by `...` using the
#                     writer `f` (default `cat`).
#   nbe(..., f, file) writes "NBE" plus three Unicode symbols followed by
#                     `...` to `file` (default stderr()) using the writer `f`.
nb <- if (!print_nb_messages) {
  function(...) invisible()
} else {
  function(..., f = cat) f("\n$\\exists{}\\supset\\forall{}$", ...)
}
nbe <- if (!print_nb_messages) {
  function(...) invisible()
} else {
  function(..., f = cat, file = stderr()) {
    # `f` used to be accepted but ignored (cat was always called); honor it
    f(
      stringi::stri_unescape_unicode("\nNBE \\u2203\\u2283\\u2200"),
      ...,
      file = file
    )
  }
}
 #ref for debugging: https://yihui.org/tinytex/r/#debugging
 options(tinytex.verbose = TRUE)
 # ref for parameterizing Rmd document: https://stackoverflow.com/a/37940285
 # ref for top and bottom struts: https://tex.stackexchange.com/a/50355
-knitr::opts_chunk$set(echo = FALSE, fig.dim = c(9, 10))
+knitr::opts_chunk$set(echo = FALSE, fig.dim = c(9, 10), dpi = 300)
 # freeze the random number generator so the same results will be produced
 #  from run to run
 set.seed(28571)
 ### LIBRARIES
+if (print_nb_messages) nbe("library(gplots)")
 library(gplots)
+if (print_nb_messages) nbe("library(caret)")
+# load caret for nearZeroVar
+if (print_nb_messages) nbe("Please ignore the messages about systemd, if any.\n")
+library(caret)
+if (print_nb_messages) nbe("library(DBI)")
 library(DBI)
+if (print_nb_messages) nbe("library(RSQLite)")
 library(RSQLite)
+if (print_nb_messages) nbe("library(sqldf)\n")
 # Suppress "Warning: no DISPLAY variable so Tk is not available"
 suppressWarnings(suppressMessages(library(sqldf)))
 # required but not added to search list:
 # - DBI
 # - reshape2
 # - vioplot
 ### CONSTANTS
-const_parfin <- par("fin")
+const_boxplot_fill           <- "grey94"
-const_boxplot_fill <- "grey94"
-const_stripchart_cex <- 0.5
-const_stripsmall_cex <-
-sqrt(const_stripchart_cex * const_stripchart_cex / 2)
-const_stripchart_jitter <- 0.3
-const_write_debug_files <- FALSE
-const_table_anchor_bp <- "bp"
-const_table_anchor_ht <- "ht"
-const_table_anchor_p <- "p"
-const_table_anchor_tbp <- "tbp"
 const_ksea_astrsk_kinases    <- 1
 const_ksea_nonastrsk_kinases <- 2
 const_ksea_all_kinases       <- 3
+const_log10_e                <- log10(exp(1))
-const_log10_e <- log10(exp(1))
+const_stripchart_cex         <- 0.5
+const_stripchart_jitter      <- 0.3
-### FUNCTIONS
+const_table_anchor_bp        <- "bp"
+const_table_anchor_ht        <- "ht"
-# from `demo(error.catching)`
+const_table_anchor_p         <- "p"
+const_table_anchor_t         <- "t"
+const_table_anchor_tbp       <- "tbp"
+### GLOBAL VARIABLES (params)
+## functions to process params
# Is `x` absent, not a string, or a string without content?
#
# N. B. non-strings are intentionally treated as NULL.
#
# Always returns a single TRUE or FALSE so the result is safe inside `if ()`:
#   - NULL, non-character values, character(0), NA_character_, "" -> TRUE
#   - any character vector holding at least one non-empty string -> FALSE
is_string_null_or_empty <- function(x) {
  if (is.null(x) || !is.character(x)) {
    return(TRUE)
  }
  # nzchar(NA) is TRUE, so missing strings must be tested separately
  all(is.na(x) | !nzchar(x))
}
 ##' Catch *and* save both errors and warnings, and in the case of
 ##' a warning, also keep the computed result.
+##' return result as list(value = ..., warning = ...)
+##' - value will be:
+##'   - the result if no exception is thrown
+##'   - the exception if an exception is caught
+##' - warning will be a string except perhaps when warning argument is not NULL
+##'
+##' adapted from `demo(error.catching)`
 ##'
 ##' @title tryCatch both warnings (with value) and errors
 ##' @param expr an \R expression to evaluate
 ##' @return a list with 'value' and 'warning', where
 ##'   'value' may be an error caught.
 ##' @author Martin Maechler;
 ##' Copyright (C) 2010-2012  The R Core Team
-try_catch_w_e <- function(expr) {
+try_catch_w_e <-
-wrn <- NULL
+function(expr, error = function(e) e, warning = NULL) {
-# warning handler
+wrn <- NULL
-w_handler <- function(w) {
+# warning handler
-wrn <<- w
+w_handler <-
-invokeRestart("muffleWarning")
+if (is.function(warning))
-}
+warning
-list(
+else
-value = withCallingHandlers(
+function(w) {
-tryCatch(
+wrn <<- w
-expr,
+invokeRestart("muffleWarning")
-error = function(e) e
+}
+e_handler <-
+if (is.function(error))
+error
+else
+function(e) e
+# return result as list(value = ..., warning = ...)
+# - value will be:
+#   - the result if no exception is thrown
+#   - the exception if an exception is caught
+list(
+value = withCallingHandlers(
+tryCatch(
+expr,
+error = e_handler
+),
+warning = w_handler
 ),
-warning = w_handler
+warning = wrn
-),
+)
-warning = wrn
+}
-)
-}
+see_kvp <-
+function(format, key, value, suffix = "") {
+if (
-write_debug_file <- function(s) {
+!all(
-if (const_write_debug_files) {
+is.character(format),
-s_path <- sprintf("test-data/%s.txt", deparse(substitute(s)))
+is.character(key),
-print(sprintf("DEBUG writing file %s", spath))
+is.character(value),
+is.character(suffix)
+)
+) {
+cat("all arguments to see_kvp should be character")
+knitr::knit_exit()
+}
+result <- sprintf(format, value)
+if (length(result) > 1) {
+sprintf(
+"%s = c(%s)%s",
+whack_underscores(key),
+paste(result, collapse = ", "),
+suffix
+)
+} else {
+sprintf(
+"%s = %s%s",
+key,
+result,
+suffix
+)
+}
+}
# Format a logical value (or vector) as "<expression> = <value>" text.
#   x      - value to show; coerced with as.logical()
#   suffix - text appended to the formatted result
#   xprssn - label to print; defaults to the deparsed argument expression
# Missing values are rendered as the string "NA".
see_logical <-
  function(x, suffix = "", xprssn = deparse1(substitute(x))) {
    shown <- as.character(as.logical(x))
    # handle NAs and NaNs
    shown[is.na(shown)] <- "NA"
    see_kvp(format = "%s", key = xprssn, value = shown, suffix = suffix)
  }
# Format a numeric value (or vector) as "<expression> = <value>" text using
#   "%0.<digits>g", where digits is clamped to the range 0..16.
#   x      - numeric value to show
#   suffix - text appended to the formatted result
#   digits - number of significant digits requested
#   xprssn - label to print; defaults to the deparsed argument expression
# Yields NULL invisibly when either `x` or `digits` is not numeric.
see_numeric <-
  function(x, suffix = "", digits = 3, xprssn = deparse1(substitute(x))) {
    if (!(is.numeric(digits) && is.numeric(x))) {
      return(invisible(NULL))
    }
    digits <- min(16, max(0, as.integer(digits)))
    number_format <- paste0("%0.", as.character(digits), "g")
    see_kvp(
      format = "%s",
      key    = xprssn,
      value  = sprintf(number_format, x),
      suffix = suffix
    )
  }
# Format a character value (or vector) as "<expression> = \"<value>\"" text.
#   x      - character value to show; each element is wrapped in double quotes
#   suffix - text appended to the formatted result
#   xprssn - label to print; defaults to the deparsed argument expression
# Yields NULL invisibly when `x` is not character.
see_character <-
  function(x, suffix = "", xprssn = deparse1(substitute(x))) {
    if (!is.character(x)) {
      return(invisible(NULL))
    }
    quoted <- sprintf("\"%s\"", x)
    see_kvp(format = "%s", key = xprssn, value = quoted, suffix = suffix)
  }
# Format any value as text for debugging output.
#   x      - value to show
#   suffix - text appended by the type-specific formatters
#   digits - significant digits used for numeric values
#   xprssn - label to print; defaults to the deparsed argument expression
# Dispatch order: character, then numeric, then logical.  Any other type is
#   rendered as the captured output of str(x) under a "see_variable - str(...)"
#   heading.
see_variable <-
  function(x, suffix = "", digits = 3, xprssn = deparse1(substitute(x))) {
    if (is.character(x)) {
      see_character(x, suffix, xprssn)
    } else if (is.numeric(x)) {
      see_numeric(x, suffix, digits, xprssn)
    } else if (is.logical(x)) {
      see_logical(x, suffix, xprssn)
    } else {
      # capture.output() removes its own diversion even when str() fails;
      #   the former manual sink()/file("") pair left both open on error
      structure_lines <- utils::capture.output(str(x))
      paste0(
        "see_variable - str(",
        xprssn,
        "):\n",
        paste(structure_lines, collapse = "\n"),
        "\n"
      )
    }
  }
+# ref: https://tug.org/texinfohtml/latex2e.html
+# LaTeX sets aside the following characters for special purposes.
+#   For example, the percent sign % is for comments.
+#   They are called reserved characters or special characters.
+#   They are all discussed elsewhere in this manual.
+#
+#   $ % & { } _ ~ ^ \ #
+#
+# If you want a reserved character to be printed as itself, in the text body
+#   font, for all but the final three characters in that list simply put
+#   a backslash \ in front of the character.
+#   Thus, typing \$1.23 will produce $1.23 in your output.
+#
+# As to the last three characters, to get a tilde in the text body font,
+#   use \~{} (omitting the curly braces would result in the next character
+#   receiving a tilde accent).
+# Similarly, to get a text body font circumflex use \^{}.
+#   To get a backslash in the font of the text body enter \textbackslash{}.
# Escape LaTeX reserved characters so that text prints as itself.
#   v - value(s) to escape; coerced with as.character()
# Returns a character vector of the same length as `v` in which
#   \ becomes \textbackslash (followed by a space),
#   # $ % & { } _ are each preceded by a backslash,
#   ^ becomes \^{} and ~ becomes \~{}.
# The backslash is replaced first so that backslashes added by the later
#   substitutions are not themselves rewritten; ^ and ~ are handled after the
#   braces so that the "{}" they introduce is not escaped.
# `fixed = TRUE` makes both pattern and replacement literal.
# An unreachable tail that wrapped altered text in \texttt{} (it followed an
#   unconditional return) has been removed; output is never wrapped.
whack_math <-
  function(v) {
    w <- gsub("\\", "\\textbackslash ", as.character(v), fixed = TRUE)
    for (reserved in c("#", "$", "%", "&", "{", "}", "_")) {
      w <- gsub(reserved, paste0("\\", reserved), w, fixed = TRUE)
    }
    w <- gsub("^", "\\^{}", w, fixed = TRUE)
    # a bare ~ is an active character in LaTeX and would not print as a tilde
    w <- gsub("~", "\\~{}", w, fixed = TRUE)
    w
  }
# historical name retained for existing callers
whack_underscores <- whack_math
## dump params to stderr (remove this eventually)
if (FALSE) nbe(see_variable(params))
## unlist params for eventual output
param_unlist <- unlist(as.list(params))
# One row per flattened parameter, ordered by parameter name.
# A former first construction of param_df (names and values wrapped in
#   \verb@...@) was overwritten before it was ever read; it has been dropped.
param_df <- data.frame(
  parameter = names(param_unlist),
  value = param_unlist
)
param_df <- param_df[order(param_df$parameter), ]
## general output control
# directory for debug files; write_debug_file() writes nothing when NULL
debug_file_base_path     <- params$debugFileBasePath
print_trace_messages     <- params$printTraceMsgs
show_enriched_substrates <- params$showEnrichedSubstrates
boxplot_varwidth         <- params$boxPlotVarWidth
boxplot_notch            <- params$boxPlotNotch

## parameters for static data
kinase_name_uprt_lut_bz2 <- params$kinaseNameUprtLutBz2
kinase_uprt_desc_lut_bz2 <- params$kinaseUprtDescLutBz2

## parameters for input file
preproc_db <- params$preprocDb
alpha_file <- params$alphaFile
input_file <- params$inputFile
# First data column - ideally, this could be detected via
#   regexSampleNames, but for now leave it as is.
# It may be given as an integer or as a pattern; remember which.
fdc_is_integer <- is.integer(params$firstDataColumn)
first_data_column <-
  if (fdc_is_integer) {
    as.integer(params$firstDataColumn)
  } else {
    params$firstDataColumn
  }

## parameters for output files
ksea_app_prep_db      <- params$kseaAppPrepDb
imputed_data_filename <- params$imputedDataFilename
imp_qn_lt_data_filenm <- params$imputedQNLTDataFile
anova_ksea_mtdt_file  <- params$anovaKseaMetadata

## parameters for imputation
# Imputation method, should be one of
#   "random", "group-median", "median", or "mean"
imputation_method <- params$imputationMethod
# Percentile of logvalue data used as the mean for random number generation
#   under random imputation; converted here from percent to a fraction
mean_percentile <- params$meanPercentile / 100.0
# deviation adjustment-factor for random values; real number.
sd_percentile <- params$sdPercentile

## parameters for group parsing and filtering
# Regular expression of Sample Names, e.g., "\\.(\\d+)[A-Z]$"
regex_sample_names <- params$regexSampleNames
# Regular expression to extract Sample Grouping from Sample Name;
#   if error occurs, compare smpl_trt vs. sample_name_matches
#   to see if groupings/pairs line up
#   e.g., "(\\d+)"
regex_sample_grouping <- params$regexSampleGrouping
# How should sample groups be filtered?
#   - none:    do not filter
#   - include: include sample groups matching filter
#   - exclude: include sample groups not matching filter
sample_group_filter <- params$groupFilter
# How should the filter patterns be interpreted?  A mode containing "f"
#   selects fixed matching; otherwise one containing "p" selects PERL-style
#   matching; otherwise neither flag is set (normal regex).
sample_group_filter_perl  <- FALSE
sample_group_filter_fixed <- FALSE
if (grepl("f", params$groupFilterMode, fixed = TRUE)) {
  sample_group_filter_fixed <- TRUE
} else if (grepl("p", params$groupFilterMode, fixed = TRUE)) {
  sample_group_filter_perl <- TRUE
}
# a mode containing "i" requests case-insensitive matching
sample_group_filter_nocase <-
  grepl("i", params$groupFilterMode, fixed = TRUE)
# Comma-separated patterns of the sample groups to include or exclude
group_filter_patterns_csv <- params$groupFilterPatterns
sample_group_filter_patterns <-
  strsplit(x = group_filter_patterns_csv, split = ",", fixed = TRUE)[[1]]
## parameters for hypothesis testing
# Resolve the names of the test functions given as parameters.
# try_catch_w_e() returns list(value = ..., warning = ...); `value` is the
#   function on success or the caught condition when match.fun() fails.

# test for one variable with more than two categories
one_way_all_categories_fname <- params$oneWayManyCategories
one_way_all_categories <- try_catch_w_e(
  match.fun(one_way_all_categories_fname)
)
if (!is.function(one_way_all_categories$value)) {
  write("fatal error for parameter oneWayManyCategories:", stderr())
  write(one_way_all_categories$value$message,             stderr())
  if (sys.nframe() > 0) {
    cat("Cannot continue and quit() failed. Goodbye.")
    knitr::knit_exit()
    quit(save = "no", status = 1)
  }
}
one_way_all_categories <- one_way_all_categories$value

# test for one variable with exactly two categories
# (previously read from oneWayManyCategories, so the oneWayTwoCategories
#   parameter had no effect)
one_way_two_categories_fname <- params$oneWayTwoCategories
one_way_two_categories <- try_catch_w_e(
  match.fun(one_way_two_categories_fname)
)
if (!is.function(one_way_two_categories$value)) {
  cat("fatal error for parameter oneWayTwoCategories: \n")
  cat(one_way_two_categories$value$message, fill = TRUE)
  if (sys.nframe() > 0) {
    cat("Cannot continue and quit() failed. Goodbye.")
    knitr::knit_exit()
    quit(save = "no", status = 1)
  }
}
one_way_two_categories <- one_way_two_categories$value
## parameters for KSEA
ksea_cutoff_statistic <- params$kseaCutoffStatistic
ksea_cutoff_threshold <- params$kseaCutoffThreshold
ksea_min_substrate_count <- params$kseaMinSubstrateCount

## parameters for global variables consumed by functions
# Each check below prints a message and calls knitr::knit_exit() when the
#   parameter is unacceptable.

# intensityHeatmapCriteria: !r c("na_count", "p_value")[2] # TODO switch to 1
# TODO Validate within list
g_intensity_hm_criteria <- params$intensityHeatmapCriteria
if (is_string_null_or_empty(g_intensity_hm_criteria)) {
  cat("invalid intensityHeatmapCriteria parameter (must be string)")
  knitr::knit_exit()
}
# accepted values fall through; anything else reaches the default branch
switch(
  g_intensity_hm_criteria,
  "quality" = NULL,
  "na_count" = NULL,
  "p_value" = NULL,
  {
    with(
      params,
      cat(
        sprintf(
          "invalid %s (must be %s)",
          see_variable(intensityHeatmapCriteria),
          "one of quality or na_count or p_value"
        )
      )
    )
    knitr::knit_exit()
  }
)

# intensityHeatmapRows: 50
# TODO Validate >> 0 < 75
g_intensity_hm_rows <- params$intensityHeatmapRows
if (!is.integer(g_intensity_hm_rows) || g_intensity_hm_rows < 1) {
  cat("invalid intensityHeatmapRows (must be integer > 0)")
  knitr::knit_exit()
}

g_intensity_min_per_class <- params$intensityMinValuesPerGroup
if (!is.integer(g_intensity_min_per_class) || g_intensity_min_per_class < 0) {
  # closing parenthesis was missing from this message
  cat("invalid intensityMinValuesPerGroup (must be integer > -1)")
  knitr::knit_exit()
}

g_correlate_substrates <- params$correlateSubstrates
if (is.na(as.logical(g_correlate_substrates))) {
  cat("invalid correlateSubstrates (must be TRUE or FALSE)")
  knitr::knit_exit()
}

g_filter_cov_var_gt_1 <- params$filterCovVarGT1
if (is.na(as.logical(g_filter_cov_var_gt_1))) {
  cat("invalid filterCovVarGT1 parameter (must be TRUE or FALSE)")
  knitr::knit_exit()
}

# TODO Validate >> 0 < 30
g_ppep_trunc_n    <- params$ppepTruncN
# TODO Validate >> 0 < 30
g_subgene_trunc_n <- params$subgeneTruncN
# TODO Validate >> 0 < 30
g_sbstr_trunc_n   <- params$substTruncN
### OPERATORS

# Test for exclusion: `a %notin% b` is the negation of `a %in% b`
# ref: https://www.reneshbedre.com/blog/in-operator-r.html
`%notin%` <- Negate(`%in%`)

# Augmented assignment: `lhs %||:=% rhs` appends the elements of rhs, in
#   order, to lhs with paste0 and stores the result back into the variable
#   named by lhs in the calling frame.  The new value is returned invisibly.
# ref: https://www2.cs.arizona.edu/icon/refernce/infix2.htm#aug_assign
`%||:=%` <- function(lvalue, ...) {
  caller_env <- parent.frame()
  target_name <- as.character(substitute(lvalue))
  joined <- Reduce(paste0, x = ..., init = lvalue)
  assign(x = target_name, value = joined, envir = caller_env)
  invisible(joined)
}
+### FUNCTIONS
# Do nothing; takes no arguments and yields NULL.
no_op <- function() NULL
# this function is not used in this file and should be removed while
#   factoring out reusable code
#
# Apply predicate `f` to each element of `v` and AND the results together.
#   f     - function applied to each element (extra arguments via `...`)
#   v     - vector or list of values to test
#   na_rm - when TRUE, NA results are skipped; when FALSE they take part in
#           the `&&` chain and can make the result NA
# Returns TRUE for empty `v`.
all_apply <- function(f, v, na_rm = TRUE, ...) {
  Reduce(
    # An NA result must leave the running value untouched; it used to reset
    #   the running value to TRUE, hiding any earlier FALSE.
    f = function(l, r) if (na_rm && is.na(r)) l else l && r,
    x = sapply(X = v, FUN = f, ...),
    init = TRUE
  )
}
+write_debug_file <- function(data_frame) {
+if (!is.null(debug_file_base_path)) {
+s_path <-
+sprintf(
+"%s/%s.txt",
+debug_file_base_path,
+deparse(substitute(data_frame))
+)
 write.table(
-s,
+data_frame,
 file = s_path,
 sep = "\t",
 col.names = TRUE,
 row.names = TRUE,
 quote = FALSE
 #   Hence, `x <- 1; get("x", new_env())` fails by design.
 # Create an environment whose parent is the empty environment.
 new_env <- function() new.env(parent = emptyenv())
# make apply readable for rows
#   x        - matrix or array
#   fun      - function applied to each row; `...` is forwarded to it
#   simplify - forwarded to apply(); FALSE keeps the results as a list
#              (this argument used to be ignored and TRUE was always passed)
row_apply <- function(x, fun, ..., simplify = TRUE) {
  apply(x, MARGIN = 1, fun, ..., simplify = simplify)
}
# make apply readable for columns
#   x        - matrix or array
#   fun      - function applied to each column; `...` is forwarded to it
#   simplify - forwarded to apply(); FALSE keeps the results as a list
#              (this argument used to be ignored and TRUE was always passed)
column_apply <- function(x, fun, ..., simplify = TRUE) {
  apply(x, MARGIN = 2, fun, ..., simplify = simplify)
}
##' Produce a vector of boolean values whose i-th value is TRUE when any
##'   member of v matches the i-th member of s, where i in seq_along(s)
##'
##' @title Search multiple strings for matches of multiple substrings
##' @param v a vector of patterns (substrings) to match
##' @param s a vector of strings to search for matches
##' @param ... additional arguments to grepl
##' @return an unnamed logical vector with the same length as s;
##'   all FALSE when v is empty, logical(0) when s is empty
##' @author Art Eschenlauer
##' Copyright (C) 2022 Art Eschenlauer;
##' MIT License; https://en.wikipedia.org/wiki/MIT_License#License_terms
mgrepl <- function(v, s, ...) {
  matched <- rep_len(FALSE, length(s))
  # grepl() is already vectorized over the strings, so one call per pattern
  #   replaces the former nested sapply()/Reduce() and also copes with empty s
  for (pattern in v) {
    matched <- matched | grepl(pattern, s, ...)
  }
  unname(matched)
}
##' Produce positions in a vector where the succeeding value != current value
##'
##' @title Search vector for neighboring positions having different values
##' @param v a vector of comparable numeric values (e.g. integers)
##' @return a vector of positions i where v[i] != v[i + 1];
##'   NULL when there is no such position
##' @author Art Eschenlauer
##' Copyright (C) 2022 Art Eschenlauer;
##' MIT License; https://en.wikipedia.org/wiki/MIT_License#License_terms
transition_positions <- function(v) {
  positions <- c()
  # visit every position that has a left-hand neighbor
  for (i in seq_along(v)[-1]) {
    if (v[i - 1] != v[i]) {
      positions <- c(positions, i - 1)
    }
  }
  positions
}
+### figure debug functions
# Print one graphical parameter as "<lbl>par(<name>) = c(<values>)".
#   par_name - name of the parameter passed to par()
#   lbl      - text printed before "par("; NULL is treated as ""
#   newlines - when TRUE, two newlines are printed after the text
cat_par_vector <- function(par_name, lbl = "", newlines = TRUE) {
  # sprintf() returns character(0) when any argument has length zero, which
  #   would silently print nothing
  if (is.null(lbl)) lbl <- ""
  cat(
    sprintf(
      "%spar(%s) = c(%s)%s",
      lbl,
      par_name,
      paste(par(par_name), collapse = ", "),
      if (newlines) "\n\n" else ""
    )
  )
}
# Print the figure-region and margin parameters
#   (fig, fin, mar, mai, omd, omi, oma), one per paragraph.
#   lbl - optional label printed, followed by a space, before each entry
cat_margins <- function(lbl = NULL) {
  # an absent label used to be passed on as NULL, which suppressed all output
  prefix <- if (!is.null(lbl)) paste0(lbl, " ") else ""
  for (p in c("fig", "fin", "mar", "mai", "omd", "omi", "oma")) {
    cat_par_vector(p, prefix)
  }
}
# Print a variable for debugging.
#   x         - value to print
#   suffix    - passed to see_variable() for simple values
#   digits    - passed to see_variable() for simple values
#   force_str - when TRUE, always print the str() form
# Matrices, lists and data frames (or anything when force_str is TRUE) are
#   printed as a LaTeX heading naming the expression with its typeof() and
#   mode(), followed by str(x) inside a verbatim environment.  Everything
#   else is printed through see_variable().
cat_variable <-
  function(x, suffix = "", digits = 3, force_str = FALSE) {
    xprssn <- deparse1(substitute(x))
    show_structure <-
      force_str || is.matrix(x) || is.list(x) || is.data.frame(x)
    if (!show_structure) {
      cat("\n", see_variable(x, suffix, digits, xprssn))
    } else {
      heading <- paste0(
        "\n\\texttt{\\textbf{",
        whack_underscores(xprssn),
        "}} [",
        typeof(x),
        ",",
        mode(x),
        "] =\n"
      )
      cat(heading)
      cat("\n\\begin{verbatim}\n")
      str(x)
      cat("\n\\end{verbatim}\n")
    }
  }
+### structure helper functions
+# ref: staque.R - Icon-oriented stack and queue operations
+# - https://gist.github.com/eschen42/917690355e53918b9e7ba7138a02d1f8
+#
+# sq_get(v):x produces the leftmost element of v and removes it from v,
+#   but produces NA if v is empty.
+#   v is expected to be a bare variable name, because the shortened
+#   vector is assigned back to that name in the caller's frame.
+sq_get <- function(v) {
+  if (length(v) > 0) {
+    caller_name <- as.character(substitute(v))
+    assign(caller_name, v[-1], parent.frame())
+    v[1]
+  } else {
+    NA
+  }
+}
+#
+# sq_put(v,x1,...,xn):v puts x1, x2, ..., xn onto the right end of v,
+#   producing v.
+#   Values are pushed in order from left to right,
+#   so xn becomes the last (rightmost) value on v.
+#   sq_put(v) with no second argument does nothing.
+#   Likewise, nothing is pushed when x1 is NULL or a single NA;
+#   in each of these cases the unchanged v is produced.
+#   v is expected to be a bare variable name, because the lengthened
+#   vector is assigned back to that name in the caller's frame.
+sq_put <- function(v, x = NA, ...) {
+  # nothing to push: produce the vector that was passed, not whatever
+  #   variable in the caller's frame happens to be named `v`
+  if (is.null(x)) return(v)
+  # test for NA only when x is a single non-function value: is.na() is not
+  #   meaningful for a function, and is not a scalar for other lengths
+  if (length(x) == 1 && !is.function(x) && is.na(x)) return(v)
+  pf <- parent.frame()
+  v_name <- as.character(substitute(v))
+  assign(v_name, c(v, x, ...), pf)
+  pf[[v_name]]
+}
 ### numerical/statistical helper functions
 # NOTE(review): despite its name, this produces TRUE when NO element of x
 #   equals "NaN" and FALSE when at least one does; NA elements can make
 #   the result NA
 any_nan <- function(x) {
   all(x != "NaN")
 }
 # Standard deviation of the finite elements of x; elements for which
 #   is.finite() is FALSE (NA, NaN, Inf, -Inf) are left out
 sd_finite <- function(x) {
   sd(x[is.finite(x)])
 }
+# compute anova raw p-value
 anova_func <- function(x, grouping_factor, one_way_f) {
 subject <- data.frame(
 intensity = x
 )
 x_aov <-
 else
 pvalue <- x_aov$p.value
 pvalue
 }
+# This code adapted from matrixcalc::is.positive.definite
+#   Notably, this simply tests without calling stop()
+# - x:   candidate matrix
+# - tol: eigenvalues whose magnitude is below tol are treated as zero
+# Produces TRUE only for a non-empty, square, exactly symmetric numeric
+#   matrix all of whose eigenvalues are positive; otherwise FALSE
+is_positive_definite <- function(x, tol = 1e-08) {
+  well_formed <-
+    is.matrix(x) &&
+    is.numeric(x) &&
+    nrow(x) >= 1 &&
+    ncol(x) >= 1 &&
+    nrow(x) == ncol(x)
+  if (!well_formed) {
+    return(FALSE)
+  }
+  # symmetric only when every cell equals its transposed counterpart;
+  #   cells that compare as NA are not counted, so they cause rejection
+  symmetric_cells <- sum(x == t(x), na.rm = TRUE)
+  if (symmetric_cells != prod(dim(x))) {
+    return(FALSE)
+  }
+  eigenvalues <- eigen(x, only.values = TRUE)$values
+  # snap eigenvalues within tol of zero to exactly zero
+  eigenvalues[abs(eigenvalues) < tol] <- 0
+  all(eigenvalues > 0)
+}
 ### LaTeX functions
-latex_collapsed_vector <- function(collapse_string, v, underscore_whack = TRUE) {
-v_sub <- if (underscore_whack) gsub("_", "\\\\_", v) else v
-cat(
-paste0(
-v_sub,
-collapse = collapse_string
-)
-)
-}
-latex_itemized_collapsed <- function(collapse_string, v, underscore_whack = TRUE) {
-cat("\\begin{itemize}\n\\item ")
-latex_collapsed_vector(collapse_string, v, underscore_whack)
-cat("\n\\end{itemize}\n")
-}
-latex_itemized_list <- function(v, underscore_whack = TRUE) {
-latex_itemized_collapsed("\n\\item ", v, underscore_whack)
-}
-latex_enumerated_collapsed <- function(collapse_string, v, underscore_whack = TRUE) {
-cat("\\begin{enumerate}\n\\item ")
-latex_collapsed_vector(collapse_string, v, underscore_whack)
-cat("\n\\end{enumerate}\n")
-}
-latex_enumerated_list <- function(v) {
-latex_enumerated_collapsed("\n\\item ", v)
-}
-latex_table_row <- function(v, extra = "", underscore_whack = TRUE) {
-latex_collapsed_vector(" & ", v, underscore_whack)
-cat(extra)
-cat(" \\\\\n")
-}
 # Use this like print.data.frame, from which it is adapted:
-data_frame_latex <-
+data_frame_table_latex <-
 function(
 x,
-...,
 # digits to pass to format.data.frame
 digits = NULL,
 # TRUE -> right-justify columns; FALSE -> left-justify
 right = TRUE,
 # maximum number of rows to print
 # optional caption
 caption = NULL,
 # h(inline); b(bottom); t (top) or p (separate page)
 anchor = "h",
 # set underscore_whack to TRUE to escape underscores
-underscore_whack = TRUE
+underscore_whack = TRUE,
+# how to emit results
+emit = cat
 ) {
 if (is.null(justification))
 justification <-
 Reduce(
 f = paste,
 x = rep_len(if (right) "r" else "l", length(colnames(x)))
 )
+n <- length(rownames(x))
+if (length(x) == 0L) {
+emit(
+sprintf(
+# if n is one, use singular 'row', else use plural 'rows'
+ngettext(
+n,
+"data frame with 0 columns and %d row",
+"data frame with 0 columns and %d rows"
+),
+n
+),
+"\n",
+sep = ""
+)
+} else if (n == 0L) {
+emit("0 rows for:\n")
+latex_itemized_list(
+v = names(x),
+underscore_whack = underscore_whack
+)
+} else {
+if (is.null(max))
+max <- getOption("max.print", 99999L)
+if (!is.finite(max)) {
+cat("Abend because: invalid 'max' / getOption(\"max.print\"): ", max)
+knitr::knit_exit()
+}
+omit <- (n0 <- max %/% length(x)) < n
+m <- as.matrix(
+format.data.frame(
+if (omit) x[seq_len(n0), , drop = FALSE] else x,
+digits = digits,
+na.encode = FALSE
+)
+)
+emit(
+# h(inline); b(bottom); t (top) or p (separate page)
+paste0("\\begin{table}[", anchor, "]"),
+"\\leavevmode",
+sep = "\n"
+)
+if (!is.null(caption))
+emit(paste0(" \\caption{", caption, "}"))
+if (centered) emit("\\centering\n")
+emit(
+paste(
+" \\begin{tabular}{",
+justification,
+"}\n",
+sep = ""
+)
+)
+# ref for top and bottom struts (\T and \B):
+#   https://tex.stackexchange.com/a/50355
+if (!is.null(caption))
+emit("\\B \\\\\n")
+latex_table_row(
+v = colnames(m),
+extra = " \\T \\B",
+underscore_whack = underscore_whack
+)
+emit("\\hline \\\\\n")
+for (i in seq_len(length(m[, 1]))) {
+latex_table_row(
+v = m[i, ],
+underscore_whack = underscore_whack
+)
+}
+emit(
+paste(
+" \\end{tabular}",
+"\\end{table}",
+sep = "\n"
+)
+)
+if (omit)
+emit(" [ reached 'max' / getOption(\"max.print\") -- omitted",
+n - n0, "rows ]\n")
+}
+invisible(x)
+}
+# Use this like print.data.frame, from which it is adapted:
+data_frame_tabbing_latex <-
+function(
+x,
+# vector of tab stops, in inches
+tabstops,
+# vector of headings, registered with tab-stops
+headings = colnames(x),
+# digits to pass to format.data.frame
+digits = NULL,
+# maximum number of rows to print
+max = NULL,
+# optional caption
+caption = NULL,
+# set underscore_whack to TRUE to escape underscores
+underscore_whack = TRUE,
+# flag for landscape mode
+landscape = FALSE,
+# flag indicating that subsubsection should be used for caption
+#   rather than subsection
+use_subsubsection_header = TRUE,
+# character-size indicator; for possible values, see:
+#   https://tug.org/texinfohtml/latex2e.html#Font-sizes
+charactersize = "small",
+# set verbatim to TRUE to debug output
+verbatim = FALSE
+) {
+hlinport <- if (landscape) {
+function() cat("\\hlinlscp \\\\\n")
+} else {
+function() cat("\\hlinport \\\\\n")
+}
+tabstops_tex <-
+Reduce(
+f    = function(l, r) paste0(l, r),
+x    = sprintf("\\hspace{%0.2fin}\\=", tabstops),
+init = ""
+)
 n <- length(rownames(x))
 if (length(x) == 0L) {
 cat(
 sprintf(
 # if n is one, use singular 'row', else use plural 'rows'
 underscore_whack = underscore_whack
 )
 } else {
 if (is.null(max))
 max <- getOption("max.print", 99999L)
-if (!is.finite(max))
+if (!is.finite(max)) {
-stop("invalid 'max' / getOption(\"max.print\"): ",
+cat("Abend because: invalid 'max' / getOption(\"max.print\"): ", max)
-max)
+knitr::knit_exit()
+}
 omit <- (n0 <- max %/% length(x)) < n
 m <- as.matrix(
 format.data.frame(
 if (omit) x[seq_len(n0), , drop = FALSE] else x,
 digits = digits,
 na.encode = FALSE
 )
 )
-cat(
+if (landscape)
-# h(inline); b(bottom); t (top) or p (separate page)
+cat("\n\\begin{landscape}")
-paste0("\\begin{table}[", anchor, "]\n")
+tex_caption <-
-)
+if (!is.null(caption)) sprintf("\\captionof{table}{%s}\n", caption)
-if (!is.null(caption))
+else "\n"
-cat(paste0(" \\caption{", caption, "}"))
+# build the column names, which have multiple lines when
-if (centered) cat("\\centering\n")
+#   length(headings) is a multiple of the number of columns
-cat(
+column_names <- ""
-paste(
+while (length(headings) > 0) {
-" \\begin{tabular}{",
+my_row <- c()
-justification,
+for (i in 1:(1 + length(tabstops))) {
-"}\n",
+my_field <- sq_get(headings)
-sep = ""
+sq_put(my_row, if (is.na(my_field)) "" else my_field)
-)
+}
-)
+column_names %||:=% latex_tabbing_row(
-# ref: https://tex.stackexchange.com/a/50353
+v = my_row,
-#   Describes use of \rule{0pt}{3ex}
+underscore_whack = underscore_whack,
-if (!is.null(caption))
+action = paste0
-cat("\\B \\\\ \\hline\\hline\n")
-# ref for top and bottom struts: https://tex.stackexchange.com/a/50355
-latex_table_row(
-v = colnames(m),
-extra = "\\T\\B",
-underscore_whack = underscore_whack
-)
-cat("\\hline\n")
-for (i in seq_len(length(m[, 1]))) {
-latex_table_row(
-v = m[i, ],
-underscore_whack = underscore_whack
 )
 }
+# Begin tabbing environment after beginning charactersize environment
+if (verbatim) cat("\n\\begin{verbatim}")
 cat(
-paste(
+paste0(
-" \\end{tabular}",
+"\n\\begin{", charactersize, "}", tex_caption,
-"\\end{table}",
+"\\begin{tabwrap}{", tabstops_tex, "}\n"
-sep = "\n"
-)
 )
+)
+# emit column names
+cat(column_names)
+# emit hline
+hlinport()
+for (i in seq_len(length(m[, 1]))) {
+my_row <- latex_tabbing_row(
+v = m[i, ],
+underscore_whack = underscore_whack,
+action = paste0
+)
+if (FALSE)
+cat(my_row)
+else
+cat(my_row)
+}
+hlinport()
 if (omit)
 cat(" [ reached 'max' / getOption(\"max.print\") -- omitted",
 n - n0, "rows ]\n")
+# End charactersize environment after ending tabbing environment
+cat(paste0("\\end{tabwrap}\n\\end{", charactersize, "}\n"))
+if (verbatim) cat("\\end{verbatim}\n")
+if (landscape)
+cat("\\end{landscape}\n")
 }
 invisible(x)
+}
+# Print the table of input parameters (`param_df`) and, when a condition
+#   is supplied, describe that condition on stderr;
+#   this neither stops knitting nor ends the R session
+# - e: a condition object (e.g., as passed to a tryCatch error handler),
+#        or NULL when there is no error to report
+param_df_noexit <-
+  function(e = NULL) {
+    data_frame_tabbing_latex(
+      x = param_df,
+      tabstops = c(1.75),
+      underscore_whack = TRUE,
+      caption = "Input parameters",
+      verbatim = FALSE
+    )
+    if (!is.null(e)) {
+      sink(stderr())
+      # undo the diversion even if describing `e` fails, so that later
+      #   output is not silently routed to stderr
+      on.exit(sink(), add = TRUE)
+      cat("Caught fatal error:\n\n")
+      str(e)
+    }
+  }
+param_df_exit <-  # report the input parameters (and any error `e`), then quit
+function(e = NULL) {
+param_df_noexit(e)  # print the parameter table; describe `e` when it is not NULL
+knitr::knit_exit()
+exit(-1)  # exit() calls q(), so the R session ends here with status -1
+}
+# End the R session without saving the workspace
+# - code:       exit status passed to q() (default 0)
+# - msg:        optional message printed before quitting
+# - use_stderr: TRUE to print the message on stderr rather than stdout
+exit <-
+  function(code = 0, msg = NULL, use_stderr = FALSE) {
+    have_msg <- !is.null(msg)
+    if (have_msg && use_stderr) sink(stderr())
+    if (have_msg) cat("\n\n", msg, "\n\n")
+    if (have_msg && use_stderr) sink()
+    q(save = "no", status = code)
+  }
+# Make control sequences printable in LaTeX:
+#   each backslash becomes $\backslash$ and each dollar sign becomes \$
+# Backslashes are first hidden behind a placeholder so that the dollar
+#   signs introduced for them are not themselves escaped
+latex_printable_control_seqs <-
+  function(s) {
+    placeholder <- "xyzzy_plugh"
+    hidden <- gsub("[\\]", placeholder, s)
+    escaped <- gsub("[$]", "\\\\$", hidden)
+    gsub(placeholder, "$\\\\backslash$", escaped)
+  }
+nolatex_verbatim <-  # same call shape as latex_verbatim, but writes no LaTeX of its own
+function(expr) eval(expr)  # evaluate the argument and produce its value
+# Evaluate `expr` between the opening and the closing of a LaTeX verbatim
+#   environment, so that whatever it prints is typeset verbatim
+# - an error raised by `expr` is handed to param_df_exit
+# - the environment is closed whether or not `expr` succeeds
+# Produces the value of `expr`
+latex_verbatim <-
+  function(expr) {
+    cat("\n\\begin{verbatim}\n___\n")
+    close_environment <- function() cat("...\n\\end{verbatim}\n")
+    tryCatch(
+      expr = expr,
+      error = param_df_exit,
+      finally = close_environment()
+    )
+  }
+# Evaluate `expr` between the opening and the closing of a LaTeX samepage
+#   environment
+# - an error raised by `expr` is handed to param_df_exit
+# - the environment is closed whether or not `expr` succeeds
+# Produces the value of `expr`
+latex_samepage <-
+  function(expr) {
+    cat("\n\\begin{samepage}\n")
+    close_environment <- function() cat("\n\\end{samepage}\n")
+    tryCatch(
+      expr = expr,
+      error = param_df_exit,
+      finally = close_environment()
+    )
+  }
+# return the result of invocation after showing parameters
+# ref: https://www.r-bloggers.com/2013/08/a-new-r-trick-for-me-at-least/
+latex_show_invocation <-
+function(f, f_name = deparse1(substitute(f)), head_patch = FALSE) {
+function(...) {
+my_env <- (as.list(environment()))
+va <- list(...)
+my_rslt <- new_env()
+my_rslt$rslt <- NULL
+latex_verbatim(
+expr = {
+cat(sprintf("\n ..  Local variables for '%s':\n\n", f_name))
+str(va)
+if (!head_patch) {
+# return this result
+# ref: https://www.r-bloggers.com/2013/08/a-new-r-trick-for-me-at-least/
+cat(sprintf("\n ..  Invoking '%s'\n", f_name))
+tryCatch(
+{
+cat("\n\\end{verbatim}\n")
+rslt <- do.call(f, va)
+},
+error = param_df_exit,
+#ACE error = function(e) {
+#ACE    cat("\n\\begin{verbatim}\n")
+#ACE    str(e)
+#ACE    cat("\n\\end{verbatim}\n")
+#ACE    knitr::knit_exit()
+#ACE    stop(e)
+#ACE },
+finally = cat("\n\\begin{verbatim}\n")
+)
+cat(sprintf("\n ..  '%s' returned:\n", f_name))
+str(rslt)
+my_rslt$rslt <- rslt
+}
+}
+)
+# return the result of invocation with the shown parameters
+# ref: https://www.r-bloggers.com/2013/08/a-new-r-trick-for-me-at-least/
+if (head_patch) my_rslt$rslt <- do.call(f, va)
+(my_rslt$rslt)
+}
+}
+# Join the elements of `v` with `collapse_string` and hand the resulting
+#   single string to `action`
+# - underscore_whack: TRUE to pass `v` through whack_underscores first
+# - action: function applied to the joined string (default cat0);
+#     its result is the result of this function
+latex_collapsed_vector <- function(
+  collapse_string,
+  v,
+  underscore_whack = TRUE,
+  action = cat0
+) {
+  if (underscore_whack) {
+    v <- whack_underscores(v)
+  }
+  joined <- paste0(v, collapse = collapse_string)
+  action(joined)
+}
+# Emit an itemize environment; the text following the first \item is `v`
+#   joined by `collapse_string`
+latex_itemized_collapsed <- function(
+  collapse_string,
+  v,
+  underscore_whack = TRUE
+) {
+  cat("\\begin{itemize}\n\\item ")
+  latex_collapsed_vector(
+    collapse_string = collapse_string,
+    v = v,
+    underscore_whack = underscore_whack
+  )
+  cat("\n\\end{itemize}\n")
+}
+# Emit an itemize environment having one \item per element of `v`
+latex_itemized_list <- function(v, underscore_whack = TRUE) {
+  latex_itemized_collapsed(
+    collapse_string = "\n\\item ",
+    v = v,
+    underscore_whack = underscore_whack
+  )
+}
+# Emit an enumerate environment; the text following the first \item is
+#   `v` joined by `collapse_string`
+latex_enumerated_collapsed <- function(
+  collapse_string,
+  v,
+  underscore_whack = TRUE
+) {
+  cat("\\begin{enumerate}\n\\item ")
+  latex_collapsed_vector(
+    collapse_string = collapse_string,
+    v = v,
+    underscore_whack = underscore_whack
+  )
+  cat("\n\\end{enumerate}\n")
+}
+# Emit an enumerate environment having one \item per element of `v`
+# - underscore_whack: passed to latex_enumerated_collapsed; the default,
+#     TRUE, is what was always used before this parameter existed, and it
+#     gives this function the same signature as latex_itemized_list
+latex_enumerated_list <- function(v, underscore_whack = TRUE) {
+  latex_enumerated_collapsed("\n\\item ", v, underscore_whack)
+}
+# Emit one row of a LaTeX tabular: the elements of `v` separated by " & ",
+#   then `extra`, then the row terminator
+latex_table_row <- function(v, extra = "", underscore_whack = TRUE) {
+  latex_collapsed_vector(
+    collapse_string = " & ",
+    v = v,
+    underscore_whack = underscore_whack
+  )
+  cat(extra)
+  cat(" \\\\\n")
+}
+# Build one row for a tabbing-style table: each element of `v` is wrapped
+#   in \tabfill{...}, fields are separated by \>, and the row ends with \\
+# - extra:            text placed after the last field, before the \\
+# - underscore_whack: passed to latex_collapsed_vector
+# - action:           function applied to the pieces of the row
+#                       (default cat0); its result is produced
+latex_tabbing_row <- function(
+  v,
+  extra = "",
+  underscore_whack = TRUE,
+  action = cat0
+) {
+  # paste0 is passed as the action here, so latex_collapsed_vector
+  #   produces the joined fields as a scalar string
+  fields <-
+    latex_collapsed_vector(
+      collapse_string = "} \\> \\tabfill{",
+      v = v,
+      underscore_whack = underscore_whack,
+      action = paste0
+    )
+  action("\\tabfill{", fields, "}", extra, " \\\\\n")
+}
+# N.B. use cnnctn = "" to emulate regular cat
+fcat0 <-
+function(..., sprtr = " ", cnnctn = file()) {  # by default each call creates a new file() connection
+cat0(..., sep = sprtr, file = cnnctn)  # NOTE(review): presumably cat0 accepts cat's `sep` and `file` arguments - confirm
+invisible(cnnctn)  # hand back the connection; nothing here closes it
 }
 # Reduce a string to lower-case letters and digits separated by single
 #   hyphens, with no hyphen at either end
 hypersub <-
 function(s) {
 hyper <- tolower(s)
 hyper <- gsub("[^a-z0-9]+", "-", hyper)  # each run of other characters becomes one hyphen
 hyper <- gsub("[-]+",       "-", hyper)
+hyper <- gsub("[_]+",       "-", hyper)  # NOTE(review): no "_" can remain after the first gsub, so this looks like a no-op
 hyper <- sub("^[-]",        "",  hyper)  # drop a leading hyphen
 hyper <- sub("[-]$",        "",  hyper)  # drop a trailing hyphen
 return(hyper)
 }
-subsection_header <-
+table_href <- function(s = "offset", caption = "") {
-function(s) {
+paste0("\\hyperlink{table.\\arabic{", s, "}}{Table \\arabic{", s, "}}")
+}
+# Produce LaTeX that sets counter `s` to the current value of the `table`
+#   counter and then steps it `i` times
+# - i:   number of \stepcounter commands to emit after \setcounter
+# - s:   name of the LaTeX counter to set
+# - new: TRUE to declare the counter with \newcounter first
+# Produces a single string
+table_offset <- function(i = 0, s = "offset", new = FALSE) {
+  # all of the \stepcounter commands go into ONE string; were they left
+  #   as a vector, paste0 would recycle the \setcounter prefix once per
+  #   step, so the counter would be reset before each step
+  steps <- if (i > 0) strrep(paste0("\\stepcounter{", s, "}"), i) else ""
+  paste0(
+    if (new) paste0("\\newcounter{", s, "}\n") else "",
+    "\\setcounter{", s, "}{\\value{table}}\n",
+    steps,
+    "\n"
+  )
+}
+a_section_header <-
+function(s, prefix = "") {
 hyper <- hypersub(s)
-cat(
+my_subsection_header <- sprintf(
-sprintf(
+"\\hypertarget{%s}{\\%ssection{%s}\\label{%s}}\n",
-"\\hypertarget{%s}\n{\\subsection{%s}\\label{%s}}\n",
+hyper,
-hyper, s, hyper
+prefix,
-)
+gsub("_", "\\_", s, fixed = TRUE),
-)
+hyper
-}
+)
+my_subsection_header
-subsubsection_header <-
+}
-function(s) {
+section_header <- function(s) a_section_header(s, "")
-hyper <- hypersub(s)
+subsection_header <- function(s) a_section_header(s, "sub")
-cat(
+subsubsection_header <- function(s) a_section_header(s, "subsub")
-sprintf(
-"\\hypertarget{%s}\n{\\subsubsection{%s}\\label{%s}}\n",
-hyper, s, hyper
-)
-)
-}
 ### SQLite functions
 ddl_exec <- function(db, sql) {
 discard <- DBI::dbExecute(conn = db, statement = sql)
 }
 }
 ### KSEA functions and helpers
-# Adapted from KSEAapp::KSEA.Scores to allow retrieval of:
+#' The KSEA App Analysis (KSEA Kinase Scores Only)
-# - maximum log2(FC)
+#'
+#' Compute KSEA kinase scores and statistics from phosphoproteomics data input
+#' Adapted from KSEAapp::KSEA.Scores to allow retrieval of maximum log2(FC)
+#'
+#' Result is an R data.frame with column names
+#'   "Kinase.Gene", "mS", "Enrichment", "m", "z.score", "p.value", "FDR"
+#' "Please refer to the original Casado et al. publication for detailed
+#'   description of these columns and what they represent:
+#'
+#'   - Kinase.Gene indicates the gene name for each kinase.
+#'   - mS          represents the mean log2(fold change) of all the
+#'                   kinase's substrates.
+#'   - Enrichment  is the background-adjusted value of the kinase's mS.
+#'   - m           is the total number of detected substrates
+#'                   from the experimental dataset for each kinase.
+#'   - z.score     is the normalized score for each kinase, weighted by
+#'                   the number of identified substrates.
+#'   - p.value     represents the statistical assessment for the z.score.
+#'   - FDR         is the p-value adjusted for multiple hypothesis testing
+#'                   using the Benjamini & Hochberg method."
+#'
+#' @param ksdata           the Kinase-Substrate dataset uploaded from the file
+#'                           prefaced with "PSP&NetworKIN_"
+#'                           available from github.com/casecpb/KSEA/
+#' @param px               the experimental data file formatted as described in
+#'                           the KSEA.Complete() documentation
+#' @param networkin        a binary input of TRUE or FALSE, indicating whether
+#'                           or not to include NetworKIN predictions, where
+#'                           \code{NetworKIN = TRUE}
+#'                           means include NetworKIN predictions
+#' @param networkin_cutoff a numeric value between 1 and infinity setting
+#'                           the minimum NetworKIN score
+#'                           (this can be omitted if NetworKIN = FALSE)
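+#' @param minimum_substrate_count  kinases having fewer detected substrates
+#'                           than this are dropped before the FDR is
+#'                           computed
+#'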
+#' @return                 creates a new R data.frame with all the KSEA kinase
+#'                           scores, along with each one's statistical
+#'                           assessment, as described herein.
+#'
+#' @references
+#'
+#' Casado et al. (2013) Sci Signal. 6(268):rs6
+#'
+#' Hornbeck et al. (2015) Nucleic Acids Res. 43:D512-20
+#'
+#' Horn et al. (2014) Nature Methods 11(6):603-4
+#'
 ksea_scores <- function(
 # For human data, typically, ksdata = KSEAapp::ksdata
 ksdata,
 # Input data file having columns:
 # - Protein     : abbreviated protein name
 #   NetworKIN predictions
 networkin,
 # A numeric value between 1 and infinity setting the minimum NetworKIN
 #   score (can be left out if networkin = FALSE)
-networkin_cutoff
+networkin_cutoff,
+# Minimum substrate count, necessary to adjust the p-value appropriately.
+minimum_substrate_count
 ) {
+# no px$FC should be <= 0, but abs(px$FC) is used below as a precaution.
 if (length(grep(";", px$Residue.Both)) == 0) {
 # There are no Residue.Both entries having semicolons, so new is
 #   simply px except two columns are renamed and a column is added
 #   for log2(abs(fold-change))
 new <- px
 # Convert any illegal values from NaN to NA
 new[is.nan(new$log2_fc), "log2_fc"] <- NA
 # Eliminate rows having missing values (e.g., non-imputed data)
 new <- new[complete.cases(new$log2_fc), ]
 }
-if (networkin == TRUE) {
+# At this point, new$log2_fc is signed according to which contrast has
-# When NetworKIN is true, filter on NetworKIN.cutoff which includes
+#   the greater intensity
-#   PhosphoSitePlus data *because its networkin_score is set to Inf*
+# To take the magnitude into account without taking the direction into
-ksdata_filtered <- ksdata[grep("[a-z]", ksdata$Source), ]
+#   account, set params$kseaUseAbsoluteLog2FC to TRUE
-ksdata_filtered <- ksdata_filtered[
+#
-(ksdata_filtered$networkin_score >= networkin_cutoff), ]
+# Should KSEA be performed aggregating signed log2FC or absolute?
-} else {
+# FALSE use raw log2FC for KSEA as for KSEAapp::KSEA.Scores
-# Otherwise, simply use PhosphSitePlus rows
+if (params$kseaUseAbsoluteLog2FC) {
-ksdata_filtered <- ksdata[
+# TRUE  use abs(log2FC) for KSEA as Justin requested; this is a
-grep("PhosphoSitePlus", ksdata$Source), ]
+#         justifiable deviation from the KSEAapp::KSEA.Scores algorithm.
-}
+new$log2_fc <- abs(new$log2_fc)
-# Join the two data.frames on common columns SUB_GENE and SUB_MOD_RSD
+}
+monitor_filtration_on_stderr <- TRUE
+if (monitor_filtration_on_stderr) {
+# set to TRUE to monitor filtration on stderr
+sink(stderr())
+cat(see_variable(networkin, "\n"))
+}
+ksdata_filtered <-
+sqldf(
+sprintf("%s %s",
+"select * from ksdata where not Source = 'NetworKIN'",
+if (networkin)
+sprintf("or networkin_score >= %d", networkin_cutoff)
+else
+""
+)
+)
+if (monitor_filtration_on_stderr) {
+cat(see_variable(sqldf(
+"select count(*), Source from ksdata group by Source"), "\n"))
+cat(see_variable(sqldf(
+"select count(*), Source from ksdata_filtered group by Source"), "\n"))
+sink()
+}
+############################################################################
+# Line numbers below refer to lines of:
+#   https://github.com/casecpb/KSEAapp/blob/master/R/KSEA.Scores.R
+# I would put the original line in a comment but then lint would complain...
+# - Indeed, I had to rename all the variables because lint didn't like names
+#   containing periods or capital letters.
+# ACE
+############################################################################
+#
+# (1) Join the two data.frames on common columns SUB_GENE and SUB_MOD_RSD
 #   colnames of ksdata_filtered:
 #     "KINASE" "KIN_ACC_ID" "GENE" "KIN_ORGANISM" "SUBSTRATE" "SUB_GENE_ID"
 #     "SUB_ACC_ID" "SUB_GENE" "SUB_ORGANISM" "SUB_MOD_RSD" "SITE_GRP_ID"
 #     "SITE_...7_AA" "networkin_score" "Source"
 #   colnames of new:
 #   SELECT a.*. b.Protein, b.Peptide, b.p, b.FC, b.log2_fc
 #     FROM ksdata_filtered a
 #       INNER JOIN new b
 #         ON a.SUB_GENE = b.SUB_GENE
 #           AND a.SUB_MOD_RSD = b.SUB_MOD_RSD
+#   (KSEA.Scores.R line # 105)
+#   "Extract KSData.filtered annotations that are only found in new"
 ksdata_dataset <- base::merge(ksdata_filtered, new)
 #   colnames of ksdata_dataset:
 #     "KINASE"      "KIN_ACC_ID"   "GENE"       "KIN_ORGANISM" "SUBSTRATE"
 #     "SUB_GENE_ID" "SUB_ACC_ID"   "SUB_GENE"   "SUB_ORGANISM" "SUB_MOD_RSD"
 #     "SITE_GRP_ID" "SITE_...7_AA" "networkin_score"  "Source" "Protein"
 #     "Peptide"     "p"            "FC"         "log2_fc" (uniprot_no_isoform)
 # Re-order dataset; prior to accounting for isoforms
+#   (KSEA.Scores.R line # 106)
 ksdata_dataset <- ksdata_dataset[order(ksdata_dataset$GENE), ]
 # Extract non-isoform accession in UniProtKB
+#   (KSEA.Scores.R line # 107)
 ksdata_dataset$uniprot_no_isoform <- sapply(
 ksdata_dataset$KIN_ACC_ID,
 function(x) unlist(strsplit(as.character(x), split = "-"))[1]
 )
+#   "last expression collapses isoforms ... for easy processing"
 # Discard previous results while selecting interesting columns ...
+#   (KSEA.Scores.R line # 110)
 ksdata_dataset_abbrev <- ksdata_dataset[, c(5, 1, 2, 16:19, 14)]
 # Column names are now:
 #   "GENE"       "SUB_GENE"        "SUB_MOD_RSD"    "Peptide" "p"
 #   "FC" "log2_fc" "Source"
 # Make column names human-readable
+#   (KSEA.Scores.R line # 111)
 colnames(ksdata_dataset_abbrev) <- c(
 "Kinase.Gene", "Substrate.Gene", "Substrate.Mod", "Peptide", "p",
 "FC", "log2FC", "Source"
 )
 # SELECT * FROM ksdata_dataset_abbrev
 #   ORDER BY Kinase.Gene, Substrate.Gene, Substrate.Mod, p
+#   (KSEA.Scores.R line # 112)
+#   "Extract KSData.filtered annotations that are only found in new"
 ksdata_dataset_abbrev <-
 ksdata_dataset_abbrev[
 order(
 ksdata_dataset_abbrev$Kinase.Gene,
 ksdata_dataset_abbrev$Substrate.Gene,
 ksdata_dataset_abbrev$Substrate.Mod,
 ksdata_dataset_abbrev$p),
 ]
+if (print_nb_messages) nbe(see_variable(ksdata_dataset_abbrev))
 # First aggregation step to account for multiply phosphorylated peptides
 #   and differing peptide sequences; the goal here is to combine results
 #   for all measurements of the same substrate.
 # SELECT  `Kinase.Gene`, `Substrate.Gene`, `Substrate.Mod`,
 #         `Source`, avg(log2FC) AS log2FC
 #   GROUP BY `Kinase.Gene`, `Substrate.Gene`, `Substrate.Mod`,
 #         `Source`
 #   ORDER BY `Kinase.Gene`;
 # in two steps:
 # (1) compute average log_2(fold-change)
+#   "take the mean of the log2FC amongst phosphosite duplicates"
+#   (KSEA.Scores.R line # 115)
 ksdata_dataset_abbrev <- aggregate(
 log2FC ~ Kinase.Gene + Substrate.Gene + Substrate.Mod + Source,
 data = ksdata_dataset_abbrev,
 FUN = mean
 )
+if (print_nb_messages) nbe(see_variable(ksdata_dataset_abbrev))
 # (2) order by Kinase.Gene
+#   (KSEA.Scores.R line # 117)
 ksdata_dataset_abbrev <-
 ksdata_dataset_abbrev[order(ksdata_dataset_abbrev$Kinase.Gene), ]
 # SELECT  `Kinase.Gene`, count(*)
 #   FROM  ksdata_dataset_abbrev
 #  GROUP BY `Kinase.Gene`;
 # in two steps:
 # (1) Extract the list of Kinase.Gene names
+#   "@@@@@@@@@@@@@@@@@@@@"
+#   "Do analysis for KSEA"
+#   "@@@@@@@@@@@@@@@@@@@@"
+#   (KSEA.Scores.R line # 124)
 kinase_list <- as.vector(ksdata_dataset_abbrev$Kinase.Gene)
 # (2) Convert to a one-column matrix of counts of kinases in
 #   ksdata_dataset_abbrev, with rows named by Kinase.Gene
+#   (KSEA.Scores.R line # 125)
 kinase_list <- as.matrix(table(kinase_list))
 # Second aggregation step to account for all substrates per kinase
 # CREATE TABLE mean_fc
 #   AS
 # SELECT  `Kinase.Gene`, avg(log2FC) AS log2FC
 #   FROM  ksdata_dataset_abbrev
 #   GROUP BY `Kinase.Gene`
-mean_fc <- aggregate(
+#   (KSEA.Scores.R line # 127)
-log2FC ~ Kinase.Gene,
+if (print_nb_messages) nb(see_variable(ksdata_dataset_abbrev), "\n")
-data = ksdata_dataset_abbrev,
+mean_fc <-
-FUN = mean
+aggregate(
-)
-# mean_fc columns: "Kinase.Gene", "log2FC"
-if (FALSE) {
-# I need to re-think this; I was trying to find the most-represented
-#   peptide, but that horse has already left the barn
-# SELECT  `Kinase.Gene`, max(abs(log2FC)) AS log2FC
-#   FROM  ksdata_dataset_abbrev
-#   GROUP BY `Kinase.Gene`
-max_fc <- aggregate(
 log2FC ~ Kinase.Gene,
 data = ksdata_dataset_abbrev,
-FUN = function(r) max(abs(r))
+FUN = mean
 )
-}
+if (print_nb_messages) nbe(see_variable(mean_fc), "\n")
+# for contrast j
+#   for each kinase i
+#     extract log2 of fold-change (from `new` above)
+#     (used in KSEA.Scores.R lines # 130 & 132)
+log2_fc_j_each_i <-
+new$log2_fc
+# for contrast j
+#   for all kinases i
+#     compute mean of abs(log2 of fold-change)
+#     (used in KSEA.Scores.R lines # 130)
+mean_abs_log2_fc_j_all_i <-
+mean(abs(log2_fc_j_each_i), na.rm = TRUE)
+# for contrast j
+#   for all kinases i
+#     compute mean of log2 of fold-change
+#     (used in KSEA.Scores.R lines # 132)
+mean_log2_fc_j_all_i <-
+mean(log2_fc_j_each_i, na.rm = TRUE)
+# Reorder mean_fc, although I don't see why
+#   (KSEA.Scores.R line # 128)
+mean_fc <- mean_fc[order(mean_fc[, 1]), ]
+# mean_fc columns so far: "Kinase.Gene", "log2FC"
+# - Kinase.Gene
+#               indicates the gene name for each kinase.
 # Create column 3: mS
-mean_fc$m_s <- mean_fc[, 2]
+# - mS
+#               represents the mean log2(fold change) of all the
+#               kinase's substrates.
+#   (KSEA.Scores.R line # 129)
+mean_fc$m_s <-
+mean_fc_m_s <- mean_fc[, 2]
 # Create column 4: Enrichment
-mean_fc$enrichment <- mean_fc$m_s / abs(mean(new$log2_fc, na.rm = TRUE))
+# - Enrichment
-# Create column 5: m, count of substrates
+#               is the background-adjusted value of the kinase's mS.
-mean_fc$m <- kinase_list
+#   (KSEA.Scores.R line # 130)
+mean_fc$enrichment <-
+mean_fc_m_s / mean_abs_log2_fc_j_all_i
+# Create column 5: m, count of substrates of kinase (count of j for i)
+# - m
+#               is the total number of detected substrates
+#               from the experimental dataset for each kinase.
+#   (KSEA.Scores.R line # 131)
+mean_fc$m <-
+mean_fc_m <- kinase_list
 # Create column 6: z-score
-mean_fc$z_score <- (
+# - z.score
-(mean_fc$m_s - mean(new$log2_fc, na.rm = TRUE)) *
+#               is the normalized score for each kinase, weighted by
-sqrt(mean_fc$m)) / sd(new$log2_fc, na.rm = TRUE)
+#               the number of identified substrates.
+#   (KSEA.Scores.R line # 132)
+mean_fc$z_score <-
+(mean_fc_m_s - mean_log2_fc_j_all_i) * sqrt(mean_fc_m) /
+sd(log2_fc_j_each_i, na.rm = TRUE)
 # Create column 7: p-value, deduced from z-score
-mean_fc$p_value <- pnorm(-abs(mean_fc$z_score))
+# - p.value
+#               represents the statistical assessment for the z.score.
+#   (KSEA.Scores.R line # 133)
+# "one-tailed p-value"
+mean_fc$p_value <-
+pnorm(-abs(mean_fc$z_score))
+# zap excluded kinases; this must be done before adjusting p-value
+if (TRUE) {
+mean_fc <-
+mean_fc[
+mean_fc$m >= minimum_substrate_count,
+,
+drop = FALSE
+]
+}
+#ACE nb(see_variable(nrow(mean_fc)), "\n")
 # Create column 8: FDR, deduced by Benjamini-Hochberg adjustment from p-value
-mean_fc$fdr <- p.adjust(mean_fc$p_value, method = "fdr")
+# - FDR
+#               is the p-value adjusted for multiple hypothesis testing
-# Remove log2FC column, which is duplicated as mS
+#               using the Benjamini & Hochberg method."
-mean_fc <- mean_fc[order(mean_fc$Kinase.Gene), -2]
+#   (KSEA.Scores.R line # 134)
+mean_fc$fdr <-
+p.adjust(mean_fc$p_value, method = "fdr")
+# It makes no sense to leave Z-scores negative when using
+#   absolute value of fold-change
+if (params$kseaUseAbsoluteLog2FC) {
+mean_fc$z_score <- abs(mean_fc$z_score)
+}
+# Remove second column (log2FC), which is duplicated as mS
+#   (KSEA.Scores.R line # 136)
+mean_fc <-
+mean_fc[order(mean_fc$Kinase.Gene), -2]
 # Correct the column names which we had to hack because of the linter...
 colnames(mean_fc) <- c(
 "Kinase.Gene", "mS", "Enrichment", "m", "z.score", "p.value", "FDR"
 )
+#   (KSEA.Scores.R line # 138)
 return(mean_fc)
 }
-low_fdr_barplot <- function(
+ksea_low_fdr_barplot_factory <- function(
 rslt,
 i_cntrst,
 i,
 a_level,
 b_level,
 k$fdr
 },
 "p.value" = {
 k$p_value
 },
-stop(
+{
-sprintf(
+cat(
-"Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'",
+sprintf(
-ksea_cutoff_statistic
+"Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'",
+ksea_cutoff_statistic
+)
 )
-)
+param_df_exit()
-)
+knitr::knit_exit()
+}
+)
 k <- k[selector < ksea_cutoff_threshold, ]
+nrow_k <- nrow(k)
-if (nrow(k) > 0) {
-op <- par(mai = c(1, 1.5, 0.4, 0.4))
+#ACE nbe(see_variable(fdr_barplot_dataframe <- k))
+if (nrow_k > 0) {
+max_nchar_rowname <- max(nchar(rownames(k)))
+my_cex_names <- 1.0 / (1 + nrow_k / 50)
+if (print_trace_messages) cat_margins("Initially")
+if (print_trace_messages) cat_variable(nrow_k,            "\n\n", 0)
+if (print_trace_messages) cat_variable(my_cex_names,      "\n\n", 0)
+if (print_trace_messages) cat_variable(max_nchar_rowname, "\n\n", 0)
+# fin: The figure region dimensions, (width, height), in inches.
+# mar: A numerical vector of the form c(bottom, left, top, right)
+#      that gives the number of lines of margin to be specified
+#      on the four sides of the plot; default: c(5, 4, 4, 2) + 0.1
+# mar: The figure region dimensions, (width, height), in inches.
 numeric_z_score <- as.numeric(k$z_score)
-z_score_order <- order(numeric_z_score)
+bar_order <- order(-as.numeric(k$p_value))
 kinase_name <- k$kinase_gene
 long_caption <-
 sprintf(
-"Kinase z-score, %s < %s, %s",
+"Kinase z-score, %s, KSEA %s < %s",
+caption,
 ksea_cutoff_statistic,
-ksea_cutoff_threshold,
+ksea_cutoff_threshold
-caption
 )
 my_cex_caption <- 65.0 / max(65.0, nchar(long_caption))
-cat("\n\\clearpage\n")
+# return a function that draws the plot
-barplot(
+function() {
-height = numeric_z_score[z_score_order],
+par_fin <- par("fin") # vector of width_in_inches and height_in_inches)
-border = NA,
+op <- par(
-xpd = FALSE,
+bg = if (print_trace_messages) "yellow" else "white",
-cex.names = 1.0,
+fin = c(par_fin[1], min(par_fin[2], 2.5 + nrow_k / 6)),
-main = long_caption,
+mar = par("mar") +
-cex.main = my_cex_caption,
+c(3 / nrow_k, (1 + max_nchar_rowname * my_cex_names) / 2, 0, 0)
-names.arg = kinase_name[z_score_order],
+# bottom,     left,                                     top, right
-horiz = TRUE,
-srt = 45,
-las = 1,
-cex.axis = 0.9
 )
-par(op)
+on.exit(par(op))
+if (print_trace_messages) cat_margins("Eventually")
+barplot(
+height = numeric_z_score[bar_order],
+border = NA,
+xpd = FALSE,
+cex.names = my_cex_names,
+main = long_caption,
+cex.main = my_cex_caption,
+names.arg = kinase_name[bar_order],
+horiz = TRUE,
+srt = 45,
+las = 1,
+cex.axis = 0.9
+)
+}
 }
+} else {
+no_op
 }
 }
 # note that this adds elements to the global variable `ksea_asterisk_hash`
-low_fdr_print <- function(
+ksea_low_fdr_print <- function(
 rslt,
 i_cntrst,
 i,
 a_level,
 b_level,
 fold_change,
-caption
+caption,
+write_db = TRUE, # if TRUE, write to DB, else print table
+anchor = c(const_table_anchor_p, const_table_anchor_t)
 ) {
 rslt_score_list_i <- rslt$score_list[[i]]
 if (!is.null(rslt_score_list_i)) {
 rslt_score_list_i_nrow <- nrow(rslt_score_list_i)
 k <- contrast_ksea_scores <- data.frame(
 k$fdr
 },
 "p.value" = {
 k$p_value
 },
-stop(
+{
-sprintf(
+cat(
-"Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'",
+sprintf(
-ksea_cutoff_statistic
+"Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'",
+ksea_cutoff_statistic
+)
 )
-)
+param_df_exit()
-)
+knitr::knit_exit()
+}
+)
 k <- k[selector < ksea_cutoff_threshold, ]
 # save kinase names to ksea_asterisk_hash
 for (kinase_name in k$kinase_gene) {
 ksea_asterisk_hash[[kinase_name]] <- 1
 }
-db_write_table_overwrite <- (i_cntrst < 2)
+if (write_db) {
-db_write_table_append    <- !db_write_table_overwrite
+db_write_table_overwrite <- (i_cntrst < 2)
-RSQLite::dbWriteTable(
+db_write_table_append    <- !db_write_table_overwrite
-conn = db,
+RSQLite::dbWriteTable(
-name = "contrast_ksea_scores",
+conn = db,
-value = contrast_ksea_scores,
+name = "contrast_ksea_scores",
-append = db_write_table_append
+value = contrast_ksea_scores,
-)
+append = db_write_table_append
-selector <- switch(
+)
-ksea_cutoff_statistic,
+""
-"FDR" = {
+} else {
-contrast_ksea_scores$fdr
+selector <- switch(
-},
+ksea_cutoff_statistic,
-"p.value" = {
+"FDR" = {
-contrast_ksea_scores$p_value
+contrast_ksea_scores$fdr
 },
-stop(
+"p.value" = {
-sprintf(
+contrast_ksea_scores$p_value
-"Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'",
+},
-ksea_cutoff_statistic
+{
+cat(
+sprintf(
+"Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'",
+ksea_cutoff_statistic
+)
+)
+param_df_exit()
+knitr::knit_exit()
+}
+)
+if (print_nb_messages) nbe(see_variable(contrast_ksea_scores))
+output_df <- contrast_ksea_scores[
+selector < ksea_cutoff_threshold,
+c("kinase_gene", "mean_log2_fc", "enrichment", "substrate_count",
+"z_score", "p_value", "fdr")
+]
+output_df$kinase_gene <-
+gsub(
+"_",
+"\\\\_",
+output_df$kinase_gene
+)
+colnames(output_df) <-
+c(
+colnames(output_df)[1],
+colnames(output_df)[2],
+"enrichment",
+"m_s",
+"z_score",
+"p_value",
+"fdr"
+)
+#ACE output_order <- with(output_df, order(fdr))
+output_order <- with(output_df, order(p_value))
+output_df <- output_df[output_order, ]
+output_df[, 2] <- sprintf("%0.3g", output_df[, 2])
+output_df$fdr <- sprintf("%0.4f", output_df$fdr)
+output_df$p_value <- sprintf("%0.2e", output_df$p_value)
+output_df$z_score <- sprintf("%0.2f", output_df$z_score)
+output_df$m_s <- sprintf("%d", output_df$m_s)
+output_df$enrichment <- sprintf("%0.3g", output_df$enrichment)
+output_ncol <- ncol(output_df)
+colnames(output_df) <-
+c(
+"Kinase",
+"\\(\\overline{{\\lvert}\\log_2 (\\text{fold-change}){\\rvert}}\\)",
+"Enrichment",
+"Substrates",
+"z-score",
+"p-value",
+"FDR"
+)
+selector <- switch(
+ksea_cutoff_statistic,
+"FDR" = {
+rslt$score_list[[i]]$FDR
+},
+"p.value" = {
+rslt$score_list[[i]]$p.value
+},
+{
+cat(
+sprintf(
+"Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'",
+ksea_cutoff_statistic
+)
+)
+param_df_exit()
+knitr::knit_exit()
+}
+)
+if (sum(selector < ksea_cutoff_threshold) > 0) {
+if (print_nb_messages) nbe(see_variable(output_df))
+math_caption <- gsub("{", "\\{", caption,      fixed = TRUE)
+math_caption <- gsub("}", "\\}", math_caption, fixed = TRUE)
+# with (
+#   output_df,
+#   )
+if (TRUE) {
+output_df$Kinase <- whack_underscores(output_df$Kinase)
+data_frame_tabbing_latex(
+x = output_df,
+# vector of tab stops, in inches
+tabstops = c(1.0, 1.2, 1.0, 1.0, 1.0, 1.0),
+# vector of headings, registered with tab-stops
+headings = colnames(output_df),
+# digits to pass to format.data.frame
+digits = NULL,
+# maximum number of rows to print
+max = NULL,
+# optional caption
+caption = sprintf(
+"\\text{%s}, KSEA %s < %s",
+math_caption,
+ksea_cutoff_statistic,
+ksea_cutoff_threshold
+),
+# set underscore_whack to TRUE to escape underscores
+underscore_whack = FALSE,
+# flag for landscape mode
+landscape = FALSE,
+# flag indicating that subsubsection should be used for caption
+#   rather than subsection
+use_subsubsection_header = TRUE,
+# character-size indicator; for possible values, see:
+#   https://tug.org/texinfohtml/latex2e.html#Font-sizes
+charactersize = "small",
+# set verbatim to TRUE to debug output
+verbatim = FALSE
+)
+} else {
+data_frame_table_latex(
+x = output_df,
+justification = "l c c c c c c",
+centered = TRUE,
+caption = sprintf(
+"\\text{%s}, KSEA %s < %s",
+math_caption,
+ksea_cutoff_statistic,
+ksea_cutoff_threshold
+),
+anchor = anchor,
+underscore_whack = FALSE
+)
+}
+} else {
+cat(
+sprintf(
+"\\break
+No kinases had
+\\(\\text{KSEA %s}_\\text{enrichment} < %s\\)
+for contrast %s\\hfill\\break\n",
+ksea_cutoff_statistic,
+ksea_cutoff_threshold,
+caption
 )
 )
-)
+}
-output_df <- contrast_ksea_scores[
-selector < ksea_cutoff_threshold,
-c("kinase_gene", "mean_log2_fc", "enrichment", "substrate_count",
-"z_score", "p_value", "fdr")
-]
-output_order <- with(output_df, order(mean_log2_fc, kinase_gene))
-output_df <- output_df[output_order, ]
-colnames(output_df) <-
-c(
-colnames(output_df)[1],
-colnames(output_df)[2],
-"enrichment",
-"m_s",
-"z_score",
-"p_value",
-"fdr"
-)
-output_df$fdr <- sprintf("%0.4f", output_df$fdr)
-output_df$p_value <- sprintf("%0.2e", output_df$p_value)
-output_df$z_score <- sprintf("%0.2f", output_df$z_score)
-output_df$m_s <- sprintf("%d", output_df$m_s)
-output_df$enrichment <- sprintf("%0.2f", output_df$enrichment)
-output_ncol <- ncol(output_df)
-colnames(output_df) <-
-c(
-"Kinase",
-"\\(\\overline{\\log_2 (|\\text{fold-change}|)}\\)",
-"Enrichment",
-"Substrates",
-"z-score",
-"p-value",
-"FDR"
-)
-selector <- switch(
-ksea_cutoff_statistic,
-"FDR" = {
-rslt$score_list[[i]]$FDR
-},
-"p.value" = {
-rslt$score_list[[i]]$p.value
-},
-stop(
-sprintf(
-"Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'",
-ksea_cutoff_statistic
-)
-)
-)
-if (sum(selector < ksea_cutoff_threshold) > 0) {
-math_caption <- gsub("{", "\\{", caption,      fixed = TRUE)
-math_caption <- gsub("}", "\\}", math_caption, fixed = TRUE)
-data_frame_latex(
-x = output_df,
-justification = "l c c c c c c",
-centered = TRUE,
-caption = sprintf(
-"\\text{%s}, %s < %s",
-math_caption,
-ksea_cutoff_statistic,
-ksea_cutoff_threshold
-),
-anchor = const_table_anchor_p
-)
-} else {
-cat(
-sprintf(
-"\\break
-No kinases had
-\\(\\text{%s}_\\text{enrichment} < %s\\)
-for contrast %s\\hfill\\break\n",
-ksea_cutoff_statistic,
-ksea_cutoff_threshold,
-caption
-)
-)
 }
+} else {
+""
 }
 }
 # create_breaks is a helper for ksea_heatmap
 create_breaks <- function(merged_scores) {
 mycol <- unique(append(mycol_neg, mycol_pos))
 color_breaks <- list(breaks_all, mycol)
 return(color_breaks)
 }
+# hm2plus draws a heatmap by calling `heatmap.2` (presumably
+#   gplots::heatmap.2) with a color palette sized to the data
+#   arguments:
+#   - x: data to plot; coerced with as.matrix
+#   - mat, widths, heights: passed to heatmap.2 as lmat, lwid, and lhei
+#   - denwid, denhgt: used only to construct widths (or heights) when
+#       that argument is NULL
+#   - divergent: when TRUE, request a red/white/blue style palette;
+#       see the palette selection below for the exact rules
+#   - notecol, trace, margins, hclustfun: passed to heatmap.2 unchanged
+#   - srtcol, srtrow, density_info, key_xlab, key_par: passed to heatmap.2
+#       as srtCol, srtRow, density.info, key.xlab, and key.par
+#   - ...: additional arguments passed through to heatmap.2
+#   value:
+#   - NULL (and nothing is drawn) when x has no non-NA values
+#   - otherwise, the value returned by heatmap.2
+#   notes:
+#   - warnings raised by the heatmap.2 call are suppressed
+#   - reads the global flag print_nb_messages; when it is TRUE, diagnostic
+#       output is produced through nb, see_variable, and cat
+hm2plus <- function(
+  x,
+  mat = matrix(
+    c(
+      c(0, 4, 0),
+      c(0, 3, 3),
+      c(2, 1, 1)
+    ),
+    nrow = 3,
+    ncol = 3,
+    byrow = TRUE
+  ),
+  denwid = 0.5,
+  denhgt = 0.15,
+  widths = c(0.5, 2.5, 1.5),
+  heights = c(0.4, 0.15, 3.95),
+  divergent = FALSE,
+  notecol = "grey50",
+  trace = "none",
+  margins = c(6, 20),
+  srtcol       = 90,
+  srtrow       = 0,
+  density_info = "none",
+  key_xlab = latex2exp::TeX("$log_{10}$(peptide intensity)"),
+  key_par = list(),
+  hclustfun = hclust,
+  ...
+) {
+  if (FALSE) { # this is to avoid commenting out code to pass linting...
+    my_hm2 <- latex_show_invocation(heatmap.2, head_patch = FALSE)
+  } else {
+    my_hm2 <- heatmap.2
+  }
+  x <- as.matrix(x)
+  # nothing can be drawn when every value is NA
+  if (sum(!is.na(x)) < 1)
+    return(NULL)
+  color_count <- 1 + max(64, length(as.vector(x))) # 8 was not enough
+  break_count <- 1 + color_count
+  min_nonax <- min(x, na.rm = TRUE)
+  max_nonax <- max(x, na.rm = TRUE)
+  if (print_nb_messages) nb("within hm2plus", see_variable(divergent), "\n")
+  # NOTE(review): breaks and zlim computed here are used only by the
+  #   diagnostic messages; they are not passed to heatmap.2 (see the
+  #   commented-out `breaks` argument below)
+  if (divergent) {
+    zlim <- max(abs(min_nonax), abs(max_nonax))
+    if (print_nb_messages) nb(see_variable(pre_zlim <- zlim, "\n"))
+    breaks <- (zlim) / (break_count:1)
+    if (print_nb_messages) nb(see_variable(breaks, "\n"))
+    breaks <- breaks - median(breaks)
+    zlim <- c(-zlim, zlim)
+    if (print_nb_messages) nb(see_variable(zlim, "\n"))
+  } else {
+    zlim <- max(abs(min_nonax), abs(max_nonax))
+    if (print_nb_messages) nb(see_variable(pre_zlim <- zlim, "\n"))
+    breaks <- zlim / (break_count:1)
+    if (print_nb_messages) nb(see_variable(breaks, "\n"))
+    if (max_nonax < 0) {
+      breaks <- breaks - zlim
+      zlim <- c(-zlim, 0)
+    } else {
+      zlim <- c(0, zlim)
+    }
+    if (print_nb_messages) nb(see_variable(zlim, "\n"))
+  }
+  # fall back to layout dimensions derived from denwid and denhgt
+  if (is.null(widths)) widths <- c(denwid, 4 - denwid, 1.5)
+  if (is.null(heights)) heights <- c(0.4, denhgt, 4.0)
+  colors <-
+    if (divergent && min_nonax < 0) {
+      # divergent colors on both sides of zero
+      colorRampPalette(c("red", "white", "blue"))(color_count)
+    } else if (divergent && min_nonax > 0) {
+      # "divergent" colors > zero
+      colorRampPalette(c("white", "blue"))(color_count)
+    } else if (divergent && max_nonax < 0) {
+      # "divergent" colors < zero
+      # NOTE(review): this branch cannot be reached, because
+      #   max_nonax < 0 implies min_nonax < 0, which selects the
+      #   first branch; confirm intent before reordering
+      colorRampPalette(c("red", "white"))(color_count)
+    } else {
+      # "non-divergent" colors including zero
+      #   (also used when divergent is TRUE and min_nonax is exactly 0)
+      hcl.colors(color_count, "YlOrRd", rev = TRUE)
+    }
+  #ACE if (print_nb_messages) nb("within hm2plus", see_variable(key_par), "\n")
+  #ACE if (print_nb_messages) nb(see_variable(colors, "\n"))
+  #ACE key_par$col = colors
+  #ACE key_par$breaks = breaks
+  if (print_nb_messages) nb(see_variable(par(), "\n")) #ACE TODO remove me
+  if (print_nb_messages) cat("\\leavevmode\n\\linebreak\n") #ACE TODO remove me
+  suppressWarnings(
+    my_hm2(
+      x = x,
+      col = colors,
+      #ACE symkey = FALSE,
+      density.info = density_info,
+      srtCol = srtcol,
+      srtRow = srtrow,
+      margins = margins,
+      lwid = widths,
+      lhei = heights,
+      key.title = NA,
+      key.xlab = key_xlab,
+      key.par = key_par,
+      lmat = mat,
+      notecol = notecol,
+      trace = trace,
+      # NOTE(review): hard-coded "yellow" looks like a debugging
+      #   leftover - confirm whether it has any visible effect
+      bg = "yellow",
+      hclustfun = hclustfun,
+      #ACE breaks = breaks,
+      oldstyle = FALSE,
+      ... # varargs
+    )
+  )
+  # implicitly returning value returned by heatmap.2
+}
 # draw_kseaapp_summary_heatmap is a helper function for ksea_heatmap
 draw_kseaapp_summary_heatmap <- function(
-x,
+x,                       # matrix with row/col names already formatted
-sample_cluster,
+sample_cluster,          # a binary input of TRUE or FALSE,
-merged_asterisk,
+#   indicating whether or not to perform
-my_cex_row,
+#   hierarchical clustering of the sample columns
-color_breaks,
+merged_asterisk,         # matrix having dimensions of x, values "*" or ""
-margins,
+color_breaks,            # breaks for color gradation, from create_breaks
+#   passed to `breaks` argument of `image`
+margins = c(8, 15),      # two integers setting the bottom and right margins
+#   to accommodate row and column labels
+master_cex = 0.7,        # basis for text sizes
 ...
 ) {
 merged_scores <- x
 if (!is.matrix(x)) {
 cat(
 "No plot because \\texttt{typeof(x)} is '",
 typeof(x),
 "' rather than 'matrix'.\n\n"
 )
 )
-} else if (nrow(x) < 2) {
+cat_variable(x)
-cat("No plot because matrix has ", nrow(x), " rows.\n\n")
 return(FALSE)
-} else if (ncol(x) < 2) {
+}
-cat("No plot because matrix x has ", ncol(x), " columns.\n\n")
+if (print_trace_messages) cat(sprintf("master_cex = %03f\n\n", master_cex))
+nrow_x <- nrow(x)
+ncol_x <- ncol(x)
+#if (nrow_x < 2) {
+if (nrow_x < 1) {
+cat("No plot because matrix has no rows.\n\n")
 return(FALSE)
-} else {
+} else if (nrow_x < 2) {
-my_limit <- 25
+cat("No plot because matrix has one row.  Matrix looks like this:\n\n")
-my_cex_col <- my_limit / (my_limit + ncol(x))
+cat("\n\\begin{verbatim}\n")
-my_cex_row <- my_limit / (my_limit + nrow(x))
+print(x)
-my_scale <- 12.0
+cat("\n\\end{verbatim}\n")
-if (ncol(x) < 10 && nrow(x) < 10)
+return(FALSE)
-my_scale <- my_scale * 10 / (10 - nrow(x)) * 10 / (10 - ncol(x))
+} else if (ncol_x < 2) {
-gplots::heatmap.2(
+cat("No plot because matrix x has ", ncol_x, " columns.\n\n")
-x            = merged_scores,
+cat_variable(x)
-Colv         = sample_cluster,
+return(FALSE)
-breaks       = color_breaks[[1]],
+}
-cellnote     = merged_asterisk,
+max_nchar_rowname <- max(nchar(rownames(x)))
-cexCol       = 0.9 * my_cex_col,
+max_nchar_colname <- max(nchar(colnames(x)))
-cexRow       = 2 * my_cex_row,
+my_limit <- g_intensity_hm_rows
-col          = color_breaks[[2]],
-density.info = "none",
+my_row_cex_scale <- master_cex * 150 / nrow_x
-key          = FALSE,
+my_col_cex_scale <- 3.0
-lhei         = c(0.4, 8.0, 1.1),
+my_asterisk_scale <- 0.4 * my_row_cex_scale
-lmat         = rbind(c(0, 3), c(2, 1), c(0, 4)),
+my_row_warp <- 1
-lwid         = c(0.5, 3),
+my_note_warp <- 2
-margins      = margins,
+my_row_warp <- 1
-notecex      = my_scale * my_cex_row * my_cex_col,
+my_row_cex_asterisk <-
-notecol      = "white",
+master_cex * my_row_warp * my_asterisk_scale
-scale        = "none",
-srtCol       = 45,
-srtRow       = 45,
+my_col_cex <- my_col_cex_scale * master_cex
-trace        = "none",
+my_row_cex <- min(3.5 * my_row_cex_asterisk, my_col_cex)
-...
+my_key_cex <- 1.286
-)
+my_hm2_cex <- 1 * master_cex
-return(TRUE)
+my_offset <- (4.8 / (9 + nrow_x / 10)) - 0.4
-}
+if (print_trace_messages) cat(sprintf("nrow_x = %03f\n\n", nrow_x))
-}
+if (print_trace_messages) cat(sprintf("my_offset = %03f\n\n", my_offset))
+my_offset <- 0.05
-# Adapted from KSEAapp::KSEA.Heatmap
+if (print_trace_messages) cat(sprintf("my_offset = %03f\n\n", my_offset))
+my_scale <- 3.0
+if (ncol_x < 10 && nrow_x < 10)
+my_scale <- my_scale * 10 / (10 - nrow_x) * 10 / (10 - ncol_x)
+my_heights <- c(
+0.15,
+3.85 - my_offset,
+0.5  + my_offset
+)
+my_margins <- c(1, 1) +
+c(
+margins[1] * 0.08 * max_nchar_colname * my_col_cex,
+margins[2] * 0.04 * max_nchar_rowname * my_row_cex
+)
+my_notecex <-
+my_scale *
+min(
+1.1,
+my_row_cex_asterisk * my_note_warp,
+my_col_cex * my_note_warp
+)
+if (print_trace_messages) {
+cat_variable(my_heights,          suffix = "; ")
+cat_variable(my_margins,          suffix = "\n\n")
+cat_variable(my_row_cex_scale,    suffix = "; ")
+cat_variable(my_col_cex_scale,    suffix = "\n\n")
+cat_variable(my_row_cex_asterisk, suffix = "\n\n")
+cat_variable(my_row_cex,          suffix = "; ")
+cat_variable(my_col_cex,          suffix = "\n\n")
+cat_variable(my_row_cex,          suffix = "; ")
+cat_variable(my_col_cex,          suffix = "\n\n")
+}
+hm2plus(
+x            = merged_scores,
+Colv         = sample_cluster,
+cellnote     = merged_asterisk,
+cex          = my_hm2_cex,
+cexCol       = my_col_cex,
+cexRow       = my_row_cex,
+denhgt       = 0.15,
+density_info = "none",
+denwid       = 0.5,
+divergent    = TRUE,
+key_par      = list(cex = my_key_cex),
+key_xlab     = "Z-score",
+margins      = my_margins,
+notecex      = my_scale * min(
+1.5,
+my_row_cex_asterisk * my_note_warp,
+my_col_cex * my_note_warp
+),
+notecol      = "white",
+scale        = "none",
+srtcol       = 90,
+srtrow       = 0,
+trace        = "none",
+mat          = matrix(
+c(
+c(0, 3, 3),
+c(2, 1, 1),
+c(0, 4, 0)
+),
+nrow = 3,
+ncol = 3,
+byrow = TRUE
+),
+widths       = c(0.5, 3.1, 0.9),
+heights      = my_heights,
+...
+)
+return(TRUE)
+}
+# function drawing heatmap of contrast fold-change for each kinase,
+#   adapted from KSEAapp::KSEA.Heatmap
 ksea_heatmap <- function(
 # the data frame outputs from the KSEA.Scores() function, in list format
 score_list,
 # a character vector of all the sample names for heatmap annotation:
 # - the names must be in the same order as the data in score_list
 # a numeric value between 0 and infinity indicating the min. number of
 #   substrates a kinase must have to be included in the heatmap
 m_cutoff,
 # a numeric value between 0 and 1 indicating the p-value/FDR cutoff
 #   for indicating significant kinases in the heatmap
-p_cutoff =
+p_cutoff = {
-stop("argument 'p_cutoff' is required for function 'ksea_heatmap'"),
+cat("argument 'p_cutoff' is required for function 'ksea_heatmap'")
+param_df_exit()
+knitr::knit_exit()
+},
 # a binary input of TRUE or FALSE, indicating whether or not to perform
 #   hierarchical clustering of the sample columns
 sample_cluster,
 # a binary input of TRUE or FALSE, indicating whether or not to export
 #   the heatmap as a .png image into the working directory
 export = FALSE,
-# bottom and right margins; adjust as needed if contrast names are too long
+# bottom and right margins; adjust as needed if contrast names are too long
+#   (URL found pasted mid-word in this comment:
+#    https://tex.stackexchange.com/a/56795)
-margins = c(6, 20),
+margins = c(6, 6),
 # print which kinases?
 # - Mandatory argument, must be one of const_ksea_.*_kinases
 which_kinases,
 # additional arguments to gplots::heatmap.2, such as:
 # - main: main title of plot
 }
 return(new)
 }
 merged_asterisk <- as.matrix(asterisk(merged_stats, p_cutoff))
-# begin hack to print only significant rows
 asterisk_rows <- rowSums(merged_asterisk == "*") > 0
 all_rows <- rownames(merged_stats)
 names(asterisk_rows) <- all_rows
 non_asterisk_rows <- names(asterisk_rows[asterisk_rows == FALSE])
 asterisk_rows <- names(asterisk_rows[asterisk_rows == TRUE])
 merged_scores_asterisk <- merged_scores[names(asterisk_rows), , drop = FALSE]
 merged_scores_non_asterisk <- merged_scores[names(non_asterisk_rows), , drop = FALSE]
-# end hack to print only significant rows
 row_list <- list()
 row_list[[const_ksea_astrsk_kinases]] <- asterisk_rows
 row_list[[const_ksea_all_kinases]] <- all_rows
 row_list[[const_ksea_nonastrsk_kinases]] <- non_asterisk_rows
 stts <- merged_stats[my_row_names, , drop = FALSE]
 merged_asterisk <- as.matrix(asterisk(stts, p_cutoff))
 color_breaks <- create_breaks(scrs)
 if (is.null(color_breaks)) {
-cat("No plot because matrix has too many missing values.\n\n")
+cat("No plot because matrix has too few rows.\n\n")
 return(NULL)
 }
 plot_height <- nrow(scrs) ^ 0.55
 plot_width <- ncol(scrs) ^ 0.7
-my_cex_row <- 0.25 * 16 / plot_height
 if (export == "TRUE") {
 png(
 "KSEA.Merged.Heatmap.png",
 width = plot_width * 300,
 height = 2 * plot_height * 300,
 }
 did_draw <- draw_kseaapp_summary_heatmap(
 x               = scrs,
 sample_cluster  = sample_cluster,
 merged_asterisk = merged_asterisk,
-my_cex_row      = my_cex_row,
 color_breaks    = color_breaks,
 margins         = margins
 )
 if (export == "TRUE") {
 dev.off()
 if (!did_draw)
 return(NULL)
 return(my_row_names)
 }
-# helper for heatmaps of phosphopeptide intensities
+# helpers for heatmaps of phosphopeptide intensities
-draw_ppep_heatmap <-
+# factory producing a function that truncates strings after n characters
+#   - the produced function returns an unnamed character vector having one
+#     element per element of x (character(0) when x is empty):
+#     - "NA" when the element is NA
+#     - strtrim(s, n) followed by "..." when nchar(s) > n
+#     - the element itself otherwise
+#     - when `nchar(s) > n` does not evaluate to a logical, the `message`
+#       of the value returned by try_catch_w_e (presumably a captured
+#       condition - confirm against that helper)
+trunc_n <- function(n) {
+  force(n)
+  function(x) {
+    # vapply (rather than sapply) guarantees a character result,
+    #   even for zero-length x
+    vapply(
+      X = x,
+      FUN = function(s) {
+        if (is.na(s))
+          return("NA")
+        cond <- try_catch_w_e(nchar(s) > n)
+        if (!is.logical(cond$value)) {
+          cond$value$message
+        } else if (cond$value) {
+          paste0(
+            strtrim(s, n),
+            "..."
+          )
+        } else {
+          as.character(s)
+        }
+      },
+      FUN.VALUE = character(1),
+      USE.NAMES = FALSE
+    )
+  }
+}
+# truncation helpers; each limit is looked up when the helper is called
+trunc_long_ppep <- function(x) trunc_n(40)(x)
+trunc_ppep <- function(x) trunc_n(g_ppep_trunc_n)(x)
+trunc_subgene <- function(x) trunc_n(g_subgene_trunc_n)(x)
+trunc_enriched_substrate <- function(x) trunc_n(g_sbstr_trunc_n)(x)
+# factory producing a function that filters a covariance matrix, keeping
+#   only the rows (and matching columns) whose variance, i.e., diagonal
+#   entry, is strictly greater than v_min
+#   - the produced function returns NULL when x is not a matrix or when
+#     no variance exceeds v_min
+#   - otherwise it returns a matrix, even when only one row is kept
+keep_cov_w_var_gtr_min <- function(v_min) {
+  force(v_min)
+  function(x) {
+    if (!is.matrix(x))
+      return(NULL)
+    variances <- diag(x)
+    # an NA variance cannot be shown to exceed v_min, so it is dropped
+    keep <- !is.na(variances) & variances > v_min
+    if (!any(keep))
+      return(NULL)
+    # drop = FALSE keeps a 1 x 1 result a matrix having dimnames
+    x[keep, keep, drop = FALSE]
+  }
+}
+# function that returns a covariance matrix's rows (and columns)
+#   having variance > 1
+keep_cov_w_var_gtr_1 <- keep_cov_w_var_gtr_min(1)
+# factory producing a function that returns
+#   - either a matrix's rows (rows = TRUE)
+#   - or a matrix's columns (rows = FALSE)
+#   having variance > v_min, where variance is computed with na.rm = TRUE
+#   - rows (or columns) whose variance is NA are dropped
+#   - the produced function returns NULL when x is not a matrix; otherwise
+#     it returns a matrix, even when one or zero rows (or columns) remain
+keep_var_gtr_min <- function(v_min) {
+  force(v_min)
+  function(x, rows = TRUE) {
+    if (!is.matrix(x))
+      return(NULL)
+    count <- if (rows) nrow(x) else ncol(x)
+    # vapply (rather than sapply) yields numeric(0), not list(), when
+    #   there are no rows (or columns) to examine
+    variances <- vapply(
+      X = seq_len(count),
+      FUN = function(i) {
+        var(
+          if (rows) x[i, ] else x[, i],
+          na.rm = TRUE
+        )
+      },
+      FUN.VALUE = numeric(1)
+    )
+    keep <- !is.na(variances) & variances > v_min
+    # drop = FALSE keeps the result a matrix when a single row
+    #   (or column) survives
+    if (rows) x[keep, , drop = FALSE] else x[, keep, drop = FALSE]
+  }
+}
+# function that returns a matrix's rows (or columns) having variance > 0
+keep_var_gtr_0 <- keep_var_gtr_min(0)
+# function drawing heatmap of phosphopeptide intensities
+ppep_heatmap <-
 function(
 m,                              # matrix with rownames already formatted
 cutoff,                         # cutoff used by hm_heading_function
-hm_heading_function,            # construct and cat heading from m and cutoff
+hm_heading_function,            # construct & cat heading from m and cutoff
 hm_main_title,                  # main title for plot (drawn below heading)
 suppress_row_dendrogram = TRUE, # set to false to show dendrogram
-max_peptide_count               # experimental:
+max_peptide_count =             # experimental:
-= intensity_hm_rows,      #   values of 50 and 75 worked well
+g_intensity_hm_rows,    #   values of 50 and 75 worked well
-...                             # passthru parameters for heatmap
+master_cex = 1.0,               # basis for text sizes
+margins = NULL,                 # optional margins (bottom, right)
+cellnote = NULL,                # optional matrix of character; dim = dim(m)
+adj = 0.5,                      # adjust text: 0 left, 0.5 middle, 1 right
+...                             # passthru to hm2plus or heatmap.2
 ) {
+use_heatmap_1 <- FALSE
 peptide_count <- 0
 # emit the heading for the heatmap
 if (hm_heading_function(m, cutoff)) {
-peptide_count <- min(max_peptide_count, nrow(m))
+nrow_m <- nrow(m)
-if (nrow(m) > 0) {
+peptide_count <- min(max_peptide_count, nrow_m)
+if (nrow_m > 1) {
 m_margin <- m[peptide_count:1, ]
-# Margin setting was heuristically derived
+# Margin was heuristically derived to accommodate the widest label
-margins <-
+row_mchar_max <- max(nchar(rownames(m_margin)))
-c(0.5, # col
+col_mchar_max <- max(nchar(colnames(m_margin)))
-max(80, sqrt(nchar(rownames(m_margin)))) * 5 / 16  # row
+row_margin <- master_cex * row_mchar_max * 2.6
-)
+col_margin <- master_cex * col_mchar_max * 2.6
-}
+if (print_trace_messages) cat(sprintf("row_margin = %0.3f; ", row_margin))
-if (nrow(m) > 0) {
+if (print_trace_messages) cat(sprintf("col_margin = %0.3f; ", col_margin))
 hm_call <- NULL
 tryCatch(
 {
-old_oma <- par("oma")
+# set non-argument parameters for hm_call inner function
-par(cex.main = 0.6)
+my_row_cex <-
-# Heuristically determined character size adjustment formula
+master_cex * 200000 / (
-my_cex_row <-
+(max(nchar(rownames(m_margin)))^2) * g_intensity_hm_rows
-250000 / (
-max(4500, (nchar(rownames(m_margin)))^2) * intensity_hm_rows
 )
 m_hm <-  m[peptide_count:1, , drop = FALSE]
-my_limit <- 60
+if (is.null(cellnote)) {
-my_cex_col <- 0.75 * my_limit / (my_limit + ncol(m_hm))
+cellnote <-  matrix("", nrow = nrow(m_hm), ncol = ncol(m_hm))
+cellnote[is.na(m_hm)] <- "NA"
+} else {
+cellnote <-  cellnote[peptide_count:1, , drop = FALSE]
+}
+m_hm[is.na(m_hm)] <- 0
+nrow_m_hm <- nrow(m_hm)
+ncol_m_hm <- ncol(m_hm)
+if (nrow_m_hm < 1 || ncol_m_hm < 1)
+return(peptide_count) # return zero as initialized above
+my_limit <- g_intensity_hm_rows
+my_row_cex <- master_cex * (100 / (2 + row_mchar_max))
+my_col_cex <- master_cex * 6 * row_margin / col_margin
+my_col_adj <- min(my_col_cex, my_row_cex) / my_col_cex
+my_col_cex <- min(my_col_cex, my_row_cex)
+col_margin <- sqrt(my_col_adj) * col_margin
+if (print_trace_messages) cat(sprintf("my_row_cex = %0.3f; ", my_row_cex))
+if (print_trace_messages) cat(sprintf("my_col_cex = %0.3f; ", my_col_cex))
+if (is.null(margins)) my_margins <-
+c(
+(my_col_cex + col_margin), # col
+(my_row_cex + row_margin) / my_row_cex  # row
+)
+else
+my_margins <- margins
+if (print_trace_messages) cat(
+sprintf(
+"my_margins = c(%s)\n\n",
+paste(my_margins, collapse = ", ")
+)
+)
+my_hm2_cex <- 2 * master_cex
+my_key_cex <- 0.9 - 0.1 * (g_intensity_hm_rows + nrow_m_hm) / g_intensity_hm_rows
+my_key_warp <- 1.5 * 22.75 / row_margin
+my_key_cex <- min(1.10, my_key_warp * my_key_cex)
+my_hgt_scale <- 3.70 - 0.4 * (max(1, 0.9 * my_row_cex) - 1)
+my_hgt_scale <- 3.75 # 3.615
+my_hgt_scale <- 3.60 # 3.615
+if (print_trace_messages)
+cat_variable(my_hgt_scale, "\n\n", 3)
+my_warp <- max(0.1, 1.4 * (7.5 + nrow_m) / g_intensity_hm_rows)
+if (print_trace_messages)
+cat_variable(my_warp, "\n\n", 3)
+# added 0.9 heuristically...
+my_plot_height <-
+(0.566 + 0.354 * (nrow_m / g_intensity_hm_rows)) *
+min(my_hgt_scale, my_hgt_scale * my_warp)
+my_plot_height <- min(3.65, my_plot_height * g_intensity_hm_rows / 50)
+my_heights <- c(
+0.3,                                 # title and top dendrogram
+my_plot_height,                      # plot and bottom margin
+4.15 - my_hgt_scale,                 # legend
+0.05 + my_hgt_scale - my_plot_height # whitespace below legend
+)
+my_note_cex <- min(0.8, my_row_cex, my_col_cex)
+if (print_trace_messages) {
+cat_variable(my_plot_height, "\n\n", 3)
+cat_variable(4.19 - my_hgt_scale, "\n\n", 3)
+cat_variable(nrow_m_hm, "; ", 0)
+cat_variable(ncol_m_hm, "; ", 0)
+cat_variable(my_row_cex, "; ", 3)
+cat_variable(my_col_cex, "; ", 3)
+cat_variable(my_note_cex, "; ", 3)
+cat_variable(my_key_cex, "\n\n", 3)
+cat_variable(my_hgt_scale, "; ", 3)
+cat_variable(my_plot_height, "; ", 3)
+cat_variable(my_warp, "\n\n", 3)
+cat_variable(my_heights, "; ", 2)
+cat_variable(sum(my_heights), "\n\n", 3)
+}
+# define hm_call inner function
 hm_call <- function(x, scaling, title) {
-heatmap(
+my_cex_main <- min(5.0, 220 / nchar(title))
-x,
+op <- par(
-Rowv = if (suppress_row_dendrogram) NA else NULL,
+cex.main = my_cex_main * master_cex,
-Colv = NA,
+adj = adj
-cexRow = my_cex_row,
+)
-cexCol = my_cex_col,
+if (
-scale = scaling,
+!is.null(
-margins = margins,
+hm2plus(
-main = title,
+x,
-xlab = "",
+Colv = NA,
-las = 1,
+Rowv = TRUE,
-...
+cexRow = my_row_cex,
+cexCol = my_col_cex,
+dendrogram = "row",
+las = 1,
+main = title,
+key_xlab = latex2exp::TeX("$log_{10}$(peptide intensity)"),
+cex = my_hm2_cex,
+key_par = list(cex = my_key_cex),
+margins = my_margins,
+widths =  c(0.4, 2.6, 1.5),
+heights = my_heights,
+mat = matrix(
+c(
+c(0, 3, 3),
+c(2, 1, 1),
+c(0, 4, 0),
+c(0, 0, 0)
+),
+nrow = 4,
+ncol = 3,
+byrow = TRUE
+),
+na.rm = TRUE,
+scale = scaling,
+srtcol = 90,
+srtrow = 0,
+xlab = "",
+cellnote = cellnote,
+notecex = my_note_cex,
+...
+)
 )
+) {
+if (print_trace_messages) cat(
+sprintf(
+"my_heights = c(%s); sum = %0.3f\n\n",
+paste(
+sprintf("%0.3f", my_heights),
+collapse = ", "
+),
+sum(my_heights)
+)
+)
+if (print_trace_messages) cat(
+sprintf("my_key_cex = %0.3f\n\n",
+my_key_cex)
+)
+if (print_trace_messages) cat(
+sprintf("my_key_cex/my_heights[3] = %0.3f\n\n",
+my_key_cex / my_heights[3])
+)
+if (print_trace_messages) cat(
+sprintf("my_heights[2]/my_heights[3] = %0.3f\n\n",
+my_heights[2] / my_heights[3])
+)
+}
+par(op)
 }
+# invoke hm_call inner function
 if (sum(rowSums(!is.na(m_hm)) < 2))
 hm_call(
 m_hm,
 "none",
 "log(intensities), unscaled, unimputed, and unnormalized"
 if (nrow(m_hm) > 1)
 hm_call(
 m_hm,
 "none",
 paste(
-"log(intensities), unscaled, unimputed,",
+"log(intensities), unscaled,",
-"NAs zeroed, unnormalized"
+"zero-imputed, unnormalized"
 )
 )
 else
 cat("\nThere are too few peptides to produce a heatmap.\n")
 },
 error = function(r) {
 cat(
 sprintf(
-"\n%s %s Internal message: %s\n",
+"\n%s %s Internal message: %s\n\\newline\n\n",
-"Could not draw heatmap,",
+"Failure drawing heatmap,",
-"possibly because of too many missing values.",
+"possibly because of too many missing values.\n\\newline\n\n",
 r$message
 )
 )
+cat_margins()
 }
 )
 } else {
 cat(
-"\nCould not draw heatmap, possibly because of too many missing values.\n"
+"\nFailure drawing heatmap, possibly because of too many missing values.\n"
 )
 }
-},
+}
-finally = par(old_oma)
 )
 }
 }
-return(peptide_count)
+# return value:
-}
+peptide_count
+}
+# function drawing heatmap of correlations if they exist, else covariances
+cov_heatmap <-
+function(
+m,                              # matrix with rownames already formatted
+top_substrates = FALSE,
+...                             # passthru to hm2plus or heatmap.2
+) {
+if (print_nb_messages) nbe(see_variable(m), " [", nrow(m), "x", ncol(m), "\n")
+#ACE nb(rowSums(m, na.rm = TRUE))
+#ACE bad_rows <- (rowSums(m, na.rm = TRUE) == 0)
+#ACE nb(see_variable(bad_rows))
+#ACE m <- m[-bad_rows, , drop = FALSE]
+colnames_m <- colnames(m)
+is_na_m <- is.na(m)
+tmp <- m
+tmp[is_na_m] <- 0
+tmp <- m[, 0 < colSums(x = tmp)] # by default, na.rm is FALSE
+colnames_tmp <- colnames(tmp)
+my_low_p_seq <- seq(
+from = min(g_intensity_hm_rows, nrow(m)),
+to = 1,
+by = -1
+)
+if (g_correlate_substrates) {
+# zap samples having zero or near-zero variance
+tmp[is.na(tmp)] <- 0
+nzv <- caret::nearZeroVar(
+tmp,           # matrix of values, samples x variables
+freqCut = 1.01, # min(freq most prevalent value /
+#     freq second most prevalent)
+uniqueCut = 99 # max(number of unique values /
+#     total number of samples)
+)
+tmp <- if (length(nzv) > 0) {
+m[, -nzv, drop = FALSE]
+} else {
+m
+}
+} else {
+tmp <- m[my_low_p_seq, , drop = FALSE]
+}
+t_m <- t(tmp)
+t_m[is.na(t_m)] <- 0
+prefiltered_nrow <- ncol(t_m)
+my_corcov <- cov(t_m)
+did_filter_rows <- did_filter_cols <- FALSE
+if (g_correlate_substrates && !is_positive_definite(my_corcov)) {
+my_correlate_substrates <- FALSE
+t_m <- t(m[my_low_p_seq, , drop = FALSE])
+t_m[is.na(t_m)] <- 0
+unfiltered_row_count <- ncol(t_m)
+unfiltered_col_count <- nrow(t_m)
+# zap empty samples
+t_m <- t_m[0 < rowSums(x = t_m), ]
+# zap substrates present in fewer than two samples
+foo <- t_m > 0
+foo <- colSums(x = foo) > 1
+t_m <- t_m[, foo]
+did_filter_rows <- unfiltered_row_count > ncol(t_m)
+did_filter_cols <- unfiltered_col_count > nrow(t_m)
+colnames_tmp <- rownames(t_m)
+my_corcov <- cov(t_m)
+if (g_filter_cov_var_gt_1) {
+my_corcov <- keep_cov_w_var_gtr_1(my_corcov)
+}
+} else if (g_correlate_substrates) {
+my_corcov <- cov2cor(my_corcov)
+my_correlate_substrates <- TRUE
+} else {
+my_correlate_substrates <- FALSE
+if (g_filter_cov_var_gt_1) my_corcov <- keep_cov_w_var_gtr_1(my_corcov)
+}
+omitted_samples <- colnames_m[colnames_m %notin% colnames_tmp]
+suffix <- if (length(omitted_samples) > 1) "s" else ""
+f_omissions <-
+function(is_corr) {
+cat(
+sprintf(
+"Below is the %s plot for %s substrates",
+if (is_corr) "correlation" else "covariance",
+sprintf(
+if (top_substrates)
+"%0.0f \"highest-quality\""
+else
+"%0.0f",
+ncol(t_m)
+)
+)
+)
+if (did_filter_cols) {
+cat(sprintf(", omitting sample%s ", suffix))
+latex_collapsed_vector(", ", omitted_samples)
+}
+cat(".\n\n")
+}
+if (is.null(my_corcov) || sum(!is.na(t_m)) < 2) {
+cat(
+sprintf(
+"\\newline\n%s %s plot.\n",
+"Insufficient covariance to produce",
+if (my_correlate_substrates)
+"correlation"
+else
+"covariance"
+),
+"\\newpage\n"
+)
+return(NULL)
+}
+cat("\\leavevmode\n", "\\newpage\n")
+f_omissions(my_correlate_substrates)
+master_cex <- 0.4
+max_nchar <- max(nchar(rownames(t_m)))
+my_limit <- g_intensity_hm_rows
+diminution <- sqrt(my_limit  / (my_limit + ncol(t_m)))
+my_row_cex <-
+my_col_cex <-
+min(1.75, master_cex * 9 * diminution ^ 1.5)
+my_margin <- 3 + my_row_cex * 64 / (8 + max_nchar)
+my_key_cex <- 1.4
+my_hm2_cex <- 1.0 * master_cex
+my_hgt_scale <- 3.50 - 0.26 * (max(0.4, my_key_cex) - 0.4)
+my_hgt_scale <- 2.7
+my_legend_height <- 4.0 - my_hgt_scale
+my_legend_height <- 0.5 * my_key_cex
+my_warp <- 0.65 * (my_limit + ncol(t_m)) / my_limit
+my_warp <- 0.8
+my_legend_height <- 0.77
+my_legend_height <- 0.67
+my_plot_height <- my_hgt_scale + (1 - my_warp) * my_legend_height
+my_legend_height <- my_warp * my_legend_height
+parjust <- par(adj = 0.5)
+on.exit(par(parjust))
+my_corcov <- my_corcov[order(rownames(my_corcov)), ]
+my_main <-
+sprintf("%s among %s substrates %s",
+if (my_correlate_substrates) "Correlation"
+else "Covariance",
+kinase_name,
+if (!my_correlate_substrates &&
+g_filter_cov_var_gt_1 &&
+did_filter_rows
+)
+"having variance > 1"
+else ""
+)
+my_main_nchar <- nchar(my_main)
+my_heights <- c(
+0.3,
+my_plot_height,
+my_legend_height # was 4.0 - my_hgt_scale # was 4.19
+)
+if (print_trace_messages) cat(sprintf("max_nchar = %0.3f; ", max_nchar))
+if (print_trace_messages) cat(sprintf("my_margin = %0.3f; ", my_margin))
+if (print_trace_messages) cat(sprintf("my_plot_height = %0.3f\n\n", my_plot_height))
+if (print_trace_messages) cat(sprintf("master_cex = %0.3f; ", master_cex))
+if (print_trace_messages) cat(sprintf("my_row_cex = %0.3f; ", my_row_cex))
+if (print_trace_messages) cat(sprintf("my_col_cex = %0.3f; ", my_col_cex))
+if (print_trace_messages) cat(sprintf("my_key_cex = %0.3f\n\n", my_key_cex))
+if (print_trace_messages) cat(sprintf("my_hgt_scale = %0.3f\n\n", my_hgt_scale))
+if (print_trace_messages) cat(sprintf("legend height = %0.3f\n\n", my_legend_height))
+if (print_trace_messages) cat(
+sprintf(
+"my_heights = c(%s); sum = %0.3f\n\n",
+paste(
+sprintf("%0.3f", my_heights),
+collapse = ", "
+),
+sum(my_heights)
+)
+)
+op <- par(cex.main = (30 + my_main_nchar) / my_main_nchar)
+on.exit(par(op))
+hm2plus(
+x            = my_corcov,
+cex          = my_hm2_cex,
+cexCol       = my_col_cex,
+cexRow       = my_row_cex,
+density_info = "none",
+denhgt       = 0.15,
+denwid       = 0.5,
+divergent    = TRUE,
+key_par      = list(cex = my_key_cex),
+key_xlab     = if (my_correlate_substrates) "Correlation"
+else "Covariance",
+main         = my_main,
+mat          = matrix(
+c(
+c(0, 3, 3),
+c(2, 1, 1),
+c(0, 4, 0)
+),
+nrow = 3,
+ncol = 3,
+byrow = TRUE
+),
+heights = my_heights,
+margins      = c(my_margin, my_margin),
+widths       = c(0.5, 3.1, 0.9),
+scale        = "none",
+symkey       = TRUE,
+symbreaks    = TRUE,
+symm         = FALSE #TODO evaluate TRUE
+# ...
+)
+} # end cov_heatmap
+### FILE IMPORT
+# function reading bzipped file to data.frame
+bzip2df <- function(d, f, ctor = bzfile) {
+# Read a delimited file (by default, compressed by bzip2) with read.delim
+#   and assign the resulting data.frame, in the caller's frame, to the
+#   variable named by `d`.
+#   d:    unquoted name of the variable to assign in the calling frame
+#   f:    path to the file; nothing is read or assigned when the file
+#         does not exist
+#   ctor: connection constructor, invoked as ctor(f, open = "r");
+#         defaults to bzfile
+if (file.exists(f)) {
+conn <- NULL
+pf <- parent.frame()
+tryCatch(
+assign(
+as.character(substitute(d)),
+# honor the caller-supplied constructor rather than hard-coding bzfile
+read.delim(conn <- ctor(f, open = "r")),
+pf
+),
+# close the connection (if one was opened) whether or not the read succeeded
+finally = if (!is.null(conn)) close(conn)
+)
+}
+}
 ```
-```{r, echo = FALSE, fig.dim = c(9, 10), results = 'asis'}
-cat("\\listoftables\n")
-```
 # Purpose
-To perform for phosphopeptides:
+The purpose of this analysis is to perform for phosphopeptides:
 - imputation of missing values,
 - quantile normalization,
-- ANOVA (using the R stats::`r params$oneWayManyCategories` function), and
+- ANOVA (using the R stats::`r params$oneWayManyCategories` function),
+- assignment of an FDR-adjusted $p$-value and a "quality score" to each phosphopeptide, and
 - KSEA (Kinase-Substrate Enrichment Analysis) using code adapted from the CRAN `KSEAapp` package to search for kinase substrates from the following databases:
 - PhosphoSitePlus [https://www.phosphosite.org](https://www.phosphosite.org)
 - The Human Protein Reference Database [http://hprd.org](http://hprd.org)
 - NetworKIN [http://networkin.science/](http://networkin.science/)
 - Phosida [http://pegasus.biochem.mpg.de/phosida/help/motifs.aspx](http://pegasus.biochem.mpg.de/phosida/help/motifs.aspx)
 ```{r include = FALSE}
-### GLOBAL VARIABLES
+if (params$kseaUseAbsoluteLog2FC) {
+sfc <- "|s|"
-# parameters for KSEA
+pfc <- "|p|"
+pfc_txt <- "$\\text{absolute value}({\\log_2 (\\text{fold-change})})$"
-ksea_cutoff_statistic <- params$kseaCutoffStatistic
+} else {
-ksea_cutoff_threshold <- params$kseaCutoffThreshold
+sfc <- "s"
-ksea_min_kinase_count <- params$kseaMinKinaseCount
+pfc <- "p"
+pfc_txt <- "${\\log_2 (\\text{fold-change}})$"
+}
 ksea_heatmap_titles <- list()
 ksea_heatmap_titles[[const_ksea_astrsk_kinases]] <-
 sprintf(
 "Summary for all kinases enriched in one or more contrasts at %s < %s",
 ksea_cutoff_threshold
 )
 # hash to hold names of significantly enriched kinases
 ksea_asterisk_hash <- new_env()
-# READ PARAMETERS (mostly)
+# PROCESS (mostly read) PARAMETERS
-intensity_hm_rows <- params$intensityHeatmapRows
-# Input Filename
-input_file <- params$inputFile
-# First data column - ideally, this could be detected via regexSampleNames,
-#   but for now leave it as is.
-first_data_column <- params$firstDataColumn
-fdc_is_integer <- is.integer(first_data_column)
-if (fdc_is_integer) {
-first_data_column <- as.integer(params$firstDataColumn)
-}
 # False discovery rate adjustment for ANOVA
 #  Since pY abundance is low, set to 0.10 and 0.20 in addition to 0.05
-val_fdr <-
+val_fdr <- read.table(file = alpha_file, sep = "\t", header = FALSE, quote = "")
-read.table(file = params$alphaFile, sep = "\t", header = FALSE, quote = "")
 if (
 ncol(val_fdr) != 1 ||
 sum(!is.numeric(val_fdr[, 1])) ||
 sum(val_fdr[, 1] < 0) ||
 sum(val_fdr[, 1] > 1)
 ) {
-stop("alphaFile should be one column of numbers within the range [0.0,1.0]")
+cat("alphaFile should be one column of numbers within the range [0.0,1.0]")
+param_df_exit()
+knitr::knit_exit()
 }
 val_fdr <- val_fdr[, 1]
-#Imputed Data filename
-imputed_data_filename <- params$imputedDataFilename
-imp_qn_lt_data_filenm <- params$imputedQNLTDataFile
-anova_ksea_mtdt_file  <- params$anovaKseaMetadata
 ```
-```{r echo = FALSE}
+```{r echo = FALSE, results = 'asis'}
-# Imputation method, should be one of
-#   "random", "group-median", "median", or "mean"
-imputation_method <- params$imputationMethod
-# Selection of percentile of logvalue data to set the mean for random number
-#   generation when using random imputation
-mean_percentile <- params$meanPercentile / 100.0
-# deviation adjustment-factor for random values; real number.
-sd_percentile <- params$sdPercentile
-# Regular expression of Sample Names, e.g., "\\.(\\d+)[A-Z]$"
-regex_sample_names <- params$regexSampleNames
-# Regular expression to extract Sample Grouping from Sample Name;
-#   if error occurs, compare sample_treatment_levels vs. sample_name_matches
-#   to see if groupings/pairs line up
-#   e.g., "(\\d+)"
-regex_sample_grouping <- params$regexSampleGrouping
-one_way_all_categories_fname <- params$oneWayManyCategories
-one_way_all_categories <- try_catch_w_e(
-match.fun(one_way_all_categories_fname))
-if (!is.function(one_way_all_categories$value)) {
-write("fatal error for parameter oneWayManyCategories:", stderr())
-write(one_way_all_categories$value$message,             stderr())
-if (sys.nframe() > 0) quit(save = "no", status = 1)
-stop("Cannot continue. Goodbye.")
-}
-one_way_all_categories <- one_way_all_categories$value
-one_way_two_categories_fname <- params$oneWayManyCategories
-one_way_two_categories <- try_catch_w_e(
-match.fun(one_way_two_categories_fname))
-if (!is.function(one_way_two_categories$value)) {
-cat("fatal error for parameter oneWayTwoCategories: \n")
-cat(one_way_two_categories$value$message, fill = TRUE)
-if (sys.nframe() > 0) quit(save = "no", status = 1)
-stop("Cannot continue. Goodbye.")
-}
-one_way_two_categories <- one_way_two_categories$value
-preproc_db       <- params$preprocDb
-ksea_app_prep_db <- params$kseaAppPrepDb
 result <- file.copy(
 from      = preproc_db,
 to        = ksea_app_prep_db,
 overwrite = TRUE
 )
 preproc_db,
 ksea_app_prep_db,
 ),
 stderr()
 )
-if (sys.nframe() > 0) quit(save = "no", status = 1)
+if (sys.nframe() > 0) {
-stop("Cannot continue. Goodbye.")
+cat("Cannot continue and quit() failed. Goodbye.")
-}
+param_df_exit()
+knitr::knit_exit()
+# in case knit_exit doesn't exit
+quit(save = "no", status = 1)
+}
+}
+if (FALSE) {
+write.table(x = param_df, file = "test-data/params.txt")
+}
 ```
 ```{r echo = FALSE}
 ### READ DATA
 sep = "\t",
 header = TRUE,
 quote = "",
 check.names = FALSE
 )
 ```
-# Extract Sample Classes and Names
+# Extraction of Sample Classes and Names from Input Data
-Column names parsed from input file are shown in Table 1; sample classes and names, in Table 2.
 ```{r echo = FALSE, results = 'asis'}
 data_column_indices <- grep(first_data_column, names(full_data), perl = TRUE)
+my_column_names <- names(full_data)
 if (!fdc_is_integer) {
 if (length(data_column_indices) > 0) {
 first_data_column <- data_column_indices[1]
 } else {
-stop(paste("failed to convert firstDataColumn:", first_data_column))
+cat(paste("failed to convert firstDataColumn:", first_data_column))
+param_df_exit()
+knitr::knit_exit()
 }
 }
 cat(
 sprintf(
 paste(
-"\n\nThe input data file has peptide-intensity data for each sample",
+"\n\nThe input data file has peptide-intensity data",
-"in one of columns %d through %d.\n\n"
+"in columns %d (\"%s\") through %d (\"%s\")."
 ),
-min(data_column_indices),
+tmp <- min(data_column_indices),
-max(data_column_indices)
+my_column_names[tmp],
-)
+tmp <- max(data_column_indices),
-)
+my_column_names[tmp]
+)
-# Write column names as a LaTeX enumerated list.
+)
-column_name_df <- data.frame(
-column = seq_len(length(colnames(full_data))),
+if (TRUE) {
-name = paste0("\\verb@", colnames(full_data), "@")
+cat0(
-)
+table_offset(i = 1, new = TRUE),
-cat("\n\\begin{tiny}\n")
+"Sample classes and names are shown in ",
-data_frame_latex(
+table_href(),
-x = column_name_df,
+".\n\n"
-justification = "l l",
+)
-centered = TRUE,
+} else {
-caption = "Input data column names",
+cat0(
-anchor = const_table_anchor_bp,
+"\\newcounter{offset}\n",
-underscore_whack = FALSE
+"\\setcounter{offset}{\\value{table}}\n",
-)
+"\\stepcounter{offset}\n",
-cat("\n\\end{tiny}\n")
+"Sample classes and names are shown in ",
+table_href(),
+".\n\n"
+)
+}
+#TODO remove this unused variable and assignment
+if (FALSE) {
+# Write column names as a LaTeX enumerated list.
+column_name_df <- data.frame(
+column = seq_len(length(colnames(full_data))),
+name = paste0("\\verb@", colnames(full_data), "@")
+)
+}
 ```
 ```{r echo = FALSE, results = 'asis'}
+# extract intensity columns from full_data to quant_data
 quant_data <- full_data[first_data_column:length(full_data)]
 quant_data[quant_data == 0] <- NA
 rownames(quant_data) <- rownames(full_data) <- full_data$Phosphopeptide
+full_data_names <- colnames(quant_data)
 # Extract factors and trt-replicates using regular expressions.
 # Typically:
 #   regex_sample_names    is "\\.\\d+[A-Z]$"
 #   regex_sample_grouping is "\\d+"
 # This would distinguish trt-replicates by terminal letter [A-Z]
 colnames(quant_data) <- sample_name_matches
 write_debug_file(quant_data)
 rx_match <- regexpr(regex_sample_grouping, sample_name_matches, perl = TRUE)
-sample_treatment_levels <- as.factor(regmatches(sample_name_matches, rx_match))
+smpl_trt <- as.factor(regmatches(sample_name_matches, rx_match))
+if (print_nb_messages) nbe(see_variable(smpl_trt, "\n\n"))
+if (print_nb_messages) nbe(see_variable(sample_name_matches, "\n\n"))
+if (print_nb_messages) nbe(see_variable(full_data_names, "\n\n"))
+sample_treatment_df <-
+save_sample_treatment_df <-
+data.frame(
+class = smpl_trt,
+sample = sample_name_matches,
+full_sample_names = full_data_names
+)
+if (print_nb_messages) nbe(see_variable(sample_treatment_df, "\n\n"))
+# reorder data
+my_order <- with(sample_treatment_df, order(class, sample))
+quant_data <- quant_data[, my_order]
+sample_name_matches <- sample_name_matches[my_order]
+smpl_trt <- smpl_trt[my_order]
+sample_treatment_df <- data.frame(
+class = smpl_trt,
+sample = sample_name_matches
+)
+# filter smpl_trt as appropriate
+if (sample_group_filter %in% c("include", "exclude")) {
+include_sample <-
+mgrepl(
+v = sample_group_filter_patterns,
+s = as.character(smpl_trt),
+fixed = sample_group_filter_fixed,
+perl = sample_group_filter_perl,
+ignore.case = sample_group_filter_nocase
+)
+if (sum(include_sample) < 2) {
+errmsg <-
+paste(
+"ERROR:",
+sum(include_sample),
+"samples are too few for analysis;",
+"check input parameters for sample-name parsing"
+)
+cat0(
+errmsg,
+"\\stepcounter{offset}\n",
+" in ",
+table_href(),
+".\n\n"
+)
+data_frame_tabbing_latex(
+x = save_sample_treatment_df,
+tabstops = c(1.25, 1.25),
+caption = "Sample classes",
+use_subsubsection_header = FALSE
+)
+data_frame_tabbing_latex(
+x =
+param_df[
+c("regexSampleNames",
+"regexSampleGrouping",
+"groupFilterPatterns",
+"groupFilter",
+"groupFilterMode"
+),
+],
+tabstops = c(1.75),
+underscore_whack = TRUE,
+caption = "Input parameters for sample-name parsing",
+verbatim = FALSE
+)
+param_df_exit()
+knitr::knit_exit()
+return(invisible(-1))
+}
+sample_treatment_df <-
+if (sample_group_filter == "include")
+sample_treatment_df[include_sample, ]
+else
+sample_treatment_df[!include_sample, ]
+} else {
+include_sample <- rep.int(TRUE, length(smpl_trt))
+}
+sample_name_matches <- sample_treatment_df$sample
+rx_match <- regexpr(regex_sample_grouping, sample_name_matches, perl = TRUE)
+smpl_trt <- as.factor(regmatches(sample_name_matches, rx_match))
+sample_treatment_df$class <- smpl_trt
+# filter quant_data as appropriate
 number_of_samples <- length(sample_name_matches)
-sample_treatment_df <- data.frame(
+quant_data <- quant_data[, sample_name_matches]
-class = sample_treatment_levels,
-sample = sample_name_matches
+sample_level_integers <- as.integer(smpl_trt)
-)
+sample_treatment_levels <- levels(smpl_trt)
-# reorder data
+count_of_treatment_levels <- length(sample_treatment_levels)
-if (TRUE) {
-my_order <- with(sample_treatment_df, order(class, sample))
+# for each phosphopeptide, across treatment levels, compute minimum
-quant_data <- quant_data[, my_order]
+#   count of observed (i.e., non-missing) values
-sample_name_matches <- sample_name_matches[my_order]
+my_env <- new_env()
-sample_treatment_levels <- sample_treatment_levels[my_order]
+for (l in sample_treatment_levels)
-}
+my_env[[as.character(l)]] <-
-sample_treatment_df <- data.frame(
+as.vector(rowSums(!is.na(quant_data[l == smpl_trt])))
-class = sample_treatment_levels,
+min_group_obs_count <- row_apply(
-sample = sample_name_matches
+x = Reduce(
-)
+f = function(l, r) cbind(l, my_env[[r]]),
-data_frame_latex(
+x = sample_treatment_levels,
+init = c()
+),
+fun = min
+)
+names(min_group_obs_count) <- rownames(quant_data)
+rm(my_env)
+# display (possibly-filtered) results
+cat("\\newpage\n")
+if (sum(include_sample) > 1) {
+data_frame_tabbing_latex(
 x = sample_treatment_df,
-justification = "rp{0.2\\linewidth} lp{0.3\\linewidth}",
+tabstops = c(1.25),
-centered = TRUE,
 caption = "Sample classes",
-anchor = const_table_anchor_tbp,
+use_subsubsection_header = FALSE
-underscore_whack = FALSE
+)
-)
+}
-sample_name_shrink <- 10 / (10 + max(nchar(sample_name_matches)))
+sample_name_grow <- 10 / (10 + max(nchar(sample_name_matches) + 6))
+sample_colsep <- transition_positions(as.integer(sample_treatment_df$class))
 ```
 ```{r echo = FALSE, results = 'asis'}
 cat("\\newpage\n")
 ```
 ## Are the log-transformed sample distributions similar?
-```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'}
+```{r echo = FALSE, fig.dim = c(9, 6.5), results = 'asis'}
 quant_data[quant_data == 0] <- NA  #replace 0 with NA
 quant_data_log <- log10(quant_data)
 rownames(quant_data_log) <- rownames(quant_data)
 colnames(quant_data_log) <- sample_name_matches
 write_debug_file(quant_data_log)
-# data visualization
+g_ppep_distrib_ctl <- new_env()
-old_par <- par(
+g_ppep_distrib_ctl$xlab_line <- 3.5 + 11.86 * (0.67 - sample_name_grow)
-mai = par("mai") + c(0.5, 0, 0, 0)
+g_ppep_distrib_ctl$mai_bottom <- (0.5 + 3.95 * (0.67 - sample_name_grow))
-)
+g_ppep_distrib_ctl$axis <- (0.6 + 0.925 * (0.67 - sample_name_grow))
-# ref: https://r-charts.com/distribution/add-points-boxplot/
-# Vertical plot
+my_ppep_distrib_bxp <- function(
-boxplot(
+x
-quant_data_log
+, sample_name_grow = sample_name_grow
-, las = 2
+, main
-, cex.axis = 0.9 * sample_name_shrink
+, varwidth = FALSE
-, col = const_boxplot_fill
+, sub = NULL
-, ylab = latex2exp::TeX("$log_{10}$(peptide intensity)")
+, xlab
-, xlab = "Sample"
+, ylab
-)
+, col = const_boxplot_fill
-par(old_par)
+, notch = FALSE
+, ppep_distrib_ctl = g_ppep_distrib_ctl
+, ...
+) {
+# Draw a vertical boxplot of `x` with perpendicular (las = 2) axis labels,
+#   placing the x-axis label and the subtitle on explicit margin lines.
+#   x:                data passed to boxplot()
+#   sample_name_grow: accepted for call compatibility; not used in the body
+#   main, xlab, ylab: plot title and axis labels
+#   sub:              subtitle, drawn one line below the x-axis label
+#   varwidth, notch, col, ...: forwarded to boxplot()
+#   ppep_distrib_ctl: environment (or list) supplying `xlab_line`,
+#                     `mai_bottom`, and `axis`
+# Graphical parameters changed here are restored before returning.
-cat("\n\n\n")
+# read layout controls from the argument (not the global) so that a
+#   caller-supplied ppep_distrib_ctl takes effect
+my_xlab_line  <- ppep_distrib_ctl$xlab_line
-cat("\n\n\n")
+my_mai_bottom <- ppep_distrib_ctl$mai_bottom
+my_axis       <- ppep_distrib_ctl$axis
+if (print_trace_messages) {
+cat_variable(my_xlab_line, suffix = "; ")
+cat_variable(my_mai_bottom, suffix = "; ")
+cat_variable(my_axis, suffix = "\n\n")
+}
+# enlarge the bottom margin and scale the axis text; remember old settings
+old_par <- par(
+mai = par("mai") + c(my_mai_bottom, 0, 0, 0),
+cex.axis = my_axis,
+cex.lab = 1.2
+)
+tryCatch(
+{
+# Vertical plot
+# sub and xlab are suppressed here and drawn by title() below
+boxplot(
+x
+, las = 2
+, col = col
+, main = main
+, sub = NULL
+, ylab = ylab
+, xlab = NULL
+, notch = notch
+, varwidth = varwidth
+, ...
+)
+title(
+sub = sub
+, cex.sub = 1.0
+, line = my_xlab_line + 1
+)
+title(
+xlab = xlab
+, line = my_xlab_line
+)
+},
+# restore graphical parameters even if plotting fails
+finally = par(old_par)
+)
+}
+my_ppep_distrib_bxp(
+x = quant_data_log
+, sample_name_grow = sample_name_grow
+, main = "Peptide intensities for each sample"
+, varwidth = boxplot_varwidth
+, sub = "Box widths reflect number of peptides for sample"
+, xlab = "Sample"
+, ylab = latex2exp::TeX("$log_{10}$(peptide intensity)")
+, col = const_boxplot_fill
+, notch = FALSE
+)
+cat("\n\n\n\n")
 ```
 ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 4), results = 'asis'}
 if (nrow(quant_data_log) > 1) {
 main = latex2exp::TeX("Frequency vs. $log_{10}$(peptide intensity)"),
 xlab = latex2exp::TeX("$log_{10}$(peptide intensity)")
 )
 ```
+# Characterization of Input Data
 ## Distribution of standard deviations of $log_{10}(\text{intensity})$, ignoring missing values
 ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 5), results = 'asis'}
 # determine quantile
 q1 <- quantile(logvalues, probs = mean_percentile)[1]
 # 1 = row of matrix (ie, phosphopeptide)
-sds <- apply(quant_data_log, 1, sd_finite)
+sds <- row_apply(quant_data_log, sd_finite)
 if (sum(!is.na(sds)) > 2) {
 plot(
 density(sds, na.rm = TRUE)
 , main = "Smoothed estimated probability density vs. std. deviation"
 , sub = "(probability estimation made with Gaussian smoothing)"
 ```{r echo = FALSE}
 # prep for trt-median based imputation
 ```
-# Impute Missing Values
+# Imputation of Missing Values
 ```{r echo = FALSE}
 imp_smry_pot_peptides_before <- nrow(quant_data_log)
 imp_smry_missing_values_before   <- number_to_impute
 quant_data_imp <- quant_data
 imputation_method_description <-
 paste("Substitute missing value with",
 "median peptide-intensity for sample group.\n"
 )
-sample_level_integers <- as.integer(sample_treatment_levels)
 # Take the accurate ln(x+1) because the data are log-normally distributed
 #   and because median can involve an average of two measurements.
 quant_data_imp <- log1p(quant_data_imp)
-for (i in seq_len(length(levels(sample_treatment_levels)))) {
+for (i in seq_len(count_of_treatment_levels)) {
 # Determine the columns for this factor-level
 level_cols <- i == sample_level_integers
 # Extract those columns
 lvlsbst <- quant_data_imp[, level_cols, drop = FALSE]
 # assign to ind the row-column pairs corresponding to each NA
 ind <- which(is.na(lvlsbst), arr.ind = TRUE)
 # No group-median exists if there is only one sample
 #   a given ppep has no measurement; otherwise, proceed.
 if (ncol(lvlsbst) > 1) {
 the_centers <-
-apply(lvlsbst, 1, median, na.rm = TRUE)
+row_apply(lvlsbst, median, na.rm = TRUE)
 for (j in seq_len(nrow(lvlsbst))) {
 for (k in seq_len(ncol(lvlsbst))) {
 if (is.na(lvlsbst[j, k])) {
 lvlsbst[j, k] <- the_centers[j]
 }
 "median peptide-intensity across all sample classes.\n"
 )
 # Take the accurate ln(x+1) because the data are log-normally distributed
 #   and because median can involve an average of two measurements.
 quant_data_imp <- log1p(quant_data_imp)
-quant_data_imp[ind] <- apply(quant_data_imp, 1, median, na.rm = TRUE)[ind[, 1]]
+quant_data_imp[ind] <- row_apply(quant_data_imp, median, na.rm = TRUE)[ind[, 1]]
 # Take the accurate e^x - 1 to match scaling of original input.
 quant_data_imp <- round(expm1(quant_data_imp_ln <- quant_data_imp))
 good_rows <- !is.nan(rowMeans(quant_data_imp))
 }
 , "mean" = {
 # Take the accurate ln(x+1) because the data are log-normally distributed,
 #   so arguments to mean should be previously transformed.
 #   this will have to be
 quant_data_imp <- log1p(quant_data_imp)
 # Assign to NA cells the mean for the row
-quant_data_imp[ind] <- apply(quant_data_imp, 1, mean, na.rm = TRUE)[ind[, 1]]
+quant_data_imp[ind] <- row_apply(quant_data_imp, mean, na.rm = TRUE)[ind[, 1]]
 # Take the accurate e^x - 1 to match scaling of original input.
 quant_data_imp <- round(expm1(quant_data_imp_ln <- quant_data_imp))
 good_rows <- !is.nan(rowMeans(quant_data_imp))
 }
 , "random" = {
 ```{r echo = FALSE}
 imp_smry_pot_peptides_after <- sum(good_rows)
 imp_smry_rejected_after  <- sum(!good_rows)
 imp_smry_missing_values_after   <- sum(is.na(quant_data_imp[good_rows, ]))
+# From ?`%in%`, %in% is currently defined as function(x, table) match(x, table, nomatch = 0) > 0
+sink(stderr())
+print("`%in%`:")
+print(`%in%`)
+sink()
+stock_in <-
+names(good_rows) %in%
+names(min_group_obs_count[g_intensity_min_per_class <= min_group_obs_count])
+if (print_nb_messages) nbe(see_variable(stock_in), "\n")
+explicit_in <-
+0 < match(
+names(good_rows),
+names(min_group_obs_count[g_intensity_min_per_class <= min_group_obs_count])
+)
+if (print_nb_messages) nbe(see_variable(explicit_in), "\n")
+great_enough_row_names <- good_rows[
+names(good_rows) %in%
+names(min_group_obs_count[g_intensity_min_per_class <= min_group_obs_count])
+]
+if (print_nb_messages) nbe(see_variable(great_enough_row_names), "\n")
+great_enough_row_names <- great_enough_row_names[great_enough_row_names]
+if (print_nb_messages) nbe(see_variable(great_enough_row_names), "\n")
 ```
 ```{r echo = FALSE, results = 'asis'}
 # ref: http://www1.maths.leeds.ac.uk/latex/TableHelp1.pdf
 tabular_lines_fmt <- paste(
 "\\begin{table}[hb]", # h(inline); b(bottom); t (top) or p (separate page)
+" \\leavevmode",
 " \\caption{Imputation Results}",
 " \\centering", # \centering centers the table on the page
 " \\begin{tabular}{l c c c}",
 "  \\hline\\hline",
 "  \\  & potential peptides   & missing values & rejected",
 "    peptides \\\\ [0.5ex]",
 "  \\hline",
-"  before imputation & %d     & %d (%d\\%s)    &    \\\\",
+"  before imputation     & %d & %d (%d\\%s) &    \\\\",
-"  after imputation  & %d     & %d             & %d \\\\ [1ex]",
+"  after imputation      & %d & %d          & %d \\\\",
+"  after keep comparable & %d &             & %d \\\\ [1ex]",
 "  \\hline",
 " \\end{tabular}",
 #" \\label{table:nonlin}", # may be used to refer this table in the text
 "\\end{table}",
 sep = "\n"
 imp_smry_missing_values_before,
 imp_smry_pct_missing,
 "%",
 imp_smry_pot_peptides_after,
 imp_smry_missing_values_after,
-imp_smry_rejected_after
+imp_smry_rejected_after,
+length(great_enough_row_names),
+imp_smry_pot_peptides_before -
+length(great_enough_row_names)
 )
 cat(tabular_lines)
 ```
-```{r echo = FALSE}
+```{r filter_good_rows, echo = FALSE}
-# Zap rows where imputation was ineffective
+if (print_nb_messages) nbe("before name extraction, ", see_variable(length(good_rows)), " ", see_variable(good_rows), "\n")
+good_rows <- names(good_rows[names(great_enough_row_names)])
+if (print_nb_messages) nbe("after name extraction, ", see_variable(length(good_rows)), see_variable(good_rows), "\n")
+#ACE min_group_obs_count <- min_group_obs_count[names(great_enough_row_names)]
+#ACE nbe("good_rows")
+#ACE nbe(see_variable(good_rows))
+#ACE nbe("names(min_group_obs_count) before filter for good rows")
+#ACE nbe(see_variable(names(min_group_obs_count)))
+min_group_obs_count <- min_group_obs_count[good_rows]
+#ACE nbe("min_group_obs_count after filter for good rows")
+#ACE nbe(see_variable(names(min_group_obs_count)))
+# Zap rows where imputation was insufficiently effective
 full_data         <- full_data        [good_rows, ]
 quant_data        <- quant_data       [good_rows, ]
+quant_data_log    <- quant_data_log   [good_rows, ]
+if (print_nb_messages) nbe("before row filter, ", see_variable(nrow(quant_data_imp)), "\n")
 quant_data_imp <- quant_data_imp[good_rows, ]
+if (print_nb_messages) nbe("after row filter, ", see_variable(nrow(quant_data_imp)), "\n")
 write_debug_file(quant_data_imp)
 quant_data_imp_good_rows <- quant_data_imp
 write_debug_file(quant_data_imp_good_rows)
 ```
 d_imputed <- d_combined
 }
 ```
-```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'}
+```{r echo = FALSE, fig.dim = c(9, 6.5), results = 'asis'}
 zero_sd_rownames <-
 rownames(quant_data_imp)[
-is.na((apply(quant_data_imp, 1, sd, na.rm = TRUE)) == 0)
+is.na((row_apply(quant_data_imp, sd, na.rm = TRUE)) == 0)
 ]
 if (length(zero_sd_rownames) >= nrow(quant_data_imp)) {
-stop("All peptides have zero standard deviation.  Cannot continue.")
+cat("All peptides have zero standard deviation.  Cannot continue.")
+param_df_exit()
+knitr::knit_exit()
 }
 if (length(zero_sd_rownames) > 0) {
 cat(
-sprintf("%d peptides with zero variance were removed from statistical consideration",
+sprintf(
-length(zero_sd_rownames)
+"%d %s %s",
+length(zero_sd_rownames),
+"peptides with zero variance",
+"were removed from statistical consideration"
 )
 )
 zap_named_rows <- function(df, nms) {
 return(df[!(row.names(df) %in% nms), ])
 }
-quant_data_imp <- zap_named_rows(quant_data_imp, zero_sd_rownames)
+quant_data_imp <-
-quant_data     <- zap_named_rows(quant_data,     zero_sd_rownames)
+zap_named_rows(quant_data_imp, zero_sd_rownames)
-full_data      <- zap_named_rows(full_data,      zero_sd_rownames)
+quant_data <-
+zap_named_rows(quant_data, zero_sd_rownames)
+full_data <-
+zap_named_rows(full_data, zero_sd_rownames)
+min_group_obs_count <-
+min_group_obs_count[names(min_group_obs_count) %notin% zero_sd_rownames]
 }
 if (sum(is.na(quant_data)) > 0) {
 cat("\\leavevmode\\newpage\n")
-# data visualization
-old_par <- par(
-mai = par("mai") + c(0.5, 0, 0, 0)
-)
 # Copy quant data to x
 x <- quant_data
 # x gets to have values of:
 #  - NA for observed values
 #  - 1 for missing values
 max(red_dots, blue_dots, na.rm = TRUE)
 )
 show_stripchart <-
 50 > (count_red + count_blue) / length(sample_name_matches)
 if (show_stripchart) {
-boxplot_sub <- "Light blue = data before imputation; Red = imputed data"
+boxplot_sub <- "Light blue = data before imputation; Red = imputed data;"
 } else {
 boxplot_sub <- ""
 }
 # Vertical plot
 colnames(blue_dots) <- sample_name_matches
-boxplot(
+my_ppep_distrib_bxp(
-blue_dots
+x = blue_dots
-, las = 2 # "always vertical"
+, sample_name_grow = sample_name_grow
-, cex.axis = 0.9 * sample_name_shrink
-, col = const_boxplot_fill
-, ylim = ylim
 , main = "Peptide intensities after eliminating unusable peptides"
-, sub = boxplot_sub
+, varwidth = boxplot_varwidth
+, sub = paste(boxplot_sub, "Box widths reflect number of peptides for sample")
 , xlab = "Sample"
 , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)")
+, col = const_boxplot_fill
+, notch = FALSE
+, ylim = ylim
 )
 if (show_stripchart) {
 # Points
 # ref: https://r-charts.com/distribution/add-points-boxplot/
 stripchart(
 blue_dots,                 # Data
 method = "jitter",          # Random noise
 jitter = const_stripchart_jitter,
 pch = 19,                   # Pch symbols
-cex = const_stripsmall_cex, # Size of symbols reduced
+cex = const_stripchart_cex, # Size of symbols reduced
 col = "lightblue",          # Color of the symbol
 vertical = TRUE,            # Vertical mode
 add = TRUE                  # Add it over
 )
 stripchart(
 red_dots,                   # Data
 method = "jitter",          # Random noise
 jitter = const_stripchart_jitter,
 pch = 19,                   # Pch symbols
-cex = const_stripsmall_cex, # Size of symbols reduced
+cex = const_stripchart_cex, # Size of symbols reduced
 col = "red",                # Color of the symbol
 vertical = TRUE,            # Vertical mode
 add = TRUE                  # Add it over
 )
 }
-if (TRUE) {
+}
-# show measured values in blue on left half-violin plot
+```
-cat("\\leavevmode\n\\quad\n\n\\quad\n\n")
-vioplot::vioplot(
+```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'}
-x = lapply(blue_dots, function(x) x[!is.na(x)]),
+if (sum(is.na(quant_data)) > 0) {
-col = "lightblue1",
+# show measured values in blue on left half-violin plot
-side = "left",
+cat("\\leavevmode\n\\quad\n\n\\quad\n\n")
-plotCentre = "line",
+old_par <- par(
-ylim = ylim_save,
+mai = par("mai") + c(g_ppep_distrib_ctl$mai_bottom, 0, 0, 0),
-main = "Distributions of observed and imputed data",
+cex.axis = g_ppep_distrib_ctl$axis,
-sub = "Light blue = observed data; Pink = imputed data",
+cex.lab = 1.2
-las = 2,
+)
-cex.axis = 0.9 * sample_name_shrink,
+tryCatch(
-xlab = "Sample",
+{
-ylab = latex2exp::TeX("$log_{10}$(peptide intensity)")
+vioplot::vioplot(
-)
+x = lapply(blue_dots, function(x) x[!is.na(x)]),
-red_violins <- lapply(red_dots, function(x) x[!is.na(x)])
+col = "lightblue1",
-cols_to_delete <- c()
+side = "left",
-for (ix in seq_len(length(red_violins))) {
+plotCentre = "line",
-if (length(red_violins[[ix]]) < 1) {
+ylim = ylim_save,
-cols_to_delete <- c(cols_to_delete, ix)
+main = "Distributions of observed and imputed data",
+sub = NULL,
+las = 2,
+xlab = NULL,
+ylab = latex2exp::TeX("$log_{10}$(peptide intensity)")
+)
+title(
+sub = "Light blue = observed data; Pink = imputed data",
+cex.sub = 1.0,
+line = g_ppep_distrib_ctl$xlab_line + 1
+)
+title(
+xlab = "Sample",
+line = g_ppep_distrib_ctl$xlab_line
+)
+red_violins <- lapply(red_dots, function(x) x[!is.na(x)])
+cols_to_delete <- c()
+for (ix in seq_len(length(red_violins))) {
+if (length(red_violins[[ix]]) < 1) {
+cols_to_delete <- c(cols_to_delete, ix)
+}
 }
-}
+# destroy any unimputable columns
-# destroy any unimputable columns
+if (!is.null(cols_to_delete)) {
-if (!is.null(cols_to_delete)) {
+red_violins <- red_violins[-cols_to_delete]
-red_violins <- red_violins[-cols_to_delete]
+}
-}
+# plot imputed values in red on right half-violin plot
-# plot imputed values in red on right half-violin plot
+vioplot::vioplot(
-vioplot::vioplot(
+x = red_violins,
-x = red_violins,
+col = "lightpink1",
-col = "lightpink1",
+side = "right",
-side = "right",
+plotCentre = "line",
-plotCentre = "line",
+add = TRUE
-add = TRUE
+)
-)
-}
+},
+finally = par(old_par)
-par(old_par)
+)
 # density plot
 cat("\\leavevmode\n\n\n\n\n\n\n")
 if (can_plot_before_after_imp) {
 ylim <- c(
 }
 cat("\\leavevmode\\newpage\n")
 }
 ```
-# Perform Quantile Normalization
+# Quantile Normalization
 The excellent `normalize.quantiles` function from
 *[the `preprocessCore` Bioconductor package](http://bioconductor.org/packages/release/bioc/html/preprocessCore.html)*
 performs "quantile normalization" as described by Bolstad *et al.* (2003),
 DOI *[10.1093/bioinformatics/19.2.185](https://doi.org/10.1093%2Fbioinformatics%2F19.2.185)*
-and *its supplementary material [http://bmbolstad.com/misc/normalize/normalize.html](http://bmbolstad.com/misc/normalize/normalize.html)*,
+and its supplementary material [http://bmbolstad.com/misc/normalize/normalize.html](http://bmbolstad.com/misc/normalize/normalize.html),
 i.e., it assumes that the goal is to detect
 subtle differences among grossly similar samples (having similar distributions)
-by equailzing intra-quantile quantitations.
+by equalizing intra-quantile quantitations^[Unfortunately,
-Unfortunately, one software library upon which it depends
+one software library upon which `preprocessCore` depends
 *[suffers from a concurrency defect](https://support.bioconductor.org/p/122925/#9135989)*
-that requires that a specific, non-concurrent version of the library be
+that requires that a specific, non-concurrent version of the library (`openblas` version $0.3.3$) be
 installed.  The installation command equivalent to what was used to install the library to produce the results presented here is:
-```
+\linebreak
-conda install bioconductor-preprocesscore openblas=0.3.3
+`    conda install bioconductor-preprocesscore openblas=0.3.3`].
-```
 <!--
 # Apply quantile normalization using preprocessCore::normalize.quantiles
 # ---
 # tool repository: http://bioconductor.org/packages/release/bioc/html/preprocessCore.html
 #   except this: https://support.bioconductor.org/p/122925/#9135989
 #   says to install it like this:
 #     ```
 #     BiocManager::install("preprocessCore", configure.args="--disable-threading", force = TRUE, lib=.libPaths()[1])
 #     ```
 # conda installation (necessary because of a bug in recent openblas):
 #   conda install bioconductor-preprocesscore openblas=0.3.3
 # ...
 # ---
 # normalize.quantiles {preprocessCore} --  Quantile Normalization
 #
 # Description:
 #   Using a normalization based upon quantiles, this function normalizes a
 #     matrix of probe level intensities.
 #
 #   THIS FUNCTIONS WILL HANDLE MISSING DATA (ie NA values), based on the
 #     assumption that the data is missing at random.
 #
 # Usage:
 #   normalize.quantiles(x, copy = TRUE, keep.names = FALSE)
 #
 # Arguments:
 #
 #   - x: A matrix of intensities where each column corresponds to a chip and each row is a probe.
 #
 #   - copy: Make a copy of matrix before normalizing. Usually safer to work with a copy,
 #       but in certain situations not making a copy of the matrix, but instead normalizing
 #       it in place will be more memory friendly.
 #
 #   - keep.names: Boolean option to preserve matrix row and column names in output.
 #
 # Details:
 #   This method is based upon the concept of a quantile-quantile plot extended to n dimensions.
 #     No special allowances are made for outliers. If you make use of quantile normalization
 #     please cite Bolstad et al, Bioinformatics (2003).
 #
 #   This functions will handle missing data (ie NA values), based on
 #     the assumption that the data is missing at random.
 #
 #   Note that the current implementation optimizes for better memory usage
 #     at the cost of some additional run-time.
 #
 # Value: A normalized matrix.
 #
 # Author: Ben Bolstad, bmbolstad.com
 #
 # References
 #
 #   - Bolstad, B (2001) Probe Level Quantile Normalization of High Density Oligonucleotide
 #       Array Data. Unpublished manuscript http://bmbolstad.com/stuff/qnorm.pdf
 #
 #   - Bolstad, B. M., Irizarry R. A., Astrand, M, and Speed, T. P. (2003) A Comparison of
 #       Normalization Methods for High Density Oligonucleotide Array Data Based on Bias
 #       and Variance. Bioinformatics 19(2), pp 185-193. DOI 10.1093/bioinformatics/19.2.185
 #       http://bmbolstad.com/misc/normalize/normalize.html
 # ...
 -->
 ```{r echo = FALSE, results = 'asis'}
+if (print_nb_messages) nbe(see_variable(nrow(quant_data_imp)), "\n")
 if (nrow(quant_data_imp) > 0) {
 quant_data_imp_qn <- preprocessCore::normalize.quantiles(
 as.matrix(quant_data_imp), keep.names = TRUE
 )
 } else {
 quant_data_imp_qn <- as.matrix(quant_data_imp)
 }
+if (print_nb_messages) nbe(see_variable(nrow(quant_data_imp_qn)), "\n")
 quant_data_imp_qn <- as.data.frame(quant_data_imp_qn)
 write_debug_file(quant_data_imp_qn)
 quant_data_imp_qn_log <- log10(quant_data_imp_qn)
 write_debug_file(quant_data_imp_qn_log)
+if (print_nb_messages) nbe(see_variable(nrow(quant_data_imp_qn_log)), "\n")
+if (print_nb_messages) nbe(see_variable(ncol(quant_data_imp_qn_log)), "\n")
 quant_data_imp_qn_ls <- t(scale(t(log10(quant_data_imp_qn))))
-sel <- apply(quant_data_imp_qn_ls, 1, any_nan)
+sel <- row_apply(quant_data_imp_qn_ls, any_nan)
 quant_data_imp_qn_ls2 <- quant_data_imp_qn_ls
 quant_data_imp_qn_ls2 <- quant_data_imp_qn_ls2[which(sel), ]
 quant_data_imp_qn_ls2 <- as.data.frame(quant_data_imp_qn_ls2)
 # Create data.frame used by ANOVA analysis
 data_table_imp_qn_lt <- cbind(full_data[1:9], quant_data_imp_qn_log)
 ```
-<!-- ACE insertion begin -->
 ## Are normalized, imputed, log-transformed sample distributions similar?
-```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'}
+```{r echo = FALSE, fig.dim = c(9, 6.5), results = 'asis'}
 # Save unimputed quant_data_log for plotting below
 unimputed_quant_data_log <- quant_data_log
 # log10 transform (after preparing for zero values,
 )
 cat("\n\n\n")
 # data visualization
+if (TRUE) {
+my_ppep_distrib_bxp(
+x = quant_data_log
+, sample_name_grow = sample_name_grow
+, main = "Peptide intensities for each sample"
+, varwidth = boxplot_varwidth
+, sub = NULL
+, xlab = "Sample"
+, ylab = latex2exp::TeX("$log_{10}$(peptide intensity)")
+, col = const_boxplot_fill
+, notch = boxplot_notch
+)
+} else {
 old_par <- par(
 mai = par("mai") + c(0.5, 0, 0, 0)
 , oma = par("oma") + c(0.5, 0, 0, 0)
 )
 # ref: https://r-charts.com/distribution/add-points-boxplot/
 # Vertical plot
 colnames(quant_data_log) <- sample_name_matches
 boxplot(
 quant_data_log
 , las = 2
-, cex.axis = 0.9 * sample_name_shrink
+, cex.axis = 0.9 * sample_name_grow
 , col = const_boxplot_fill
 , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)")
 , xlab = "Sample"
+, notch = boxplot_notch
+, varwidth = boxplot_varwidth
 )
 par(old_par)
+}
 } else {
 cat("There are no peptides to plot\n")
 }
 cat("\n\n\n")
 cat("\\leavevmode\\newpage\n")
 ```
 # ANOVA Analysis
-```{r, echo = FALSE}
+## Assignment of $p$-value and quality score
+For each phosphopeptide, ANOVA analysis was performed to compute a $p$-value representing the evidence against the "null hypothesis" ($H_0$) that the intensity does not vary significantly among sample groups.
+However, as more and more phosphopeptides are tested, there is an increased probability that, by random chance, a given peptide will have a $p$-value that appears to indicate significance.  For this reason, the $p$-values were adjusted by applying the False Discovery Rate (FDR) correction from Benjamini and Hochberg (1995) [doi:10.1111/j.2517-6161.1995.tb02031.x](https://doi.org/10.1111/j.2517-6161.1995.tb02031.x).
+Furthermore, to give more weight to phosphopeptides having fewer missing values, an (arbitrarily defined) quality score was assigned to each, defined as:
+$$
+\textit{quality}_j
+= \frac{1 + o_{j}}{v_{j}(1 + o_{j}) + 0.005}
+$$
+where:
+- $o_j$ is the minimum number of non-missing observations per sample group for substrate $j$ for all sample groups, and
+- $v_j$ is the FDR-adjusted ANOVA $p$-value for substrate $j$.
+```{r, echo = FALSE, results = 'asis'}
+cat("\\newpage\n")
 # Make new data frame containing only Phosphopeptides
 #   to connect preANOVA to ANOVA (connect_df)
 connect_df <- data.frame(
 data_table_imp_qn_lt$Phosphopeptide
 , data_table_imp_qn_lt[, first_data_column]
 )
 colnames(connect_df) <- c("Phosphopeptide", "Intensity")
 ```
-```{r anova, echo = FALSE, fig.dim = c(9, 10), results = 'asis'}
+```{r anova, echo = FALSE, fig.dim = c(10, 12), results = 'asis'}
-count_of_treatment_levels <- length(levels(sample_treatment_levels))
+g_can_run_ksea <- FALSE
+old_oma <- par("oma")
 if (count_of_treatment_levels < 2) {
-nuke_control_sequences <-
-function(s) {
-s <- gsub("[\\]", "xyzzy_plugh", s)
-s <- gsub("[$]", "\\\\$", s)
-s <- gsub("xyzzy_plugh", "$\\\\backslash$", s)
-return(s)
-}
 cat(
 "ERROR!!!! Cannot perform ANOVA analysis",
 "(see next page)\\newpage\n"
 )
 cat(
 cat("Unparsed sample names are:\n\n\n",
 "\\begin{quote}\n",
 paste(names(quant_data_imp_qn_log), collapse = "\n\n\n"),
 "\n\\end{quote}\n\n")
-regex_sample_names <- nuke_control_sequences(regex_sample_names)
+regex_sample_names <- latex_printable_control_seqs(regex_sample_names)
 cat("\\leavevmode\n\n\n")
 cat("Parsing rule for SampleNames is",
 "\n\n\n",
 "\\text{'",
 cat("\nParsed sample names are:\n",
 "\\begin{quote}\n",
 paste(sample_name_matches, collapse = "\n\n\n"),
 "\n\\end{quote}\n\n")
-regex_sample_grouping <- nuke_control_sequences(regex_sample_grouping)
+regex_sample_grouping <- latex_printable_control_seqs(regex_sample_grouping)
 cat("\\leavevmode\n\n\n")
 cat("Parsing rule for SampleGrouping is",
 "\n\n\n",
 "\\text{'",
 paste(regmatches(sample_name_matches, rx_match), collapse = "\n\n\n"),
 "\n\\end{quote}\n\n")
 } else {
-p_value_data_anova_ps <-
+if (print_nb_messages) nbe("computing p_value_data_anova_ps\n")
-apply(
+if (print_nb_messages) nbe(see_variable(nrow(quant_data_imp_qn_log)), "\n")
-quant_data_imp_qn_log,
+if (print_nb_messages) nbe(see_variable(ncol(quant_data_imp_qn_log)), "\n")
-1,
+if (print_nb_messages) nbe(see_variable(quant_data_imp_qn_log[, ".NE.7C"]), "\n")
-anova_func,
+if (print_nb_messages) nbe(see_variable(quant_data_imp_qn_log), "\n")
-grouping_factor = sample_treatment_levels,
+if (print_nb_messages) nbe(see_variable(anova_func), "\n")
-one_way_f = one_way_all_categories
+if (print_nb_messages) nbe(see_variable(smpl_trt), "\n")
-)
+if (print_nb_messages) nbe(see_variable(one_way_all_categories), "\n")
+tryCatch(
+{
+p_value_data_anova_ps <-
+row_apply(
+quant_data_imp_qn_log,
+anova_func,
+grouping_factor = smpl_trt,
+one_way_f = one_way_all_categories
+)
+},
+error = function(e) {
+mesg <- paste("Could not compute ANOVA because", e$message)
+cat("\n\n", mesg, "\n\n")
+param_df_noexit(e)
+sink(stderr())
+cat("\n\n", mesg, "\n\n")
+values <- paste(param_df$parameter, "=", param_df$value, collapse = "\n")
+cat(values)
+sink()
+knitr::knit_exit()
+exit(code = 1)
+}
+)
+if (print_nb_messages) nbe(see_variable(p_value_data_anova_ps), "\n")
 p_value_data_anova_ps_fdr <-
 p.adjust(p_value_data_anova_ps, method = "fdr")
+my_ppep_v <- full_data[, 1]
+p_value_data <- list(
+phosphopeptide = my_ppep_v,
+raw_anova_p = p_value_data_anova_ps,
+fdr_adjusted_anova_p = p_value_data_anova_ps_fdr,
+missing_values = rowSums(is.na(quant_data)),
+min_group_obs_count = min_group_obs_count
+)
 p_value_data <- data.frame(
-phosphopeptide = full_data[, 1],
+phosphopeptide = my_ppep_v,
 raw_anova_p = p_value_data_anova_ps,
-fdr_adjusted_anova_p = p_value_data_anova_ps_fdr
+fdr_adjusted_anova_p = p_value_data_anova_ps_fdr,
-)
+missing_values = rowSums(is.na(quant_data)),
+min_group_obs_count = min_group_obs_count
+)
+p_value_data$quality <- 1.0 / with(
+p_value_data,
+fdr_adjusted_anova_p + 0.005 / (1 + min_group_obs_count)
+)
+p_value_data$ranking <-
+with(
+p_value_data,
+switch(
+g_intensity_hm_criteria,
+"quality"  = order(-quality),
+"na_count" = order(missing_values, fdr_adjusted_anova_p),
+# the default is "p_value"
+order(fdr_adjusted_anova_p)
+)
+)
+p_value_data <- p_value_data[p_value_data$ranking, , drop = FALSE]
+write.table(
+p_value_data,
+file = "p_value_data.txt",
+sep = "\t",
+col.names = TRUE,
+row.names = FALSE,
+quote = FALSE
+)
 # output ANOVA file to constructed filename,
 #   e.g.    "Outputfile_pST_ANOVA_STEP5.txt"
-#   becomes "Outpufile_pST_ANOVA_STEP5_FDR0.05.txt"
+#   becomes "Outputfile_pST_ANOVA_STEP5_FDR0.05.txt"
 # Re-output datasets to include p-values
-metadata_plus_p <- cbind(full_data[1:9], p_value_data[, 2:3])
+metadata_plus_p <- cbind(full_data[1:9], p_value_data[, 2:ncol(p_value_data)])
 write.table(
 cbind(metadata_plus_p, quant_data_imp),
 file = imputed_data_filename,
 sep = "\t",
 col.names = TRUE,
 row.names = FALSE,
 quote = FALSE
 )
-p_value_data <-
-p_value_data[order(p_value_data$fdr_adjusted_anova_p), ]
 first_page_suppress <- 1
 number_of_peptides_found <- 0
 cutoff <- val_fdr[1]
 for (cutoff in val_fdr) {
-if (number_of_peptides_found > 49) {
+#loop through FDR cutoffs
+if (number_of_peptides_found > g_intensity_hm_rows - 1) {
 cat("\\leavevmode\n\n\n")
 break
 }
-#loop through FDR cutoffs
+bool_1 <- (p_value_data$fdr_adjusted_anova_p < cutoff)
+bool_2 <- (p_value_data$min_group_obs_count >= g_intensity_min_per_class)
+g_can_run_ksea <- g_can_run_ksea || (sum(bool_2) > 0)
+bool_4 <- (p_value_data$quality >= params$minQuality)
+bool_3 <- as.logical(
+as.integer(bool_1) *
+as.integer(bool_2) *
+as.integer(bool_4)
+)
+if (print_trace_messages) {
+if (length(bool_1) > 30) {
+cat_variable(bool_1, force_str = TRUE)
+cat_variable(bool_2, force_str = TRUE)
+cat_variable(bool_3, force_str = TRUE)
+} else {
+cat_variable(bool_1, suffix = "\n\n")
+cat_variable(bool_2, suffix = "\n\n")
+cat_variable(bool_3, suffix = "\n\n")
+}
+cat_variable(length(bool_3), digits = 0, suffix = "; ")
+cat_variable(sum(bool_3), digits = 0, suffix = "\n\n")
+}
 filtered_p <-
-p_value_data[
+p_value_data[bool_3, , drop = FALSE]
-which(p_value_data$fdr_adjusted_anova_p < cutoff),
+filtered_p <-
-, drop = FALSE
+filtered_p[!is.na(filtered_p$phosphopeptide), , drop = FALSE]
-]
+if (print_trace_messages)
+cat_variable(filtered_p, force_str = TRUE)
+have_remaining_peptides <- sum(bool_3, na.rm = TRUE) > 0
+filter_result_string <-
+sprintf(
+"%s, %s of %0.0f peptides remained having both %s and %s.\n\n",
+"After filtering for ANOVA results",
+if (have_remaining_peptides)
+as.character(sum(bool_3, na.rm = TRUE))
+else
+"none",
+length(bool_3),
+sprintf("adjusted p-value < %s", as.character(signif(cutoff, 2))),
+sprintf(
+"more than %0.0f observations in some groups",
+max(0, g_intensity_min_per_class - 1)
+)
+)
 filtered_data_filtered <-
 quant_data_imp_qn_log[
 rownames(filtered_p),
 , drop = FALSE
 ]
+# order by p-value
 filtered_data_filtered <-
 filtered_data_filtered[
 order(filtered_p$fdr_adjusted_anova_p),
 , drop = FALSE
 ]
-# <!-- ACE insertion start -->
+if (have_remaining_peptides && nrow(filtered_p) > 0 && nrow(filtered_data_filtered) > 0) {
-if (nrow(filtered_p) && nrow(filtered_data_filtered) > 0) {
 if (first_page_suppress == 1) {
 first_page_suppress <- 0
 } else {
 cat("\\newpage\n")
 }
-if (nrow(filtered_data_filtered) > 1) {
+latex_samepage({
-subsection_header(sprintf(
+cat(filter_result_string)
-"Intensity distributions for %d phosphopeptides whose adjusted p-value < %0.2f\n",
+if (nrow(filtered_data_filtered) > 1) {
-nrow(filtered_data_filtered),
+cat(subsection_header(sprintf(
-cutoff
+"Intensity distributions for %d phosphopeptides\n",
-))
+nrow(filtered_data_filtered)
-} else {
+)))
-subsection_header(sprintf(
+} else {
-"Intensity distribution for one phosphopeptide (%s) whose adjusted p-value < %0.2f\n",
+cat(subsection_header(sprintf(
-rownames(filtered_data_filtered)[1],
+"Intensity distribution for one phosphopeptide (%s)\n",
-cutoff
+rownames(filtered_data_filtered)[1]
-))
+)))
 }
-cat("\n\n\n")
+}) # end latex_samepage
-cat("\n\n\n")
-old_oma <- par("oma")
 old_par <- par(
 mai = (par("mai") + c(0.7, 0, 0, 0)) * c(1, 1, 0.3, 1),
 oma = old_oma * c(1, 1, 0.3, 1),
 cex.main = 0.9,
 cex.axis = 0.7,
 fin = c(9, 7.25)
 )
-# ref: https://r-charts.com/distribution/add-points-boxplot/
 # Vertical plot
 colnames(filtered_data_filtered) <- sample_name_matches
 tryCatch(
 boxplot(
 filtered_data_filtered,
 main = "Imputed, normalized intensities", # no line plot
 las = 2,
-cex.axis = 0.9 * sample_name_shrink,
+cex.axis = 0.9 * sample_name_grow,
 col = const_boxplot_fill,
-ylab = latex2exp::TeX("$log_{10}$(peptide intensity)")
+ylab = latex2exp::TeX("$log_{10}$(peptide intensity)"),
+notch = FALSE,
+varwidth = boxplot_varwidth
 ),
-error = function(e) print(e)
+error = function(e) {
+print(e)
+cat_margins()
+}
 )
 par(old_par)
 } else {
 cat(sprintf(
 "%s < %0.2f\n\n\n\n\n",
 "No peptides were found to have cutoff adjusted p-value",
 cutoff
 ))
 }
-if (nrow(filtered_data_filtered) > 0) {
+if (have_remaining_peptides && nrow(filtered_data_filtered) > 0) {
 # Add Phosphopeptide column to anova_filtered table
 # The assumption here is that the first intensity is unique;
 #   this is a hokey assumption but almost definitely will
 #   be true in the real world unless there is a computation
 #   error upstream.
 by.y = "Phosphopeptide"
 )
 # Produce heatmap to visualize significance and the effect of imputation
-anova_filtered_merge_format <- sapply(
-X = filtered_p$fdr_adjusted_anova_p
-,
-FUN = function(x) {
-if (x > 0.01)
-paste0("%s (%0.", 1 + ceiling(-log10(x)), "f)")
-else
-paste0("%s (%0.2e)")
-}
-)
 cat_hm_heading <- function(m, cutoff) {
-if (nrow(m) > intensity_hm_rows) {
+if (nrow(m) > g_intensity_hm_rows) {
-cat("\\newpage\n")
+cat("\\clearpage\n")
-subsection_header(
+cat(subsection_header(
 paste(
 sprintf("Heatmap for the %d most-significant peptides",
-intensity_hm_rows),
+g_intensity_hm_rows),
 sprintf("whose adjusted p-value < %0.2f\n", cutoff)
 )
-)
+))
 } else {
 if (nrow(m) == 0) {
 return(FALSE)
 } else {
-subsection_header(
+cat(subsection_header(
 paste(
-sprintf("Heatmap for %d usable peptides whose", nrow(m)),
+sprintf("Heatmap for %d usable peptide genes whose", nrow(m)),
 sprintf("adjusted p-value < %0.2f\n", cutoff)
 )
-)
+))
 }
 }
 cat("\n\n\n")
 cat("\n\n\n")
 return(TRUE)
 }
 # construct matrix with appropriate rownames
 m <-
 as.matrix(unimputed_quant_data_log[anova_filtered_merge_order, ])
-if (nrow(m) > 0) {
+nrow_m <- nrow(m)
+ncol_m <- ncol(m)
+if (nrow_m > 0) {
 rownames_m <- rownames(m)
-rownames(m) <- sapply(
+q <- data.frame(pepname = rownames_m)
-X = seq_len(nrow(m))
+g <- sqldf("
-,
+SELECT q.pepname, substr(met.Gene_Name, 1, 30) AS gene_name
+FROM q, metadata_plus_p AS met
+WHERE q.pepname = met.Phosphopeptide
+ORDER BY q.rowid
+")
+# Build one gene label per row of `g`: split the "; "-separated gene list,
+# drop duplicate entries, and re-join them into a single string.
+# `collapse` (not `sep`) is what joins the elements of one vector; with `sep`
+# a multi-gene peptide produced several elements, which lengthened `tmp` after
+# unlist() and shifted every later `tmp[i]` lookup onto the wrong row.
+# vapply() guarantees exactly one string per row, including for an empty
+# gene list.
+tmp <- vapply(
+X = seq_len(nrow(g)),
+FUN = function(i) {
+pre <- strsplit(g$gene_name[i], "; ")[[1]]
+rslt <- paste(unique(pre), collapse = "; ")
+return(rslt)
+},
+FUN.VALUE = character(1)
+)
+tmp <- unlist(tmp)
+tmp <-
+make.names(tmp, unique = TRUE)
+tmp <- sub(
+"No_Gene_Name",
+"not_found",
+tmp,
+fixed = TRUE
+)
+ten_trunc_names <- trunc_ppep(rownames_m)
+tmp <- sapply(
+X = seq_len(nrow_m),
 FUN = function(i) {
 sprintf(
-anova_filtered_merge_format[i],
+"(%s) %s",
-rownames_m[i],
+tmp[i],
-signif(filtered_p$fdr_adjusted_anova_p[i], 2)
+ten_trunc_names[i]
 )
 }
 )
+rownames(m) <- tmp
 }
 # draw the heading and heatmap
-if (nrow(m) > 0) {
+if (nrow_m > 0) {
 number_of_peptides_found <-
-draw_ppep_heatmap(
+ppep_heatmap(
 m                       = m,
 cutoff                  = cutoff,
 hm_heading_function     = cat_hm_heading,
 hm_main_title           =
 "log(intensities), row-scaled, unimputed, unnormalized",
-suppress_row_dendrogram = FALSE
+suppress_row_dendrogram = FALSE,
+master_cex              = 0.35,
+sepcolor                = "black",
+colsep                  = sample_colsep
 )
 if (number_of_peptides_found > 1) {
 cat("\\leavevmode\n")
-cat("The adjusted ANOVA \\textit{p}-value is shown in parentheses
-after the phosphopeptide sequence.\n\n")
 }
 }
 }
 }
 }
+cat(filter_result_string)
 cat("\\leavevmode\n")
+if (!g_can_run_ksea) {
+errmsg <- paste("Cannot proceed with KSEA analysis",
+"because too many values are missing.")
+if (FALSE) cat0(
+errmsg,
+"\\stepcounter{offset}\n",
+"\\stepcounter{offset}\n",
+"\\stepcounter{offset}\n",
+" in ",
+table_href(),
+".\n\n"
+)
+if (FALSE) {
+if (print_nb_messages) nbe(see_variable(p_value_data))
+} else {
+if (print_nb_messages) nbe(see_variable(p_value_data))
+display_p_value_data <- p_value_data
+display_p_value_data$raw_anova_p <-
+sprintf("%0.3g", display_p_value_data$raw_anova_p)
+display_p_value_data$fdr_adjusted_anova_p <-
+sprintf("%0.3g", display_p_value_data$fdr_adjusted_anova_p)
+display_p_value_data$quality <-
+sprintf("%0.3g", display_p_value_data$quality)
+headers_1st_line <-
+c("",               "Raw ANOVA", "FDR-adj.", "Missing", "Min. #", "", "")
+headers_2nd_line <-
+c("Phosphopeptide", "p-value",   "p-value",  "values",  "group-obs", "Quality", "Ranking")
+data_frame_tabbing_latex(
+x = display_p_value_data,
+tabstops = c(2.75, 0.80, 0.80, 0.5, 0.6, 0.60),
+use_subsubsection_header = FALSE,
+headings = c(headers_1st_line, headers_2nd_line),
+caption = "ANOVA results"
+)
+}
+data_frame_tabbing_latex(
+x = save_sample_treatment_df,
+tabstops = c(1.25, 1.25),
+caption = "Sample classes",
+use_subsubsection_header = FALSE
+)
+param_df_exit()
+knitr::knit_exit()
+return(invisible(-1))
+}
 ```
 ```{r sqlite, echo = FALSE, fig.dim = c(9, 10), results = 'asis'}
-if (count_of_treatment_levels > 1) {
+if (g_can_run_ksea && count_of_treatment_levels > 1) {
 # Prepare two-way contrasts with adjusted p-values
 # Strategy:
 # - use imputed, log-transformed data:
 #   - remember this when computing log2(fold-change)
 # - each contrast is between a combination of trt levels
 # - adjust p-value, assuming that
 #   (# of pppeps)*(# of contrasts) tests were performed
 # Each contrast is between a combination of trt levels
 m2 <- combn(
-x = seq_len(length(levels(sample_treatment_levels))),
+x = seq_len(length(levels(smpl_trt))),
 m = 2,
 simplify = TRUE
 )
 contrast_count <- ncol(m2)
 f_m2 <-
 function(cntrst, lvl1, lvl2) {
 return(
 data.frame(
 contrast = cntrst,
-level = sample_treatment_levels[
+level = smpl_trt[
-sample_treatment_levels %in%
+smpl_trt %in%
-levels(sample_treatment_levels)[c(lvl1, lvl2)]
+levels(smpl_trt)[c(lvl1, lvl2)]
 ],
 label = sample_name_matches[
-sample_treatment_levels %in%
+smpl_trt %in%
-levels(sample_treatment_levels)[c(lvl1, lvl2)]
+levels(smpl_trt)[c(lvl1, lvl2)]
 ]
 )
 )
 }
 # - compute a df for each contrast
 ;
 "
 )
 # - create contrast-metadata table
+if (print_nb_messages) nbe("CREATE TABLE contrast_lvl_lvl_metadata")
 dml_no_rows_exec(db, "
 CREATE TABLE contrast_lvl_lvl_metadata
 AS
 SELECT DISTINCT
 a.contrast              AS ab_contrast,
 rownames(grouping_factor) <- grouping_factor$sample
 grouping_factor <- grouping_factor[, "level", drop = FALSE]
 # - run the two-level (one-way) test
 p_value_data_contrast_ps <-
-apply(
+row_apply(
-X = contrast_cast_data,
+x = contrast_cast_data,
-MARGIN = 1, # apply to rows
+fun = anova_func,
-FUN = anova_func,
 grouping_factor =
 as.factor(grouping_factor$level), # anova_func arg2
 one_way_f = one_way_two_categories, # anova_func arg3
 simplify = TRUE # TRUE is the default for simplify
 )
 AND NOT m.`Gene` = 'No_Gene_Name'
 AND NOT v.log2_fc = 0
 ;
 "
 )
+# We are done with DDL and insertion
+RSQLite::dbDisconnect(db)
 }
 ```
 ```{r echo = FALSE, results = 'asis'}
 cat("\\newpage\n")
 ```
-# KSEA Analysis
+# KSEA Analysis Summaries
 Results of Kinase-Substrate Enrichment Analysis are presented here, if the substrates for any kinases are relatively enriched.   Enrichments are found by the CRAN `KSEAapp` package:
 - The package is available on CRAN, at https://cran.r-project.org/package=KSEAapp
 - The method used is described in Casado et al. (2013) [doi:10.1126/scisignal.2003573](https://doi.org/10.1126/scisignal.2003573) and Wiredja et al (2017) [doi:10.1093/bioinformatics/btx415](https://doi.org/10.1093/bioinformatics/btx415).
 - An online alternative (supporting only analysis of human data) is available at [https://casecpb.shinyapps.io/ksea/](https://casecpb.shinyapps.io/ksea/).
 For each kinase, $i$, and each two-way contrast of treatments, $j$, an enrichment $z$-score is computed as:
 $$
-\text{kinase enrichment score}_{j,i} = \frac{(\overline{s}_{j,i} - \overline{p}_j)\sqrt{m_{j,i}}}{\delta_j}
+\text{kinase enrichment }z\text{-score}_{j,i} = \frac{(\overline{`r sfc`}_{j,i} - \overline{`r pfc`}_j)\sqrt{m_{j,i}}}{\delta_j}
 $$
 and fold-enrichment is computed as:
 $$
-\text{Enrichment}_{j,i} = \frac{\overline{s}_{j,i}}{\overline{p}_j}
+\text{Enrichment}_{j,i} = \frac{\overline{`r sfc`}_{j,i}}{\overline{`r pfc`}_j}
 $$
 where:
-- $\overline{s}_{j,i}$ is the mean $\log_2 (|\text{fold-change|})$ in intensities (for contrast $j$) of known substrates of the kinase $i$,
+- $\overline{`r sfc`}_{j,i}$ is the mean `r pfc_txt` in intensities of known substrates of the kinase $i$ in contrast $j$,
-- $\overline{p}_j$ is the mean $\log_2 (|\text{fold-change}|)$ of all phosphosites identified in contrast $j$, and
+- $\overline{`r pfc`}_j$ is the mean `r pfc_txt` of all phosphosites identified in contrast $j$, and
 - $m_{j,i}$ is the total number of phosphosite substrates of kinase $i$ identified in contrast $j$,
-- $\delta_j$ is the standard deviation of the $\log_2 (|\text{fold-change}|)$ for contrast $j$ across all phosphosites in the dataset.
+- $\delta_j$ is the standard deviation of the $\log_2 (\text{fold-change})$ for contrast $j$ across all phosphosites in the dataset.
 - Note that the absolute value of fold-change is used so that both increased and decreased substrates of a kinase will contribute to its enrichment score.
-$\text{FDR}_{j,i}$ is computed from the $p$-value for the z-score using the R `stats::p.adjust` function, applying the False Discovery Rate correction from Benjamini and Hochberg (1995) [doi:10.1111/j.2517-6161.1995.tb02031.x](https:/doi.org/10.1111/j.2517-6161.1995.tb02031.x)
+$\text{FDR}_{j,i}$ is the False Discovery Rate-corrected $p$-value for the kinase enrichment $z$-score.
 Color intensity in heatmaps reflects magnitude of $z$-score for enrichment of respective kinase in respective contrast; hue reflects the sign of the $z$-score (blue, negative; red, positive).
 Asterisks in heatmaps reflect enrichments that are significant at `r ksea_cutoff_statistic` < `r ksea_cutoff_threshold`.
 - Kinase names are generally as presented at Phospho.ELM [http://phospho.elm.eu.org/kinases.html](http://phospho.elm.eu.org/kinases.html) (when available), although Phospho.ELM data are not yet incorporated into this analysis.
 - Kinase names having the suffix '(HPRD)' are as presented at [http://hprd.org/serine_motifs](http://hprd.org/serine_motifs) and [http://hprd.org/tyrosine_motifs](http://hprd.org/tyrosine_motifs) and are as originally reported in the Amanchy et al., 2007 (doi: [10.1038/nbt0307-285](https://doi.org/10.1038/nbt0307-285)).
-- Kinase-strate deata were also taken from [http://networkin.science/download.shtml](http://networkin.science/download.shtml) and from PhosphoSitePlus [https://www.phosphosite.org/staticDownloads](https://www.phosphosite.org/staticDownloads).
+- Kinase-substrate data were also taken from [http://networkin.science/download.shtml](http://networkin.science/download.shtml) and from PhosphoSitePlus [https://www.phosphosite.org/staticDownloads](https://www.phosphosite.org/staticDownloads).
-```{r ksea, echo = FALSE, fig.dim = c(9, 10), results = 'asis'}
+For each enriched kinase, a heatmap showing the intensities is presented for up to `r g_intensity_hm_rows` substrates, i.e., those substrates having the highest "quality".
+Where possible, a heatmap of the correlations among the selected substrates is also presented; if correlations cannot be computed (because of too many missing values), then the covariances are heatmapped for substrates having a variance greater than 1.
+```{r ksea, echo = FALSE, fig.dim = c(12, 14.5), results = 'asis'}
+cat("\\clearpage\n")
 db <- RSQLite::dbConnect(RSQLite::SQLite(), ksea_app_prep_db)
 # -- eliminate the table that's about to be defined
 ddl_exec(db, "
 sub_title <- contrast_longlabel
 tryCatch(
 expr = {
 ksea_scores_rslt <-
 ksea_scores(
-ksdata           = pseudo_ksdata, # KSEAapp::KSData,
+ksdata                  = pseudo_ksdata,
-px               = kseaapp_input,
+px                      = kseaapp_input,
-networkin        = TRUE,
+networkin               = TRUE,
-networkin_cutoff = 2
+networkin_cutoff        = 2,
+minimum_substrate_count = ksea_min_substrate_count
 )
+if (FALSE) {
+ksea_scores_rslt <-
+ksea_scores_rslt[
+ksea_scores_rslt$m >= ksea_min_substrate_count,
+,
+drop = FALSE
+]
+}
+if (FALSE) {
+data_frame_tabbing_latex(
+x = ksea_scores_rslt,
+tabstops = c(0.8, 0.8, 0.8, 0.8, 0.8, 0.8),
+caption = paste("KSEA scores for contrast ",
+cntrst_b_level, "to",  cntrst_a_level),
+use_subsubsection_header = FALSE
+)
+}
+if (FALSE) {
+if (print_nb_messages) nbe("Output contents of `ksea_scores_rslt` table\n")
+cat_variable(ksea_scores_rslt)
+cat("\n\\clearpage\n")
+}
 if (0 < sum(!is.nan(ksea_scores_rslt$FDR))) {
 next_index <- 1 + next_index
 rslt$score_list[[next_index]] <- ksea_scores_rslt
 rslt$name_list[[next_index]] <- contrast_label
 rslt$longname_list[[next_index]] <- contrast_longlabel
-low_fdr_print(
+ksea_low_fdr_print(
 rslt = rslt,
 i_cntrst = i_cntrst,
 i = next_index,
 a_level = cntrst_a_level,
 b_level = cntrst_b_level,
 fold_change = cntrst_fold_change,
 caption = contrast_longlabel
 )
 }
 },
-error = function(e) str(e)
+error = function(e) {
+str(e)
+cat_margins()
+}
 )
 }
 plotted_kinases <- NULL
-if (length(rslt$score_list) > 1) {
+if (g_can_run_ksea && length(rslt$score_list) > 1) {
 for (i in seq_len(length(ksea_heatmap_titles))) {
 hdr <- ksea_heatmap_titles[[i]]
 which_kinases <- i
 cat("\\clearpage\n\\begin{center}\n")
 if (i == const_ksea_astrsk_kinases) {
-subsection_header(hdr)
+cat(subsection_header(hdr))
 } else {
-subsection_header(hdr)
+cat(subsection_header(hdr))
 }
 cat("\\end{center}\n")
 plotted_kinases <- ksea_heatmap(
 # the data frame outputs from the KSEA.Scores() function, in list format
 # a numeric value between 0 and infinity indicating the min. number of
 #   substrates a kinase must have to be included in the heatmap
 m_cutoff = 1,
 # a numeric value between 0 and 1 indicating the p-value/FDR cutoff
 #   for indicating significant kinases in the heatmap
-p_cutoff = 0.05,
+p_cutoff = params$kseaCutoffThreshold,
 # a binary input of TRUE or FALSE, indicating whether or not to perform
 #   hierarchical clustering of the sample columns
 sample_cluster = TRUE,
 # a binary input of TRUE or FALSE, indicating whether or not to export
 #   the heatmap as a .png image into the working directory
 ylab = "Kinase",
 # print which kinases:
 # - 1 : all kinases
 # - 2 : significant kinases
 # - 3 : non-significant kinases
-which_kinases = which_kinases
+which_kinases = which_kinases,
+margins = c(7, 15)
 )
 if (!is.null(plotted_kinases)) {
 cat("\\begin{center}\n")
-cat("Color intensity reflects $z$-score magnitudes; hue reflects $z$-score sign.\n")
 if (which_kinases != const_ksea_nonastrsk_kinases)
 cat("Asterisks reflect significance.\n")
 cat("\\end{center}\n")
 }
 } # end for (i in ...
 } # end if (length ...
+```
-for (i_cntrst in seq_len(length(rslt$score_list))) {
-next_index <- i_cntrst
+```{r kseabar_calc, echo = FALSE, fig.dim = c(9.5, 6), results = 'asis'}
-cntrst_a_level <- contrast_metadata_df[i_cntrst, "a_level"]
+ksea_prints <- list()
-cntrst_b_level <- contrast_metadata_df[i_cntrst, "b_level"]
+ksea_barplots <- list()
-cntrst_fold_change <- contrast_metadata_df[i_cntrst, 6]
+for (i_cntrst in seq_len(length(rslt$score_list))) {
-contrast_label <- sprintf("%s -> %s", cntrst_b_level, cntrst_a_level)
+next_index <- i_cntrst
-contrast_longlabel <- (
+cntrst_a_level <- contrast_metadata_df[i_cntrst, "a_level"]
+cntrst_b_level <- contrast_metadata_df[i_cntrst, "b_level"]
+cntrst_fold_change <- contrast_metadata_df[i_cntrst, 6]
+contrast_label <- sprintf("%s -> %s", cntrst_b_level, cntrst_a_level)
+contrast_longlabel <- (
+sprintf(
+"Class %s -> Class %s",
+contrast_metadata_df[i_cntrst, "b_level"],
+contrast_metadata_df[i_cntrst, "a_level"]
+)
+)
+main_title <- (
+sprintf(
+"Change from treatment %s to treatment %s",
+contrast_metadata_df[i_cntrst, "b_level"],
+contrast_metadata_df[i_cntrst, "a_level"]
+)
+)
+sub_title <- contrast_longlabel
+tryCatch(
+expr = {
+ksea_scores_rslt <- rslt$score_list[[next_index]]
+if (print_nb_messages) nbe(see_variable(ksea_scores_rslt)) #ACE
+if (0 < sum(!is.nan(ksea_scores_rslt$FDR))) {
+sink(deferred <- file())
+ksea_low_fdr_print(
+rslt = rslt,
+i_cntrst = i_cntrst,
+i = next_index,
+a_level = cntrst_a_level,
+b_level = cntrst_b_level,
+fold_change = cntrst_fold_change,
+caption = contrast_longlabel,
+write_db = FALSE,
+anchor = const_table_anchor_t
+)
+cat("\n")
+sink()
+lines <-
+paste(
+readLines(deferred, warn = FALSE),
+collapse = "\n"
+)
+close(deferred)
+sq_put(ksea_prints, lines)
+sink(stderr())
+cat("\n---\n")
+cat_variable(ksea_prints)
+barplot_closure <-
+ksea_low_fdr_barplot_factory(
+rslt = rslt,
+i_cntrst = i_cntrst,
+i = next_index,
+a_level = cntrst_a_level,
+b_level = cntrst_b_level,
+fold_change = cntrst_fold_change,
+caption = contrast_longlabel
+)
+if (rlang::is_closure(barplot_closure))
+sq_put(ksea_barplots, barplot_closure)
+else
+sq_put(ksea_barplots, no_op)
+str(ksea_barplots)
+cat("\n...\n")
+sink()
+}
+},
+error = function(e) {
+str(e)
+cat_margins()
+}
+)
+}
+```
+```{r phosphoelm_kinase_upid_desc, echo = FALSE, fig.dim = c(12, 13.7), results = 'asis'}
+have_kinase_descriptions <-
+if (!is.null(bzip2df(kinase_uprt_desc_lut, kinase_uprt_desc_lut_bz2)) &&
+!is.null(bzip2df(kinase_name_uprt_lut, kinase_name_uprt_lut_bz2))
+) {
+rownames(kinase_uprt_desc_lut) <- kinase_uprt_desc_lut$UniProtID
+kinase_name_to_desc_uprt <- function(s) {
+rslt <- NULL
+tryCatch(
+{
+which_rows <- eval(s == kinase_name_uprt_lut$kinase)
+kinase_uprtid <-
+kinase_name_uprt_lut[which_rows, 2]
+# filter for first _HUMAN match if any
+grepl_human <- grepl("_HUMAN$", kinase_uprtid)
+if (0 < sum(grepl_human))
+kinase_uprtid <- kinase_uprtid[grepl_human]
+# filter for first match if any
+if (0 < length(kinase_uprtid)) {
+kinase_uprtid <- kinase_uprtid[1]
+kinase_desc <- kinase_uprt_desc_lut[kinase_uprtid, 2]
+if (!is.na(kinase_desc))
+rslt <- c(kinase_desc, kinase_uprtid)
+else
+rslt <- c(kinase_desc, "")
+}
+},
+warning = str
+)
+rslt
+}
+TRUE
+} else {
+kinase_name_to_desc_uprt <- function(s) NULL
+FALSE
+}
+```
+```{r write_params, echo = FALSE, results = 'asis'}
+# perhaps this should be moved into the functions section, eventually ...
+write_params <- function(db) {
+# write parameters to report
+# write parameters to SQLite output
+mqppep_anova_script_param_df <- data.frame(
+script    = "mqppep_anova_script.Rmd",
+parameter = names(param_unlist),
+value     = param_unlist
+)
+ddl_exec(db, "
+DROP TABLE IF EXISTS script_parameter;
+"
+)
+ddl_exec(db, "
+CREATE TABLE IF NOT EXISTS script_parameter(
+script    TEXT,
+parameter TEXT,
+value     ANY,
+UNIQUE (script, parameter) ON CONFLICT REPLACE
+)
+;
+"
+)
+RSQLite::dbWriteTable(
+conn = db,
+name = "script_parameter",
+value = mqppep_anova_script_param_df,
+append = TRUE
+)
+loaded_packages_df <-  sessioninfo::package_info("loaded")
+loaded_packages_df[, "library"] <- as.character(loaded_packages_df$library)
+loaded_packages_df <- data.frame(
+package = loaded_packages_df$package,
+version = loaded_packages_df$loadedversion,
+date    = loaded_packages_df$date
+)
+#ACE cat("\\clearpage\n\\section{R package versions}\n")
+#ACE data_frame_tabbing_latex(
+#ACE   x = loaded_packages_df,
+#ACE   tabstops = c(2.5, 1.25),
+#ACE   caption = "R package versions"
+#ACE   )
+cat("\\clearpage\n\\section{Input parameter settings}\n")
+data_frame_tabbing_latex(
+x = param_df,
+tabstops = c(1.75),
+underscore_whack = TRUE,
+caption = "Input parameters",
+verbatim = FALSE
+)
+}
+if (!have_kinase_descriptions) {
+write_params(db)
+# We are done with output
+RSQLite::dbDisconnect(db)
+param_df_exit()
+knitr::knit_exit()
+}
+```
+```{r kseabar, echo = FALSE, fig.dim = c(9.5, 12.3), results = 'asis'}
+if (have_kinase_descriptions) {
+my_section_header <-
 sprintf(
-"Class %s -> Class %s",
+"inases whose KSEA %s < %s\n",
-contrast_metadata_df[i_cntrst, "b_level"],
+ksea_cutoff_statistic,
-contrast_metadata_df[i_cntrst, "a_level"]
+signif(ksea_cutoff_threshold, 2)
 )
-)
-main_title <- (
+# Use enriched kinases to find enriched kinase-substrate pairs
+enriched_kinases <- data.frame(kinase = ls(ksea_asterisk_hash))
+enriched_kinase_descs <-
+Reduce(
+f = function(l, r) {
+lkup <- kinase_name_to_desc_uprt(r)
+if (is.null(lkup)) l
+else r2 <- rbind(
+l,
+data.frame(
+kinase = r,
+uniprot_id = lkup[2],
+description = lkup[1]
+)
+)
+},
+x = enriched_kinases$kinase,
+init = NULL
+)
+if (length(enriched_kinase_descs) > 0 && nrow(enriched_kinase_descs) > 0) {
+cat("\n\\clearpage\n")
+data_frame_tabbing_latex(
+x = enriched_kinase_descs,
+tabstops = c(0.9, 1.3),
+headings = c("Kinase", "UniProt ID", "Description"),
+caption = paste0("Descriptions of k", my_section_header)
+)
+}
+if (FALSE) {
+cat_variable(sqldf("SELECT kinase FROM enriched_kinases"))
+cat_variable(sqldf("
+SELECT DISTINCT gene, sub_gene, SUB_MOD_RSD AS ppep
+FROM pseudo_ksdata
+WHERE gene IN (SELECT kinase FROM enriched_kinases)
+"))
+data_frame_table_latex(
+x = sqldf("
+SELECT DISTINCT gene, sub_gene, SUB_MOD_RSD AS ppep
+FROM pseudo_ksdata
+WHERE gene IN (SELECT kinase FROM enriched_kinases)
+"),
+justification = "l l l",
+centered = TRUE,
+caption = "substrates of enriched kinases",
+anchor = c(const_table_anchor_p, const_table_anchor_t),
+underscore_whack = TRUE
+)
+data_frame_table_latex(
+x = sqldf("
+SELECT
+gene AS kinase,
+ppep,
+sub_gene,
+'('||group_concat(gene||'-'||sub_gene)||') '||ppep AS label,
+fdr_adjusted_anova_p,
+quality,
+min_group_obs_count
+FROM (
+SELECT DISTINCT gene, sub_gene, SUB_MOD_RSD AS ppep
+FROM pseudo_ksdata
+WHERE gene IN (SELECT kinase FROM enriched_kinases)
+),
+p_value_data
+WHERE ppep = phosphopeptide
+GROUP BY kinase, ppep
+ORDER BY kinase, ppep, p_value_data.quality DESC
+"),
+justification = "l l l l l l l",
+centered = TRUE,
+caption = "labeled substrates of enriched kinases",
+anchor = c(const_table_anchor_p, const_table_anchor_t),
+underscore_whack = TRUE
+)
+}
+all_enriched_substrates <- sqldf("
+SELECT
+gene AS kinase,
+ppep,
+sub_gene,
+'('||group_concat(gene||'-'||sub_gene)||') '||ppep AS label,
+fdr_adjusted_anova_p,
+quality,
+min_group_obs_count
+FROM (
+SELECT DISTINCT gene, sub_gene, SUB_MOD_RSD AS ppep
+FROM pseudo_ksdata
+WHERE gene IN (SELECT kinase FROM enriched_kinases)
+),
+p_value_data
+WHERE ppep = phosphopeptide
+GROUP BY kinase, ppep
+ORDER BY kinase, ppep, p_value_data.quality DESC
+")
+all_enriched_substrates <-
+all_enriched_substrates[
+all_enriched_substrates$quality >= params$minQuality,
+,
+drop = FALSE
+]
+all_enriched_substrates$sub_gene <-
+sub(
+" ///.*",
+" ...",
+all_enriched_substrates$sub_gene
+)
+all_enriched_substrates$label <-
+with(
+all_enriched_substrates,
 sprintf(
-"Change from treatment %s to treatment %s",
+"(%s-%s) %s",
-contrast_metadata_df[i_cntrst, "b_level"],
+kinase,
-contrast_metadata_df[i_cntrst, "a_level"]
+trunc_subgene(sub_gene),
-)
+ppep
 )
-sub_title <- contrast_longlabel
+)
-tryCatch(
-expr = {
+# this global is set to TRUE by cat_enriched_heading immediately below
-ksea_scores_rslt <- rslt$score_list[[next_index]]
+g_neednewpage <- FALSE
-if (0 < sum(!is.nan(ksea_scores_rslt$FDR))) {
+# helper used to label per-kinase substrate enrichment figure
-low_fdr_barplot(
+cat_enriched_heading <- function(m, cut_args) {
-rslt = rslt,
+cutoff <- cut_args$cutoff
-i_cntrst = i_cntrst,
+kinase <- cut_args$kinase
-i = next_index,
+if (g_neednewpage) cat("\\newpage\n")
-a_level = cntrst_a_level,
+g_neednewpage <- TRUE
-b_level = cntrst_b_level,
+if (nrow(m) > g_intensity_hm_rows) {
-fold_change = cntrst_fold_change,
+cat(subsection_header(
-caption = contrast_longlabel
-)
-}
-},
-error = function(e) str(e)
-)
-}
-```
-```{r enriched, echo = FALSE, fig.dim = c(9, 10), results = 'asis'}
-# Use enriched kinases to find enriched kinase-substrate pairs
-enriched_kinases <- data.frame(kinase = ls(ksea_asterisk_hash))
-all_enriched_substrates <- sqldf("
-SELECT
-gene AS kinase,
-ppep,
-sub_gene,
-'('||group_concat(gene||'-'||sub_gene)||') '||ppep AS label,
-fdr_adjusted_anova_p
-FROM (
-SELECT DISTINCT gene, sub_gene, SUB_MOD_RSD AS ppep
-FROM pseudo_ksdata
-WHERE gene IN (SELECT kinase FROM enriched_kinases)
-),
-p_value_data
-WHERE ppep = phosphopeptide
-GROUP BY ppep
-ORDER BY fdr_adjusted_anova_p
-")
-# helper used to label per-kinase substrate enrichment figure
-cat_enriched_heading <- function(m, cut_args) {
-cutoff <- cut_args$cutoff
-kinase <- cut_args$kinase
-statistic <- cut_args$statistic
-threshold <- cut_args$threshold
-cat("\\newpage\n")
-if (nrow(m) > intensity_hm_rows) {
-subsection_header(
-paste(
 sprintf(
-"Lowest p-valued %d (of %d) enriched %s-substrates,",
+"Highest-quality %d (of %d) enriched %s-substrates",
-intensity_hm_rows,
+g_intensity_hm_rows,
 nrow(m),
 kinase
-),
+)
-sprintf(" KSEA %s < %0.2f\n", statistic, threshold)
+))
-)
-)
-} else {
-if (nrow(m) == 0) {
-return(FALSE)
 } else {
-subsection_header(
+if (nrow(m) == 0) {
-paste(
+return(FALSE)
+} else {
+nrow_m <- nrow(m)
+cat(subsection_header(
 sprintf(
-"%d enriched %s-substrates,",
+"%d enriched %s-substrate%s",
-nrow(m),
+nrow_m,
-kinase
+kinase,
-),
+if (nrow_m > 1) "s" else ""
-sprintf(
-" KSEA %s < %0.2f\n",
-statistic,
-threshold
 )
+))
+}
+}
+cat("\n\n\n")
+cat("\n\n\n")
+return(TRUE)
+}
+# --------------------------------
+# hack begin - show all substrates
+enriched_substrates <- all_enriched_substrates
+# add "FALSE &&" to prevent listing of substrates
+if (show_enriched_substrates && nrow(enriched_substrates) > 0) {
+short_row_names <- sub(
+"$FAILED_MATCH_GENE_NAME",
+"not_found",
+enriched_substrates$sub_gene,
+fixed = TRUE
+)
+if (print_nb_messages) nbe(see_variable(enriched_substrates))
+substrates_df <- with(
+enriched_substrates,
+data.frame(
+kinase = kinase,
+substrate = sub(" ///*", "...", short_row_names),
+anova_p_value = signif(fdr_adjusted_anova_p, 2),
+min_group_obs_count = signif(min_group_obs_count, 0),
+quality = signif(quality, 3),
+sequence = trunc_n(30)(ppep)
 )
 )
-}
-}
+substrates_df <- substrates_df[
-cat("\n\n\n")
+with(substrates_df, order(kinase, -quality)),
-cat("\n\n\n")
+,
-return(TRUE)
+drop = FALSE
-}
+]
-# Disabling heatmaps for substrates pending decision whether to eliminate them altogether
+if (print_nb_messages) nbe(see_variable(substrates_df))
-if (TRUE)
+if (nrow(substrates_df) < 1)
+substrates_df$sequence <- c()
+if (print_nb_messages) nbe(see_variable(substrates_df))
+names(substrates_df) <- headers_2nd_line <-
+c("Kinase", "Substrate", "p-value", "per group)", "quality", "Sequence")
+headers_1st_line <- c("", "", "ANOVA", "min(values", "", "")
+data_frame_tabbing_latex(
+x = substrates_df,
+tabstops = c(1.2, 0.8, 0.5, 0.65, 0.5),
+headings = c(headers_1st_line, headers_2nd_line),
+caption = "Details for all enriched substrates of enriched kinases"
+)
+rm(
+enriched_substrates,
+substrates_df,
+short_row_names,
+headers_1st_line,
+headers_2nd_line
+)
+}
+cat("\\clearpage\n")
+# hack end - show all substrates
+# --------------------------------
+# print deferred tables and graphs for kinases from contrasts
+for (i_cntrst in seq_len(length(ksea_prints))) {
+#latex_samepage({
+cat(ksea_prints[[i_cntrst]])
+cat("\n")
+ksea_barplots[[i_cntrst]]()
+cat("\n")
+cat("\\clearpage\n")
+#})
+}
+}
+```
+```{r enriched, echo = FALSE, fig.dim = c(12, 13.7), results = 'asis'}
+if (g_can_run_ksea) {
+g_did_enriched_header <- FALSE
 for (kinase_name in sort(enriched_kinases$kinase)) {
 enriched_substrates <-
 all_enriched_substrates[
 all_enriched_substrates$kinase == kinase_name,
 ,
 drop = FALSE
 ]
+ten_trunc_ppep <- trunc_enriched_substrate(enriched_substrates$ppep)
 enriched_substrates$label <- with(
 enriched_substrates,
 sprintf(
-"(%s-%s) %s (%0.2g)",
+"(%s) %s",
-kinase,
+make.names(
-sub("$FAILED_MATCH_GENE_NAME", "unidentified", sub_gene, fixed = TRUE),
+sub("$FAILED_MATCH_GENE_NAME", "not_found", sub_gene, fixed = TRUE),
-ppep,
+unique = TRUE
-fdr_adjusted_anova_p
+),
+ten_trunc_ppep
 )
 )
 # Get the intensity values for the heatmap
 enriched_intensities <-
 as.matrix(unimputed_quant_data_log[enriched_substrates$ppep, , drop = FALSE])
 # Remove rows having too many NA values to be relevant
+good_rows <- (rowSums(enriched_intensities, na.rm = TRUE) != 0)
+#ACE nbe(see_variable(good_rows), "\n")
+enriched_substrates <- enriched_substrates[good_rows, , drop = FALSE]
+enriched_intensities <- enriched_intensities[good_rows, , drop = FALSE]
 # Rename the rows with the display-name for the heatmap
-rownames(enriched_intensities) <-
+short_row_names <- sub(
+"$FAILED_MATCH_GENE_NAME",
+"not_found",
+enriched_substrates$sub_gene,
+fixed = TRUE
+)
+short_row_names <-
+make.names(short_row_names, unique = TRUE)
+long_row_names <-
 sapply(
 X = rownames(enriched_intensities),
 FUN = function(rn) {
 enriched_substrates[enriched_substrates$ppep == rn, "label"]
 }
 )
+rownames(enriched_intensities) <- long_row_names
 # Format as matrix for heatmap
 m <- as.matrix(enriched_intensities)
+rownames(m) <- trunc_enriched_substrate(rownames(m))
+#ACE nb("m with bad rows: ", see_variable(m), "\n")
+#ACE good_rows <- (rowSums(m, na.rm = TRUE) != 0)
+#ACE nb(see_variable(good_rows), "\n")
+#ACE m <- m[good_rows, , drop = FALSE]
+#ACE nb("m without(?) bad rows: ", see_variable(m), "\n")
+#ACE nb(see_variable(short_row_names), "\n")
+#ACE local_short_row_names <- short_row_names[good_rows]
+#ACE local_long_row_names  <- long_row_names[good_rows]
+#ACE local_enriched_intensities <- enriched_intensities[local_long_row_names, ]
 # Draw the heading and heatmap
-if (nrow(m) > 0) {
+nrow_m <- nrow(m)
+if (nrow_m > 0) {
+if (!g_did_enriched_header) {
+cat("\n\\clearpage\n")
+cat(section_header(paste0("K", my_section_header)))
+g_did_enriched_header <- TRUE
+}
+is_na_m <- is.na(m)
+cellnote_m <- is_na_m
+cellnote_m[!is_na_m] <- ""
+cellnote_m[is_na_m] <- "NA"
 cut_args <- new_env()
 cut_args$cutoff <- cutoff
 cut_args$kinase <- kinase_name
 cut_args$statistic <- ksea_cutoff_statistic
 cut_args$threshold <- ksea_cutoff_threshold
 number_of_peptides_found <-
-draw_ppep_heatmap(
+ppep_heatmap(
 m                       = m,
+cellnote                = cellnote_m,
 cutoff                  = cut_args,
 hm_heading_function     = cat_enriched_heading,
 hm_main_title
 = "Unnormalized (zero-imputed) intensities of enriched kinase-substrates",
-suppress_row_dendrogram = FALSE
+suppress_row_dendrogram = FALSE,
+master_cex              = 0.35,
+sepcolor                = "black",
+colsep                  = sample_colsep
 )
 if (number_of_peptides_found > 1) {
-cat("\\leavevmode\n")
-cat("The kinase-subsrate pair is shown in parentheses
+tryCatch(
-before the phosphopeptide sequence.\n\n")
+{
-cat("The adjusted ANOVA \\textit{p}-value is shown in parentheses
+rownames(m) <- short_row_names
-after the phosphopeptide sequence.\n\n")
+cov_heatmap(m, nrow_m > g_intensity_hm_rows)
+},
+error = function(e) {
+cat(
+sprintf(
+"ERROR: %s\n\\newline\n",
+mget("e")
+)
+)
+cat(
+sprintf(
+"message: %s\n\\newline\n",
+e$message
+)
+)
+cat_margins()
+}
+)
 }
-if (nrow(m) == 1) {
+substrates_df <- with(
-cat(
+enriched_substrates,
-sprintf(
+data.frame(
-"\n\nSubstrate is %s,
+substrate = sub(" ///*", "...", short_row_names),
-\nphopshopeptide is %s,
+sequence = trunc_long_ppep(ppep),
-\n\nand adjusted ANOVA \\textit{p}-value is %0.2g.\n",
+anova_p_value = signif(fdr_adjusted_anova_p, 2),
-enriched_substrates[1, "sub_gene"],
+min_group_obs_count = signif(min_group_obs_count, 0),
-enriched_substrates[1, "ppep"],
+quality = signif(quality, 3)
-enriched_substrates[1, "fdr_adjusted_anova_p"]
+)
+)
+excess_substrates <- nrow(substrates_df) > g_intensity_hm_rows
+if (excess_substrates)
+substrates_df <- substrates_df[1:g_intensity_hm_rows, ]
+names(substrates_df) <- headers_2nd_line <-
+c("Substrate", "Sequence", "p-value", "per group)", "quality")
+headers_1st_line <- c("", "", "ANOVA", "min(values", "")
+if (1 < nrow(enriched_substrates))
+cat("\n\\newpage\n")
+cat(subsubsection_header(
+sprintf(
+"Details for %s%s-substrates",
+if (excess_substrates)
+sprintf(
+"%s \"highest quality\" ",
+g_intensity_hm_rows
 )
+else "",
+kinase_name
 )
-}
+))
+substrates_df <- substrates_df[order(-substrates_df$quality), ]
+data_frame_tabbing_latex(
+x = substrates_df,
+tabstops = c(0.8, 3.8, 0.6, 0.8),
+headings = c(headers_1st_line, headers_2nd_line)
+)
+} else {
+if (print_nb_messages) nbe(see_variable(nrow_m > 0), "\n")
 }
-}
+if (print_nb_messages) nb("end kinase ", kinase_name, "\n")
+}
-# Write output tabular files
+# Write output tabular files
-# get kinase, ppep, concat(kinase) tuples for enriched kinases
+# get kinase, ppep, concat(kinase) tuples for enriched kinases
-kinase_ppep_label <- sqldf("
-WITH
+if (print_nb_messages) nb("kinase_ppep_label <- ...\n")
-t(ppep, label) AS
+if (print_nb_messages) nbe("kinase_ppep_label <- ...\n")
-(
+kinase_ppep_label <- sqldf("
-SELECT DISTINCT
+WITH
-SUB_MOD_RSD AS ppep,
+t(ppep, label) AS
-group_concat(gene, '; ') AS label
+(
+SELECT DISTINCT
+SUB_MOD_RSD AS ppep,
+group_concat(gene, '; ') AS label
+FROM pseudo_ksdata
+WHERE GENE IN (SELECT kinase FROM enriched_kinases)
+GROUP BY ppep
+),
+k(kinase, ppep_join) AS
+(
+SELECT DISTINCT gene AS kinase, SUB_MOD_RSD AS ppep_join
 FROM pseudo_ksdata
 WHERE GENE IN (SELECT kinase FROM enriched_kinases)
-GROUP BY ppep
+)
-),
+SELECT k.kinase, t.ppep, t.label
-k(kinase, ppep_join) AS
+FROM  t,  k
-(
+WHERE t.ppep = k.ppep_join
-SELECT DISTINCT gene AS kinase, SUB_MOD_RSD AS ppep_join
+ORDER BY k.kinase, t.ppep
-FROM pseudo_ksdata
+")
-WHERE GENE IN (SELECT kinase FROM enriched_kinases)
-)
-SELECT k.kinase, t.ppep, t.label
+# extract what we need from full_data
-FROM  t,  k
+impish <- cbind(rownames(quant_data_imp), quant_data_imp)
-WHERE t.ppep = k.ppep_join
+colnames(impish)[1] <- "Phosphopeptide"
-ORDER BY k.kinase, t.ppep
+data_table_imputed_sql <- "
-")
+SELECT
+f.*,
-# extract what we need from full_data
+k.label AS KSEA_enrichments,
-impish <- cbind(rownames(quant_data_imp), quant_data_imp)
+q.*
-colnames(impish)[1] <- "Phosphopeptide"
+FROM
-data_table_imputed_sql <- "
+metadata_plus_p f
-SELECT
+LEFT JOIN kinase_ppep_label k
-f.*,
+ON f.Phosphopeptide = k.ppep,
-k.label AS KSEA_enrichments,
+impish q
-q.*
+WHERE
-FROM
+f.Phosphopeptide = q.Phosphopeptide
-metadata_plus_p f
+"
-LEFT JOIN kinase_ppep_label k
+data_table_imputed <- sqldf(data_table_imputed_sql)
-ON f.Phosphopeptide = k.ppep,
+# Zap the duplicated 'Phosphopeptide' column named 'ppep'
-impish q
+data_table_imputed <-
-WHERE
+data_table_imputed[, c(1:12, 14:ncol(data_table_imputed))]
-f.Phosphopeptide = q.Phosphopeptide
-"
+# Output imputed, un-normalized data
-data_table_imputed <- sqldf(data_table_imputed_sql)
+if (print_nb_messages) nb("Output imputed, un-normalized data tabular file\n")
-# Zap the duplicated 'Phosphopeptide' column named 'ppep'
+if (print_nb_messages) nbe("Output imputed, un-normalized data tabular file\n")
-data_table_imputed <-
+write.table(
-data_table_imputed[, c(1:12, 14:ncol(data_table_imputed))]
+data_table_imputed
+, file = imputed_data_filename
-# Output with imputed, un-normalized data
+, sep = "\t"
+, col.names = TRUE
-write.table(
+, row.names = FALSE
-data_table_imputed
+, quote = FALSE
-, file = imputed_data_filename
+)
-, sep = "\t"
-, col.names = TRUE
-, row.names = FALSE
+#output quantile normalized data
-, quote = FALSE
+impish <- cbind(rownames(quant_data_imp_qn_log), quant_data_imp_qn_log)
-)
+colnames(impish)[1] <- "Phosphopeptide"
+data_table_imputed <- sqldf(data_table_imputed_sql)
+# Zap the duplicated 'Phosphopeptide' column named 'ppep'
-#output quantile normalized data
+data_table_imputed <-
-impish <- cbind(rownames(quant_data_imp_qn_log), quant_data_imp_qn_log)
+data_table_imputed[, c(1:12, 14:ncol(data_table_imputed))]
-colnames(impish)[1] <- "Phosphopeptide"
+if (print_nb_messages) nb("Output quantile normalized data tabular file\n")
-data_table_imputed <- sqldf(data_table_imputed_sql)
+if (print_nb_messages) nbe("Output quantile normalized data tabular file\n")
-# Zap the duplicated 'Phosphopeptide' column named 'ppep'
+write.table(
-data_table_imputed <-
+data_table_imputed,
-data_table_imputed[, c(1:12, 14:ncol(data_table_imputed))]
+file = imp_qn_lt_data_filenm,
-write.table(
+sep = "\t",
-data_table_imputed,
+col.names = TRUE,
-file = imp_qn_lt_data_filenm,
+row.names = FALSE,
-sep = "\t",
+quote = FALSE
-col.names = TRUE,
+)
-row.names = FALSE,
-quote = FALSE
+ppep_kinase <- sqldf("
-)
+SELECT DISTINCT k.ppep, k.kinase
+FROM (
-ppep_kinase <- sqldf("
+SELECT DISTINCT gene AS kinase, SUB_MOD_RSD AS ppep
-SELECT DISTINCT k.ppep, k.kinase
+FROM pseudo_ksdata
-FROM (
+WHERE GENE IN (SELECT kinase FROM enriched_kinases)
-SELECT DISTINCT gene AS kinase, SUB_MOD_RSD AS ppep
+) k
-FROM pseudo_ksdata
+ORDER BY k.ppep, k.kinase
-WHERE GENE IN (SELECT kinase FROM enriched_kinases)
+")
-) k
-ORDER BY k.ppep, k.kinase
+RSQLite::dbWriteTable(
-")
+conn = db,
+name = "ksea_enriched_ks",
-RSQLite::dbWriteTable(
+value = ppep_kinase,
-conn = db,
+append = FALSE
-name = "ksea_enriched_ks",
+)
-value = ppep_kinase,
+}
-append = FALSE
-)
+if (print_nb_messages) nb("RSQLite::dbWriteTable anova_signif\n")
 RSQLite::dbWriteTable(
 conn = db,
 name = "anova_signif",
 value = p_value_data,
 ON m.phospho_peptide = kek.ppep
 ;
 "
 )
+if (print_nb_messages) nb("Output contents of `stats_metadata_v` table to tabular file\n")
+if (print_nb_messages) nbe("Output contents of `stats_metadata_v` table to tabular file\n")
 write.table(
 dbReadTable(db, "stats_metadata_v"),
 file = anova_ksea_mtdt_file,
 sep = "\t",
 col.names = TRUE,
 row.names = FALSE,
 quote = FALSE
 )
+cat("\n\\clearpage\n")
 ```
+# Data-processing summary flowchart
+![Flowchart showing ANOVA and KSEA data-processing steps](KSEA_impl_flowchart.pdf)
 ```{r parmlist, echo = FALSE, fig.dim = c(9, 10), results = 'asis'}
 cat("\\leavevmode\n\n\n")
-# write parameters to report
+write_params(db)
-param_unlist <- unlist(as.list(params))
-param_df <- data.frame(
-parameter = paste0("\\verb@", names(param_unlist), "@"),
-value = paste0(
-"\n\\begin{tiny}\n\\verb@",
-gsub("$", "\\$", param_unlist, fixed = TRUE),
-"@\n\\end{tiny}"
-)
-)
-data_frame_latex(
-x = param_df,
-justification = "p{0.35\\linewidth} p{0.6\\linewidth}",
-centered = TRUE,
-caption = "Input parameters",
-anchor = const_table_anchor_bp,
-underscore_whack = FALSE
-)
-# write parameters to SQLite output
-mqppep_anova_script_param_df <- data.frame(
-script    = "mqppep_anova_script.Rmd",
-parameter = names(param_unlist),
-value     = param_unlist
-)
-ddl_exec(db, "
-DROP TABLE IF EXISTS script_parameter;
-"
-)
-ddl_exec(db, "
-CREATE TABLE IF NOT EXISTS script_parameter(
-script    TEXT,
-parameter TEXT,
-value     ANY,
-UNIQUE (script, parameter) ON CONFLICT REPLACE
-)
-;
-"
-)
-RSQLite::dbWriteTable(
-conn = db,
-name = "script_parameter",
-value = mqppep_anova_script_param_df,
-append = TRUE
-)
 # We are done with output
 RSQLite::dbDisconnect(db)
+cat("\\clearpage\n\\section{R package versions}\n")
+utils::toLatex(utils::sessionInfo())
 ```
-<!--
-There's gotta be a better way...
-loaded_packages_df <-  sessioninfo::package_info("loaded")
-loaded_packages_df[, "library"] <- as.character(loaded_packages_df$library)
-loaded_packages_df <- data.frame(
-package = loaded_packages_df$package,
-version = loaded_packages_df$loadedversion,
-date    = loaded_packages_df$date
-)
-data_frame_latex(
-x = loaded_packages_df,
-justification = "l | l l",
-centered = FALSE,
-caption = "Loaded R packages",
-anchor = const_table_anchor_bp
-)
--->

Mercurial > repos > eschen42 > mqppep_anova

comparison mqppep_anova_script.Rmd @ 26:5b8e15b2a67c draft