comparison mqppep_anova_script.Rmd @ 26:5b8e15b2a67c draft

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit e0b80550743f634282b4b4348b75e6f172dc1488
author eschen42
date Wed, 26 Oct 2022 23:48:51 +0000
parents f9cd87ac8006
children 42b207aaa527
comparison
equal deleted inserted replaced
25:f9cd87ac8006 26:5b8e15b2a67c
5 - "Larry Cheng^[ORCiD 0000-0002-6922-6433, Rutgers School of Graduate Studies: New Brunswick, NJ, US]" 5 - "Larry Cheng^[ORCiD 0000-0002-6922-6433, Rutgers School of Graduate Studies: New Brunswick, NJ, US]"
6 - "Art Eschenlauer^[ORCiD 0000-0002-2882-0508, University of Minnesota: Minneapolis, Minnesota, US]" 6 - "Art Eschenlauer^[ORCiD 0000-0002-2882-0508, University of Minnesota: Minneapolis, Minnesota, US]"
7 date: 7 date:
8 - "May 28, 2018" 8 - "May 28, 2018"
9 - "; revised June 23, 2022" 9 - "; revised June 23, 2022"
10 lot: true
10 output: 11 output:
11 pdf_document: 12 pdf_document:
12 toc: true 13 toc: true
13 toc_depth: 3 14 toc_depth: 2
14 keep_tex: true 15 keep_tex: true
15 header-includes: 16 dev: pdf
16 - \usepackage{longtable} 17 includes:
17 - \newcommand\T{\rule{0pt}{2.6ex}} % Top strut 18 in_header: mqppep_anova_preamble.tex
18 - \newcommand\B{\rule[-1.2ex]{0pt}{0pt}} % Bottom strut 19 latex_macros: false
20 raw_tex: true
21 urlcolor: blue
19 params: 22 params:
20 alphaFile: "test-data/alpha_levels.tabular" 23 alphaFile: "test-data/alpha_levels.tabular"
21 inputFile: "test-data/test_input_for_anova.tabular" 24 inputFile: "test-data/test_input_for_anova.tabular"
22 preprocDb: "test-data/test_input_for_anova.sqlite" 25 preprocDb: "test-data/test_input_for_anova.sqlite"
23 kseaAppPrepDb: !r c(":memory:", "test-data/mqppep.sqlite")[2] 26 kseaAppPrepDb: !r c(":memory:", "test-data/mqppep.sqlite")[2]
24 regexSampleNames: "\\.\\d+[A-Z]$" 27 regexSampleNames: "\\.\\d+[A-Z]$"
25 regexSampleGrouping: "\\d+" 28 regexSampleGrouping: "\\d+"
26 show_toc: true 29 groupFilterPatterns: ".+"
30 groupFilter: !r c("none", "exclude", "include")[1]
31 imputationMethod: !r c("group-median", "median", "mean", "random")[4]
32 kseaCutoffThreshold: !r c(0.05, 0.1, 0.25, 0.5, 0.9)[5]
33 #imputationMethod: !r c("group-median", "median", "mean", "random")[1]
34
35 # how should sample groups be interpreted?
36 # - "f": fixed patterns (like `grep -F`)
37 # - "p": PERL-compatible (like `grep -P`)
38 # - "r": extended grep patterns (like `grep -E`)
39 # use what case sensitivity?
40 # - "i": case insensitive matching (like `grep -i`)
41 groupFilterMode: !r c("r", "ri", "p", "pi", "f", "fi")[1]
42 # what pattern should be used for the first column
43 # (extended grep pattern, case sensitive)
27 firstDataColumn: "^Intensity[^_]" 44 firstDataColumn: "^Intensity[^_]"
28 imputationMethod: !r c("group-median", "median", "mean", "random")[1] 45 # for small random value imputation, what percentile should be center?
29 meanPercentile: 1 46 meanPercentile: 50
47 #meanPercentile: 1
48 # for small random value imputation, what should `s / mean(x)` ratio be?
30 sdPercentile: 1.0 49 sdPercentile: 1.0
50 # output path for imputed data file
31 imputedDataFilename: "test-data/limbo/imputedDataFilename.txt" 51 imputedDataFilename: "test-data/limbo/imputedDataFilename.txt"
52 # output path for imputed/quantile-normalized/log-transformed data file
32 imputedQNLTDataFile: "test-data/limbo/imputedQNLTDataFile.txt" 53 imputedQNLTDataFile: "test-data/limbo/imputedQNLTDataFile.txt"
54 # output path for contents of `stats_metadata_v` table
33 anovaKseaMetadata: "test-data/limbo/anovaKseaMetadata.txt" 55 anovaKseaMetadata: "test-data/limbo/anovaKseaMetadata.txt"
56 # how to test one variable with > 2 categories (e.g., aov or kruskal.test)
34 oneWayManyCategories: !r c("aov", "kruskal.test", "oneway.test")[1] 57 oneWayManyCategories: !r c("aov", "kruskal.test", "oneway.test")[1]
58 # how to test one variable with 2 categories (e.g., oneway.test)
35 oneWayTwoCategories: !r c("aov", "kruskal.test", "oneway.test")[3] 59 oneWayTwoCategories: !r c("aov", "kruskal.test", "oneway.test")[3]
36 kseaCutoffStatistic: !r c("p.value", "FDR")[2] 60 # what should be the minimum quality for consideration in both
37 kseaCutoffThreshold: !r c( 0.1, 0.05)[2] 61 minQuality: 0
38 kseaMinKinaseCount: 1 62 # correct KSEA with FDR (recommended) or raw p-value
39 intensityHeatmapRows: 75 63 kseaCutoffStatistic: !r c("FDR", "p.value")[1]
64 # correct KSEA threshold 0.05 (conventional) or higher (perhaps better)
65 # "perhaps better" meaning that KSEA is an hypothesis-generator, not -test
66 #kseaCutoffThreshold: !r c(0.05, 0.1, 0.25, 0.5)[1]
67 # minimum number of substrates required for a kinase to be considered in KSEA
68 kseaMinSubstrateCount: 1
69 # Should KSEA be performed aggregating signed log2FC or absolute?
70 # FALSE use raw log2FC for KSEA as for KSEAapp::KSEA.Scores
71 # TRUE use abs(log2FC) for KSEA as Justin Drake requested; this is a
72 # justifiable deviation from the KSEAapp::KSEA.Scores algorithm.
73 kseaUseAbsoluteLog2FC: TRUE
74 #kseaUseAbsoluteLog2FC: FALSE
75 # minimum number of observed values per sample-group
76 intensityMinValuesPerGroup: 1
77 # maximum number of heatmap rows (result are poor when > 50)
78 intensityHeatmapRows: 50
79 # what should be the primary criterion to eliminate excessive heatmap rows
80 intensityHeatmapCriteria: !r c("quality", "na_count", "p_value")[1]
81 # should correlation among substrates be used (rather than covariance)
82 correlateSubstrates: TRUE
83 # only show covariance among variables having variance > 1
84 filterCovVarGT1: TRUE
85 # maximum number of residues to display for ppeps in rownames or columnames
86 ppepTruncN: 10
87 # maximum number of characters of subgenes to display in rownames or columnames
88 subgeneTruncN: 10
89 # maximum number of characters for paste(subgene, ppep) for enrichment plots
90 substTruncN: 20
91 # should boxplots use variable-width boxes to reflect # of samples
92 boxPlotVarWidth: TRUE
93 # should boxplots use notched boxes to reflect difference between samples
94 boxPlotNotch: TRUE
95 # look-up tables for kinase descriptions
96 kinaseNameUprtLutBz2: "./kinase_name_uniprot_lut.tabular.bz2"
97 kinaseUprtDescLutBz2: "./kinase_uniprot_description_lut.tabular.bz2"
98 # should debugging trace messages be printed?
99 showEnrichedSubstrates: FALSE
100
101 # should debugging nb/nbe messages be printed?
102 printNBMsgs: FALSE
103 # should debugging trace messages be printed?
104 printTraceMsgs: FALSE
105 # when debugging files are needed, set debugFileBasePath to the path
106 # to the directory where they should be writtn
107 debugFileBasePath: !r if (TRUE) NULL else "test-data"
40 --- 108 ---
41 <!-- 109 <!--
110 alphaFile: "test-data/alpha_levels.tabular"
111 inputFile: "test-data/PDX_pST_by_trt.ppep_intensities.ppep_map.preproc_tab.tabular"
112 preprocDb: "test-data/PDX_pST_by_trt.ppep_intensities.ppep_map.preproc_sqlite.sqlite"
113 kseaAppPrepDb: !r c(":memory:", "test-data/mqppep.sqlite")[2]
114 regexSampleNames: "\\.\\w+\\.\\d+[A-Z]$"
115 regexSampleGrouping: "\\w+"
116 groupFilterPatterns: ".+"
117 groupFilter: !r c("none", "exclude", "include")[3]
118 imputationMethod: !r c("group-median", "median", "mean", "random")[1]
119 kseaCutoffThreshold: !r c(0.05, 0.1, 0.20, 0.35, 0.4, 0.5, 0.999)[1]
120 ut_alphaFile: "test-data/alpha_levels.tabular"
121 ut_inputFile: "test-data/UT_phospho_ST_sites.preproc.tabular"
122 ut_preprocDb: "test-data/UT_phospho_ST_sites.preproc.sqlite"
123 ut_kseaAppPrepDb: !r c(":memory:", "test-data/UT_phospho_ST_sites.ksea.sqlite")[2]
124 ut_regexSampleNames: "\\.\\d+[A-Z]$"
125 ut_regexSampleGrouping: "\\d+"
126 ut_groupFilterPatterns: ".+,.*"
127 ut_groupFilter: !r c("none", "exclude", "include")[1]
128 ut_imputationMethod: !r c("group-median", "median", "mean", "random")[4]
129 ut_kseaCutoffThreshold: !r c(0.05, 0.1, 0.25, 0.5, 0.9)[1]
130 tst_alphaFile: "test-data/alpha_levels.tabular"
131 tst_inputFile: "test-data/test_input_for_anova.tabular"
132 tst_preprocDb: "test-data/test_input_for_anova.sqlite"
133 tst_kseaAppPrepDb: !r c(":memory:", "test-data/mqppep.sqlite")[2]
134 tst_regexSampleNames: "\\.\\d+[A-Z]$"
135 tst_regexSampleGrouping: "\\d+"
136 tst_groupFilterPatterns: ".+"
137 tst_groupFilter: !r c("none", "exclude", "include")[1]
138 tst_imputationMethod: !r c("group-median", "median", "mean", "random")[4]
139 tst_kseaCutoffThreshold: !r c(0.05, 0.1, 0.25, 0.5, 0.9)[5]
140
141 tst_alphaFile: "test-data/alpha_levels.tabular"
142 tst_inputFile: "test-data/UT_phospho_ST_sites.preproc.tabular"
143 tst_preprocDb: "test-data/UT_phospho_ST_sites.preproc.sqlite"
144 tst_kseaAppPrepDb: !r c(":memory:", "test-data/UT_phospho_ST_sites.ksea.sqlite")[2]
145 tst_regexSampleNames: "\\.\\d+[A-Z]$"
146 tst_regexSampleGrouping: "\\d+"
147 tst_groupFilterPatterns: ".+,.*"
148 tst_groupFilter: !r c("none", "exclude", "include")[1]
149 tst_imputationMethod: !r c("group-median", "median", "mean", "random")[4]
150 tst_kseaCutoffThreshold: !r c(0.05, 0.1, 0.20, 0.35, 0.4, 0.5, 0.999)[5]
151 px_alphaFile: "test-data/alpha_levels.tabular"
152 px_inputFile: "test-data/PDX_pST_by_trt.ppep_intensities.ppep_map.preproc_tab.tabular"
153 px_preprocDb: "test-data/PDX_pST_by_trt.ppep_intensities.ppep_map.preproc_sqlite.sqlite"
154 px_kseaAppPrepDb: !r c(":memory:", "test-data/mqppep.sqlite")[2]
155 px_regexSampleNames: "\\.\\w+\\.\\d+[A-Z]$"
156 px_regexSampleGrouping: "\\w+"
157 px_groupFilterPatterns: ".+"
158 px_groupFilter: !r c("none", "exclude", "include")[3]
159 px_imputationMethod: !r c("group-median", "median", "mean", "random")[4]
160 px_kseaCutoffThreshold: !r c(0.05, 0.1, 0.20, 0.35, 0.4, 0.5, 0.999)[5]
161 pdx_alphaFile: "test-data/alpha_levels.tabular"
162 pdx_inputFile: "test-data/PDX012970_pST.preproc_tab.tabular"
163 pdx_preprocDb: "test-data/PDX012970_pST.preproc.sqlite"
164 pdx_kseaAppPrepDb: !r c(":memory:", "test-data/PDX012970.sqlite")[2]
165 pdx_regexSampleNames: "\\.\\w+\\.\\w+\\.\\d+[A-Z]$"
166 pdx_regexSampleGrouping: "\\.\\w+\\K\\.\\w+"
167 pdx_groupFilterPatterns: "AdCa,AVPC"
168 pdx_groupFilter: !r c("none", "exclude", "include")[3]
169 pdx_imputationMethod: !r c("group-median", "median", "mean", "random")[4]
170 pdx_kseaCutoffThreshold: !r c(0.05, 0.1, 0.25, 0.5, 0.9)[1]
171 tst_alphaFile: "test-data/alpha_levels.tabular"
172 tst_inputFile: "test-data/test_input_for_anova.tabular"
173 tst_preprocDb: "test-data/test_input_for_anova.sqlite"
174 tst_kseaAppPrepDb: !r c(":memory:", "test-data/mqppep.sqlite")[2]
175 tst_regexSampleNames: "\\.\\d+[A-Z]$"
176 tst_regexSampleGrouping: "\\d+"
177 tst_groupFilterPatterns: ".+"
178 tst_groupFilter: !r c("none", "exclude", "include")[1]
179 tst_kseaCutoffThreshold: !r c(0.05, 0.1, 0.25, 0.5, 0.9)[5]
180 tst_imputationMethod: !r c("group-median", "median", "mean", "random")[1]
181 ut_alphaFile: "test-data/alpha_levels.tabular"
182 ut_inputFile: "test-data/UT_phospho_ST_sites.preproc.tabular"
183 ut_preprocDb: "test-data/UT_phospho_ST_sites.preproc.sqlite"
184 ut_kseaAppPrepDb: !r c(":memory:", "test-data/UT_phospho_ST_sites.ksea.sqlite")[2]
185 ut_regexSampleNames: "\\.\\d+[A-Z]$"
186 ut_regexSampleGrouping: "\\d+"
187 ut_groupFilterPatterns: ".+,.*"
188 ut_groupFilter: !r c("none", "exclude", "include")[1]
189 ut_imputationMethod: !r c("group-median", "median", "mean", "random")[4]
42 alphaFile: "test-data/alpha_levels.tabular" 190 alphaFile: "test-data/alpha_levels.tabular"
43 inputFile: "test-data/test_input_for_anova.tabular" 191 inputFile: "test-data/test_input_for_anova.tabular"
44 preprocDb: "test-data/test_input_for_anova.sqlite" 192 preprocDb: "test-data/test_input_for_anova.sqlite"
45 kseaAppPrepDb: !r c(":memory:", "test-data/mqppep.sqlite")[2] 193 kseaAppPrepDb: !r c(":memory:", "test-data/mqppep.sqlite")[2]
46 regexSampleNames: "\\.\\d+[A-Z]$" 194 regexSampleNames: "\\.\\d+[A-Z]$"
47 regexSampleGrouping: "\\d+" 195 regexSampleGrouping: "\\d+"
196 groupFilterPatterns: ".+,.*"
197 groupFilter: !r c("none", "exclude", "include")[1]
198 imputationMethod: !r c("group-median", "median", "mean", "random")[4]
199 nd_alphaFile: "test-data/alpha_levels.tabular"
200 nd_inputFile: "test-data/pST_Sites_NancyDu.txt.preproc.tabular"
201 nd_preprocDb: "test-data/pST_Sites_NancyDu.txt.preproc.sqlite"
202 nd_kseaAppPrepDb: !r c(":memory:", "test-data/pST_Sites_NancyDu.ksea.sqlite")[2]
203 nd_regexSampleNames: "\\.\\d+[A-Z]$"
204 nd_regexSampleGrouping: "\\d+"
205 nd_groupFilterPatterns: ".+,.*"
206 nd_groupFilter: !r c("none", "exclude", "include")[1]
207 nd_imputationMethod: !r c("group-median", "median", "mean", "random")[4]
208 pxd_alphaFile: "test-data/alpha_levels.tabular"
209 pxd_inputFile: "test-data/PDX_pST_by_trt.ppep_intensities.ppep_map.preproc_tab.tabular"
210 pxd_preprocDb: "test-data/PDX_pST_by_trt.ppep_intensities.ppep_map.preproc_sqlite.sqlite"
211 pxd_kseaAppPrepDb: !r c(":memory:", "test-data/mqppep.sqlite")[2]
212 pxd_regexSampleNames: "\\.\\w+\\.\\d+[A-Z]$"
213 pxd_regexSampleGrouping: "\\w+"
214 pxd_groupFilterPatterns: ".+"
215 pxd_groupFilter: !r c("none", "exclude", "include")[3]
216 pxd_imputationMethod: !r c("group-median", "median", "mean", "random")[4]
217
218 alphaFile: "test-data/alpha_levels.tabular"
219 inputFile: "test-data/test_input_for_anova.tabular"
220 preprocDb: "test-data/test_input_for_anova.sqlite"
221 kseaAppPrepDb: !r c(":memory:", "test-data/mqppep.sqlite")[2]
222 regexSampleNames: "\\.\\d+[A-Z]$"
223 regexSampleGrouping: "\\d+"
224 groupFilterPatterns: ".+,.*"
225 groupFilter: !r c("none", "exclude", "include")[1]
226
227 alphaFile: "test-data/alpha_levels.tabular"
228 inputFile: "test-data/PDX012970_pST.preproc_tab.tabular"
229 preprocDb: "test-data/PDX012970_pST.preproc.sqlite"
230 kseaAppPrepDb: !r c(":memory:", "test-data/PDX012970.sqlite")[2]
231 regexSampleNames: "\\.\\w+\\.\\w+\\.\\d+[A-Z]$"
232 regexSampleGrouping: "\\.\\w+\\K\\.\\w+"
233 groupFilterPatterns: "AdCa,AVPC"
234 groupFilter: !r c("none", "exclude", "include")[3]
48 235
49 alphaFile: "test-data/alpha_levels.tabular" 236 alphaFile: "test-data/alpha_levels.tabular"
50 inputFile: "test-data/PDX_pST_by_trt.ppep_intensities.ppep_map.preproc_tab.tabular" 237 inputFile: "test-data/PDX_pST_by_trt.ppep_intensities.ppep_map.preproc_tab.tabular"
51 preprocDb: "test-data/PDX_pST_by_trt.ppep_intensities.ppep_map.preproc_sqlite.sqlite" 238 preprocDb: "test-data/PDX_pST_by_trt.ppep_intensities.ppep_map.preproc_sqlite.sqlite"
52 kseaAppPrepDb: !r c(":memory:", "test-data/mqppep.sqlite")[2] 239 kseaAppPrepDb: !r c(":memory:", "test-data/mqppep.sqlite")[2]
53 regexSampleNames: "\\.\\w+\\.\\d+[A-Z]$" 240 regexSampleNames: "\\.\\w+\\.\\d+[A-Z]$"
54 regexSampleGrouping: "\\w+" 241 regexSampleGrouping: "\\w+"
242 groupFilterPatterns: ".+,.*"
243 groupFilter: !r c("none", "exclude", "include")[3]
55 244
56 kseaCutoffStatistic: !r c("p.value", "FDR")[2] 245 kseaCutoffStatistic: !r c("p.value", "FDR")[2]
57 kseaCutoffThreshold: !r c(0.05, 0.1)[1] 246 kseaCutoffThreshold: !r c(0.05, 0.1)[1]
58 247
59 alphaFile: "test-data/alpha_levels.tabular" 248 alphaFile: "test-data/alpha_levels.tabular"
67 inputFile: "test-data/pY_Sites_NancyDu.txt.ppep_intensities.ppep_map.preproc.tabular" 256 inputFile: "test-data/pY_Sites_NancyDu.txt.ppep_intensities.ppep_map.preproc.tabular"
68 preprocDb: "test-data/pY_Sites_NancyDu.txt.ppep_intensities.ppep_map.preproc.sqlite" 257 preprocDb: "test-data/pY_Sites_NancyDu.txt.ppep_intensities.ppep_map.preproc.sqlite"
69 kseaAppPrepDb: !r c(":memory:", "test-data/pY_Sites_NancyDu.ksea.sqlite")[2] 258 kseaAppPrepDb: !r c(":memory:", "test-data/pY_Sites_NancyDu.ksea.sqlite")[2]
70 regexSampleNames: "\\.\\d+[A-Z]$" 259 regexSampleNames: "\\.\\d+[A-Z]$"
71 regexSampleGrouping: "\\d+" 260 regexSampleGrouping: "\\d+"
261 groupFilterPatterns: ".+,.*"
262 groupFilter: !r c("none", "exclude", "include")[3]
72 263
73 alphaFile: "test-data/alpha_levels.tabular" 264 alphaFile: "test-data/alpha_levels.tabular"
74 inputFile: "test-data/pST_Sites_NancyDu.txt.preproc.tabular" 265 inputFile: "test-data/pST_Sites_NancyDu.txt.preproc.tabular"
75 preprocDb: "test-data/pST_Sites_NancyDu.txt.preproc.sqlite" 266 preprocDb: "test-data/pST_Sites_NancyDu.txt.preproc.sqlite"
76 kseaAppPrepDb: !r c(":memory:", "test-data/pST_Sites_NancyDu.ksea.sqlite")[2] 267 kseaAppPrepDb: !r c(":memory:", "test-data/pST_Sites_NancyDu.ksea.sqlite")[2]
77 regexSampleNames: "\\.\\d+[A-Z]$" 268 regexSampleNames: "\\.\\d+[A-Z]$"
78 regexSampleGrouping: "\\d+" 269 regexSampleGrouping: "\\d+"
79 270 groupFilterPatterns: ".+,.*"
80 inputFile: "test-data/density_failure.preproc_tab.tabular" 271 groupFilter: !r c("none", "exclude", "include")[1]
81 kseaAppPrepDb: !r c(":memory:", "mqppep.sqlite")[2] 272
82 latex_document: default
83 --> 273 -->
84 ```{r setup, include = FALSE} 274 ```{r setup, include = FALSE, results = 'asis'}
275
276 # simple debug messaging
277 print_nb_messages <- params$printNBMsgs
278
279 nb <- if (!print_nb_messages) {
280 function(...) invisible()
281 } else {
282 function(..., f = cat) f("\n$\\exists{}\\supset\\forall{}$", ...)
283 }
284
285 nbe <- if (!print_nb_messages) {
286 function(...) invisible()
287 } else {
288 function(..., f = cat, file = stderr()) {
289 cat(
290 stringi::stri_unescape_unicode("\nNBE \\u2203\\u2283\\u2200"),
291 ...,
292 file = file
293 )
294 }
295 }
296
85 #ref for debugging: https://yihui.org/tinytex/r/#debugging 297 #ref for debugging: https://yihui.org/tinytex/r/#debugging
86 options(tinytex.verbose = TRUE) 298 options(tinytex.verbose = TRUE)
87 299
88 # ref for parameterizing Rmd document: https://stackoverflow.com/a/37940285 300 # ref for parameterizing Rmd document: https://stackoverflow.com/a/37940285
89 # ref for top and bottom struts: https://tex.stackexchange.com/a/50355 301 # ref for top and bottom struts: https://tex.stackexchange.com/a/50355
90 knitr::opts_chunk$set(echo = FALSE, fig.dim = c(9, 10)) 302 knitr::opts_chunk$set(echo = FALSE, fig.dim = c(9, 10), dpi = 300)
91 303
92 # freeze the random number generator so the same results will be produced 304 # freeze the random number generator so the same results will be produced
93 # from run to run 305 # from run to run
94 set.seed(28571) 306 set.seed(28571)
95 307
96 ### LIBRARIES 308 ### LIBRARIES
309
310 if (print_nb_messages) nbe("library(gplots)")
97 library(gplots) 311 library(gplots)
312 if (print_nb_messages) nbe("library(caret)")
313 # load caret for nearZeroVar
314 if (print_nb_messages) nbe("Please ignore the messages about systemd, if any.\n")
315 library(caret)
316 if (print_nb_messages) nbe("library(DBI)")
98 library(DBI) 317 library(DBI)
318 if (print_nb_messages) nbe("library(RSQLite)")
99 library(RSQLite) 319 library(RSQLite)
320 if (print_nb_messages) nbe("library(sqldf)\n")
100 # Suppress "Warning: no DISPLAY variable so Tk is not available" 321 # Suppress "Warning: no DISPLAY variable so Tk is not available"
101 suppressWarnings(suppressMessages(library(sqldf))) 322 suppressWarnings(suppressMessages(library(sqldf)))
102 323
103 # required but not added to search list: 324 # required but not added to search list:
104 # - DBI 325 # - DBI
110 # - reshape2 331 # - reshape2
111 # - vioplot 332 # - vioplot
112 333
113 ### CONSTANTS 334 ### CONSTANTS
114 335
115 const_parfin <- par("fin") 336 const_boxplot_fill <- "grey94"
116 const_boxplot_fill <- "grey94"
117 const_stripchart_cex <- 0.5
118 const_stripsmall_cex <-
119 sqrt(const_stripchart_cex * const_stripchart_cex / 2)
120 const_stripchart_jitter <- 0.3
121 const_write_debug_files <- FALSE
122 const_table_anchor_bp <- "bp"
123 const_table_anchor_ht <- "ht"
124 const_table_anchor_p <- "p"
125 const_table_anchor_tbp <- "tbp"
126
127
128 const_ksea_astrsk_kinases <- 1 337 const_ksea_astrsk_kinases <- 1
129 const_ksea_nonastrsk_kinases <- 2 338 const_ksea_nonastrsk_kinases <- 2
130 const_ksea_all_kinases <- 3 339 const_ksea_all_kinases <- 3
131 340 const_log10_e <- log10(exp(1))
132 const_log10_e <- log10(exp(1)) 341 const_stripchart_cex <- 0.5
133 342 const_stripchart_jitter <- 0.3
134 ### FUNCTIONS 343 const_table_anchor_bp <- "bp"
135 344 const_table_anchor_ht <- "ht"
136 # from `demo(error.catching)` 345 const_table_anchor_p <- "p"
346 const_table_anchor_t <- "t"
347 const_table_anchor_tbp <- "tbp"
348
349
350 ### GLOBAL VARIABLES (params)
351
352 ## functions to process params
353
354 is_string_null_or_empty <- function(x) {
355 # N. B. non-strings are intentionally treated as NULL
356 if (is.null(x))
357 TRUE
358 else if (!is.character(x))
359 TRUE
360 else x == ""
361 }
362
137 ##' Catch *and* save both errors and warnings, and in the case of 363 ##' Catch *and* save both errors and warnings, and in the case of
138 ##' a warning, also keep the computed result. 364 ##' a warning, also keep the computed result.
365 ##' return result as list(value = ..., warning = ...)
366 ##' - value will be:
367 ##' - the result if no exception is thrown
368 ##' - the exception if an exception is caught
369 ##' - warning will be a string except perhaps when warning argument is not NULL
370 ##'
371 ##' adapted from `demo(error.catching)`
139 ##' 372 ##'
140 ##' @title tryCatch both warnings (with value) and errors 373 ##' @title tryCatch both warnings (with value) and errors
141 ##' @param expr an \R expression to evaluate 374 ##' @param expr an \R expression to evaluate
142 ##' @return a list with 'value' and 'warning', where 375 ##' @return a list with 'value' and 'warning', where
143 ##' 'value' may be an error caught. 376 ##' 'value' may be an error caught.
144 ##' @author Martin Maechler; 377 ##' @author Martin Maechler;
145 ##' Copyright (C) 2010-2012 The R Core Team 378 ##' Copyright (C) 2010-2012 The R Core Team
146 try_catch_w_e <- function(expr) { 379 try_catch_w_e <-
147 wrn <- NULL 380 function(expr, error = function(e) e, warning = NULL) {
148 # warning handler 381 wrn <- NULL
149 w_handler <- function(w) { 382 # warning handler
150 wrn <<- w 383 w_handler <-
151 invokeRestart("muffleWarning") 384 if (is.function(warning))
152 } 385 warning
153 list( 386 else
154 value = withCallingHandlers( 387 function(w) {
155 tryCatch( 388 wrn <<- w
156 expr, 389 invokeRestart("muffleWarning")
157 error = function(e) e 390 }
391 e_handler <-
392 if (is.function(error))
393 error
394 else
395 function(e) e
396 # return result as list(value = ..., warning = ...)
397 # - value will be:
398 # - the result if no exception is thrown
399 # - the exception if an exception is caught
400 list(
401 value = withCallingHandlers(
402 tryCatch(
403 expr,
404 error = e_handler
405 ),
406 warning = w_handler
158 ), 407 ),
159 warning = w_handler 408 warning = wrn
160 ), 409 )
161 warning = wrn 410 }
162 ) 411
163 } 412 see_kvp <-
164 413 function(format, key, value, suffix = "") {
165 414 if (
166 write_debug_file <- function(s) { 415 !all(
167 if (const_write_debug_files) { 416 is.character(format),
168 s_path <- sprintf("test-data/%s.txt", deparse(substitute(s))) 417 is.character(key),
169 print(sprintf("DEBUG writing file %s", spath)) 418 is.character(value),
419 is.character(suffix)
420 )
421 ) {
422 cat("all arguments to see_kvp should be character")
423 knitr::knit_exit()
424 }
425 result <- sprintf(format, value)
426 if (length(result) > 1) {
427 sprintf(
428 "%s = c(%s)%s",
429 whack_underscores(key),
430 paste(result, collapse = ", "),
431 suffix
432 )
433 } else {
434 sprintf(
435 "%s = %s%s",
436 key,
437 result,
438 suffix
439 )
440 }
441 }
442
443 see_logical <-
444 function(x, suffix = "", xprssn = deparse1(substitute(x))) {
445 result <- as.character(as.logical(x))
446 # handle NAs and NaNs
447 result[is.na(result)] <- "NA"
448 see_kvp(
449 format = "%s",
450 key = xprssn,
451 value = result,
452 suffix = suffix
453 )
454 }
455
456 see_numeric <-
457 function(x, suffix = "", digits = 3, xprssn = deparse1(substitute(x))) {
458 if (is.numeric(digits) && is.numeric(x)) {
459 digits <- as.integer(digits)
460 digits <- min(16, max(0, digits))
461 format <- paste0("%0.", as.character(digits), "g")
462 result <- sprintf(format, x)
463 see_kvp(
464 format = "%s",
465 key = xprssn,
466 value = result,
467 suffix = suffix
468 )
469 }
470 }
471
472 see_character <-
473 function(x, suffix = "", xprssn = deparse1(substitute(x))) {
474 if (is.character(x)) {
475 see_kvp(
476 format = "%s",
477 key = xprssn,
478 value = sprintf("\"%s\"", x),
479 suffix = suffix
480 )
481 }
482 }
483
484 see_variable <-
485 function(x, suffix = "", digits = 3, xprssn = deparse1(substitute(x))) {
486 if (is.character(x)) {
487 see_character(x, suffix, xprssn)
488 } else if (is.numeric(x)) {
489 see_numeric(x, suffix, digits, xprssn)
490 } else if (is.logical(x)) {
491 see_logical(x, suffix, xprssn)
492 } else {
493 f <- file("")
494 sink(f)
495 str(x)
496 msg <- paste(readLines(f), collapse = "\n")
497 sink()
498 close(f)
499 paste0(
500 "see_variable - str(",
501 xprssn,
502 "):\n",
503 msg, "\n"
504 )
505 }
506 }
507
508 # ref: https://tug.org/texinfohtml/latex2e.html
509 # LaTeX sets aside the following characters for special purposes.
510 # For example, the percent sign % is for comments.
511 # They are called reserved characters or special characters.
512 # They are all discussed elsewhere in this manual.
513 #
514 # $ % & { } _ ~ ^ \ #
515 #
516 # If you want a reserved character to be printed as itself, in the text body
517 # font, for all but the final three characters in that list simply put
518 # a backslash \ in front of the character.
519 # Thus, typing \$1.23 will produce $1.23 in your output.
520 #
521 # As to the last three characters, to get a tilde in the text body font,
522 # use \~{} (omitting the curly braces would result in the next character
523 # receiving a tilde accent).
524 # Similarly, to get a text body font circumflex use \^{}.
525 # To get a backslash in the font of the text body enter \textbackslash{}.
526 whack_math <-
527 function(v) {
528 v <- as.character(v)
529 w <- gsub("\\", "\\textbackslash ", v, fixed = TRUE)
530 w <- Reduce(
531 f = function(l, r) {
532 gsub(r, paste0("\\", r), l, fixed = TRUE)
533 },
534 x = c("#", "$", "%", "&", "{", "}", "_"),
535 init = w
536 )
537 w <- gsub("^", "\\^{}", w, fixed = TRUE)
538 return(w)
539 if (all(v == w))
540 v
541 else
542 paste0("\\texttt{", w, "}")
543 }
544 whack_underscores <- whack_math
545
546 ## dump params to stderr (remove this eventually)
547
548 if (FALSE) nbe(see_variable(params))
549
550 ## unlist params for eventual output
551
552 param_unlist <- unlist(as.list(params))
553
554 # no need to whack underscores and dollars because this is verbatim
555 param_df <- data.frame(
556 parameter = paste0("\\verb@", names(param_unlist), "@"),
557 value = paste0(
558 "\n\\begin{tiny}\n\\verb@",
559 param_unlist,
560 "@\n\\end{tiny}"
561 )
562 )
563 param_df <- data.frame(
564 parameter = names(param_unlist),
565 value = param_unlist
566 )
567 param_df <- param_df[order(param_df$parameter), ]
568
569 ## general output control
570
571 debug_file_base_path <- params$debugFileBasePath
572 print_trace_messages <- params$printTraceMsgs
573 show_enriched_substrates <- params$showEnrichedSubstrates
574 boxplot_varwidth <- params$boxPlotVarWidth
575 boxplot_notch <- params$boxPlotNotch
576
577 ## parameters for static data
578
579 kinase_name_uprt_lut_bz2 <- params$kinaseNameUprtLutBz2
580 kinase_uprt_desc_lut_bz2 <- params$kinaseUprtDescLutBz2
581
582 ## parameters for input file
583
584 preproc_db <- params$preprocDb
585 alpha_file <- params$alphaFile
586 input_file <- params$inputFile
587
588 # First data column - ideally, this could be detected via
589 # regexSampleNames, but for now leave it as is.
590 first_data_column <- params$firstDataColumn
591 fdc_is_integer <- is.integer(first_data_column)
592 if (fdc_is_integer) {
593 first_data_column <- as.integer(params$firstDataColumn)
594 }
595
596 ## parameters for output files
597
598 ksea_app_prep_db <- params$kseaAppPrepDb
599 imputed_data_filename <- params$imputedDataFilename
600 imp_qn_lt_data_filenm <- params$imputedQNLTDataFile
601 anova_ksea_mtdt_file <- params$anovaKseaMetadata
602
603 ## parameters for imputation
604
605 # Imputation method, should be one of
606 # "random", "group-median", "median", or "mean"
607 imputation_method <- params$imputationMethod
608
609 # Selection of percentile of logvalue data to set the mean for random number
610 # generation when using random imputation
611 mean_percentile <- params$meanPercentile / 100.0
612
613 # deviation adjustment-factor for random values; real number.
614 sd_percentile <- params$sdPercentile
615
616 ## parameters for group parsing and filtering
617
618 # Regular expression of Sample Names, e.g., "\\.(\\d+)[A-Z]$"
619 regex_sample_names <- params$regexSampleNames
620 # Regular expression to extract Sample Grouping from Sample Name;
621 # if error occurs, compare smpl_trt vs. sample_name_matches
622 # to see if groupings/pairs line up
623 # e.g., "(\\d+)"
624
625 regex_sample_grouping <- params$regexSampleGrouping
626 # What are the patterns for filtering sample groups?
627 # How should sample groups be filtered?
628 # - none: do not filter
629 # - include: include sample groups matching filter
630 # - exclude: include sample groups not matching filter
631
632 sample_group_filter <- params$groupFilter
633 if (grepl("f", params$groupFilterMode, fixed = TRUE)) {
634 sample_group_filter_perl <- FALSE
635 sample_group_filter_fixed <- TRUE
636 } else if (grepl("p", params$groupFilterMode, fixed = TRUE)) {
637 sample_group_filter_perl <- TRUE
638 sample_group_filter_fixed <- FALSE
639 } else { # normal regex
640 sample_group_filter_perl <- FALSE
641 sample_group_filter_fixed <- FALSE
642 }
643
644 sample_group_filter_nocase <-
645 grepl("i", params$groupFilterMode, fixed = TRUE)
646
647 # What PCRE patterns should be included or excluded
648 group_filter_patterns_csv <- params$groupFilterPatterns
649 sample_group_filter_patterns <- strsplit(
650 x = group_filter_patterns_csv,
651 split = ",",
652 fixed = TRUE
653 )[[1]]
654
655 ## parameters for hypothesis testing
656
657 one_way_all_categories_fname <- params$oneWayManyCategories
658
659 one_way_all_categories <- try_catch_w_e(
660 match.fun(one_way_all_categories_fname))
661
662 if (!is.function(one_way_all_categories$value)) {
663 write("fatal error for parameter oneWayManyCategories:", stderr())
664 write(one_way_all_categories$value$message, stderr())
665 if (sys.nframe() > 0) {
666 cat("Cannot continue and quit() failed. Goodbye.")
667 knitr::knit_exit()
668 quit(save = "no", status = 1)
669 }
670 }
671
672 one_way_all_categories <- one_way_all_categories$value
673
674 one_way_two_categories_fname <- params$oneWayManyCategories
675 one_way_two_categories <- try_catch_w_e(
676 match.fun(one_way_two_categories_fname))
677 if (!is.function(one_way_two_categories$value)) {
678 cat("fatal error for parameter oneWayTwoCategories: \n")
679 cat(one_way_two_categories$value$message, fill = TRUE)
680 if (sys.nframe() > 0) {
681 cat("Cannot continue and quit() failed. Goodbye.")
682 knitr::knit_exit()
683 quit(save = "no", status = 1)
684 }
685 }
686 one_way_two_categories <- one_way_two_categories$value
687
688 ## parameters for KSEA
689
690 ksea_cutoff_statistic <- params$kseaCutoffStatistic
691 ksea_cutoff_threshold <- params$kseaCutoffThreshold
692 ksea_min_substrate_count <- params$kseaMinSubstrateCount
693
694 ## parameters for global variables consumed by functions
695
696 # intensityHeatmapCriteria: !r c("na_count", "p_value")[2] # TODO switch to 1
697 # TODO Validate within list
698 g_intensity_hm_criteria <- params$intensityHeatmapCriteria
699 if (is_string_null_or_empty(g_intensity_hm_criteria)) {
700 cat("invalid intensityHeatmapCriteria parameter (must be string)")
701 knitr::knit_exit()
702 }
703 switch(
704 g_intensity_hm_criteria,
705 "quality" = NULL,
706 "na_count" = NULL,
707 "p_value" = NULL,
708 {
709 with(
710 params,
711 cat(
712 sprintf(
713 "invalid %s (must be %s)",
714 see_variable(intensityHeatmapCriteria),
715 "one of quality or na_count or p_value"
716 )
717 )
718 )
719 knitr::knit_exit()
720 }
721 )
722
723 # intensityHeatmapRows: 50
724 # TODO Validate >> 0 < 75
725 g_intensity_hm_rows <- params$intensityHeatmapRows
726 if (!is.integer(g_intensity_hm_rows) || g_intensity_hm_rows < 1) {
727 cat("invalid intensityHeatmapRows (must be integer > 0)")
728 knitr::knit_exit()
729 }
730
731 g_intensity_min_per_class <- params$intensityMinValuesPerGroup
732 if (!is.integer(g_intensity_min_per_class) || g_intensity_min_per_class < 0) {
733 cat("invalid intensityMinValuesPerGroup (must be integer > -1")
734 knitr::knit_exit()
735 }
736
737 if (is.na(as.logical(g_correlate_substrates <- params$correlateSubstrates))) {
738 cat("invalid correlateSubstrates (must be TRUE or FALSE)")
739 knitr::knit_exit()
740 }
741
742 if (is.na(as.logical(g_filter_cov_var_gt_1 <- params$filterCovVarGT1))) {
743 cat("invalid filterCovVarGT1 parameter (must be TRUE or FALSE)")
744 knitr::knit_exit()
745 }
746
747 # TODO Validate >> 0 < 30
748 g_ppep_trunc_n <- params$ppepTruncN
749
750 # TODO Validate >> 0 < 30
751 g_subgene_trunc_n <- params$subgeneTruncN
752
753 # TODO Validate >> 0 < 30
754 g_sbstr_trunc_n <- params$substTruncN
755
756
757 ### OPERATORS
758
759 # Test for exclusion
760 # ref: https://www.reneshbedre.com/blog/in-operator-r.html
761 `%notin%` <- Negate(`%in%`)
762
763 # Augmented assignment
764 # ref: https://www2.cs.arizona.edu/icon/refernce/infix2.htm#aug_assign
765 `%||:=%` <- function(lvalue, ...) {
766 pf <- parent.frame()
767 rvalue <- Reduce(paste0, x = ..., init = lvalue)
768 assign(
769 x = as.character(substitute(lvalue)),
770 value = rvalue,
771 pos = pf
772 )
773 invisible(rvalue)
774 }
775
776 ### FUNCTIONS
777
778 no_op <-
779 function() {
780 }
781 # this function is not used in this file and should be removed while
782 # factoring out reusable code
783 all_apply <- function(f, v, na_rm = TRUE, ...) {
784 Reduce(
785 f = function(l, r) if (na_rm && is.na(r)) TRUE else l && r,
786 x = sapply(X = v, FUN = f, ...),
787 init = TRUE
788 )
789 }
790
791 write_debug_file <- function(data_frame) {
792 if (!is.null(debug_file_base_path)) {
793 s_path <-
794 sprintf(
795 "%s/%s.txt",
796 debug_file_base_path,
797 deparse(substitute(data_frame))
798 )
170 write.table( 799 write.table(
171 s, 800 data_frame,
172 file = s_path, 801 file = s_path,
173 sep = "\t", 802 sep = "\t",
174 col.names = TRUE, 803 col.names = TRUE,
175 row.names = TRUE, 804 row.names = TRUE,
176 quote = FALSE 805 quote = FALSE
187 # Hence, `x <- 1; get("x", new_env())` fails by design. 816 # Hence, `x <- 1; get("x", new_env())` fails by design.
188 new_env <- function() { 817 new_env <- function() {
189 new.env(parent = emptyenv()) 818 new.env(parent = emptyenv())
190 } 819 }
191 820
821 # make apply readable for rows
822 row_apply <- function(x, fun, ..., simplify = TRUE) {
823 apply(x, MARGIN = 1, fun, ..., simplify = TRUE)
824 }
825
826 # make apply readable for columns
827 column_apply <- function(x, fun, ..., simplify = TRUE) {
828 apply(x, MARGIN = 2, fun, ..., simplify = TRUE)
829 }
830
831 ##' Produce a vector of boolean values whose i-th value is TRUE when any
832 ##' member of v matches the i-th membr of s, where i in 1:seq_len(length(s))
833 ##'
834 ##' @title Search multiple strings for matches of multiple substrings
835 ##' @param v a vector of substrings to match
836 ##' @param s a vector of strings to search for matches
837 ##' @param ... additional arguments to grepl
838 ##' @return a list with keys in s and valuse that are vectors of elements of v
839 ##' @author Art Eschenlauer
840 ##' Copyright (C) 2022 Art Eschenlauer;
841 ##' MIT License; https://en.wikipedia.org/wiki/MIT_License#License_terms
842 mgrepl <- function(v, s, ...) {
843 grpl_rslt <- rep_len(0, length(s))
844 for (vi in v) {
845 grpl_rslt_v <- sapply(
846 X = s,
847 FUN = function(t) {
848 Reduce(
849 f = function(l, r) if (is.null(l)) r else c(l, r),
850 x = sapply(
851 X = vi,
852 FUN = function(f) grepl(f, t, ...)
853 ),
854 init = c()
855 )
856 },
857 simplify = "array"
858 )
859 grpl_rslt <- grpl_rslt + grpl_rslt_v
860 }
861 rslt <- unname(grpl_rslt > 0)
862 }
863
864 ##' Produce positions in a vector where succeeding value != current valus
865 ##'
866 ##' @title Search vector for neighboring positions having different values
867 ##' @param v a vector of comparable numeric values (e.g. integers)
868 ##' @return a vector of positions i where v[i] != v[i + 1]
869 ##' @author Art Eschenlauer
870 ##' Copyright (C) 2022 Art Eschenlauer;
871 ##' MIT License; https://en.wikipedia.org/wiki/MIT_License#License_terms
872 transition_positions <- function(v) {
873 Reduce(
874 f = function(l, i) if ((i != 1) && (v[i - 1] != v[i])) c(l, i - 1) else l,
875 x = seq_along(v)[-1:0],
876 init = c()
877 )
878 }
879
880 ### figure debug functions
881
882 cat_par_vector <- function(par_name, lbl = "", newlines = TRUE) {
883 cat(
884 sprintf(
885 "%spar(%s) = c(%s)%s",
886 lbl,
887 par_name,
888 paste(par(par_name), collapse = ", "),
889 if (newlines) "\n\n" else ""
890 )
891 )
892 }
893
894 cat_margins <- function(lbl = NULL) {
895 for (p in c("fig", "fin", "mar", "mai", "omd", "omi", "oma"))
896 cat_par_vector(p, if (!is.null(lbl)) paste0(lbl, " ") else NULL)
897 }
898
899 cat_variable <-
900 function(x, suffix = "", digits = 3, force_str = FALSE) {
901 xprssn <- deparse1(substitute(x))
902 if (force_str || is.matrix(x) || is.list(x) || is.data.frame(x)) {
903 cat(
904 paste0(
905 "\n\\texttt{\\textbf{",
906 whack_underscores(xprssn),
907 "}} [",
908 typeof(x),
909 ",",
910 mode(x),
911 "] =\n"
912 )
913 )
914 cat("\n\\begin{verbatim}\n")
915 str(x)
916 cat("\n\\end{verbatim}\n")
917 } else {
918 cat("\n", see_variable(x, suffix, digits, xprssn))
919 }
920 }
921
922 ### structure helper functions
923
924 # ref: staque.R - Icon-oriented stack and queue operations
925 # - https://gist.github.com/eschen42/917690355e53918b9e7ba7138a02d1f8
926 #
927 # sq_get(v):x produces the leftmost element of v and removes it from v,
928 # but produces NA if v is empty
929 sq_get <- function(v) {
930 if (length(v) == 0) return(NA)
931 assign(as.character(substitute(v)), v[-1], parent.frame())
932 return(v[1])
933 }
934 #
935 # sq_put(v,x1,...,xn):v puts x1, x2, ..., xn onto the right end of v,
936 # producing v.
937 # Values are pushed in order from left to right,
938 # so xn becomes the last (rightmost) value on v.
939 # sq_put(v) with no second argument does nothing.
940 sq_put <- function(v, x = NA, ...) {
941 pf <- parent.frame()
942 if (is.null(x)) return(pf$v)
943 if (
944 !(length(x) > 1) &&
945 !rlang::is_closure(x) &&
946 is.na(x)
947 ) return(pf$v)
948 assign(as.character(substitute(v)), c(v, x, ...), pf)
949 pf[[as.character(substitute(v))]]
950 }
951
192 ### numerical/statistical helper functions 952 ### numerical/statistical helper functions
193 953
194 any_nan <- function(x) { 954 any_nan <- function(x) {
195 !any(x == "NaN") 955 !any(x == "NaN")
196 } 956 }
199 sd_finite <- function(x) { 959 sd_finite <- function(x) {
200 ok <- is.finite(x) 960 ok <- is.finite(x)
201 sd(x[ok]) 961 sd(x[ok])
202 } 962 }
203 963
964 # compute anova raw p-value
204 anova_func <- function(x, grouping_factor, one_way_f) { 965 anova_func <- function(x, grouping_factor, one_way_f) {
205 subject <- data.frame( 966 subject <- data.frame(
206 intensity = x 967 intensity = x
207 ) 968 )
208 x_aov <- 969 x_aov <-
216 else 977 else
217 pvalue <- x_aov$p.value 978 pvalue <- x_aov$p.value
218 pvalue 979 pvalue
219 } 980 }
220 981
982 # This code adapted from matrixcalc::is.positive.definite
983 # Notably, this simply tests without calling stop()
984 is_positive_definite <- function(x, tol = 1e-08) {
985 if (!is.matrix(x))
986 return(FALSE)
987 if (!is.numeric(x))
988 return(FALSE)
989 if (nrow(x) < 1)
990 return(FALSE)
991 if (ncol(x) < 1)
992 return(FALSE)
993 if (nrow(x) != ncol(x))
994 return(FALSE)
995 sum_symm <- sum(x == t(x), na.rm = TRUE)
996 value_count <- Reduce("*", dim(x))
997 if (sum_symm != value_count)
998 return(FALSE)
999 eigenvalues <- eigen(x, only.values = TRUE)$values
1000 n <- nrow(x)
1001 for (i in 1:n) {
1002 if (abs(eigenvalues[i]) < tol) {
1003 eigenvalues[i] <- 0
1004 }
1005 }
1006 if (any(eigenvalues <= 0)) {
1007 return(FALSE)
1008 }
1009 return(TRUE)
1010 }
221 1011
222 ### LaTeX functions 1012 ### LaTeX functions
223 1013
224 latex_collapsed_vector <- function(collapse_string, v, underscore_whack = TRUE) {
225 v_sub <- if (underscore_whack) gsub("_", "\\\\_", v) else v
226 cat(
227 paste0(
228 v_sub,
229 collapse = collapse_string
230 )
231 )
232 }
233
234 latex_itemized_collapsed <- function(collapse_string, v, underscore_whack = TRUE) {
235 cat("\\begin{itemize}\n\\item ")
236 latex_collapsed_vector(collapse_string, v, underscore_whack)
237 cat("\n\\end{itemize}\n")
238 }
239
240 latex_itemized_list <- function(v, underscore_whack = TRUE) {
241 latex_itemized_collapsed("\n\\item ", v, underscore_whack)
242 }
243
244 latex_enumerated_collapsed <- function(collapse_string, v, underscore_whack = TRUE) {
245 cat("\\begin{enumerate}\n\\item ")
246 latex_collapsed_vector(collapse_string, v, underscore_whack)
247 cat("\n\\end{enumerate}\n")
248 }
249
250 latex_enumerated_list <- function(v) {
251 latex_enumerated_collapsed("\n\\item ", v)
252 }
253
254 latex_table_row <- function(v, extra = "", underscore_whack = TRUE) {
255 latex_collapsed_vector(" & ", v, underscore_whack)
256 cat(extra)
257 cat(" \\\\\n")
258 }
259
260 # Use this like print.data.frame, from which it is adapted: 1014 # Use this like print.data.frame, from which it is adapted:
261 data_frame_latex <- 1015 data_frame_table_latex <-
262 function( 1016 function(
263 x, 1017 x,
264 ...,
265 # digits to pass to format.data.frame 1018 # digits to pass to format.data.frame
266 digits = NULL, 1019 digits = NULL,
267 # TRUE -> right-justify columns; FALSE -> left-justify 1020 # TRUE -> right-justify columns; FALSE -> left-justify
268 right = TRUE, 1021 right = TRUE,
269 # maximumn number of rows to print 1022 # maximumn number of rows to print
275 # optional caption 1028 # optional caption
276 caption = NULL, 1029 caption = NULL,
277 # h(inline); b(bottom); t (top) or p (separate page) 1030 # h(inline); b(bottom); t (top) or p (separate page)
278 anchor = "h", 1031 anchor = "h",
279 # set underscore_whack to TRUE to escape underscores 1032 # set underscore_whack to TRUE to escape underscores
280 underscore_whack = TRUE 1033 underscore_whack = TRUE,
1034 # how to emit results
1035 emit = cat
281 ) { 1036 ) {
282 if (is.null(justification)) 1037 if (is.null(justification))
283 justification <- 1038 justification <-
284 Reduce( 1039 Reduce(
285 f = paste, 1040 f = paste,
286 x = rep_len(if (right) "r" else "l", length(colnames(x))) 1041 x = rep_len(if (right) "r" else "l", length(colnames(x)))
287 ) 1042 )
1043 n <- length(rownames(x))
1044 if (length(x) == 0L) {
1045 emit(
1046 sprintf(
1047 # if n is one, use singular 'row', else use plural 'rows'
1048 ngettext(
1049 n,
1050 "data frame with 0 columns and %d row",
1051 "data frame with 0 columns and %d rows"
1052 ),
1053 n
1054 ),
1055 "\n",
1056 sep = ""
1057 )
1058 } else if (n == 0L) {
1059 emit("0 rows for:\n")
1060 latex_itemized_list(
1061 v = names(x),
1062 underscore_whack = underscore_whack
1063 )
1064 } else {
1065 if (is.null(max))
1066 max <- getOption("max.print", 99999L)
1067 if (!is.finite(max)) {
1068 cat("Abend because: invalid 'max' / getOption(\"max.print\"): ", max)
1069 knitr::knit_exit()
1070 }
1071 omit <- (n0 <- max %/% length(x)) < n
1072 m <- as.matrix(
1073 format.data.frame(
1074 if (omit) x[seq_len(n0), , drop = FALSE] else x,
1075 digits = digits,
1076 na.encode = FALSE
1077 )
1078 )
1079 emit(
1080 # h(inline); b(bottom); t (top) or p (separate page)
1081 paste0("\\begin{table}[", anchor, "]"),
1082 "\\leavevmode",
1083 sep = "\n"
1084 )
1085 if (!is.null(caption))
1086 emit(paste0(" \\caption{", caption, "}"))
1087 if (centered) emit("\\centering\n")
1088 emit(
1089 paste(
1090 " \\begin{tabular}{",
1091 justification,
1092 "}\n",
1093 sep = ""
1094 )
1095 )
1096
1097 # ref for top and bottom struts (\T and \B):
1098 # https://tex.stackexchange.com/a/50355
1099 if (!is.null(caption))
1100 emit("\\B \\\\\n")
1101 latex_table_row(
1102 v = colnames(m),
1103 extra = " \\T \\B",
1104 underscore_whack = underscore_whack
1105 )
1106 emit("\\hline \\\\\n")
1107 for (i in seq_len(length(m[, 1]))) {
1108 latex_table_row(
1109 v = m[i, ],
1110 underscore_whack = underscore_whack
1111 )
1112 }
1113 emit(
1114 paste(
1115 " \\end{tabular}",
1116 "\\end{table}",
1117 sep = "\n"
1118 )
1119 )
1120 if (omit)
1121 emit(" [ reached 'max' / getOption(\"max.print\") -- omitted",
1122 n - n0, "rows ]\n")
1123 }
1124 invisible(x)
1125 }
1126
1127 # Use this like print.data.frame, from which it is adapted:
1128 data_frame_tabbing_latex <-
1129 function(
1130 x,
1131 # vector of tab stops, in inches
1132 tabstops,
1133 # vector of headings, registered with tab-stops
1134 headings = colnames(x),
1135 # digits to pass to format.data.frame
1136 digits = NULL,
1137 # maximumn number of rows to print
1138 max = NULL,
1139 # optional caption
1140 caption = NULL,
1141 # set underscore_whack to TRUE to escape underscores
1142 underscore_whack = TRUE,
1143 # flag for landscape mode
1144 landscape = FALSE,
1145 # flag indicating that subsubsection should be used for caption
1146 # rather than subsection
1147 use_subsubsection_header = TRUE,
1148 # character-size indicator; for possible values, see:
1149 # https://tug.org/texinfohtml/latex2e.html#Font-sizes
1150 charactersize = "small",
1151 # set verbatim to TRUE to debug output
1152 verbatim = FALSE
1153 ) {
1154
1155 hlinport <- if (landscape) {
1156 function() cat("\\hlinlscp \\\\\n")
1157 } else {
1158 function() cat("\\hlinport \\\\\n")
1159 }
1160
1161 tabstops_tex <-
1162 Reduce(
1163 f = function(l, r) paste0(l, r),
1164 x = sprintf("\\hspace{%0.2fin}\\=", tabstops),
1165 init = ""
1166 )
1167
288 n <- length(rownames(x)) 1168 n <- length(rownames(x))
289 if (length(x) == 0L) { 1169 if (length(x) == 0L) {
290 cat( 1170 cat(
291 sprintf( 1171 sprintf(
292 # if n is one, use singular 'row', else use plural 'rows' 1172 # if n is one, use singular 'row', else use plural 'rows'
307 underscore_whack = underscore_whack 1187 underscore_whack = underscore_whack
308 ) 1188 )
309 } else { 1189 } else {
310 if (is.null(max)) 1190 if (is.null(max))
311 max <- getOption("max.print", 99999L) 1191 max <- getOption("max.print", 99999L)
312 if (!is.finite(max)) 1192 if (!is.finite(max)) {
313 stop("invalid 'max' / getOption(\"max.print\"): ", 1193 cat("Abend because: invalid 'max' / getOption(\"max.print\"): ", max)
314 max) 1194 knitr::knit_exit()
1195 }
315 omit <- (n0 <- max %/% length(x)) < n 1196 omit <- (n0 <- max %/% length(x)) < n
316 m <- as.matrix( 1197 m <- as.matrix(
317 format.data.frame( 1198 format.data.frame(
318 if (omit) x[seq_len(n0), , drop = FALSE] else x, 1199 if (omit) x[seq_len(n0), , drop = FALSE] else x,
319 digits = digits, 1200 digits = digits,
320 na.encode = FALSE 1201 na.encode = FALSE
321 ) 1202 )
322 ) 1203 )
323 cat( 1204 if (landscape)
324 # h(inline); b(bottom); t (top) or p (separate page) 1205 cat("\n\\begin{landscape}")
325 paste0("\\begin{table}[", anchor, "]\n") 1206 tex_caption <-
326 ) 1207 if (!is.null(caption)) sprintf("\\captionof{table}{%s}\n", caption)
327 if (!is.null(caption)) 1208 else "\n"
328 cat(paste0(" \\caption{", caption, "}")) 1209 # build the column names, which have multiple lines when
329 if (centered) cat("\\centering\n") 1210 # length(headings) is a multiple of the number of columns
330 cat( 1211 column_names <- ""
331 paste( 1212 while (length(headings) > 0) {
332 " \\begin{tabular}{", 1213 my_row <- c()
333 justification, 1214 for (i in 1:(1 + length(tabstops))) {
334 "}\n", 1215 my_field <- sq_get(headings)
335 sep = "" 1216 sq_put(my_row, if (is.na(my_field)) "" else my_field)
336 ) 1217 }
337 ) 1218 column_names %||:=% latex_tabbing_row(
338 # ref: https://tex.stackexchange.com/a/50353 1219 v = my_row,
339 # Describes use of \rule{0pt}{3ex} 1220 underscore_whack = underscore_whack,
340 if (!is.null(caption)) 1221 action = paste0
341 cat("\\B \\\\ \\hline\\hline\n")
342 # ref for top and bottom struts: https://tex.stackexchange.com/a/50355
343 latex_table_row(
344 v = colnames(m),
345 extra = "\\T\\B",
346 underscore_whack = underscore_whack
347 )
348 cat("\\hline\n")
349 for (i in seq_len(length(m[, 1]))) {
350 latex_table_row(
351 v = m[i, ],
352 underscore_whack = underscore_whack
353 ) 1222 )
354 } 1223 }
1224
1225 # Begin tabbing environment after beginning charactersize environment
1226 if (verbatim) cat("\n\\begin{verbatim}")
355 cat( 1227 cat(
356 paste( 1228 paste0(
357 " \\end{tabular}", 1229 "\n\\begin{", charactersize, "}", tex_caption,
358 "\\end{table}", 1230 "\\begin{tabwrap}{", tabstops_tex, "}\n"
359 sep = "\n"
360 )
361 ) 1231 )
1232 )
1233 # emit column names
1234 cat(column_names)
1235 # emit hline
1236 hlinport()
1237 for (i in seq_len(length(m[, 1]))) {
1238 my_row <- latex_tabbing_row(
1239 v = m[i, ],
1240 underscore_whack = underscore_whack,
1241 action = paste0
1242 )
1243 if (FALSE)
1244 cat(my_row)
1245 else
1246 cat(my_row)
1247 }
1248 hlinport()
362 if (omit) 1249 if (omit)
363 cat(" [ reached 'max' / getOption(\"max.print\") -- omitted", 1250 cat(" [ reached 'max' / getOption(\"max.print\") -- omitted",
364 n - n0, "rows ]\n") 1251 n - n0, "rows ]\n")
1252 # End charactersize environment after ending tabbing environment
1253 cat(paste0("\\end{tabwrap}\n\\end{", charactersize, "}\n"))
1254 if (verbatim) cat("\\end{verbatim}\n")
1255 if (landscape)
1256 cat("\\end{landscape}\n")
365 } 1257 }
366 invisible(x) 1258 invisible(x)
1259 }
1260
1261 param_df_noexit <-
1262 function(e = NULL) {
1263 data_frame_tabbing_latex(
1264 x = param_df,
1265 tabstops = c(1.75),
1266 underscore_whack = TRUE,
1267 caption = "Input parameters",
1268 verbatim = FALSE
1269 )
1270 if (!is.null(e)) {
1271 sink(stderr())
1272 cat("Caught fatal error:\n\n")
1273 str(e)
1274 sink()
1275 }
1276 }
1277
1278 param_df_exit <-
1279 function(e = NULL) {
1280 param_df_noexit(e)
1281 knitr::knit_exit()
1282 exit(-1)
1283 }
1284
1285 # exit with exit code (default 0) and optional msg
1286 exit <-
1287 function(code = 0, msg = NULL, use_stderr = FALSE) {
1288 if (!is.null(msg)) {
1289 if (use_stderr) sink(stderr())
1290 cat("\n\n", msg, "\n\n")
1291 if (use_stderr) sink()
1292 }
1293 q(save = "no", status = code)
1294 }
1295
1296 # make control sequences into printable latex sequences
1297 latex_printable_control_seqs <-
1298 function(s) {
1299 s <- gsub("[\\]", "xyzzy_plugh", s)
1300 s <- gsub("[$]", "\\\\$", s)
1301 s <- gsub("xyzzy_plugh", "$\\\\backslash$", s)
1302 return(s)
1303 }
1304 nolatex_verbatim <-
1305 function(expr) eval(expr)
1306
1307 latex_verbatim <-
1308 function(expr) {
1309 arg_string <- deparse1(substitute(expr))
1310 cat("\n\\begin{verbatim}\n___\n")
1311 tryCatch(
1312 expr = expr,
1313 error = param_df_exit,
1314 #ACE error =
1315 #ACE function(e) {
1316 #ACE cat("Caught error:\n\n")
1317 #ACE str(e)
1318 #ACE knitr::knit_exit()
1319 #ACE stop(e)
1320 #ACE },
1321 finally = cat("...\n\\end{verbatim}\n")
1322 )
1323 }
1324
1325 latex_samepage <-
1326 function(expr) {
1327 arg_string <- deparse1(substitute(expr))
1328 cat("\n\\begin{samepage}\n")
1329 tryCatch(
1330 expr = expr,
1331 error = param_df_exit,
1332 #ACE error =
1333 #ACE function(e) {
1334 #ACE cat("Caught error:\n\n")
1335 #ACE str(e)
1336 #ACE knitr::knit_exit()
1337 #ACE stop(e)
1338 #ACE },
1339 finally = cat("\n\\end{samepage}\n")
1340 )
1341 }
1342
1343 # return the result of invocation after showing parameters
1344 # ref: https://www.r-bloggers.com/2013/08/a-new-r-trick-for-me-at-least/
1345 latex_show_invocation <-
1346 function(f, f_name = deparse1(substitute(f)), head_patch = FALSE) {
1347 function(...) {
1348 my_env <- (as.list(environment()))
1349 va <- list(...)
1350 my_rslt <- new_env()
1351 my_rslt$rslt <- NULL
1352 latex_verbatim(
1353 expr = {
1354 cat(sprintf("\n .. Local variables for '%s':\n\n", f_name))
1355 str(va)
1356 if (!head_patch) {
1357 # return this result
1358 # ref: https://www.r-bloggers.com/2013/08/a-new-r-trick-for-me-at-least/
1359 cat(sprintf("\n .. Invoking '%s'\n", f_name))
1360 tryCatch(
1361 {
1362 cat("\n\\end{verbatim}\n")
1363 rslt <- do.call(f, va)
1364 },
1365 error = param_df_exit,
1366 #ACE error = function(e) {
1367 #ACE cat("\n\\begin{verbatim}\n")
1368 #ACE str(e)
1369 #ACE cat("\n\\end{verbatim}\n")
1370 #ACE knitr::knit_exit()
1371 #ACE stop(e)
1372 #ACE },
1373 finally = cat("\n\\begin{verbatim}\n")
1374 )
1375 cat(sprintf("\n .. '%s' returned:\n", f_name))
1376 str(rslt)
1377 my_rslt$rslt <- rslt
1378 }
1379 }
1380 )
1381 # return the result of invocation with the shown parameters
1382 # ref: https://www.r-bloggers.com/2013/08/a-new-r-trick-for-me-at-least/
1383 if (head_patch) my_rslt$rslt <- do.call(f, va)
1384 (my_rslt$rslt)
1385 }
1386 }
1387
1388 latex_collapsed_vector <- function(
1389 collapse_string,
1390 v,
1391 underscore_whack = TRUE,
1392 action = cat0
1393 ) {
1394 v_sub <-
1395 if (underscore_whack) whack_underscores(v) else v
1396 action(
1397 paste0(
1398 v_sub,
1399 collapse = collapse_string
1400 )
1401 )
1402 }
1403
1404 latex_itemized_collapsed <- function(collapse_string, v, underscore_whack = TRUE) {
1405 cat("\\begin{itemize}\n\\item ")
1406 latex_collapsed_vector(collapse_string, v, underscore_whack)
1407 cat("\n\\end{itemize}\n")
1408 }
1409
1410 latex_itemized_list <- function(v, underscore_whack = TRUE) {
1411 latex_itemized_collapsed("\n\\item ", v, underscore_whack)
1412 }
1413
1414 latex_enumerated_collapsed <- function(collapse_string, v, underscore_whack = TRUE) {
1415 cat("\\begin{enumerate}\n\\item ")
1416 latex_collapsed_vector(collapse_string, v, underscore_whack)
1417 cat("\n\\end{enumerate}\n")
1418 }
1419
1420 latex_enumerated_list <- function(v) {
1421 latex_enumerated_collapsed("\n\\item ", v)
1422 }
1423
1424 latex_table_row <- function(v, extra = "", underscore_whack = TRUE) {
1425 latex_collapsed_vector(" & ", v, underscore_whack)
1426 cat(extra)
1427 cat(" \\\\\n")
1428 }
1429
1430 latex_tabbing_row <- function(
1431 v,
1432 extra = "",
1433 underscore_whack = TRUE,
1434 action = cat0
1435 ) {
1436 # latex_collapsed_vector applies action to result of paste0;
1437 # by default, action = cat;
1438 # hence, a scalar string is assigned to v_collapsed
1439 v_collapsed <-
1440 latex_collapsed_vector(
1441 "} \\> \\tabfill{",
1442 v,
1443 underscore_whack,
1444 action = paste0
1445 )
1446 action(
1447 "\\tabfill{",
1448 v_collapsed,
1449 "}",
1450 extra,
1451 " \\\\\n"
1452 )
1453 }
1454
1455 # N.B. use con = "" to emulate regular cat
1456 fcat0 <-
1457 function(..., sprtr = " ", cnnctn = file()) {
1458 cat0(..., sep = sprtr, file = cnnctn)
1459 invisible(cnnctn)
367 } 1460 }
368 1461
369 hypersub <- 1462 hypersub <-
370 function(s) { 1463 function(s) {
371 hyper <- tolower(s) 1464 hyper <- tolower(s)
372 hyper <- gsub("[^a-z0-9]+", "-", hyper) 1465 hyper <- gsub("[^a-z0-9]+", "-", hyper)
373 hyper <- gsub("[-]+", "-", hyper) 1466 hyper <- gsub("[-]+", "-", hyper)
1467 hyper <- gsub("[_]+", "-", hyper)
374 hyper <- sub("^[-]", "", hyper) 1468 hyper <- sub("^[-]", "", hyper)
375 hyper <- sub("[-]$", "", hyper) 1469 hyper <- sub("[-]$", "", hyper)
376 return(hyper) 1470 return(hyper)
377 } 1471 }
378 1472
379 subsection_header <- 1473 table_href <- function(s = "offset", caption = "") {
380 function(s) { 1474 paste0("\\hyperlink{table.\\arabic{", s, "}}{Table \\arabic{", s, "}}")
1475 }
1476
1477 table_offset <- function(i = 0, s = "offset", new = FALSE) {
1478 paste0(
1479 if (new) paste0("\\newcounter{", s, "}\n") else "",
1480 "\\setcounter{", s, "}{\\value{table}}\n",
1481 paste0(if (i > 0) rep(paste0("\\stepcounter{", s, "}"), i), "\n")
1482 )
1483 }
1484
1485 a_section_header <-
1486 function(s, prefix = "") {
381 hyper <- hypersub(s) 1487 hyper <- hypersub(s)
382 cat( 1488 my_subsection_header <- sprintf(
383 sprintf( 1489 "\\hypertarget{%s}{\\%ssection{%s}\\label{%s}}\n",
384 "\\hypertarget{%s}\n{\\subsection{%s}\\label{%s}}\n", 1490 hyper,
385 hyper, s, hyper 1491 prefix,
386 ) 1492 gsub("_", "\\_", s, fixed = TRUE),
387 ) 1493 hyper
388 } 1494 )
389 1495 my_subsection_header
390 subsubsection_header <- 1496 }
391 function(s) { 1497 section_header <- function(s) a_section_header(s, "")
392 hyper <- hypersub(s) 1498 subsection_header <- function(s) a_section_header(s, "sub")
393 cat( 1499 subsubsection_header <- function(s) a_section_header(s, "subsub")
394 sprintf(
395 "\\hypertarget{%s}\n{\\subsubsection{%s}\\label{%s}}\n",
396 hyper, s, hyper
397 )
398 )
399 }
400 1500
401 ### SQLite functions 1501 ### SQLite functions
402 1502
403 ddl_exec <- function(db, sql) { 1503 ddl_exec <- function(db, sql) {
404 discard <- DBI::dbExecute(conn = db, statement = sql) 1504 discard <- DBI::dbExecute(conn = db, statement = sql)
432 } 1532 }
433 } 1533 }
434 1534
435 ### KSEA functions and helpers 1535 ### KSEA functions and helpers
436 1536
437 # Adapted from KSEAapp::KSEA.Scores to allow retrieval of: 1537 #' The KSEA App Analysis (KSEA Kinase Scores Only)
438 # - maximum log2(FC) 1538 #'
1539 #' Compute KSEA kinase scores and statistics from phoshoproteomics data input
1540 #' Adapted from KSEAapp::KSEA.Scores to allow retrieval of maximum log2(FC)
1541 #'
1542 #' Result is an R data.frame with column names
1543 #' "Kinase.Gene", "mS", "Enrichment", "m", "z.score", "p.value", "FDR"
1544 #' "Please refer to the original Casado et al. publication for detailed
1545 #' description of these columns and what they represent:
1546 #'
1547 #' - Kinase.Gene indicates the gene name for each kinase.
1548 #' - mS represents the mean log2(fold change) of all the
1549 #' kinase's substrates.
1550 #' - Enrichment is the background-adjusted value of the kinase's mS.
1551 #' - m is the total number of detected substrates
1552 #' from the experimental dataset for each kinase.
1553 #' - z.score is the normalized score for each kinase, weighted by
1554 #' the number of identified substrates.
1555 #' - p.value represents the statistical assessment for the z.score.
1556 #' - FDR is the p-value adjusted for multiple hypothesis testing
1557 #' using the Benjamini & Hochberg method."
1558 #'
1559 #' @param ksdata the Kinase-Substrate dataset uploaded from the file
1560 #' prefaced with "PSP&NetworKIN_"
1561 #' available from github.com/casecpb/KSEA/
1562 #' @param px the experimental data file formatted as described in
1563 #' the KSEA.Complete() documentation
1564 #' @param networkin a binary input of TRUE or FALSE, indicating whether
1565 #' or not to include NetworKIN predictions, where
1566 #' \code{NetworKIN = TRUE}
1567 #' means include NetworKIN predictions
1568 #' @param networkin_cutoff a numeric value between 1 and infinity setting
1569 #' the minimum NetworKIN score
1570 #' (this can be omitted if NetworKIN = FALSE)
1571 #'
1572 #' @return creates a new R data.frame with all the KSEA kinase
1573 #' scores, along with each one's statistical
1574 #' assessment, as described herein.
1575 #'
1576 #' @references
1577 #'
1578 #' Casado et al. (2013) Sci Signal. 6(268):rs6
1579 #'
1580 #' Hornbeck et al. (2015) Nucleic Acids Res. 43:D512-20
1581 #'
1582 #' Horn et al. (2014) Nature Methods 11(6):603-4
1583 #'
439 ksea_scores <- function( 1584 ksea_scores <- function(
440
441 # For human data, typically, ksdata = KSEAapp::ksdata 1585 # For human data, typically, ksdata = KSEAapp::ksdata
442 ksdata, 1586 ksdata,
443 1587
444 # Input data file having columns: 1588 # Input data file having columns:
445 # - Protein : abbreviated protein name 1589 # - Protein : abbreviated protein name
457 # NetworKIN predictions 1601 # NetworKIN predictions
458 networkin, 1602 networkin,
459 1603
460 # A numeric value between 1 and infinity setting the minimum NetworKIN 1604 # A numeric value between 1 and infinity setting the minimum NetworKIN
461 # score (can be left out if networkin = FALSE) 1605 # score (can be left out if networkin = FALSE)
462 networkin_cutoff 1606 networkin_cutoff,
1607
1608 # Minimum substrate count, necessary to adjust the p-value appropriately.
1609 minimum_substrate_count
463 1610
464 ) { 1611 ) {
1612 # no px$FC should be <= 0, but abs(px$FC) is used below as a precaution.
465 if (length(grep(";", px$Residue.Both)) == 0) { 1613 if (length(grep(";", px$Residue.Both)) == 0) {
466 # There are no Residue.Both entries having semicolons, so new is 1614 # There are no Residue.Both entries having semicolons, so new is
467 # simply px except two columns are renamed and a column is added 1615 # simply px except two columns are renamed and a column is added
468 # for log2(abs(fold-change)) 1616 # for log2(abs(fold-change))
469 new <- px 1617 new <- px
505 # Convert any illegal values from NaN to NA 1653 # Convert any illegal values from NaN to NA
506 new[is.nan(new$log2_fc), "log2_fc"] <- NA 1654 new[is.nan(new$log2_fc), "log2_fc"] <- NA
507 # Eliminate rows having missing values (e.g., non-imputed data) 1655 # Eliminate rows having missing values (e.g., non-imputed data)
508 new <- new[complete.cases(new$log2_fc), ] 1656 new <- new[complete.cases(new$log2_fc), ]
509 } 1657 }
510 if (networkin == TRUE) { 1658 # At this point, new$log2_fc is signed according to which contrast has
511 # When NetworKIN is true, filter on NetworKIN.cutoff which includes 1659 # the greater intensity
512 # PhosphoSitePlus data *because its networkin_score is set to Inf* 1660 # To take the magnitude into account without taking the direction into
513 ksdata_filtered <- ksdata[grep("[a-z]", ksdata$Source), ] 1661 # account, set params$kseaUseAbsoluteLog2FC to TRUE
514 ksdata_filtered <- ksdata_filtered[ 1662 #
515 (ksdata_filtered$networkin_score >= networkin_cutoff), ] 1663 # Should KSEA be performed aggregating signed log2FC or absolute?
516 } else { 1664 # FALSE use raw log2FC for KSEA as for KSEAapp::KSEA.Scores
517 # Otherwise, simply use PhosphSitePlus rows 1665 if (params$kseaUseAbsoluteLog2FC) {
518 ksdata_filtered <- ksdata[ 1666 # TRUE use abs(log2FC) for KSEA as Justin requested; this is a
519 grep("PhosphoSitePlus", ksdata$Source), ] 1667 # justifiable deviation from the KSEAapp::KSEA.Scores algorithm.
520 } 1668 new$log2_fc <- abs(new$log2_fc)
521 # Join the two data.frames on common columns SUB_GENE and SUB_MOD_RSD 1669 }
1670
1671 monitor_filtration_on_stderr <- TRUE
1672 if (monitor_filtration_on_stderr) {
1673 # set to TRUE to monitor filtration on stderr
1674 sink(stderr())
1675 cat(see_variable(networkin, "\n"))
1676 }
1677 ksdata_filtered <-
1678 sqldf(
1679 sprintf("%s %s",
1680 "select * from ksdata where not Source = 'NetworKIN'",
1681 if (networkin)
1682 sprintf("or networkin_score >= %d", networkin_cutoff)
1683 else
1684 ""
1685 )
1686 )
1687 if (monitor_filtration_on_stderr) {
1688 cat(see_variable(sqldf(
1689 "select count(*), Source from ksdata group by Source"), "\n"))
1690 cat(see_variable(sqldf(
1691 "select count(*), Source from ksdata_filtered group by Source"), "\n"))
1692 sink()
1693 }
1694
1695 ############################################################################
1696 # Line numbers below refer to lines of:
1697 # https://github.com/casecpb/KSEAapp/blob/master/R/KSEA.Scores.R
1698 # I would put the original line in a comment but then lint would complain...
1699 # - Indeed, I had to rename all the variables because lint didn't like names
1700 # containing periods or capital letters.
1701 # ACE
1702 ############################################################################
1703 #
1704 # (1) Join the two data.frames on common columns SUB_GENE and SUB_MOD_RSD
522 # colnames of ksdata_filtered: 1705 # colnames of ksdata_filtered:
523 # "KINASE" "KIN_ACC_ID" "GENE" "KIN_ORGANISM" "SUBSTRATE" "SUB_GENE_ID" 1706 # "KINASE" "KIN_ACC_ID" "GENE" "KIN_ORGANISM" "SUBSTRATE" "SUB_GENE_ID"
524 # "SUB_ACC_ID" "SUB_GENE" "SUB_ORGANISM" "SUB_MOD_RSD" "SITE_GRP_ID" 1707 # "SUB_ACC_ID" "SUB_GENE" "SUB_ORGANISM" "SUB_MOD_RSD" "SITE_GRP_ID"
525 # "SITE_...7_AA" "networkin_score" "Source" 1708 # "SITE_...7_AA" "networkin_score" "Source"
526 # colnames of new: 1709 # colnames of new:
529 # SELECT a.*. b.Protein, b.Peptide, b.p, b.FC, b.log2_fc 1712 # SELECT a.*. b.Protein, b.Peptide, b.p, b.FC, b.log2_fc
530 # FROM ksdata_filtered a 1713 # FROM ksdata_filtered a
531 # INNER JOIN new b 1714 # INNER JOIN new b
532 # ON a.SUB_GENE = b.SUB_GENE 1715 # ON a.SUB_GENE = b.SUB_GENE
533 # AND a.SUB_MOD_RSD = b.SUB_MOD_RSD 1716 # AND a.SUB_MOD_RSD = b.SUB_MOD_RSD
1717 # (KSEA.Scores.R line # 105)
1718 # "Extract KSData.filtered annotations that are only found in new"
534 ksdata_dataset <- base::merge(ksdata_filtered, new) 1719 ksdata_dataset <- base::merge(ksdata_filtered, new)
535 # colnames of ksdata_dataset: 1720 # colnames of ksdata_dataset:
536 # "KINASE" "KIN_ACC_ID" "GENE" "KIN_ORGANISM" "SUBSTRATE" 1721 # "KINASE" "KIN_ACC_ID" "GENE" "KIN_ORGANISM" "SUBSTRATE"
537 # "SUB_GENE_ID" "SUB_ACC_ID" "SUB_GENE" "SUB_ORGANISM" "SUB_MOD_RSD" 1722 # "SUB_GENE_ID" "SUB_ACC_ID" "SUB_GENE" "SUB_ORGANISM" "SUB_MOD_RSD"
538 # "SITE_GRP_ID" "SITE_...7_AA" "networkin_score" "Source" "Protein" 1723 # "SITE_GRP_ID" "SITE_...7_AA" "networkin_score" "Source" "Protein"
539 # "Peptide" "p" "FC" "log2_fc" (uniprot_no_isoform) 1724 # "Peptide" "p" "FC" "log2_fc" (uniprot_no_isoform)
540 # Re-order dataset; prior to accounting for isoforms 1725 # Re-order dataset; prior to accounting for isoforms
1726 # (KSEA.Scores.R line # 106)
541 ksdata_dataset <- ksdata_dataset[order(ksdata_dataset$GENE), ] 1727 ksdata_dataset <- ksdata_dataset[order(ksdata_dataset$GENE), ]
542 # Extract non-isoform accession in UniProtKB 1728 # Extract non-isoform accession in UniProtKB
1729 # (KSEA.Scores.R line # 107)
543 ksdata_dataset$uniprot_no_isoform <- sapply( 1730 ksdata_dataset$uniprot_no_isoform <- sapply(
544 ksdata_dataset$KIN_ACC_ID, 1731 ksdata_dataset$KIN_ACC_ID,
545 function(x) unlist(strsplit(as.character(x), split = "-"))[1] 1732 function(x) unlist(strsplit(as.character(x), split = "-"))[1]
546 ) 1733 )
1734 # "last expression collapses isoforms ... for easy processing"
547 # Discard previous results while selecting interesting columns ... 1735 # Discard previous results while selecting interesting columns ...
1736 # (KSEA.Scores.R line # 110)
548 ksdata_dataset_abbrev <- ksdata_dataset[, c(5, 1, 2, 16:19, 14)] 1737 ksdata_dataset_abbrev <- ksdata_dataset[, c(5, 1, 2, 16:19, 14)]
549 # Column names are now: 1738 # Column names are now:
550 # "GENE" "SUB_GENE" "SUB_MOD_RSD" "Peptide" "p" 1739 # "GENE" "SUB_GENE" "SUB_MOD_RSD" "Peptide" "p"
551 # "FC" "log2_fc" "Source" 1740 # "FC" "log2_fc" "Source"
552 # Make column names human-readable 1741 # Make column names human-readable
1742 # (KSEA.Scores.R line # 111)
553 colnames(ksdata_dataset_abbrev) <- c( 1743 colnames(ksdata_dataset_abbrev) <- c(
554 "Kinase.Gene", "Substrate.Gene", "Substrate.Mod", "Peptide", "p", 1744 "Kinase.Gene", "Substrate.Gene", "Substrate.Mod", "Peptide", "p",
555 "FC", "log2FC", "Source" 1745 "FC", "log2FC", "Source"
556 ) 1746 )
557 # SELECT * FROM ksdata_dataset_abbrev 1747 # SELECT * FROM ksdata_dataset_abbrev
558 # ORDER BY Kinase.Gene, Substrate.Gene, Substrate.Mod, p 1748 # ORDER BY Kinase.Gene, Substrate.Gene, Substrate.Mod, p
1749 # (KSEA.Scores.R line # 112)
1750 # "Extract KSData.filtered annotations that are only found in new"
559 ksdata_dataset_abbrev <- 1751 ksdata_dataset_abbrev <-
560 ksdata_dataset_abbrev[ 1752 ksdata_dataset_abbrev[
561 order( 1753 order(
562 ksdata_dataset_abbrev$Kinase.Gene, 1754 ksdata_dataset_abbrev$Kinase.Gene,
563 ksdata_dataset_abbrev$Substrate.Gene, 1755 ksdata_dataset_abbrev$Substrate.Gene,
564 ksdata_dataset_abbrev$Substrate.Mod, 1756 ksdata_dataset_abbrev$Substrate.Mod,
565 ksdata_dataset_abbrev$p), 1757 ksdata_dataset_abbrev$p),
566 ] 1758 ]
1759 if (print_nb_messages) nbe(see_variable(ksdata_dataset_abbrev))
567 # First aggregation step to account for multiply phosphorylated peptides 1760 # First aggregation step to account for multiply phosphorylated peptides
568 # and differing peptide sequences; the goal here is to combine results 1761 # and differing peptide sequences; the goal here is to combine results
569 # for all measurements of the same substrate. 1762 # for all measurements of the same substrate.
570 # SELECT `Kinase.Gene`, `Substrate.Gene`, `Substrate.Mod`, 1763 # SELECT `Kinase.Gene`, `Substrate.Gene`, `Substrate.Mod`,
571 # `Source`, avg(log2FC) AS log2FC 1764 # `Source`, avg(log2FC) AS log2FC
573 # GROUP BY `Kinase.Gene`, `Substrate.Gene`, `Substrate.Mod`, 1766 # GROUP BY `Kinase.Gene`, `Substrate.Gene`, `Substrate.Mod`,
574 # `Source` 1767 # `Source`
575 # ORDER BY `Kinase.Gene`; 1768 # ORDER BY `Kinase.Gene`;
576 # in two steps: 1769 # in two steps:
577 # (1) compute average log_2(fold-change) 1770 # (1) compute average log_2(fold-change)
1771 # "take the mean of the log2FC amongst phosphosite duplicates"
1772 # (KSEA.Scores.R line # 115)
578 ksdata_dataset_abbrev <- aggregate( 1773 ksdata_dataset_abbrev <- aggregate(
579 log2FC ~ Kinase.Gene + Substrate.Gene + Substrate.Mod + Source, 1774 log2FC ~ Kinase.Gene + Substrate.Gene + Substrate.Mod + Source,
580 data = ksdata_dataset_abbrev, 1775 data = ksdata_dataset_abbrev,
581 FUN = mean 1776 FUN = mean
582 ) 1777 )
1778 if (print_nb_messages) nbe(see_variable(ksdata_dataset_abbrev))
583 # (2) order by Kinase.Gene 1779 # (2) order by Kinase.Gene
1780 # (KSEA.Scores.R line # 117)
584 ksdata_dataset_abbrev <- 1781 ksdata_dataset_abbrev <-
585 ksdata_dataset_abbrev[order(ksdata_dataset_abbrev$Kinase.Gene), ] 1782 ksdata_dataset_abbrev[order(ksdata_dataset_abbrev$Kinase.Gene), ]
586 # SELECT `Kinase.Gene`, count(*) 1783 # SELECT `Kinase.Gene`, count(*)
587 # FROM ksdata_dataset_abbrev 1784 # FROM ksdata_dataset_abbrev
588 # GROUP BY `Kinase.Gene`; 1785 # GROUP BY `Kinase.Gene`;
589 # in two steps: 1786 # in two steps:
590 # (1) Extract the list of Kinase.Gene names 1787 # (1) Extract the list of Kinase.Gene names
1788 # "@@@@@@@@@@@@@@@@@@@@"
1789 # "Do analysis for KSEA"
1790 # "@@@@@@@@@@@@@@@@@@@@"
1791 # (KSEA.Scores.R line # 124)
591 kinase_list <- as.vector(ksdata_dataset_abbrev$Kinase.Gene) 1792 kinase_list <- as.vector(ksdata_dataset_abbrev$Kinase.Gene)
592 # (2) Convert to a named list of counts of kinases in ksdata_dataset_abrev, 1793 # (2) Convert to a named list of counts of kinases in ksdata_dataset_abrev,
593 # named by Kinase.Gene 1794 # named by Kinase.Gene
1795 # (KSEA.Scores.R line # 125)
594 kinase_list <- as.matrix(table(kinase_list)) 1796 kinase_list <- as.matrix(table(kinase_list))
595 # Second aggregation step to account for all substrates per kinase 1797 # Second aggregation step to account for all substrates per kinase
596 # CREATE TABLE mean_fc 1798 # CREATE TABLE mean_fc
597 # AS 1799 # AS
598 # SELECT `Kinase.Gene`, avg(log2FC) AS log2FC 1800 # SELECT `Kinase.Gene`, avg(log2FC) AS log2FC
599 # FROM ksdata_dataset_abbrev 1801 # FROM ksdata_dataset_abbrev
600 # GROUP BY `Kinase.Gene` 1802 # GROUP BY `Kinase.Gene`
601 mean_fc <- aggregate( 1803 # (KSEA.Scores.R line # 127)
602 log2FC ~ Kinase.Gene, 1804 if (print_nb_messages) nb(see_variable(ksdata_dataset_abbrev), "\n")
603 data = ksdata_dataset_abbrev, 1805 mean_fc <-
604 FUN = mean 1806 aggregate(
605 )
606 # mean_fc columns: "Kinase.Gene", "log2FC"
607 if (FALSE) {
608 # I need to re-think this; I was trying to find the most-represented
609 # peptide, but that horse has already left the barn
610 # SELECT `Kinase.Gene`, max(abs(log2FC)) AS log2FC
611 # FROM ksdata_dataset_abbrev
612 # GROUP BY `Kinase.Gene`
613 max_fc <- aggregate(
614 log2FC ~ Kinase.Gene, 1807 log2FC ~ Kinase.Gene,
615 data = ksdata_dataset_abbrev, 1808 data = ksdata_dataset_abbrev,
616 FUN = function(r) max(abs(r)) 1809 FUN = mean
617 ) 1810 )
618 } 1811 if (print_nb_messages) nbe(see_variable(mean_fc), "\n")
1812
1813 # for contrast j
1814 # for each kinase i
1815 # extract log2 of fold-change (from `new` above)
1816 # (used in KSEA.Scores.R lines # 130 & 132)
1817 log2_fc_j_each_i <-
1818 new$log2_fc
1819
1820 # for contrast j
1821 # for all kinases i
1822 # compute mean of abs(log2 of fold-change)
1823 # (used in KSEA.Scores.R lines # 130)
1824 mean_abs_log2_fc_j_all_i <-
1825 mean(abs(log2_fc_j_each_i), na.rm = TRUE)
1826
1827 # for contrast j
1828 # for all kinases i
1829 # compute mean of log2 of fold-change
1830 # (used in KSEA.Scores.R lines # 132)
1831 mean_log2_fc_j_all_i <-
1832 mean(log2_fc_j_each_i, na.rm = TRUE)
1833
1834 # Reorder mean_fc, although I don't see why
1835 # (KSEA.Scores.R line 128
1836 mean_fc <- mean_fc[order(mean_fc[, 1]), ]
1837 # mean_fc columns so far: "Kinase.Gene", "log2FC"
1838 # - Kinase.Gene
1839 # indicates the gene name for each kinase.
619 1840
620 # Create column 3: mS 1841 # Create column 3: mS
621 mean_fc$m_s <- mean_fc[, 2] 1842 # - mS
1843 # represents the mean log2(fold change) of all the
1844 # kinase's substrates.
1845 # (KSEA.Scores.R line # 129)
1846 mean_fc$m_s <-
1847 mean_fc_m_s <- mean_fc[, 2]
1848
622 # Create column 4: Enrichment 1849 # Create column 4: Enrichment
623 mean_fc$enrichment <- mean_fc$m_s / abs(mean(new$log2_fc, na.rm = TRUE)) 1850 # - Enrichment
624 # Create column 5: m, count of substrates 1851 # is the background-adjusted value of the kinase's mS.
625 mean_fc$m <- kinase_list 1852 # (KSEA.Scores.R line # 130)
1853 mean_fc$enrichment <-
1854 mean_fc_m_s / mean_abs_log2_fc_j_all_i
1855
1856 # Create column 5: m, count of substrates of kinase (count of j for i)
1857 # - m
1858 # is the total number of detected substrates
1859 # from the experimental dataset for each kinase.
1860 # (KSEA.Scores.R line # 131)
1861 mean_fc$m <-
1862 mean_fc_m <- kinase_list
1863
1864
626 # Create column 6: z-score 1865 # Create column 6: z-score
627 mean_fc$z_score <- ( 1866 # - z.score
628 (mean_fc$m_s - mean(new$log2_fc, na.rm = TRUE)) * 1867 # is the normalized score for each kinase, weighted by
629 sqrt(mean_fc$m)) / sd(new$log2_fc, na.rm = TRUE) 1868 # the number of identified substrates.
1869 # (KSEA.Scores.R line # 132)
1870 mean_fc$z_score <-
1871 (mean_fc_m_s - mean_log2_fc_j_all_i) * sqrt(mean_fc_m) /
1872 sd(log2_fc_j_each_i, na.rm = TRUE)
1873
630 # Create column 7: p-value, deduced from z-score 1874 # Create column 7: p-value, deduced from z-score
631 mean_fc$p_value <- pnorm(-abs(mean_fc$z_score)) 1875 # - p.value
1876 # represents the statistical assessment for the z.score.
1877 # (KSEA.Scores.R line # 133)
1878 # "one-tailed p-value"
1879 mean_fc$p_value <-
1880 pnorm(-abs(mean_fc$z_score))
1881
1882 # zap excluded kinases; this must be done before adjusting p-value
1883 if (TRUE) {
1884 mean_fc <-
1885 mean_fc[
1886 mean_fc$m >= minimum_substrate_count,
1887 ,
1888 drop = FALSE
1889 ]
1890 }
1891
1892 #ACE nb(see_variable(nrow(mean_fc)), "\n")
632 # Create column 8: FDR, deduced by Benjamini-Hochberg adustment from p-value 1893 # Create column 8: FDR, deduced by Benjamini-Hochberg adustment from p-value
633 mean_fc$fdr <- p.adjust(mean_fc$p_value, method = "fdr") 1894 # - FDR
634 1895 # is the p-value adjusted for multiple hypothesis testing
635 # Remove log2FC column, which is duplicated as mS 1896 # using the Benjamini & Hochberg method."
636 mean_fc <- mean_fc[order(mean_fc$Kinase.Gene), -2] 1897 # (KSEA.Scores.R line # 134)
1898 mean_fc$fdr <-
1899 p.adjust(mean_fc$p_value, method = "fdr")
1900
1901 # It makes no sense to leave Z-scores negative when using
1902 # absolute value of fold-change
1903 if (params$kseaUseAbsoluteLog2FC) {
1904 mean_fc$z_score <- abs(mean_fc$z_score)
1905 }
1906
1907 # Remove second column (log2FC), which is duplicated as mS
1908 # (KSEA.Scores.R line # 136)
1909 mean_fc <-
1910 mean_fc[order(mean_fc$Kinase.Gene), -2]
637 # Correct the column names which we had to hack because of the linter... 1911 # Correct the column names which we had to hack because of the linter...
638 colnames(mean_fc) <- c( 1912 colnames(mean_fc) <- c(
639 "Kinase.Gene", "mS", "Enrichment", "m", "z.score", "p.value", "FDR" 1913 "Kinase.Gene", "mS", "Enrichment", "m", "z.score", "p.value", "FDR"
640 ) 1914 )
1915 # (KSEA.Scores.R line # 138)
641 return(mean_fc) 1916 return(mean_fc)
642 } 1917 }
643 1918
644 low_fdr_barplot <- function( 1919 ksea_low_fdr_barplot_factory <- function(
645 rslt, 1920 rslt,
646 i_cntrst, 1921 i_cntrst,
647 i, 1922 i,
648 a_level, 1923 a_level,
649 b_level, 1924 b_level,
671 k$fdr 1946 k$fdr
672 }, 1947 },
673 "p.value" = { 1948 "p.value" = {
674 k$p_value 1949 k$p_value
675 }, 1950 },
676 stop( 1951 {
677 sprintf( 1952 cat(
678 "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'", 1953 sprintf(
679 ksea_cutoff_statistic 1954 "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'",
1955 ksea_cutoff_statistic
1956 )
680 ) 1957 )
681 ) 1958 param_df_exit()
682 ) 1959 knitr::knit_exit()
1960 }
1961 )
683 1962
684 k <- k[selector < ksea_cutoff_threshold, ] 1963 k <- k[selector < ksea_cutoff_threshold, ]
685 1964 nrow_k <- nrow(k)
686 if (nrow(k) > 0) { 1965
687 op <- par(mai = c(1, 1.5, 0.4, 0.4)) 1966 #ACE nbe(see_variable(fdr_barplot_dataframe <- k))
1967
1968 if (nrow_k > 0) {
1969 max_nchar_rowname <- max(nchar(rownames(k)))
1970 my_cex_names <- 1.0 / (1 + nrow_k / 50)
1971
1972 if (print_trace_messages) cat_margins("Initially")
1973 if (print_trace_messages) cat_variable(nrow_k, "\n\n", 0)
1974 if (print_trace_messages) cat_variable(my_cex_names, "\n\n", 0)
1975 if (print_trace_messages) cat_variable(max_nchar_rowname, "\n\n", 0)
1976
1977 # fin: The figure region dimensions, (width, height), in inches.
1978 # mar: A numerical vector of the form c(bottom, left, top, right)
1979 # that gives the number of lines of margin to be specified
1980 # on the four sides of the plot; default: c(5, 4, 4, 2) + 0.1
1981
1982 # mar: The figure region dimensions, (width, height), in inches.
688 numeric_z_score <- as.numeric(k$z_score) 1983 numeric_z_score <- as.numeric(k$z_score)
689 z_score_order <- order(numeric_z_score) 1984 bar_order <- order(-as.numeric(k$p_value))
690 kinase_name <- k$kinase_gene 1985 kinase_name <- k$kinase_gene
691 long_caption <- 1986 long_caption <-
692 sprintf( 1987 sprintf(
693 "Kinase z-score, %s < %s, %s", 1988 "Kinase z-score, %s, KSEA %s < %s",
1989 caption,
694 ksea_cutoff_statistic, 1990 ksea_cutoff_statistic,
695 ksea_cutoff_threshold, 1991 ksea_cutoff_threshold
696 caption
697 ) 1992 )
698 my_cex_caption <- 65.0 / max(65.0, nchar(long_caption)) 1993 my_cex_caption <- 65.0 / max(65.0, nchar(long_caption))
699 cat("\n\\clearpage\n") 1994 # return a function that draws the plot
700 barplot( 1995 function() {
701 height = numeric_z_score[z_score_order], 1996 par_fin <- par("fin") # vector of width_in_inches and height_in_inches)
702 border = NA, 1997 op <- par(
703 xpd = FALSE, 1998 bg = if (print_trace_messages) "yellow" else "white",
704 cex.names = 1.0, 1999 fin = c(par_fin[1], min(par_fin[2], 2.5 + nrow_k / 6)),
705 main = long_caption, 2000 mar = par("mar") +
706 cex.main = my_cex_caption, 2001 c(3 / nrow_k, (1 + max_nchar_rowname * my_cex_names) / 2, 0, 0)
707 names.arg = kinase_name[z_score_order], 2002 # bottom, left, top, right
708 horiz = TRUE,
709 srt = 45,
710 las = 1,
711 cex.axis = 0.9
712 ) 2003 )
713 par(op) 2004 on.exit(par(op))
2005 if (print_trace_messages) cat_margins("Eventually")
2006
2007 barplot(
2008 height = numeric_z_score[bar_order],
2009 border = NA,
2010 xpd = FALSE,
2011 cex.names = my_cex_names,
2012 main = long_caption,
2013 cex.main = my_cex_caption,
2014 names.arg = kinase_name[bar_order],
2015 horiz = TRUE,
2016 srt = 45,
2017 las = 1,
2018 cex.axis = 0.9
2019 )
2020 }
714 } 2021 }
2022 } else {
2023 no_op
715 } 2024 }
716 } 2025 }
717 2026
718 # note that this adds elements to the global variable `ksea_asterisk_hash` 2027 # note that this adds elements to the global variable `ksea_asterisk_hash`
719 2028
720 low_fdr_print <- function( 2029 ksea_low_fdr_print <- function(
721 rslt, 2030 rslt,
722 i_cntrst, 2031 i_cntrst,
723 i, 2032 i,
724 a_level, 2033 a_level,
725 b_level, 2034 b_level,
726 fold_change, 2035 fold_change,
727 caption 2036 caption,
2037 write_db = TRUE, # if TRUE, write to DB, else print table
2038 anchor = c(const_table_anchor_p, const_table_anchor_t)
728 ) { 2039 ) {
729 rslt_score_list_i <- rslt$score_list[[i]] 2040 rslt_score_list_i <- rslt$score_list[[i]]
730 if (!is.null(rslt_score_list_i)) { 2041 if (!is.null(rslt_score_list_i)) {
731 rslt_score_list_i_nrow <- nrow(rslt_score_list_i) 2042 rslt_score_list_i_nrow <- nrow(rslt_score_list_i)
732 k <- contrast_ksea_scores <- data.frame( 2043 k <- contrast_ksea_scores <- data.frame(
748 k$fdr 2059 k$fdr
749 }, 2060 },
750 "p.value" = { 2061 "p.value" = {
751 k$p_value 2062 k$p_value
752 }, 2063 },
753 stop( 2064 {
754 sprintf( 2065 cat(
755 "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'", 2066 sprintf(
756 ksea_cutoff_statistic 2067 "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'",
2068 ksea_cutoff_statistic
2069 )
757 ) 2070 )
758 ) 2071 param_df_exit()
759 ) 2072 knitr::knit_exit()
2073 }
2074 )
760 2075
761 k <- k[selector < ksea_cutoff_threshold, ] 2076 k <- k[selector < ksea_cutoff_threshold, ]
762 # save kinase names to ksea_asterisk_hash 2077 # save kinase names to ksea_asterisk_hash
763 for (kinase_name in k$kinase_gene) { 2078 for (kinase_name in k$kinase_gene) {
764 ksea_asterisk_hash[[kinase_name]] <- 1 2079 ksea_asterisk_hash[[kinase_name]] <- 1
765 } 2080 }
766 2081
767 db_write_table_overwrite <- (i_cntrst < 2) 2082 if (write_db) {
768 db_write_table_append <- !db_write_table_overwrite 2083 db_write_table_overwrite <- (i_cntrst < 2)
769 RSQLite::dbWriteTable( 2084 db_write_table_append <- !db_write_table_overwrite
770 conn = db, 2085 RSQLite::dbWriteTable(
771 name = "contrast_ksea_scores", 2086 conn = db,
772 value = contrast_ksea_scores, 2087 name = "contrast_ksea_scores",
773 append = db_write_table_append 2088 value = contrast_ksea_scores,
774 ) 2089 append = db_write_table_append
775 selector <- switch( 2090 )
776 ksea_cutoff_statistic, 2091 ""
777 "FDR" = { 2092 } else {
778 contrast_ksea_scores$fdr 2093 selector <- switch(
779 }, 2094 ksea_cutoff_statistic,
780 "p.value" = { 2095 "FDR" = {
781 contrast_ksea_scores$p_value 2096 contrast_ksea_scores$fdr
782 }, 2097 },
783 stop( 2098 "p.value" = {
784 sprintf( 2099 contrast_ksea_scores$p_value
785 "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'", 2100 },
786 ksea_cutoff_statistic 2101 {
2102 cat(
2103 sprintf(
2104 "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'",
2105 ksea_cutoff_statistic
2106 )
2107 )
2108 param_df_exit()
2109 knitr::knit_exit()
2110 }
2111 )
2112 if (print_nb_messages) nbe(see_variable(contrast_ksea_scores))
2113 output_df <- contrast_ksea_scores[
2114 selector < ksea_cutoff_threshold,
2115 c("kinase_gene", "mean_log2_fc", "enrichment", "substrate_count",
2116 "z_score", "p_value", "fdr")
2117 ]
2118 output_df$kinase_gene <-
2119 gsub(
2120 "_",
2121 "\\\\_",
2122 output_df$kinase_gene
2123 )
2124 colnames(output_df) <-
2125 c(
2126 colnames(output_df)[1],
2127 colnames(output_df)[2],
2128 "enrichment",
2129 "m_s",
2130 "z_score",
2131 "p_value",
2132 "fdr"
2133 )
2134 #ACE output_order <- with(output_df, order(fdr))
2135 output_order <- with(output_df, order(p_value))
2136 output_df <- output_df[output_order, ]
2137
2138 output_df[, 2] <- sprintf("%0.3g", output_df[, 2])
2139 output_df$fdr <- sprintf("%0.4f", output_df$fdr)
2140 output_df$p_value <- sprintf("%0.2e", output_df$p_value)
2141 output_df$z_score <- sprintf("%0.2f", output_df$z_score)
2142 output_df$m_s <- sprintf("%d", output_df$m_s)
2143 output_df$enrichment <- sprintf("%0.3g", output_df$enrichment)
2144 output_ncol <- ncol(output_df)
2145 colnames(output_df) <-
2146 c(
2147 "Kinase",
2148 "\\(\\overline{{\\lvert}\\log_2 (\\text{fold-change}){\\rvert}}\\)",
2149 "Enrichment",
2150 "Substrates",
2151 "z-score",
2152 "p-value",
2153 "FDR"
2154 )
2155 selector <- switch(
2156 ksea_cutoff_statistic,
2157 "FDR" = {
2158 rslt$score_list[[i]]$FDR
2159 },
2160 "p.value" = {
2161 rslt$score_list[[i]]$p.value
2162 },
2163 {
2164 cat(
2165 sprintf(
2166 "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'",
2167 ksea_cutoff_statistic
2168 )
2169 )
2170 param_df_exit()
2171 knitr::knit_exit()
2172 }
2173 )
2174 if (sum(selector < ksea_cutoff_threshold) > 0) {
2175 if (print_nb_messages) nbe(see_variable(output_df))
2176 math_caption <- gsub("{", "\\{", caption, fixed = TRUE)
2177 math_caption <- gsub("}", "\\}", math_caption, fixed = TRUE)
2178 # with (
2179 # output_df,
2180 # )
2181 if (TRUE) {
2182 output_df$Kinase <- whack_underscores(output_df$Kinase)
2183 data_frame_tabbing_latex(
2184 x = output_df,
2185 # vector of tab stops, in inches
2186 tabstops = c(1.0, 1.2, 1.0, 1.0, 1.0, 1.0),
2187 # vector of headings, registered with tab-stops
2188 headings = colnames(output_df),
2189 # digits to pass to format.data.frame
2190 digits = NULL,
2191 # maximumn number of rows to print
2192 max = NULL,
2193 # optional caption
2194 caption = sprintf(
2195 "\\text{%s}, KSEA %s < %s",
2196 math_caption,
2197 ksea_cutoff_statistic,
2198 ksea_cutoff_threshold
2199 ),
2200 # set underscore_whack to TRUE to escape underscores
2201 underscore_whack = FALSE,
2202 # flag for landscape mode
2203 landscape = FALSE,
2204 # flag indicating that subsubsection should be used for caption
2205 # rather than subsection
2206 use_subsubsection_header = TRUE,
2207 # character-size indicator; for possible values, see:
2208 # https://tug.org/texinfohtml/latex2e.html#Font-sizes
2209 charactersize = "small",
2210 # set verbatim to TRUE to debug output
2211 verbatim = FALSE
2212 )
2213 } else {
2214 data_frame_table_latex(
2215 x = output_df,
2216 justification = "l c c c c c c",
2217 centered = TRUE,
2218 caption = sprintf(
2219 "\\text{%s}, KSEA %s < %s",
2220 math_caption,
2221 ksea_cutoff_statistic,
2222 ksea_cutoff_threshold
2223 ),
2224 anchor = anchor,
2225 underscore_whack = FALSE
2226 )
2227 }
2228 } else {
2229 cat(
2230 sprintf(
2231 "\\break
2232 No kinases had
2233 \\(\\text{KSEA %s}_\\text{enrichment} < %s\\)
2234 for contrast %s\\hfill\\break\n",
2235 ksea_cutoff_statistic,
2236 ksea_cutoff_threshold,
2237 caption
787 ) 2238 )
788 ) 2239 )
789 ) 2240 }
790 output_df <- contrast_ksea_scores[
791 selector < ksea_cutoff_threshold,
792 c("kinase_gene", "mean_log2_fc", "enrichment", "substrate_count",
793 "z_score", "p_value", "fdr")
794 ]
795 output_order <- with(output_df, order(mean_log2_fc, kinase_gene))
796 output_df <- output_df[output_order, ]
797 colnames(output_df) <-
798 c(
799 colnames(output_df)[1],
800 colnames(output_df)[2],
801 "enrichment",
802 "m_s",
803 "z_score",
804 "p_value",
805 "fdr"
806 )
807 output_df$fdr <- sprintf("%0.4f", output_df$fdr)
808 output_df$p_value <- sprintf("%0.2e", output_df$p_value)
809 output_df$z_score <- sprintf("%0.2f", output_df$z_score)
810 output_df$m_s <- sprintf("%d", output_df$m_s)
811 output_df$enrichment <- sprintf("%0.2f", output_df$enrichment)
812 output_ncol <- ncol(output_df)
813 colnames(output_df) <-
814 c(
815 "Kinase",
816 "\\(\\overline{\\log_2 (|\\text{fold-change}|)}\\)",
817 "Enrichment",
818 "Substrates",
819 "z-score",
820 "p-value",
821 "FDR"
822 )
823 selector <- switch(
824 ksea_cutoff_statistic,
825 "FDR" = {
826 rslt$score_list[[i]]$FDR
827 },
828 "p.value" = {
829 rslt$score_list[[i]]$p.value
830 },
831 stop(
832 sprintf(
833 "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'",
834 ksea_cutoff_statistic
835 )
836 )
837 )
838 if (sum(selector < ksea_cutoff_threshold) > 0) {
839 math_caption <- gsub("{", "\\{", caption, fixed = TRUE)
840 math_caption <- gsub("}", "\\}", math_caption, fixed = TRUE)
841 data_frame_latex(
842 x = output_df,
843 justification = "l c c c c c c",
844 centered = TRUE,
845 caption = sprintf(
846 "\\text{%s}, %s < %s",
847 math_caption,
848 ksea_cutoff_statistic,
849 ksea_cutoff_threshold
850 ),
851 anchor = const_table_anchor_p
852 )
853 } else {
854 cat(
855 sprintf(
856 "\\break
857 No kinases had
858 \\(\\text{%s}_\\text{enrichment} < %s\\)
859 for contrast %s\\hfill\\break\n",
860 ksea_cutoff_statistic,
861 ksea_cutoff_threshold,
862 caption
863 )
864 )
865 } 2241 }
2242 } else {
2243 ""
866 } 2244 }
867 } 2245 }
868 2246
869 # create_breaks is a helper for ksea_heatmap 2247 # create_breaks is a helper for ksea_heatmap
870 create_breaks <- function(merged_scores) { 2248 create_breaks <- function(merged_scores) {
905 mycol <- unique(append(mycol_neg, mycol_pos)) 2283 mycol <- unique(append(mycol_neg, mycol_pos))
906 color_breaks <- list(breaks_all, mycol) 2284 color_breaks <- list(breaks_all, mycol)
907 return(color_breaks) 2285 return(color_breaks)
908 } 2286 }
909 2287
2288 hm2plus <- function(
2289 x,
2290 mat = matrix(
2291 c(
2292 c(0, 4, 0),
2293 c(0, 3, 3),
2294 c(2, 1, 1)
2295 ),
2296 nrow = 3,
2297 ncol = 3,
2298 byrow = TRUE
2299 ),
2300 denwid = 0.5,
2301 denhgt = 0.15,
2302 widths = c(0.5, 2.5, 1.5),
2303 heights = c(0.4, 0.15, 3.95),
2304 divergent = FALSE,
2305 notecol = "grey50",
2306 trace = "none",
2307 margins = c(6, 20),
2308 srtcol = 90,
2309 srtrow = 0,
2310 density_info = "none",
2311 key_xlab = latex2exp::TeX("$log_{10}$(peptide intensity)"),
2312 key_par = list(),
2313 hclustfun = hclust,
2314 ...
2315 ) {
2316
2317 varargs <- list(...)
2318 if (FALSE) # this is to avoid commenting out code to pass linting...
2319 my_hm2 <- latex_show_invocation(heatmap.2, head_patch = FALSE)
2320 else
2321 my_hm2 <- heatmap.2
2322
2323 x <- as.matrix(x)
2324 if (sum(!is.na(x)) < 1)
2325 return(NULL)
2326 color_count <- 1 + max(64, length(as.vector(x))) # 8 was not enough
2327 break_count <- 1 + color_count
2328 min_nonax <- min(x, na.rm = TRUE)
2329 max_nonax <- max(x, na.rm = TRUE)
2330 if (print_nb_messages) nb("within hm2plus", see_variable(divergent), "\n")
2331 if (divergent) {
2332 zlim <- max(abs(min_nonax), abs(max_nonax))
2333 if (print_nb_messages) nb(see_variable(pre_zlim <- zlim, "\n"))
2334 breaks <- (zlim) / (break_count:1)
2335 if (print_nb_messages) nb(see_variable(breaks, "\n"))
2336 breaks <- breaks - median(breaks)
2337 zlim <- c(-zlim, zlim)
2338 if (print_nb_messages) nb(see_variable(zlim, "\n"))
2339 } else {
2340 zlim <- max(abs(min_nonax), abs(max_nonax))
2341 if (print_nb_messages) nb(see_variable(pre_zlim <- zlim, "\n"))
2342 breaks <- zlim / (break_count:1)
2343 if (print_nb_messages) nb(see_variable(breaks, "\n"))
2344 if (max_nonax < 0) {
2345 breaks <- breaks - zlim
2346 zlim <- c(-zlim, 0)
2347 } else {
2348 zlim <- c(0, zlim)
2349 }
2350 if (print_nb_messages) nb(see_variable(zlim, "\n"))
2351 }
2352 nonax <- x
2353 nonax[is.na(x)] <- min_nonax
2354 if (is.null(widths)) widths <- c(denwid, 4 - denwid, 1.5)
2355 if (is.null(heights)) heights <- c(0.4, denhgt, 4.0)
2356 colors <-
2357 if (divergent && min_nonax < 0) {
2358 # divergent colors on both sides of zero
2359 colorRampPalette(c("red", "white", "blue"))(color_count)
2360 } else if (divergent && min_nonax > 0) {
2361 # "divergent" colors > zero
2362 colorRampPalette(c("white", "blue"))(color_count)
2363 } else if (divergent && max_nonax < 0) {
2364 # "divergent" colors < zero
2365 colorRampPalette(c("red", "white"))(color_count)
2366 } else {
2367 # "non-divergent" colors including zero
2368 hcl.colors(color_count, "YlOrRd", rev = TRUE)
2369 }
2370
2371 #ACE if (print_nb_messages) nb("within hm2plus", see_variable(key_par), "\n")
2372 #ACE if (print_nb_messages) nb(see_variable(colors, "\n"))
2373 #ACE key_par$col = colors
2374 #ACE key_par$breaks = breaks
2375
2376 if (print_nb_messages) nb(see_variable(par(), "\n")) #ACE TODO remove me
2377 if (print_nb_messages) cat("\\leavevmode\n\\linebreak\n") #ACE TODO remove me
2378 suppressWarnings(
2379 my_hm2(
2380 x = x,
2381 col = colors,
2382 #ACE symkey = FALSE,
2383 density.info = density_info,
2384 srtCol = srtcol,
2385 srtRow = srtrow,
2386 margins = margins,
2387 lwid = widths,
2388 lhei = heights,
2389 key.title = NA,
2390 key.xlab = key_xlab,
2391 key.par = key_par,
2392 lmat = mat,
2393 notecol = notecol,
2394 trace = trace,
2395 bg = "yellow",
2396 hclustfun = hclustfun,
2397 #ACE breaks = breaks,
2398 oldstyle = FALSE,
2399 ... # varargs
2400 )
2401 )
2402 # implicitly returning value returned by heatmap.2
2403 }
2404
910 # draw_kseaapp_summary_heatmap is a helper function for ksea_heatmap 2405 # draw_kseaapp_summary_heatmap is a helper function for ksea_heatmap
911 draw_kseaapp_summary_heatmap <- function( 2406 draw_kseaapp_summary_heatmap <- function(
912 x, 2407 x, # matrix with row/col names already formatted
913 sample_cluster, 2408 sample_cluster, # a binary input of TRUE or FALSE,
914 merged_asterisk, 2409 # indicating whether or not to perform
915 my_cex_row, 2410 # hierarchical clustering of the sample columns
916 color_breaks, 2411 merged_asterisk, # matrix having dimensions of x, values "*" or ""
917 margins, 2412 color_breaks, # breaks for color gradation, from create_breaks
2413 # passed to `breaks` argument of `image`
2414 margins = c(8, 15), # two integers setting the bottom and right margins
2415 # to accommodate row and column labels
2416 master_cex = 0.7, # basis for text sizes
918 ... 2417 ...
919 ) { 2418 ) {
920 merged_scores <- x 2419 merged_scores <- x
921 if (!is.matrix(x)) { 2420 if (!is.matrix(x)) {
922 cat( 2421 cat(
924 "No plot because \\texttt{typeof(x)} is '", 2423 "No plot because \\texttt{typeof(x)} is '",
925 typeof(x), 2424 typeof(x),
926 "' rather than 'matrix'.\n\n" 2425 "' rather than 'matrix'.\n\n"
927 ) 2426 )
928 ) 2427 )
929 } else if (nrow(x) < 2) { 2428 cat_variable(x)
930 cat("No plot because matrix has ", nrow(x), " rows.\n\n")
931 return(FALSE) 2429 return(FALSE)
932 } else if (ncol(x) < 2) { 2430 }
933 cat("No plot because matrix x has ", ncol(x), " columns.\n\n") 2431 if (print_trace_messages) cat(sprintf("master_cex = %03f\n\n", master_cex))
2432 nrow_x <- nrow(x)
2433 ncol_x <- ncol(x)
2434 #if (nrow_x < 2) {
2435 if (nrow_x < 1) {
2436 cat("No plot because matrix has no rows.\n\n")
934 return(FALSE) 2437 return(FALSE)
935 } else { 2438 } else if (nrow_x < 2) {
936 my_limit <- 25 2439 cat("No plot because matrix has one row. Matrix looks like this:\n\n")
937 my_cex_col <- my_limit / (my_limit + ncol(x)) 2440 cat("\n\\begin{verbatim}\n")
938 my_cex_row <- my_limit / (my_limit + nrow(x)) 2441 print(x)
939 my_scale <- 12.0 2442 cat("\n\\end{verbatim}\n")
940 if (ncol(x) < 10 && nrow(x) < 10) 2443 return(FALSE)
941 my_scale <- my_scale * 10 / (10 - nrow(x)) * 10 / (10 - ncol(x)) 2444 } else if (ncol_x < 2) {
942 gplots::heatmap.2( 2445 cat("No plot because matrix x has ", ncol_x, " columns.\n\n")
943 x = merged_scores, 2446 cat_variable(x)
944 Colv = sample_cluster, 2447 return(FALSE)
945 breaks = color_breaks[[1]], 2448 }
946 cellnote = merged_asterisk, 2449 max_nchar_rowname <- max(nchar(rownames(x)))
947 cexCol = 0.9 * my_cex_col, 2450 max_nchar_colname <- max(nchar(colnames(x)))
948 cexRow = 2 * my_cex_row, 2451 my_limit <- g_intensity_hm_rows
949 col = color_breaks[[2]], 2452
950 density.info = "none", 2453 my_row_cex_scale <- master_cex * 150 / nrow_x
951 key = FALSE, 2454 my_col_cex_scale <- 3.0
952 lhei = c(0.4, 8.0, 1.1), 2455 my_asterisk_scale <- 0.4 * my_row_cex_scale
953 lmat = rbind(c(0, 3), c(2, 1), c(0, 4)), 2456 my_row_warp <- 1
954 lwid = c(0.5, 3), 2457 my_note_warp <- 2
955 margins = margins, 2458 my_row_warp <- 1
956 notecex = my_scale * my_cex_row * my_cex_col, 2459 my_row_cex_asterisk <-
957 notecol = "white", 2460 master_cex * my_row_warp * my_asterisk_scale
958 scale = "none", 2461
959 srtCol = 45, 2462
960 srtRow = 45, 2463 my_col_cex <- my_col_cex_scale * master_cex
961 trace = "none", 2464 my_row_cex <- min(3.5 * my_row_cex_asterisk, my_col_cex)
962 ... 2465 my_key_cex <- 1.286
963 ) 2466 my_hm2_cex <- 1 * master_cex
964 return(TRUE) 2467 my_offset <- (4.8 / (9 + nrow_x / 10)) - 0.4
965 } 2468 if (print_trace_messages) cat(sprintf("nrow_x = %03f\n\n", nrow_x))
966 } 2469 if (print_trace_messages) cat(sprintf("my_offset = %03f\n\n", my_offset))
967 2470 my_offset <- 0.05
968 # Adapted from KSEAapp::KSEA.Heatmap 2471 if (print_trace_messages) cat(sprintf("my_offset = %03f\n\n", my_offset))
2472 my_scale <- 3.0
2473 if (ncol_x < 10 && nrow_x < 10)
2474 my_scale <- my_scale * 10 / (10 - nrow_x) * 10 / (10 - ncol_x)
2475
2476 my_heights <- c(
2477 0.15,
2478 3.85 - my_offset,
2479 0.5 + my_offset
2480 )
2481 my_margins <- c(1, 1) +
2482 c(
2483 margins[1] * 0.08 * max_nchar_colname * my_col_cex,
2484 margins[2] * 0.04 * max_nchar_rowname * my_row_cex
2485 )
2486
2487 my_notecex <-
2488 my_scale *
2489 min(
2490 1.1,
2491 my_row_cex_asterisk * my_note_warp,
2492 my_col_cex * my_note_warp
2493 )
2494
2495 if (print_trace_messages) {
2496 cat_variable(my_heights, suffix = "; ")
2497 cat_variable(my_margins, suffix = "\n\n")
2498 cat_variable(my_row_cex_scale, suffix = "; ")
2499 cat_variable(my_col_cex_scale, suffix = "\n\n")
2500 cat_variable(my_row_cex_asterisk, suffix = "\n\n")
2501 cat_variable(my_row_cex, suffix = "; ")
2502 cat_variable(my_col_cex, suffix = "\n\n")
2503 cat_variable(my_row_cex, suffix = "; ")
2504 cat_variable(my_col_cex, suffix = "\n\n")
2505 }
2506
2507 hm2plus(
2508 x = merged_scores,
2509 Colv = sample_cluster,
2510 cellnote = merged_asterisk,
2511 cex = my_hm2_cex,
2512 cexCol = my_col_cex,
2513 cexRow = my_row_cex,
2514 denhgt = 0.15,
2515 density_info = "none",
2516 denwid = 0.5,
2517 divergent = TRUE,
2518 key_par = list(cex = my_key_cex),
2519 key_xlab = "Z-score",
2520 margins = my_margins,
2521 notecex = my_scale * min(
2522 1.5,
2523 my_row_cex_asterisk * my_note_warp,
2524 my_col_cex * my_note_warp
2525 ),
2526 notecol = "white",
2527 scale = "none",
2528 srtcol = 90,
2529 srtrow = 0,
2530 trace = "none",
2531 mat = matrix(
2532 c(
2533 c(0, 3, 3),
2534 c(2, 1, 1),
2535 c(0, 4, 0)
2536 ),
2537 nrow = 3,
2538 ncol = 3,
2539 byrow = TRUE
2540 ),
2541 widths = c(0.5, 3.1, 0.9),
2542 heights = my_heights,
2543 ...
2544 )
2545 return(TRUE)
2546 }
2547
2548 # function drawing heatmap of contrast fold-change for each kinase,
2549 # adapted from KSEAapp::KSEA.Heatmap
969 ksea_heatmap <- function( 2550 ksea_heatmap <- function(
970 # the data frame outputs from the KSEA.Scores() function, in list format 2551 # the data frame outputs from the KSEA.Scores() function, in list format
971 score_list, 2552 score_list,
972 # a character vector of all the sample names for heatmap annotation: 2553 # a character vector of all the sample names for heatmap annotation:
973 # - the names must be in the same order as the data in score_list 2554 # - the names must be in the same order as the data in score_list
979 # a numeric value between 0 and infinity indicating the min. number of 2560 # a numeric value between 0 and infinity indicating the min. number of
980 # substrates a kinase must have to be included in the heatmap 2561 # substrates a kinase must have to be included in the heatmap
981 m_cutoff, 2562 m_cutoff,
982 # a numeric value between 0 and 1 indicating the p-value/FDR cutoff 2563 # a numeric value between 0 and 1 indicating the p-value/FDR cutoff
983 # for indicating significant kinases in the heatmap 2564 # for indicating significant kinases in the heatmap
984 p_cutoff = 2565 p_cutoff = {
985 stop("argument 'p_cutoff' is required for function 'ksea_heatmap'"), 2566 cat("argument 'p_cutoff' is required for function 'ksea_heatmap'")
2567 param_df_exit()
2568 knitr::knit_exit()
2569 },
986 # a binary input of TRUE or FALSE, indicating whether or not to perform 2570 # a binary input of TRUE or FALSE, indicating whether or not to perform
987 # hierarchical clustering of the sample columns 2571 # hierarchical clustering of the sample columns
988 sample_cluster, 2572 sample_cluster,
989 # a binary input of TRUE or FALSE, indicating whether or not to export 2573 # a binary input of TRUE or FALSE, indicating whether or not to export
990 # the heatmap as a .png image into the working directory 2574 # the heatmap as a .png image into the working directory
991 export = FALSE, 2575 export = FALSE,
992 # bottom and right margins; adjust as needed if contrast names are too long 2576 # bottom and right margins; adjust as needehttps://tex.stackexchange.com/a/56795d if contrast names are too long
993 margins = c(6, 20), 2577 margins = c(6, 6),
994 # print which kinases? 2578 # print which kinases?
995 # - Mandatory argument, must be one of const_ksea_.*_kinases 2579 # - Mandatory argument, must be one of const_ksea_.*_kinases
996 which_kinases, 2580 which_kinases,
997 # additional arguments to gplots::heatmap.2, such as: 2581 # additional arguments to gplots::heatmap.2, such as:
998 # - main: main title of plot 2582 # - main: main title of plot
1037 } 2621 }
1038 return(new) 2622 return(new)
1039 } 2623 }
1040 merged_asterisk <- as.matrix(asterisk(merged_stats, p_cutoff)) 2624 merged_asterisk <- as.matrix(asterisk(merged_stats, p_cutoff))
1041 2625
1042 # begin hack to print only significant rows
1043 asterisk_rows <- rowSums(merged_asterisk == "*") > 0 2626 asterisk_rows <- rowSums(merged_asterisk == "*") > 0
1044 all_rows <- rownames(merged_stats) 2627 all_rows <- rownames(merged_stats)
1045 names(asterisk_rows) <- all_rows 2628 names(asterisk_rows) <- all_rows
1046 non_asterisk_rows <- names(asterisk_rows[asterisk_rows == FALSE]) 2629 non_asterisk_rows <- names(asterisk_rows[asterisk_rows == FALSE])
1047 asterisk_rows <- names(asterisk_rows[asterisk_rows == TRUE]) 2630 asterisk_rows <- names(asterisk_rows[asterisk_rows == TRUE])
1048 merged_scores_asterisk <- merged_scores[names(asterisk_rows), , drop = FALSE] 2631 merged_scores_asterisk <- merged_scores[names(asterisk_rows), , drop = FALSE]
1049 merged_scores_non_asterisk <- merged_scores[names(non_asterisk_rows), , drop = FALSE] 2632 merged_scores_non_asterisk <- merged_scores[names(non_asterisk_rows), , drop = FALSE]
1050 # end hack to print only significant rows
1051 2633
1052 row_list <- list() 2634 row_list <- list()
1053 row_list[[const_ksea_astrsk_kinases]] <- asterisk_rows 2635 row_list[[const_ksea_astrsk_kinases]] <- asterisk_rows
1054 row_list[[const_ksea_all_kinases]] <- all_rows 2636 row_list[[const_ksea_all_kinases]] <- all_rows
1055 row_list[[const_ksea_nonastrsk_kinases]] <- non_asterisk_rows 2637 row_list[[const_ksea_nonastrsk_kinases]] <- non_asterisk_rows
1060 stts <- merged_stats[my_row_names, , drop = FALSE] 2642 stts <- merged_stats[my_row_names, , drop = FALSE]
1061 merged_asterisk <- as.matrix(asterisk(stts, p_cutoff)) 2643 merged_asterisk <- as.matrix(asterisk(stts, p_cutoff))
1062 2644
1063 color_breaks <- create_breaks(scrs) 2645 color_breaks <- create_breaks(scrs)
1064 if (is.null(color_breaks)) { 2646 if (is.null(color_breaks)) {
1065 cat("No plot because matrix has too many missing values.\n\n") 2647 cat("No plot because matrix has too few rows.\n\n")
1066 return(NULL) 2648 return(NULL)
1067 } 2649 }
1068 plot_height <- nrow(scrs) ^ 0.55 2650 plot_height <- nrow(scrs) ^ 0.55
1069 plot_width <- ncol(scrs) ^ 0.7 2651 plot_width <- ncol(scrs) ^ 0.7
1070 my_cex_row <- 0.25 * 16 / plot_height
1071 if (export == "TRUE") { 2652 if (export == "TRUE") {
1072 png( 2653 png(
1073 "KSEA.Merged.Heatmap.png", 2654 "KSEA.Merged.Heatmap.png",
1074 width = plot_width * 300, 2655 width = plot_width * 300,
1075 height = 2 * plot_height * 300, 2656 height = 2 * plot_height * 300,
1079 } 2660 }
1080 did_draw <- draw_kseaapp_summary_heatmap( 2661 did_draw <- draw_kseaapp_summary_heatmap(
1081 x = scrs, 2662 x = scrs,
1082 sample_cluster = sample_cluster, 2663 sample_cluster = sample_cluster,
1083 merged_asterisk = merged_asterisk, 2664 merged_asterisk = merged_asterisk,
1084 my_cex_row = my_cex_row,
1085 color_breaks = color_breaks, 2665 color_breaks = color_breaks,
1086 margins = margins 2666 margins = margins
1087 ) 2667 )
1088 if (export == "TRUE") { 2668 if (export == "TRUE") {
1089 dev.off() 2669 dev.off()
1091 if (!did_draw) 2671 if (!did_draw)
1092 return(NULL) 2672 return(NULL)
1093 return(my_row_names) 2673 return(my_row_names)
1094 } 2674 }
1095 2675
1096 # helper for heatmaps of phosphopeptide intensities 2676 # helpers for heatmaps of phosphopeptide intensities
1097 2677
1098 draw_ppep_heatmap <- 2678 # factory producing function to truncate string after n characters
2679 trunc_n <- function(n) {
2680 function(x) {
2681 sapply(
2682 X = x,
2683 FUN = function(s) {
2684 if (is.na(s))
2685 return("NA")
2686 cond <- try_catch_w_e(nchar(s) > n)
2687 if (!is.logical(cond$value)) {
2688 return(cond$value$message)
2689 } else if (cond$value) {
2690 paste0(
2691 strtrim(s, n),
2692 "..."
2693 )
2694 } else {
2695 s
2696 }
2697 },
2698 USE.NAMES = FALSE
2699 )
2700 }
2701 }
2702 trunc_long_ppep <- function(x) trunc_n(40)(x)
2703 trunc_ppep <- function(x) trunc_n(g_ppep_trunc_n)(x)
2704 trunc_subgene <- function(x) trunc_n(g_subgene_trunc_n)(x)
2705 trunc_enriched_substrate <- function(x) trunc_n(g_sbstr_trunc_n)(x)
2706
2707 # factory producing a function that returns a covariance
2708 # matrix's rows (and columns) having variance > v_min
2709 keep_cov_w_var_gtr_min <- function(v_min) {
2710 function(x) {
2711 if (!is.matrix(x))
2712 return(NULL)
2713 keepers <- sapply(
2714 X = seq_len(nrow(x)),
2715 FUN = function(i) {
2716 if (x[i, i] < v_min)
2717 NA
2718 else
2719 x[i, i]
2720 }
2721 )
2722 names(keepers) <- rownames(x)
2723 keepers <- keepers[!is.na(keepers)]
2724 keepers <- names(keepers)
2725 if (length(keepers) == 0)
2726 return(NULL)
2727 x[keepers, keepers]
2728 }
2729 }
2730 # function that returns a matrix's rows having variance > 1
2731 keep_cov_w_var_gtr_1 <- keep_cov_w_var_gtr_min(1)
2732
2733 # factory producing a function that returns
2734 # - either a matrix's rows (rows = TRUE)
2735 # - or a matrix's columns (rows = FALSE)
2736 # having variance > v_min
2737 keep_var_gtr_min <- function(v_min) {
2738 function(x, rows = TRUE) {
2739 nrowcol <- if (rows) nrow else ncol
2740 if (!is.matrix(x))
2741 return(NULL)
2742 keepers <- sapply(
2743 X = seq_len(nrowcol(x)),
2744 FUN = function(i) {
2745 row_var <- var(
2746 if (rows) x[i, ] else x[, i],
2747 na.rm = TRUE
2748 )
2749 if (is.na(row_var) || row_var <= v_min) NA else i
2750 }
2751 )
2752 keepers <- keepers[!is.na(keepers)]
2753 if (rows) x[keepers, ] else x[, keepers]
2754 }
2755 }
2756
2757 keep_var_gtr_0 <- keep_var_gtr_min(0)
2758
2759 # function drawing heatmap of phosphopeptide intensities
2760 ppep_heatmap <-
1099 function( 2761 function(
1100 m, # matrix with rownames already formatted 2762 m, # matrix with rownames already formatted
1101 cutoff, # cutoff used by hm_heading_function 2763 cutoff, # cutoff used by hm_heading_function
1102 hm_heading_function, # construct and cat heading from m and cutoff 2764 hm_heading_function, # construct $ cat heading from m and cutoff
1103 hm_main_title, # main title for plot (drawn below heading) 2765 hm_main_title, # main title for plot (drawn below heading)
1104 suppress_row_dendrogram = TRUE, # set to false to show dendrogram 2766 suppress_row_dendrogram = TRUE, # set to false to show dendrogram
1105 max_peptide_count # experimental: 2767 max_peptide_count = # experimental:
1106 = intensity_hm_rows, # values of 50 and 75 worked well 2768 g_intensity_hm_rows, # values of 50 and 75 worked well
1107 ... # passthru parameters for heatmap 2769 master_cex = 1.0, # basis for text sizes
2770 margins = NULL, # optional margins (bottom, right)
2771 cellnote = NULL, # optional matrix of character; dim = dim(m)
2772 adj = 0.5, # adjust text: 0 left, 0.5 middle, 1 right
2773 ... # passthru to hm2plus or heatmap.2
1108 ) { 2774 ) {
2775 use_heatmap_1 <- FALSE
1109 peptide_count <- 0 2776 peptide_count <- 0
1110 # emit the heading for the heatmap 2777 # emit the heading for the heatmap
1111 if (hm_heading_function(m, cutoff)) { 2778 if (hm_heading_function(m, cutoff)) {
1112 peptide_count <- min(max_peptide_count, nrow(m)) 2779 nrow_m <- nrow(m)
1113 if (nrow(m) > 0) { 2780 peptide_count <- min(max_peptide_count, nrow_m)
2781 if (nrow_m > 1) {
1114 m_margin <- m[peptide_count:1, ] 2782 m_margin <- m[peptide_count:1, ]
1115 # Margin setting was heuristically derived 2783 # Margin was heuristically derived to accommodate the widest label
1116 margins <- 2784 row_mchar_max <- max(nchar(rownames(m_margin)))
1117 c(0.5, # col 2785 col_mchar_max <- max(nchar(colnames(m_margin)))
1118 max(80, sqrt(nchar(rownames(m_margin)))) * 5 / 16 # row 2786 row_margin <- master_cex * row_mchar_max * 2.6
1119 ) 2787 col_margin <- master_cex * col_mchar_max * 2.6
1120 } 2788 if (print_trace_messages) cat(sprintf("row_margin = %0.3f; ", row_margin))
1121 if (nrow(m) > 0) { 2789 if (print_trace_messages) cat(sprintf("col_margin = %0.3f; ", col_margin))
1122 hm_call <- NULL 2790 hm_call <- NULL
1123 tryCatch( 2791 tryCatch(
1124 { 2792 {
1125 old_oma <- par("oma") 2793 # set non-argument parameters for hm_call inner function
1126 par(cex.main = 0.6) 2794 my_row_cex <-
1127 # Heuristically determined character size adjustment formula 2795 master_cex * 200000 / (
1128 my_cex_row <- 2796 (max(nchar(rownames(m_margin)))^2) * g_intensity_hm_rows
1129 250000 / (
1130 max(4500, (nchar(rownames(m_margin)))^2) * intensity_hm_rows
1131 ) 2797 )
1132 m_hm <- m[peptide_count:1, , drop = FALSE] 2798 m_hm <- m[peptide_count:1, , drop = FALSE]
1133 my_limit <- 60 2799 if (is.null(cellnote)) {
1134 my_cex_col <- 0.75 * my_limit / (my_limit + ncol(m_hm)) 2800 cellnote <- matrix("", nrow = nrow(m_hm), ncol = ncol(m_hm))
2801 cellnote[is.na(m_hm)] <- "NA"
2802 } else {
2803 cellnote <- cellnote[peptide_count:1, , drop = FALSE]
2804 }
2805 m_hm[is.na(m_hm)] <- 0
2806 nrow_m_hm <- nrow(m_hm)
2807 ncol_m_hm <- ncol(m_hm)
2808 if (nrow_m_hm < 1 || ncol_m_hm < 1)
2809 return(peptide_count) # return zero as initialized above
2810 my_limit <- g_intensity_hm_rows
2811
2812
2813 my_row_cex <- master_cex * (100 / (2 + row_mchar_max))
2814 my_col_cex <- master_cex * 6 * row_margin / col_margin
2815 my_col_adj <- min(my_col_cex, my_row_cex) / my_col_cex
2816 my_col_cex <- min(my_col_cex, my_row_cex)
2817 col_margin <- sqrt(my_col_adj) * col_margin
2818 if (print_trace_messages) cat(sprintf("my_row_cex = %0.3f; ", my_row_cex))
2819 if (print_trace_messages) cat(sprintf("my_col_cex = %0.3f; ", my_col_cex))
2820 if (is.null(margins)) my_margins <-
2821 c(
2822 (my_col_cex + col_margin), # col
2823 (my_row_cex + row_margin) / my_row_cex # row
2824 )
2825 else
2826 my_margins <- margins
2827
2828 if (print_trace_messages) cat(
2829 sprintf(
2830 "my_margins = c(%s)\n\n",
2831 paste(my_margins, collapse = ", ")
2832 )
2833 )
2834 my_hm2_cex <- 2 * master_cex
2835 my_key_cex <- 0.9 - 0.1 * (g_intensity_hm_rows + nrow_m_hm) / g_intensity_hm_rows
2836 my_key_warp <- 1.5 * 22.75 / row_margin
2837 my_key_cex <- min(1.10, my_key_warp * my_key_cex)
2838 my_hgt_scale <- 3.70 - 0.4 * (max(1, 0.9 * my_row_cex) - 1)
2839 my_hgt_scale <- 3.75 # 3.615
2840 my_hgt_scale <- 3.60 # 3.615
2841 if (print_trace_messages)
2842 cat_variable(my_hgt_scale, "\n\n", 3)
2843 my_warp <- max(0.1, 1.4 * (7.5 + nrow_m) / g_intensity_hm_rows)
2844 if (print_trace_messages)
2845 cat_variable(my_warp, "\n\n", 3)
2846 # added 0.9 heuristically...
2847 my_plot_height <-
2848 (0.566 + 0.354 * (nrow_m / g_intensity_hm_rows)) *
2849 min(my_hgt_scale, my_hgt_scale * my_warp)
2850 my_plot_height <- min(3.65, my_plot_height * g_intensity_hm_rows / 50)
2851 my_heights <- c(
2852 0.3, # title and top dendrogram
2853 my_plot_height, # plot and bottom margin
2854 4.15 - my_hgt_scale, # legend
2855 0.05 + my_hgt_scale - my_plot_height # whitespace below legend
2856 )
2857 my_note_cex <- min(0.8, my_row_cex, my_col_cex)
2858 if (print_trace_messages) {
2859 cat_variable(my_plot_height, "\n\n", 3)
2860 cat_variable(4.19 - my_hgt_scale, "\n\n", 3)
2861 cat_variable(nrow_m_hm, "; ", 0)
2862 cat_variable(ncol_m_hm, "; ", 0)
2863 cat_variable(my_row_cex, "; ", 3)
2864 cat_variable(my_col_cex, "; ", 3)
2865 cat_variable(my_note_cex, "; ", 3)
2866 cat_variable(my_key_cex, "\n\n", 3)
2867 cat_variable(my_hgt_scale, "; ", 3)
2868 cat_variable(my_plot_height, "; ", 3)
2869 cat_variable(my_warp, "\n\n", 3)
2870 cat_variable(my_heights, "; ", 2)
2871 cat_variable(sum(my_heights), "\n\n", 3)
2872 }
2873
2874 # define hm_call inner function
1135 hm_call <- function(x, scaling, title) { 2875 hm_call <- function(x, scaling, title) {
1136 heatmap( 2876 my_cex_main <- min(5.0, 220 / nchar(title))
1137 x, 2877 op <- par(
1138 Rowv = if (suppress_row_dendrogram) NA else NULL, 2878 cex.main = my_cex_main * master_cex,
1139 Colv = NA, 2879 adj = adj
1140 cexRow = my_cex_row, 2880 )
1141 cexCol = my_cex_col, 2881 if (
1142 scale = scaling, 2882 !is.null(
1143 margins = margins, 2883 hm2plus(
1144 main = title, 2884 x,
1145 xlab = "", 2885 Colv = NA,
1146 las = 1, 2886 Rowv = TRUE,
1147 ... 2887 cexRow = my_row_cex,
2888 cexCol = my_col_cex,
2889 dendrogram = "row",
2890 las = 1,
2891 main = title,
2892 key_xlab = latex2exp::TeX("$log_{10}$(peptide intensity)"),
2893 cex = my_hm2_cex,
2894 key_par = list(cex = my_key_cex),
2895 margins = my_margins,
2896 widths = c(0.4, 2.6, 1.5),
2897 heights = my_heights,
2898 mat = matrix(
2899 c(
2900 c(0, 3, 3),
2901 c(2, 1, 1),
2902 c(0, 4, 0),
2903 c(0, 0, 0)
2904 ),
2905 nrow = 4,
2906 ncol = 3,
2907 byrow = TRUE
2908 ),
2909 na.rm = TRUE,
2910 scale = scaling,
2911 srtcol = 90,
2912 srtrow = 0,
2913 xlab = "",
2914 cellnote = cellnote,
2915 notecex = my_note_cex,
2916 ...
2917 )
1148 ) 2918 )
2919 ) {
2920 if (print_trace_messages) cat(
2921 sprintf(
2922 "my_heights = c(%s); sum = %0.3f\n\n",
2923 paste(
2924 sprintf("%0.3f", my_heights),
2925 collapse = ", "
2926 ),
2927 sum(my_heights)
2928 )
2929 )
2930 if (print_trace_messages) cat(
2931 sprintf("my_key_cex = %0.3f\n\n",
2932 my_key_cex)
2933 )
2934 if (print_trace_messages) cat(
2935 sprintf("my_key_cex/my_heights[3] = %0.3f\n\n",
2936 my_key_cex / my_heights[3])
2937 )
2938 if (print_trace_messages) cat(
2939 sprintf("my_heights[2]/my_heights[3] = %0.3f\n\n",
2940 my_heights[2] / my_heights[3])
2941 )
2942 }
2943 par(op)
1149 } 2944 }
2945
2946 # invoke hm_call inner function
1150 if (sum(rowSums(!is.na(m_hm)) < 2)) 2947 if (sum(rowSums(!is.na(m_hm)) < 2))
1151 hm_call( 2948 hm_call(
1152 m_hm, 2949 m_hm,
1153 "none", 2950 "none",
1154 "log(intensities), unscaled, unimputed, and unnormalized" 2951 "log(intensities), unscaled, unimputed, and unnormalized"
1168 if (nrow(m_hm) > 1) 2965 if (nrow(m_hm) > 1)
1169 hm_call( 2966 hm_call(
1170 m_hm, 2967 m_hm,
1171 "none", 2968 "none",
1172 paste( 2969 paste(
1173 "log(intensities), unscaled, unimputed,", 2970 "log(intensities), unscaled,",
1174 "NAs zeroed, unnormalized" 2971 "zero-imputed, unnormalized"
1175 ) 2972 )
1176 ) 2973 )
1177 else 2974 else
1178 cat("\nThere are too few peptides to produce a heatmap.\n") 2975 cat("\nThere are too few peptides to produce a heatmap.\n")
1179 }, 2976 },
1180 error = function(r) { 2977 error = function(r) {
1181 cat( 2978 cat(
1182 sprintf( 2979 sprintf(
1183 "\n%s %s Internal message: %s\n", 2980 "\n%s %s Internal message: %s\n\\newline\n\n",
1184 "Could not draw heatmap,", 2981 "Failure drawing heatmap,",
1185 "possibly because of too many missing values.", 2982 "possibly because of too many missing values.\n\\newline\n\n",
1186 r$message 2983 r$message
1187 ) 2984 )
1188 ) 2985 )
2986 cat_margins()
1189 } 2987 }
1190 ) 2988 )
1191 } else { 2989 } else {
1192 cat( 2990 cat(
1193 "\nCould not draw heatmap, possibly because of too many missing values.\n" 2991 "\nFailure drawing heatmap, possibly because of too many missing values.\n"
1194 ) 2992 )
1195 } 2993 }
1196 }, 2994 }
1197 finally = par(old_oma)
1198 ) 2995 )
1199 } 2996 }
1200 } 2997 }
1201 return(peptide_count) 2998 # return value:
1202 } 2999 peptide_count
3000 }
3001
3002 # function drawing heatmap of correlations if they exist, else covariances
3003 cov_heatmap <-
3004 function(
3005 m, # matrix with rownames already formatted
3006 top_substrates = FALSE,
3007 ... # passthru to hm2plus or heatmap.2
3008 ) {
3009 if (print_nb_messages) nbe(see_variable(m), " [", nrow(m), "x", ncol(m), "\n")
3010 #ACE nb(rowSums(m, na.rm = TRUE))
3011 #ACE bad_rows <- (rowSums(m, na.rm = TRUE) == 0)
3012 #ACE nb(see_variable(bad_rows))
3013 #ACE m <- m[-bad_rows, , drop = FALSE]
3014 colnames_m <- colnames(m)
3015 is_na_m <- is.na(m)
3016 tmp <- m
3017 tmp[is_na_m] <- 0
3018
3019 tmp <- m[, 0 < colSums(x = tmp)] # by default, na.rm is FALSE
3020
3021 colnames_tmp <- colnames(tmp)
3022
3023 my_low_p_seq <- seq(
3024 from = min(g_intensity_hm_rows, nrow(m)),
3025 to = 1,
3026 by = -1
3027 )
3028
3029 if (g_correlate_substrates) {
3030 # zap samples having zero or near-zero variance
3031 tmp[is.na(tmp)] <- 0
3032 nzv <- caret::nearZeroVar(
3033 tmp, # matrix of values, samples x variables
3034 freqCut = 1.01, # min(freq most prevalent value /
3035 # freq second most prevalent)
3036 uniqueCut = 99 # max(number of unique values /
3037 # total number of samples)
3038 )
3039 tmp <- if (length(nzv) > 0) {
3040 m[, -nzv, drop = FALSE]
3041 } else {
3042 m
3043 }
3044 } else {
3045 tmp <- m[my_low_p_seq, , drop = FALSE]
3046 }
3047
3048
3049 t_m <- t(tmp)
3050 t_m[is.na(t_m)] <- 0
3051 prefiltered_nrow <- ncol(t_m)
3052
3053 my_corcov <- cov(t_m)
3054 did_filter_rows <- did_filter_cols <- FALSE
3055 if (g_correlate_substrates && !is_positive_definite(my_corcov)) {
3056 my_correlate_substrates <- FALSE
3057 t_m <- t(m[my_low_p_seq, , drop = FALSE])
3058 t_m[is.na(t_m)] <- 0
3059 unfiltered_row_count <- ncol(t_m)
3060 unfiltered_col_count <- nrow(t_m)
3061
3062 # zap empty samples
3063 t_m <- t_m[0 < rowSums(x = t_m), ]
3064 # zap substrates present in fewer than two samples
3065 foo <- t_m > 0
3066 foo <- colSums(x = foo) > 1
3067 t_m <- t_m[, foo]
3068
3069 did_filter_rows <- unfiltered_row_count > ncol(t_m)
3070 did_filter_cols <- unfiltered_col_count > nrow(t_m)
3071
3072 colnames_tmp <- rownames(t_m)
3073 my_corcov <- cov(t_m)
3074 if (g_filter_cov_var_gt_1) {
3075 my_corcov <- keep_cov_w_var_gtr_1(my_corcov)
3076 }
3077 } else if (g_correlate_substrates) {
3078 my_corcov <- cov2cor(my_corcov)
3079 my_correlate_substrates <- TRUE
3080 } else {
3081 my_correlate_substrates <- FALSE
3082 if (g_filter_cov_var_gt_1) my_corcov <- keep_cov_w_var_gtr_1(my_corcov)
3083 }
3084
3085 omitted_samples <- colnames_m[colnames_m %notin% colnames_tmp]
3086 suffix <- if (length(omitted_samples) > 1) "s" else ""
3087
3088 f_omissions <-
3089 function(is_corr) {
3090 cat(
3091 sprintf(
3092 "Below is the %s plot for %s substrates",
3093 if (is_corr) "correlation" else "covariance",
3094 sprintf(
3095 if (top_substrates)
3096 "%0.0f \"highest-quality\""
3097 else
3098 "%0.0f",
3099 ncol(t_m)
3100 )
3101 )
3102 )
3103 if (did_filter_cols) {
3104 cat(sprintf(", omitting sample%s ", suffix))
3105 latex_collapsed_vector(", ", omitted_samples)
3106 }
3107 cat(".\n\n")
3108 }
3109
3110 if (is.null(my_corcov) || sum(!is.na(t_m)) < 2) {
3111 cat(
3112 sprintf(
3113 "\\newline\n%s %s plot.\n",
3114 "Insufficient covariance to produce",
3115 if (my_correlate_substrates)
3116 "correlation"
3117 else
3118 "covariance"
3119 ),
3120 "\\newpage\n"
3121 )
3122 return(NULL)
3123 }
3124
3125 cat("\\leavevmode\n", "\\newpage\n")
3126 f_omissions(my_correlate_substrates)
3127
3128 master_cex <- 0.4
3129 max_nchar <- max(nchar(rownames(t_m)))
3130 my_limit <- g_intensity_hm_rows
3131 diminution <- sqrt(my_limit / (my_limit + ncol(t_m)))
3132 my_row_cex <-
3133 my_col_cex <-
3134 min(1.75, master_cex * 9 * diminution ^ 1.5)
3135 my_margin <- 3 + my_row_cex * 64 / (8 + max_nchar)
3136 my_key_cex <- 1.4
3137 my_hm2_cex <- 1.0 * master_cex
3138 my_hgt_scale <- 3.50 - 0.26 * (max(0.4, my_key_cex) - 0.4)
3139 my_hgt_scale <- 2.7
3140
3141 my_legend_height <- 4.0 - my_hgt_scale
3142 my_legend_height <- 0.5 * my_key_cex
3143 my_warp <- 0.65 * (my_limit + ncol(t_m)) / my_limit
3144 my_warp <- 0.8
3145 my_legend_height <- 0.77
3146 my_legend_height <- 0.67
3147 my_plot_height <- my_hgt_scale + (1 - my_warp) * my_legend_height
3148 my_legend_height <- my_warp * my_legend_height
3149
3150 parjust <- par(adj = 0.5)
3151 on.exit(par(parjust))
3152 my_corcov <- my_corcov[order(rownames(my_corcov)), ]
3153 my_main <-
3154 sprintf("%s among %s substrates %s",
3155 if (my_correlate_substrates) "Correlation"
3156 else "Covariance",
3157 kinase_name,
3158 if (!my_correlate_substrates &&
3159 g_filter_cov_var_gt_1 &&
3160 did_filter_rows
3161 )
3162 "having variance > 1"
3163 else ""
3164 )
3165 my_main_nchar <- nchar(my_main)
3166 my_heights <- c(
3167 0.3,
3168 my_plot_height,
3169 my_legend_height # was 4.0 - my_hgt_scale # was 4.19
3170 )
3171 if (print_trace_messages) cat(sprintf("max_nchar = %0.3f; ", max_nchar))
3172 if (print_trace_messages) cat(sprintf("my_margin = %0.3f; ", my_margin))
3173 if (print_trace_messages) cat(sprintf("my_plot_height = %0.3f\n\n", my_plot_height))
3174 if (print_trace_messages) cat(sprintf("master_cex = %0.3f; ", master_cex))
3175 if (print_trace_messages) cat(sprintf("my_row_cex = %0.3f; ", my_row_cex))
3176 if (print_trace_messages) cat(sprintf("my_col_cex = %0.3f; ", my_col_cex))
3177 if (print_trace_messages) cat(sprintf("my_key_cex = %0.3f\n\n", my_key_cex))
3178 if (print_trace_messages) cat(sprintf("my_hgt_scale = %0.3f\n\n", my_hgt_scale))
3179 if (print_trace_messages) cat(sprintf("legend height = %0.3f\n\n", my_legend_height))
3180 if (print_trace_messages) cat(
3181 sprintf(
3182 "my_heights = c(%s); sum = %0.3f\n\n",
3183 paste(
3184 sprintf("%0.3f", my_heights),
3185 collapse = ", "
3186 ),
3187 sum(my_heights)
3188 )
3189 )
3190 op <- par(cex.main = (30 + my_main_nchar) / my_main_nchar)
3191 on.exit(par(op))
3192 hm2plus(
3193 x = my_corcov,
3194 cex = my_hm2_cex,
3195 cexCol = my_col_cex,
3196 cexRow = my_row_cex,
3197 density_info = "none",
3198 denhgt = 0.15,
3199 denwid = 0.5,
3200 divergent = TRUE,
3201 key_par = list(cex = my_key_cex),
3202 key_xlab = if (my_correlate_substrates) "Correlation"
3203 else "Covariance",
3204 main = my_main,
3205 mat = matrix(
3206 c(
3207 c(0, 3, 3),
3208 c(2, 1, 1),
3209 c(0, 4, 0)
3210 ),
3211 nrow = 3,
3212 ncol = 3,
3213 byrow = TRUE
3214 ),
3215 heights = my_heights,
3216 margins = c(my_margin, my_margin),
3217 widths = c(0.5, 3.1, 0.9),
3218 scale = "none",
3219 symkey = TRUE,
3220 symbreaks = TRUE,
3221 symm = FALSE #TODO evaluate TRUE
3222 # ...
3223 )
3224 } # end cov_heatmap
3225
3226 ### FILE IMPORT
3227
3228 # function reading bzipped file to data.frame
3229 bzip2df <- function(d, f, ctor = bzfile) {
3230 # read.delim file (by default, compressed by bzip2)
3231 if (file.exists(f)) {
3232 conn <- NULL
3233 pf <- parent.frame()
3234 tryCatch(
3235 assign(
3236 as.character(substitute(d)),
3237 read.delim(conn <- bzfile(f, open = "r")),
3238 pf
3239 ),
3240 finally = if (!is.null(conn)) close(conn)
3241 )
3242 }
3243 }
3244
1203 ``` 3245 ```
1204 3246
1205 ```{r, echo = FALSE, fig.dim = c(9, 10), results = 'asis'}
1206 cat("\\listoftables\n")
1207 ```
1208 # Purpose 3247 # Purpose
1209 3248
1210 To perform for phosphopeptides: 3249 The purpose of this analysis is to perform for phosphopeptides:
1211 3250
1212 - imputation of missing values, 3251 - imputation of missing values,
1213 - quantile normalization, 3252 - quantile normalization,
1214 - ANOVA (using the R stats::`r params$oneWayManyCategories` function), and 3253 - ANOVA (using the R stats::`r params$oneWayManyCategories` function),
3254 - assignment of an FDR-adjusted $p$-value and a "quality score" to each phosphopeptide, and
1215 - KSEA (Kinase-Substrate Enrichment Analysis) using code adapted from the CRAN `KSEAapp` package to search for kinase substrates from the following databases: 3255 - KSEA (Kinase-Substrate Enrichment Analysis) using code adapted from the CRAN `KSEAapp` package to search for kinase substrates from the following databases:
1216 - PhosphoSitesPlus [https://www.phosphosite.org](https://www.phosphosite.org) 3256 - PhosphoSitesPlus [https://www.phosphosite.org](https://www.phosphosite.org)
1217 - The Human Proteome Database [http://hprd.org](http://hprd.org) 3257 - The Human Proteome Database [http://hprd.org](http://hprd.org)
1218 - NetworKIN [http://networkin.science/](http://networkin.science/) 3258 - NetworKIN [http://networkin.science/](http://networkin.science/)
1219 - Phosida [http://pegasus.biochem.mpg.de/phosida/help/motifs.aspx](http://pegasus.biochem.mpg.de/phosida/help/motifs.aspx) 3259 - Phosida [http://pegasus.biochem.mpg.de/phosida/help/motifs.aspx](http://pegasus.biochem.mpg.de/phosida/help/motifs.aspx)
1220 3260
1221 ```{r include = FALSE} 3261 ```{r include = FALSE}
1222 3262
1223 ### GLOBAL VARIABLES 3263 if (params$kseaUseAbsoluteLog2FC) {
1224 3264 sfc <- "|s|"
1225 # parameters for KSEA 3265 pfc <- "|p|"
1226 3266 pfc_txt <- "$\\text{absolute value}({\\log_2 (\\text{fold-change})})$"
1227 ksea_cutoff_statistic <- params$kseaCutoffStatistic 3267 } else {
1228 ksea_cutoff_threshold <- params$kseaCutoffThreshold 3268 sfc <- "s"
1229 ksea_min_kinase_count <- params$kseaMinKinaseCount 3269 pfc <- "p"
3270 pfc_txt <- "${\\log_2 (\\text{fold-change}})$"
3271 }
1230 3272
1231 ksea_heatmap_titles <- list() 3273 ksea_heatmap_titles <- list()
1232 ksea_heatmap_titles[[const_ksea_astrsk_kinases]] <- 3274 ksea_heatmap_titles[[const_ksea_astrsk_kinases]] <-
1233 sprintf( 3275 sprintf(
1234 "Summary for all kinases enriched in one or more contrasts at %s < %s", 3276 "Summary for all kinases enriched in one or more contrasts at %s < %s",
1244 ksea_cutoff_threshold 3286 ksea_cutoff_threshold
1245 ) 3287 )
1246 # hash to hold names of significantly enriched kinases 3288 # hash to hold names of significantly enriched kinases
1247 ksea_asterisk_hash <- new_env() 3289 ksea_asterisk_hash <- new_env()
1248 3290
1249 # READ PARAMETERS (mostly) 3291 # PROCESS (mostly read) PARAMETERS
1250
1251 intensity_hm_rows <- params$intensityHeatmapRows
1252 # Input Filename
1253 input_file <- params$inputFile
1254
1255 # First data column - ideally, this could be detected via regexSampleNames,
1256 # but for now leave it as is.
1257 first_data_column <- params$firstDataColumn
1258 fdc_is_integer <- is.integer(first_data_column)
1259 if (fdc_is_integer) {
1260 first_data_column <- as.integer(params$firstDataColumn)
1261 }
1262 3292
1263 # False discovery rate adjustment for ANOVA 3293 # False discovery rate adjustment for ANOVA
1264 # Since pY abundance is low, set to 0.10 and 0.20 in addition to 0.05 3294 # Since pY abundance is low, set to 0.10 and 0.20 in addition to 0.05
1265 val_fdr <- 3295 val_fdr <- read.table(file = alpha_file, sep = "\t", header = FALSE, quote = "")
1266 read.table(file = params$alphaFile, sep = "\t", header = FALSE, quote = "")
1267 3296
1268 if ( 3297 if (
1269 ncol(val_fdr) != 1 || 3298 ncol(val_fdr) != 1 ||
1270 sum(!is.numeric(val_fdr[, 1])) || 3299 sum(!is.numeric(val_fdr[, 1])) ||
1271 sum(val_fdr[, 1] < 0) || 3300 sum(val_fdr[, 1] < 0) ||
1272 sum(val_fdr[, 1] > 1) 3301 sum(val_fdr[, 1] > 1)
1273 ) { 3302 ) {
1274 stop("alphaFile should be one column of numbers within the range [0.0,1.0]") 3303 cat("alphaFile should be one column of numbers within the range [0.0,1.0]")
3304 param_df_exit()
3305 knitr::knit_exit()
1275 } 3306 }
1276 val_fdr <- val_fdr[, 1] 3307 val_fdr <- val_fdr[, 1]
1277 3308
1278 #Imputed Data filename
1279 imputed_data_filename <- params$imputedDataFilename
1280 imp_qn_lt_data_filenm <- params$imputedQNLTDataFile
1281 anova_ksea_mtdt_file <- params$anovaKseaMetadata
1282
1283 ``` 3309 ```
1284 3310
1285 ```{r echo = FALSE} 3311 ```{r echo = FALSE, results = 'asis'}
1286 # Imputation method, should be one of 3312
1287 # "random", "group-median", "median", or "mean" 3313
1288 imputation_method <- params$imputationMethod
1289
1290 # Selection of percentile of logvalue data to set the mean for random number
1291 # generation when using random imputation
1292 mean_percentile <- params$meanPercentile / 100.0
1293
1294 # deviation adjustment-factor for random values; real number.
1295 sd_percentile <- params$sdPercentile
1296
1297 # Regular expression of Sample Names, e.g., "\\.(\\d+)[A-Z]$"
1298 regex_sample_names <- params$regexSampleNames
1299
1300 # Regular expression to extract Sample Grouping from Sample Name;
1301 # if error occurs, compare sample_treatment_levels vs. sample_name_matches
1302 # to see if groupings/pairs line up
1303 # e.g., "(\\d+)"
1304 regex_sample_grouping <- params$regexSampleGrouping
1305
1306 one_way_all_categories_fname <- params$oneWayManyCategories
1307 one_way_all_categories <- try_catch_w_e(
1308 match.fun(one_way_all_categories_fname))
1309 if (!is.function(one_way_all_categories$value)) {
1310 write("fatal error for parameter oneWayManyCategories:", stderr())
1311 write(one_way_all_categories$value$message, stderr())
1312 if (sys.nframe() > 0) quit(save = "no", status = 1)
1313 stop("Cannot continue. Goodbye.")
1314 }
1315 one_way_all_categories <- one_way_all_categories$value
1316
1317 one_way_two_categories_fname <- params$oneWayManyCategories
1318 one_way_two_categories <- try_catch_w_e(
1319 match.fun(one_way_two_categories_fname))
1320 if (!is.function(one_way_two_categories$value)) {
1321 cat("fatal error for parameter oneWayTwoCategories: \n")
1322 cat(one_way_two_categories$value$message, fill = TRUE)
1323 if (sys.nframe() > 0) quit(save = "no", status = 1)
1324 stop("Cannot continue. Goodbye.")
1325 }
1326 one_way_two_categories <- one_way_two_categories$value
1327
1328 preproc_db <- params$preprocDb
1329 ksea_app_prep_db <- params$kseaAppPrepDb
1330 result <- file.copy( 3314 result <- file.copy(
1331 from = preproc_db, 3315 from = preproc_db,
1332 to = ksea_app_prep_db, 3316 to = ksea_app_prep_db,
1333 overwrite = TRUE 3317 overwrite = TRUE
1334 ) 3318 )
1339 preproc_db, 3323 preproc_db,
1340 ksea_app_prep_db, 3324 ksea_app_prep_db,
1341 ), 3325 ),
1342 stderr() 3326 stderr()
1343 ) 3327 )
1344 if (sys.nframe() > 0) quit(save = "no", status = 1) 3328 if (sys.nframe() > 0) {
1345 stop("Cannot continue. Goodbye.") 3329 cat("Cannot continue and quit() failed. Goodbye.")
1346 } 3330 param_df_exit()
3331 knitr::knit_exit()
3332 # in case knit_exit doesn't exit
3333 quit(save = "no", status = 1)
3334 }
3335 }
3336
3337 if (FALSE) {
3338 write.table(x = param_df, file = "test-data/params.txt")
3339 }
3340
1347 ``` 3341 ```
1348 3342
1349 ```{r echo = FALSE} 3343 ```{r echo = FALSE}
1350 ### READ DATA 3344 ### READ DATA
1351 3345
1356 sep = "\t", 3350 sep = "\t",
1357 header = TRUE, 3351 header = TRUE,
1358 quote = "", 3352 quote = "",
1359 check.names = FALSE 3353 check.names = FALSE
1360 ) 3354 )
3355
1361 ``` 3356 ```
1362 3357
1363 # Extract Sample Classes and Names 3358
1364 3359 # Extraction of Sample Classes and Names from Input Data
1365 Column names parsed from input file are shown in Table 1; sample classes and names, in Table 2.
1366 3360
1367 ```{r echo = FALSE, results = 'asis'} 3361 ```{r echo = FALSE, results = 'asis'}
1368 3362
1369 data_column_indices <- grep(first_data_column, names(full_data), perl = TRUE) 3363 data_column_indices <- grep(first_data_column, names(full_data), perl = TRUE)
3364 my_column_names <- names(full_data)
1370 3365
1371 if (!fdc_is_integer) { 3366 if (!fdc_is_integer) {
1372 if (length(data_column_indices) > 0) { 3367 if (length(data_column_indices) > 0) {
1373 first_data_column <- data_column_indices[1] 3368 first_data_column <- data_column_indices[1]
1374 } else { 3369 } else {
1375 stop(paste("failed to convert firstDataColumn:", first_data_column)) 3370 cat(paste("failed to convert firstDataColumn:", first_data_column))
3371 param_df_exit()
3372 knitr::knit_exit()
1376 } 3373 }
1377 } 3374 }
1378 3375
1379 cat( 3376 cat(
1380 sprintf( 3377 sprintf(
1381 paste( 3378 paste(
1382 "\n\nThe input data file has peptide-intensity data for each sample", 3379 "\n\nThe input data file has peptide-intensity data",
1383 "in one of columns %d through %d.\n\n" 3380 "in columns %d (\"%s\") through %d (\"%s\")."
1384 ), 3381 ),
1385 min(data_column_indices), 3382 tmp <- min(data_column_indices),
1386 max(data_column_indices) 3383 my_column_names[tmp],
1387 ) 3384 tmp <- max(data_column_indices),
1388 ) 3385 my_column_names[tmp]
1389 3386 )
1390 # Write column names as a LaTeX enumerated list. 3387 )
1391 column_name_df <- data.frame( 3388
1392 column = seq_len(length(colnames(full_data))), 3389 if (TRUE) {
1393 name = paste0("\\verb@", colnames(full_data), "@") 3390 cat0(
1394 ) 3391 table_offset(i = 1, new = TRUE),
1395 cat("\n\\begin{tiny}\n") 3392 "Sample classes and names are shown in ",
1396 data_frame_latex( 3393 table_href(),
1397 x = column_name_df, 3394 ".\n\n"
1398 justification = "l l", 3395 )
1399 centered = TRUE, 3396 } else {
1400 caption = "Input data column names", 3397 cat0(
1401 anchor = const_table_anchor_bp, 3398 "\\newcounter{offset}\n",
1402 underscore_whack = FALSE 3399 "\\setcounter{offset}{\\value{table}}\n",
1403 ) 3400 "\\stepcounter{offset}\n",
1404 cat("\n\\end{tiny}\n") 3401 "Sample classes and names are shown in ",
3402 table_href(),
3403 ".\n\n"
3404 )
3405 }
3406
3407 #TODO remove this unused variable and assignment
3408 if (FALSE) {
3409 # Write column names as a LaTeX enumerated list.
3410 column_name_df <- data.frame(
3411 column = seq_len(length(colnames(full_data))),
3412 name = paste0("\\verb@", colnames(full_data), "@")
3413 )
3414 }
1405 3415
1406 ``` 3416 ```
1407 3417
1408 ```{r echo = FALSE, results = 'asis'} 3418 ```{r echo = FALSE, results = 'asis'}
3419 # extract intensity columns from full_data to quant_data
1409 quant_data <- full_data[first_data_column:length(full_data)] 3420 quant_data <- full_data[first_data_column:length(full_data)]
1410 quant_data[quant_data == 0] <- NA 3421 quant_data[quant_data == 0] <- NA
1411 rownames(quant_data) <- rownames(full_data) <- full_data$Phosphopeptide 3422 rownames(quant_data) <- rownames(full_data) <- full_data$Phosphopeptide
3423 full_data_names <- colnames(quant_data)
1412 # Extract factors and trt-replicates using regular expressions. 3424 # Extract factors and trt-replicates using regular expressions.
1413 # Typically: 3425 # Typically:
1414 # regex_sample_names is "\\.\\d+[A-Z]$" 3426 # regex_sample_names is "\\.\\d+[A-Z]$"
1415 # regex_sample_grouping is "\\d+" 3427 # regex_sample_grouping is "\\d+"
1416 # This would distinguish trt-replicates by terminal letter [A-Z] 3428 # This would distinguish trt-replicates by terminal letter [A-Z]
1424 colnames(quant_data) <- sample_name_matches 3436 colnames(quant_data) <- sample_name_matches
1425 3437
1426 write_debug_file(quant_data) 3438 write_debug_file(quant_data)
1427 3439
1428 rx_match <- regexpr(regex_sample_grouping, sample_name_matches, perl = TRUE) 3440 rx_match <- regexpr(regex_sample_grouping, sample_name_matches, perl = TRUE)
1429 sample_treatment_levels <- as.factor(regmatches(sample_name_matches, rx_match)) 3441 smpl_trt <- as.factor(regmatches(sample_name_matches, rx_match))
3442
3443 if (print_nb_messages) nbe(see_variable(smpl_trt, "\n\n"))
3444 if (print_nb_messages) nbe(see_variable(sample_name_matches, "\n\n"))
3445 if (print_nb_messages) nbe(see_variable(full_data_names, "\n\n"))
3446
3447 sample_treatment_df <-
3448 save_sample_treatment_df <-
3449 data.frame(
3450 class = smpl_trt,
3451 sample = sample_name_matches,
3452 full_sample_names = full_data_names
3453 )
3454
3455 if (print_nb_messages) nbe(see_variable(sample_treatment_df, "\n\n"))
3456
3457 # reorder data
3458 my_order <- with(sample_treatment_df, order(class, sample))
3459 quant_data <- quant_data[, my_order]
3460 sample_name_matches <- sample_name_matches[my_order]
3461 smpl_trt <- smpl_trt[my_order]
3462 sample_treatment_df <- data.frame(
3463 class = smpl_trt,
3464 sample = sample_name_matches
3465 )
3466
3467 # filter smpl_trt as appropriate
3468 if (sample_group_filter %in% c("include", "exclude")) {
3469 include_sample <-
3470 mgrepl(
3471 v = sample_group_filter_patterns,
3472 s = as.character(smpl_trt),
3473 fixed = sample_group_filter_fixed,
3474 perl = sample_group_filter_perl,
3475 ignore.case = sample_group_filter_nocase
3476 )
3477 if (sum(include_sample) < 2) {
3478 errmsg <-
3479 paste(
3480 "ERROR:",
3481 sum(include_sample),
3482 "samples are too few for analysis;",
3483 "check input parameters for sample-name parsing"
3484 )
3485 cat0(
3486 errmsg,
3487 "\\stepcounter{offset}\n",
3488 " in ",
3489 table_href(),
3490 ".\n\n"
3491 )
3492 data_frame_tabbing_latex(
3493 x = save_sample_treatment_df,
3494 tabstops = c(1.25, 1.25),
3495 caption = "Sample classes",
3496 use_subsubsection_header = FALSE
3497 )
3498 data_frame_tabbing_latex(
3499 x =
3500 param_df[
3501 c("regexSampleNames",
3502 "regexSampleGrouping",
3503 "groupFilterPatterns",
3504 "groupFilter",
3505 "groupFilterMode"
3506 ),
3507 ],
3508 tabstops = c(1.75),
3509 underscore_whack = TRUE,
3510 caption = "Input parameters for sample-name parsing",
3511 verbatim = FALSE
3512 )
3513 param_df_exit()
3514 knitr::knit_exit()
3515 return(invisible(-1))
3516 }
3517 sample_treatment_df <-
3518 if (sample_group_filter == "include")
3519 sample_treatment_df[include_sample, ]
3520 else
3521 sample_treatment_df[!include_sample, ]
3522 } else {
3523 include_sample <- rep.int(TRUE, length(smpl_trt))
3524 }
3525 sample_name_matches <- sample_treatment_df$sample
3526 rx_match <- regexpr(regex_sample_grouping, sample_name_matches, perl = TRUE)
3527 smpl_trt <- as.factor(regmatches(sample_name_matches, rx_match))
3528 sample_treatment_df$class <- smpl_trt
3529
3530 # filter quant_data as appropriate
1430 number_of_samples <- length(sample_name_matches) 3531 number_of_samples <- length(sample_name_matches)
1431 sample_treatment_df <- data.frame( 3532 quant_data <- quant_data[, sample_name_matches]
1432 class = sample_treatment_levels, 3533
1433 sample = sample_name_matches 3534 sample_level_integers <- as.integer(smpl_trt)
1434 ) 3535 sample_treatment_levels <- levels(smpl_trt)
1435 # reorder data 3536 count_of_treatment_levels <- length(sample_treatment_levels)
1436 if (TRUE) { 3537
1437 my_order <- with(sample_treatment_df, order(class, sample)) 3538 # for each phosphopeptide, across treatment levels, compute minimum
1438 quant_data <- quant_data[, my_order] 3539 # count of observed (i.e., non-missing) values
1439 sample_name_matches <- sample_name_matches[my_order] 3540 my_env <- new_env()
1440 sample_treatment_levels <- sample_treatment_levels[my_order] 3541 for (l in sample_treatment_levels)
1441 } 3542 my_env[[as.character(l)]] <-
1442 sample_treatment_df <- data.frame( 3543 as.vector(rowSums(!is.na(quant_data[l == smpl_trt])))
1443 class = sample_treatment_levels, 3544 min_group_obs_count <- row_apply(
1444 sample = sample_name_matches 3545 x = Reduce(
1445 ) 3546 f = function(l, r) cbind(l, my_env[[r]]),
1446 data_frame_latex( 3547 x = sample_treatment_levels,
3548 init = c()
3549 ),
3550 fun = min
3551 )
3552 names(min_group_obs_count) <- rownames(quant_data)
3553 rm(my_env)
3554
3555 # display (possibly-filtered) results
3556 cat("\\newpage\n")
3557
3558 if (sum(include_sample) > 1) {
3559 data_frame_tabbing_latex(
1447 x = sample_treatment_df, 3560 x = sample_treatment_df,
1448 justification = "rp{0.2\\linewidth} lp{0.3\\linewidth}", 3561 tabstops = c(1.25),
1449 centered = TRUE,
1450 caption = "Sample classes", 3562 caption = "Sample classes",
1451 anchor = const_table_anchor_tbp, 3563 use_subsubsection_header = FALSE
1452 underscore_whack = FALSE 3564 )
1453 ) 3565 }
1454 sample_name_shrink <- 10 / (10 + max(nchar(sample_name_matches))) 3566 sample_name_grow <- 10 / (10 + max(nchar(sample_name_matches) + 6))
3567 sample_colsep <- transition_positions(as.integer(sample_treatment_df$class))
1455 ``` 3568 ```
1456 3569
1457 ```{r echo = FALSE, results = 'asis'} 3570 ```{r echo = FALSE, results = 'asis'}
1458 cat("\\newpage\n") 3571 cat("\\newpage\n")
1459 ``` 3572 ```
1460 3573
1461 ## Are the log-transformed sample distributions similar? 3574 ## Are the log-transformed sample distributions similar?
1462 3575
1463 ```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'} 3576 ```{r echo = FALSE, fig.dim = c(9, 6.5), results = 'asis'}
1464 3577
1465 quant_data[quant_data == 0] <- NA #replace 0 with NA 3578 quant_data[quant_data == 0] <- NA #replace 0 with NA
1466 quant_data_log <- log10(quant_data) 3579 quant_data_log <- log10(quant_data)
1467 3580
1468 rownames(quant_data_log) <- rownames(quant_data) 3581 rownames(quant_data_log) <- rownames(quant_data)
1469 colnames(quant_data_log) <- sample_name_matches 3582 colnames(quant_data_log) <- sample_name_matches
1470 3583
1471 write_debug_file(quant_data_log) 3584 write_debug_file(quant_data_log)
1472 3585
1473 # data visualization 3586 g_ppep_distrib_ctl <- new_env()
1474 old_par <- par( 3587 g_ppep_distrib_ctl$xlab_line <- 3.5 + 11.86 * (0.67 - sample_name_grow)
1475 mai = par("mai") + c(0.5, 0, 0, 0) 3588 g_ppep_distrib_ctl$mai_bottom <- (0.5 + 3.95 * (0.67 - sample_name_grow))
1476 ) 3589 g_ppep_distrib_ctl$axis <- (0.6 + 0.925 * (0.67 - sample_name_grow))
1477 # ref: https://r-charts.com/distribution/add-points-boxplot/ 3590
1478 # Vertical plot 3591 my_ppep_distrib_bxp <- function(
1479 boxplot( 3592 x
1480 quant_data_log 3593 , sample_name_grow = sample_name_grow
1481 , las = 2 3594 , main
1482 , cex.axis = 0.9 * sample_name_shrink 3595 , varwidth = FALSE
1483 , col = const_boxplot_fill 3596 , sub = NULL
1484 , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") 3597 , xlab
1485 , xlab = "Sample" 3598 , ylab
1486 ) 3599 , col = const_boxplot_fill
1487 par(old_par) 3600 , notch = FALSE
1488 3601 , ppep_distrib_ctl = g_ppep_distrib_ctl
1489 3602 , ...
1490 3603 ) {
1491 cat("\n\n\n") 3604 my_xlab_line <- g_ppep_distrib_ctl$xlab_line
1492 cat("\n\n\n") 3605 my_mai_bottom <- g_ppep_distrib_ctl$mai_bottom
3606 my_axis <- g_ppep_distrib_ctl$axis
3607
3608 if (print_trace_messages) {
3609 cat_variable(my_xlab_line, suffix = "; ")
3610 cat_variable(my_mai_bottom, suffix = "; ")
3611 cat_variable(my_axis, suffix = "\n\n")
3612 }
3613
3614 old_par <- par(
3615 mai = par("mai") + c(my_mai_bottom, 0, 0, 0),
3616 cex.axis = my_axis,
3617 cex.lab = 1.2
3618 )
3619 tryCatch(
3620 {
3621 # Vertical plot
3622 boxplot(
3623 x
3624 , las = 2
3625 , col = col
3626 , main = main
3627 , sub = NULL
3628 , ylab = ylab
3629 , xlab = NULL
3630 , notch = notch
3631 , varwidth = varwidth
3632 , ...
3633 )
3634 title(
3635 sub = sub
3636 , cex.sub = 1.0
3637 , line = my_xlab_line + 1
3638 )
3639 title(
3640 xlab = xlab
3641 , line = my_xlab_line
3642 )
3643 },
3644 finally = par(old_par)
3645 )
3646 }
3647
3648 my_ppep_distrib_bxp(
3649 x = quant_data_log
3650 , sample_name_grow = sample_name_grow
3651 , main = "Peptide intensities for each sample"
3652 , varwidth = boxplot_varwidth
3653 , sub = "Box widths reflect number of peptides for sample"
3654 , xlab = "Sample"
3655 , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)")
3656 , col = const_boxplot_fill
3657 , notch = FALSE
3658 )
3659
3660 cat("\n\n\n\n")
1493 3661
1494 ``` 3662 ```
1495 3663
1496 ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 4), results = 'asis'} 3664 ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 4), results = 'asis'}
1497 if (nrow(quant_data_log) > 1) { 3665 if (nrow(quant_data_log) > 1) {
1536 main = latex2exp::TeX("Frequency vs. $log_{10}$(peptide intensity)"), 3704 main = latex2exp::TeX("Frequency vs. $log_{10}$(peptide intensity)"),
1537 xlab = latex2exp::TeX("$log_{10}$(peptide intensity)") 3705 xlab = latex2exp::TeX("$log_{10}$(peptide intensity)")
1538 ) 3706 )
1539 ``` 3707 ```
1540 3708
3709 # Characterization of Input Data
3710
1541 ## Distribution of standard deviations of $log_{10}(\text{intensity})$, ignoring missing values 3711 ## Distribution of standard deviations of $log_{10}(\text{intensity})$, ignoring missing values
1542 3712
1543 ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 5), results = 'asis'} 3713 ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 5), results = 'asis'}
1544 # determine quantile 3714 # determine quantile
1545 q1 <- quantile(logvalues, probs = mean_percentile)[1] 3715 q1 <- quantile(logvalues, probs = mean_percentile)[1]
1546 3716
1547 # 1 = row of matrix (ie, phosphopeptide) 3717 # 1 = row of matrix (ie, phosphopeptide)
1548 sds <- apply(quant_data_log, 1, sd_finite) 3718 sds <- row_apply(quant_data_log, sd_finite)
1549 if (sum(!is.na(sds)) > 2) { 3719 if (sum(!is.na(sds)) > 2) {
1550 plot( 3720 plot(
1551 density(sds, na.rm = TRUE) 3721 density(sds, na.rm = TRUE)
1552 , main = "Smoothed estimated probability density vs. std. deviation" 3722 , main = "Smoothed estimated probability density vs. std. deviation"
1553 , sub = "(probability estimation made with Gaussian smoothing)" 3723 , sub = "(probability estimation made with Gaussian smoothing)"
1577 ```{r echo = FALSE} 3747 ```{r echo = FALSE}
1578 3748
1579 # prep for trt-median based imputation 3749 # prep for trt-median based imputation
1580 3750
1581 ``` 3751 ```
1582 # Impute Missing Values 3752 # Imputation of Missing Values
1583 3753
1584 ```{r echo = FALSE} 3754 ```{r echo = FALSE}
1585 3755
1586 imp_smry_pot_peptides_before <- nrow(quant_data_log) 3756 imp_smry_pot_peptides_before <- nrow(quant_data_log)
1587 imp_smry_missing_values_before <- number_to_impute 3757 imp_smry_missing_values_before <- number_to_impute
1608 quant_data_imp <- quant_data 3778 quant_data_imp <- quant_data
1609 imputation_method_description <- 3779 imputation_method_description <-
1610 paste("Substitute missing value with", 3780 paste("Substitute missing value with",
1611 "median peptide-intensity for sample group.\n" 3781 "median peptide-intensity for sample group.\n"
1612 ) 3782 )
1613 sample_level_integers <- as.integer(sample_treatment_levels)
1614 # Take the accurate ln(x+1) because the data are log-normally distributed 3783 # Take the accurate ln(x+1) because the data are log-normally distributed
1615 # and because median can involve an average of two measurements. 3784 # and because median can involve an average of two measurements.
1616 quant_data_imp <- log1p(quant_data_imp) 3785 quant_data_imp <- log1p(quant_data_imp)
1617 for (i in seq_len(length(levels(sample_treatment_levels)))) { 3786 for (i in seq_len(count_of_treatment_levels)) {
1618 # Determine the columns for this factor-level 3787 # Determine the columns for this factor-level
1619 level_cols <- i == sample_level_integers 3788 level_cols <- i == sample_level_integers
1620 # Extract those columns 3789 # Extract those columns
1621 lvlsbst <- quant_data_imp[, level_cols, drop = FALSE] 3790 lvlsbst <- quant_data_imp[, level_cols, drop = FALSE]
1622 # assign to ind the row-column pairs corresponding to each NA 3791 # assign to ind the row-column pairs corresponding to each NA
1623 ind <- which(is.na(lvlsbst), arr.ind = TRUE) 3792 ind <- which(is.na(lvlsbst), arr.ind = TRUE)
1624 # No group-median exists if there is only one sample 3793 # No group-median exists if there is only one sample
1625 # a given ppep has no measurement; otherwise, proceed. 3794 # a given ppep has no measurement; otherwise, proceed.
1626 if (ncol(lvlsbst) > 1) { 3795 if (ncol(lvlsbst) > 1) {
1627 the_centers <- 3796 the_centers <-
1628 apply(lvlsbst, 1, median, na.rm = TRUE) 3797 row_apply(lvlsbst, median, na.rm = TRUE)
1629 for (j in seq_len(nrow(lvlsbst))) { 3798 for (j in seq_len(nrow(lvlsbst))) {
1630 for (k in seq_len(ncol(lvlsbst))) { 3799 for (k in seq_len(ncol(lvlsbst))) {
1631 if (is.na(lvlsbst[j, k])) { 3800 if (is.na(lvlsbst[j, k])) {
1632 lvlsbst[j, k] <- the_centers[j] 3801 lvlsbst[j, k] <- the_centers[j]
1633 } 3802 }
1647 "median peptide-intensity across all sample classes.\n" 3816 "median peptide-intensity across all sample classes.\n"
1648 ) 3817 )
1649 # Take the accurate ln(x+1) because the data are log-normally distributed 3818 # Take the accurate ln(x+1) because the data are log-normally distributed
1650 # and because median can involve an average of two measurements. 3819 # and because median can involve an average of two measurements.
1651 quant_data_imp <- log1p(quant_data_imp) 3820 quant_data_imp <- log1p(quant_data_imp)
1652 quant_data_imp[ind] <- apply(quant_data_imp, 1, median, na.rm = TRUE)[ind[, 1]] 3821 quant_data_imp[ind] <- row_apply(quant_data_imp, median, na.rm = TRUE)[ind[, 1]]
1653 # Take the accurate e^x - 1 to match scaling of original input. 3822 # Take the accurate e^x - 1 to match scaling of original input.
1654 quant_data_imp <- round(expm1(quant_data_imp_ln <- quant_data_imp)) 3823 quant_data_imp <- round(expm1(quant_data_imp_ln <- quant_data_imp))
1655 good_rows <- !is.nan(rowMeans(quant_data_imp)) 3824 good_rows <- !is.nan(rowMeans(quant_data_imp))
1656 } 3825 }
1657 , "mean" = { 3826 , "mean" = {
1663 # Take the accurate ln(x+1) because the data are log-normally distributed, 3832 # Take the accurate ln(x+1) because the data are log-normally distributed,
1664 # so arguments to mean should be previously transformed. 3833 # so arguments to mean should be previously transformed.
1665 # this will have to be 3834 # this will have to be
1666 quant_data_imp <- log1p(quant_data_imp) 3835 quant_data_imp <- log1p(quant_data_imp)
1667 # Assign to NA cells the mean for the row 3836 # Assign to NA cells the mean for the row
1668 quant_data_imp[ind] <- apply(quant_data_imp, 1, mean, na.rm = TRUE)[ind[, 1]] 3837 quant_data_imp[ind] <- row_apply(quant_data_imp, mean, na.rm = TRUE)[ind[, 1]]
1669 # Take the accurate e^x - 1 to match scaling of original input. 3838 # Take the accurate e^x - 1 to match scaling of original input.
1670 quant_data_imp <- round(expm1(quant_data_imp_ln <- quant_data_imp)) 3839 quant_data_imp <- round(expm1(quant_data_imp_ln <- quant_data_imp))
1671 good_rows <- !is.nan(rowMeans(quant_data_imp)) 3840 good_rows <- !is.nan(rowMeans(quant_data_imp))
1672 } 3841 }
1673 , "random" = { 3842 , "random" = {
1707 ```{r echo = FALSE} 3876 ```{r echo = FALSE}
1708 3877
1709 imp_smry_pot_peptides_after <- sum(good_rows) 3878 imp_smry_pot_peptides_after <- sum(good_rows)
1710 imp_smry_rejected_after <- sum(!good_rows) 3879 imp_smry_rejected_after <- sum(!good_rows)
1711 imp_smry_missing_values_after <- sum(is.na(quant_data_imp[good_rows, ])) 3880 imp_smry_missing_values_after <- sum(is.na(quant_data_imp[good_rows, ]))
3881
3882 # From ?`%in%`, %in% is currently defined as function(x, table) match(x, table, nomatch = 0) > 0
3883
3884 sink(stderr())
3885 print("`%in%`:")
3886 print(`%in%`)
3887 sink()
3888
3889 stock_in <-
3890 names(good_rows) %in%
3891 names(min_group_obs_count[g_intensity_min_per_class <= min_group_obs_count])
3892 if (print_nb_messages) nbe(see_variable(stock_in), "\n")
3893
3894 explicit_in <-
3895 0 < match(
3896 names(good_rows),
3897 names(min_group_obs_count[g_intensity_min_per_class <= min_group_obs_count])
3898 )
3899 if (print_nb_messages) nbe(see_variable(explicit_in), "\n")
3900
3901 great_enough_row_names <- good_rows[
3902 names(good_rows) %in%
3903 names(min_group_obs_count[g_intensity_min_per_class <= min_group_obs_count])
3904 ]
3905 if (print_nb_messages) nbe(see_variable(great_enough_row_names), "\n")
3906 great_enough_row_names <- great_enough_row_names[great_enough_row_names]
3907 if (print_nb_messages) nbe(see_variable(great_enough_row_names), "\n")
1712 ``` 3908 ```
3909
1713 ```{r echo = FALSE, results = 'asis'} 3910 ```{r echo = FALSE, results = 'asis'}
1714 # ref: http://www1.maths.leeds.ac.uk/latex/TableHelp1.pdf 3911 # ref: http://www1.maths.leeds.ac.uk/latex/TableHelp1.pdf
1715 tabular_lines_fmt <- paste( 3912 tabular_lines_fmt <- paste(
1716 "\\begin{table}[hb]", # h(inline); b(bottom); t (top) or p (separate page) 3913 "\\begin{table}[hb]", # h(inline); b(bottom); t (top) or p (separate page)
3914 " \\leavevmode",
1717 " \\caption{Imputation Results}", 3915 " \\caption{Imputation Results}",
1718 " \\centering", # \centering centers the table on the page 3916 " \\centering", # \centering centers the table on the page
1719 " \\begin{tabular}{l c c c}", 3917 " \\begin{tabular}{l c c c}",
1720 " \\hline\\hline", 3918 " \\hline\\hline",
1721 " \\ & potential peptides & missing values & rejected", 3919 " \\ & potential peptides & missing values & rejected",
1722 " peptides \\\\ [0.5ex]", 3920 " peptides \\\\ [0.5ex]",
1723 " \\hline", 3921 " \\hline",
1724 " before imputation & %d & %d (%d\\%s) & \\\\", 3922 " before imputation & %d & %d (%d\\%s) & \\\\",
1725 " after imputation & %d & %d & %d \\\\ [1ex]", 3923 " after imputation & %d & %d & %d \\\\",
3924 " after keep comparable & %d & & %d \\\\ [1ex]",
1726 " \\hline", 3925 " \\hline",
1727 " \\end{tabular}", 3926 " \\end{tabular}",
1728 #" \\label{table:nonlin}", # may be used to refer this table in the text 3927 #" \\label{table:nonlin}", # may be used to refer this table in the text
1729 "\\end{table}", 3928 "\\end{table}",
1730 sep = "\n" 3929 sep = "\n"
1736 imp_smry_missing_values_before, 3935 imp_smry_missing_values_before,
1737 imp_smry_pct_missing, 3936 imp_smry_pct_missing,
1738 "%", 3937 "%",
1739 imp_smry_pot_peptides_after, 3938 imp_smry_pot_peptides_after,
1740 imp_smry_missing_values_after, 3939 imp_smry_missing_values_after,
1741 imp_smry_rejected_after 3940 imp_smry_rejected_after,
3941 length(great_enough_row_names),
3942 imp_smry_pot_peptides_before -
3943 length(great_enough_row_names)
1742 ) 3944 )
1743 cat(tabular_lines) 3945 cat(tabular_lines)
1744 ``` 3946 ```
1745 ```{r echo = FALSE} 3947
1746 3948 ```{r filter_good_rows, echo = FALSE}
1747 3949
1748 # Zap rows where imputation was ineffective 3950 if (print_nb_messages) nbe("before name extraction, ", see_variable(length(good_rows)), " ", see_variable(good_rows), "\n")
3951 good_rows <- names(good_rows[names(great_enough_row_names)])
3952 if (print_nb_messages) nbe("after name extraction, ", see_variable(length(good_rows)), see_variable(good_rows), "\n")
3953
3954 #ACE min_group_obs_count <- min_group_obs_count[names(great_enough_row_names)]
3955 #ACE nbe("good_rows")
3956 #ACE nbe(see_variable(good_rows))
3957 #ACE nbe("names(min_group_obs_count) before filter for good rows")
3958 #ACE nbe(see_variable(names(min_group_obs_count)))
3959 min_group_obs_count <- min_group_obs_count[good_rows]
3960 #ACE nbe("min_group_obs_count after filter for good rows")
3961 #ACE nbe(see_variable(names(min_group_obs_count)))
3962
3963 # Zap rows where imputation was insufficiently effective
1749 full_data <- full_data [good_rows, ] 3964 full_data <- full_data [good_rows, ]
1750 quant_data <- quant_data [good_rows, ] 3965 quant_data <- quant_data [good_rows, ]
1751 3966 quant_data_log <- quant_data_log [good_rows, ]
3967
3968 if (print_nb_messages) nbe("before row filter, ", see_variable(nrow(quant_data_imp)), "\n")
1752 quant_data_imp <- quant_data_imp[good_rows, ] 3969 quant_data_imp <- quant_data_imp[good_rows, ]
3970 if (print_nb_messages) nbe("after row filter, ", see_variable(nrow(quant_data_imp)), "\n")
1753 write_debug_file(quant_data_imp) 3971 write_debug_file(quant_data_imp)
1754 quant_data_imp_good_rows <- quant_data_imp 3972 quant_data_imp_good_rows <- quant_data_imp
1755 3973
1756 write_debug_file(quant_data_imp_good_rows) 3974 write_debug_file(quant_data_imp_good_rows)
1757 ``` 3975 ```
1801 d_imputed <- d_combined 4019 d_imputed <- d_combined
1802 } 4020 }
1803 4021
1804 ``` 4022 ```
1805 4023
1806 ```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'} 4024 ```{r echo = FALSE, fig.dim = c(9, 6.5), results = 'asis'}
1807 zero_sd_rownames <- 4025 zero_sd_rownames <-
1808 rownames(quant_data_imp)[ 4026 rownames(quant_data_imp)[
1809 is.na((apply(quant_data_imp, 1, sd, na.rm = TRUE)) == 0) 4027 is.na((row_apply(quant_data_imp, sd, na.rm = TRUE)) == 0)
1810 ] 4028 ]
1811 4029
1812 if (length(zero_sd_rownames) >= nrow(quant_data_imp)) { 4030 if (length(zero_sd_rownames) >= nrow(quant_data_imp)) {
1813 stop("All peptides have zero standard deviation. Cannot continue.") 4031 cat("All peptides have zero standard deviation. Cannot continue.")
4032 param_df_exit()
4033 knitr::knit_exit()
1814 } 4034 }
1815 if (length(zero_sd_rownames) > 0) { 4035 if (length(zero_sd_rownames) > 0) {
1816 cat( 4036 cat(
1817 sprintf("%d peptides with zero variance were removed from statistical consideration", 4037 sprintf(
1818 length(zero_sd_rownames) 4038 "%d %s %s",
4039 length(zero_sd_rownames),
4040 "peptides with zero variance",
4041 "were removed from statistical consideration"
1819 ) 4042 )
1820 ) 4043 )
1821 zap_named_rows <- function(df, nms) { 4044 zap_named_rows <- function(df, nms) {
1822 return(df[!(row.names(df) %in% nms), ]) 4045 return(df[!(row.names(df) %in% nms), ])
1823 } 4046 }
1824 quant_data_imp <- zap_named_rows(quant_data_imp, zero_sd_rownames) 4047 quant_data_imp <-
1825 quant_data <- zap_named_rows(quant_data, zero_sd_rownames) 4048 zap_named_rows(quant_data_imp, zero_sd_rownames)
1826 full_data <- zap_named_rows(full_data, zero_sd_rownames) 4049 quant_data <-
4050 zap_named_rows(quant_data, zero_sd_rownames)
4051 full_data <-
4052 zap_named_rows(full_data, zero_sd_rownames)
4053 min_group_obs_count <-
4054 min_group_obs_count[names(min_group_obs_count) %notin% zero_sd_rownames]
1827 } 4055 }
1828 4056
1829 if (sum(is.na(quant_data)) > 0) { 4057 if (sum(is.na(quant_data)) > 0) {
1830 cat("\\leavevmode\\newpage\n") 4058 cat("\\leavevmode\\newpage\n")
1831 # data visualization
1832 old_par <- par(
1833 mai = par("mai") + c(0.5, 0, 0, 0)
1834 )
1835 # Copy quant data to x 4059 # Copy quant data to x
1836 x <- quant_data 4060 x <- quant_data
1837 # x gets to have values of: 4061 # x gets to have values of:
1838 # - NA for observed values 4062 # - NA for observed values
1839 # - 1 for missing values 4063 # - 1 for missing values
1858 max(red_dots, blue_dots, na.rm = TRUE) 4082 max(red_dots, blue_dots, na.rm = TRUE)
1859 ) 4083 )
1860 show_stripchart <- 4084 show_stripchart <-
1861 50 > (count_red + count_blue) / length(sample_name_matches) 4085 50 > (count_red + count_blue) / length(sample_name_matches)
1862 if (show_stripchart) { 4086 if (show_stripchart) {
1863 boxplot_sub <- "Light blue = data before imputation; Red = imputed data" 4087 boxplot_sub <- "Light blue = data before imputation; Red = imputed data;"
1864 } else { 4088 } else {
1865 boxplot_sub <- "" 4089 boxplot_sub <- ""
1866 } 4090 }
1867 4091
1868 # Vertical plot 4092 # Vertical plot
1869 colnames(blue_dots) <- sample_name_matches 4093 colnames(blue_dots) <- sample_name_matches
1870 boxplot( 4094 my_ppep_distrib_bxp(
1871 blue_dots 4095 x = blue_dots
1872 , las = 2 # "always vertical" 4096 , sample_name_grow = sample_name_grow
1873 , cex.axis = 0.9 * sample_name_shrink
1874 , col = const_boxplot_fill
1875 , ylim = ylim
1876 , main = "Peptide intensities after eliminating unusable peptides" 4097 , main = "Peptide intensities after eliminating unusable peptides"
1877 , sub = boxplot_sub 4098 , varwidth = boxplot_varwidth
4099 , sub = paste(boxplot_sub, "Box widths reflect number of peptides for sample")
1878 , xlab = "Sample" 4100 , xlab = "Sample"
1879 , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") 4101 , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)")
4102 , col = const_boxplot_fill
4103 , notch = FALSE
4104 , ylim = ylim
1880 ) 4105 )
1881 4106
1882 if (show_stripchart) { 4107 if (show_stripchart) {
1883 # Points 4108 # Points
1884 # ref: https://r-charts.com/distribution/add-points-boxplot/ 4109 # ref: https://r-charts.com/distribution/add-points-boxplot/
1886 stripchart( 4111 stripchart(
1887 blue_dots, # Data 4112 blue_dots, # Data
1888 method = "jitter", # Random noise 4113 method = "jitter", # Random noise
1889 jitter = const_stripchart_jitter, 4114 jitter = const_stripchart_jitter,
1890 pch = 19, # Pch symbols 4115 pch = 19, # Pch symbols
1891 cex = const_stripsmall_cex, # Size of symbols reduced 4116 cex = const_stripchart_cex, # Size of symbols reduced
1892 col = "lightblue", # Color of the symbol 4117 col = "lightblue", # Color of the symbol
1893 vertical = TRUE, # Vertical mode 4118 vertical = TRUE, # Vertical mode
1894 add = TRUE # Add it over 4119 add = TRUE # Add it over
1895 ) 4120 )
1896 stripchart( 4121 stripchart(
1897 red_dots, # Data 4122 red_dots, # Data
1898 method = "jitter", # Random noise 4123 method = "jitter", # Random noise
1899 jitter = const_stripchart_jitter, 4124 jitter = const_stripchart_jitter,
1900 pch = 19, # Pch symbols 4125 pch = 19, # Pch symbols
1901 cex = const_stripsmall_cex, # Size of symbols reduced 4126 cex = const_stripchart_cex, # Size of symbols reduced
1902 col = "red", # Color of the symbol 4127 col = "red", # Color of the symbol
1903 vertical = TRUE, # Vertical mode 4128 vertical = TRUE, # Vertical mode
1904 add = TRUE # Add it over 4129 add = TRUE # Add it over
1905 ) 4130 )
1906 4131
1907 } 4132 }
1908 if (TRUE) { 4133 }
1909 # show measured values in blue on left half-violin plot 4134 ```
1910 cat("\\leavevmode\n\\quad\n\n\\quad\n\n") 4135
1911 vioplot::vioplot( 4136 ```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'}
1912 x = lapply(blue_dots, function(x) x[!is.na(x)]), 4137 if (sum(is.na(quant_data)) > 0) {
1913 col = "lightblue1", 4138 # show measured values in blue on left half-violin plot
1914 side = "left", 4139 cat("\\leavevmode\n\\quad\n\n\\quad\n\n")
1915 plotCentre = "line", 4140 old_par <- par(
1916 ylim = ylim_save, 4141 mai = par("mai") + c(g_ppep_distrib_ctl$mai_bottom, 0, 0, 0),
1917 main = "Distributions of observed and imputed data", 4142 cex.axis = g_ppep_distrib_ctl$axis,
1918 sub = "Light blue = observed data; Pink = imputed data", 4143 cex.lab = 1.2
1919 las = 2, 4144 )
1920 cex.axis = 0.9 * sample_name_shrink, 4145 tryCatch(
1921 xlab = "Sample", 4146 {
1922 ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") 4147 vioplot::vioplot(
1923 ) 4148 x = lapply(blue_dots, function(x) x[!is.na(x)]),
1924 red_violins <- lapply(red_dots, function(x) x[!is.na(x)]) 4149 col = "lightblue1",
1925 cols_to_delete <- c() 4150 side = "left",
1926 for (ix in seq_len(length(red_violins))) { 4151 plotCentre = "line",
1927 if (length(red_violins[[ix]]) < 1) { 4152 ylim = ylim_save,
1928 cols_to_delete <- c(cols_to_delete, ix) 4153 main = "Distributions of observed and imputed data",
4154 sub = NULL,
4155 las = 2,
4156 xlab = NULL,
4157 ylab = latex2exp::TeX("$log_{10}$(peptide intensity)")
4158 )
4159 title(
4160 sub = "Light blue = observed data; Pink = imputed data",
4161 cex.sub = 1.0,
4162 line = g_ppep_distrib_ctl$xlab_line + 1
4163 )
4164 title(
4165 xlab = "Sample",
4166 line = g_ppep_distrib_ctl$xlab_line
4167 )
4168 red_violins <- lapply(red_dots, function(x) x[!is.na(x)])
4169 cols_to_delete <- c()
4170 for (ix in seq_len(length(red_violins))) {
4171 if (length(red_violins[[ix]]) < 1) {
4172 cols_to_delete <- c(cols_to_delete, ix)
4173 }
1929 } 4174 }
1930 } 4175 # destroy any unimputable columns
1931 # destroy any unimputable columns 4176 if (!is.null(cols_to_delete)) {
1932 if (!is.null(cols_to_delete)) { 4177 red_violins <- red_violins[-cols_to_delete]
1933 red_violins <- red_violins[-cols_to_delete] 4178 }
1934 } 4179 # plot imputed values in red on right half-violin plot
1935 # plot imputed values in red on right half-violin plot 4180 vioplot::vioplot(
1936 vioplot::vioplot( 4181 x = red_violins,
1937 x = red_violins, 4182 col = "lightpink1",
1938 col = "lightpink1", 4183 side = "right",
1939 side = "right", 4184 plotCentre = "line",
1940 plotCentre = "line", 4185 add = TRUE
1941 add = TRUE 4186 )
1942 ) 4187
1943 } 4188 },
1944 4189 finally = par(old_par)
1945 par(old_par) 4190 )
1946 4191
1947 # density plot 4192 # density plot
1948 cat("\\leavevmode\n\n\n\n\n\n\n") 4193 cat("\\leavevmode\n\n\n\n\n\n\n")
1949 if (can_plot_before_after_imp) { 4194 if (can_plot_before_after_imp) {
1950 ylim <- c( 4195 ylim <- c(
1978 } 4223 }
1979 cat("\\leavevmode\\newpage\n") 4224 cat("\\leavevmode\\newpage\n")
1980 } 4225 }
1981 ``` 4226 ```
1982 4227
1983 # Perform Quantile Normalization 4228 # Quantile Normalization
1984 4229
1985 The excellent `normalize.quantiles` function from 4230 The excellent `normalize.quantiles` function from
1986 *[the `preprocessCore` Bioconductor package](http://bioconductor.org/packages/release/bioc/html/preprocessCore.html)* 4231 *[the `preprocessCore` Bioconductor package](http://bioconductor.org/packages/release/bioc/html/preprocessCore.html)*
1987 performs "quantile normalization" as described Bolstad *et al.* (2003), 4232 performs "quantile normalization" as described Bolstad *et al.* (2003),
1988 DOI *[10.1093/bioinformatics/19.2.185](https://doi.org/10.1093%2Fbioinformatics%2F19.2.185)* 4233 DOI *[10.1093/bioinformatics/19.2.185](https://doi.org/10.1093%2Fbioinformatics%2F19.2.185)*
1989 and *its supplementary material [http://bmbolstad.com/misc/normalize/normalize.html](http://bmbolstad.com/misc/normalize/normalize.html)*, 4234 and its supplementary material [http://bmbolstad.com/misc/normalize/normalize.html](http://bmbolstad.com/misc/normalize/normalize.html),
1990 i.e., it assumes that the goal is to detect 4235 i.e., it assumes that the goal is to detect
1991 subtle differences among grossly similar samples (having similar distributions) 4236 subtle differences among grossly similar samples (having similar distributions)
1992 by equailzing intra-quantile quantitations. 4237 by equalizing intra-quantile quantitations^[Unfortunately,
1993 Unfortunately, one software library upon which it depends 4238 one software library upon which `preprocessCore` depends
1994 *[suffers from a concurrency defect](https://support.bioconductor.org/p/122925/#9135989)* 4239 *[suffers from a concurrency defect](https://support.bioconductor.org/p/122925/#9135989)*
1995 that requires that a specific, non-concurrent version of the library be 4240 that requires that a specific, non-concurrent version of the library (`openblas` version $0.3.3$) be
1996 installed. The installation command equivalent to what was used to install the library to produce the results presented here is: 4241 installed. The installation command equivalent to what was used to install the library to produce the results presented here is:
1997 ``` 4242 \linebreak
1998 conda install bioconductor-preprocesscore openblas=0.3.3 4243 ` conda install bioconductor-preprocesscore openblas=0.3.3`].
1999 ```
2000 4244
2001 4245
2002 <!-- 4246 <!--
2003 # Apply quantile normalization using preprocessCore::normalize.quantiles 4247 # Apply quantile normalization using preprocessCore::normalize.quantiles
2004 # --- 4248 # ---
2005 # tool repository: http://bioconductor.org/packages/release/bioc/html/preprocessCore.html 4249 # tool repository: http://bioconductor.org/packages/release/bioc/html/preprocessCore.html
2006 # except this: https://support.bioconductor.org/p/122925/#9135989 4250 # except this: https://support.bioconductor.org/p/122925/#9135989
2007 # says to install it like this: 4251 # says to install it like this:
2008 # ``` 4252 # ```
2009 # BiocManager::install("preprocessCore", configure.args="--disable-threading", force = TRUE, lib=.libPaths()[1]) 4253 # BiocManager::install("preprocessCore", configure.args="--disable-threading", force = TRUE, lib=.libPaths()[1])
2010 # ``` 4254 # ```
2011 # conda installation (necessary because of a bug in recent openblas): 4255 # conda installation (necessary because of a bug in recent openblas):
2012 # conda install bioconductor-preprocesscore openblas=0.3.3 4256 # conda install bioconductor-preprocesscore openblas=0.3.3
2013 # ... 4257 # ...
2014 # --- 4258 # ---
2015 # normalize.quantiles {preprocessCore} -- Quantile Normalization 4259 # normalize.quantiles {preprocessCore} -- Quantile Normalization
2016 # 4260 #
2017 # Description: 4261 # Description:
2018 # Using a normalization based upon quantiles, this function normalizes a 4262 # Using a normalization based upon quantiles, this function normalizes a
2019 # matrix of probe level intensities. 4263 # matrix of probe level intensities.
2020 # 4264 #
2021 # THIS FUNCTIONS WILL HANDLE MISSING DATA (ie NA values), based on the 4265 # THIS FUNCTIONS WILL HANDLE MISSING DATA (ie NA values), based on the
2022 # assumption that the data is missing at random. 4266 # assumption that the data is missing at random.
2023 # 4267 #
2024 # Usage: 4268 # Usage:
2025 # normalize.quantiles(x, copy = TRUE, keep.names = FALSE) 4269 # normalize.quantiles(x, copy = TRUE, keep.names = FALSE)
2026 # 4270 #
2027 # Arguments: 4271 # Arguments:
2028 # 4272 #
2029 # - x: A matrix of intensities where each column corresponds to a chip and each row is a probe. 4273 # - x: A matrix of intensities where each column corresponds to a chip and each row is a probe.
2030 # 4274 #
2031 # - copy: Make a copy of matrix before normalizing. Usually safer to work with a copy, 4275 # - copy: Make a copy of matrix before normalizing. Usually safer to work with a copy,
2032 # but in certain situations not making a copy of the matrix, but instead normalizing 4276 # but in certain situations not making a copy of the matrix, but instead normalizing
2033 # it in place will be more memory friendly. 4277 # it in place will be more memory friendly.
2034 # 4278 #
2035 # - keep.names: Boolean option to preserve matrix row and column names in output. 4279 # - keep.names: Boolean option to preserve matrix row and column names in output.
2036 # 4280 #
2037 # Details: 4281 # Details:
2038 # This method is based upon the concept of a quantile-quantile plot extended to n dimensions. 4282 # This method is based upon the concept of a quantile-quantile plot extended to n dimensions.
2039 # No special allowances are made for outliers. If you make use of quantile normalization 4283 # No special allowances are made for outliers. If you make use of quantile normalization
2040 # please cite Bolstad et al, Bioinformatics (2003). 4284 # please cite Bolstad et al, Bioinformatics (2003).
2041 # 4285 #
2042 # This functions will handle missing data (ie NA values), based on 4286 # This functions will handle missing data (ie NA values), based on
2043 # the assumption that the data is missing at random. 4287 # the assumption that the data is missing at random.
2044 # 4288 #
2045 # Note that the current implementation optimizes for better memory usage 4289 # Note that the current implementation optimizes for better memory usage
2046 # at the cost of some additional run-time. 4290 # at the cost of some additional run-time.
2047 # 4291 #
2048 # Value: A normalized matrix. 4292 # Value: A normalized matrix.
2049 # 4293 #
2050 # Author: Ben Bolstad, bmbolstad.com 4294 # Author: Ben Bolstad, bmbolstad.com
2051 # 4295 #
2052 # References 4296 # References
2053 # 4297 #
2054 # - Bolstad, B (2001) Probe Level Quantile Normalization of High Density Oligonucleotide 4298 # - Bolstad, B (2001) Probe Level Quantile Normalization of High Density Oligonucleotide
2055 # Array Data. Unpublished manuscript http://bmbolstad.com/stuff/qnorm.pdf 4299 # Array Data. Unpublished manuscript http://bmbolstad.com/stuff/qnorm.pdf
2056 # 4300 #
2057 # - Bolstad, B. M., Irizarry R. A., Astrand, M, and Speed, T. P. (2003) A Comparison of 4301 # - Bolstad, B. M., Irizarry R. A., Astrand, M, and Speed, T. P. (2003) A Comparison of
2058 # Normalization Methods for High Density Oligonucleotide Array Data Based on Bias 4302 # Normalization Methods for High Density Oligonucleotide Array Data Based on Bias
2059 # and Variance. Bioinformatics 19(2), pp 185-193. DOI 10.1093/bioinformatics/19.2.185 4303 # and Variance. Bioinformatics 19(2), pp 185-193. DOI 10.1093/bioinformatics/19.2.185
2060 # http://bmbolstad.com/misc/normalize/normalize.html 4304 # http://bmbolstad.com/misc/normalize/normalize.html
2061 # ... 4305 # ...
2062 --> 4306 -->
2063 ```{r echo = FALSE, results = 'asis'} 4307 ```{r echo = FALSE, results = 'asis'}
2064 4308
4309 if (print_nb_messages) nbe(see_variable(nrow(quant_data_imp)), "\n")
2065 if (nrow(quant_data_imp) > 0) { 4310 if (nrow(quant_data_imp) > 0) {
2066 quant_data_imp_qn <- preprocessCore::normalize.quantiles( 4311 quant_data_imp_qn <- preprocessCore::normalize.quantiles(
2067 as.matrix(quant_data_imp), keep.names = TRUE 4312 as.matrix(quant_data_imp), keep.names = TRUE
2068 ) 4313 )
2069 } else { 4314 } else {
2070 quant_data_imp_qn <- as.matrix(quant_data_imp) 4315 quant_data_imp_qn <- as.matrix(quant_data_imp)
2071 } 4316 }
2072 4317
4318 if (print_nb_messages) nbe(see_variable(nrow(quant_data_imp_qn)), "\n")
4319
2073 quant_data_imp_qn <- as.data.frame(quant_data_imp_qn) 4320 quant_data_imp_qn <- as.data.frame(quant_data_imp_qn)
2074
2075 write_debug_file(quant_data_imp_qn) 4321 write_debug_file(quant_data_imp_qn)
2076 4322
2077 quant_data_imp_qn_log <- log10(quant_data_imp_qn) 4323 quant_data_imp_qn_log <- log10(quant_data_imp_qn)
2078
2079 write_debug_file(quant_data_imp_qn_log) 4324 write_debug_file(quant_data_imp_qn_log)
2080 4325
4326 if (print_nb_messages) nbe(see_variable(nrow(quant_data_imp_qn_log)), "\n")
4327 if (print_nb_messages) nbe(see_variable(ncol(quant_data_imp_qn_log)), "\n")
4328
2081 quant_data_imp_qn_ls <- t(scale(t(log10(quant_data_imp_qn)))) 4329 quant_data_imp_qn_ls <- t(scale(t(log10(quant_data_imp_qn))))
2082 4330
2083 sel <- apply(quant_data_imp_qn_ls, 1, any_nan) 4331 sel <- row_apply(quant_data_imp_qn_ls, any_nan)
2084 quant_data_imp_qn_ls2 <- quant_data_imp_qn_ls 4332 quant_data_imp_qn_ls2 <- quant_data_imp_qn_ls
2085 4333
2086 quant_data_imp_qn_ls2 <- quant_data_imp_qn_ls2[which(sel), ] 4334 quant_data_imp_qn_ls2 <- quant_data_imp_qn_ls2[which(sel), ]
2087 quant_data_imp_qn_ls2 <- as.data.frame(quant_data_imp_qn_ls2) 4335 quant_data_imp_qn_ls2 <- as.data.frame(quant_data_imp_qn_ls2)
2088 4336
2093 4341
2094 # Create data.frame used by ANOVA analysis 4342 # Create data.frame used by ANOVA analysis
2095 data_table_imp_qn_lt <- cbind(full_data[1:9], quant_data_imp_qn_log) 4343 data_table_imp_qn_lt <- cbind(full_data[1:9], quant_data_imp_qn_log)
2096 ``` 4344 ```
2097 4345
2098 <!-- ACE insertion begin -->
2099 ## Are normalized, imputed, log-transformed sample distributions similar? 4346 ## Are normalized, imputed, log-transformed sample distributions similar?
2100 4347
2101 ```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'} 4348 ```{r echo = FALSE, fig.dim = c(9, 6.5), results = 'asis'}
2102 4349
2103 # Save unimputed quant_data_log for plotting below 4350 # Save unimputed quant_data_log for plotting below
2104 unimputed_quant_data_log <- quant_data_log 4351 unimputed_quant_data_log <- quant_data_log
2105 4352
2106 # log10 transform (after preparing for zero values, 4353 # log10 transform (after preparing for zero values,
2119 ) 4366 )
2120 cat("\n\n\n") 4367 cat("\n\n\n")
2121 4368
2122 4369
2123 # data visualization 4370 # data visualization
4371 if (TRUE) {
4372
4373 my_ppep_distrib_bxp(
4374 x = quant_data_log
4375 , sample_name_grow = sample_name_grow
4376 , main = "Peptide intensities for each sample"
4377 , varwidth = boxplot_varwidth
4378 , sub = NULL
4379 , xlab = "Sample"
4380 , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)")
4381 , col = const_boxplot_fill
4382 , notch = boxplot_notch
4383 )
4384
4385 } else {
4386
2124 old_par <- par( 4387 old_par <- par(
2125 mai = par("mai") + c(0.5, 0, 0, 0) 4388 mai = par("mai") + c(0.5, 0, 0, 0)
2126 , oma = par("oma") + c(0.5, 0, 0, 0) 4389 , oma = par("oma") + c(0.5, 0, 0, 0)
2127 ) 4390 )
2128 # ref: https://r-charts.com/distribution/add-points-boxplot/ 4391 # ref: https://r-charts.com/distribution/add-points-boxplot/
2129 # Vertical plot 4392 # Vertical plot
2130 colnames(quant_data_log) <- sample_name_matches 4393 colnames(quant_data_log) <- sample_name_matches
2131 boxplot( 4394 boxplot(
2132 quant_data_log 4395 quant_data_log
2133 , las = 2 4396 , las = 2
2134 , cex.axis = 0.9 * sample_name_shrink 4397 , cex.axis = 0.9 * sample_name_grow
2135 , col = const_boxplot_fill 4398 , col = const_boxplot_fill
2136 , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") 4399 , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)")
2137 , xlab = "Sample" 4400 , xlab = "Sample"
4401 , notch = boxplot_notch
4402 , varwidth = boxplot_varwidth
2138 ) 4403 )
2139 par(old_par) 4404 par(old_par)
4405 }
2140 } else { 4406 } else {
2141 cat("There are no peptides to plot\n") 4407 cat("There are no peptides to plot\n")
2142 } 4408 }
2143 4409
2144 cat("\n\n\n") 4410 cat("\n\n\n")
2160 cat("\\leavevmode\\newpage\n") 4426 cat("\\leavevmode\\newpage\n")
2161 ``` 4427 ```
2162 4428
2163 # ANOVA Analysis 4429 # ANOVA Analysis
2164 4430
2165 ```{r, echo = FALSE} 4431 ## Assignment of $p$-value and quality score
4432
4433 For each phosphopeptide, ANOVA analysis was performed to compute a $p$-value representing the evidence against the "null hypothesis" ($H_0$) that the intensity does not vary significantly among sample groups.
4434
4435 However, because as more and more phosphopeptides are tested, there is increasd probability that, by random chance, a given peptide will have a $p$-value that appears to indicate significance. For this reason, the $p$-values were adjusted by applying the False Discovery Rate (FDR) correction from Benjamini and Hochberg (1995) [doi:10.1111/j.2517-6161.1995.tb02031.x](https:/doi.org/10.1111/j.2517-6161.1995.tb02031.x).
4436
4437 Furthermore, to give more weight to phosphopeptides having fewer missing values, an (arbitrarily defined) quality score was assigned to each, defined as:
4438
4439 $$
4440 \textit{quality}_j
4441 = \frac{1 + o_{j}}{v_{j}(1 + o_{j}) + 0.005}
4442 $$
4443
4444 where:
4445
4446 - $o_j$ is the minimum number of non-missing observations per sample group for substrate $j$ for all sample groups, and
4447 - $v_j$ is the FDR-adjusted ANOVA $p$-value for substrate $j$.
4448
4449
4450 ```{r, echo = FALSE, results = 'asis'}
4451 cat("\\newpage\n")
4452
2166 # Make new data frame containing only Phosphopeptides 4453 # Make new data frame containing only Phosphopeptides
2167 # to connect preANOVA to ANOVA (connect_df) 4454 # to connect preANOVA to ANOVA (connect_df)
2168 connect_df <- data.frame( 4455 connect_df <- data.frame(
2169 data_table_imp_qn_lt$Phosphopeptide 4456 data_table_imp_qn_lt$Phosphopeptide
2170 , data_table_imp_qn_lt[, first_data_column] 4457 , data_table_imp_qn_lt[, first_data_column]
2171 ) 4458 )
2172 colnames(connect_df) <- c("Phosphopeptide", "Intensity") 4459 colnames(connect_df) <- c("Phosphopeptide", "Intensity")
2173 ``` 4460 ```
2174 4461
2175 ```{r anova, echo = FALSE, fig.dim = c(9, 10), results = 'asis'} 4462 ```{r anova, echo = FALSE, fig.dim = c(10, 12), results = 'asis'}
2176 count_of_treatment_levels <- length(levels(sample_treatment_levels)) 4463 g_can_run_ksea <- FALSE
4464 old_oma <- par("oma")
2177 if (count_of_treatment_levels < 2) { 4465 if (count_of_treatment_levels < 2) {
2178 nuke_control_sequences <-
2179 function(s) {
2180 s <- gsub("[\\]", "xyzzy_plugh", s)
2181 s <- gsub("[$]", "\\\\$", s)
2182 s <- gsub("xyzzy_plugh", "$\\\\backslash$", s)
2183 return(s)
2184 }
2185 cat( 4466 cat(
2186 "ERROR!!!! Cannot perform ANOVA analysis", 4467 "ERROR!!!! Cannot perform ANOVA analysis",
2187 "(see next page)\\newpage\n" 4468 "(see next page)\\newpage\n"
2188 ) 4469 )
2189 cat( 4470 cat(
2195 cat("Unparsed sample names are:\n\n\n", 4476 cat("Unparsed sample names are:\n\n\n",
2196 "\\begin{quote}\n", 4477 "\\begin{quote}\n",
2197 paste(names(quant_data_imp_qn_log), collapse = "\n\n\n"), 4478 paste(names(quant_data_imp_qn_log), collapse = "\n\n\n"),
2198 "\n\\end{quote}\n\n") 4479 "\n\\end{quote}\n\n")
2199 4480
2200 regex_sample_names <- nuke_control_sequences(regex_sample_names) 4481 regex_sample_names <- latex_printable_control_seqs(regex_sample_names)
2201 4482
2202 cat("\\leavevmode\n\n\n") 4483 cat("\\leavevmode\n\n\n")
2203 cat("Parsing rule for SampleNames is", 4484 cat("Parsing rule for SampleNames is",
2204 "\n\n\n", 4485 "\n\n\n",
2205 "\\text{'", 4486 "\\text{'",
2211 cat("\nParsed sample names are:\n", 4492 cat("\nParsed sample names are:\n",
2212 "\\begin{quote}\n", 4493 "\\begin{quote}\n",
2213 paste(sample_name_matches, collapse = "\n\n\n"), 4494 paste(sample_name_matches, collapse = "\n\n\n"),
2214 "\n\\end{quote}\n\n") 4495 "\n\\end{quote}\n\n")
2215 4496
2216 regex_sample_grouping <- nuke_control_sequences(regex_sample_grouping) 4497 regex_sample_grouping <- latex_printable_control_seqs(regex_sample_grouping)
2217 4498
2218 cat("\\leavevmode\n\n\n") 4499 cat("\\leavevmode\n\n\n")
2219 cat("Parsing rule for SampleGrouping is", 4500 cat("Parsing rule for SampleGrouping is",
2220 "\n\n\n", 4501 "\n\n\n",
2221 "\\text{'", 4502 "\\text{'",
2230 paste(regmatches(sample_name_matches, rx_match), collapse = "\n\n\n"), 4511 paste(regmatches(sample_name_matches, rx_match), collapse = "\n\n\n"),
2231 "\n\\end{quote}\n\n") 4512 "\n\\end{quote}\n\n")
2232 4513
2233 } else { 4514 } else {
2234 4515
2235 p_value_data_anova_ps <- 4516 if (print_nb_messages) nbe("computing p_value_data_anova_ps\n")
2236 apply( 4517 if (print_nb_messages) nbe(see_variable(nrow(quant_data_imp_qn_log)), "\n")
2237 quant_data_imp_qn_log, 4518 if (print_nb_messages) nbe(see_variable(ncol(quant_data_imp_qn_log)), "\n")
2238 1, 4519 if (print_nb_messages) nbe(see_variable(quant_data_imp_qn_log[, ".NE.7C"]), "\n")
2239 anova_func, 4520 if (print_nb_messages) nbe(see_variable(quant_data_imp_qn_log), "\n")
2240 grouping_factor = sample_treatment_levels, 4521 if (print_nb_messages) nbe(see_variable(anova_func), "\n")
2241 one_way_f = one_way_all_categories 4522 if (print_nb_messages) nbe(see_variable(smpl_trt), "\n")
2242 ) 4523 if (print_nb_messages) nbe(see_variable(one_way_all_categories), "\n")
4524 tryCatch(
4525 {
4526 p_value_data_anova_ps <-
4527 row_apply(
4528 quant_data_imp_qn_log,
4529 anova_func,
4530 grouping_factor = smpl_trt,
4531 one_way_f = one_way_all_categories
4532 )
4533 },
4534 error = function(e) {
4535 mesg <- paste("Could not compute ANOVA because", e$message)
4536 cat("\n\n", mesg, "\n\n")
4537 param_df_noexit(e)
4538 sink(stderr())
4539 cat("\n\n", mesg, "\n\n")
4540 values <- paste(param_df$parameter, "=", param_df$value, collapse = "\n")
4541 cat(values)
4542 sink()
4543 knitr::knit_exit()
4544 exit(code = 1)
4545 }
4546 )
4547 if (print_nb_messages) nbe(see_variable(p_value_data_anova_ps), "\n")
2243 4548
2244 p_value_data_anova_ps_fdr <- 4549 p_value_data_anova_ps_fdr <-
2245 p.adjust(p_value_data_anova_ps, method = "fdr") 4550 p.adjust(p_value_data_anova_ps, method = "fdr")
4551 my_ppep_v <- full_data[, 1]
4552 p_value_data <- list(
4553 phosphopeptide = my_ppep_v,
4554 raw_anova_p = p_value_data_anova_ps,
4555 fdr_adjusted_anova_p = p_value_data_anova_ps_fdr,
4556 missing_values = rowSums(is.na(quant_data)),
4557 min_group_obs_count = min_group_obs_count
4558 )
2246 p_value_data <- data.frame( 4559 p_value_data <- data.frame(
2247 phosphopeptide = full_data[, 1], 4560 phosphopeptide = my_ppep_v,
2248 raw_anova_p = p_value_data_anova_ps, 4561 raw_anova_p = p_value_data_anova_ps,
2249 fdr_adjusted_anova_p = p_value_data_anova_ps_fdr 4562 fdr_adjusted_anova_p = p_value_data_anova_ps_fdr,
2250 ) 4563 missing_values = rowSums(is.na(quant_data)),
4564 min_group_obs_count = min_group_obs_count
4565 )
4566 p_value_data$quality <- 1.0 / with(
4567 p_value_data,
4568 fdr_adjusted_anova_p + 0.005 / (1 + min_group_obs_count)
4569 )
4570
4571 p_value_data$ranking <-
4572 with(
4573 p_value_data,
4574 switch(
4575 g_intensity_hm_criteria,
4576 "quality" = order(-quality),
4577 "na_count" = order(missing_values, fdr_adjusted_anova_p),
4578 # the default is "p_value"
4579 order(fdr_adjusted_anova_p)
4580 )
4581 )
4582 p_value_data <- p_value_data[p_value_data$ranking, , drop = FALSE]
4583
4584 write.table(
4585 p_value_data,
4586 file = "p_value_data.txt",
4587 sep = "\t",
4588 col.names = TRUE,
4589 row.names = FALSE,
4590 quote = FALSE
4591 )
4592
2251 4593
2252 # output ANOVA file to constructed filename, 4594 # output ANOVA file to constructed filename,
2253 # e.g. "Outputfile_pST_ANOVA_STEP5.txt" 4595 # e.g. "Outputfile_pST_ANOVA_STEP5.txt"
2254 # becomes "Outpufile_pST_ANOVA_STEP5_FDR0.05.txt" 4596 # becomes "Outputfile_pST_ANOVA_STEP5_FDR0.05.txt"
2255 4597
2256 # Re-output datasets to include p-values 4598 # Re-output datasets to include p-values
2257 metadata_plus_p <- cbind(full_data[1:9], p_value_data[, 2:3]) 4599 metadata_plus_p <- cbind(full_data[1:9], p_value_data[, 2:ncol(p_value_data)])
2258 write.table( 4600 write.table(
2259 cbind(metadata_plus_p, quant_data_imp), 4601 cbind(metadata_plus_p, quant_data_imp),
2260 file = imputed_data_filename, 4602 file = imputed_data_filename,
2261 sep = "\t", 4603 sep = "\t",
2262 col.names = TRUE, 4604 col.names = TRUE,
2272 row.names = FALSE, 4614 row.names = FALSE,
2273 quote = FALSE 4615 quote = FALSE
2274 ) 4616 )
2275 4617
2276 4618
2277 p_value_data <-
2278 p_value_data[order(p_value_data$fdr_adjusted_anova_p), ]
2279
2280 first_page_suppress <- 1 4619 first_page_suppress <- 1
2281 number_of_peptides_found <- 0 4620 number_of_peptides_found <- 0
2282 cutoff <- val_fdr[1] 4621 cutoff <- val_fdr[1]
2283 for (cutoff in val_fdr) { 4622 for (cutoff in val_fdr) {
2284 if (number_of_peptides_found > 49) { 4623 #loop through FDR cutoffs
4624 if (number_of_peptides_found > g_intensity_hm_rows - 1) {
2285 cat("\\leavevmode\n\n\n") 4625 cat("\\leavevmode\n\n\n")
2286 break 4626 break
2287 } 4627 }
2288 4628
2289 #loop through FDR cutoffs 4629 bool_1 <- (p_value_data$fdr_adjusted_anova_p < cutoff)
4630 bool_2 <- (p_value_data$min_group_obs_count >= g_intensity_min_per_class)
4631 g_can_run_ksea <- g_can_run_ksea || (sum(bool_2) > 0)
4632 bool_4 <- (p_value_data$quality >= params$minQuality)
4633 bool_3 <- as.logical(
4634 as.integer(bool_1) *
4635 as.integer(bool_2) *
4636 as.integer(bool_4)
4637 )
4638 if (print_trace_messages) {
4639 if (length(bool_1) > 30) {
4640 cat_variable(bool_1, force_str = TRUE)
4641 cat_variable(bool_2, force_str = TRUE)
4642 cat_variable(bool_3, force_str = TRUE)
4643 } else {
4644 cat_variable(bool_1, suffix = "\n\n")
4645 cat_variable(bool_2, suffix = "\n\n")
4646 cat_variable(bool_3, suffix = "\n\n")
4647 }
4648 cat_variable(length(bool_3), digits = 0, suffix = "; ")
4649 cat_variable(sum(bool_3), digits = 0, suffix = "\n\n")
4650 }
2290 4651
2291 filtered_p <- 4652 filtered_p <-
2292 p_value_data[ 4653 p_value_data[bool_3, , drop = FALSE]
2293 which(p_value_data$fdr_adjusted_anova_p < cutoff), 4654 filtered_p <-
2294 , drop = FALSE 4655 filtered_p[!is.na(filtered_p$phosphopeptide), , drop = FALSE]
2295 ] 4656
4657 if (print_trace_messages)
4658 cat_variable(filtered_p, force_str = TRUE)
4659
4660 have_remaining_peptides <- sum(bool_3, na.rm = TRUE) > 0
4661 filter_result_string <-
4662 sprintf(
4663 "%s, %s of %0.0f peptides remained having both %s and %s.\n\n",
4664 "After filtering for ANOVA results",
4665 if (have_remaining_peptides)
4666 as.character(sum(bool_3, na.rm = TRUE))
4667 else
4668 "none",
4669 length(bool_3),
4670 sprintf("adjusted p-value < %s", as.character(signif(cutoff, 2))),
4671 sprintf(
4672 "more than %0.0f observations in some groups",
4673 max(0, g_intensity_min_per_class - 1)
4674 )
4675 )
4676
2296 filtered_data_filtered <- 4677 filtered_data_filtered <-
2297 quant_data_imp_qn_log[ 4678 quant_data_imp_qn_log[
2298 rownames(filtered_p), 4679 rownames(filtered_p),
2299 , drop = FALSE 4680 , drop = FALSE
2300 ] 4681 ]
4682 # order by p-value
2301 filtered_data_filtered <- 4683 filtered_data_filtered <-
2302 filtered_data_filtered[ 4684 filtered_data_filtered[
2303 order(filtered_p$fdr_adjusted_anova_p), 4685 order(filtered_p$fdr_adjusted_anova_p),
2304 , drop = FALSE 4686 , drop = FALSE
2305 ] 4687 ]
2306 4688
2307 # <!-- ACE insertion start --> 4689 if (have_remaining_peptides && nrow(filtered_p) > 0 && nrow(filtered_data_filtered) > 0) {
2308
2309 if (nrow(filtered_p) && nrow(filtered_data_filtered) > 0) {
2310 if (first_page_suppress == 1) { 4690 if (first_page_suppress == 1) {
2311 first_page_suppress <- 0 4691 first_page_suppress <- 0
2312 } else { 4692 } else {
2313 cat("\\newpage\n") 4693 cat("\\newpage\n")
2314 } 4694 }
2315 if (nrow(filtered_data_filtered) > 1) { 4695 latex_samepage({
2316 subsection_header(sprintf( 4696 cat(filter_result_string)
2317 "Intensity distributions for %d phosphopeptides whose adjusted p-value < %0.2f\n", 4697 if (nrow(filtered_data_filtered) > 1) {
2318 nrow(filtered_data_filtered), 4698 cat(subsection_header(sprintf(
2319 cutoff 4699 "Intensity distributions for %d phosphopeptides\n",
2320 )) 4700 nrow(filtered_data_filtered)
2321 } else { 4701 )))
2322 subsection_header(sprintf( 4702 } else {
2323 "Intensity distribution for one phosphopeptide (%s) whose adjusted p-value < %0.2f\n", 4703 cat(subsection_header(sprintf(
2324 rownames(filtered_data_filtered)[1], 4704 "Intensity distribution for one phosphopeptide (%s)\n",
2325 cutoff 4705 rownames(filtered_data_filtered)[1]
2326 )) 4706 )))
2327 } 4707 }
2328 cat("\n\n\n") 4708 }) # end latex_samepage
2329 cat("\n\n\n") 4709
2330
2331 old_oma <- par("oma")
2332 old_par <- par( 4710 old_par <- par(
2333 mai = (par("mai") + c(0.7, 0, 0, 0)) * c(1, 1, 0.3, 1), 4711 mai = (par("mai") + c(0.7, 0, 0, 0)) * c(1, 1, 0.3, 1),
2334 oma = old_oma * c(1, 1, 0.3, 1), 4712 oma = old_oma * c(1, 1, 0.3, 1),
2335 cex.main = 0.9, 4713 cex.main = 0.9,
2336 cex.axis = 0.7, 4714 cex.axis = 0.7,
2337 fin = c(9, 7.25) 4715 fin = c(9, 7.25)
2338 ) 4716 )
2339 # ref: https://r-charts.com/distribution/add-points-boxplot/
2340 # Vertical plot 4717 # Vertical plot
2341 colnames(filtered_data_filtered) <- sample_name_matches 4718 colnames(filtered_data_filtered) <- sample_name_matches
2342 tryCatch( 4719 tryCatch(
2343 boxplot( 4720 boxplot(
2344 filtered_data_filtered, 4721 filtered_data_filtered,
2345 main = "Imputed, normalized intensities", # no line plot 4722 main = "Imputed, normalized intensities", # no line plot
2346 las = 2, 4723 las = 2,
2347 cex.axis = 0.9 * sample_name_shrink, 4724 cex.axis = 0.9 * sample_name_grow,
2348 col = const_boxplot_fill, 4725 col = const_boxplot_fill,
2349 ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") 4726 ylab = latex2exp::TeX("$log_{10}$(peptide intensity)"),
4727 notch = FALSE,
4728 varwidth = boxplot_varwidth
2350 ), 4729 ),
2351 error = function(e) print(e) 4730 error = function(e) {
4731 print(e)
4732 cat_margins()
4733 }
4734
2352 ) 4735 )
2353 par(old_par) 4736 par(old_par)
2354 } else { 4737 } else {
2355 cat(sprintf( 4738 cat(sprintf(
2356 "%s < %0.2f\n\n\n\n\n", 4739 "%s < %0.2f\n\n\n\n\n",
2357 "No peptides were found to have cutoff adjusted p-value", 4740 "No peptides were found to have cutoff adjusted p-value",
2358 cutoff 4741 cutoff
2359 )) 4742 ))
2360 } 4743 }
2361 4744
2362 if (nrow(filtered_data_filtered) > 0) { 4745 if (have_remaining_peptides && nrow(filtered_data_filtered) > 0) {
2363 # Add Phosphopeptide column to anova_filtered table 4746 # Add Phosphopeptide column to anova_filtered table
2364 # The assumption here is that the first intensity is unique; 4747 # The assumption here is that the first intensity is unique;
2365 # this is a hokey assumption but almost definitely will 4748 # this is a hokey assumption but almost definitely will
2366 # be true in the real world unless there is a computation 4749 # be true in the real world unless there is a computation
2367 # error upstream. 4750 # error upstream.
2390 by.y = "Phosphopeptide" 4773 by.y = "Phosphopeptide"
2391 ) 4774 )
2392 4775
2393 # Produce heatmap to visualize significance and the effect of imputation 4776 # Produce heatmap to visualize significance and the effect of imputation
2394 4777
2395 anova_filtered_merge_format <- sapply(
2396 X = filtered_p$fdr_adjusted_anova_p
2397 ,
2398 FUN = function(x) {
2399 if (x > 0.01)
2400 paste0("%s (%0.", 1 + ceiling(-log10(x)), "f)")
2401 else
2402 paste0("%s (%0.2e)")
2403 }
2404 )
2405
2406 cat_hm_heading <- function(m, cutoff) { 4778 cat_hm_heading <- function(m, cutoff) {
2407 if (nrow(m) > intensity_hm_rows) { 4779 if (nrow(m) > g_intensity_hm_rows) {
2408 cat("\\newpage\n") 4780 cat("\\clearpage\n")
2409 subsection_header( 4781 cat(subsection_header(
2410 paste( 4782 paste(
2411 sprintf("Heatmap for the %d most-significant peptides", 4783 sprintf("Heatmap for the %d most-significant peptides",
2412 intensity_hm_rows), 4784 g_intensity_hm_rows),
2413 sprintf("whose adjusted p-value < %0.2f\n", cutoff) 4785 sprintf("whose adjusted p-value < %0.2f\n", cutoff)
2414 ) 4786 )
2415 ) 4787 ))
2416 } else { 4788 } else {
2417 if (nrow(m) == 0) { 4789 if (nrow(m) == 0) {
2418 return(FALSE) 4790 return(FALSE)
2419 } else { 4791 } else {
2420 subsection_header( 4792 cat(subsection_header(
2421 paste( 4793 paste(
2422 sprintf("Heatmap for %d usable peptides whose", nrow(m)), 4794 sprintf("Heatmap for %d usable peptide genes whose", nrow(m)),
2423 sprintf("adjusted p-value < %0.2f\n", cutoff) 4795 sprintf("adjusted p-value < %0.2f\n", cutoff)
2424 ) 4796 )
2425 ) 4797 ))
2426 } 4798 }
2427 } 4799 }
2428 cat("\n\n\n") 4800 cat("\n\n\n")
2429 cat("\n\n\n") 4801 cat("\n\n\n")
2430 return(TRUE) 4802 return(TRUE)
2431 } 4803 }
2432 4804
2433 # construct matrix with appropriate rownames 4805 # construct matrix with appropriate rownames
2434 m <- 4806 m <-
2435 as.matrix(unimputed_quant_data_log[anova_filtered_merge_order, ]) 4807 as.matrix(unimputed_quant_data_log[anova_filtered_merge_order, ])
2436 if (nrow(m) > 0) { 4808 nrow_m <- nrow(m)
4809 ncol_m <- ncol(m)
4810 if (nrow_m > 0) {
2437 rownames_m <- rownames(m) 4811 rownames_m <- rownames(m)
2438 rownames(m) <- sapply( 4812 q <- data.frame(pepname = rownames_m)
2439 X = seq_len(nrow(m)) 4813 g <- sqldf("
2440 , 4814 SELECT q.pepname, substr(met.Gene_Name, 1, 30) AS gene_name
4815 FROM q, metadata_plus_p AS met
4816 WHERE q.pepname = met.Phosphopeptide
4817 ORDER BY q.rowid
4818 ")
4819 tmp <- sapply(
4820 X = seq_len(nrow(g)),
4821 FUN = function(i) {
4822 pre <- strsplit(g$gene_name[i], "; ")[[1]]
4823 rslt <- paste(unique(pre), sep = "; ")
4824 return(rslt)
4825 }
4826 )
4827 tmp <- unlist(tmp)
4828 tmp <-
4829 make.names(tmp, unique = TRUE)
4830 tmp <- sub(
4831 "No_Gene_Name",
4832 "not_found",
4833 tmp,
4834 fixed = TRUE
4835 )
4836 ten_trunc_names <- trunc_ppep(rownames_m)
4837 tmp <- sapply(
4838 X = seq_len(nrow_m),
2441 FUN = function(i) { 4839 FUN = function(i) {
2442 sprintf( 4840 sprintf(
2443 anova_filtered_merge_format[i], 4841 "(%s) %s",
2444 rownames_m[i], 4842 tmp[i],
2445 signif(filtered_p$fdr_adjusted_anova_p[i], 2) 4843 ten_trunc_names[i]
2446 ) 4844 )
2447 } 4845 }
2448 ) 4846 )
4847 rownames(m) <- tmp
2449 } 4848 }
2450 # draw the heading and heatmap 4849 # draw the heading and heatmap
2451 if (nrow(m) > 0) { 4850 if (nrow_m > 0) {
2452 number_of_peptides_found <- 4851 number_of_peptides_found <-
2453 draw_ppep_heatmap( 4852 ppep_heatmap(
2454 m = m, 4853 m = m,
2455 cutoff = cutoff, 4854 cutoff = cutoff,
2456 hm_heading_function = cat_hm_heading, 4855 hm_heading_function = cat_hm_heading,
2457 hm_main_title = 4856 hm_main_title =
2458 "log(intensities), row-scaled, unimputed, unnormalized", 4857 "log(intensities), row-scaled, unimputed, unnormalized",
2459 suppress_row_dendrogram = FALSE 4858 suppress_row_dendrogram = FALSE,
4859 master_cex = 0.35,
4860 sepcolor = "black",
4861 colsep = sample_colsep
2460 ) 4862 )
2461 if (number_of_peptides_found > 1) { 4863 if (number_of_peptides_found > 1) {
2462 cat("\\leavevmode\n") 4864 cat("\\leavevmode\n")
2463 cat("The adjusted ANOVA \\textit{p}-value is shown in parentheses
2464 after the phosphopeptide sequence.\n\n")
2465 } 4865 }
2466 } 4866 }
2467 } 4867 }
2468 } 4868 }
2469 } 4869 }
4870 cat(filter_result_string)
2470 cat("\\leavevmode\n") 4871 cat("\\leavevmode\n")
4872
4873 if (!g_can_run_ksea) {
4874 errmsg <- paste("Cannot proceed with KSEA analysis",
4875 "because too many values are missing.")
4876 if (FALSE) cat0(
4877 errmsg,
4878 "\\stepcounter{offset}\n",
4879 "\\stepcounter{offset}\n",
4880 "\\stepcounter{offset}\n",
4881 " in ",
4882 table_href(),
4883 ".\n\n"
4884 )
4885 if (FALSE) {
4886 if (print_nb_messages) nbe(see_variable(p_value_data))
4887 } else {
4888 if (print_nb_messages) nbe(see_variable(p_value_data))
4889
4890 display_p_value_data <- p_value_data
4891 display_p_value_data$raw_anova_p <-
4892 sprintf("%0.3g", display_p_value_data$raw_anova_p)
4893 display_p_value_data$fdr_adjusted_anova_p <-
4894 sprintf("%0.3g", display_p_value_data$fdr_adjusted_anova_p)
4895 display_p_value_data$quality <-
4896 sprintf("%0.3g", display_p_value_data$quality)
4897
4898 headers_1st_line <-
4899 c("", "Raw ANOVA", "FDR-adj.", "Missing", "Min. #", "", "")
4900 headers_2nd_line <-
4901 c("Phosphopeptide", "p-value", "p-value", "values", "group-obs", "Quality", "Ranking")
4902 data_frame_tabbing_latex(
4903 x = display_p_value_data,
4904 tabstops = c(2.75, 0.80, 0.80, 0.5, 0.6, 0.60),
4905 use_subsubsection_header = FALSE,
4906 headings = c(headers_1st_line, headers_2nd_line),
4907 caption = "ANOVA results"
4908
4909 )
4910 }
4911 data_frame_tabbing_latex(
4912 x = save_sample_treatment_df,
4913 tabstops = c(1.25, 1.25),
4914 caption = "Sample classes",
4915 use_subsubsection_header = FALSE
4916 )
4917 param_df_exit()
4918 knitr::knit_exit()
4919 return(invisible(-1))
4920 }
4921
2471 ``` 4922 ```
2472 4923
2473 ```{r sqlite, echo = FALSE, fig.dim = c(9, 10), results = 'asis'} 4924 ```{r sqlite, echo = FALSE, fig.dim = c(9, 10), results = 'asis'}
2474 4925
2475 if (count_of_treatment_levels > 1) { 4926 if (g_can_run_ksea && count_of_treatment_levels > 1) {
2476 # Prepare two-way contrasts with adjusted p-values 4927 # Prepare two-way contrasts with adjusted p-values
2477 # Strategy: 4928 # Strategy:
2478 # - use imputed, log-transformed data: 4929 # - use imputed, log-transformed data:
2479 # - remember this when computing log2(fold-change) 4930 # - remember this when computing log2(fold-change)
2480 # - each contrast is between a combination of trt levels 4931 # - each contrast is between a combination of trt levels
2486 # - adjust p-value, assuming that 4937 # - adjust p-value, assuming that
2487 # (# of pppeps)*(# of contrasts) tests were performed 4938 # (# of pppeps)*(# of contrasts) tests were performed
2488 4939
2489 # Each contrast is between a combination of trt levels 4940 # Each contrast is between a combination of trt levels
2490 m2 <- combn( 4941 m2 <- combn(
2491 x = seq_len(length(levels(sample_treatment_levels))), 4942 x = seq_len(length(levels(smpl_trt))),
2492 m = 2, 4943 m = 2,
2493 simplify = TRUE 4944 simplify = TRUE
2494 ) 4945 )
2495 contrast_count <- ncol(m2) 4946 contrast_count <- ncol(m2)
2496 4947
2500 f_m2 <- 4951 f_m2 <-
2501 function(cntrst, lvl1, lvl2) { 4952 function(cntrst, lvl1, lvl2) {
2502 return( 4953 return(
2503 data.frame( 4954 data.frame(
2504 contrast = cntrst, 4955 contrast = cntrst,
2505 level = sample_treatment_levels[ 4956 level = smpl_trt[
2506 sample_treatment_levels %in% 4957 smpl_trt %in%
2507 levels(sample_treatment_levels)[c(lvl1, lvl2)] 4958 levels(smpl_trt)[c(lvl1, lvl2)]
2508 ], 4959 ],
2509 label = sample_name_matches[ 4960 label = sample_name_matches[
2510 sample_treatment_levels %in% 4961 smpl_trt %in%
2511 levels(sample_treatment_levels)[c(lvl1, lvl2)] 4962 levels(smpl_trt)[c(lvl1, lvl2)]
2512 ] 4963 ]
2513 ) 4964 )
2514 ) 4965 )
2515 } 4966 }
2516 # - compute a df for each contrast 4967 # - compute a df for each contrast
2686 ; 5137 ;
2687 " 5138 "
2688 ) 5139 )
2689 5140
2690 # - create contrast-metadata table 5141 # - create contrast-metadata table
5142 if (print_nb_messages) nbe("CREATE TABLE contrast_lvl_lvl_metadata")
2691 dml_no_rows_exec(db, " 5143 dml_no_rows_exec(db, "
2692 CREATE TABLE contrast_lvl_lvl_metadata 5144 CREATE TABLE contrast_lvl_lvl_metadata
2693 AS 5145 AS
2694 SELECT DISTINCT 5146 SELECT DISTINCT
2695 a.contrast AS ab_contrast, 5147 a.contrast AS ab_contrast,
2796 rownames(grouping_factor) <- grouping_factor$sample 5248 rownames(grouping_factor) <- grouping_factor$sample
2797 grouping_factor <- grouping_factor[, "level", drop = FALSE] 5249 grouping_factor <- grouping_factor[, "level", drop = FALSE]
2798 5250
2799 # - run the two-level (one-way) test 5251 # - run the two-level (one-way) test
2800 p_value_data_contrast_ps <- 5252 p_value_data_contrast_ps <-
2801 apply( 5253 row_apply(
2802 X = contrast_cast_data, 5254 x = contrast_cast_data,
2803 MARGIN = 1, # apply to rows 5255 fun = anova_func,
2804 FUN = anova_func,
2805 grouping_factor = 5256 grouping_factor =
2806 as.factor(grouping_factor$level), # anova_func arg2 5257 as.factor(grouping_factor$level), # anova_func arg2
2807 one_way_f = one_way_two_categories, # anova_func arg3 5258 one_way_f = one_way_two_categories, # anova_func arg3
2808 simplify = TRUE # TRUE is the default for simplify 5259 simplify = TRUE # TRUE is the default for simplify
2809 ) 5260 )
3013 AND NOT m.`Gene` = 'No_Gene_Name' 5464 AND NOT m.`Gene` = 'No_Gene_Name'
3014 AND NOT v.log2_fc = 0 5465 AND NOT v.log2_fc = 0
3015 ; 5466 ;
3016 " 5467 "
3017 ) 5468 )
5469 # We are done with DDL and insertion
5470 RSQLite::dbDisconnect(db)
3018 } 5471 }
3019 ``` 5472 ```
3020 5473
3021 ```{r echo = FALSE, results = 'asis'} 5474 ```{r echo = FALSE, results = 'asis'}
3022 cat("\\newpage\n") 5475 cat("\\newpage\n")
3023 ``` 5476 ```
3024 5477
3025 # KSEA Analysis 5478 # KSEA Analysis Summaries
3026 5479
3027 Results of Kinase-Substrate Enrichment Analysis are presented here, if the substrates for any kinases are relatively enriched. Enrichments are found by the CRAN `KSEAapp` package: 5480 Results of Kinase-Substrate Enrichment Analysis are presented here, if the substrates for any kinases are relatively enriched. Enrichments are found by the CRAN `KSEAapp` package:
3028 5481
3029 - The package is available on CRAN, at https:/cran.r-project.org/package=KSEAapp 5482 - The package is available on CRAN, at https:/cran.r-project.org/package=KSEAapp
3030 - The method used is described in Casado et al. (2013) [doi:10.1126/scisignal.2003573](https:/doi.org/10.1126/scisignal.2003573) and Wiredja et al (2017) [doi:10.1093/bioinformatics/btx415](https:/doi.org/10.1093/bioinformatics/btx415). 5483 - The method used is described in Casado et al. (2013) [doi:10.1126/scisignal.2003573](https:/doi.org/10.1126/scisignal.2003573) and Wiredja et al (2017) [doi:10.1093/bioinformatics/btx415](https:/doi.org/10.1093/bioinformatics/btx415).
3031 - An online alternative (supporting only analysis of human data) is available at [https:/casecpb.shinyapps.io/ksea/](https:/casecpb.shinyapps.io/ksea/). 5484 - An online alternative (supporting only analysis of human data) is available at [https:/casecpb.shinyapps.io/ksea/](https:/casecpb.shinyapps.io/ksea/).
3032 5485
3033 For each kinase, $i$, and each two-way contrast of treatments, $j$, an enrichment $z$-score is computed as: 5486 For each kinase, $i$, and each two-way contrast of treatments, $j$, an enrichment $z$-score is computed as:
3034 5487
3035 $$ 5488 $$
3036 \text{kinase enrichment score}_{j,i} = \frac{(\overline{s}_{j,i} - \overline{p}_j)\sqrt{m_{j,i}}}{\delta_j} 5489 \text{kinase enrichment }z\text{-score}_{j,i} = \frac{(\overline{`r sfc`}_{j,i} - \overline{`r pfc`}_j)\sqrt{m_{j,i}}}{\delta_j}
3037 $$ 5490 $$
3038 5491
3039 and fold-enrichment is computed as: 5492 and fold-enrichment is computed as:
3040 5493
3041 $$ 5494 $$
3042 \text{Enrichment}_{j,i} = \frac{\overline{s}_{j,i}}{\overline{p}_j} 5495 \text{Enrichment}_{j,i} = \frac{\overline{`r sfc`}_{j,i}}{\overline{`r pfc`}_j}
3043 $$ 5496 $$
3044 5497
3045 where: 5498 where:
3046 5499
3047 - $\overline{s}_{j,i}$ is the mean $\log_2 (|\text{fold-change|})$ in intensities (for contrast $j$) of known substrates of the kinase $i$, 5500 - $\overline{`r sfc`}_{j,i}$ is the mean `r pfc_txt` in intensities of known substrates of the kinase $i$ in contrast $j$,
3048 - $\overline{p}_j$ is the mean $\log_2 (|\text{fold-change}|)$ of all phosphosites identified in contrast $j$, and 5501 - $\overline{`r pfc`}_j$ is the mean `r pfc_txt` of all phosphosites identified in contrast $j$, and
3049 - $m_{j,i}$ is the total number of phosphosite substrates of kinase $i$ identified in contrast $j$, 5502 - $m_{j,i}$ is the total number of phosphosite substrates of kinase $i$ identified in contrast $j$,
3050 - $\delta_j$ is the standard deviation of the $\log_2 (|\text{fold-change}|)$ for contrast $j$ across all phosphosites in the dataset. 5503 - $\delta_j$ is the standard deviation of the $\log_2 (\text{fold-change})$ for contrast $j$ across all phosphosites in the dataset.
3051 - Note that the absolute value of fold-change is used so that both increased and decreased substrates of a kinase will contribute to its enrichment score. 5504 - Note that the absolute value of fold-change is used so that both increased and decreased substrates of a kinase will contribute to its enrichment score.
3052 5505
3053 $\text{FDR}_{j,i}$ is computed from the $p$-value for the z-score using the R `stats::p.adjust` function, applying the False Discovery Rate correction from Benjamini and Hochberg (1995) [doi:10.1111/j.2517-6161.1995.tb02031.x](https:/doi.org/10.1111/j.2517-6161.1995.tb02031.x) 5506 $\text{FDR}_{j,i}$ is the False Discovery Rate corrected kinase enrichment score.
3054 5507
3055 Color intensity in heatmaps reflects magnitude of $z$-score for enrichment of respective kinase in respective contrast; hue reflects the sign of the $z$-score (blue, negative; red, positive). 5508 Color intensity in heatmaps reflects magnitude of $z$-score for enrichment of respective kinase in respective contrast; hue reflects the sign of the $z$-score (blue, negative; red, positive).
3056 5509
3057 Asterisks in heatmaps reflect enrichments that are significant at `r ksea_cutoff_statistic` < `r ksea_cutoff_threshold`. 5510 Asterisks in heatmaps reflect enrichments that are significant at `r ksea_cutoff_statistic` < `r ksea_cutoff_threshold`.
3058 5511
3059 - Kinase names are generally as presented at Phospho.ELM [http://phospho.elm.eu.org/kinases.html](http://phospho.elm.eu.org/kinases.html) (when available), although Phospho.ELM data are not yet incorporated into this analysis. 5512 - Kinase names are generally as presented at Phospho.ELM [http://phospho.elm.eu.org/kinases.html](http://phospho.elm.eu.org/kinases.html) (when available), although Phospho.ELM data are not yet incorporated into this analysis.
3060 - Kinase names having the suffix '(HPRD)' are as presented at [http://hprd.org/serine_motifs](http://hprd.org/serine_motifs) and [http://hprd.org/tyrosine_motifs](http://hprd.org/tyrosine_motifs) and are as originally reported in the Amanchy et al., 2007 (doi: [10.1038/nbt0307-285](https://doi.org/10.1038/nbt0307-285)). 5513 - Kinase names having the suffix '(HPRD)' are as presented at [http://hprd.org/serine_motifs](http://hprd.org/serine_motifs) and [http://hprd.org/tyrosine_motifs](http://hprd.org/tyrosine_motifs) and are as originally reported in the Amanchy et al., 2007 (doi: [10.1038/nbt0307-285](https://doi.org/10.1038/nbt0307-285)).
3061 - Kinase-strate deata were also taken from [http://networkin.science/download.shtml](http://networkin.science/download.shtml) and from PhosphoSitePlus [https://www.phosphosite.org/staticDownloads](https://www.phosphosite.org/staticDownloads). 5514 - Kinase-substrate data were also taken from [http://networkin.science/download.shtml](http://networkin.science/download.shtml) and from PhosphoSitePlus [https://www.phosphosite.org/staticDownloads](https://www.phosphosite.org/staticDownloads).
3062 5515
3063 ```{r ksea, echo = FALSE, fig.dim = c(9, 10), results = 'asis'} 5516 For each enriched kinase, a heatmap showing the intensities is presented for up to `r g_intensity_hm_rows` substrates, i.e., those substrates having the highest"quality".
5517
5518 Where possible, a heatmap of the correlations among these the selected substrates is also presented; if correlations cannot be computed (because of too many missing values), then the covariances are heatmapped for substrates having a variance greater than 1.
5519
5520 ```{r ksea, echo = FALSE, fig.dim = c(12, 14.5), results = 'asis'}
5521 cat("\\clearpage\n")
3064 5522
3065 db <- RSQLite::dbConnect(RSQLite::SQLite(), ksea_app_prep_db) 5523 db <- RSQLite::dbConnect(RSQLite::SQLite(), ksea_app_prep_db)
3066 5524
3067 # -- eliminate the table that's about to be defined 5525 # -- eliminate the table that's about to be defined
3068 ddl_exec(db, " 5526 ddl_exec(db, "
3170 sub_title <- contrast_longlabel 5628 sub_title <- contrast_longlabel
3171 tryCatch( 5629 tryCatch(
3172 expr = { 5630 expr = {
3173 ksea_scores_rslt <- 5631 ksea_scores_rslt <-
3174 ksea_scores( 5632 ksea_scores(
3175 ksdata = pseudo_ksdata, # KSEAapp::KSData, 5633 ksdata = pseudo_ksdata,
3176 px = kseaapp_input, 5634 px = kseaapp_input,
3177 networkin = TRUE, 5635 networkin = TRUE,
3178 networkin_cutoff = 2 5636 networkin_cutoff = 2,
5637 minimum_substrate_count = ksea_min_substrate_count
3179 ) 5638 )
5639
5640 if (FALSE) {
5641 ksea_scores_rslt <-
5642 ksea_scores_rslt[
5643 ksea_scores_rslt$m >= ksea_min_substrate_count,
5644 ,
5645 drop = FALSE
5646 ]
5647 }
5648
5649 if (FALSE) {
5650 data_frame_tabbing_latex(
5651 x = ksea_scores_rslt,
5652 tabstops = c(0.8, 0.8, 0.8, 0.8, 0.8, 0.8),
5653 caption = paste("KSEA scores for contrast ",
5654 cntrst_b_level, "to", cntrst_a_level),
5655 use_subsubsection_header = FALSE
5656 )
5657 }
5658
5659 if (FALSE) {
5660 if (print_nb_messages) nbe("Output contents of `ksea_scores_rslt` table\n")
5661 cat_variable(ksea_scores_rslt)
5662 cat("\n\\clearpage\n")
5663 }
3180 5664
3181 if (0 < sum(!is.nan(ksea_scores_rslt$FDR))) { 5665 if (0 < sum(!is.nan(ksea_scores_rslt$FDR))) {
3182 next_index <- 1 + next_index 5666 next_index <- 1 + next_index
3183 rslt$score_list[[next_index]] <- ksea_scores_rslt 5667 rslt$score_list[[next_index]] <- ksea_scores_rslt
3184 rslt$name_list[[next_index]] <- contrast_label 5668 rslt$name_list[[next_index]] <- contrast_label
3185 rslt$longname_list[[next_index]] <- contrast_longlabel 5669 rslt$longname_list[[next_index]] <- contrast_longlabel
3186 low_fdr_print( 5670 ksea_low_fdr_print(
3187 rslt = rslt, 5671 rslt = rslt,
3188 i_cntrst = i_cntrst, 5672 i_cntrst = i_cntrst,
3189 i = next_index, 5673 i = next_index,
3190 a_level = cntrst_a_level, 5674 a_level = cntrst_a_level,
3191 b_level = cntrst_b_level, 5675 b_level = cntrst_b_level,
3192 fold_change = cntrst_fold_change, 5676 fold_change = cntrst_fold_change,
3193 caption = contrast_longlabel 5677 caption = contrast_longlabel
3194 ) 5678 )
3195 } 5679 }
3196 }, 5680 },
3197 error = function(e) str(e) 5681 error = function(e) {
5682 str(e)
5683 cat_margins()
5684 }
3198 ) 5685 )
3199 } 5686 }
3200 5687
3201 plotted_kinases <- NULL 5688 plotted_kinases <- NULL
3202 if (length(rslt$score_list) > 1) { 5689 if (g_can_run_ksea && length(rslt$score_list) > 1) {
3203 for (i in seq_len(length(ksea_heatmap_titles))) { 5690 for (i in seq_len(length(ksea_heatmap_titles))) {
3204 hdr <- ksea_heatmap_titles[[i]] 5691 hdr <- ksea_heatmap_titles[[i]]
3205 which_kinases <- i 5692 which_kinases <- i
3206 5693
3207 cat("\\clearpage\n\\begin{center}\n") 5694 cat("\\clearpage\n\\begin{center}\n")
3208 if (i == const_ksea_astrsk_kinases) { 5695 if (i == const_ksea_astrsk_kinases) {
3209 subsection_header(hdr) 5696 cat(subsection_header(hdr))
3210 } else { 5697 } else {
3211 subsection_header(hdr) 5698 cat(subsection_header(hdr))
3212 } 5699 }
3213 cat("\\end{center}\n") 5700 cat("\\end{center}\n")
3214 5701
3215 plotted_kinases <- ksea_heatmap( 5702 plotted_kinases <- ksea_heatmap(
3216 # the data frame outputs from the KSEA.Scores() function, in list format 5703 # the data frame outputs from the KSEA.Scores() function, in list format
3225 # a numeric value between 0 and infinity indicating the min. number of 5712 # a numeric value between 0 and infinity indicating the min. number of
3226 # substrates a kinase must have to be included in the heatmap 5713 # substrates a kinase must have to be included in the heatmap
3227 m_cutoff = 1, 5714 m_cutoff = 1,
3228 # a numeric value between 0 and 1 indicating the p-value/FDR cutoff 5715 # a numeric value between 0 and 1 indicating the p-value/FDR cutoff
3229 # for indicating significant kinases in the heatmap 5716 # for indicating significant kinases in the heatmap
3230 p_cutoff = 0.05, 5717 p_cutoff = params$kseaCutoffThreshold,
3231 # a binary input of TRUE or FALSE, indicating whether or not to perform 5718 # a binary input of TRUE or FALSE, indicating whether or not to perform
3232 # hierarchical clustering of the sample columns 5719 # hierarchical clustering of the sample columns
3233 sample_cluster = TRUE, 5720 sample_cluster = TRUE,
3234 # a binary input of TRUE or FALSE, indicating whether or not to export 5721 # a binary input of TRUE or FALSE, indicating whether or not to export
3235 # the heatmap as a .png image into the working directory 5722 # the heatmap as a .png image into the working directory
3242 ylab = "Kinase", 5729 ylab = "Kinase",
3243 # print which kinases: 5730 # print which kinases:
3244 # - 1 : all kinases 5731 # - 1 : all kinases
3245 # - 2 : significant kinases 5732 # - 2 : significant kinases
3246 # - 3 : non-significant kinases 5733 # - 3 : non-significant kinases
3247 which_kinases = which_kinases 5734 which_kinases = which_kinases,
5735 margins = c(7, 15)
3248 ) 5736 )
3249 if (!is.null(plotted_kinases)) { 5737 if (!is.null(plotted_kinases)) {
3250 cat("\\begin{center}\n") 5738 cat("\\begin{center}\n")
3251 cat("Color intensity reflects $z$-score magnitudes; hue reflects $z$-score sign.\n")
3252 if (which_kinases != const_ksea_nonastrsk_kinases) 5739 if (which_kinases != const_ksea_nonastrsk_kinases)
3253 cat("Asterisks reflect significance.\n") 5740 cat("Asterisks reflect significance.\n")
3254 cat("\\end{center}\n") 5741 cat("\\end{center}\n")
3255 } 5742 }
3256 } # end for (i in ... 5743 } # end for (i in ...
3257 } # end if (length ... 5744 } # end if (length ...
3258 5745 ```
3259 for (i_cntrst in seq_len(length(rslt$score_list))) { 5746
3260 next_index <- i_cntrst 5747 ```{r kseabar_calc, echo = FALSE, fig.dim = c(9.5, 6), results = 'asis'}
3261 cntrst_a_level <- contrast_metadata_df[i_cntrst, "a_level"] 5748 ksea_prints <- list()
3262 cntrst_b_level <- contrast_metadata_df[i_cntrst, "b_level"] 5749 ksea_barplots <- list()
3263 cntrst_fold_change <- contrast_metadata_df[i_cntrst, 6] 5750 for (i_cntrst in seq_len(length(rslt$score_list))) {
3264 contrast_label <- sprintf("%s -> %s", cntrst_b_level, cntrst_a_level) 5751 next_index <- i_cntrst
3265 contrast_longlabel <- ( 5752 cntrst_a_level <- contrast_metadata_df[i_cntrst, "a_level"]
5753 cntrst_b_level <- contrast_metadata_df[i_cntrst, "b_level"]
5754 cntrst_fold_change <- contrast_metadata_df[i_cntrst, 6]
5755 contrast_label <- sprintf("%s -> %s", cntrst_b_level, cntrst_a_level)
5756 contrast_longlabel <- (
5757 sprintf(
5758 "Class %s -> Class %s",
5759 contrast_metadata_df[i_cntrst, "b_level"],
5760 contrast_metadata_df[i_cntrst, "a_level"]
5761 )
5762 )
5763 main_title <- (
5764 sprintf(
5765 "Change from treatment %s to treatment %s",
5766 contrast_metadata_df[i_cntrst, "b_level"],
5767 contrast_metadata_df[i_cntrst, "a_level"]
5768 )
5769 )
5770 sub_title <- contrast_longlabel
5771 tryCatch(
5772 expr = {
5773 ksea_scores_rslt <- rslt$score_list[[next_index]]
5774 if (print_nb_messages) nbe(see_variable(ksea_scores_rslt)) #ACE
5775
5776 if (0 < sum(!is.nan(ksea_scores_rslt$FDR))) {
5777 sink(deferred <- file())
5778 ksea_low_fdr_print(
5779 rslt = rslt,
5780 i_cntrst = i_cntrst,
5781 i = next_index,
5782 a_level = cntrst_a_level,
5783 b_level = cntrst_b_level,
5784 fold_change = cntrst_fold_change,
5785 caption = contrast_longlabel,
5786 write_db = FALSE,
5787 anchor = const_table_anchor_t
5788 )
5789 cat("\n")
5790 sink()
5791 lines <-
5792 paste(
5793 readLines(deferred, warn = FALSE),
5794 collapse = "\n"
5795 )
5796 close(deferred)
5797 sq_put(ksea_prints, lines)
5798 sink(stderr())
5799 cat("\n---\n")
5800 cat_variable(ksea_prints)
5801 barplot_closure <-
5802 ksea_low_fdr_barplot_factory(
5803 rslt = rslt,
5804 i_cntrst = i_cntrst,
5805 i = next_index,
5806 a_level = cntrst_a_level,
5807 b_level = cntrst_b_level,
5808 fold_change = cntrst_fold_change,
5809 caption = contrast_longlabel
5810 )
5811 if (rlang::is_closure(barplot_closure))
5812 sq_put(ksea_barplots, barplot_closure)
5813 else
5814 sq_put(ksea_barplots, no_op)
5815 str(ksea_barplots)
5816 cat("\n...\n")
5817 sink()
5818 }
5819 },
5820 error = function(e) {
5821 str(e)
5822 cat_margins()
5823 }
5824 )
5825 }
5826 ```
5827
5828 ```{r phosphoelm_kinase_upid_desc, echo = FALSE, fig.dim = c(12, 13.7), results = 'asis'}
5829
5830 have_kinase_descriptions <-
5831 if (!is.null(bzip2df(kinase_uprt_desc_lut, kinase_uprt_desc_lut_bz2)) &&
5832 !is.null(bzip2df(kinase_name_uprt_lut, kinase_name_uprt_lut_bz2))
5833 ) {
5834 rownames(kinase_uprt_desc_lut) <- kinase_uprt_desc_lut$UniProtID
5835 kinase_name_to_desc_uprt <- function(s) {
5836 rslt <- NULL
5837 tryCatch(
5838 {
5839 which_rows <- eval(s == kinase_name_uprt_lut$kinase)
5840 kinase_uprtid <-
5841 kinase_name_uprt_lut[which_rows, 2]
5842 # filter for first _HUMAN match if any
5843 grepl_human <- grepl("_HUMAN$", kinase_uprtid)
5844 if (0 < sum(grepl_human))
5845 kinase_uprtid <- kinase_uprtid[grepl_human]
5846 # filter for first match if any
5847 if (0 < length(kinase_uprtid)) {
5848 kinase_uprtid <- kinase_uprtid[1]
5849 kinase_desc <- kinase_uprt_desc_lut[kinase_uprtid, 2]
5850 if (!is.na(kinase_desc))
5851 rslt <- c(kinase_desc, kinase_uprtid)
5852 else
5853 rslt <- c(kinase_desc, "")
5854 }
5855 },
5856 warning = str
5857 )
5858 rslt
5859 }
5860 TRUE
5861 } else {
5862 kinase_name_to_desc_uprt <- function(s) NULL
5863 FALSE
5864 }
5865 ```
5866
5867 ```{r write_params, echo = FALSE, results = 'asis'}
5868 # perhaps this should be moved into the functions section, eventually ...
5869 write_params <- function(db) {
5870 # write parameters to report
5871
5872 # write parameters to SQLite output
5873
5874 mqppep_anova_script_param_df <- data.frame(
5875 script = "mqppep_anova_script.Rmd",
5876 parameter = names(param_unlist),
5877 value = param_unlist
5878 )
5879 ddl_exec(db, "
5880 DROP TABLE IF EXISTS script_parameter;
5881 "
5882 )
5883 ddl_exec(db, "
5884 CREATE TABLE IF NOT EXISTS script_parameter(
5885 script TEXT,
5886 parameter TEXT,
5887 value ANY,
5888 UNIQUE (script, parameter) ON CONFLICT REPLACE
5889 )
5890 ;
5891 "
5892 )
5893 RSQLite::dbWriteTable(
5894 conn = db,
5895 name = "script_parameter",
5896 value = mqppep_anova_script_param_df,
5897 append = TRUE
5898 )
5899
5900 loaded_packages_df <- sessioninfo::package_info("loaded")
5901 loaded_packages_df[, "library"] <- as.character(loaded_packages_df$library)
5902 loaded_packages_df <- data.frame(
5903 package = loaded_packages_df$package,
5904 version = loaded_packages_df$loadedversion,
5905 date = loaded_packages_df$date
5906 )
5907 #ACE cat("\\clearpage\n\\section{R package versions}\n")
5908 #ACE data_frame_tabbing_latex(
5909 #ACE x = loaded_packages_df,
5910 #ACE tabstops = c(2.5, 1.25),
5911 #ACE caption = "R package versions"
5912 #ACE )
5913 cat("\\clearpage\n\\section{Input parameter settings}\n")
5914 data_frame_tabbing_latex(
5915 x = param_df,
5916 tabstops = c(1.75),
5917 underscore_whack = TRUE,
5918 caption = "Input parameters",
5919 verbatim = FALSE
5920 )
5921 }
5922
5923 if (!have_kinase_descriptions) {
5924 write_params(db)
5925 # We are done with output
5926 RSQLite::dbDisconnect(db)
5927 param_df_exit()
5928 knitr::knit_exit()
5929 }
5930 ```
5931
5932 ```{r kseabar, echo = FALSE, fig.dim = c(9.5, 12.3), results = 'asis'}
5933 if (have_kinase_descriptions) {
5934 my_section_header <-
3266 sprintf( 5935 sprintf(
3267 "Class %s -> Class %s", 5936 "inases whose KSEA %s < %s\n",
3268 contrast_metadata_df[i_cntrst, "b_level"], 5937 ksea_cutoff_statistic,
3269 contrast_metadata_df[i_cntrst, "a_level"] 5938 signif(ksea_cutoff_threshold, 2)
3270 ) 5939 )
3271 ) 5940
3272 main_title <- ( 5941 # Use enriched kinases to find enriched kinase-substrate pairs
5942 enriched_kinases <- data.frame(kinase = ls(ksea_asterisk_hash))
5943
5944 enriched_kinase_descs <-
5945 Reduce(
5946 f = function(l, r) {
5947 lkup <- kinase_name_to_desc_uprt(r)
5948 if (is.null(lkup)) l
5949 else r2 <- rbind(
5950 l,
5951 data.frame(
5952 kinase = r,
5953 uniprot_id = lkup[2],
5954 description = lkup[1]
5955 )
5956 )
5957 },
5958 x = enriched_kinases$kinase,
5959 init = NULL
5960 )
5961
5962 if (length(enriched_kinase_descs) > 0 && nrow(enriched_kinase_descs) > 0) {
5963 cat("\n\\clearpage\n")
5964 data_frame_tabbing_latex(
5965 x = enriched_kinase_descs,
5966 tabstops = c(0.9, 1.3),
5967 headings = c("Kinase", "UniProt ID", "Description"),
5968 caption = paste0("Descriptions of k", my_section_header)
5969 )
5970 }
5971
5972 if (FALSE) {
5973 cat_variable(sqldf("SELECT kinase FROM enriched_kinases"))
5974 cat_variable(sqldf("
5975 SELECT DISTINCT gene, sub_gene, SUB_MOD_RSD AS ppep
5976 FROM pseudo_ksdata
5977 WHERE gene IN (SELECT kinase FROM enriched_kinases)
5978 "))
5979 data_frame_table_latex(
5980 x = sqldf("
5981 SELECT DISTINCT gene, sub_gene, SUB_MOD_RSD AS ppep
5982 FROM pseudo_ksdata
5983 WHERE gene IN (SELECT kinase FROM enriched_kinases)
5984 "),
5985 justification = "l l l",
5986 centered = TRUE,
5987 caption = "substrates of enriched kinases",
5988 anchor = c(const_table_anchor_p, const_table_anchor_t),
5989 underscore_whack = TRUE
5990 )
5991 data_frame_table_latex(
5992 x = sqldf("
5993 SELECT
5994 gene AS kinase,
5995 ppep,
5996 sub_gene,
5997 '('||group_concat(gene||'-'||sub_gene)||') '||ppep AS label,
5998 fdr_adjusted_anova_p,
5999 quality,
6000 min_group_obs_count
6001 FROM (
6002 SELECT DISTINCT gene, sub_gene, SUB_MOD_RSD AS ppep
6003 FROM pseudo_ksdata
6004 WHERE gene IN (SELECT kinase FROM enriched_kinases)
6005 ),
6006 p_value_data
6007 WHERE ppep = phosphopeptide
6008 GROUP BY kinase, ppep
6009 ORDER BY kinase, ppep, p_value_data.quality DESC
6010 "),
6011 justification = "l l l l l l l",
6012 centered = TRUE,
6013 caption = "labeled substrates of enriched kinases",
6014 anchor = c(const_table_anchor_p, const_table_anchor_t),
6015 underscore_whack = TRUE
6016 )
6017 }
6018 all_enriched_substrates <- sqldf("
6019 SELECT
6020 gene AS kinase,
6021 ppep,
6022 sub_gene,
6023 '('||group_concat(gene||'-'||sub_gene)||') '||ppep AS label,
6024 fdr_adjusted_anova_p,
6025 quality,
6026 min_group_obs_count
6027 FROM (
6028 SELECT DISTINCT gene, sub_gene, SUB_MOD_RSD AS ppep
6029 FROM pseudo_ksdata
6030 WHERE gene IN (SELECT kinase FROM enriched_kinases)
6031 ),
6032 p_value_data
6033 WHERE ppep = phosphopeptide
6034 GROUP BY kinase, ppep
6035 ORDER BY kinase, ppep, p_value_data.quality DESC
6036 ")
6037
6038 all_enriched_substrates <-
6039 all_enriched_substrates[
6040 all_enriched_substrates$quality >= params$minQuality,
6041 ,
6042 drop = FALSE
6043 ]
6044
6045 all_enriched_substrates$sub_gene <-
6046 sub(
6047 " ///.*",
6048 " ...",
6049 all_enriched_substrates$sub_gene
6050 )
6051
6052 all_enriched_substrates$label <-
6053 with(
6054 all_enriched_substrates,
3273 sprintf( 6055 sprintf(
3274 "Change from treatment %s to treatment %s", 6056 "(%s-%s) %s",
3275 contrast_metadata_df[i_cntrst, "b_level"], 6057 kinase,
3276 contrast_metadata_df[i_cntrst, "a_level"] 6058 trunc_subgene(sub_gene),
3277 ) 6059 ppep
3278 ) 6060 )
3279 sub_title <- contrast_longlabel 6061 )
3280 tryCatch( 6062
3281 expr = { 6063 # this global is set to TRUE by cat_enriched_heading immediately below
3282 ksea_scores_rslt <- rslt$score_list[[next_index]] 6064 g_neednewpage <- FALSE
3283 6065
3284 if (0 < sum(!is.nan(ksea_scores_rslt$FDR))) { 6066 # helper used to label per-kinase substrate enrichment figure
3285 low_fdr_barplot( 6067 cat_enriched_heading <- function(m, cut_args) {
3286 rslt = rslt, 6068 cutoff <- cut_args$cutoff
3287 i_cntrst = i_cntrst, 6069 kinase <- cut_args$kinase
3288 i = next_index, 6070 if (g_neednewpage) cat("\\newpage\n")
3289 a_level = cntrst_a_level, 6071 g_neednewpage <- TRUE
3290 b_level = cntrst_b_level, 6072 if (nrow(m) > g_intensity_hm_rows) {
3291 fold_change = cntrst_fold_change, 6073 cat(subsection_header(
3292 caption = contrast_longlabel
3293 )
3294 }
3295 },
3296 error = function(e) str(e)
3297 )
3298 }
3299 ```
3300
3301 ```{r enriched, echo = FALSE, fig.dim = c(9, 10), results = 'asis'}
3302
3303 # Use enriched kinases to find enriched kinase-substrate pairs
3304 enriched_kinases <- data.frame(kinase = ls(ksea_asterisk_hash))
3305 all_enriched_substrates <- sqldf("
3306 SELECT
3307 gene AS kinase,
3308 ppep,
3309 sub_gene,
3310 '('||group_concat(gene||'-'||sub_gene)||') '||ppep AS label,
3311 fdr_adjusted_anova_p
3312 FROM (
3313 SELECT DISTINCT gene, sub_gene, SUB_MOD_RSD AS ppep
3314 FROM pseudo_ksdata
3315 WHERE gene IN (SELECT kinase FROM enriched_kinases)
3316 ),
3317 p_value_data
3318 WHERE ppep = phosphopeptide
3319 GROUP BY ppep
3320 ORDER BY fdr_adjusted_anova_p
3321 ")
3322
3323 # helper used to label per-kinase substrate enrichment figure
3324 cat_enriched_heading <- function(m, cut_args) {
3325 cutoff <- cut_args$cutoff
3326 kinase <- cut_args$kinase
3327 statistic <- cut_args$statistic
3328 threshold <- cut_args$threshold
3329 cat("\\newpage\n")
3330 if (nrow(m) > intensity_hm_rows) {
3331 subsection_header(
3332 paste(
3333 sprintf( 6074 sprintf(
3334 "Lowest p-valued %d (of %d) enriched %s-substrates,", 6075 "Highest-quality %d (of %d) enriched %s-substrates",
3335 intensity_hm_rows, 6076 g_intensity_hm_rows,
3336 nrow(m), 6077 nrow(m),
3337 kinase 6078 kinase
3338 ), 6079 )
3339 sprintf(" KSEA %s < %0.2f\n", statistic, threshold) 6080 ))
3340 )
3341 )
3342 } else {
3343 if (nrow(m) == 0) {
3344 return(FALSE)
3345 } else { 6081 } else {
3346 subsection_header( 6082 if (nrow(m) == 0) {
3347 paste( 6083 return(FALSE)
6084 } else {
6085 nrow_m <- nrow(m)
6086 cat(subsection_header(
3348 sprintf( 6087 sprintf(
3349 "%d enriched %s-substrates,", 6088 "%d enriched %s-substrate%s",
3350 nrow(m), 6089 nrow_m,
3351 kinase 6090 kinase,
3352 ), 6091 if (nrow_m > 1) "s" else ""
3353 sprintf(
3354 " KSEA %s < %0.2f\n",
3355 statistic,
3356 threshold
3357 ) 6092 )
6093 ))
6094 }
6095 }
6096 cat("\n\n\n")
6097 cat("\n\n\n")
6098 return(TRUE)
6099 }
6100
6101 # --------------------------------
6102 # hack begin - show all substrates
6103 enriched_substrates <- all_enriched_substrates
6104 # add "FALSE &&" to prevent listing of substrates
6105 if (show_enriched_substrates && nrow(enriched_substrates) > 0) {
6106 short_row_names <- sub(
6107 "$FAILED_MATCH_GENE_NAME",
6108 "not_found",
6109 enriched_substrates$sub_gene,
6110 fixed = TRUE
6111 )
6112
6113 if (print_nb_messages) nbe(see_variable(enriched_substrates))
6114 substrates_df <- with(
6115 enriched_substrates,
6116 data.frame(
6117 kinase = kinase,
6118 substrate = sub(" ///*", "...", short_row_names),
6119 anova_p_value = signif(fdr_adjusted_anova_p, 2),
6120 min_group_obs_count = signif(min_group_obs_count, 0),
6121 quality = signif(quality, 3),
6122 sequence = trunc_n(30)(ppep)
3358 ) 6123 )
3359 ) 6124 )
3360 } 6125
3361 } 6126 substrates_df <- substrates_df[
3362 cat("\n\n\n") 6127 with(substrates_df, order(kinase, -quality)),
3363 cat("\n\n\n") 6128 ,
3364 return(TRUE) 6129 drop = FALSE
3365 } 6130 ]
3366 6131
3367 # Disabling heatmaps for substrates pending decision whether to eliminate them altogether 6132 if (print_nb_messages) nbe(see_variable(substrates_df))
3368 if (TRUE) 6133 if (nrow(substrates_df) < 1)
6134 substrates_df$sequence <- c()
6135 if (print_nb_messages) nbe(see_variable(substrates_df))
6136 names(substrates_df) <- headers_2nd_line <-
6137 c("Kinase", "Substrate", "p-value", "per group)", "quality", "Sequence")
6138 headers_1st_line <- c("", "", "ANOVA", "min(values", "", "")
6139 data_frame_tabbing_latex(
6140 x = substrates_df,
6141 tabstops = c(1.2, 0.8, 0.5, 0.65, 0.5),
6142 headings = c(headers_1st_line, headers_2nd_line),
6143 caption = "Details for all enriched substrates of enriched kinases"
6144 )
6145 rm(
6146 enriched_substrates,
6147 substrates_df,
6148 short_row_names,
6149 headers_1st_line,
6150 headers_2nd_line
6151 )
6152 }
6153 cat("\\clearpage\n")
6154 # hack end - show all substrates
6155 # --------------------------------
6156
6157 # print deferred tables and graphs for kinases from contrasts
6158 for (i_cntrst in seq_len(length(ksea_prints))) {
6159 #latex_samepage({
6160 cat(ksea_prints[[i_cntrst]])
6161 cat("\n")
6162 ksea_barplots[[i_cntrst]]()
6163 cat("\n")
6164 cat("\\clearpage\n")
6165 #})
6166 }
6167
6168 }
6169 ```
6170
6171 ```{r enriched, echo = FALSE, fig.dim = c(12, 13.7), results = 'asis'}
6172 if (g_can_run_ksea) {
6173 g_did_enriched_header <- FALSE
3369 for (kinase_name in sort(enriched_kinases$kinase)) { 6174 for (kinase_name in sort(enriched_kinases$kinase)) {
3370 enriched_substrates <- 6175 enriched_substrates <-
3371 all_enriched_substrates[ 6176 all_enriched_substrates[
3372 all_enriched_substrates$kinase == kinase_name, 6177 all_enriched_substrates$kinase == kinase_name,
3373 , 6178 ,
3374 drop = FALSE 6179 drop = FALSE
3375 ] 6180 ]
6181 ten_trunc_ppep <- trunc_enriched_substrate(enriched_substrates$ppep)
3376 enriched_substrates$label <- with( 6182 enriched_substrates$label <- with(
3377 enriched_substrates, 6183 enriched_substrates,
3378 sprintf( 6184 sprintf(
3379 "(%s-%s) %s (%0.2g)", 6185 "(%s) %s",
3380 kinase, 6186 make.names(
3381 sub("$FAILED_MATCH_GENE_NAME", "unidentified", sub_gene, fixed = TRUE), 6187 sub("$FAILED_MATCH_GENE_NAME", "not_found", sub_gene, fixed = TRUE),
3382 ppep, 6188 unique = TRUE
3383 fdr_adjusted_anova_p 6189 ),
6190 ten_trunc_ppep
3384 ) 6191 )
3385 ) 6192 )
3386 # Get the intensity values for the heatmap 6193 # Get the intensity values for the heatmap
3387 enriched_intensities <- 6194 enriched_intensities <-
3388 as.matrix(unimputed_quant_data_log[enriched_substrates$ppep, , drop = FALSE]) 6195 as.matrix(unimputed_quant_data_log[enriched_substrates$ppep, , drop = FALSE])
6196
3389 # Remove rows having too many NA values to be relevant 6197 # Remove rows having too many NA values to be relevant
6198 good_rows <- (rowSums(enriched_intensities, na.rm = TRUE) != 0)
6199 #ACE nbe(see_variable(good_rows), "\n")
6200 enriched_substrates <- enriched_substrates[good_rows, , drop = FALSE]
6201 enriched_intensities <- enriched_intensities[good_rows, , drop = FALSE]
6202
3390 # Rename the rows with the display-name for the heatmap 6203 # Rename the rows with the display-name for the heatmap
3391 rownames(enriched_intensities) <- 6204 short_row_names <- sub(
6205 "$FAILED_MATCH_GENE_NAME",
6206 "not_found",
6207 enriched_substrates$sub_gene,
6208 fixed = TRUE
6209 )
6210 short_row_names <-
6211 make.names(short_row_names, unique = TRUE)
6212 long_row_names <-
3392 sapply( 6213 sapply(
3393 X = rownames(enriched_intensities), 6214 X = rownames(enriched_intensities),
3394 FUN = function(rn) { 6215 FUN = function(rn) {
3395 enriched_substrates[enriched_substrates$ppep == rn, "label"] 6216 enriched_substrates[enriched_substrates$ppep == rn, "label"]
3396 } 6217 }
3397 ) 6218 )
6219 rownames(enriched_intensities) <- long_row_names
3398 # Format as matrix for heatmap 6220 # Format as matrix for heatmap
3399 m <- as.matrix(enriched_intensities) 6221 m <- as.matrix(enriched_intensities)
6222 rownames(m) <- trunc_enriched_substrate(rownames(m))
6223
6224 #ACE nb("m with bad rows: ", see_variable(m), "\n")
6225 #ACE good_rows <- (rowSums(m, na.rm = TRUE) != 0)
6226 #ACE nb(see_variable(good_rows), "\n")
6227 #ACE m <- m[good_rows, , drop = FALSE]
6228 #ACE nb("m without(?) bad rows: ", see_variable(m), "\n")
6229 #ACE nb(see_variable(short_row_names), "\n")
6230 #ACE local_short_row_names <- short_row_names[good_rows]
6231 #ACE local_long_row_names <- long_row_names[good_rows]
6232 #ACE local_enriched_intensities <- enriched_intensities[local_long_row_names, ]
6233
3400 # Draw the heading and heatmap 6234 # Draw the heading and heatmap
3401 if (nrow(m) > 0) { 6235 nrow_m <- nrow(m)
6236 if (nrow_m > 0) {
6237 if (!g_did_enriched_header) {
6238 cat("\n\\clearpage\n")
6239 cat(section_header(paste0("K", my_section_header)))
6240 g_did_enriched_header <- TRUE
6241 }
6242 is_na_m <- is.na(m)
6243 cellnote_m <- is_na_m
6244 cellnote_m[!is_na_m] <- ""
6245 cellnote_m[is_na_m] <- "NA"
3402 cut_args <- new_env() 6246 cut_args <- new_env()
3403 cut_args$cutoff <- cutoff 6247 cut_args$cutoff <- cutoff
3404 cut_args$kinase <- kinase_name 6248 cut_args$kinase <- kinase_name
3405 cut_args$statistic <- ksea_cutoff_statistic 6249 cut_args$statistic <- ksea_cutoff_statistic
3406 cut_args$threshold <- ksea_cutoff_threshold 6250 cut_args$threshold <- ksea_cutoff_threshold
3407 number_of_peptides_found <- 6251 number_of_peptides_found <-
3408 draw_ppep_heatmap( 6252 ppep_heatmap(
3409 m = m, 6253 m = m,
6254 cellnote = cellnote_m,
3410 cutoff = cut_args, 6255 cutoff = cut_args,
3411 hm_heading_function = cat_enriched_heading, 6256 hm_heading_function = cat_enriched_heading,
3412 hm_main_title 6257 hm_main_title
3413 = "Unnormalized (zero-imputed) intensities of enriched kinase-substrates", 6258 = "Unnormalized (zero-imputed) intensities of enriched kinase-substrates",
3414 suppress_row_dendrogram = FALSE 6259 suppress_row_dendrogram = FALSE,
6260 master_cex = 0.35,
6261 sepcolor = "black",
6262 colsep = sample_colsep
3415 ) 6263 )
3416 if (number_of_peptides_found > 1) { 6264 if (number_of_peptides_found > 1) {
3417 cat("\\leavevmode\n") 6265
3418 cat("The kinase-subsrate pair is shown in parentheses 6266 tryCatch(
3419 before the phosphopeptide sequence.\n\n") 6267 {
3420 cat("The adjusted ANOVA \\textit{p}-value is shown in parentheses 6268 rownames(m) <- short_row_names
3421 after the phosphopeptide sequence.\n\n") 6269 cov_heatmap(m, nrow_m > g_intensity_hm_rows)
6270 },
6271 error = function(e) {
6272 cat(
6273 sprintf(
6274 "ERROR: %s\n\\newline\n",
6275 mget("e")
6276 )
6277 )
6278 cat(
6279 sprintf(
6280 "message: %s\n\\newline\n",
6281 e$message
6282 )
6283 )
6284 cat_margins()
6285 }
6286 )
3422 } 6287 }
3423 if (nrow(m) == 1) { 6288 substrates_df <- with(
3424 cat( 6289 enriched_substrates,
3425 sprintf( 6290 data.frame(
3426 "\n\nSubstrate is %s, 6291 substrate = sub(" ///*", "...", short_row_names),
3427 \nphopshopeptide is %s, 6292 sequence = trunc_long_ppep(ppep),
3428 \n\nand adjusted ANOVA \\textit{p}-value is %0.2g.\n", 6293 anova_p_value = signif(fdr_adjusted_anova_p, 2),
3429 enriched_substrates[1, "sub_gene"], 6294 min_group_obs_count = signif(min_group_obs_count, 0),
3430 enriched_substrates[1, "ppep"], 6295 quality = signif(quality, 3)
3431 enriched_substrates[1, "fdr_adjusted_anova_p"] 6296 )
6297 )
6298 excess_substrates <- nrow(substrates_df) > g_intensity_hm_rows
6299 if (excess_substrates)
6300 substrates_df <- substrates_df[1:g_intensity_hm_rows, ]
6301 names(substrates_df) <- headers_2nd_line <-
6302 c("Substrate", "Sequence", "p-value", "per group)", "quality")
6303 headers_1st_line <- c("", "", "ANOVA", "min(values", "")
6304 if (1 < nrow(enriched_substrates))
6305 cat("\n\\newpage\n")
6306 cat(subsubsection_header(
6307 sprintf(
6308 "Details for %s%s-substrates",
6309 if (excess_substrates)
6310 sprintf(
6311 "%s \"highest quality\" ",
6312 g_intensity_hm_rows
3432 ) 6313 )
6314 else "",
6315 kinase_name
3433 ) 6316 )
3434 } 6317 ))
6318 substrates_df <- substrates_df[order(-substrates_df$quality), ]
6319 data_frame_tabbing_latex(
6320 x = substrates_df,
6321 tabstops = c(0.8, 3.8, 0.6, 0.8),
6322 headings = c(headers_1st_line, headers_2nd_line)
6323 )
6324 } else {
6325 if (print_nb_messages) nbe(see_variable(nrow_m > 0), "\n")
3435 } 6326 }
3436 } 6327 if (print_nb_messages) nb("end kinase ", kinase_name, "\n")
3437 6328 }
3438 # Write output tabular files 6329
3439 6330 # Write output tabular files
3440 # get kinase, ppep, concat(kinase) tuples for enriched kinases 6331
3441 6332 # get kinase, ppep, concat(kinase) tuples for enriched kinases
3442 kinase_ppep_label <- sqldf(" 6333
3443 WITH 6334 if (print_nb_messages) nb("kinase_ppep_label <- ...\n")
3444 t(ppep, label) AS 6335 if (print_nb_messages) nbe("kinase_ppep_label <- ...\n")
3445 ( 6336 kinase_ppep_label <- sqldf("
3446 SELECT DISTINCT 6337 WITH
3447 SUB_MOD_RSD AS ppep, 6338 t(ppep, label) AS
3448 group_concat(gene, '; ') AS label 6339 (
6340 SELECT DISTINCT
6341 SUB_MOD_RSD AS ppep,
6342 group_concat(gene, '; ') AS label
6343 FROM pseudo_ksdata
6344 WHERE GENE IN (SELECT kinase FROM enriched_kinases)
6345 GROUP BY ppep
6346 ),
6347 k(kinase, ppep_join) AS
6348 (
6349 SELECT DISTINCT gene AS kinase, SUB_MOD_RSD AS ppep_join
3449 FROM pseudo_ksdata 6350 FROM pseudo_ksdata
3450 WHERE GENE IN (SELECT kinase FROM enriched_kinases) 6351 WHERE GENE IN (SELECT kinase FROM enriched_kinases)
3451 GROUP BY ppep 6352 )
3452 ), 6353 SELECT k.kinase, t.ppep, t.label
3453 k(kinase, ppep_join) AS 6354 FROM t, k
3454 ( 6355 WHERE t.ppep = k.ppep_join
3455 SELECT DISTINCT gene AS kinase, SUB_MOD_RSD AS ppep_join 6356 ORDER BY k.kinase, t.ppep
3456 FROM pseudo_ksdata 6357 ")
3457 WHERE GENE IN (SELECT kinase FROM enriched_kinases) 6358
3458 ) 6359
3459 SELECT k.kinase, t.ppep, t.label 6360 # extract what we need from full_data
3460 FROM t, k 6361 impish <- cbind(rownames(quant_data_imp), quant_data_imp)
3461 WHERE t.ppep = k.ppep_join 6362 colnames(impish)[1] <- "Phosphopeptide"
3462 ORDER BY k.kinase, t.ppep 6363 data_table_imputed_sql <- "
3463 ") 6364 SELECT
3464 6365 f.*,
3465 # extract what we need from full_data 6366 k.label AS KSEA_enrichments,
3466 impish <- cbind(rownames(quant_data_imp), quant_data_imp) 6367 q.*
3467 colnames(impish)[1] <- "Phosphopeptide" 6368 FROM
3468 data_table_imputed_sql <- " 6369 metadata_plus_p f
3469 SELECT 6370 LEFT JOIN kinase_ppep_label k
3470 f.*, 6371 ON f.Phosphopeptide = k.ppep,
3471 k.label AS KSEA_enrichments, 6372 impish q
3472 q.* 6373 WHERE
3473 FROM 6374 f.Phosphopeptide = q.Phosphopeptide
3474 metadata_plus_p f 6375 "
3475 LEFT JOIN kinase_ppep_label k 6376 data_table_imputed <- sqldf(data_table_imputed_sql)
3476 ON f.Phosphopeptide = k.ppep, 6377 # Zap the duplicated 'Phosphopeptide' column named 'ppep'
3477 impish q 6378 data_table_imputed <-
3478 WHERE 6379 data_table_imputed[, c(1:12, 14:ncol(data_table_imputed))]
3479 f.Phosphopeptide = q.Phosphopeptide 6380
3480 " 6381 # Output imputed, un-normalized data
3481 data_table_imputed <- sqldf(data_table_imputed_sql) 6382 if (print_nb_messages) nb("Output imputed, un-normalized data tabular file\n")
3482 # Zap the duplicated 'Phosphopeptide' column named 'ppep' 6383 if (print_nb_messages) nbe("Output imputed, un-normalized data tabular file\n")
3483 data_table_imputed <- 6384 write.table(
3484 data_table_imputed[, c(1:12, 14:ncol(data_table_imputed))] 6385 data_table_imputed
3485 6386 , file = imputed_data_filename
3486 # Output with imputed, un-normalized data 6387 , sep = "\t"
3487 6388 , col.names = TRUE
3488 write.table( 6389 , row.names = FALSE
3489 data_table_imputed 6390 , quote = FALSE
3490 , file = imputed_data_filename 6391 )
3491 , sep = "\t" 6392
3492 , col.names = TRUE 6393
3493 , row.names = FALSE 6394 #output quantile normalized data
3494 , quote = FALSE 6395 impish <- cbind(rownames(quant_data_imp_qn_log), quant_data_imp_qn_log)
3495 ) 6396 colnames(impish)[1] <- "Phosphopeptide"
3496 6397 data_table_imputed <- sqldf(data_table_imputed_sql)
3497 6398 # Zap the duplicated 'Phosphopeptide' column named 'ppep'
3498 #output quantile normalized data 6399 data_table_imputed <-
3499 impish <- cbind(rownames(quant_data_imp_qn_log), quant_data_imp_qn_log) 6400 data_table_imputed[, c(1:12, 14:ncol(data_table_imputed))]
3500 colnames(impish)[1] <- "Phosphopeptide" 6401 if (print_nb_messages) nb("Output quantile normalized data tabular file\n")
3501 data_table_imputed <- sqldf(data_table_imputed_sql) 6402 if (print_nb_messages) nbe("Output quantile normalized data tabular file\n")
3502 # Zap the duplicated 'Phosphopeptide' column named 'ppep' 6403 write.table(
3503 data_table_imputed <- 6404 data_table_imputed,
3504 data_table_imputed[, c(1:12, 14:ncol(data_table_imputed))] 6405 file = imp_qn_lt_data_filenm,
3505 write.table( 6406 sep = "\t",
3506 data_table_imputed, 6407 col.names = TRUE,
3507 file = imp_qn_lt_data_filenm, 6408 row.names = FALSE,
3508 sep = "\t", 6409 quote = FALSE
3509 col.names = TRUE, 6410 )
3510 row.names = FALSE, 6411
3511 quote = FALSE 6412 ppep_kinase <- sqldf("
3512 ) 6413 SELECT DISTINCT k.ppep, k.kinase
3513 6414 FROM (
3514 ppep_kinase <- sqldf(" 6415 SELECT DISTINCT gene AS kinase, SUB_MOD_RSD AS ppep
3515 SELECT DISTINCT k.ppep, k.kinase 6416 FROM pseudo_ksdata
3516 FROM ( 6417 WHERE GENE IN (SELECT kinase FROM enriched_kinases)
3517 SELECT DISTINCT gene AS kinase, SUB_MOD_RSD AS ppep 6418 ) k
3518 FROM pseudo_ksdata 6419 ORDER BY k.ppep, k.kinase
3519 WHERE GENE IN (SELECT kinase FROM enriched_kinases) 6420 ")
3520 ) k 6421
3521 ORDER BY k.ppep, k.kinase 6422 RSQLite::dbWriteTable(
3522 ") 6423 conn = db,
3523 6424 name = "ksea_enriched_ks",
3524 RSQLite::dbWriteTable( 6425 value = ppep_kinase,
3525 conn = db, 6426 append = FALSE
3526 name = "ksea_enriched_ks", 6427 )
3527 value = ppep_kinase, 6428 }
3528 append = FALSE 6429
3529 ) 6430 if (print_nb_messages) nb("RSQLite::dbWriteTable anova_signif\n")
3530 6431
3531 RSQLite::dbWriteTable( 6432 RSQLite::dbWriteTable(
3532 conn = db, 6433 conn = db,
3533 name = "anova_signif", 6434 name = "anova_signif",
3534 value = p_value_data, 6435 value = p_value_data,
3554 ON m.phospho_peptide = kek.ppep 6455 ON m.phospho_peptide = kek.ppep
3555 ; 6456 ;
3556 " 6457 "
3557 ) 6458 )
3558 6459
6460 if (print_nb_messages) nb("Output contents of `stats_metadata_v` table to tabular file\n")
6461 if (print_nb_messages) nbe("Output contents of `stats_metadata_v` table to tabular file\n")
3559 write.table( 6462 write.table(
3560 dbReadTable(db, "stats_metadata_v"), 6463 dbReadTable(db, "stats_metadata_v"),
3561 file = anova_ksea_mtdt_file, 6464 file = anova_ksea_mtdt_file,
3562 sep = "\t", 6465 sep = "\t",
3563 col.names = TRUE, 6466 col.names = TRUE,
3564 row.names = FALSE, 6467 row.names = FALSE,
3565 quote = FALSE 6468 quote = FALSE
3566 ) 6469 )
3567 6470
6471 cat("\n\\clearpage\n")
3568 6472
3569 ``` 6473 ```
6474
6475 # Data-processing summary flowchart
6476
6477 ![Flowchart showing ANOVA and KSEA data-processing steps](KSEA_impl_flowchart.pdf)
3570 6478
3571 ```{r parmlist, echo = FALSE, fig.dim = c(9, 10), results = 'asis'} 6479 ```{r parmlist, echo = FALSE, fig.dim = c(9, 10), results = 'asis'}
3572 cat("\\leavevmode\n\n\n") 6480 cat("\\leavevmode\n\n\n")
3573 6481
3574 # write parameters to report 6482 write_params(db)
3575
3576 param_unlist <- unlist(as.list(params))
3577 param_df <- data.frame(
3578 parameter = paste0("\\verb@", names(param_unlist), "@"),
3579 value = paste0(
3580 "\n\\begin{tiny}\n\\verb@",
3581 gsub("$", "\\$", param_unlist, fixed = TRUE),
3582 "@\n\\end{tiny}"
3583 )
3584 )
3585
3586 data_frame_latex(
3587 x = param_df,
3588 justification = "p{0.35\\linewidth} p{0.6\\linewidth}",
3589 centered = TRUE,
3590 caption = "Input parameters",
3591 anchor = const_table_anchor_bp,
3592 underscore_whack = FALSE
3593 )
3594
3595 # write parameters to SQLite output
3596
3597 mqppep_anova_script_param_df <- data.frame(
3598 script = "mqppep_anova_script.Rmd",
3599 parameter = names(param_unlist),
3600 value = param_unlist
3601 )
3602 ddl_exec(db, "
3603 DROP TABLE IF EXISTS script_parameter;
3604 "
3605 )
3606 ddl_exec(db, "
3607 CREATE TABLE IF NOT EXISTS script_parameter(
3608 script TEXT,
3609 parameter TEXT,
3610 value ANY,
3611 UNIQUE (script, parameter) ON CONFLICT REPLACE
3612 )
3613 ;
3614 "
3615 )
3616 RSQLite::dbWriteTable(
3617 conn = db,
3618 name = "script_parameter",
3619 value = mqppep_anova_script_param_df,
3620 append = TRUE
3621 )
3622
3623 # We are done with output 6483 # We are done with output
3624 RSQLite::dbDisconnect(db) 6484 RSQLite::dbDisconnect(db)
6485
6486 cat("\\clearpage\n\\section{R package versions}\n")
6487 utils::toLatex(utils::sessionInfo())
3625 ``` 6488 ```
3626 <!--
3627 There's gotta be a better way...
3628
3629 loaded_packages_df <- sessioninfo::package_info("loaded")
3630 loaded_packages_df[, "library"] <- as.character(loaded_packages_df$library)
3631 loaded_packages_df <- data.frame(
3632 package = loaded_packages_df$package,
3633 version = loaded_packages_df$loadedversion,
3634 date = loaded_packages_df$date
3635 )
3636 data_frame_latex(
3637 x = loaded_packages_df,
3638 justification = "l | l l",
3639 centered = FALSE,
3640 caption = "Loaded R packages",
3641 anchor = const_table_anchor_bp
3642 )
3643 -->