Mercurial > repos > eschen42 > mqppep_anova
comparison mqppep_anova_script.Rmd @ 26:5b8e15b2a67c draft
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit e0b80550743f634282b4b4348b75e6f172dc1488
author | eschen42 |
---|---|
date | Wed, 26 Oct 2022 23:48:51 +0000 |
parents | f9cd87ac8006 |
children | 42b207aaa527 |
comparison
equal
deleted
inserted
replaced
25:f9cd87ac8006 | 26:5b8e15b2a67c |
---|---|
5 - "Larry Cheng^[ORCiD 0000-0002-6922-6433, Rutgers School of Graduate Studies: New Brunswick, NJ, US]" | 5 - "Larry Cheng^[ORCiD 0000-0002-6922-6433, Rutgers School of Graduate Studies: New Brunswick, NJ, US]" |
6 - "Art Eschenlauer^[ORCiD 0000-0002-2882-0508, University of Minnesota: Minneapolis, Minnesota, US]" | 6 - "Art Eschenlauer^[ORCiD 0000-0002-2882-0508, University of Minnesota: Minneapolis, Minnesota, US]" |
7 date: | 7 date: |
8 - "May 28, 2018" | 8 - "May 28, 2018" |
9 - "; revised June 23, 2022" | 9 - "; revised June 23, 2022" |
10 lot: true | |
10 output: | 11 output: |
11 pdf_document: | 12 pdf_document: |
12 toc: true | 13 toc: true |
13 toc_depth: 3 | 14 toc_depth: 2 |
14 keep_tex: true | 15 keep_tex: true |
15 header-includes: | 16 dev: pdf |
16 - \usepackage{longtable} | 17 includes: |
17 - \newcommand\T{\rule{0pt}{2.6ex}} % Top strut | 18 in_header: mqppep_anova_preamble.tex |
18 - \newcommand\B{\rule[-1.2ex]{0pt}{0pt}} % Bottom strut | 19 latex_macros: false |
20 raw_tex: true | |
21 urlcolor: blue | |
19 params: | 22 params: |
20 alphaFile: "test-data/alpha_levels.tabular" | 23 alphaFile: "test-data/alpha_levels.tabular" |
21 inputFile: "test-data/test_input_for_anova.tabular" | 24 inputFile: "test-data/test_input_for_anova.tabular" |
22 preprocDb: "test-data/test_input_for_anova.sqlite" | 25 preprocDb: "test-data/test_input_for_anova.sqlite" |
23 kseaAppPrepDb: !r c(":memory:", "test-data/mqppep.sqlite")[2] | 26 kseaAppPrepDb: !r c(":memory:", "test-data/mqppep.sqlite")[2] |
24 regexSampleNames: "\\.\\d+[A-Z]$" | 27 regexSampleNames: "\\.\\d+[A-Z]$" |
25 regexSampleGrouping: "\\d+" | 28 regexSampleGrouping: "\\d+" |
26 show_toc: true | 29 groupFilterPatterns: ".+" |
30 groupFilter: !r c("none", "exclude", "include")[1] | |
31 imputationMethod: !r c("group-median", "median", "mean", "random")[4] | |
32 kseaCutoffThreshold: !r c(0.05, 0.1, 0.25, 0.5, 0.9)[5] | |
33 #imputationMethod: !r c("group-median", "median", "mean", "random")[1] | |
34 | |
35 # how should sample groups be interpreted? | |
36 # - "f": fixed patterns (like `grep -F`) | |
37 # - "p": PERL-compatible (like `grep -P`) | |
38 # - "r": extended grep patterns (like `grep -E`) | |
39 # use what case sensitivity? | |
40 # - "i": case insensitive matching (like `grep -i`) | |
41 groupFilterMode: !r c("r", "ri", "p", "pi", "f", "fi")[1] | |
42 # what pattern should be used for the first column | |
43 # (extended grep pattern, case sensitive) | |
27 firstDataColumn: "^Intensity[^_]" | 44 firstDataColumn: "^Intensity[^_]" |
28 imputationMethod: !r c("group-median", "median", "mean", "random")[1] | 45 # for small random value imputation, what percentile should be center? |
29 meanPercentile: 1 | 46 meanPercentile: 50 |
47 #meanPercentile: 1 | |
48 # for small random value imputation, what should `s / mean(x)` ratio be? | |
30 sdPercentile: 1.0 | 49 sdPercentile: 1.0 |
50 # output path for imputed data file | |
31 imputedDataFilename: "test-data/limbo/imputedDataFilename.txt" | 51 imputedDataFilename: "test-data/limbo/imputedDataFilename.txt" |
52 # output path for imputed/quantile-normalized/log-transformed data file | |
32 imputedQNLTDataFile: "test-data/limbo/imputedQNLTDataFile.txt" | 53 imputedQNLTDataFile: "test-data/limbo/imputedQNLTDataFile.txt" |
54 # output path for contents of `stats_metadata_v` table | |
33 anovaKseaMetadata: "test-data/limbo/anovaKseaMetadata.txt" | 55 anovaKseaMetadata: "test-data/limbo/anovaKseaMetadata.txt" |
56 # how to test one variable with > 2 categories (e.g., aov or kruskal.test) | |
34 oneWayManyCategories: !r c("aov", "kruskal.test", "oneway.test")[1] | 57 oneWayManyCategories: !r c("aov", "kruskal.test", "oneway.test")[1] |
58 # how to test one variable with 2 categories (e.g., oneway.test) | |
35 oneWayTwoCategories: !r c("aov", "kruskal.test", "oneway.test")[3] | 59 oneWayTwoCategories: !r c("aov", "kruskal.test", "oneway.test")[3] |
36 kseaCutoffStatistic: !r c("p.value", "FDR")[2] | 60 # what should be the minimum quality for consideration in both |
37 kseaCutoffThreshold: !r c( 0.1, 0.05)[2] | 61 minQuality: 0 |
38 kseaMinKinaseCount: 1 | 62 # correct KSEA with FDR (recommended) or raw p-value |
39 intensityHeatmapRows: 75 | 63 kseaCutoffStatistic: !r c("FDR", "p.value")[1] |
64 # correct KSEA threshold 0.05 (conventional) or higher (perhaps better) | |
65 # "perhaps better" meaning that KSEA is an hypothesis-generator, not -test | |
66 #kseaCutoffThreshold: !r c(0.05, 0.1, 0.25, 0.5)[1] | |
67 # minimum number of substrates required for a kinase to be considered in KSEA | |
68 kseaMinSubstrateCount: 1 | |
69 # Should KSEA be performed aggregating signed log2FC or absolute? | |
70 # FALSE use raw log2FC for KSEA as for KSEAapp::KSEA.Scores | |
71 # TRUE use abs(log2FC) for KSEA as Justin Drake requested; this is a | |
72 # justifiable deviation from the KSEAapp::KSEA.Scores algorithm. | |
73 kseaUseAbsoluteLog2FC: TRUE | |
74 #kseaUseAbsoluteLog2FC: FALSE | |
75 # minimum number of observed values per sample-group | |
76 intensityMinValuesPerGroup: 1 | |
77 # maximum number of heatmap rows (result are poor when > 50) | |
78 intensityHeatmapRows: 50 | |
79 # what should be the primary criterion to eliminate excessive heatmap rows | |
80 intensityHeatmapCriteria: !r c("quality", "na_count", "p_value")[1] | |
81 # should correlation among substrates be used (rather than covariance) | |
82 correlateSubstrates: TRUE | |
83 # only show covariance among variables having variance > 1 | |
84 filterCovVarGT1: TRUE | |
85 # maximum number of residues to display for ppeps in rownames or columnames | |
86 ppepTruncN: 10 | |
87 # maximum number of characters of subgenes to display in rownames or columnames | |
88 subgeneTruncN: 10 | |
89 # maximum number of characters for paste(subgene, ppep) for enrichment plots | |
90 substTruncN: 20 | |
91 # should boxplots use variable-width boxes to reflect # of samples | |
92 boxPlotVarWidth: TRUE | |
93 # should boxplots use notched boxes to reflect difference between samples | |
94 boxPlotNotch: TRUE | |
95 # look-up tables for kinase descriptions | |
96 kinaseNameUprtLutBz2: "./kinase_name_uniprot_lut.tabular.bz2" | |
97 kinaseUprtDescLutBz2: "./kinase_uniprot_description_lut.tabular.bz2" | |
98 # should debugging trace messages be printed? | |
99 showEnrichedSubstrates: FALSE | |
100 | |
101 # should debugging nb/nbe messages be printed? | |
102 printNBMsgs: FALSE | |
103 # should debugging trace messages be printed? | |
104 printTraceMsgs: FALSE | |
105 # when debugging files are needed, set debugFileBasePath to the path | |
106 # to the directory where they should be writtn | |
107 debugFileBasePath: !r if (TRUE) NULL else "test-data" | |
40 --- | 108 --- |
41 <!-- | 109 <!-- |
110 alphaFile: "test-data/alpha_levels.tabular" | |
111 inputFile: "test-data/PDX_pST_by_trt.ppep_intensities.ppep_map.preproc_tab.tabular" | |
112 preprocDb: "test-data/PDX_pST_by_trt.ppep_intensities.ppep_map.preproc_sqlite.sqlite" | |
113 kseaAppPrepDb: !r c(":memory:", "test-data/mqppep.sqlite")[2] | |
114 regexSampleNames: "\\.\\w+\\.\\d+[A-Z]$" | |
115 regexSampleGrouping: "\\w+" | |
116 groupFilterPatterns: ".+" | |
117 groupFilter: !r c("none", "exclude", "include")[3] | |
118 imputationMethod: !r c("group-median", "median", "mean", "random")[1] | |
119 kseaCutoffThreshold: !r c(0.05, 0.1, 0.20, 0.35, 0.4, 0.5, 0.999)[1] | |
120 ut_alphaFile: "test-data/alpha_levels.tabular" | |
121 ut_inputFile: "test-data/UT_phospho_ST_sites.preproc.tabular" | |
122 ut_preprocDb: "test-data/UT_phospho_ST_sites.preproc.sqlite" | |
123 ut_kseaAppPrepDb: !r c(":memory:", "test-data/UT_phospho_ST_sites.ksea.sqlite")[2] | |
124 ut_regexSampleNames: "\\.\\d+[A-Z]$" | |
125 ut_regexSampleGrouping: "\\d+" | |
126 ut_groupFilterPatterns: ".+,.*" | |
127 ut_groupFilter: !r c("none", "exclude", "include")[1] | |
128 ut_imputationMethod: !r c("group-median", "median", "mean", "random")[4] | |
129 ut_kseaCutoffThreshold: !r c(0.05, 0.1, 0.25, 0.5, 0.9)[1] | |
130 tst_alphaFile: "test-data/alpha_levels.tabular" | |
131 tst_inputFile: "test-data/test_input_for_anova.tabular" | |
132 tst_preprocDb: "test-data/test_input_for_anova.sqlite" | |
133 tst_kseaAppPrepDb: !r c(":memory:", "test-data/mqppep.sqlite")[2] | |
134 tst_regexSampleNames: "\\.\\d+[A-Z]$" | |
135 tst_regexSampleGrouping: "\\d+" | |
136 tst_groupFilterPatterns: ".+" | |
137 tst_groupFilter: !r c("none", "exclude", "include")[1] | |
138 tst_imputationMethod: !r c("group-median", "median", "mean", "random")[4] | |
139 tst_kseaCutoffThreshold: !r c(0.05, 0.1, 0.25, 0.5, 0.9)[5] | |
140 | |
141 tst_alphaFile: "test-data/alpha_levels.tabular" | |
142 tst_inputFile: "test-data/UT_phospho_ST_sites.preproc.tabular" | |
143 tst_preprocDb: "test-data/UT_phospho_ST_sites.preproc.sqlite" | |
144 tst_kseaAppPrepDb: !r c(":memory:", "test-data/UT_phospho_ST_sites.ksea.sqlite")[2] | |
145 tst_regexSampleNames: "\\.\\d+[A-Z]$" | |
146 tst_regexSampleGrouping: "\\d+" | |
147 tst_groupFilterPatterns: ".+,.*" | |
148 tst_groupFilter: !r c("none", "exclude", "include")[1] | |
149 tst_imputationMethod: !r c("group-median", "median", "mean", "random")[4] | |
150 tst_kseaCutoffThreshold: !r c(0.05, 0.1, 0.20, 0.35, 0.4, 0.5, 0.999)[5] | |
151 px_alphaFile: "test-data/alpha_levels.tabular" | |
152 px_inputFile: "test-data/PDX_pST_by_trt.ppep_intensities.ppep_map.preproc_tab.tabular" | |
153 px_preprocDb: "test-data/PDX_pST_by_trt.ppep_intensities.ppep_map.preproc_sqlite.sqlite" | |
154 px_kseaAppPrepDb: !r c(":memory:", "test-data/mqppep.sqlite")[2] | |
155 px_regexSampleNames: "\\.\\w+\\.\\d+[A-Z]$" | |
156 px_regexSampleGrouping: "\\w+" | |
157 px_groupFilterPatterns: ".+" | |
158 px_groupFilter: !r c("none", "exclude", "include")[3] | |
159 px_imputationMethod: !r c("group-median", "median", "mean", "random")[4] | |
160 px_kseaCutoffThreshold: !r c(0.05, 0.1, 0.20, 0.35, 0.4, 0.5, 0.999)[5] | |
161 pdx_alphaFile: "test-data/alpha_levels.tabular" | |
162 pdx_inputFile: "test-data/PDX012970_pST.preproc_tab.tabular" | |
163 pdx_preprocDb: "test-data/PDX012970_pST.preproc.sqlite" | |
164 pdx_kseaAppPrepDb: !r c(":memory:", "test-data/PDX012970.sqlite")[2] | |
165 pdx_regexSampleNames: "\\.\\w+\\.\\w+\\.\\d+[A-Z]$" | |
166 pdx_regexSampleGrouping: "\\.\\w+\\K\\.\\w+" | |
167 pdx_groupFilterPatterns: "AdCa,AVPC" | |
168 pdx_groupFilter: !r c("none", "exclude", "include")[3] | |
169 pdx_imputationMethod: !r c("group-median", "median", "mean", "random")[4] | |
170 pdx_kseaCutoffThreshold: !r c(0.05, 0.1, 0.25, 0.5, 0.9)[1] | |
171 tst_alphaFile: "test-data/alpha_levels.tabular" | |
172 tst_inputFile: "test-data/test_input_for_anova.tabular" | |
173 tst_preprocDb: "test-data/test_input_for_anova.sqlite" | |
174 tst_kseaAppPrepDb: !r c(":memory:", "test-data/mqppep.sqlite")[2] | |
175 tst_regexSampleNames: "\\.\\d+[A-Z]$" | |
176 tst_regexSampleGrouping: "\\d+" | |
177 tst_groupFilterPatterns: ".+" | |
178 tst_groupFilter: !r c("none", "exclude", "include")[1] | |
179 tst_kseaCutoffThreshold: !r c(0.05, 0.1, 0.25, 0.5, 0.9)[5] | |
180 tst_imputationMethod: !r c("group-median", "median", "mean", "random")[1] | |
181 ut_alphaFile: "test-data/alpha_levels.tabular" | |
182 ut_inputFile: "test-data/UT_phospho_ST_sites.preproc.tabular" | |
183 ut_preprocDb: "test-data/UT_phospho_ST_sites.preproc.sqlite" | |
184 ut_kseaAppPrepDb: !r c(":memory:", "test-data/UT_phospho_ST_sites.ksea.sqlite")[2] | |
185 ut_regexSampleNames: "\\.\\d+[A-Z]$" | |
186 ut_regexSampleGrouping: "\\d+" | |
187 ut_groupFilterPatterns: ".+,.*" | |
188 ut_groupFilter: !r c("none", "exclude", "include")[1] | |
189 ut_imputationMethod: !r c("group-median", "median", "mean", "random")[4] | |
42 alphaFile: "test-data/alpha_levels.tabular" | 190 alphaFile: "test-data/alpha_levels.tabular" |
43 inputFile: "test-data/test_input_for_anova.tabular" | 191 inputFile: "test-data/test_input_for_anova.tabular" |
44 preprocDb: "test-data/test_input_for_anova.sqlite" | 192 preprocDb: "test-data/test_input_for_anova.sqlite" |
45 kseaAppPrepDb: !r c(":memory:", "test-data/mqppep.sqlite")[2] | 193 kseaAppPrepDb: !r c(":memory:", "test-data/mqppep.sqlite")[2] |
46 regexSampleNames: "\\.\\d+[A-Z]$" | 194 regexSampleNames: "\\.\\d+[A-Z]$" |
47 regexSampleGrouping: "\\d+" | 195 regexSampleGrouping: "\\d+" |
196 groupFilterPatterns: ".+,.*" | |
197 groupFilter: !r c("none", "exclude", "include")[1] | |
198 imputationMethod: !r c("group-median", "median", "mean", "random")[4] | |
199 nd_alphaFile: "test-data/alpha_levels.tabular" | |
200 nd_inputFile: "test-data/pST_Sites_NancyDu.txt.preproc.tabular" | |
201 nd_preprocDb: "test-data/pST_Sites_NancyDu.txt.preproc.sqlite" | |
202 nd_kseaAppPrepDb: !r c(":memory:", "test-data/pST_Sites_NancyDu.ksea.sqlite")[2] | |
203 nd_regexSampleNames: "\\.\\d+[A-Z]$" | |
204 nd_regexSampleGrouping: "\\d+" | |
205 nd_groupFilterPatterns: ".+,.*" | |
206 nd_groupFilter: !r c("none", "exclude", "include")[1] | |
207 nd_imputationMethod: !r c("group-median", "median", "mean", "random")[4] | |
208 pxd_alphaFile: "test-data/alpha_levels.tabular" | |
209 pxd_inputFile: "test-data/PDX_pST_by_trt.ppep_intensities.ppep_map.preproc_tab.tabular" | |
210 pxd_preprocDb: "test-data/PDX_pST_by_trt.ppep_intensities.ppep_map.preproc_sqlite.sqlite" | |
211 pxd_kseaAppPrepDb: !r c(":memory:", "test-data/mqppep.sqlite")[2] | |
212 pxd_regexSampleNames: "\\.\\w+\\.\\d+[A-Z]$" | |
213 pxd_regexSampleGrouping: "\\w+" | |
214 pxd_groupFilterPatterns: ".+" | |
215 pxd_groupFilter: !r c("none", "exclude", "include")[3] | |
216 pxd_imputationMethod: !r c("group-median", "median", "mean", "random")[4] | |
217 | |
218 alphaFile: "test-data/alpha_levels.tabular" | |
219 inputFile: "test-data/test_input_for_anova.tabular" | |
220 preprocDb: "test-data/test_input_for_anova.sqlite" | |
221 kseaAppPrepDb: !r c(":memory:", "test-data/mqppep.sqlite")[2] | |
222 regexSampleNames: "\\.\\d+[A-Z]$" | |
223 regexSampleGrouping: "\\d+" | |
224 groupFilterPatterns: ".+,.*" | |
225 groupFilter: !r c("none", "exclude", "include")[1] | |
226 | |
227 alphaFile: "test-data/alpha_levels.tabular" | |
228 inputFile: "test-data/PDX012970_pST.preproc_tab.tabular" | |
229 preprocDb: "test-data/PDX012970_pST.preproc.sqlite" | |
230 kseaAppPrepDb: !r c(":memory:", "test-data/PDX012970.sqlite")[2] | |
231 regexSampleNames: "\\.\\w+\\.\\w+\\.\\d+[A-Z]$" | |
232 regexSampleGrouping: "\\.\\w+\\K\\.\\w+" | |
233 groupFilterPatterns: "AdCa,AVPC" | |
234 groupFilter: !r c("none", "exclude", "include")[3] | |
48 | 235 |
49 alphaFile: "test-data/alpha_levels.tabular" | 236 alphaFile: "test-data/alpha_levels.tabular" |
50 inputFile: "test-data/PDX_pST_by_trt.ppep_intensities.ppep_map.preproc_tab.tabular" | 237 inputFile: "test-data/PDX_pST_by_trt.ppep_intensities.ppep_map.preproc_tab.tabular" |
51 preprocDb: "test-data/PDX_pST_by_trt.ppep_intensities.ppep_map.preproc_sqlite.sqlite" | 238 preprocDb: "test-data/PDX_pST_by_trt.ppep_intensities.ppep_map.preproc_sqlite.sqlite" |
52 kseaAppPrepDb: !r c(":memory:", "test-data/mqppep.sqlite")[2] | 239 kseaAppPrepDb: !r c(":memory:", "test-data/mqppep.sqlite")[2] |
53 regexSampleNames: "\\.\\w+\\.\\d+[A-Z]$" | 240 regexSampleNames: "\\.\\w+\\.\\d+[A-Z]$" |
54 regexSampleGrouping: "\\w+" | 241 regexSampleGrouping: "\\w+" |
242 groupFilterPatterns: ".+,.*" | |
243 groupFilter: !r c("none", "exclude", "include")[3] | |
55 | 244 |
56 kseaCutoffStatistic: !r c("p.value", "FDR")[2] | 245 kseaCutoffStatistic: !r c("p.value", "FDR")[2] |
57 kseaCutoffThreshold: !r c(0.05, 0.1)[1] | 246 kseaCutoffThreshold: !r c(0.05, 0.1)[1] |
58 | 247 |
59 alphaFile: "test-data/alpha_levels.tabular" | 248 alphaFile: "test-data/alpha_levels.tabular" |
67 inputFile: "test-data/pY_Sites_NancyDu.txt.ppep_intensities.ppep_map.preproc.tabular" | 256 inputFile: "test-data/pY_Sites_NancyDu.txt.ppep_intensities.ppep_map.preproc.tabular" |
68 preprocDb: "test-data/pY_Sites_NancyDu.txt.ppep_intensities.ppep_map.preproc.sqlite" | 257 preprocDb: "test-data/pY_Sites_NancyDu.txt.ppep_intensities.ppep_map.preproc.sqlite" |
69 kseaAppPrepDb: !r c(":memory:", "test-data/pY_Sites_NancyDu.ksea.sqlite")[2] | 258 kseaAppPrepDb: !r c(":memory:", "test-data/pY_Sites_NancyDu.ksea.sqlite")[2] |
70 regexSampleNames: "\\.\\d+[A-Z]$" | 259 regexSampleNames: "\\.\\d+[A-Z]$" |
71 regexSampleGrouping: "\\d+" | 260 regexSampleGrouping: "\\d+" |
261 groupFilterPatterns: ".+,.*" | |
262 groupFilter: !r c("none", "exclude", "include")[3] | |
72 | 263 |
73 alphaFile: "test-data/alpha_levels.tabular" | 264 alphaFile: "test-data/alpha_levels.tabular" |
74 inputFile: "test-data/pST_Sites_NancyDu.txt.preproc.tabular" | 265 inputFile: "test-data/pST_Sites_NancyDu.txt.preproc.tabular" |
75 preprocDb: "test-data/pST_Sites_NancyDu.txt.preproc.sqlite" | 266 preprocDb: "test-data/pST_Sites_NancyDu.txt.preproc.sqlite" |
76 kseaAppPrepDb: !r c(":memory:", "test-data/pST_Sites_NancyDu.ksea.sqlite")[2] | 267 kseaAppPrepDb: !r c(":memory:", "test-data/pST_Sites_NancyDu.ksea.sqlite")[2] |
77 regexSampleNames: "\\.\\d+[A-Z]$" | 268 regexSampleNames: "\\.\\d+[A-Z]$" |
78 regexSampleGrouping: "\\d+" | 269 regexSampleGrouping: "\\d+" |
79 | 270 groupFilterPatterns: ".+,.*" |
80 inputFile: "test-data/density_failure.preproc_tab.tabular" | 271 groupFilter: !r c("none", "exclude", "include")[1] |
81 kseaAppPrepDb: !r c(":memory:", "mqppep.sqlite")[2] | 272 |
82 latex_document: default | |
83 --> | 273 --> |
84 ```{r setup, include = FALSE} | 274 ```{r setup, include = FALSE, results = 'asis'} |
275 | |
276 # simple debug messaging | |
277 print_nb_messages <- params$printNBMsgs | |
278 | |
279 nb <- if (!print_nb_messages) { | |
280 function(...) invisible() | |
281 } else { | |
282 function(..., f = cat) f("\n$\\exists{}\\supset\\forall{}$", ...) | |
283 } | |
284 | |
285 nbe <- if (!print_nb_messages) { | |
286 function(...) invisible() | |
287 } else { | |
288 function(..., f = cat, file = stderr()) { | |
289 cat( | |
290 stringi::stri_unescape_unicode("\nNBE \\u2203\\u2283\\u2200"), | |
291 ..., | |
292 file = file | |
293 ) | |
294 } | |
295 } | |
296 | |
85 #ref for debugging: https://yihui.org/tinytex/r/#debugging | 297 #ref for debugging: https://yihui.org/tinytex/r/#debugging |
86 options(tinytex.verbose = TRUE) | 298 options(tinytex.verbose = TRUE) |
87 | 299 |
88 # ref for parameterizing Rmd document: https://stackoverflow.com/a/37940285 | 300 # ref for parameterizing Rmd document: https://stackoverflow.com/a/37940285 |
89 # ref for top and bottom struts: https://tex.stackexchange.com/a/50355 | 301 # ref for top and bottom struts: https://tex.stackexchange.com/a/50355 |
90 knitr::opts_chunk$set(echo = FALSE, fig.dim = c(9, 10)) | 302 knitr::opts_chunk$set(echo = FALSE, fig.dim = c(9, 10), dpi = 300) |
91 | 303 |
92 # freeze the random number generator so the same results will be produced | 304 # freeze the random number generator so the same results will be produced |
93 # from run to run | 305 # from run to run |
94 set.seed(28571) | 306 set.seed(28571) |
95 | 307 |
96 ### LIBRARIES | 308 ### LIBRARIES |
309 | |
310 if (print_nb_messages) nbe("library(gplots)") | |
97 library(gplots) | 311 library(gplots) |
312 if (print_nb_messages) nbe("library(caret)") | |
313 # load caret for nearZeroVar | |
314 if (print_nb_messages) nbe("Please ignore the messages about systemd, if any.\n") | |
315 library(caret) | |
316 if (print_nb_messages) nbe("library(DBI)") | |
98 library(DBI) | 317 library(DBI) |
318 if (print_nb_messages) nbe("library(RSQLite)") | |
99 library(RSQLite) | 319 library(RSQLite) |
320 if (print_nb_messages) nbe("library(sqldf)\n") | |
100 # Suppress "Warning: no DISPLAY variable so Tk is not available" | 321 # Suppress "Warning: no DISPLAY variable so Tk is not available" |
101 suppressWarnings(suppressMessages(library(sqldf))) | 322 suppressWarnings(suppressMessages(library(sqldf))) |
102 | 323 |
103 # required but not added to search list: | 324 # required but not added to search list: |
104 # - DBI | 325 # - DBI |
110 # - reshape2 | 331 # - reshape2 |
111 # - vioplot | 332 # - vioplot |
112 | 333 |
113 ### CONSTANTS | 334 ### CONSTANTS |
114 | 335 |
115 const_parfin <- par("fin") | 336 const_boxplot_fill <- "grey94" |
116 const_boxplot_fill <- "grey94" | |
117 const_stripchart_cex <- 0.5 | |
118 const_stripsmall_cex <- | |
119 sqrt(const_stripchart_cex * const_stripchart_cex / 2) | |
120 const_stripchart_jitter <- 0.3 | |
121 const_write_debug_files <- FALSE | |
122 const_table_anchor_bp <- "bp" | |
123 const_table_anchor_ht <- "ht" | |
124 const_table_anchor_p <- "p" | |
125 const_table_anchor_tbp <- "tbp" | |
126 | |
127 | |
128 const_ksea_astrsk_kinases <- 1 | 337 const_ksea_astrsk_kinases <- 1 |
129 const_ksea_nonastrsk_kinases <- 2 | 338 const_ksea_nonastrsk_kinases <- 2 |
130 const_ksea_all_kinases <- 3 | 339 const_ksea_all_kinases <- 3 |
131 | 340 const_log10_e <- log10(exp(1)) |
132 const_log10_e <- log10(exp(1)) | 341 const_stripchart_cex <- 0.5 |
133 | 342 const_stripchart_jitter <- 0.3 |
134 ### FUNCTIONS | 343 const_table_anchor_bp <- "bp" |
135 | 344 const_table_anchor_ht <- "ht" |
136 # from `demo(error.catching)` | 345 const_table_anchor_p <- "p" |
346 const_table_anchor_t <- "t" | |
347 const_table_anchor_tbp <- "tbp" | |
348 | |
349 | |
350 ### GLOBAL VARIABLES (params) | |
351 | |
352 ## functions to process params | |
353 | |
354 is_string_null_or_empty <- function(x) { | |
355 # N. B. non-strings are intentionally treated as NULL | |
356 if (is.null(x)) | |
357 TRUE | |
358 else if (!is.character(x)) | |
359 TRUE | |
360 else x == "" | |
361 } | |
362 | |
137 ##' Catch *and* save both errors and warnings, and in the case of | 363 ##' Catch *and* save both errors and warnings, and in the case of |
138 ##' a warning, also keep the computed result. | 364 ##' a warning, also keep the computed result. |
365 ##' return result as list(value = ..., warning = ...) | |
366 ##' - value will be: | |
367 ##' - the result if no exception is thrown | |
368 ##' - the exception if an exception is caught | |
369 ##' - warning will be a string except perhaps when warning argument is not NULL | |
370 ##' | |
371 ##' adapted from `demo(error.catching)` | |
139 ##' | 372 ##' |
140 ##' @title tryCatch both warnings (with value) and errors | 373 ##' @title tryCatch both warnings (with value) and errors |
141 ##' @param expr an \R expression to evaluate | 374 ##' @param expr an \R expression to evaluate |
142 ##' @return a list with 'value' and 'warning', where | 375 ##' @return a list with 'value' and 'warning', where |
143 ##' 'value' may be an error caught. | 376 ##' 'value' may be an error caught. |
144 ##' @author Martin Maechler; | 377 ##' @author Martin Maechler; |
145 ##' Copyright (C) 2010-2012 The R Core Team | 378 ##' Copyright (C) 2010-2012 The R Core Team |
146 try_catch_w_e <- function(expr) { | 379 try_catch_w_e <- |
147 wrn <- NULL | 380 function(expr, error = function(e) e, warning = NULL) { |
148 # warning handler | 381 wrn <- NULL |
149 w_handler <- function(w) { | 382 # warning handler |
150 wrn <<- w | 383 w_handler <- |
151 invokeRestart("muffleWarning") | 384 if (is.function(warning)) |
152 } | 385 warning |
153 list( | 386 else |
154 value = withCallingHandlers( | 387 function(w) { |
155 tryCatch( | 388 wrn <<- w |
156 expr, | 389 invokeRestart("muffleWarning") |
157 error = function(e) e | 390 } |
391 e_handler <- | |
392 if (is.function(error)) | |
393 error | |
394 else | |
395 function(e) e | |
396 # return result as list(value = ..., warning = ...) | |
397 # - value will be: | |
398 # - the result if no exception is thrown | |
399 # - the exception if an exception is caught | |
400 list( | |
401 value = withCallingHandlers( | |
402 tryCatch( | |
403 expr, | |
404 error = e_handler | |
405 ), | |
406 warning = w_handler | |
158 ), | 407 ), |
159 warning = w_handler | 408 warning = wrn |
160 ), | 409 ) |
161 warning = wrn | 410 } |
162 ) | 411 |
163 } | 412 see_kvp <- |
164 | 413 function(format, key, value, suffix = "") { |
165 | 414 if ( |
166 write_debug_file <- function(s) { | 415 !all( |
167 if (const_write_debug_files) { | 416 is.character(format), |
168 s_path <- sprintf("test-data/%s.txt", deparse(substitute(s))) | 417 is.character(key), |
169 print(sprintf("DEBUG writing file %s", spath)) | 418 is.character(value), |
419 is.character(suffix) | |
420 ) | |
421 ) { | |
422 cat("all arguments to see_kvp should be character") | |
423 knitr::knit_exit() | |
424 } | |
425 result <- sprintf(format, value) | |
426 if (length(result) > 1) { | |
427 sprintf( | |
428 "%s = c(%s)%s", | |
429 whack_underscores(key), | |
430 paste(result, collapse = ", "), | |
431 suffix | |
432 ) | |
433 } else { | |
434 sprintf( | |
435 "%s = %s%s", | |
436 key, | |
437 result, | |
438 suffix | |
439 ) | |
440 } | |
441 } | |
442 | |
443 see_logical <- | |
444 function(x, suffix = "", xprssn = deparse1(substitute(x))) { | |
445 result <- as.character(as.logical(x)) | |
446 # handle NAs and NaNs | |
447 result[is.na(result)] <- "NA" | |
448 see_kvp( | |
449 format = "%s", | |
450 key = xprssn, | |
451 value = result, | |
452 suffix = suffix | |
453 ) | |
454 } | |
455 | |
456 see_numeric <- | |
457 function(x, suffix = "", digits = 3, xprssn = deparse1(substitute(x))) { | |
458 if (is.numeric(digits) && is.numeric(x)) { | |
459 digits <- as.integer(digits) | |
460 digits <- min(16, max(0, digits)) | |
461 format <- paste0("%0.", as.character(digits), "g") | |
462 result <- sprintf(format, x) | |
463 see_kvp( | |
464 format = "%s", | |
465 key = xprssn, | |
466 value = result, | |
467 suffix = suffix | |
468 ) | |
469 } | |
470 } | |
471 | |
472 see_character <- | |
473 function(x, suffix = "", xprssn = deparse1(substitute(x))) { | |
474 if (is.character(x)) { | |
475 see_kvp( | |
476 format = "%s", | |
477 key = xprssn, | |
478 value = sprintf("\"%s\"", x), | |
479 suffix = suffix | |
480 ) | |
481 } | |
482 } | |
483 | |
484 see_variable <- | |
485 function(x, suffix = "", digits = 3, xprssn = deparse1(substitute(x))) { | |
486 if (is.character(x)) { | |
487 see_character(x, suffix, xprssn) | |
488 } else if (is.numeric(x)) { | |
489 see_numeric(x, suffix, digits, xprssn) | |
490 } else if (is.logical(x)) { | |
491 see_logical(x, suffix, xprssn) | |
492 } else { | |
493 f <- file("") | |
494 sink(f) | |
495 str(x) | |
496 msg <- paste(readLines(f), collapse = "\n") | |
497 sink() | |
498 close(f) | |
499 paste0( | |
500 "see_variable - str(", | |
501 xprssn, | |
502 "):\n", | |
503 msg, "\n" | |
504 ) | |
505 } | |
506 } | |
507 | |
508 # ref: https://tug.org/texinfohtml/latex2e.html | |
509 # LaTeX sets aside the following characters for special purposes. | |
510 # For example, the percent sign % is for comments. | |
511 # They are called reserved characters or special characters. | |
512 # They are all discussed elsewhere in this manual. | |
513 # | |
514 # $ % & { } _ ~ ^ \ # | |
515 # | |
516 # If you want a reserved character to be printed as itself, in the text body | |
517 # font, for all but the final three characters in that list simply put | |
518 # a backslash \ in front of the character. | |
519 # Thus, typing \$1.23 will produce $1.23 in your output. | |
520 # | |
521 # As to the last three characters, to get a tilde in the text body font, | |
522 # use \~{} (omitting the curly braces would result in the next character | |
523 # receiving a tilde accent). | |
524 # Similarly, to get a text body font circumflex use \^{}. | |
525 # To get a backslash in the font of the text body enter \textbackslash{}. | |
526 whack_math <- | |
527 function(v) { | |
528 v <- as.character(v) | |
529 w <- gsub("\\", "\\textbackslash ", v, fixed = TRUE) | |
530 w <- Reduce( | |
531 f = function(l, r) { | |
532 gsub(r, paste0("\\", r), l, fixed = TRUE) | |
533 }, | |
534 x = c("#", "$", "%", "&", "{", "}", "_"), | |
535 init = w | |
536 ) | |
537 w <- gsub("^", "\\^{}", w, fixed = TRUE) | |
538 return(w) | |
539 if (all(v == w)) | |
540 v | |
541 else | |
542 paste0("\\texttt{", w, "}") | |
543 } | |
544 whack_underscores <- whack_math | |
545 | |
546 ## dump params to stderr (remove this eventually) | |
547 | |
548 if (FALSE) nbe(see_variable(params)) | |
549 | |
550 ## unlist params for eventual output | |
551 | |
552 param_unlist <- unlist(as.list(params)) | |
553 | |
554 # no need to whack underscores and dollars because this is verbatim | |
555 param_df <- data.frame( | |
556 parameter = paste0("\\verb@", names(param_unlist), "@"), | |
557 value = paste0( | |
558 "\n\\begin{tiny}\n\\verb@", | |
559 param_unlist, | |
560 "@\n\\end{tiny}" | |
561 ) | |
562 ) | |
563 param_df <- data.frame( | |
564 parameter = names(param_unlist), | |
565 value = param_unlist | |
566 ) | |
567 param_df <- param_df[order(param_df$parameter), ] | |
568 | |
569 ## general output control | |
570 | |
571 debug_file_base_path <- params$debugFileBasePath | |
572 print_trace_messages <- params$printTraceMsgs | |
573 show_enriched_substrates <- params$showEnrichedSubstrates | |
574 boxplot_varwidth <- params$boxPlotVarWidth | |
575 boxplot_notch <- params$boxPlotNotch | |
576 | |
577 ## parameters for static data | |
578 | |
579 kinase_name_uprt_lut_bz2 <- params$kinaseNameUprtLutBz2 | |
580 kinase_uprt_desc_lut_bz2 <- params$kinaseUprtDescLutBz2 | |
581 | |
582 ## parameters for input file | |
583 | |
584 preproc_db <- params$preprocDb | |
585 alpha_file <- params$alphaFile | |
586 input_file <- params$inputFile | |
587 | |
588 # First data column - ideally, this could be detected via | |
589 # regexSampleNames, but for now leave it as is. | |
590 first_data_column <- params$firstDataColumn | |
591 fdc_is_integer <- is.integer(first_data_column) | |
592 if (fdc_is_integer) { | |
593 first_data_column <- as.integer(params$firstDataColumn) | |
594 } | |
595 | |
596 ## parameters for output files | |
597 | |
598 ksea_app_prep_db <- params$kseaAppPrepDb | |
599 imputed_data_filename <- params$imputedDataFilename | |
600 imp_qn_lt_data_filenm <- params$imputedQNLTDataFile | |
601 anova_ksea_mtdt_file <- params$anovaKseaMetadata | |
602 | |
603 ## parameters for imputation | |
604 | |
605 # Imputation method, should be one of | |
606 # "random", "group-median", "median", or "mean" | |
607 imputation_method <- params$imputationMethod | |
608 | |
609 # Selection of percentile of logvalue data to set the mean for random number | |
610 # generation when using random imputation | |
611 mean_percentile <- params$meanPercentile / 100.0 | |
612 | |
613 # deviation adjustment-factor for random values; real number. | |
614 sd_percentile <- params$sdPercentile | |
615 | |
616 ## parameters for group parsing and filtering | |
617 | |
618 # Regular expression of Sample Names, e.g., "\\.(\\d+)[A-Z]$" | |
619 regex_sample_names <- params$regexSampleNames | |
620 # Regular expression to extract Sample Grouping from Sample Name; | |
621 # if error occurs, compare smpl_trt vs. sample_name_matches | |
622 # to see if groupings/pairs line up | |
623 # e.g., "(\\d+)" | |
624 | |
625 regex_sample_grouping <- params$regexSampleGrouping | |
626 # What are the patterns for filtering sample groups? | |
627 # How should sample groups be filtered? | |
628 # - none: do not filter | |
629 # - include: include sample groups matching filter | |
630 # - exclude: include sample groups not matching filter | |
631 | |
632 sample_group_filter <- params$groupFilter | |
633 if (grepl("f", params$groupFilterMode, fixed = TRUE)) { | |
634 sample_group_filter_perl <- FALSE | |
635 sample_group_filter_fixed <- TRUE | |
636 } else if (grepl("p", params$groupFilterMode, fixed = TRUE)) { | |
637 sample_group_filter_perl <- TRUE | |
638 sample_group_filter_fixed <- FALSE | |
639 } else { # normal regex | |
640 sample_group_filter_perl <- FALSE | |
641 sample_group_filter_fixed <- FALSE | |
642 } | |
643 | |
644 sample_group_filter_nocase <- | |
645 grepl("i", params$groupFilterMode, fixed = TRUE) | |
646 | |
647 # What PCRE patterns should be included or excluded | |
648 group_filter_patterns_csv <- params$groupFilterPatterns | |
649 sample_group_filter_patterns <- strsplit( | |
650 x = group_filter_patterns_csv, | |
651 split = ",", | |
652 fixed = TRUE | |
653 )[[1]] | |
654 | |
655 ## parameters for hypothesis testing | |
656 | |
657 one_way_all_categories_fname <- params$oneWayManyCategories | |
658 | |
659 one_way_all_categories <- try_catch_w_e( | |
660 match.fun(one_way_all_categories_fname)) | |
661 | |
662 if (!is.function(one_way_all_categories$value)) { | |
663 write("fatal error for parameter oneWayManyCategories:", stderr()) | |
664 write(one_way_all_categories$value$message, stderr()) | |
665 if (sys.nframe() > 0) { | |
666 cat("Cannot continue and quit() failed. Goodbye.") | |
667 knitr::knit_exit() | |
668 quit(save = "no", status = 1) | |
669 } | |
670 } | |
671 | |
672 one_way_all_categories <- one_way_all_categories$value | |
673 | |
674 one_way_two_categories_fname <- params$oneWayManyCategories | |
675 one_way_two_categories <- try_catch_w_e( | |
676 match.fun(one_way_two_categories_fname)) | |
677 if (!is.function(one_way_two_categories$value)) { | |
678 cat("fatal error for parameter oneWayTwoCategories: \n") | |
679 cat(one_way_two_categories$value$message, fill = TRUE) | |
680 if (sys.nframe() > 0) { | |
681 cat("Cannot continue and quit() failed. Goodbye.") | |
682 knitr::knit_exit() | |
683 quit(save = "no", status = 1) | |
684 } | |
685 } | |
686 one_way_two_categories <- one_way_two_categories$value | |
687 | |
688 ## parameters for KSEA | |
689 | |
690 ksea_cutoff_statistic <- params$kseaCutoffStatistic | |
691 ksea_cutoff_threshold <- params$kseaCutoffThreshold | |
692 ksea_min_substrate_count <- params$kseaMinSubstrateCount | |
693 | |
694 ## parameters for global variables consumed by functions | |
695 | |
696 # intensityHeatmapCriteria: !r c("na_count", "p_value")[2] # TODO switch to 1 | |
697 # TODO Validate within list | |
698 g_intensity_hm_criteria <- params$intensityHeatmapCriteria | |
699 if (is_string_null_or_empty(g_intensity_hm_criteria)) { | |
700 cat("invalid intensityHeatmapCriteria parameter (must be string)") | |
701 knitr::knit_exit() | |
702 } | |
703 switch( | |
704 g_intensity_hm_criteria, | |
705 "quality" = NULL, | |
706 "na_count" = NULL, | |
707 "p_value" = NULL, | |
708 { | |
709 with( | |
710 params, | |
711 cat( | |
712 sprintf( | |
713 "invalid %s (must be %s)", | |
714 see_variable(intensityHeatmapCriteria), | |
715 "one of quality or na_count or p_value" | |
716 ) | |
717 ) | |
718 ) | |
719 knitr::knit_exit() | |
720 } | |
721 ) | |
722 | |
723 # intensityHeatmapRows: 50 | |
724 # TODO Validate >> 0 < 75 | |
725 g_intensity_hm_rows <- params$intensityHeatmapRows | |
726 if (!is.integer(g_intensity_hm_rows) || g_intensity_hm_rows < 1) { | |
727 cat("invalid intensityHeatmapRows (must be integer > 0)") | |
728 knitr::knit_exit() | |
729 } | |
730 | |
731 g_intensity_min_per_class <- params$intensityMinValuesPerGroup | |
732 if (!is.integer(g_intensity_min_per_class) || g_intensity_min_per_class < 0) { | |
733 cat("invalid intensityMinValuesPerGroup (must be integer > -1") | |
734 knitr::knit_exit() | |
735 } | |
736 | |
737 if (is.na(as.logical(g_correlate_substrates <- params$correlateSubstrates))) { | |
738 cat("invalid correlateSubstrates (must be TRUE or FALSE)") | |
739 knitr::knit_exit() | |
740 } | |
741 | |
742 if (is.na(as.logical(g_filter_cov_var_gt_1 <- params$filterCovVarGT1))) { | |
743 cat("invalid filterCovVarGT1 parameter (must be TRUE or FALSE)") | |
744 knitr::knit_exit() | |
745 } | |
746 | |
747 # TODO Validate >> 0 < 30 | |
748 g_ppep_trunc_n <- params$ppepTruncN | |
749 | |
750 # TODO Validate >> 0 < 30 | |
751 g_subgene_trunc_n <- params$subgeneTruncN | |
752 | |
753 # TODO Validate >> 0 < 30 | |
754 g_sbstr_trunc_n <- params$substTruncN | |
755 | |
756 | |
757 ### OPERATORS | |
758 | |
759 # Test for exclusion | |
760 # ref: https://www.reneshbedre.com/blog/in-operator-r.html | |
761 `%notin%` <- Negate(`%in%`) | |
762 | |
763 # Augmented assignment | |
764 # ref: https://www2.cs.arizona.edu/icon/refernce/infix2.htm#aug_assign | |
765 `%||:=%` <- function(lvalue, ...) { | |
766 pf <- parent.frame() | |
767 rvalue <- Reduce(paste0, x = ..., init = lvalue) | |
768 assign( | |
769 x = as.character(substitute(lvalue)), | |
770 value = rvalue, | |
771 pos = pf | |
772 ) | |
773 invisible(rvalue) | |
774 } | |
775 | |
776 ### FUNCTIONS | |
777 | |
778 no_op <- | |
779 function() { | |
780 } | |
781 # this function is not used in this file and should be removed while | |
782 # factoring out reusable code | |
783 all_apply <- function(f, v, na_rm = TRUE, ...) { | |
784 Reduce( | |
785 f = function(l, r) if (na_rm && is.na(r)) TRUE else l && r, | |
786 x = sapply(X = v, FUN = f, ...), | |
787 init = TRUE | |
788 ) | |
789 } | |
790 | |
791 write_debug_file <- function(data_frame) { | |
792 if (!is.null(debug_file_base_path)) { | |
793 s_path <- | |
794 sprintf( | |
795 "%s/%s.txt", | |
796 debug_file_base_path, | |
797 deparse(substitute(data_frame)) | |
798 ) | |
170 write.table( | 799 write.table( |
171 s, | 800 data_frame, |
172 file = s_path, | 801 file = s_path, |
173 sep = "\t", | 802 sep = "\t", |
174 col.names = TRUE, | 803 col.names = TRUE, |
175 row.names = TRUE, | 804 row.names = TRUE, |
176 quote = FALSE | 805 quote = FALSE |
187 # Hence, `x <- 1; get("x", new_env())` fails by design. | 816 # Hence, `x <- 1; get("x", new_env())` fails by design. |
188 new_env <- function() { | 817 new_env <- function() { |
189 new.env(parent = emptyenv()) | 818 new.env(parent = emptyenv()) |
190 } | 819 } |
191 | 820 |
821 # make apply readable for rows | |
822 row_apply <- function(x, fun, ..., simplify = TRUE) { | |
823 apply(x, MARGIN = 1, fun, ..., simplify = TRUE) | |
824 } | |
825 | |
826 # make apply readable for columns | |
827 column_apply <- function(x, fun, ..., simplify = TRUE) { | |
828 apply(x, MARGIN = 2, fun, ..., simplify = TRUE) | |
829 } | |
830 | |
831 ##' Produce a vector of boolean values whose i-th value is TRUE when any | |
832 ##' member of v matches the i-th membr of s, where i in 1:seq_len(length(s)) | |
833 ##' | |
834 ##' @title Search multiple strings for matches of multiple substrings | |
835 ##' @param v a vector of substrings to match | |
836 ##' @param s a vector of strings to search for matches | |
837 ##' @param ... additional arguments to grepl | |
838 ##' @return a list with keys in s and valuse that are vectors of elements of v | |
839 ##' @author Art Eschenlauer | |
840 ##' Copyright (C) 2022 Art Eschenlauer; | |
841 ##' MIT License; https://en.wikipedia.org/wiki/MIT_License#License_terms | |
842 mgrepl <- function(v, s, ...) { | |
843 grpl_rslt <- rep_len(0, length(s)) | |
844 for (vi in v) { | |
845 grpl_rslt_v <- sapply( | |
846 X = s, | |
847 FUN = function(t) { | |
848 Reduce( | |
849 f = function(l, r) if (is.null(l)) r else c(l, r), | |
850 x = sapply( | |
851 X = vi, | |
852 FUN = function(f) grepl(f, t, ...) | |
853 ), | |
854 init = c() | |
855 ) | |
856 }, | |
857 simplify = "array" | |
858 ) | |
859 grpl_rslt <- grpl_rslt + grpl_rslt_v | |
860 } | |
861 rslt <- unname(grpl_rslt > 0) | |
862 } | |
863 | |
864 ##' Produce positions in a vector where succeeding value != current valus | |
865 ##' | |
866 ##' @title Search vector for neighboring positions having different values | |
867 ##' @param v a vector of comparable numeric values (e.g. integers) | |
868 ##' @return a vector of positions i where v[i] != v[i + 1] | |
869 ##' @author Art Eschenlauer | |
870 ##' Copyright (C) 2022 Art Eschenlauer; | |
871 ##' MIT License; https://en.wikipedia.org/wiki/MIT_License#License_terms | |
872 transition_positions <- function(v) { | |
873 Reduce( | |
874 f = function(l, i) if ((i != 1) && (v[i - 1] != v[i])) c(l, i - 1) else l, | |
875 x = seq_along(v)[-1:0], | |
876 init = c() | |
877 ) | |
878 } | |
879 | |
880 ### figure debug functions | |
881 | |
882 cat_par_vector <- function(par_name, lbl = "", newlines = TRUE) { | |
883 cat( | |
884 sprintf( | |
885 "%spar(%s) = c(%s)%s", | |
886 lbl, | |
887 par_name, | |
888 paste(par(par_name), collapse = ", "), | |
889 if (newlines) "\n\n" else "" | |
890 ) | |
891 ) | |
892 } | |
893 | |
894 cat_margins <- function(lbl = NULL) { | |
895 for (p in c("fig", "fin", "mar", "mai", "omd", "omi", "oma")) | |
896 cat_par_vector(p, if (!is.null(lbl)) paste0(lbl, " ") else NULL) | |
897 } | |
898 | |
899 cat_variable <- | |
900 function(x, suffix = "", digits = 3, force_str = FALSE) { | |
901 xprssn <- deparse1(substitute(x)) | |
902 if (force_str || is.matrix(x) || is.list(x) || is.data.frame(x)) { | |
903 cat( | |
904 paste0( | |
905 "\n\\texttt{\\textbf{", | |
906 whack_underscores(xprssn), | |
907 "}} [", | |
908 typeof(x), | |
909 ",", | |
910 mode(x), | |
911 "] =\n" | |
912 ) | |
913 ) | |
914 cat("\n\\begin{verbatim}\n") | |
915 str(x) | |
916 cat("\n\\end{verbatim}\n") | |
917 } else { | |
918 cat("\n", see_variable(x, suffix, digits, xprssn)) | |
919 } | |
920 } | |
921 | |
922 ### structure helper functions | |
923 | |
924 # ref: staque.R - Icon-oriented stack and queue operations | |
925 # - https://gist.github.com/eschen42/917690355e53918b9e7ba7138a02d1f8 | |
926 # | |
927 # sq_get(v):x produces the leftmost element of v and removes it from v, | |
928 # but produces NA if v is empty | |
929 sq_get <- function(v) { | |
930 if (length(v) == 0) return(NA) | |
931 assign(as.character(substitute(v)), v[-1], parent.frame()) | |
932 return(v[1]) | |
933 } | |
934 # | |
935 # sq_put(v,x1,...,xn):v puts x1, x2, ..., xn onto the right end of v, | |
936 # producing v. | |
937 # Values are pushed in order from left to right, | |
938 # so xn becomes the last (rightmost) value on v. | |
939 # sq_put(v) with no second argument does nothing. | |
940 sq_put <- function(v, x = NA, ...) { | |
941 pf <- parent.frame() | |
942 if (is.null(x)) return(pf$v) | |
943 if ( | |
944 !(length(x) > 1) && | |
945 !rlang::is_closure(x) && | |
946 is.na(x) | |
947 ) return(pf$v) | |
948 assign(as.character(substitute(v)), c(v, x, ...), pf) | |
949 pf[[as.character(substitute(v))]] | |
950 } | |
951 | |
192 ### numerical/statistical helper functions | 952 ### numerical/statistical helper functions |
193 | 953 |
194 any_nan <- function(x) { | 954 any_nan <- function(x) { |
195 !any(x == "NaN") | 955 !any(x == "NaN") |
196 } | 956 } |
199 sd_finite <- function(x) { | 959 sd_finite <- function(x) { |
200 ok <- is.finite(x) | 960 ok <- is.finite(x) |
201 sd(x[ok]) | 961 sd(x[ok]) |
202 } | 962 } |
203 | 963 |
964 # compute anova raw p-value | |
204 anova_func <- function(x, grouping_factor, one_way_f) { | 965 anova_func <- function(x, grouping_factor, one_way_f) { |
205 subject <- data.frame( | 966 subject <- data.frame( |
206 intensity = x | 967 intensity = x |
207 ) | 968 ) |
208 x_aov <- | 969 x_aov <- |
216 else | 977 else |
217 pvalue <- x_aov$p.value | 978 pvalue <- x_aov$p.value |
218 pvalue | 979 pvalue |
219 } | 980 } |
220 | 981 |
982 # This code adapted from matrixcalc::is.positive.definite | |
983 # Notably, this simply tests without calling stop() | |
984 is_positive_definite <- function(x, tol = 1e-08) { | |
985 if (!is.matrix(x)) | |
986 return(FALSE) | |
987 if (!is.numeric(x)) | |
988 return(FALSE) | |
989 if (nrow(x) < 1) | |
990 return(FALSE) | |
991 if (ncol(x) < 1) | |
992 return(FALSE) | |
993 if (nrow(x) != ncol(x)) | |
994 return(FALSE) | |
995 sum_symm <- sum(x == t(x), na.rm = TRUE) | |
996 value_count <- Reduce("*", dim(x)) | |
997 if (sum_symm != value_count) | |
998 return(FALSE) | |
999 eigenvalues <- eigen(x, only.values = TRUE)$values | |
1000 n <- nrow(x) | |
1001 for (i in 1:n) { | |
1002 if (abs(eigenvalues[i]) < tol) { | |
1003 eigenvalues[i] <- 0 | |
1004 } | |
1005 } | |
1006 if (any(eigenvalues <= 0)) { | |
1007 return(FALSE) | |
1008 } | |
1009 return(TRUE) | |
1010 } | |
221 | 1011 |
222 ### LaTeX functions | 1012 ### LaTeX functions |
223 | 1013 |
224 latex_collapsed_vector <- function(collapse_string, v, underscore_whack = TRUE) { | |
225 v_sub <- if (underscore_whack) gsub("_", "\\\\_", v) else v | |
226 cat( | |
227 paste0( | |
228 v_sub, | |
229 collapse = collapse_string | |
230 ) | |
231 ) | |
232 } | |
233 | |
234 latex_itemized_collapsed <- function(collapse_string, v, underscore_whack = TRUE) { | |
235 cat("\\begin{itemize}\n\\item ") | |
236 latex_collapsed_vector(collapse_string, v, underscore_whack) | |
237 cat("\n\\end{itemize}\n") | |
238 } | |
239 | |
240 latex_itemized_list <- function(v, underscore_whack = TRUE) { | |
241 latex_itemized_collapsed("\n\\item ", v, underscore_whack) | |
242 } | |
243 | |
244 latex_enumerated_collapsed <- function(collapse_string, v, underscore_whack = TRUE) { | |
245 cat("\\begin{enumerate}\n\\item ") | |
246 latex_collapsed_vector(collapse_string, v, underscore_whack) | |
247 cat("\n\\end{enumerate}\n") | |
248 } | |
249 | |
250 latex_enumerated_list <- function(v) { | |
251 latex_enumerated_collapsed("\n\\item ", v) | |
252 } | |
253 | |
254 latex_table_row <- function(v, extra = "", underscore_whack = TRUE) { | |
255 latex_collapsed_vector(" & ", v, underscore_whack) | |
256 cat(extra) | |
257 cat(" \\\\\n") | |
258 } | |
259 | |
260 # Use this like print.data.frame, from which it is adapted: | 1014 # Use this like print.data.frame, from which it is adapted: |
261 data_frame_latex <- | 1015 data_frame_table_latex <- |
262 function( | 1016 function( |
263 x, | 1017 x, |
264 ..., | |
265 # digits to pass to format.data.frame | 1018 # digits to pass to format.data.frame |
266 digits = NULL, | 1019 digits = NULL, |
267 # TRUE -> right-justify columns; FALSE -> left-justify | 1020 # TRUE -> right-justify columns; FALSE -> left-justify |
268 right = TRUE, | 1021 right = TRUE, |
269 # maximumn number of rows to print | 1022 # maximumn number of rows to print |
275 # optional caption | 1028 # optional caption |
276 caption = NULL, | 1029 caption = NULL, |
277 # h(inline); b(bottom); t (top) or p (separate page) | 1030 # h(inline); b(bottom); t (top) or p (separate page) |
278 anchor = "h", | 1031 anchor = "h", |
279 # set underscore_whack to TRUE to escape underscores | 1032 # set underscore_whack to TRUE to escape underscores |
280 underscore_whack = TRUE | 1033 underscore_whack = TRUE, |
1034 # how to emit results | |
1035 emit = cat | |
281 ) { | 1036 ) { |
282 if (is.null(justification)) | 1037 if (is.null(justification)) |
283 justification <- | 1038 justification <- |
284 Reduce( | 1039 Reduce( |
285 f = paste, | 1040 f = paste, |
286 x = rep_len(if (right) "r" else "l", length(colnames(x))) | 1041 x = rep_len(if (right) "r" else "l", length(colnames(x))) |
287 ) | 1042 ) |
1043 n <- length(rownames(x)) | |
1044 if (length(x) == 0L) { | |
1045 emit( | |
1046 sprintf( | |
1047 # if n is one, use singular 'row', else use plural 'rows' | |
1048 ngettext( | |
1049 n, | |
1050 "data frame with 0 columns and %d row", | |
1051 "data frame with 0 columns and %d rows" | |
1052 ), | |
1053 n | |
1054 ), | |
1055 "\n", | |
1056 sep = "" | |
1057 ) | |
1058 } else if (n == 0L) { | |
1059 emit("0 rows for:\n") | |
1060 latex_itemized_list( | |
1061 v = names(x), | |
1062 underscore_whack = underscore_whack | |
1063 ) | |
1064 } else { | |
1065 if (is.null(max)) | |
1066 max <- getOption("max.print", 99999L) | |
1067 if (!is.finite(max)) { | |
1068 cat("Abend because: invalid 'max' / getOption(\"max.print\"): ", max) | |
1069 knitr::knit_exit() | |
1070 } | |
1071 omit <- (n0 <- max %/% length(x)) < n | |
1072 m <- as.matrix( | |
1073 format.data.frame( | |
1074 if (omit) x[seq_len(n0), , drop = FALSE] else x, | |
1075 digits = digits, | |
1076 na.encode = FALSE | |
1077 ) | |
1078 ) | |
1079 emit( | |
1080 # h(inline); b(bottom); t (top) or p (separate page) | |
1081 paste0("\\begin{table}[", anchor, "]"), | |
1082 "\\leavevmode", | |
1083 sep = "\n" | |
1084 ) | |
1085 if (!is.null(caption)) | |
1086 emit(paste0(" \\caption{", caption, "}")) | |
1087 if (centered) emit("\\centering\n") | |
1088 emit( | |
1089 paste( | |
1090 " \\begin{tabular}{", | |
1091 justification, | |
1092 "}\n", | |
1093 sep = "" | |
1094 ) | |
1095 ) | |
1096 | |
1097 # ref for top and bottom struts (\T and \B): | |
1098 # https://tex.stackexchange.com/a/50355 | |
1099 if (!is.null(caption)) | |
1100 emit("\\B \\\\\n") | |
1101 latex_table_row( | |
1102 v = colnames(m), | |
1103 extra = " \\T \\B", | |
1104 underscore_whack = underscore_whack | |
1105 ) | |
1106 emit("\\hline \\\\\n") | |
1107 for (i in seq_len(length(m[, 1]))) { | |
1108 latex_table_row( | |
1109 v = m[i, ], | |
1110 underscore_whack = underscore_whack | |
1111 ) | |
1112 } | |
1113 emit( | |
1114 paste( | |
1115 " \\end{tabular}", | |
1116 "\\end{table}", | |
1117 sep = "\n" | |
1118 ) | |
1119 ) | |
1120 if (omit) | |
1121 emit(" [ reached 'max' / getOption(\"max.print\") -- omitted", | |
1122 n - n0, "rows ]\n") | |
1123 } | |
1124 invisible(x) | |
1125 } | |
1126 | |
1127 # Use this like print.data.frame, from which it is adapted: | |
1128 data_frame_tabbing_latex <- | |
1129 function( | |
1130 x, | |
1131 # vector of tab stops, in inches | |
1132 tabstops, | |
1133 # vector of headings, registered with tab-stops | |
1134 headings = colnames(x), | |
1135 # digits to pass to format.data.frame | |
1136 digits = NULL, | |
1137 # maximumn number of rows to print | |
1138 max = NULL, | |
1139 # optional caption | |
1140 caption = NULL, | |
1141 # set underscore_whack to TRUE to escape underscores | |
1142 underscore_whack = TRUE, | |
1143 # flag for landscape mode | |
1144 landscape = FALSE, | |
1145 # flag indicating that subsubsection should be used for caption | |
1146 # rather than subsection | |
1147 use_subsubsection_header = TRUE, | |
1148 # character-size indicator; for possible values, see: | |
1149 # https://tug.org/texinfohtml/latex2e.html#Font-sizes | |
1150 charactersize = "small", | |
1151 # set verbatim to TRUE to debug output | |
1152 verbatim = FALSE | |
1153 ) { | |
1154 | |
1155 hlinport <- if (landscape) { | |
1156 function() cat("\\hlinlscp \\\\\n") | |
1157 } else { | |
1158 function() cat("\\hlinport \\\\\n") | |
1159 } | |
1160 | |
1161 tabstops_tex <- | |
1162 Reduce( | |
1163 f = function(l, r) paste0(l, r), | |
1164 x = sprintf("\\hspace{%0.2fin}\\=", tabstops), | |
1165 init = "" | |
1166 ) | |
1167 | |
288 n <- length(rownames(x)) | 1168 n <- length(rownames(x)) |
289 if (length(x) == 0L) { | 1169 if (length(x) == 0L) { |
290 cat( | 1170 cat( |
291 sprintf( | 1171 sprintf( |
292 # if n is one, use singular 'row', else use plural 'rows' | 1172 # if n is one, use singular 'row', else use plural 'rows' |
307 underscore_whack = underscore_whack | 1187 underscore_whack = underscore_whack |
308 ) | 1188 ) |
309 } else { | 1189 } else { |
310 if (is.null(max)) | 1190 if (is.null(max)) |
311 max <- getOption("max.print", 99999L) | 1191 max <- getOption("max.print", 99999L) |
312 if (!is.finite(max)) | 1192 if (!is.finite(max)) { |
313 stop("invalid 'max' / getOption(\"max.print\"): ", | 1193 cat("Abend because: invalid 'max' / getOption(\"max.print\"): ", max) |
314 max) | 1194 knitr::knit_exit() |
1195 } | |
315 omit <- (n0 <- max %/% length(x)) < n | 1196 omit <- (n0 <- max %/% length(x)) < n |
316 m <- as.matrix( | 1197 m <- as.matrix( |
317 format.data.frame( | 1198 format.data.frame( |
318 if (omit) x[seq_len(n0), , drop = FALSE] else x, | 1199 if (omit) x[seq_len(n0), , drop = FALSE] else x, |
319 digits = digits, | 1200 digits = digits, |
320 na.encode = FALSE | 1201 na.encode = FALSE |
321 ) | 1202 ) |
322 ) | 1203 ) |
323 cat( | 1204 if (landscape) |
324 # h(inline); b(bottom); t (top) or p (separate page) | 1205 cat("\n\\begin{landscape}") |
325 paste0("\\begin{table}[", anchor, "]\n") | 1206 tex_caption <- |
326 ) | 1207 if (!is.null(caption)) sprintf("\\captionof{table}{%s}\n", caption) |
327 if (!is.null(caption)) | 1208 else "\n" |
328 cat(paste0(" \\caption{", caption, "}")) | 1209 # build the column names, which have multiple lines when |
329 if (centered) cat("\\centering\n") | 1210 # length(headings) is a multiple of the number of columns |
330 cat( | 1211 column_names <- "" |
331 paste( | 1212 while (length(headings) > 0) { |
332 " \\begin{tabular}{", | 1213 my_row <- c() |
333 justification, | 1214 for (i in 1:(1 + length(tabstops))) { |
334 "}\n", | 1215 my_field <- sq_get(headings) |
335 sep = "" | 1216 sq_put(my_row, if (is.na(my_field)) "" else my_field) |
336 ) | 1217 } |
337 ) | 1218 column_names %||:=% latex_tabbing_row( |
338 # ref: https://tex.stackexchange.com/a/50353 | 1219 v = my_row, |
339 # Describes use of \rule{0pt}{3ex} | 1220 underscore_whack = underscore_whack, |
340 if (!is.null(caption)) | 1221 action = paste0 |
341 cat("\\B \\\\ \\hline\\hline\n") | |
342 # ref for top and bottom struts: https://tex.stackexchange.com/a/50355 | |
343 latex_table_row( | |
344 v = colnames(m), | |
345 extra = "\\T\\B", | |
346 underscore_whack = underscore_whack | |
347 ) | |
348 cat("\\hline\n") | |
349 for (i in seq_len(length(m[, 1]))) { | |
350 latex_table_row( | |
351 v = m[i, ], | |
352 underscore_whack = underscore_whack | |
353 ) | 1222 ) |
354 } | 1223 } |
1224 | |
1225 # Begin tabbing environment after beginning charactersize environment | |
1226 if (verbatim) cat("\n\\begin{verbatim}") | |
355 cat( | 1227 cat( |
356 paste( | 1228 paste0( |
357 " \\end{tabular}", | 1229 "\n\\begin{", charactersize, "}", tex_caption, |
358 "\\end{table}", | 1230 "\\begin{tabwrap}{", tabstops_tex, "}\n" |
359 sep = "\n" | |
360 ) | |
361 ) | 1231 ) |
1232 ) | |
1233 # emit column names | |
1234 cat(column_names) | |
1235 # emit hline | |
1236 hlinport() | |
1237 for (i in seq_len(length(m[, 1]))) { | |
1238 my_row <- latex_tabbing_row( | |
1239 v = m[i, ], | |
1240 underscore_whack = underscore_whack, | |
1241 action = paste0 | |
1242 ) | |
1243 if (FALSE) | |
1244 cat(my_row) | |
1245 else | |
1246 cat(my_row) | |
1247 } | |
1248 hlinport() | |
362 if (omit) | 1249 if (omit) |
363 cat(" [ reached 'max' / getOption(\"max.print\") -- omitted", | 1250 cat(" [ reached 'max' / getOption(\"max.print\") -- omitted", |
364 n - n0, "rows ]\n") | 1251 n - n0, "rows ]\n") |
1252 # End charactersize environment after ending tabbing environment | |
1253 cat(paste0("\\end{tabwrap}\n\\end{", charactersize, "}\n")) | |
1254 if (verbatim) cat("\\end{verbatim}\n") | |
1255 if (landscape) | |
1256 cat("\\end{landscape}\n") | |
365 } | 1257 } |
366 invisible(x) | 1258 invisible(x) |
1259 } | |
1260 | |
1261 param_df_noexit <- | |
1262 function(e = NULL) { | |
1263 data_frame_tabbing_latex( | |
1264 x = param_df, | |
1265 tabstops = c(1.75), | |
1266 underscore_whack = TRUE, | |
1267 caption = "Input parameters", | |
1268 verbatim = FALSE | |
1269 ) | |
1270 if (!is.null(e)) { | |
1271 sink(stderr()) | |
1272 cat("Caught fatal error:\n\n") | |
1273 str(e) | |
1274 sink() | |
1275 } | |
1276 } | |
1277 | |
1278 param_df_exit <- | |
1279 function(e = NULL) { | |
1280 param_df_noexit(e) | |
1281 knitr::knit_exit() | |
1282 exit(-1) | |
1283 } | |
1284 | |
1285 # exit with exit code (default 0) and optional msg | |
1286 exit <- | |
1287 function(code = 0, msg = NULL, use_stderr = FALSE) { | |
1288 if (!is.null(msg)) { | |
1289 if (use_stderr) sink(stderr()) | |
1290 cat("\n\n", msg, "\n\n") | |
1291 if (use_stderr) sink() | |
1292 } | |
1293 q(save = "no", status = code) | |
1294 } | |
1295 | |
1296 # make control sequences into printable latex sequences | |
1297 latex_printable_control_seqs <- | |
1298 function(s) { | |
1299 s <- gsub("[\\]", "xyzzy_plugh", s) | |
1300 s <- gsub("[$]", "\\\\$", s) | |
1301 s <- gsub("xyzzy_plugh", "$\\\\backslash$", s) | |
1302 return(s) | |
1303 } | |
1304 nolatex_verbatim <- | |
1305 function(expr) eval(expr) | |
1306 | |
1307 latex_verbatim <- | |
1308 function(expr) { | |
1309 arg_string <- deparse1(substitute(expr)) | |
1310 cat("\n\\begin{verbatim}\n___\n") | |
1311 tryCatch( | |
1312 expr = expr, | |
1313 error = param_df_exit, | |
1314 #ACE error = | |
1315 #ACE function(e) { | |
1316 #ACE cat("Caught error:\n\n") | |
1317 #ACE str(e) | |
1318 #ACE knitr::knit_exit() | |
1319 #ACE stop(e) | |
1320 #ACE }, | |
1321 finally = cat("...\n\\end{verbatim}\n") | |
1322 ) | |
1323 } | |
1324 | |
1325 latex_samepage <- | |
1326 function(expr) { | |
1327 arg_string <- deparse1(substitute(expr)) | |
1328 cat("\n\\begin{samepage}\n") | |
1329 tryCatch( | |
1330 expr = expr, | |
1331 error = param_df_exit, | |
1332 #ACE error = | |
1333 #ACE function(e) { | |
1334 #ACE cat("Caught error:\n\n") | |
1335 #ACE str(e) | |
1336 #ACE knitr::knit_exit() | |
1337 #ACE stop(e) | |
1338 #ACE }, | |
1339 finally = cat("\n\\end{samepage}\n") | |
1340 ) | |
1341 } | |
1342 | |
1343 # return the result of invocation after showing parameters | |
1344 # ref: https://www.r-bloggers.com/2013/08/a-new-r-trick-for-me-at-least/ | |
1345 latex_show_invocation <- | |
1346 function(f, f_name = deparse1(substitute(f)), head_patch = FALSE) { | |
1347 function(...) { | |
1348 my_env <- (as.list(environment())) | |
1349 va <- list(...) | |
1350 my_rslt <- new_env() | |
1351 my_rslt$rslt <- NULL | |
1352 latex_verbatim( | |
1353 expr = { | |
1354 cat(sprintf("\n .. Local variables for '%s':\n\n", f_name)) | |
1355 str(va) | |
1356 if (!head_patch) { | |
1357 # return this result | |
1358 # ref: https://www.r-bloggers.com/2013/08/a-new-r-trick-for-me-at-least/ | |
1359 cat(sprintf("\n .. Invoking '%s'\n", f_name)) | |
1360 tryCatch( | |
1361 { | |
1362 cat("\n\\end{verbatim}\n") | |
1363 rslt <- do.call(f, va) | |
1364 }, | |
1365 error = param_df_exit, | |
1366 #ACE error = function(e) { | |
1367 #ACE cat("\n\\begin{verbatim}\n") | |
1368 #ACE str(e) | |
1369 #ACE cat("\n\\end{verbatim}\n") | |
1370 #ACE knitr::knit_exit() | |
1371 #ACE stop(e) | |
1372 #ACE }, | |
1373 finally = cat("\n\\begin{verbatim}\n") | |
1374 ) | |
1375 cat(sprintf("\n .. '%s' returned:\n", f_name)) | |
1376 str(rslt) | |
1377 my_rslt$rslt <- rslt | |
1378 } | |
1379 } | |
1380 ) | |
1381 # return the result of invocation with the shown parameters | |
1382 # ref: https://www.r-bloggers.com/2013/08/a-new-r-trick-for-me-at-least/ | |
1383 if (head_patch) my_rslt$rslt <- do.call(f, va) | |
1384 (my_rslt$rslt) | |
1385 } | |
1386 } | |
1387 | |
1388 latex_collapsed_vector <- function( | |
1389 collapse_string, | |
1390 v, | |
1391 underscore_whack = TRUE, | |
1392 action = cat0 | |
1393 ) { | |
1394 v_sub <- | |
1395 if (underscore_whack) whack_underscores(v) else v | |
1396 action( | |
1397 paste0( | |
1398 v_sub, | |
1399 collapse = collapse_string | |
1400 ) | |
1401 ) | |
1402 } | |
1403 | |
1404 latex_itemized_collapsed <- function(collapse_string, v, underscore_whack = TRUE) { | |
1405 cat("\\begin{itemize}\n\\item ") | |
1406 latex_collapsed_vector(collapse_string, v, underscore_whack) | |
1407 cat("\n\\end{itemize}\n") | |
1408 } | |
1409 | |
1410 latex_itemized_list <- function(v, underscore_whack = TRUE) { | |
1411 latex_itemized_collapsed("\n\\item ", v, underscore_whack) | |
1412 } | |
1413 | |
1414 latex_enumerated_collapsed <- function(collapse_string, v, underscore_whack = TRUE) { | |
1415 cat("\\begin{enumerate}\n\\item ") | |
1416 latex_collapsed_vector(collapse_string, v, underscore_whack) | |
1417 cat("\n\\end{enumerate}\n") | |
1418 } | |
1419 | |
1420 latex_enumerated_list <- function(v) { | |
1421 latex_enumerated_collapsed("\n\\item ", v) | |
1422 } | |
1423 | |
1424 latex_table_row <- function(v, extra = "", underscore_whack = TRUE) { | |
1425 latex_collapsed_vector(" & ", v, underscore_whack) | |
1426 cat(extra) | |
1427 cat(" \\\\\n") | |
1428 } | |
1429 | |
1430 latex_tabbing_row <- function( | |
1431 v, | |
1432 extra = "", | |
1433 underscore_whack = TRUE, | |
1434 action = cat0 | |
1435 ) { | |
1436 # latex_collapsed_vector applies action to result of paste0; | |
1437 # by default, action = cat; | |
1438 # hence, a scalar string is assigned to v_collapsed | |
1439 v_collapsed <- | |
1440 latex_collapsed_vector( | |
1441 "} \\> \\tabfill{", | |
1442 v, | |
1443 underscore_whack, | |
1444 action = paste0 | |
1445 ) | |
1446 action( | |
1447 "\\tabfill{", | |
1448 v_collapsed, | |
1449 "}", | |
1450 extra, | |
1451 " \\\\\n" | |
1452 ) | |
1453 } | |
1454 | |
1455 # N.B. use con = "" to emulate regular cat | |
1456 fcat0 <- | |
1457 function(..., sprtr = " ", cnnctn = file()) { | |
1458 cat0(..., sep = sprtr, file = cnnctn) | |
1459 invisible(cnnctn) | |
367 } | 1460 } |
368 | 1461 |
369 hypersub <- | 1462 hypersub <- |
370 function(s) { | 1463 function(s) { |
371 hyper <- tolower(s) | 1464 hyper <- tolower(s) |
372 hyper <- gsub("[^a-z0-9]+", "-", hyper) | 1465 hyper <- gsub("[^a-z0-9]+", "-", hyper) |
373 hyper <- gsub("[-]+", "-", hyper) | 1466 hyper <- gsub("[-]+", "-", hyper) |
1467 hyper <- gsub("[_]+", "-", hyper) | |
374 hyper <- sub("^[-]", "", hyper) | 1468 hyper <- sub("^[-]", "", hyper) |
375 hyper <- sub("[-]$", "", hyper) | 1469 hyper <- sub("[-]$", "", hyper) |
376 return(hyper) | 1470 return(hyper) |
377 } | 1471 } |
378 | 1472 |
379 subsection_header <- | 1473 table_href <- function(s = "offset", caption = "") { |
380 function(s) { | 1474 paste0("\\hyperlink{table.\\arabic{", s, "}}{Table \\arabic{", s, "}}") |
1475 } | |
1476 | |
1477 table_offset <- function(i = 0, s = "offset", new = FALSE) { | |
1478 paste0( | |
1479 if (new) paste0("\\newcounter{", s, "}\n") else "", | |
1480 "\\setcounter{", s, "}{\\value{table}}\n", | |
1481 paste0(if (i > 0) rep(paste0("\\stepcounter{", s, "}"), i), "\n") | |
1482 ) | |
1483 } | |
1484 | |
1485 a_section_header <- | |
1486 function(s, prefix = "") { | |
381 hyper <- hypersub(s) | 1487 hyper <- hypersub(s) |
382 cat( | 1488 my_subsection_header <- sprintf( |
383 sprintf( | 1489 "\\hypertarget{%s}{\\%ssection{%s}\\label{%s}}\n", |
384 "\\hypertarget{%s}\n{\\subsection{%s}\\label{%s}}\n", | 1490 hyper, |
385 hyper, s, hyper | 1491 prefix, |
386 ) | 1492 gsub("_", "\\_", s, fixed = TRUE), |
387 ) | 1493 hyper |
388 } | 1494 ) |
389 | 1495 my_subsection_header |
390 subsubsection_header <- | 1496 } |
391 function(s) { | 1497 section_header <- function(s) a_section_header(s, "") |
392 hyper <- hypersub(s) | 1498 subsection_header <- function(s) a_section_header(s, "sub") |
393 cat( | 1499 subsubsection_header <- function(s) a_section_header(s, "subsub") |
394 sprintf( | |
395 "\\hypertarget{%s}\n{\\subsubsection{%s}\\label{%s}}\n", | |
396 hyper, s, hyper | |
397 ) | |
398 ) | |
399 } | |
400 | 1500 |
401 ### SQLite functions | 1501 ### SQLite functions |
402 | 1502 |
403 ddl_exec <- function(db, sql) { | 1503 ddl_exec <- function(db, sql) { |
404 discard <- DBI::dbExecute(conn = db, statement = sql) | 1504 discard <- DBI::dbExecute(conn = db, statement = sql) |
432 } | 1532 } |
433 } | 1533 } |
434 | 1534 |
435 ### KSEA functions and helpers | 1535 ### KSEA functions and helpers |
436 | 1536 |
437 # Adapted from KSEAapp::KSEA.Scores to allow retrieval of: | 1537 #' The KSEA App Analysis (KSEA Kinase Scores Only) |
438 # - maximum log2(FC) | 1538 #' |
1539 #' Compute KSEA kinase scores and statistics from phoshoproteomics data input | |
1540 #' Adapted from KSEAapp::KSEA.Scores to allow retrieval of maximum log2(FC) | |
1541 #' | |
1542 #' Result is an R data.frame with column names | |
1543 #' "Kinase.Gene", "mS", "Enrichment", "m", "z.score", "p.value", "FDR" | |
1544 #' "Please refer to the original Casado et al. publication for detailed | |
1545 #' description of these columns and what they represent: | |
1546 #' | |
1547 #' - Kinase.Gene indicates the gene name for each kinase. | |
1548 #' - mS represents the mean log2(fold change) of all the | |
1549 #' kinase's substrates. | |
1550 #' - Enrichment is the background-adjusted value of the kinase's mS. | |
1551 #' - m is the total number of detected substrates | |
1552 #' from the experimental dataset for each kinase. | |
1553 #' - z.score is the normalized score for each kinase, weighted by | |
1554 #' the number of identified substrates. | |
1555 #' - p.value represents the statistical assessment for the z.score. | |
1556 #' - FDR is the p-value adjusted for multiple hypothesis testing | |
1557 #' using the Benjamini & Hochberg method." | |
1558 #' | |
1559 #' @param ksdata the Kinase-Substrate dataset uploaded from the file | |
1560 #' prefaced with "PSP&NetworKIN_" | |
1561 #' available from github.com/casecpb/KSEA/ | |
1562 #' @param px the experimental data file formatted as described in | |
1563 #' the KSEA.Complete() documentation | |
1564 #' @param networkin a binary input of TRUE or FALSE, indicating whether | |
1565 #' or not to include NetworKIN predictions, where | |
1566 #' \code{NetworKIN = TRUE} | |
1567 #' means include NetworKIN predictions | |
1568 #' @param networkin_cutoff a numeric value between 1 and infinity setting | |
1569 #' the minimum NetworKIN score | |
1570 #' (this can be omitted if NetworKIN = FALSE) | |
1571 #' | |
1572 #' @return creates a new R data.frame with all the KSEA kinase | |
1573 #' scores, along with each one's statistical | |
1574 #' assessment, as described herein. | |
1575 #' | |
1576 #' @references | |
1577 #' | |
1578 #' Casado et al. (2013) Sci Signal. 6(268):rs6 | |
1579 #' | |
1580 #' Hornbeck et al. (2015) Nucleic Acids Res. 43:D512-20 | |
1581 #' | |
1582 #' Horn et al. (2014) Nature Methods 11(6):603-4 | |
1583 #' | |
439 ksea_scores <- function( | 1584 ksea_scores <- function( |
440 | |
441 # For human data, typically, ksdata = KSEAapp::ksdata | 1585 # For human data, typically, ksdata = KSEAapp::ksdata |
442 ksdata, | 1586 ksdata, |
443 | 1587 |
444 # Input data file having columns: | 1588 # Input data file having columns: |
445 # - Protein : abbreviated protein name | 1589 # - Protein : abbreviated protein name |
457 # NetworKIN predictions | 1601 # NetworKIN predictions |
458 networkin, | 1602 networkin, |
459 | 1603 |
460 # A numeric value between 1 and infinity setting the minimum NetworKIN | 1604 # A numeric value between 1 and infinity setting the minimum NetworKIN |
461 # score (can be left out if networkin = FALSE) | 1605 # score (can be left out if networkin = FALSE) |
462 networkin_cutoff | 1606 networkin_cutoff, |
1607 | |
1608 # Minimum substrate count, necessary to adjust the p-value appropriately. | |
1609 minimum_substrate_count | |
463 | 1610 |
464 ) { | 1611 ) { |
1612 # no px$FC should be <= 0, but abs(px$FC) is used below as a precaution. | |
465 if (length(grep(";", px$Residue.Both)) == 0) { | 1613 if (length(grep(";", px$Residue.Both)) == 0) { |
466 # There are no Residue.Both entries having semicolons, so new is | 1614 # There are no Residue.Both entries having semicolons, so new is |
467 # simply px except two columns are renamed and a column is added | 1615 # simply px except two columns are renamed and a column is added |
468 # for log2(abs(fold-change)) | 1616 # for log2(abs(fold-change)) |
469 new <- px | 1617 new <- px |
505 # Convert any illegal values from NaN to NA | 1653 # Convert any illegal values from NaN to NA |
506 new[is.nan(new$log2_fc), "log2_fc"] <- NA | 1654 new[is.nan(new$log2_fc), "log2_fc"] <- NA |
507 # Eliminate rows having missing values (e.g., non-imputed data) | 1655 # Eliminate rows having missing values (e.g., non-imputed data) |
508 new <- new[complete.cases(new$log2_fc), ] | 1656 new <- new[complete.cases(new$log2_fc), ] |
509 } | 1657 } |
510 if (networkin == TRUE) { | 1658 # At this point, new$log2_fc is signed according to which contrast has |
511 # When NetworKIN is true, filter on NetworKIN.cutoff which includes | 1659 # the greater intensity |
512 # PhosphoSitePlus data *because its networkin_score is set to Inf* | 1660 # To take the magnitude into account without taking the direction into |
513 ksdata_filtered <- ksdata[grep("[a-z]", ksdata$Source), ] | 1661 # account, set params$kseaUseAbsoluteLog2FC to TRUE |
514 ksdata_filtered <- ksdata_filtered[ | 1662 # |
515 (ksdata_filtered$networkin_score >= networkin_cutoff), ] | 1663 # Should KSEA be performed aggregating signed log2FC or absolute? |
516 } else { | 1664 # FALSE use raw log2FC for KSEA as for KSEAapp::KSEA.Scores |
517 # Otherwise, simply use PhosphSitePlus rows | 1665 if (params$kseaUseAbsoluteLog2FC) { |
518 ksdata_filtered <- ksdata[ | 1666 # TRUE use abs(log2FC) for KSEA as Justin requested; this is a |
519 grep("PhosphoSitePlus", ksdata$Source), ] | 1667 # justifiable deviation from the KSEAapp::KSEA.Scores algorithm. |
520 } | 1668 new$log2_fc <- abs(new$log2_fc) |
521 # Join the two data.frames on common columns SUB_GENE and SUB_MOD_RSD | 1669 } |
1670 | |
1671 monitor_filtration_on_stderr <- TRUE | |
1672 if (monitor_filtration_on_stderr) { | |
1673 # set to TRUE to monitor filtration on stderr | |
1674 sink(stderr()) | |
1675 cat(see_variable(networkin, "\n")) | |
1676 } | |
1677 ksdata_filtered <- | |
1678 sqldf( | |
1679 sprintf("%s %s", | |
1680 "select * from ksdata where not Source = 'NetworKIN'", | |
1681 if (networkin) | |
1682 sprintf("or networkin_score >= %d", networkin_cutoff) | |
1683 else | |
1684 "" | |
1685 ) | |
1686 ) | |
1687 if (monitor_filtration_on_stderr) { | |
1688 cat(see_variable(sqldf( | |
1689 "select count(*), Source from ksdata group by Source"), "\n")) | |
1690 cat(see_variable(sqldf( | |
1691 "select count(*), Source from ksdata_filtered group by Source"), "\n")) | |
1692 sink() | |
1693 } | |
1694 | |
1695 ############################################################################ | |
1696 # Line numbers below refer to lines of: | |
1697 # https://github.com/casecpb/KSEAapp/blob/master/R/KSEA.Scores.R | |
1698 # I would put the original line in a comment but then lint would complain... | |
1699 # - Indeed, I had to rename all the variables because lint didn't like names | |
1700 # containing periods or capital letters. | |
1701 # ACE | |
1702 ############################################################################ | |
1703 # | |
1704 # (1) Join the two data.frames on common columns SUB_GENE and SUB_MOD_RSD | |
522 # colnames of ksdata_filtered: | 1705 # colnames of ksdata_filtered: |
523 # "KINASE" "KIN_ACC_ID" "GENE" "KIN_ORGANISM" "SUBSTRATE" "SUB_GENE_ID" | 1706 # "KINASE" "KIN_ACC_ID" "GENE" "KIN_ORGANISM" "SUBSTRATE" "SUB_GENE_ID" |
524 # "SUB_ACC_ID" "SUB_GENE" "SUB_ORGANISM" "SUB_MOD_RSD" "SITE_GRP_ID" | 1707 # "SUB_ACC_ID" "SUB_GENE" "SUB_ORGANISM" "SUB_MOD_RSD" "SITE_GRP_ID" |
525 # "SITE_...7_AA" "networkin_score" "Source" | 1708 # "SITE_...7_AA" "networkin_score" "Source" |
526 # colnames of new: | 1709 # colnames of new: |
529 # SELECT a.*. b.Protein, b.Peptide, b.p, b.FC, b.log2_fc | 1712 # SELECT a.*. b.Protein, b.Peptide, b.p, b.FC, b.log2_fc |
530 # FROM ksdata_filtered a | 1713 # FROM ksdata_filtered a |
531 # INNER JOIN new b | 1714 # INNER JOIN new b |
532 # ON a.SUB_GENE = b.SUB_GENE | 1715 # ON a.SUB_GENE = b.SUB_GENE |
533 # AND a.SUB_MOD_RSD = b.SUB_MOD_RSD | 1716 # AND a.SUB_MOD_RSD = b.SUB_MOD_RSD |
1717 # (KSEA.Scores.R line # 105) | |
1718 # "Extract KSData.filtered annotations that are only found in new" | |
534 ksdata_dataset <- base::merge(ksdata_filtered, new) | 1719 ksdata_dataset <- base::merge(ksdata_filtered, new) |
535 # colnames of ksdata_dataset: | 1720 # colnames of ksdata_dataset: |
536 # "KINASE" "KIN_ACC_ID" "GENE" "KIN_ORGANISM" "SUBSTRATE" | 1721 # "KINASE" "KIN_ACC_ID" "GENE" "KIN_ORGANISM" "SUBSTRATE" |
537 # "SUB_GENE_ID" "SUB_ACC_ID" "SUB_GENE" "SUB_ORGANISM" "SUB_MOD_RSD" | 1722 # "SUB_GENE_ID" "SUB_ACC_ID" "SUB_GENE" "SUB_ORGANISM" "SUB_MOD_RSD" |
538 # "SITE_GRP_ID" "SITE_...7_AA" "networkin_score" "Source" "Protein" | 1723 # "SITE_GRP_ID" "SITE_...7_AA" "networkin_score" "Source" "Protein" |
539 # "Peptide" "p" "FC" "log2_fc" (uniprot_no_isoform) | 1724 # "Peptide" "p" "FC" "log2_fc" (uniprot_no_isoform) |
540 # Re-order dataset; prior to accounting for isoforms | 1725 # Re-order dataset; prior to accounting for isoforms |
1726 # (KSEA.Scores.R line # 106) | |
541 ksdata_dataset <- ksdata_dataset[order(ksdata_dataset$GENE), ] | 1727 ksdata_dataset <- ksdata_dataset[order(ksdata_dataset$GENE), ] |
542 # Extract non-isoform accession in UniProtKB | 1728 # Extract non-isoform accession in UniProtKB |
1729 # (KSEA.Scores.R line # 107) | |
543 ksdata_dataset$uniprot_no_isoform <- sapply( | 1730 ksdata_dataset$uniprot_no_isoform <- sapply( |
544 ksdata_dataset$KIN_ACC_ID, | 1731 ksdata_dataset$KIN_ACC_ID, |
545 function(x) unlist(strsplit(as.character(x), split = "-"))[1] | 1732 function(x) unlist(strsplit(as.character(x), split = "-"))[1] |
546 ) | 1733 ) |
1734 # "last expression collapses isoforms ... for easy processing" | |
547 # Discard previous results while selecting interesting columns ... | 1735 # Discard previous results while selecting interesting columns ... |
1736 # (KSEA.Scores.R line # 110) | |
548 ksdata_dataset_abbrev <- ksdata_dataset[, c(5, 1, 2, 16:19, 14)] | 1737 ksdata_dataset_abbrev <- ksdata_dataset[, c(5, 1, 2, 16:19, 14)] |
549 # Column names are now: | 1738 # Column names are now: |
550 # "GENE" "SUB_GENE" "SUB_MOD_RSD" "Peptide" "p" | 1739 # "GENE" "SUB_GENE" "SUB_MOD_RSD" "Peptide" "p" |
551 # "FC" "log2_fc" "Source" | 1740 # "FC" "log2_fc" "Source" |
552 # Make column names human-readable | 1741 # Make column names human-readable |
1742 # (KSEA.Scores.R line # 111) | |
553 colnames(ksdata_dataset_abbrev) <- c( | 1743 colnames(ksdata_dataset_abbrev) <- c( |
554 "Kinase.Gene", "Substrate.Gene", "Substrate.Mod", "Peptide", "p", | 1744 "Kinase.Gene", "Substrate.Gene", "Substrate.Mod", "Peptide", "p", |
555 "FC", "log2FC", "Source" | 1745 "FC", "log2FC", "Source" |
556 ) | 1746 ) |
557 # SELECT * FROM ksdata_dataset_abbrev | 1747 # SELECT * FROM ksdata_dataset_abbrev |
558 # ORDER BY Kinase.Gene, Substrate.Gene, Substrate.Mod, p | 1748 # ORDER BY Kinase.Gene, Substrate.Gene, Substrate.Mod, p |
1749 # (KSEA.Scores.R line # 112) | |
1750 # "Extract KSData.filtered annotations that are only found in new" | |
559 ksdata_dataset_abbrev <- | 1751 ksdata_dataset_abbrev <- |
560 ksdata_dataset_abbrev[ | 1752 ksdata_dataset_abbrev[ |
561 order( | 1753 order( |
562 ksdata_dataset_abbrev$Kinase.Gene, | 1754 ksdata_dataset_abbrev$Kinase.Gene, |
563 ksdata_dataset_abbrev$Substrate.Gene, | 1755 ksdata_dataset_abbrev$Substrate.Gene, |
564 ksdata_dataset_abbrev$Substrate.Mod, | 1756 ksdata_dataset_abbrev$Substrate.Mod, |
565 ksdata_dataset_abbrev$p), | 1757 ksdata_dataset_abbrev$p), |
566 ] | 1758 ] |
1759 if (print_nb_messages) nbe(see_variable(ksdata_dataset_abbrev)) | |
567 # First aggregation step to account for multiply phosphorylated peptides | 1760 # First aggregation step to account for multiply phosphorylated peptides |
568 # and differing peptide sequences; the goal here is to combine results | 1761 # and differing peptide sequences; the goal here is to combine results |
569 # for all measurements of the same substrate. | 1762 # for all measurements of the same substrate. |
570 # SELECT `Kinase.Gene`, `Substrate.Gene`, `Substrate.Mod`, | 1763 # SELECT `Kinase.Gene`, `Substrate.Gene`, `Substrate.Mod`, |
571 # `Source`, avg(log2FC) AS log2FC | 1764 # `Source`, avg(log2FC) AS log2FC |
573 # GROUP BY `Kinase.Gene`, `Substrate.Gene`, `Substrate.Mod`, | 1766 # GROUP BY `Kinase.Gene`, `Substrate.Gene`, `Substrate.Mod`, |
574 # `Source` | 1767 # `Source` |
575 # ORDER BY `Kinase.Gene`; | 1768 # ORDER BY `Kinase.Gene`; |
576 # in two steps: | 1769 # in two steps: |
577 # (1) compute average log_2(fold-change) | 1770 # (1) compute average log_2(fold-change) |
1771 # "take the mean of the log2FC amongst phosphosite duplicates" | |
1772 # (KSEA.Scores.R line # 115) | |
578 ksdata_dataset_abbrev <- aggregate( | 1773 ksdata_dataset_abbrev <- aggregate( |
579 log2FC ~ Kinase.Gene + Substrate.Gene + Substrate.Mod + Source, | 1774 log2FC ~ Kinase.Gene + Substrate.Gene + Substrate.Mod + Source, |
580 data = ksdata_dataset_abbrev, | 1775 data = ksdata_dataset_abbrev, |
581 FUN = mean | 1776 FUN = mean |
582 ) | 1777 ) |
1778 if (print_nb_messages) nbe(see_variable(ksdata_dataset_abbrev)) | |
583 # (2) order by Kinase.Gene | 1779 # (2) order by Kinase.Gene |
1780 # (KSEA.Scores.R line # 117) | |
584 ksdata_dataset_abbrev <- | 1781 ksdata_dataset_abbrev <- |
585 ksdata_dataset_abbrev[order(ksdata_dataset_abbrev$Kinase.Gene), ] | 1782 ksdata_dataset_abbrev[order(ksdata_dataset_abbrev$Kinase.Gene), ] |
586 # SELECT `Kinase.Gene`, count(*) | 1783 # SELECT `Kinase.Gene`, count(*) |
587 # FROM ksdata_dataset_abbrev | 1784 # FROM ksdata_dataset_abbrev |
588 # GROUP BY `Kinase.Gene`; | 1785 # GROUP BY `Kinase.Gene`; |
589 # in two steps: | 1786 # in two steps: |
590 # (1) Extract the list of Kinase.Gene names | 1787 # (1) Extract the list of Kinase.Gene names |
1788 # "@@@@@@@@@@@@@@@@@@@@" | |
1789 # "Do analysis for KSEA" | |
1790 # "@@@@@@@@@@@@@@@@@@@@" | |
1791 # (KSEA.Scores.R line # 124) | |
591 kinase_list <- as.vector(ksdata_dataset_abbrev$Kinase.Gene) | 1792 kinase_list <- as.vector(ksdata_dataset_abbrev$Kinase.Gene) |
592 # (2) Convert to a named list of counts of kinases in ksdata_dataset_abrev, | 1793 # (2) Convert to a named list of counts of kinases in ksdata_dataset_abrev, |
593 # named by Kinase.Gene | 1794 # named by Kinase.Gene |
1795 # (KSEA.Scores.R line # 125) | |
594 kinase_list <- as.matrix(table(kinase_list)) | 1796 kinase_list <- as.matrix(table(kinase_list)) |
595 # Second aggregation step to account for all substrates per kinase | 1797 # Second aggregation step to account for all substrates per kinase |
596 # CREATE TABLE mean_fc | 1798 # CREATE TABLE mean_fc |
597 # AS | 1799 # AS |
598 # SELECT `Kinase.Gene`, avg(log2FC) AS log2FC | 1800 # SELECT `Kinase.Gene`, avg(log2FC) AS log2FC |
599 # FROM ksdata_dataset_abbrev | 1801 # FROM ksdata_dataset_abbrev |
600 # GROUP BY `Kinase.Gene` | 1802 # GROUP BY `Kinase.Gene` |
601 mean_fc <- aggregate( | 1803 # (KSEA.Scores.R line # 127) |
602 log2FC ~ Kinase.Gene, | 1804 if (print_nb_messages) nb(see_variable(ksdata_dataset_abbrev), "\n") |
603 data = ksdata_dataset_abbrev, | 1805 mean_fc <- |
604 FUN = mean | 1806 aggregate( |
605 ) | |
606 # mean_fc columns: "Kinase.Gene", "log2FC" | |
607 if (FALSE) { | |
608 # I need to re-think this; I was trying to find the most-represented | |
609 # peptide, but that horse has already left the barn | |
610 # SELECT `Kinase.Gene`, max(abs(log2FC)) AS log2FC | |
611 # FROM ksdata_dataset_abbrev | |
612 # GROUP BY `Kinase.Gene` | |
613 max_fc <- aggregate( | |
614 log2FC ~ Kinase.Gene, | 1807 log2FC ~ Kinase.Gene, |
615 data = ksdata_dataset_abbrev, | 1808 data = ksdata_dataset_abbrev, |
616 FUN = function(r) max(abs(r)) | 1809 FUN = mean |
617 ) | 1810 ) |
618 } | 1811 if (print_nb_messages) nbe(see_variable(mean_fc), "\n") |
1812 | |
1813 # for contrast j | |
1814 # for each kinase i | |
1815 # extract log2 of fold-change (from `new` above) | |
1816 # (used in KSEA.Scores.R lines # 130 & 132) | |
1817 log2_fc_j_each_i <- | |
1818 new$log2_fc | |
1819 | |
1820 # for contrast j | |
1821 # for all kinases i | |
1822 # compute mean of abs(log2 of fold-change) | |
1823 # (used in KSEA.Scores.R lines # 130) | |
1824 mean_abs_log2_fc_j_all_i <- | |
1825 mean(abs(log2_fc_j_each_i), na.rm = TRUE) | |
1826 | |
1827 # for contrast j | |
1828 # for all kinases i | |
1829 # compute mean of log2 of fold-change | |
1830 # (used in KSEA.Scores.R lines # 132) | |
1831 mean_log2_fc_j_all_i <- | |
1832 mean(log2_fc_j_each_i, na.rm = TRUE) | |
1833 | |
1834 # Reorder mean_fc, although I don't see why | |
1835 # (KSEA.Scores.R line 128 | |
1836 mean_fc <- mean_fc[order(mean_fc[, 1]), ] | |
1837 # mean_fc columns so far: "Kinase.Gene", "log2FC" | |
1838 # - Kinase.Gene | |
1839 # indicates the gene name for each kinase. | |
619 | 1840 |
620 # Create column 3: mS | 1841 # Create column 3: mS |
621 mean_fc$m_s <- mean_fc[, 2] | 1842 # - mS |
1843 # represents the mean log2(fold change) of all the | |
1844 # kinase's substrates. | |
1845 # (KSEA.Scores.R line # 129) | |
1846 mean_fc$m_s <- | |
1847 mean_fc_m_s <- mean_fc[, 2] | |
1848 | |
622 # Create column 4: Enrichment | 1849 # Create column 4: Enrichment |
623 mean_fc$enrichment <- mean_fc$m_s / abs(mean(new$log2_fc, na.rm = TRUE)) | 1850 # - Enrichment |
624 # Create column 5: m, count of substrates | 1851 # is the background-adjusted value of the kinase's mS. |
625 mean_fc$m <- kinase_list | 1852 # (KSEA.Scores.R line # 130) |
1853 mean_fc$enrichment <- | |
1854 mean_fc_m_s / mean_abs_log2_fc_j_all_i | |
1855 | |
1856 # Create column 5: m, count of substrates of kinase (count of j for i) | |
1857 # - m | |
1858 # is the total number of detected substrates | |
1859 # from the experimental dataset for each kinase. | |
1860 # (KSEA.Scores.R line # 131) | |
1861 mean_fc$m <- | |
1862 mean_fc_m <- kinase_list | |
1863 | |
1864 | |
626 # Create column 6: z-score | 1865 # Create column 6: z-score |
627 mean_fc$z_score <- ( | 1866 # - z.score |
628 (mean_fc$m_s - mean(new$log2_fc, na.rm = TRUE)) * | 1867 # is the normalized score for each kinase, weighted by |
629 sqrt(mean_fc$m)) / sd(new$log2_fc, na.rm = TRUE) | 1868 # the number of identified substrates. |
1869 # (KSEA.Scores.R line # 132) | |
1870 mean_fc$z_score <- | |
1871 (mean_fc_m_s - mean_log2_fc_j_all_i) * sqrt(mean_fc_m) / | |
1872 sd(log2_fc_j_each_i, na.rm = TRUE) | |
1873 | |
630 # Create column 7: p-value, deduced from z-score | 1874 # Create column 7: p-value, deduced from z-score |
631 mean_fc$p_value <- pnorm(-abs(mean_fc$z_score)) | 1875 # - p.value |
1876 # represents the statistical assessment for the z.score. | |
1877 # (KSEA.Scores.R line # 133) | |
1878 # "one-tailed p-value" | |
1879 mean_fc$p_value <- | |
1880 pnorm(-abs(mean_fc$z_score)) | |
1881 | |
1882 # zap excluded kinases; this must be done before adjusting p-value | |
1883 if (TRUE) { | |
1884 mean_fc <- | |
1885 mean_fc[ | |
1886 mean_fc$m >= minimum_substrate_count, | |
1887 , | |
1888 drop = FALSE | |
1889 ] | |
1890 } | |
1891 | |
1892 #ACE nb(see_variable(nrow(mean_fc)), "\n") | |
632 # Create column 8: FDR, deduced by Benjamini-Hochberg adustment from p-value | 1893 # Create column 8: FDR, deduced by Benjamini-Hochberg adustment from p-value |
633 mean_fc$fdr <- p.adjust(mean_fc$p_value, method = "fdr") | 1894 # - FDR |
634 | 1895 # is the p-value adjusted for multiple hypothesis testing |
635 # Remove log2FC column, which is duplicated as mS | 1896 # using the Benjamini & Hochberg method." |
636 mean_fc <- mean_fc[order(mean_fc$Kinase.Gene), -2] | 1897 # (KSEA.Scores.R line # 134) |
1898 mean_fc$fdr <- | |
1899 p.adjust(mean_fc$p_value, method = "fdr") | |
1900 | |
1901 # It makes no sense to leave Z-scores negative when using | |
1902 # absolute value of fold-change | |
1903 if (params$kseaUseAbsoluteLog2FC) { | |
1904 mean_fc$z_score <- abs(mean_fc$z_score) | |
1905 } | |
1906 | |
1907 # Remove second column (log2FC), which is duplicated as mS | |
1908 # (KSEA.Scores.R line # 136) | |
1909 mean_fc <- | |
1910 mean_fc[order(mean_fc$Kinase.Gene), -2] | |
637 # Correct the column names which we had to hack because of the linter... | 1911 # Correct the column names which we had to hack because of the linter... |
638 colnames(mean_fc) <- c( | 1912 colnames(mean_fc) <- c( |
639 "Kinase.Gene", "mS", "Enrichment", "m", "z.score", "p.value", "FDR" | 1913 "Kinase.Gene", "mS", "Enrichment", "m", "z.score", "p.value", "FDR" |
640 ) | 1914 ) |
1915 # (KSEA.Scores.R line # 138) | |
641 return(mean_fc) | 1916 return(mean_fc) |
642 } | 1917 } |
643 | 1918 |
644 low_fdr_barplot <- function( | 1919 ksea_low_fdr_barplot_factory <- function( |
645 rslt, | 1920 rslt, |
646 i_cntrst, | 1921 i_cntrst, |
647 i, | 1922 i, |
648 a_level, | 1923 a_level, |
649 b_level, | 1924 b_level, |
671 k$fdr | 1946 k$fdr |
672 }, | 1947 }, |
673 "p.value" = { | 1948 "p.value" = { |
674 k$p_value | 1949 k$p_value |
675 }, | 1950 }, |
676 stop( | 1951 { |
677 sprintf( | 1952 cat( |
678 "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'", | 1953 sprintf( |
679 ksea_cutoff_statistic | 1954 "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'", |
1955 ksea_cutoff_statistic | |
1956 ) | |
680 ) | 1957 ) |
681 ) | 1958 param_df_exit() |
682 ) | 1959 knitr::knit_exit() |
1960 } | |
1961 ) | |
683 | 1962 |
684 k <- k[selector < ksea_cutoff_threshold, ] | 1963 k <- k[selector < ksea_cutoff_threshold, ] |
685 | 1964 nrow_k <- nrow(k) |
686 if (nrow(k) > 0) { | 1965 |
687 op <- par(mai = c(1, 1.5, 0.4, 0.4)) | 1966 #ACE nbe(see_variable(fdr_barplot_dataframe <- k)) |
1967 | |
1968 if (nrow_k > 0) { | |
1969 max_nchar_rowname <- max(nchar(rownames(k))) | |
1970 my_cex_names <- 1.0 / (1 + nrow_k / 50) | |
1971 | |
1972 if (print_trace_messages) cat_margins("Initially") | |
1973 if (print_trace_messages) cat_variable(nrow_k, "\n\n", 0) | |
1974 if (print_trace_messages) cat_variable(my_cex_names, "\n\n", 0) | |
1975 if (print_trace_messages) cat_variable(max_nchar_rowname, "\n\n", 0) | |
1976 | |
1977 # fin: The figure region dimensions, (width, height), in inches. | |
1978 # mar: A numerical vector of the form c(bottom, left, top, right) | |
1979 # that gives the number of lines of margin to be specified | |
1980 # on the four sides of the plot; default: c(5, 4, 4, 2) + 0.1 | |
1981 | |
1982 # mar: The figure region dimensions, (width, height), in inches. | |
688 numeric_z_score <- as.numeric(k$z_score) | 1983 numeric_z_score <- as.numeric(k$z_score) |
689 z_score_order <- order(numeric_z_score) | 1984 bar_order <- order(-as.numeric(k$p_value)) |
690 kinase_name <- k$kinase_gene | 1985 kinase_name <- k$kinase_gene |
691 long_caption <- | 1986 long_caption <- |
692 sprintf( | 1987 sprintf( |
693 "Kinase z-score, %s < %s, %s", | 1988 "Kinase z-score, %s, KSEA %s < %s", |
1989 caption, | |
694 ksea_cutoff_statistic, | 1990 ksea_cutoff_statistic, |
695 ksea_cutoff_threshold, | 1991 ksea_cutoff_threshold |
696 caption | |
697 ) | 1992 ) |
698 my_cex_caption <- 65.0 / max(65.0, nchar(long_caption)) | 1993 my_cex_caption <- 65.0 / max(65.0, nchar(long_caption)) |
699 cat("\n\\clearpage\n") | 1994 # return a function that draws the plot |
700 barplot( | 1995 function() { |
701 height = numeric_z_score[z_score_order], | 1996 par_fin <- par("fin") # vector of width_in_inches and height_in_inches) |
702 border = NA, | 1997 op <- par( |
703 xpd = FALSE, | 1998 bg = if (print_trace_messages) "yellow" else "white", |
704 cex.names = 1.0, | 1999 fin = c(par_fin[1], min(par_fin[2], 2.5 + nrow_k / 6)), |
705 main = long_caption, | 2000 mar = par("mar") + |
706 cex.main = my_cex_caption, | 2001 c(3 / nrow_k, (1 + max_nchar_rowname * my_cex_names) / 2, 0, 0) |
707 names.arg = kinase_name[z_score_order], | 2002 # bottom, left, top, right |
708 horiz = TRUE, | |
709 srt = 45, | |
710 las = 1, | |
711 cex.axis = 0.9 | |
712 ) | 2003 ) |
713 par(op) | 2004 on.exit(par(op)) |
2005 if (print_trace_messages) cat_margins("Eventually") | |
2006 | |
2007 barplot( | |
2008 height = numeric_z_score[bar_order], | |
2009 border = NA, | |
2010 xpd = FALSE, | |
2011 cex.names = my_cex_names, | |
2012 main = long_caption, | |
2013 cex.main = my_cex_caption, | |
2014 names.arg = kinase_name[bar_order], | |
2015 horiz = TRUE, | |
2016 srt = 45, | |
2017 las = 1, | |
2018 cex.axis = 0.9 | |
2019 ) | |
2020 } | |
714 } | 2021 } |
2022 } else { | |
2023 no_op | |
715 } | 2024 } |
716 } | 2025 } |
717 | 2026 |
718 # note that this adds elements to the global variable `ksea_asterisk_hash` | 2027 # note that this adds elements to the global variable `ksea_asterisk_hash` |
719 | 2028 |
720 low_fdr_print <- function( | 2029 ksea_low_fdr_print <- function( |
721 rslt, | 2030 rslt, |
722 i_cntrst, | 2031 i_cntrst, |
723 i, | 2032 i, |
724 a_level, | 2033 a_level, |
725 b_level, | 2034 b_level, |
726 fold_change, | 2035 fold_change, |
727 caption | 2036 caption, |
2037 write_db = TRUE, # if TRUE, write to DB, else print table | |
2038 anchor = c(const_table_anchor_p, const_table_anchor_t) | |
728 ) { | 2039 ) { |
729 rslt_score_list_i <- rslt$score_list[[i]] | 2040 rslt_score_list_i <- rslt$score_list[[i]] |
730 if (!is.null(rslt_score_list_i)) { | 2041 if (!is.null(rslt_score_list_i)) { |
731 rslt_score_list_i_nrow <- nrow(rslt_score_list_i) | 2042 rslt_score_list_i_nrow <- nrow(rslt_score_list_i) |
732 k <- contrast_ksea_scores <- data.frame( | 2043 k <- contrast_ksea_scores <- data.frame( |
748 k$fdr | 2059 k$fdr |
749 }, | 2060 }, |
750 "p.value" = { | 2061 "p.value" = { |
751 k$p_value | 2062 k$p_value |
752 }, | 2063 }, |
753 stop( | 2064 { |
754 sprintf( | 2065 cat( |
755 "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'", | 2066 sprintf( |
756 ksea_cutoff_statistic | 2067 "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'", |
2068 ksea_cutoff_statistic | |
2069 ) | |
757 ) | 2070 ) |
758 ) | 2071 param_df_exit() |
759 ) | 2072 knitr::knit_exit() |
2073 } | |
2074 ) | |
760 | 2075 |
761 k <- k[selector < ksea_cutoff_threshold, ] | 2076 k <- k[selector < ksea_cutoff_threshold, ] |
762 # save kinase names to ksea_asterisk_hash | 2077 # save kinase names to ksea_asterisk_hash |
763 for (kinase_name in k$kinase_gene) { | 2078 for (kinase_name in k$kinase_gene) { |
764 ksea_asterisk_hash[[kinase_name]] <- 1 | 2079 ksea_asterisk_hash[[kinase_name]] <- 1 |
765 } | 2080 } |
766 | 2081 |
767 db_write_table_overwrite <- (i_cntrst < 2) | 2082 if (write_db) { |
768 db_write_table_append <- !db_write_table_overwrite | 2083 db_write_table_overwrite <- (i_cntrst < 2) |
769 RSQLite::dbWriteTable( | 2084 db_write_table_append <- !db_write_table_overwrite |
770 conn = db, | 2085 RSQLite::dbWriteTable( |
771 name = "contrast_ksea_scores", | 2086 conn = db, |
772 value = contrast_ksea_scores, | 2087 name = "contrast_ksea_scores", |
773 append = db_write_table_append | 2088 value = contrast_ksea_scores, |
774 ) | 2089 append = db_write_table_append |
775 selector <- switch( | 2090 ) |
776 ksea_cutoff_statistic, | 2091 "" |
777 "FDR" = { | 2092 } else { |
778 contrast_ksea_scores$fdr | 2093 selector <- switch( |
779 }, | 2094 ksea_cutoff_statistic, |
780 "p.value" = { | 2095 "FDR" = { |
781 contrast_ksea_scores$p_value | 2096 contrast_ksea_scores$fdr |
782 }, | 2097 }, |
783 stop( | 2098 "p.value" = { |
784 sprintf( | 2099 contrast_ksea_scores$p_value |
785 "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'", | 2100 }, |
786 ksea_cutoff_statistic | 2101 { |
2102 cat( | |
2103 sprintf( | |
2104 "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'", | |
2105 ksea_cutoff_statistic | |
2106 ) | |
2107 ) | |
2108 param_df_exit() | |
2109 knitr::knit_exit() | |
2110 } | |
2111 ) | |
2112 if (print_nb_messages) nbe(see_variable(contrast_ksea_scores)) | |
2113 output_df <- contrast_ksea_scores[ | |
2114 selector < ksea_cutoff_threshold, | |
2115 c("kinase_gene", "mean_log2_fc", "enrichment", "substrate_count", | |
2116 "z_score", "p_value", "fdr") | |
2117 ] | |
2118 output_df$kinase_gene <- | |
2119 gsub( | |
2120 "_", | |
2121 "\\\\_", | |
2122 output_df$kinase_gene | |
2123 ) | |
2124 colnames(output_df) <- | |
2125 c( | |
2126 colnames(output_df)[1], | |
2127 colnames(output_df)[2], | |
2128 "enrichment", | |
2129 "m_s", | |
2130 "z_score", | |
2131 "p_value", | |
2132 "fdr" | |
2133 ) | |
2134 #ACE output_order <- with(output_df, order(fdr)) | |
2135 output_order <- with(output_df, order(p_value)) | |
2136 output_df <- output_df[output_order, ] | |
2137 | |
2138 output_df[, 2] <- sprintf("%0.3g", output_df[, 2]) | |
2139 output_df$fdr <- sprintf("%0.4f", output_df$fdr) | |
2140 output_df$p_value <- sprintf("%0.2e", output_df$p_value) | |
2141 output_df$z_score <- sprintf("%0.2f", output_df$z_score) | |
2142 output_df$m_s <- sprintf("%d", output_df$m_s) | |
2143 output_df$enrichment <- sprintf("%0.3g", output_df$enrichment) | |
2144 output_ncol <- ncol(output_df) | |
2145 colnames(output_df) <- | |
2146 c( | |
2147 "Kinase", | |
2148 "\\(\\overline{{\\lvert}\\log_2 (\\text{fold-change}){\\rvert}}\\)", | |
2149 "Enrichment", | |
2150 "Substrates", | |
2151 "z-score", | |
2152 "p-value", | |
2153 "FDR" | |
2154 ) | |
2155 selector <- switch( | |
2156 ksea_cutoff_statistic, | |
2157 "FDR" = { | |
2158 rslt$score_list[[i]]$FDR | |
2159 }, | |
2160 "p.value" = { | |
2161 rslt$score_list[[i]]$p.value | |
2162 }, | |
2163 { | |
2164 cat( | |
2165 sprintf( | |
2166 "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'", | |
2167 ksea_cutoff_statistic | |
2168 ) | |
2169 ) | |
2170 param_df_exit() | |
2171 knitr::knit_exit() | |
2172 } | |
2173 ) | |
2174 if (sum(selector < ksea_cutoff_threshold) > 0) { | |
2175 if (print_nb_messages) nbe(see_variable(output_df)) | |
2176 math_caption <- gsub("{", "\\{", caption, fixed = TRUE) | |
2177 math_caption <- gsub("}", "\\}", math_caption, fixed = TRUE) | |
2178 # with ( | |
2179 # output_df, | |
2180 # ) | |
2181 if (TRUE) { | |
2182 output_df$Kinase <- whack_underscores(output_df$Kinase) | |
2183 data_frame_tabbing_latex( | |
2184 x = output_df, | |
2185 # vector of tab stops, in inches | |
2186 tabstops = c(1.0, 1.2, 1.0, 1.0, 1.0, 1.0), | |
2187 # vector of headings, registered with tab-stops | |
2188 headings = colnames(output_df), | |
2189 # digits to pass to format.data.frame | |
2190 digits = NULL, | |
2191 # maximumn number of rows to print | |
2192 max = NULL, | |
2193 # optional caption | |
2194 caption = sprintf( | |
2195 "\\text{%s}, KSEA %s < %s", | |
2196 math_caption, | |
2197 ksea_cutoff_statistic, | |
2198 ksea_cutoff_threshold | |
2199 ), | |
2200 # set underscore_whack to TRUE to escape underscores | |
2201 underscore_whack = FALSE, | |
2202 # flag for landscape mode | |
2203 landscape = FALSE, | |
2204 # flag indicating that subsubsection should be used for caption | |
2205 # rather than subsection | |
2206 use_subsubsection_header = TRUE, | |
2207 # character-size indicator; for possible values, see: | |
2208 # https://tug.org/texinfohtml/latex2e.html#Font-sizes | |
2209 charactersize = "small", | |
2210 # set verbatim to TRUE to debug output | |
2211 verbatim = FALSE | |
2212 ) | |
2213 } else { | |
2214 data_frame_table_latex( | |
2215 x = output_df, | |
2216 justification = "l c c c c c c", | |
2217 centered = TRUE, | |
2218 caption = sprintf( | |
2219 "\\text{%s}, KSEA %s < %s", | |
2220 math_caption, | |
2221 ksea_cutoff_statistic, | |
2222 ksea_cutoff_threshold | |
2223 ), | |
2224 anchor = anchor, | |
2225 underscore_whack = FALSE | |
2226 ) | |
2227 } | |
2228 } else { | |
2229 cat( | |
2230 sprintf( | |
2231 "\\break | |
2232 No kinases had | |
2233 \\(\\text{KSEA %s}_\\text{enrichment} < %s\\) | |
2234 for contrast %s\\hfill\\break\n", | |
2235 ksea_cutoff_statistic, | |
2236 ksea_cutoff_threshold, | |
2237 caption | |
787 ) | 2238 ) |
788 ) | 2239 ) |
789 ) | 2240 } |
790 output_df <- contrast_ksea_scores[ | |
791 selector < ksea_cutoff_threshold, | |
792 c("kinase_gene", "mean_log2_fc", "enrichment", "substrate_count", | |
793 "z_score", "p_value", "fdr") | |
794 ] | |
795 output_order <- with(output_df, order(mean_log2_fc, kinase_gene)) | |
796 output_df <- output_df[output_order, ] | |
797 colnames(output_df) <- | |
798 c( | |
799 colnames(output_df)[1], | |
800 colnames(output_df)[2], | |
801 "enrichment", | |
802 "m_s", | |
803 "z_score", | |
804 "p_value", | |
805 "fdr" | |
806 ) | |
807 output_df$fdr <- sprintf("%0.4f", output_df$fdr) | |
808 output_df$p_value <- sprintf("%0.2e", output_df$p_value) | |
809 output_df$z_score <- sprintf("%0.2f", output_df$z_score) | |
810 output_df$m_s <- sprintf("%d", output_df$m_s) | |
811 output_df$enrichment <- sprintf("%0.2f", output_df$enrichment) | |
812 output_ncol <- ncol(output_df) | |
813 colnames(output_df) <- | |
814 c( | |
815 "Kinase", | |
816 "\\(\\overline{\\log_2 (|\\text{fold-change}|)}\\)", | |
817 "Enrichment", | |
818 "Substrates", | |
819 "z-score", | |
820 "p-value", | |
821 "FDR" | |
822 ) | |
823 selector <- switch( | |
824 ksea_cutoff_statistic, | |
825 "FDR" = { | |
826 rslt$score_list[[i]]$FDR | |
827 }, | |
828 "p.value" = { | |
829 rslt$score_list[[i]]$p.value | |
830 }, | |
831 stop( | |
832 sprintf( | |
833 "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'", | |
834 ksea_cutoff_statistic | |
835 ) | |
836 ) | |
837 ) | |
838 if (sum(selector < ksea_cutoff_threshold) > 0) { | |
839 math_caption <- gsub("{", "\\{", caption, fixed = TRUE) | |
840 math_caption <- gsub("}", "\\}", math_caption, fixed = TRUE) | |
841 data_frame_latex( | |
842 x = output_df, | |
843 justification = "l c c c c c c", | |
844 centered = TRUE, | |
845 caption = sprintf( | |
846 "\\text{%s}, %s < %s", | |
847 math_caption, | |
848 ksea_cutoff_statistic, | |
849 ksea_cutoff_threshold | |
850 ), | |
851 anchor = const_table_anchor_p | |
852 ) | |
853 } else { | |
854 cat( | |
855 sprintf( | |
856 "\\break | |
857 No kinases had | |
858 \\(\\text{%s}_\\text{enrichment} < %s\\) | |
859 for contrast %s\\hfill\\break\n", | |
860 ksea_cutoff_statistic, | |
861 ksea_cutoff_threshold, | |
862 caption | |
863 ) | |
864 ) | |
865 } | 2241 } |
2242 } else { | |
2243 "" | |
866 } | 2244 } |
867 } | 2245 } |
868 | 2246 |
869 # create_breaks is a helper for ksea_heatmap | 2247 # create_breaks is a helper for ksea_heatmap |
870 create_breaks <- function(merged_scores) { | 2248 create_breaks <- function(merged_scores) { |
905 mycol <- unique(append(mycol_neg, mycol_pos)) | 2283 mycol <- unique(append(mycol_neg, mycol_pos)) |
906 color_breaks <- list(breaks_all, mycol) | 2284 color_breaks <- list(breaks_all, mycol) |
907 return(color_breaks) | 2285 return(color_breaks) |
908 } | 2286 } |
909 | 2287 |
2288 hm2plus <- function( | |
2289 x, | |
2290 mat = matrix( | |
2291 c( | |
2292 c(0, 4, 0), | |
2293 c(0, 3, 3), | |
2294 c(2, 1, 1) | |
2295 ), | |
2296 nrow = 3, | |
2297 ncol = 3, | |
2298 byrow = TRUE | |
2299 ), | |
2300 denwid = 0.5, | |
2301 denhgt = 0.15, | |
2302 widths = c(0.5, 2.5, 1.5), | |
2303 heights = c(0.4, 0.15, 3.95), | |
2304 divergent = FALSE, | |
2305 notecol = "grey50", | |
2306 trace = "none", | |
2307 margins = c(6, 20), | |
2308 srtcol = 90, | |
2309 srtrow = 0, | |
2310 density_info = "none", | |
2311 key_xlab = latex2exp::TeX("$log_{10}$(peptide intensity)"), | |
2312 key_par = list(), | |
2313 hclustfun = hclust, | |
2314 ... | |
2315 ) { | |
2316 | |
2317 varargs <- list(...) | |
2318 if (FALSE) # this is to avoid commenting out code to pass linting... | |
2319 my_hm2 <- latex_show_invocation(heatmap.2, head_patch = FALSE) | |
2320 else | |
2321 my_hm2 <- heatmap.2 | |
2322 | |
2323 x <- as.matrix(x) | |
2324 if (sum(!is.na(x)) < 1) | |
2325 return(NULL) | |
2326 color_count <- 1 + max(64, length(as.vector(x))) # 8 was not enough | |
2327 break_count <- 1 + color_count | |
2328 min_nonax <- min(x, na.rm = TRUE) | |
2329 max_nonax <- max(x, na.rm = TRUE) | |
2330 if (print_nb_messages) nb("within hm2plus", see_variable(divergent), "\n") | |
2331 if (divergent) { | |
2332 zlim <- max(abs(min_nonax), abs(max_nonax)) | |
2333 if (print_nb_messages) nb(see_variable(pre_zlim <- zlim, "\n")) | |
2334 breaks <- (zlim) / (break_count:1) | |
2335 if (print_nb_messages) nb(see_variable(breaks, "\n")) | |
2336 breaks <- breaks - median(breaks) | |
2337 zlim <- c(-zlim, zlim) | |
2338 if (print_nb_messages) nb(see_variable(zlim, "\n")) | |
2339 } else { | |
2340 zlim <- max(abs(min_nonax), abs(max_nonax)) | |
2341 if (print_nb_messages) nb(see_variable(pre_zlim <- zlim, "\n")) | |
2342 breaks <- zlim / (break_count:1) | |
2343 if (print_nb_messages) nb(see_variable(breaks, "\n")) | |
2344 if (max_nonax < 0) { | |
2345 breaks <- breaks - zlim | |
2346 zlim <- c(-zlim, 0) | |
2347 } else { | |
2348 zlim <- c(0, zlim) | |
2349 } | |
2350 if (print_nb_messages) nb(see_variable(zlim, "\n")) | |
2351 } | |
2352 nonax <- x | |
2353 nonax[is.na(x)] <- min_nonax | |
2354 if (is.null(widths)) widths <- c(denwid, 4 - denwid, 1.5) | |
2355 if (is.null(heights)) heights <- c(0.4, denhgt, 4.0) | |
2356 colors <- | |
2357 if (divergent && min_nonax < 0) { | |
2358 # divergent colors on both sides of zero | |
2359 colorRampPalette(c("red", "white", "blue"))(color_count) | |
2360 } else if (divergent && min_nonax > 0) { | |
2361 # "divergent" colors > zero | |
2362 colorRampPalette(c("white", "blue"))(color_count) | |
2363 } else if (divergent && max_nonax < 0) { | |
2364 # "divergent" colors < zero | |
2365 colorRampPalette(c("red", "white"))(color_count) | |
2366 } else { | |
2367 # "non-divergent" colors including zero | |
2368 hcl.colors(color_count, "YlOrRd", rev = TRUE) | |
2369 } | |
2370 | |
2371 #ACE if (print_nb_messages) nb("within hm2plus", see_variable(key_par), "\n") | |
2372 #ACE if (print_nb_messages) nb(see_variable(colors, "\n")) | |
2373 #ACE key_par$col = colors | |
2374 #ACE key_par$breaks = breaks | |
2375 | |
2376 if (print_nb_messages) nb(see_variable(par(), "\n")) #ACE TODO remove me | |
2377 if (print_nb_messages) cat("\\leavevmode\n\\linebreak\n") #ACE TODO remove me | |
2378 suppressWarnings( | |
2379 my_hm2( | |
2380 x = x, | |
2381 col = colors, | |
2382 #ACE symkey = FALSE, | |
2383 density.info = density_info, | |
2384 srtCol = srtcol, | |
2385 srtRow = srtrow, | |
2386 margins = margins, | |
2387 lwid = widths, | |
2388 lhei = heights, | |
2389 key.title = NA, | |
2390 key.xlab = key_xlab, | |
2391 key.par = key_par, | |
2392 lmat = mat, | |
2393 notecol = notecol, | |
2394 trace = trace, | |
2395 bg = "yellow", | |
2396 hclustfun = hclustfun, | |
2397 #ACE breaks = breaks, | |
2398 oldstyle = FALSE, | |
2399 ... # varargs | |
2400 ) | |
2401 ) | |
2402 # implicitly returning value returned by heatmap.2 | |
2403 } | |
2404 | |
910 # draw_kseaapp_summary_heatmap is a helper function for ksea_heatmap | 2405 # draw_kseaapp_summary_heatmap is a helper function for ksea_heatmap |
911 draw_kseaapp_summary_heatmap <- function( | 2406 draw_kseaapp_summary_heatmap <- function( |
912 x, | 2407 x, # matrix with row/col names already formatted |
913 sample_cluster, | 2408 sample_cluster, # a binary input of TRUE or FALSE, |
914 merged_asterisk, | 2409 # indicating whether or not to perform |
915 my_cex_row, | 2410 # hierarchical clustering of the sample columns |
916 color_breaks, | 2411 merged_asterisk, # matrix having dimensions of x, values "*" or "" |
917 margins, | 2412 color_breaks, # breaks for color gradation, from create_breaks |
2413 # passed to `breaks` argument of `image` | |
2414 margins = c(8, 15), # two integers setting the bottom and right margins | |
2415 # to accommodate row and column labels | |
2416 master_cex = 0.7, # basis for text sizes | |
918 ... | 2417 ... |
919 ) { | 2418 ) { |
920 merged_scores <- x | 2419 merged_scores <- x |
921 if (!is.matrix(x)) { | 2420 if (!is.matrix(x)) { |
922 cat( | 2421 cat( |
924 "No plot because \\texttt{typeof(x)} is '", | 2423 "No plot because \\texttt{typeof(x)} is '", |
925 typeof(x), | 2424 typeof(x), |
926 "' rather than 'matrix'.\n\n" | 2425 "' rather than 'matrix'.\n\n" |
927 ) | 2426 ) |
928 ) | 2427 ) |
929 } else if (nrow(x) < 2) { | 2428 cat_variable(x) |
930 cat("No plot because matrix has ", nrow(x), " rows.\n\n") | |
931 return(FALSE) | 2429 return(FALSE) |
932 } else if (ncol(x) < 2) { | 2430 } |
933 cat("No plot because matrix x has ", ncol(x), " columns.\n\n") | 2431 if (print_trace_messages) cat(sprintf("master_cex = %03f\n\n", master_cex)) |
2432 nrow_x <- nrow(x) | |
2433 ncol_x <- ncol(x) | |
2434 #if (nrow_x < 2) { | |
2435 if (nrow_x < 1) { | |
2436 cat("No plot because matrix has no rows.\n\n") | |
934 return(FALSE) | 2437 return(FALSE) |
935 } else { | 2438 } else if (nrow_x < 2) { |
936 my_limit <- 25 | 2439 cat("No plot because matrix has one row. Matrix looks like this:\n\n") |
937 my_cex_col <- my_limit / (my_limit + ncol(x)) | 2440 cat("\n\\begin{verbatim}\n") |
938 my_cex_row <- my_limit / (my_limit + nrow(x)) | 2441 print(x) |
939 my_scale <- 12.0 | 2442 cat("\n\\end{verbatim}\n") |
940 if (ncol(x) < 10 && nrow(x) < 10) | 2443 return(FALSE) |
941 my_scale <- my_scale * 10 / (10 - nrow(x)) * 10 / (10 - ncol(x)) | 2444 } else if (ncol_x < 2) { |
942 gplots::heatmap.2( | 2445 cat("No plot because matrix x has ", ncol_x, " columns.\n\n") |
943 x = merged_scores, | 2446 cat_variable(x) |
944 Colv = sample_cluster, | 2447 return(FALSE) |
945 breaks = color_breaks[[1]], | 2448 } |
946 cellnote = merged_asterisk, | 2449 max_nchar_rowname <- max(nchar(rownames(x))) |
947 cexCol = 0.9 * my_cex_col, | 2450 max_nchar_colname <- max(nchar(colnames(x))) |
948 cexRow = 2 * my_cex_row, | 2451 my_limit <- g_intensity_hm_rows |
949 col = color_breaks[[2]], | 2452 |
950 density.info = "none", | 2453 my_row_cex_scale <- master_cex * 150 / nrow_x |
951 key = FALSE, | 2454 my_col_cex_scale <- 3.0 |
952 lhei = c(0.4, 8.0, 1.1), | 2455 my_asterisk_scale <- 0.4 * my_row_cex_scale |
953 lmat = rbind(c(0, 3), c(2, 1), c(0, 4)), | 2456 my_row_warp <- 1 |
954 lwid = c(0.5, 3), | 2457 my_note_warp <- 2 |
955 margins = margins, | 2458 my_row_warp <- 1 |
956 notecex = my_scale * my_cex_row * my_cex_col, | 2459 my_row_cex_asterisk <- |
957 notecol = "white", | 2460 master_cex * my_row_warp * my_asterisk_scale |
958 scale = "none", | 2461 |
959 srtCol = 45, | 2462 |
960 srtRow = 45, | 2463 my_col_cex <- my_col_cex_scale * master_cex |
961 trace = "none", | 2464 my_row_cex <- min(3.5 * my_row_cex_asterisk, my_col_cex) |
962 ... | 2465 my_key_cex <- 1.286 |
963 ) | 2466 my_hm2_cex <- 1 * master_cex |
964 return(TRUE) | 2467 my_offset <- (4.8 / (9 + nrow_x / 10)) - 0.4 |
965 } | 2468 if (print_trace_messages) cat(sprintf("nrow_x = %03f\n\n", nrow_x)) |
966 } | 2469 if (print_trace_messages) cat(sprintf("my_offset = %03f\n\n", my_offset)) |
967 | 2470 my_offset <- 0.05 |
968 # Adapted from KSEAapp::KSEA.Heatmap | 2471 if (print_trace_messages) cat(sprintf("my_offset = %03f\n\n", my_offset)) |
2472 my_scale <- 3.0 | |
2473 if (ncol_x < 10 && nrow_x < 10) | |
2474 my_scale <- my_scale * 10 / (10 - nrow_x) * 10 / (10 - ncol_x) | |
2475 | |
2476 my_heights <- c( | |
2477 0.15, | |
2478 3.85 - my_offset, | |
2479 0.5 + my_offset | |
2480 ) | |
2481 my_margins <- c(1, 1) + | |
2482 c( | |
2483 margins[1] * 0.08 * max_nchar_colname * my_col_cex, | |
2484 margins[2] * 0.04 * max_nchar_rowname * my_row_cex | |
2485 ) | |
2486 | |
2487 my_notecex <- | |
2488 my_scale * | |
2489 min( | |
2490 1.1, | |
2491 my_row_cex_asterisk * my_note_warp, | |
2492 my_col_cex * my_note_warp | |
2493 ) | |
2494 | |
2495 if (print_trace_messages) { | |
2496 cat_variable(my_heights, suffix = "; ") | |
2497 cat_variable(my_margins, suffix = "\n\n") | |
2498 cat_variable(my_row_cex_scale, suffix = "; ") | |
2499 cat_variable(my_col_cex_scale, suffix = "\n\n") | |
2500 cat_variable(my_row_cex_asterisk, suffix = "\n\n") | |
2501 cat_variable(my_row_cex, suffix = "; ") | |
2502 cat_variable(my_col_cex, suffix = "\n\n") | |
2503 cat_variable(my_row_cex, suffix = "; ") | |
2504 cat_variable(my_col_cex, suffix = "\n\n") | |
2505 } | |
2506 | |
2507 hm2plus( | |
2508 x = merged_scores, | |
2509 Colv = sample_cluster, | |
2510 cellnote = merged_asterisk, | |
2511 cex = my_hm2_cex, | |
2512 cexCol = my_col_cex, | |
2513 cexRow = my_row_cex, | |
2514 denhgt = 0.15, | |
2515 density_info = "none", | |
2516 denwid = 0.5, | |
2517 divergent = TRUE, | |
2518 key_par = list(cex = my_key_cex), | |
2519 key_xlab = "Z-score", | |
2520 margins = my_margins, | |
2521 notecex = my_scale * min( | |
2522 1.5, | |
2523 my_row_cex_asterisk * my_note_warp, | |
2524 my_col_cex * my_note_warp | |
2525 ), | |
2526 notecol = "white", | |
2527 scale = "none", | |
2528 srtcol = 90, | |
2529 srtrow = 0, | |
2530 trace = "none", | |
2531 mat = matrix( | |
2532 c( | |
2533 c(0, 3, 3), | |
2534 c(2, 1, 1), | |
2535 c(0, 4, 0) | |
2536 ), | |
2537 nrow = 3, | |
2538 ncol = 3, | |
2539 byrow = TRUE | |
2540 ), | |
2541 widths = c(0.5, 3.1, 0.9), | |
2542 heights = my_heights, | |
2543 ... | |
2544 ) | |
2545 return(TRUE) | |
2546 } | |
2547 | |
2548 # function drawing heatmap of contrast fold-change for each kinase, | |
2549 # adapted from KSEAapp::KSEA.Heatmap | |
969 ksea_heatmap <- function( | 2550 ksea_heatmap <- function( |
970 # the data frame outputs from the KSEA.Scores() function, in list format | 2551 # the data frame outputs from the KSEA.Scores() function, in list format |
971 score_list, | 2552 score_list, |
972 # a character vector of all the sample names for heatmap annotation: | 2553 # a character vector of all the sample names for heatmap annotation: |
973 # - the names must be in the same order as the data in score_list | 2554 # - the names must be in the same order as the data in score_list |
979 # a numeric value between 0 and infinity indicating the min. number of | 2560 # a numeric value between 0 and infinity indicating the min. number of |
980 # substrates a kinase must have to be included in the heatmap | 2561 # substrates a kinase must have to be included in the heatmap |
981 m_cutoff, | 2562 m_cutoff, |
982 # a numeric value between 0 and 1 indicating the p-value/FDR cutoff | 2563 # a numeric value between 0 and 1 indicating the p-value/FDR cutoff |
983 # for indicating significant kinases in the heatmap | 2564 # for indicating significant kinases in the heatmap |
984 p_cutoff = | 2565 p_cutoff = { |
985 stop("argument 'p_cutoff' is required for function 'ksea_heatmap'"), | 2566 cat("argument 'p_cutoff' is required for function 'ksea_heatmap'") |
2567 param_df_exit() | |
2568 knitr::knit_exit() | |
2569 }, | |
986 # a binary input of TRUE or FALSE, indicating whether or not to perform | 2570 # a binary input of TRUE or FALSE, indicating whether or not to perform |
987 # hierarchical clustering of the sample columns | 2571 # hierarchical clustering of the sample columns |
988 sample_cluster, | 2572 sample_cluster, |
989 # a binary input of TRUE or FALSE, indicating whether or not to export | 2573 # a binary input of TRUE or FALSE, indicating whether or not to export |
990 # the heatmap as a .png image into the working directory | 2574 # the heatmap as a .png image into the working directory |
991 export = FALSE, | 2575 export = FALSE, |
992 # bottom and right margins; adjust as needed if contrast names are too long | 2576 # bottom and right margins; adjust as needehttps://tex.stackexchange.com/a/56795d if contrast names are too long |
993 margins = c(6, 20), | 2577 margins = c(6, 6), |
994 # print which kinases? | 2578 # print which kinases? |
995 # - Mandatory argument, must be one of const_ksea_.*_kinases | 2579 # - Mandatory argument, must be one of const_ksea_.*_kinases |
996 which_kinases, | 2580 which_kinases, |
997 # additional arguments to gplots::heatmap.2, such as: | 2581 # additional arguments to gplots::heatmap.2, such as: |
998 # - main: main title of plot | 2582 # - main: main title of plot |
1037 } | 2621 } |
1038 return(new) | 2622 return(new) |
1039 } | 2623 } |
1040 merged_asterisk <- as.matrix(asterisk(merged_stats, p_cutoff)) | 2624 merged_asterisk <- as.matrix(asterisk(merged_stats, p_cutoff)) |
1041 | 2625 |
1042 # begin hack to print only significant rows | |
1043 asterisk_rows <- rowSums(merged_asterisk == "*") > 0 | 2626 asterisk_rows <- rowSums(merged_asterisk == "*") > 0 |
1044 all_rows <- rownames(merged_stats) | 2627 all_rows <- rownames(merged_stats) |
1045 names(asterisk_rows) <- all_rows | 2628 names(asterisk_rows) <- all_rows |
1046 non_asterisk_rows <- names(asterisk_rows[asterisk_rows == FALSE]) | 2629 non_asterisk_rows <- names(asterisk_rows[asterisk_rows == FALSE]) |
1047 asterisk_rows <- names(asterisk_rows[asterisk_rows == TRUE]) | 2630 asterisk_rows <- names(asterisk_rows[asterisk_rows == TRUE]) |
1048 merged_scores_asterisk <- merged_scores[names(asterisk_rows), , drop = FALSE] | 2631 merged_scores_asterisk <- merged_scores[names(asterisk_rows), , drop = FALSE] |
1049 merged_scores_non_asterisk <- merged_scores[names(non_asterisk_rows), , drop = FALSE] | 2632 merged_scores_non_asterisk <- merged_scores[names(non_asterisk_rows), , drop = FALSE] |
1050 # end hack to print only significant rows | |
1051 | 2633 |
1052 row_list <- list() | 2634 row_list <- list() |
1053 row_list[[const_ksea_astrsk_kinases]] <- asterisk_rows | 2635 row_list[[const_ksea_astrsk_kinases]] <- asterisk_rows |
1054 row_list[[const_ksea_all_kinases]] <- all_rows | 2636 row_list[[const_ksea_all_kinases]] <- all_rows |
1055 row_list[[const_ksea_nonastrsk_kinases]] <- non_asterisk_rows | 2637 row_list[[const_ksea_nonastrsk_kinases]] <- non_asterisk_rows |
1060 stts <- merged_stats[my_row_names, , drop = FALSE] | 2642 stts <- merged_stats[my_row_names, , drop = FALSE] |
1061 merged_asterisk <- as.matrix(asterisk(stts, p_cutoff)) | 2643 merged_asterisk <- as.matrix(asterisk(stts, p_cutoff)) |
1062 | 2644 |
1063 color_breaks <- create_breaks(scrs) | 2645 color_breaks <- create_breaks(scrs) |
1064 if (is.null(color_breaks)) { | 2646 if (is.null(color_breaks)) { |
1065 cat("No plot because matrix has too many missing values.\n\n") | 2647 cat("No plot because matrix has too few rows.\n\n") |
1066 return(NULL) | 2648 return(NULL) |
1067 } | 2649 } |
1068 plot_height <- nrow(scrs) ^ 0.55 | 2650 plot_height <- nrow(scrs) ^ 0.55 |
1069 plot_width <- ncol(scrs) ^ 0.7 | 2651 plot_width <- ncol(scrs) ^ 0.7 |
1070 my_cex_row <- 0.25 * 16 / plot_height | |
1071 if (export == "TRUE") { | 2652 if (export == "TRUE") { |
1072 png( | 2653 png( |
1073 "KSEA.Merged.Heatmap.png", | 2654 "KSEA.Merged.Heatmap.png", |
1074 width = plot_width * 300, | 2655 width = plot_width * 300, |
1075 height = 2 * plot_height * 300, | 2656 height = 2 * plot_height * 300, |
1079 } | 2660 } |
1080 did_draw <- draw_kseaapp_summary_heatmap( | 2661 did_draw <- draw_kseaapp_summary_heatmap( |
1081 x = scrs, | 2662 x = scrs, |
1082 sample_cluster = sample_cluster, | 2663 sample_cluster = sample_cluster, |
1083 merged_asterisk = merged_asterisk, | 2664 merged_asterisk = merged_asterisk, |
1084 my_cex_row = my_cex_row, | |
1085 color_breaks = color_breaks, | 2665 color_breaks = color_breaks, |
1086 margins = margins | 2666 margins = margins |
1087 ) | 2667 ) |
1088 if (export == "TRUE") { | 2668 if (export == "TRUE") { |
1089 dev.off() | 2669 dev.off() |
1091 if (!did_draw) | 2671 if (!did_draw) |
1092 return(NULL) | 2672 return(NULL) |
1093 return(my_row_names) | 2673 return(my_row_names) |
1094 } | 2674 } |
1095 | 2675 |
1096 # helper for heatmaps of phosphopeptide intensities | 2676 # helpers for heatmaps of phosphopeptide intensities |
1097 | 2677 |
1098 draw_ppep_heatmap <- | 2678 # factory producing function to truncate string after n characters |
2679 trunc_n <- function(n) { | |
2680 function(x) { | |
2681 sapply( | |
2682 X = x, | |
2683 FUN = function(s) { | |
2684 if (is.na(s)) | |
2685 return("NA") | |
2686 cond <- try_catch_w_e(nchar(s) > n) | |
2687 if (!is.logical(cond$value)) { | |
2688 return(cond$value$message) | |
2689 } else if (cond$value) { | |
2690 paste0( | |
2691 strtrim(s, n), | |
2692 "..." | |
2693 ) | |
2694 } else { | |
2695 s | |
2696 } | |
2697 }, | |
2698 USE.NAMES = FALSE | |
2699 ) | |
2700 } | |
2701 } | |
2702 trunc_long_ppep <- function(x) trunc_n(40)(x) | |
2703 trunc_ppep <- function(x) trunc_n(g_ppep_trunc_n)(x) | |
2704 trunc_subgene <- function(x) trunc_n(g_subgene_trunc_n)(x) | |
2705 trunc_enriched_substrate <- function(x) trunc_n(g_sbstr_trunc_n)(x) | |
2706 | |
2707 # factory producing a function that returns a covariance | |
2708 # matrix's rows (and columns) having variance > v_min | |
2709 keep_cov_w_var_gtr_min <- function(v_min) { | |
2710 function(x) { | |
2711 if (!is.matrix(x)) | |
2712 return(NULL) | |
2713 keepers <- sapply( | |
2714 X = seq_len(nrow(x)), | |
2715 FUN = function(i) { | |
2716 if (x[i, i] < v_min) | |
2717 NA | |
2718 else | |
2719 x[i, i] | |
2720 } | |
2721 ) | |
2722 names(keepers) <- rownames(x) | |
2723 keepers <- keepers[!is.na(keepers)] | |
2724 keepers <- names(keepers) | |
2725 if (length(keepers) == 0) | |
2726 return(NULL) | |
2727 x[keepers, keepers] | |
2728 } | |
2729 } | |
2730 # function that returns a matrix's rows having variance > 1 | |
2731 keep_cov_w_var_gtr_1 <- keep_cov_w_var_gtr_min(1) | |
2732 | |
2733 # factory producing a function that returns | |
2734 # - either a matrix's rows (rows = TRUE) | |
2735 # - or a matrix's columns (rows = FALSE) | |
2736 # having variance > v_min | |
2737 keep_var_gtr_min <- function(v_min) { | |
2738 function(x, rows = TRUE) { | |
2739 nrowcol <- if (rows) nrow else ncol | |
2740 if (!is.matrix(x)) | |
2741 return(NULL) | |
2742 keepers <- sapply( | |
2743 X = seq_len(nrowcol(x)), | |
2744 FUN = function(i) { | |
2745 row_var <- var( | |
2746 if (rows) x[i, ] else x[, i], | |
2747 na.rm = TRUE | |
2748 ) | |
2749 if (is.na(row_var) || row_var <= v_min) NA else i | |
2750 } | |
2751 ) | |
2752 keepers <- keepers[!is.na(keepers)] | |
2753 if (rows) x[keepers, ] else x[, keepers] | |
2754 } | |
2755 } | |
2756 | |
2757 keep_var_gtr_0 <- keep_var_gtr_min(0) | |
2758 | |
2759 # function drawing heatmap of phosphopeptide intensities | |
2760 ppep_heatmap <- | |
1099 function( | 2761 function( |
1100 m, # matrix with rownames already formatted | 2762 m, # matrix with rownames already formatted |
1101 cutoff, # cutoff used by hm_heading_function | 2763 cutoff, # cutoff used by hm_heading_function |
1102 hm_heading_function, # construct and cat heading from m and cutoff | 2764 hm_heading_function, # construct $ cat heading from m and cutoff |
1103 hm_main_title, # main title for plot (drawn below heading) | 2765 hm_main_title, # main title for plot (drawn below heading) |
1104 suppress_row_dendrogram = TRUE, # set to false to show dendrogram | 2766 suppress_row_dendrogram = TRUE, # set to false to show dendrogram |
1105 max_peptide_count # experimental: | 2767 max_peptide_count = # experimental: |
1106 = intensity_hm_rows, # values of 50 and 75 worked well | 2768 g_intensity_hm_rows, # values of 50 and 75 worked well |
1107 ... # passthru parameters for heatmap | 2769 master_cex = 1.0, # basis for text sizes |
2770 margins = NULL, # optional margins (bottom, right) | |
2771 cellnote = NULL, # optional matrix of character; dim = dim(m) | |
2772 adj = 0.5, # adjust text: 0 left, 0.5 middle, 1 right | |
2773 ... # passthru to hm2plus or heatmap.2 | |
1108 ) { | 2774 ) { |
2775 use_heatmap_1 <- FALSE | |
1109 peptide_count <- 0 | 2776 peptide_count <- 0 |
1110 # emit the heading for the heatmap | 2777 # emit the heading for the heatmap |
1111 if (hm_heading_function(m, cutoff)) { | 2778 if (hm_heading_function(m, cutoff)) { |
1112 peptide_count <- min(max_peptide_count, nrow(m)) | 2779 nrow_m <- nrow(m) |
1113 if (nrow(m) > 0) { | 2780 peptide_count <- min(max_peptide_count, nrow_m) |
2781 if (nrow_m > 1) { | |
1114 m_margin <- m[peptide_count:1, ] | 2782 m_margin <- m[peptide_count:1, ] |
1115 # Margin setting was heuristically derived | 2783 # Margin was heuristically derived to accommodate the widest label |
1116 margins <- | 2784 row_mchar_max <- max(nchar(rownames(m_margin))) |
1117 c(0.5, # col | 2785 col_mchar_max <- max(nchar(colnames(m_margin))) |
1118 max(80, sqrt(nchar(rownames(m_margin)))) * 5 / 16 # row | 2786 row_margin <- master_cex * row_mchar_max * 2.6 |
1119 ) | 2787 col_margin <- master_cex * col_mchar_max * 2.6 |
1120 } | 2788 if (print_trace_messages) cat(sprintf("row_margin = %0.3f; ", row_margin)) |
1121 if (nrow(m) > 0) { | 2789 if (print_trace_messages) cat(sprintf("col_margin = %0.3f; ", col_margin)) |
1122 hm_call <- NULL | 2790 hm_call <- NULL |
1123 tryCatch( | 2791 tryCatch( |
1124 { | 2792 { |
1125 old_oma <- par("oma") | 2793 # set non-argument parameters for hm_call inner function |
1126 par(cex.main = 0.6) | 2794 my_row_cex <- |
1127 # Heuristically determined character size adjustment formula | 2795 master_cex * 200000 / ( |
1128 my_cex_row <- | 2796 (max(nchar(rownames(m_margin)))^2) * g_intensity_hm_rows |
1129 250000 / ( | |
1130 max(4500, (nchar(rownames(m_margin)))^2) * intensity_hm_rows | |
1131 ) | 2797 ) |
1132 m_hm <- m[peptide_count:1, , drop = FALSE] | 2798 m_hm <- m[peptide_count:1, , drop = FALSE] |
1133 my_limit <- 60 | 2799 if (is.null(cellnote)) { |
1134 my_cex_col <- 0.75 * my_limit / (my_limit + ncol(m_hm)) | 2800 cellnote <- matrix("", nrow = nrow(m_hm), ncol = ncol(m_hm)) |
2801 cellnote[is.na(m_hm)] <- "NA" | |
2802 } else { | |
2803 cellnote <- cellnote[peptide_count:1, , drop = FALSE] | |
2804 } | |
2805 m_hm[is.na(m_hm)] <- 0 | |
2806 nrow_m_hm <- nrow(m_hm) | |
2807 ncol_m_hm <- ncol(m_hm) | |
2808 if (nrow_m_hm < 1 || ncol_m_hm < 1) | |
2809 return(peptide_count) # return zero as initialized above | |
2810 my_limit <- g_intensity_hm_rows | |
2811 | |
2812 | |
2813 my_row_cex <- master_cex * (100 / (2 + row_mchar_max)) | |
2814 my_col_cex <- master_cex * 6 * row_margin / col_margin | |
2815 my_col_adj <- min(my_col_cex, my_row_cex) / my_col_cex | |
2816 my_col_cex <- min(my_col_cex, my_row_cex) | |
2817 col_margin <- sqrt(my_col_adj) * col_margin | |
2818 if (print_trace_messages) cat(sprintf("my_row_cex = %0.3f; ", my_row_cex)) | |
2819 if (print_trace_messages) cat(sprintf("my_col_cex = %0.3f; ", my_col_cex)) | |
2820 if (is.null(margins)) my_margins <- | |
2821 c( | |
2822 (my_col_cex + col_margin), # col | |
2823 (my_row_cex + row_margin) / my_row_cex # row | |
2824 ) | |
2825 else | |
2826 my_margins <- margins | |
2827 | |
2828 if (print_trace_messages) cat( | |
2829 sprintf( | |
2830 "my_margins = c(%s)\n\n", | |
2831 paste(my_margins, collapse = ", ") | |
2832 ) | |
2833 ) | |
2834 my_hm2_cex <- 2 * master_cex | |
2835 my_key_cex <- 0.9 - 0.1 * (g_intensity_hm_rows + nrow_m_hm) / g_intensity_hm_rows | |
2836 my_key_warp <- 1.5 * 22.75 / row_margin | |
2837 my_key_cex <- min(1.10, my_key_warp * my_key_cex) | |
2838 my_hgt_scale <- 3.70 - 0.4 * (max(1, 0.9 * my_row_cex) - 1) | |
2839 my_hgt_scale <- 3.75 # 3.615 | |
2840 my_hgt_scale <- 3.60 # 3.615 | |
2841 if (print_trace_messages) | |
2842 cat_variable(my_hgt_scale, "\n\n", 3) | |
2843 my_warp <- max(0.1, 1.4 * (7.5 + nrow_m) / g_intensity_hm_rows) | |
2844 if (print_trace_messages) | |
2845 cat_variable(my_warp, "\n\n", 3) | |
2846 # added 0.9 heuristically... | |
2847 my_plot_height <- | |
2848 (0.566 + 0.354 * (nrow_m / g_intensity_hm_rows)) * | |
2849 min(my_hgt_scale, my_hgt_scale * my_warp) | |
2850 my_plot_height <- min(3.65, my_plot_height * g_intensity_hm_rows / 50) | |
2851 my_heights <- c( | |
2852 0.3, # title and top dendrogram | |
2853 my_plot_height, # plot and bottom margin | |
2854 4.15 - my_hgt_scale, # legend | |
2855 0.05 + my_hgt_scale - my_plot_height # whitespace below legend | |
2856 ) | |
2857 my_note_cex <- min(0.8, my_row_cex, my_col_cex) | |
2858 if (print_trace_messages) { | |
2859 cat_variable(my_plot_height, "\n\n", 3) | |
2860 cat_variable(4.19 - my_hgt_scale, "\n\n", 3) | |
2861 cat_variable(nrow_m_hm, "; ", 0) | |
2862 cat_variable(ncol_m_hm, "; ", 0) | |
2863 cat_variable(my_row_cex, "; ", 3) | |
2864 cat_variable(my_col_cex, "; ", 3) | |
2865 cat_variable(my_note_cex, "; ", 3) | |
2866 cat_variable(my_key_cex, "\n\n", 3) | |
2867 cat_variable(my_hgt_scale, "; ", 3) | |
2868 cat_variable(my_plot_height, "; ", 3) | |
2869 cat_variable(my_warp, "\n\n", 3) | |
2870 cat_variable(my_heights, "; ", 2) | |
2871 cat_variable(sum(my_heights), "\n\n", 3) | |
2872 } | |
2873 | |
2874 # define hm_call inner function | |
1135 hm_call <- function(x, scaling, title) { | 2875 hm_call <- function(x, scaling, title) { |
1136 heatmap( | 2876 my_cex_main <- min(5.0, 220 / nchar(title)) |
1137 x, | 2877 op <- par( |
1138 Rowv = if (suppress_row_dendrogram) NA else NULL, | 2878 cex.main = my_cex_main * master_cex, |
1139 Colv = NA, | 2879 adj = adj |
1140 cexRow = my_cex_row, | 2880 ) |
1141 cexCol = my_cex_col, | 2881 if ( |
1142 scale = scaling, | 2882 !is.null( |
1143 margins = margins, | 2883 hm2plus( |
1144 main = title, | 2884 x, |
1145 xlab = "", | 2885 Colv = NA, |
1146 las = 1, | 2886 Rowv = TRUE, |
1147 ... | 2887 cexRow = my_row_cex, |
2888 cexCol = my_col_cex, | |
2889 dendrogram = "row", | |
2890 las = 1, | |
2891 main = title, | |
2892 key_xlab = latex2exp::TeX("$log_{10}$(peptide intensity)"), | |
2893 cex = my_hm2_cex, | |
2894 key_par = list(cex = my_key_cex), | |
2895 margins = my_margins, | |
2896 widths = c(0.4, 2.6, 1.5), | |
2897 heights = my_heights, | |
2898 mat = matrix( | |
2899 c( | |
2900 c(0, 3, 3), | |
2901 c(2, 1, 1), | |
2902 c(0, 4, 0), | |
2903 c(0, 0, 0) | |
2904 ), | |
2905 nrow = 4, | |
2906 ncol = 3, | |
2907 byrow = TRUE | |
2908 ), | |
2909 na.rm = TRUE, | |
2910 scale = scaling, | |
2911 srtcol = 90, | |
2912 srtrow = 0, | |
2913 xlab = "", | |
2914 cellnote = cellnote, | |
2915 notecex = my_note_cex, | |
2916 ... | |
2917 ) | |
1148 ) | 2918 ) |
2919 ) { | |
2920 if (print_trace_messages) cat( | |
2921 sprintf( | |
2922 "my_heights = c(%s); sum = %0.3f\n\n", | |
2923 paste( | |
2924 sprintf("%0.3f", my_heights), | |
2925 collapse = ", " | |
2926 ), | |
2927 sum(my_heights) | |
2928 ) | |
2929 ) | |
2930 if (print_trace_messages) cat( | |
2931 sprintf("my_key_cex = %0.3f\n\n", | |
2932 my_key_cex) | |
2933 ) | |
2934 if (print_trace_messages) cat( | |
2935 sprintf("my_key_cex/my_heights[3] = %0.3f\n\n", | |
2936 my_key_cex / my_heights[3]) | |
2937 ) | |
2938 if (print_trace_messages) cat( | |
2939 sprintf("my_heights[2]/my_heights[3] = %0.3f\n\n", | |
2940 my_heights[2] / my_heights[3]) | |
2941 ) | |
2942 } | |
2943 par(op) | |
1149 } | 2944 } |
2945 | |
2946 # invoke hm_call inner function | |
1150 if (sum(rowSums(!is.na(m_hm)) < 2)) | 2947 if (sum(rowSums(!is.na(m_hm)) < 2)) |
1151 hm_call( | 2948 hm_call( |
1152 m_hm, | 2949 m_hm, |
1153 "none", | 2950 "none", |
1154 "log(intensities), unscaled, unimputed, and unnormalized" | 2951 "log(intensities), unscaled, unimputed, and unnormalized" |
1168 if (nrow(m_hm) > 1) | 2965 if (nrow(m_hm) > 1) |
1169 hm_call( | 2966 hm_call( |
1170 m_hm, | 2967 m_hm, |
1171 "none", | 2968 "none", |
1172 paste( | 2969 paste( |
1173 "log(intensities), unscaled, unimputed,", | 2970 "log(intensities), unscaled,", |
1174 "NAs zeroed, unnormalized" | 2971 "zero-imputed, unnormalized" |
1175 ) | 2972 ) |
1176 ) | 2973 ) |
1177 else | 2974 else |
1178 cat("\nThere are too few peptides to produce a heatmap.\n") | 2975 cat("\nThere are too few peptides to produce a heatmap.\n") |
1179 }, | 2976 }, |
1180 error = function(r) { | 2977 error = function(r) { |
1181 cat( | 2978 cat( |
1182 sprintf( | 2979 sprintf( |
1183 "\n%s %s Internal message: %s\n", | 2980 "\n%s %s Internal message: %s\n\\newline\n\n", |
1184 "Could not draw heatmap,", | 2981 "Failure drawing heatmap,", |
1185 "possibly because of too many missing values.", | 2982 "possibly because of too many missing values.\n\\newline\n\n", |
1186 r$message | 2983 r$message |
1187 ) | 2984 ) |
1188 ) | 2985 ) |
2986 cat_margins() | |
1189 } | 2987 } |
1190 ) | 2988 ) |
1191 } else { | 2989 } else { |
1192 cat( | 2990 cat( |
1193 "\nCould not draw heatmap, possibly because of too many missing values.\n" | 2991 "\nFailure drawing heatmap, possibly because of too many missing values.\n" |
1194 ) | 2992 ) |
1195 } | 2993 } |
1196 }, | 2994 } |
1197 finally = par(old_oma) | |
1198 ) | 2995 ) |
1199 } | 2996 } |
1200 } | 2997 } |
1201 return(peptide_count) | 2998 # return value: |
1202 } | 2999 peptide_count |
3000 } | |
3001 | |
3002 # function drawing heatmap of correlations if they exist, else covariances | |
3003 cov_heatmap <- | |
3004 function( | |
3005 m, # matrix with rownames already formatted | |
3006 top_substrates = FALSE, | |
3007 ... # passthru to hm2plus or heatmap.2 | |
3008 ) { | |
3009 if (print_nb_messages) nbe(see_variable(m), " [", nrow(m), "x", ncol(m), "\n") | |
3010 #ACE nb(rowSums(m, na.rm = TRUE)) | |
3011 #ACE bad_rows <- (rowSums(m, na.rm = TRUE) == 0) | |
3012 #ACE nb(see_variable(bad_rows)) | |
3013 #ACE m <- m[-bad_rows, , drop = FALSE] | |
3014 colnames_m <- colnames(m) | |
3015 is_na_m <- is.na(m) | |
3016 tmp <- m | |
3017 tmp[is_na_m] <- 0 | |
3018 | |
3019 tmp <- m[, 0 < colSums(x = tmp)] # by default, na.rm is FALSE | |
3020 | |
3021 colnames_tmp <- colnames(tmp) | |
3022 | |
3023 my_low_p_seq <- seq( | |
3024 from = min(g_intensity_hm_rows, nrow(m)), | |
3025 to = 1, | |
3026 by = -1 | |
3027 ) | |
3028 | |
3029 if (g_correlate_substrates) { | |
3030 # zap samples having zero or near-zero variance | |
3031 tmp[is.na(tmp)] <- 0 | |
3032 nzv <- caret::nearZeroVar( | |
3033 tmp, # matrix of values, samples x variables | |
3034 freqCut = 1.01, # min(freq most prevalent value / | |
3035 # freq second most prevalent) | |
3036 uniqueCut = 99 # max(number of unique values / | |
3037 # total number of samples) | |
3038 ) | |
3039 tmp <- if (length(nzv) > 0) { | |
3040 m[, -nzv, drop = FALSE] | |
3041 } else { | |
3042 m | |
3043 } | |
3044 } else { | |
3045 tmp <- m[my_low_p_seq, , drop = FALSE] | |
3046 } | |
3047 | |
3048 | |
3049 t_m <- t(tmp) | |
3050 t_m[is.na(t_m)] <- 0 | |
3051 prefiltered_nrow <- ncol(t_m) | |
3052 | |
3053 my_corcov <- cov(t_m) | |
3054 did_filter_rows <- did_filter_cols <- FALSE | |
3055 if (g_correlate_substrates && !is_positive_definite(my_corcov)) { | |
3056 my_correlate_substrates <- FALSE | |
3057 t_m <- t(m[my_low_p_seq, , drop = FALSE]) | |
3058 t_m[is.na(t_m)] <- 0 | |
3059 unfiltered_row_count <- ncol(t_m) | |
3060 unfiltered_col_count <- nrow(t_m) | |
3061 | |
3062 # zap empty samples | |
3063 t_m <- t_m[0 < rowSums(x = t_m), ] | |
3064 # zap substrates present in fewer than two samples | |
3065 foo <- t_m > 0 | |
3066 foo <- colSums(x = foo) > 1 | |
3067 t_m <- t_m[, foo] | |
3068 | |
3069 did_filter_rows <- unfiltered_row_count > ncol(t_m) | |
3070 did_filter_cols <- unfiltered_col_count > nrow(t_m) | |
3071 | |
3072 colnames_tmp <- rownames(t_m) | |
3073 my_corcov <- cov(t_m) | |
3074 if (g_filter_cov_var_gt_1) { | |
3075 my_corcov <- keep_cov_w_var_gtr_1(my_corcov) | |
3076 } | |
3077 } else if (g_correlate_substrates) { | |
3078 my_corcov <- cov2cor(my_corcov) | |
3079 my_correlate_substrates <- TRUE | |
3080 } else { | |
3081 my_correlate_substrates <- FALSE | |
3082 if (g_filter_cov_var_gt_1) my_corcov <- keep_cov_w_var_gtr_1(my_corcov) | |
3083 } | |
3084 | |
3085 omitted_samples <- colnames_m[colnames_m %notin% colnames_tmp] | |
3086 suffix <- if (length(omitted_samples) > 1) "s" else "" | |
3087 | |
3088 f_omissions <- | |
3089 function(is_corr) { | |
3090 cat( | |
3091 sprintf( | |
3092 "Below is the %s plot for %s substrates", | |
3093 if (is_corr) "correlation" else "covariance", | |
3094 sprintf( | |
3095 if (top_substrates) | |
3096 "%0.0f \"highest-quality\"" | |
3097 else | |
3098 "%0.0f", | |
3099 ncol(t_m) | |
3100 ) | |
3101 ) | |
3102 ) | |
3103 if (did_filter_cols) { | |
3104 cat(sprintf(", omitting sample%s ", suffix)) | |
3105 latex_collapsed_vector(", ", omitted_samples) | |
3106 } | |
3107 cat(".\n\n") | |
3108 } | |
3109 | |
3110 if (is.null(my_corcov) || sum(!is.na(t_m)) < 2) { | |
3111 cat( | |
3112 sprintf( | |
3113 "\\newline\n%s %s plot.\n", | |
3114 "Insufficient covariance to produce", | |
3115 if (my_correlate_substrates) | |
3116 "correlation" | |
3117 else | |
3118 "covariance" | |
3119 ), | |
3120 "\\newpage\n" | |
3121 ) | |
3122 return(NULL) | |
3123 } | |
3124 | |
3125 cat("\\leavevmode\n", "\\newpage\n") | |
3126 f_omissions(my_correlate_substrates) | |
3127 | |
3128 master_cex <- 0.4 | |
3129 max_nchar <- max(nchar(rownames(t_m))) | |
3130 my_limit <- g_intensity_hm_rows | |
3131 diminution <- sqrt(my_limit / (my_limit + ncol(t_m))) | |
3132 my_row_cex <- | |
3133 my_col_cex <- | |
3134 min(1.75, master_cex * 9 * diminution ^ 1.5) | |
3135 my_margin <- 3 + my_row_cex * 64 / (8 + max_nchar) | |
3136 my_key_cex <- 1.4 | |
3137 my_hm2_cex <- 1.0 * master_cex | |
3138 my_hgt_scale <- 3.50 - 0.26 * (max(0.4, my_key_cex) - 0.4) | |
3139 my_hgt_scale <- 2.7 | |
3140 | |
3141 my_legend_height <- 4.0 - my_hgt_scale | |
3142 my_legend_height <- 0.5 * my_key_cex | |
3143 my_warp <- 0.65 * (my_limit + ncol(t_m)) / my_limit | |
3144 my_warp <- 0.8 | |
3145 my_legend_height <- 0.77 | |
3146 my_legend_height <- 0.67 | |
3147 my_plot_height <- my_hgt_scale + (1 - my_warp) * my_legend_height | |
3148 my_legend_height <- my_warp * my_legend_height | |
3149 | |
3150 parjust <- par(adj = 0.5) | |
3151 on.exit(par(parjust)) | |
3152 my_corcov <- my_corcov[order(rownames(my_corcov)), ] | |
3153 my_main <- | |
3154 sprintf("%s among %s substrates %s", | |
3155 if (my_correlate_substrates) "Correlation" | |
3156 else "Covariance", | |
3157 kinase_name, | |
3158 if (!my_correlate_substrates && | |
3159 g_filter_cov_var_gt_1 && | |
3160 did_filter_rows | |
3161 ) | |
3162 "having variance > 1" | |
3163 else "" | |
3164 ) | |
3165 my_main_nchar <- nchar(my_main) | |
3166 my_heights <- c( | |
3167 0.3, | |
3168 my_plot_height, | |
3169 my_legend_height # was 4.0 - my_hgt_scale # was 4.19 | |
3170 ) | |
3171 if (print_trace_messages) cat(sprintf("max_nchar = %0.3f; ", max_nchar)) | |
3172 if (print_trace_messages) cat(sprintf("my_margin = %0.3f; ", my_margin)) | |
3173 if (print_trace_messages) cat(sprintf("my_plot_height = %0.3f\n\n", my_plot_height)) | |
3174 if (print_trace_messages) cat(sprintf("master_cex = %0.3f; ", master_cex)) | |
3175 if (print_trace_messages) cat(sprintf("my_row_cex = %0.3f; ", my_row_cex)) | |
3176 if (print_trace_messages) cat(sprintf("my_col_cex = %0.3f; ", my_col_cex)) | |
3177 if (print_trace_messages) cat(sprintf("my_key_cex = %0.3f\n\n", my_key_cex)) | |
3178 if (print_trace_messages) cat(sprintf("my_hgt_scale = %0.3f\n\n", my_hgt_scale)) | |
3179 if (print_trace_messages) cat(sprintf("legend height = %0.3f\n\n", my_legend_height)) | |
3180 if (print_trace_messages) cat( | |
3181 sprintf( | |
3182 "my_heights = c(%s); sum = %0.3f\n\n", | |
3183 paste( | |
3184 sprintf("%0.3f", my_heights), | |
3185 collapse = ", " | |
3186 ), | |
3187 sum(my_heights) | |
3188 ) | |
3189 ) | |
3190 op <- par(cex.main = (30 + my_main_nchar) / my_main_nchar) | |
3191 on.exit(par(op)) | |
3192 hm2plus( | |
3193 x = my_corcov, | |
3194 cex = my_hm2_cex, | |
3195 cexCol = my_col_cex, | |
3196 cexRow = my_row_cex, | |
3197 density_info = "none", | |
3198 denhgt = 0.15, | |
3199 denwid = 0.5, | |
3200 divergent = TRUE, | |
3201 key_par = list(cex = my_key_cex), | |
3202 key_xlab = if (my_correlate_substrates) "Correlation" | |
3203 else "Covariance", | |
3204 main = my_main, | |
3205 mat = matrix( | |
3206 c( | |
3207 c(0, 3, 3), | |
3208 c(2, 1, 1), | |
3209 c(0, 4, 0) | |
3210 ), | |
3211 nrow = 3, | |
3212 ncol = 3, | |
3213 byrow = TRUE | |
3214 ), | |
3215 heights = my_heights, | |
3216 margins = c(my_margin, my_margin), | |
3217 widths = c(0.5, 3.1, 0.9), | |
3218 scale = "none", | |
3219 symkey = TRUE, | |
3220 symbreaks = TRUE, | |
3221 symm = FALSE #TODO evaluate TRUE | |
3222 # ... | |
3223 ) | |
3224 } # end cov_heatmap | |
3225 | |
3226 ### FILE IMPORT | |
3227 | |
3228 # function reading bzipped file to data.frame | |
3229 bzip2df <- function(d, f, ctor = bzfile) { | |
3230 # read.delim file (by default, compressed by bzip2) | |
3231 if (file.exists(f)) { | |
3232 conn <- NULL | |
3233 pf <- parent.frame() | |
3234 tryCatch( | |
3235 assign( | |
3236 as.character(substitute(d)), | |
3237 read.delim(conn <- bzfile(f, open = "r")), | |
3238 pf | |
3239 ), | |
3240 finally = if (!is.null(conn)) close(conn) | |
3241 ) | |
3242 } | |
3243 } | |
3244 | |
1203 ``` | 3245 ``` |
1204 | 3246 |
1205 ```{r, echo = FALSE, fig.dim = c(9, 10), results = 'asis'} | |
1206 cat("\\listoftables\n") | |
1207 ``` | |
1208 # Purpose | 3247 # Purpose |
1209 | 3248 |
1210 To perform for phosphopeptides: | 3249 The purpose of this analysis is to perform for phosphopeptides: |
1211 | 3250 |
1212 - imputation of missing values, | 3251 - imputation of missing values, |
1213 - quantile normalization, | 3252 - quantile normalization, |
1214 - ANOVA (using the R stats::`r params$oneWayManyCategories` function), and | 3253 - ANOVA (using the R stats::`r params$oneWayManyCategories` function), |
3254 - assignment of an FDR-adjusted $p$-value and a "quality score" to each phosphopeptide, and | |
1215 - KSEA (Kinase-Substrate Enrichment Analysis) using code adapted from the CRAN `KSEAapp` package to search for kinase substrates from the following databases: | 3255 - KSEA (Kinase-Substrate Enrichment Analysis) using code adapted from the CRAN `KSEAapp` package to search for kinase substrates from the following databases: |
1216 - PhosphoSitesPlus [https://www.phosphosite.org](https://www.phosphosite.org) | 3256 - PhosphoSitesPlus [https://www.phosphosite.org](https://www.phosphosite.org) |
1217 - The Human Proteome Database [http://hprd.org](http://hprd.org) | 3257 - The Human Proteome Database [http://hprd.org](http://hprd.org) |
1218 - NetworKIN [http://networkin.science/](http://networkin.science/) | 3258 - NetworKIN [http://networkin.science/](http://networkin.science/) |
1219 - Phosida [http://pegasus.biochem.mpg.de/phosida/help/motifs.aspx](http://pegasus.biochem.mpg.de/phosida/help/motifs.aspx) | 3259 - Phosida [http://pegasus.biochem.mpg.de/phosida/help/motifs.aspx](http://pegasus.biochem.mpg.de/phosida/help/motifs.aspx) |
1220 | 3260 |
1221 ```{r include = FALSE} | 3261 ```{r include = FALSE} |
1222 | 3262 |
1223 ### GLOBAL VARIABLES | 3263 if (params$kseaUseAbsoluteLog2FC) { |
1224 | 3264 sfc <- "|s|" |
1225 # parameters for KSEA | 3265 pfc <- "|p|" |
1226 | 3266 pfc_txt <- "$\\text{absolute value}({\\log_2 (\\text{fold-change})})$" |
1227 ksea_cutoff_statistic <- params$kseaCutoffStatistic | 3267 } else { |
1228 ksea_cutoff_threshold <- params$kseaCutoffThreshold | 3268 sfc <- "s" |
1229 ksea_min_kinase_count <- params$kseaMinKinaseCount | 3269 pfc <- "p" |
3270 pfc_txt <- "${\\log_2 (\\text{fold-change}})$" | |
3271 } | |
1230 | 3272 |
1231 ksea_heatmap_titles <- list() | 3273 ksea_heatmap_titles <- list() |
1232 ksea_heatmap_titles[[const_ksea_astrsk_kinases]] <- | 3274 ksea_heatmap_titles[[const_ksea_astrsk_kinases]] <- |
1233 sprintf( | 3275 sprintf( |
1234 "Summary for all kinases enriched in one or more contrasts at %s < %s", | 3276 "Summary for all kinases enriched in one or more contrasts at %s < %s", |
1244 ksea_cutoff_threshold | 3286 ksea_cutoff_threshold |
1245 ) | 3287 ) |
1246 # hash to hold names of significantly enriched kinases | 3288 # hash to hold names of significantly enriched kinases |
1247 ksea_asterisk_hash <- new_env() | 3289 ksea_asterisk_hash <- new_env() |
1248 | 3290 |
1249 # READ PARAMETERS (mostly) | 3291 # PROCESS (mostly read) PARAMETERS |
1250 | |
1251 intensity_hm_rows <- params$intensityHeatmapRows | |
1252 # Input Filename | |
1253 input_file <- params$inputFile | |
1254 | |
1255 # First data column - ideally, this could be detected via regexSampleNames, | |
1256 # but for now leave it as is. | |
1257 first_data_column <- params$firstDataColumn | |
1258 fdc_is_integer <- is.integer(first_data_column) | |
1259 if (fdc_is_integer) { | |
1260 first_data_column <- as.integer(params$firstDataColumn) | |
1261 } | |
1262 | 3292 |
1263 # False discovery rate adjustment for ANOVA | 3293 # False discovery rate adjustment for ANOVA |
1264 # Since pY abundance is low, set to 0.10 and 0.20 in addition to 0.05 | 3294 # Since pY abundance is low, set to 0.10 and 0.20 in addition to 0.05 |
1265 val_fdr <- | 3295 val_fdr <- read.table(file = alpha_file, sep = "\t", header = FALSE, quote = "") |
1266 read.table(file = params$alphaFile, sep = "\t", header = FALSE, quote = "") | |
1267 | 3296 |
1268 if ( | 3297 if ( |
1269 ncol(val_fdr) != 1 || | 3298 ncol(val_fdr) != 1 || |
1270 sum(!is.numeric(val_fdr[, 1])) || | 3299 sum(!is.numeric(val_fdr[, 1])) || |
1271 sum(val_fdr[, 1] < 0) || | 3300 sum(val_fdr[, 1] < 0) || |
1272 sum(val_fdr[, 1] > 1) | 3301 sum(val_fdr[, 1] > 1) |
1273 ) { | 3302 ) { |
1274 stop("alphaFile should be one column of numbers within the range [0.0,1.0]") | 3303 cat("alphaFile should be one column of numbers within the range [0.0,1.0]") |
3304 param_df_exit() | |
3305 knitr::knit_exit() | |
1275 } | 3306 } |
1276 val_fdr <- val_fdr[, 1] | 3307 val_fdr <- val_fdr[, 1] |
1277 | 3308 |
1278 #Imputed Data filename | |
1279 imputed_data_filename <- params$imputedDataFilename | |
1280 imp_qn_lt_data_filenm <- params$imputedQNLTDataFile | |
1281 anova_ksea_mtdt_file <- params$anovaKseaMetadata | |
1282 | |
1283 ``` | 3309 ``` |
1284 | 3310 |
1285 ```{r echo = FALSE} | 3311 ```{r echo = FALSE, results = 'asis'} |
1286 # Imputation method, should be one of | 3312 |
1287 # "random", "group-median", "median", or "mean" | 3313 |
1288 imputation_method <- params$imputationMethod | |
1289 | |
1290 # Selection of percentile of logvalue data to set the mean for random number | |
1291 # generation when using random imputation | |
1292 mean_percentile <- params$meanPercentile / 100.0 | |
1293 | |
1294 # deviation adjustment-factor for random values; real number. | |
1295 sd_percentile <- params$sdPercentile | |
1296 | |
1297 # Regular expression of Sample Names, e.g., "\\.(\\d+)[A-Z]$" | |
1298 regex_sample_names <- params$regexSampleNames | |
1299 | |
1300 # Regular expression to extract Sample Grouping from Sample Name; | |
1301 # if error occurs, compare sample_treatment_levels vs. sample_name_matches | |
1302 # to see if groupings/pairs line up | |
1303 # e.g., "(\\d+)" | |
1304 regex_sample_grouping <- params$regexSampleGrouping | |
1305 | |
1306 one_way_all_categories_fname <- params$oneWayManyCategories | |
1307 one_way_all_categories <- try_catch_w_e( | |
1308 match.fun(one_way_all_categories_fname)) | |
1309 if (!is.function(one_way_all_categories$value)) { | |
1310 write("fatal error for parameter oneWayManyCategories:", stderr()) | |
1311 write(one_way_all_categories$value$message, stderr()) | |
1312 if (sys.nframe() > 0) quit(save = "no", status = 1) | |
1313 stop("Cannot continue. Goodbye.") | |
1314 } | |
1315 one_way_all_categories <- one_way_all_categories$value | |
1316 | |
1317 one_way_two_categories_fname <- params$oneWayManyCategories | |
1318 one_way_two_categories <- try_catch_w_e( | |
1319 match.fun(one_way_two_categories_fname)) | |
1320 if (!is.function(one_way_two_categories$value)) { | |
1321 cat("fatal error for parameter oneWayTwoCategories: \n") | |
1322 cat(one_way_two_categories$value$message, fill = TRUE) | |
1323 if (sys.nframe() > 0) quit(save = "no", status = 1) | |
1324 stop("Cannot continue. Goodbye.") | |
1325 } | |
1326 one_way_two_categories <- one_way_two_categories$value | |
1327 | |
1328 preproc_db <- params$preprocDb | |
1329 ksea_app_prep_db <- params$kseaAppPrepDb | |
1330 result <- file.copy( | 3314 result <- file.copy( |
1331 from = preproc_db, | 3315 from = preproc_db, |
1332 to = ksea_app_prep_db, | 3316 to = ksea_app_prep_db, |
1333 overwrite = TRUE | 3317 overwrite = TRUE |
1334 ) | 3318 ) |
1339 preproc_db, | 3323 preproc_db, |
1340 ksea_app_prep_db, | 3324 ksea_app_prep_db, |
1341 ), | 3325 ), |
1342 stderr() | 3326 stderr() |
1343 ) | 3327 ) |
1344 if (sys.nframe() > 0) quit(save = "no", status = 1) | 3328 if (sys.nframe() > 0) { |
1345 stop("Cannot continue. Goodbye.") | 3329 cat("Cannot continue and quit() failed. Goodbye.") |
1346 } | 3330 param_df_exit() |
3331 knitr::knit_exit() | |
3332 # in case knit_exit doesn't exit | |
3333 quit(save = "no", status = 1) | |
3334 } | |
3335 } | |
3336 | |
3337 if (FALSE) { | |
3338 write.table(x = param_df, file = "test-data/params.txt") | |
3339 } | |
3340 | |
1347 ``` | 3341 ``` |
1348 | 3342 |
1349 ```{r echo = FALSE} | 3343 ```{r echo = FALSE} |
1350 ### READ DATA | 3344 ### READ DATA |
1351 | 3345 |
1356 sep = "\t", | 3350 sep = "\t", |
1357 header = TRUE, | 3351 header = TRUE, |
1358 quote = "", | 3352 quote = "", |
1359 check.names = FALSE | 3353 check.names = FALSE |
1360 ) | 3354 ) |
3355 | |
1361 ``` | 3356 ``` |
1362 | 3357 |
1363 # Extract Sample Classes and Names | 3358 |
1364 | 3359 # Extraction of Sample Classes and Names from Input Data |
1365 Column names parsed from input file are shown in Table 1; sample classes and names, in Table 2. | |
1366 | 3360 |
1367 ```{r echo = FALSE, results = 'asis'} | 3361 ```{r echo = FALSE, results = 'asis'} |
1368 | 3362 |
1369 data_column_indices <- grep(first_data_column, names(full_data), perl = TRUE) | 3363 data_column_indices <- grep(first_data_column, names(full_data), perl = TRUE) |
3364 my_column_names <- names(full_data) | |
1370 | 3365 |
1371 if (!fdc_is_integer) { | 3366 if (!fdc_is_integer) { |
1372 if (length(data_column_indices) > 0) { | 3367 if (length(data_column_indices) > 0) { |
1373 first_data_column <- data_column_indices[1] | 3368 first_data_column <- data_column_indices[1] |
1374 } else { | 3369 } else { |
1375 stop(paste("failed to convert firstDataColumn:", first_data_column)) | 3370 cat(paste("failed to convert firstDataColumn:", first_data_column)) |
3371 param_df_exit() | |
3372 knitr::knit_exit() | |
1376 } | 3373 } |
1377 } | 3374 } |
1378 | 3375 |
1379 cat( | 3376 cat( |
1380 sprintf( | 3377 sprintf( |
1381 paste( | 3378 paste( |
1382 "\n\nThe input data file has peptide-intensity data for each sample", | 3379 "\n\nThe input data file has peptide-intensity data", |
1383 "in one of columns %d through %d.\n\n" | 3380 "in columns %d (\"%s\") through %d (\"%s\")." |
1384 ), | 3381 ), |
1385 min(data_column_indices), | 3382 tmp <- min(data_column_indices), |
1386 max(data_column_indices) | 3383 my_column_names[tmp], |
1387 ) | 3384 tmp <- max(data_column_indices), |
1388 ) | 3385 my_column_names[tmp] |
1389 | 3386 ) |
1390 # Write column names as a LaTeX enumerated list. | 3387 ) |
1391 column_name_df <- data.frame( | 3388 |
1392 column = seq_len(length(colnames(full_data))), | 3389 if (TRUE) { |
1393 name = paste0("\\verb@", colnames(full_data), "@") | 3390 cat0( |
1394 ) | 3391 table_offset(i = 1, new = TRUE), |
1395 cat("\n\\begin{tiny}\n") | 3392 "Sample classes and names are shown in ", |
1396 data_frame_latex( | 3393 table_href(), |
1397 x = column_name_df, | 3394 ".\n\n" |
1398 justification = "l l", | 3395 ) |
1399 centered = TRUE, | 3396 } else { |
1400 caption = "Input data column names", | 3397 cat0( |
1401 anchor = const_table_anchor_bp, | 3398 "\\newcounter{offset}\n", |
1402 underscore_whack = FALSE | 3399 "\\setcounter{offset}{\\value{table}}\n", |
1403 ) | 3400 "\\stepcounter{offset}\n", |
1404 cat("\n\\end{tiny}\n") | 3401 "Sample classes and names are shown in ", |
3402 table_href(), | |
3403 ".\n\n" | |
3404 ) | |
3405 } | |
3406 | |
3407 #TODO remove this unused variable and assignment | |
3408 if (FALSE) { | |
3409 # Write column names as a LaTeX enumerated list. | |
3410 column_name_df <- data.frame( | |
3411 column = seq_len(length(colnames(full_data))), | |
3412 name = paste0("\\verb@", colnames(full_data), "@") | |
3413 ) | |
3414 } | |
1405 | 3415 |
1406 ``` | 3416 ``` |
1407 | 3417 |
1408 ```{r echo = FALSE, results = 'asis'} | 3418 ```{r echo = FALSE, results = 'asis'} |
3419 # extract intensity columns from full_data to quant_data | |
1409 quant_data <- full_data[first_data_column:length(full_data)] | 3420 quant_data <- full_data[first_data_column:length(full_data)] |
1410 quant_data[quant_data == 0] <- NA | 3421 quant_data[quant_data == 0] <- NA |
1411 rownames(quant_data) <- rownames(full_data) <- full_data$Phosphopeptide | 3422 rownames(quant_data) <- rownames(full_data) <- full_data$Phosphopeptide |
3423 full_data_names <- colnames(quant_data) | |
1412 # Extract factors and trt-replicates using regular expressions. | 3424 # Extract factors and trt-replicates using regular expressions. |
1413 # Typically: | 3425 # Typically: |
1414 # regex_sample_names is "\\.\\d+[A-Z]$" | 3426 # regex_sample_names is "\\.\\d+[A-Z]$" |
1415 # regex_sample_grouping is "\\d+" | 3427 # regex_sample_grouping is "\\d+" |
1416 # This would distinguish trt-replicates by terminal letter [A-Z] | 3428 # This would distinguish trt-replicates by terminal letter [A-Z] |
1424 colnames(quant_data) <- sample_name_matches | 3436 colnames(quant_data) <- sample_name_matches |
1425 | 3437 |
1426 write_debug_file(quant_data) | 3438 write_debug_file(quant_data) |
1427 | 3439 |
1428 rx_match <- regexpr(regex_sample_grouping, sample_name_matches, perl = TRUE) | 3440 rx_match <- regexpr(regex_sample_grouping, sample_name_matches, perl = TRUE) |
1429 sample_treatment_levels <- as.factor(regmatches(sample_name_matches, rx_match)) | 3441 smpl_trt <- as.factor(regmatches(sample_name_matches, rx_match)) |
3442 | |
3443 if (print_nb_messages) nbe(see_variable(smpl_trt, "\n\n")) | |
3444 if (print_nb_messages) nbe(see_variable(sample_name_matches, "\n\n")) | |
3445 if (print_nb_messages) nbe(see_variable(full_data_names, "\n\n")) | |
3446 | |
3447 sample_treatment_df <- | |
3448 save_sample_treatment_df <- | |
3449 data.frame( | |
3450 class = smpl_trt, | |
3451 sample = sample_name_matches, | |
3452 full_sample_names = full_data_names | |
3453 ) | |
3454 | |
3455 if (print_nb_messages) nbe(see_variable(sample_treatment_df, "\n\n")) | |
3456 | |
3457 # reorder data | |
3458 my_order <- with(sample_treatment_df, order(class, sample)) | |
3459 quant_data <- quant_data[, my_order] | |
3460 sample_name_matches <- sample_name_matches[my_order] | |
3461 smpl_trt <- smpl_trt[my_order] | |
3462 sample_treatment_df <- data.frame( | |
3463 class = smpl_trt, | |
3464 sample = sample_name_matches | |
3465 ) | |
3466 | |
3467 # filter smpl_trt as appropriate | |
3468 if (sample_group_filter %in% c("include", "exclude")) { | |
3469 include_sample <- | |
3470 mgrepl( | |
3471 v = sample_group_filter_patterns, | |
3472 s = as.character(smpl_trt), | |
3473 fixed = sample_group_filter_fixed, | |
3474 perl = sample_group_filter_perl, | |
3475 ignore.case = sample_group_filter_nocase | |
3476 ) | |
3477 if (sum(include_sample) < 2) { | |
3478 errmsg <- | |
3479 paste( | |
3480 "ERROR:", | |
3481 sum(include_sample), | |
3482 "samples are too few for analysis;", | |
3483 "check input parameters for sample-name parsing" | |
3484 ) | |
3485 cat0( | |
3486 errmsg, | |
3487 "\\stepcounter{offset}\n", | |
3488 " in ", | |
3489 table_href(), | |
3490 ".\n\n" | |
3491 ) | |
3492 data_frame_tabbing_latex( | |
3493 x = save_sample_treatment_df, | |
3494 tabstops = c(1.25, 1.25), | |
3495 caption = "Sample classes", | |
3496 use_subsubsection_header = FALSE | |
3497 ) | |
3498 data_frame_tabbing_latex( | |
3499 x = | |
3500 param_df[ | |
3501 c("regexSampleNames", | |
3502 "regexSampleGrouping", | |
3503 "groupFilterPatterns", | |
3504 "groupFilter", | |
3505 "groupFilterMode" | |
3506 ), | |
3507 ], | |
3508 tabstops = c(1.75), | |
3509 underscore_whack = TRUE, | |
3510 caption = "Input parameters for sample-name parsing", | |
3511 verbatim = FALSE | |
3512 ) | |
3513 param_df_exit() | |
3514 knitr::knit_exit() | |
3515 return(invisible(-1)) | |
3516 } | |
3517 sample_treatment_df <- | |
3518 if (sample_group_filter == "include") | |
3519 sample_treatment_df[include_sample, ] | |
3520 else | |
3521 sample_treatment_df[!include_sample, ] | |
3522 } else { | |
3523 include_sample <- rep.int(TRUE, length(smpl_trt)) | |
3524 } | |
3525 sample_name_matches <- sample_treatment_df$sample | |
3526 rx_match <- regexpr(regex_sample_grouping, sample_name_matches, perl = TRUE) | |
3527 smpl_trt <- as.factor(regmatches(sample_name_matches, rx_match)) | |
3528 sample_treatment_df$class <- smpl_trt | |
3529 | |
3530 # filter quant_data as appropriate | |
1430 number_of_samples <- length(sample_name_matches) | 3531 number_of_samples <- length(sample_name_matches) |
1431 sample_treatment_df <- data.frame( | 3532 quant_data <- quant_data[, sample_name_matches] |
1432 class = sample_treatment_levels, | 3533 |
1433 sample = sample_name_matches | 3534 sample_level_integers <- as.integer(smpl_trt) |
1434 ) | 3535 sample_treatment_levels <- levels(smpl_trt) |
1435 # reorder data | 3536 count_of_treatment_levels <- length(sample_treatment_levels) |
1436 if (TRUE) { | 3537 |
1437 my_order <- with(sample_treatment_df, order(class, sample)) | 3538 # for each phosphopeptide, across treatment levels, compute minimum |
1438 quant_data <- quant_data[, my_order] | 3539 # count of observed (i.e., non-missing) values |
1439 sample_name_matches <- sample_name_matches[my_order] | 3540 my_env <- new_env() |
1440 sample_treatment_levels <- sample_treatment_levels[my_order] | 3541 for (l in sample_treatment_levels) |
1441 } | 3542 my_env[[as.character(l)]] <- |
1442 sample_treatment_df <- data.frame( | 3543 as.vector(rowSums(!is.na(quant_data[l == smpl_trt]))) |
1443 class = sample_treatment_levels, | 3544 min_group_obs_count <- row_apply( |
1444 sample = sample_name_matches | 3545 x = Reduce( |
1445 ) | 3546 f = function(l, r) cbind(l, my_env[[r]]), |
1446 data_frame_latex( | 3547 x = sample_treatment_levels, |
3548 init = c() | |
3549 ), | |
3550 fun = min | |
3551 ) | |
3552 names(min_group_obs_count) <- rownames(quant_data) | |
3553 rm(my_env) | |
3554 | |
3555 # display (possibly-filtered) results | |
3556 cat("\\newpage\n") | |
3557 | |
3558 if (sum(include_sample) > 1) { | |
3559 data_frame_tabbing_latex( | |
1447 x = sample_treatment_df, | 3560 x = sample_treatment_df, |
1448 justification = "rp{0.2\\linewidth} lp{0.3\\linewidth}", | 3561 tabstops = c(1.25), |
1449 centered = TRUE, | |
1450 caption = "Sample classes", | 3562 caption = "Sample classes", |
1451 anchor = const_table_anchor_tbp, | 3563 use_subsubsection_header = FALSE |
1452 underscore_whack = FALSE | 3564 ) |
1453 ) | 3565 } |
1454 sample_name_shrink <- 10 / (10 + max(nchar(sample_name_matches))) | 3566 sample_name_grow <- 10 / (10 + max(nchar(sample_name_matches) + 6)) |
3567 sample_colsep <- transition_positions(as.integer(sample_treatment_df$class)) | |
1455 ``` | 3568 ``` |
1456 | 3569 |
1457 ```{r echo = FALSE, results = 'asis'} | 3570 ```{r echo = FALSE, results = 'asis'} |
1458 cat("\\newpage\n") | 3571 cat("\\newpage\n") |
1459 ``` | 3572 ``` |
1460 | 3573 |
1461 ## Are the log-transformed sample distributions similar? | 3574 ## Are the log-transformed sample distributions similar? |
1462 | 3575 |
1463 ```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'} | 3576 ```{r echo = FALSE, fig.dim = c(9, 6.5), results = 'asis'} |
1464 | 3577 |
1465 quant_data[quant_data == 0] <- NA #replace 0 with NA | 3578 quant_data[quant_data == 0] <- NA #replace 0 with NA |
1466 quant_data_log <- log10(quant_data) | 3579 quant_data_log <- log10(quant_data) |
1467 | 3580 |
1468 rownames(quant_data_log) <- rownames(quant_data) | 3581 rownames(quant_data_log) <- rownames(quant_data) |
1469 colnames(quant_data_log) <- sample_name_matches | 3582 colnames(quant_data_log) <- sample_name_matches |
1470 | 3583 |
1471 write_debug_file(quant_data_log) | 3584 write_debug_file(quant_data_log) |
1472 | 3585 |
1473 # data visualization | 3586 g_ppep_distrib_ctl <- new_env() |
1474 old_par <- par( | 3587 g_ppep_distrib_ctl$xlab_line <- 3.5 + 11.86 * (0.67 - sample_name_grow) |
1475 mai = par("mai") + c(0.5, 0, 0, 0) | 3588 g_ppep_distrib_ctl$mai_bottom <- (0.5 + 3.95 * (0.67 - sample_name_grow)) |
1476 ) | 3589 g_ppep_distrib_ctl$axis <- (0.6 + 0.925 * (0.67 - sample_name_grow)) |
1477 # ref: https://r-charts.com/distribution/add-points-boxplot/ | 3590 |
1478 # Vertical plot | 3591 my_ppep_distrib_bxp <- function( |
1479 boxplot( | 3592 x |
1480 quant_data_log | 3593 , sample_name_grow = sample_name_grow |
1481 , las = 2 | 3594 , main |
1482 , cex.axis = 0.9 * sample_name_shrink | 3595 , varwidth = FALSE |
1483 , col = const_boxplot_fill | 3596 , sub = NULL |
1484 , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") | 3597 , xlab |
1485 , xlab = "Sample" | 3598 , ylab |
1486 ) | 3599 , col = const_boxplot_fill |
1487 par(old_par) | 3600 , notch = FALSE |
1488 | 3601 , ppep_distrib_ctl = g_ppep_distrib_ctl |
1489 | 3602 , ... |
1490 | 3603 ) { |
1491 cat("\n\n\n") | 3604 my_xlab_line <- g_ppep_distrib_ctl$xlab_line |
1492 cat("\n\n\n") | 3605 my_mai_bottom <- g_ppep_distrib_ctl$mai_bottom |
3606 my_axis <- g_ppep_distrib_ctl$axis | |
3607 | |
3608 if (print_trace_messages) { | |
3609 cat_variable(my_xlab_line, suffix = "; ") | |
3610 cat_variable(my_mai_bottom, suffix = "; ") | |
3611 cat_variable(my_axis, suffix = "\n\n") | |
3612 } | |
3613 | |
3614 old_par <- par( | |
3615 mai = par("mai") + c(my_mai_bottom, 0, 0, 0), | |
3616 cex.axis = my_axis, | |
3617 cex.lab = 1.2 | |
3618 ) | |
3619 tryCatch( | |
3620 { | |
3621 # Vertical plot | |
3622 boxplot( | |
3623 x | |
3624 , las = 2 | |
3625 , col = col | |
3626 , main = main | |
3627 , sub = NULL | |
3628 , ylab = ylab | |
3629 , xlab = NULL | |
3630 , notch = notch | |
3631 , varwidth = varwidth | |
3632 , ... | |
3633 ) | |
3634 title( | |
3635 sub = sub | |
3636 , cex.sub = 1.0 | |
3637 , line = my_xlab_line + 1 | |
3638 ) | |
3639 title( | |
3640 xlab = xlab | |
3641 , line = my_xlab_line | |
3642 ) | |
3643 }, | |
3644 finally = par(old_par) | |
3645 ) | |
3646 } | |
3647 | |
3648 my_ppep_distrib_bxp( | |
3649 x = quant_data_log | |
3650 , sample_name_grow = sample_name_grow | |
3651 , main = "Peptide intensities for each sample" | |
3652 , varwidth = boxplot_varwidth | |
3653 , sub = "Box widths reflect number of peptides for sample" | |
3654 , xlab = "Sample" | |
3655 , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") | |
3656 , col = const_boxplot_fill | |
3657 , notch = FALSE | |
3658 ) | |
3659 | |
3660 cat("\n\n\n\n") | |
1493 | 3661 |
1494 ``` | 3662 ``` |
1495 | 3663 |
1496 ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 4), results = 'asis'} | 3664 ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 4), results = 'asis'} |
1497 if (nrow(quant_data_log) > 1) { | 3665 if (nrow(quant_data_log) > 1) { |
1536 main = latex2exp::TeX("Frequency vs. $log_{10}$(peptide intensity)"), | 3704 main = latex2exp::TeX("Frequency vs. $log_{10}$(peptide intensity)"), |
1537 xlab = latex2exp::TeX("$log_{10}$(peptide intensity)") | 3705 xlab = latex2exp::TeX("$log_{10}$(peptide intensity)") |
1538 ) | 3706 ) |
1539 ``` | 3707 ``` |
1540 | 3708 |
3709 # Characterization of Input Data | |
3710 | |
1541 ## Distribution of standard deviations of $log_{10}(\text{intensity})$, ignoring missing values | 3711 ## Distribution of standard deviations of $log_{10}(\text{intensity})$, ignoring missing values |
1542 | 3712 |
1543 ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 5), results = 'asis'} | 3713 ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 5), results = 'asis'} |
1544 # determine quantile | 3714 # determine quantile |
1545 q1 <- quantile(logvalues, probs = mean_percentile)[1] | 3715 q1 <- quantile(logvalues, probs = mean_percentile)[1] |
1546 | 3716 |
1547 # 1 = row of matrix (ie, phosphopeptide) | 3717 # 1 = row of matrix (ie, phosphopeptide) |
1548 sds <- apply(quant_data_log, 1, sd_finite) | 3718 sds <- row_apply(quant_data_log, sd_finite) |
1549 if (sum(!is.na(sds)) > 2) { | 3719 if (sum(!is.na(sds)) > 2) { |
1550 plot( | 3720 plot( |
1551 density(sds, na.rm = TRUE) | 3721 density(sds, na.rm = TRUE) |
1552 , main = "Smoothed estimated probability density vs. std. deviation" | 3722 , main = "Smoothed estimated probability density vs. std. deviation" |
1553 , sub = "(probability estimation made with Gaussian smoothing)" | 3723 , sub = "(probability estimation made with Gaussian smoothing)" |
1577 ```{r echo = FALSE} | 3747 ```{r echo = FALSE} |
1578 | 3748 |
1579 # prep for trt-median based imputation | 3749 # prep for trt-median based imputation |
1580 | 3750 |
1581 ``` | 3751 ``` |
1582 # Impute Missing Values | 3752 # Imputation of Missing Values |
1583 | 3753 |
1584 ```{r echo = FALSE} | 3754 ```{r echo = FALSE} |
1585 | 3755 |
1586 imp_smry_pot_peptides_before <- nrow(quant_data_log) | 3756 imp_smry_pot_peptides_before <- nrow(quant_data_log) |
1587 imp_smry_missing_values_before <- number_to_impute | 3757 imp_smry_missing_values_before <- number_to_impute |
1608 quant_data_imp <- quant_data | 3778 quant_data_imp <- quant_data |
1609 imputation_method_description <- | 3779 imputation_method_description <- |
1610 paste("Substitute missing value with", | 3780 paste("Substitute missing value with", |
1611 "median peptide-intensity for sample group.\n" | 3781 "median peptide-intensity for sample group.\n" |
1612 ) | 3782 ) |
1613 sample_level_integers <- as.integer(sample_treatment_levels) | |
1614 # Take the accurate ln(x+1) because the data are log-normally distributed | 3783 # Take the accurate ln(x+1) because the data are log-normally distributed |
1615 # and because median can involve an average of two measurements. | 3784 # and because median can involve an average of two measurements. |
1616 quant_data_imp <- log1p(quant_data_imp) | 3785 quant_data_imp <- log1p(quant_data_imp) |
1617 for (i in seq_len(length(levels(sample_treatment_levels)))) { | 3786 for (i in seq_len(count_of_treatment_levels)) { |
1618 # Determine the columns for this factor-level | 3787 # Determine the columns for this factor-level |
1619 level_cols <- i == sample_level_integers | 3788 level_cols <- i == sample_level_integers |
1620 # Extract those columns | 3789 # Extract those columns |
1621 lvlsbst <- quant_data_imp[, level_cols, drop = FALSE] | 3790 lvlsbst <- quant_data_imp[, level_cols, drop = FALSE] |
1622 # assign to ind the row-column pairs corresponding to each NA | 3791 # assign to ind the row-column pairs corresponding to each NA |
1623 ind <- which(is.na(lvlsbst), arr.ind = TRUE) | 3792 ind <- which(is.na(lvlsbst), arr.ind = TRUE) |
1624 # No group-median exists if there is only one sample | 3793 # No group-median exists if there is only one sample |
1625 # a given ppep has no measurement; otherwise, proceed. | 3794 # a given ppep has no measurement; otherwise, proceed. |
1626 if (ncol(lvlsbst) > 1) { | 3795 if (ncol(lvlsbst) > 1) { |
1627 the_centers <- | 3796 the_centers <- |
1628 apply(lvlsbst, 1, median, na.rm = TRUE) | 3797 row_apply(lvlsbst, median, na.rm = TRUE) |
1629 for (j in seq_len(nrow(lvlsbst))) { | 3798 for (j in seq_len(nrow(lvlsbst))) { |
1630 for (k in seq_len(ncol(lvlsbst))) { | 3799 for (k in seq_len(ncol(lvlsbst))) { |
1631 if (is.na(lvlsbst[j, k])) { | 3800 if (is.na(lvlsbst[j, k])) { |
1632 lvlsbst[j, k] <- the_centers[j] | 3801 lvlsbst[j, k] <- the_centers[j] |
1633 } | 3802 } |
1647 "median peptide-intensity across all sample classes.\n" | 3816 "median peptide-intensity across all sample classes.\n" |
1648 ) | 3817 ) |
1649 # Take the accurate ln(x+1) because the data are log-normally distributed | 3818 # Take the accurate ln(x+1) because the data are log-normally distributed |
1650 # and because median can involve an average of two measurements. | 3819 # and because median can involve an average of two measurements. |
1651 quant_data_imp <- log1p(quant_data_imp) | 3820 quant_data_imp <- log1p(quant_data_imp) |
1652 quant_data_imp[ind] <- apply(quant_data_imp, 1, median, na.rm = TRUE)[ind[, 1]] | 3821 quant_data_imp[ind] <- row_apply(quant_data_imp, median, na.rm = TRUE)[ind[, 1]] |
1653 # Take the accurate e^x - 1 to match scaling of original input. | 3822 # Take the accurate e^x - 1 to match scaling of original input. |
1654 quant_data_imp <- round(expm1(quant_data_imp_ln <- quant_data_imp)) | 3823 quant_data_imp <- round(expm1(quant_data_imp_ln <- quant_data_imp)) |
1655 good_rows <- !is.nan(rowMeans(quant_data_imp)) | 3824 good_rows <- !is.nan(rowMeans(quant_data_imp)) |
1656 } | 3825 } |
1657 , "mean" = { | 3826 , "mean" = { |
1663 # Take the accurate ln(x+1) because the data are log-normally distributed, | 3832 # Take the accurate ln(x+1) because the data are log-normally distributed, |
1664 # so arguments to mean should be previously transformed. | 3833 # so arguments to mean should be previously transformed. |
1665 # this will have to be | 3834 # this will have to be |
1666 quant_data_imp <- log1p(quant_data_imp) | 3835 quant_data_imp <- log1p(quant_data_imp) |
1667 # Assign to NA cells the mean for the row | 3836 # Assign to NA cells the mean for the row |
1668 quant_data_imp[ind] <- apply(quant_data_imp, 1, mean, na.rm = TRUE)[ind[, 1]] | 3837 quant_data_imp[ind] <- row_apply(quant_data_imp, mean, na.rm = TRUE)[ind[, 1]] |
1669 # Take the accurate e^x - 1 to match scaling of original input. | 3838 # Take the accurate e^x - 1 to match scaling of original input. |
1670 quant_data_imp <- round(expm1(quant_data_imp_ln <- quant_data_imp)) | 3839 quant_data_imp <- round(expm1(quant_data_imp_ln <- quant_data_imp)) |
1671 good_rows <- !is.nan(rowMeans(quant_data_imp)) | 3840 good_rows <- !is.nan(rowMeans(quant_data_imp)) |
1672 } | 3841 } |
1673 , "random" = { | 3842 , "random" = { |
1707 ```{r echo = FALSE} | 3876 ```{r echo = FALSE} |
1708 | 3877 |
1709 imp_smry_pot_peptides_after <- sum(good_rows) | 3878 imp_smry_pot_peptides_after <- sum(good_rows) |
1710 imp_smry_rejected_after <- sum(!good_rows) | 3879 imp_smry_rejected_after <- sum(!good_rows) |
1711 imp_smry_missing_values_after <- sum(is.na(quant_data_imp[good_rows, ])) | 3880 imp_smry_missing_values_after <- sum(is.na(quant_data_imp[good_rows, ])) |
3881 | |
3882 # From ?`%in%`, %in% is currently defined as function(x, table) match(x, table, nomatch = 0) > 0 | |
3883 | |
3884 sink(stderr()) | |
3885 print("`%in%`:") | |
3886 print(`%in%`) | |
3887 sink() | |
3888 | |
3889 stock_in <- | |
3890 names(good_rows) %in% | |
3891 names(min_group_obs_count[g_intensity_min_per_class <= min_group_obs_count]) | |
3892 if (print_nb_messages) nbe(see_variable(stock_in), "\n") | |
3893 | |
3894 explicit_in <- | |
3895 0 < match( | |
3896 names(good_rows), | |
3897 names(min_group_obs_count[g_intensity_min_per_class <= min_group_obs_count]) | |
3898 ) | |
3899 if (print_nb_messages) nbe(see_variable(explicit_in), "\n") | |
3900 | |
3901 great_enough_row_names <- good_rows[ | |
3902 names(good_rows) %in% | |
3903 names(min_group_obs_count[g_intensity_min_per_class <= min_group_obs_count]) | |
3904 ] | |
3905 if (print_nb_messages) nbe(see_variable(great_enough_row_names), "\n") | |
3906 great_enough_row_names <- great_enough_row_names[great_enough_row_names] | |
3907 if (print_nb_messages) nbe(see_variable(great_enough_row_names), "\n") | |
1712 ``` | 3908 ``` |
3909 | |
1713 ```{r echo = FALSE, results = 'asis'} | 3910 ```{r echo = FALSE, results = 'asis'} |
1714 # ref: http://www1.maths.leeds.ac.uk/latex/TableHelp1.pdf | 3911 # ref: http://www1.maths.leeds.ac.uk/latex/TableHelp1.pdf |
1715 tabular_lines_fmt <- paste( | 3912 tabular_lines_fmt <- paste( |
1716 "\\begin{table}[hb]", # h(inline); b(bottom); t (top) or p (separate page) | 3913 "\\begin{table}[hb]", # h(inline); b(bottom); t (top) or p (separate page) |
3914 " \\leavevmode", | |
1717 " \\caption{Imputation Results}", | 3915 " \\caption{Imputation Results}", |
1718 " \\centering", # \centering centers the table on the page | 3916 " \\centering", # \centering centers the table on the page |
1719 " \\begin{tabular}{l c c c}", | 3917 " \\begin{tabular}{l c c c}", |
1720 " \\hline\\hline", | 3918 " \\hline\\hline", |
1721 " \\ & potential peptides & missing values & rejected", | 3919 " \\ & potential peptides & missing values & rejected", |
1722 " peptides \\\\ [0.5ex]", | 3920 " peptides \\\\ [0.5ex]", |
1723 " \\hline", | 3921 " \\hline", |
1724 " before imputation & %d & %d (%d\\%s) & \\\\", | 3922 " before imputation & %d & %d (%d\\%s) & \\\\", |
1725 " after imputation & %d & %d & %d \\\\ [1ex]", | 3923 " after imputation & %d & %d & %d \\\\", |
3924 " after keep comparable & %d & & %d \\\\ [1ex]", | |
1726 " \\hline", | 3925 " \\hline", |
1727 " \\end{tabular}", | 3926 " \\end{tabular}", |
1728 #" \\label{table:nonlin}", # may be used to refer this table in the text | 3927 #" \\label{table:nonlin}", # may be used to refer this table in the text |
1729 "\\end{table}", | 3928 "\\end{table}", |
1730 sep = "\n" | 3929 sep = "\n" |
1736 imp_smry_missing_values_before, | 3935 imp_smry_missing_values_before, |
1737 imp_smry_pct_missing, | 3936 imp_smry_pct_missing, |
1738 "%", | 3937 "%", |
1739 imp_smry_pot_peptides_after, | 3938 imp_smry_pot_peptides_after, |
1740 imp_smry_missing_values_after, | 3939 imp_smry_missing_values_after, |
1741 imp_smry_rejected_after | 3940 imp_smry_rejected_after, |
3941 length(great_enough_row_names), | |
3942 imp_smry_pot_peptides_before - | |
3943 length(great_enough_row_names) | |
1742 ) | 3944 ) |
1743 cat(tabular_lines) | 3945 cat(tabular_lines) |
1744 ``` | 3946 ``` |
1745 ```{r echo = FALSE} | 3947 |
1746 | 3948 ```{r filter_good_rows, echo = FALSE} |
1747 | 3949 |
1748 # Zap rows where imputation was ineffective | 3950 if (print_nb_messages) nbe("before name extraction, ", see_variable(length(good_rows)), " ", see_variable(good_rows), "\n") |
3951 good_rows <- names(good_rows[names(great_enough_row_names)]) | |
3952 if (print_nb_messages) nbe("after name extraction, ", see_variable(length(good_rows)), see_variable(good_rows), "\n") | |
3953 | |
3954 #ACE min_group_obs_count <- min_group_obs_count[names(great_enough_row_names)] | |
3955 #ACE nbe("good_rows") | |
3956 #ACE nbe(see_variable(good_rows)) | |
3957 #ACE nbe("names(min_group_obs_count) before filter for good rows") | |
3958 #ACE nbe(see_variable(names(min_group_obs_count))) | |
3959 min_group_obs_count <- min_group_obs_count[good_rows] | |
3960 #ACE nbe("min_group_obs_count after filter for good rows") | |
3961 #ACE nbe(see_variable(names(min_group_obs_count))) | |
3962 | |
3963 # Zap rows where imputation was insufficiently effective | |
1749 full_data <- full_data [good_rows, ] | 3964 full_data <- full_data [good_rows, ] |
1750 quant_data <- quant_data [good_rows, ] | 3965 quant_data <- quant_data [good_rows, ] |
1751 | 3966 quant_data_log <- quant_data_log [good_rows, ] |
3967 | |
3968 if (print_nb_messages) nbe("before row filter, ", see_variable(nrow(quant_data_imp)), "\n") | |
1752 quant_data_imp <- quant_data_imp[good_rows, ] | 3969 quant_data_imp <- quant_data_imp[good_rows, ] |
3970 if (print_nb_messages) nbe("after row filter, ", see_variable(nrow(quant_data_imp)), "\n") | |
1753 write_debug_file(quant_data_imp) | 3971 write_debug_file(quant_data_imp) |
1754 quant_data_imp_good_rows <- quant_data_imp | 3972 quant_data_imp_good_rows <- quant_data_imp |
1755 | 3973 |
1756 write_debug_file(quant_data_imp_good_rows) | 3974 write_debug_file(quant_data_imp_good_rows) |
1757 ``` | 3975 ``` |
1801 d_imputed <- d_combined | 4019 d_imputed <- d_combined |
1802 } | 4020 } |
1803 | 4021 |
1804 ``` | 4022 ``` |
1805 | 4023 |
1806 ```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'} | 4024 ```{r echo = FALSE, fig.dim = c(9, 6.5), results = 'asis'} |
1807 zero_sd_rownames <- | 4025 zero_sd_rownames <- |
1808 rownames(quant_data_imp)[ | 4026 rownames(quant_data_imp)[ |
1809 is.na((apply(quant_data_imp, 1, sd, na.rm = TRUE)) == 0) | 4027 is.na((row_apply(quant_data_imp, sd, na.rm = TRUE)) == 0) |
1810 ] | 4028 ] |
1811 | 4029 |
1812 if (length(zero_sd_rownames) >= nrow(quant_data_imp)) { | 4030 if (length(zero_sd_rownames) >= nrow(quant_data_imp)) { |
1813 stop("All peptides have zero standard deviation. Cannot continue.") | 4031 cat("All peptides have zero standard deviation. Cannot continue.") |
4032 param_df_exit() | |
4033 knitr::knit_exit() | |
1814 } | 4034 } |
1815 if (length(zero_sd_rownames) > 0) { | 4035 if (length(zero_sd_rownames) > 0) { |
1816 cat( | 4036 cat( |
1817 sprintf("%d peptides with zero variance were removed from statistical consideration", | 4037 sprintf( |
1818 length(zero_sd_rownames) | 4038 "%d %s %s", |
4039 length(zero_sd_rownames), | |
4040 "peptides with zero variance", | |
4041 "were removed from statistical consideration" | |
1819 ) | 4042 ) |
1820 ) | 4043 ) |
1821 zap_named_rows <- function(df, nms) { | 4044 zap_named_rows <- function(df, nms) { |
1822 return(df[!(row.names(df) %in% nms), ]) | 4045 return(df[!(row.names(df) %in% nms), ]) |
1823 } | 4046 } |
1824 quant_data_imp <- zap_named_rows(quant_data_imp, zero_sd_rownames) | 4047 quant_data_imp <- |
1825 quant_data <- zap_named_rows(quant_data, zero_sd_rownames) | 4048 zap_named_rows(quant_data_imp, zero_sd_rownames) |
1826 full_data <- zap_named_rows(full_data, zero_sd_rownames) | 4049 quant_data <- |
4050 zap_named_rows(quant_data, zero_sd_rownames) | |
4051 full_data <- | |
4052 zap_named_rows(full_data, zero_sd_rownames) | |
4053 min_group_obs_count <- | |
4054 min_group_obs_count[names(min_group_obs_count) %notin% zero_sd_rownames] | |
1827 } | 4055 } |
1828 | 4056 |
1829 if (sum(is.na(quant_data)) > 0) { | 4057 if (sum(is.na(quant_data)) > 0) { |
1830 cat("\\leavevmode\\newpage\n") | 4058 cat("\\leavevmode\\newpage\n") |
1831 # data visualization | |
1832 old_par <- par( | |
1833 mai = par("mai") + c(0.5, 0, 0, 0) | |
1834 ) | |
1835 # Copy quant data to x | 4059 # Copy quant data to x |
1836 x <- quant_data | 4060 x <- quant_data |
1837 # x gets to have values of: | 4061 # x gets to have values of: |
1838 # - NA for observed values | 4062 # - NA for observed values |
1839 # - 1 for missing values | 4063 # - 1 for missing values |
1858 max(red_dots, blue_dots, na.rm = TRUE) | 4082 max(red_dots, blue_dots, na.rm = TRUE) |
1859 ) | 4083 ) |
1860 show_stripchart <- | 4084 show_stripchart <- |
1861 50 > (count_red + count_blue) / length(sample_name_matches) | 4085 50 > (count_red + count_blue) / length(sample_name_matches) |
1862 if (show_stripchart) { | 4086 if (show_stripchart) { |
1863 boxplot_sub <- "Light blue = data before imputation; Red = imputed data" | 4087 boxplot_sub <- "Light blue = data before imputation; Red = imputed data;" |
1864 } else { | 4088 } else { |
1865 boxplot_sub <- "" | 4089 boxplot_sub <- "" |
1866 } | 4090 } |
1867 | 4091 |
1868 # Vertical plot | 4092 # Vertical plot |
1869 colnames(blue_dots) <- sample_name_matches | 4093 colnames(blue_dots) <- sample_name_matches |
1870 boxplot( | 4094 my_ppep_distrib_bxp( |
1871 blue_dots | 4095 x = blue_dots |
1872 , las = 2 # "always vertical" | 4096 , sample_name_grow = sample_name_grow |
1873 , cex.axis = 0.9 * sample_name_shrink | |
1874 , col = const_boxplot_fill | |
1875 , ylim = ylim | |
1876 , main = "Peptide intensities after eliminating unusable peptides" | 4097 , main = "Peptide intensities after eliminating unusable peptides" |
1877 , sub = boxplot_sub | 4098 , varwidth = boxplot_varwidth |
4099 , sub = paste(boxplot_sub, "Box widths reflect number of peptides for sample") | |
1878 , xlab = "Sample" | 4100 , xlab = "Sample" |
1879 , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") | 4101 , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") |
4102 , col = const_boxplot_fill | |
4103 , notch = FALSE | |
4104 , ylim = ylim | |
1880 ) | 4105 ) |
1881 | 4106 |
1882 if (show_stripchart) { | 4107 if (show_stripchart) { |
1883 # Points | 4108 # Points |
1884 # ref: https://r-charts.com/distribution/add-points-boxplot/ | 4109 # ref: https://r-charts.com/distribution/add-points-boxplot/ |
1886 stripchart( | 4111 stripchart( |
1887 blue_dots, # Data | 4112 blue_dots, # Data |
1888 method = "jitter", # Random noise | 4113 method = "jitter", # Random noise |
1889 jitter = const_stripchart_jitter, | 4114 jitter = const_stripchart_jitter, |
1890 pch = 19, # Pch symbols | 4115 pch = 19, # Pch symbols |
1891 cex = const_stripsmall_cex, # Size of symbols reduced | 4116 cex = const_stripchart_cex, # Size of symbols reduced |
1892 col = "lightblue", # Color of the symbol | 4117 col = "lightblue", # Color of the symbol |
1893 vertical = TRUE, # Vertical mode | 4118 vertical = TRUE, # Vertical mode |
1894 add = TRUE # Add it over | 4119 add = TRUE # Add it over |
1895 ) | 4120 ) |
1896 stripchart( | 4121 stripchart( |
1897 red_dots, # Data | 4122 red_dots, # Data |
1898 method = "jitter", # Random noise | 4123 method = "jitter", # Random noise |
1899 jitter = const_stripchart_jitter, | 4124 jitter = const_stripchart_jitter, |
1900 pch = 19, # Pch symbols | 4125 pch = 19, # Pch symbols |
1901 cex = const_stripsmall_cex, # Size of symbols reduced | 4126 cex = const_stripchart_cex, # Size of symbols reduced |
1902 col = "red", # Color of the symbol | 4127 col = "red", # Color of the symbol |
1903 vertical = TRUE, # Vertical mode | 4128 vertical = TRUE, # Vertical mode |
1904 add = TRUE # Add it over | 4129 add = TRUE # Add it over |
1905 ) | 4130 ) |
1906 | 4131 |
1907 } | 4132 } |
1908 if (TRUE) { | 4133 } |
1909 # show measured values in blue on left half-violin plot | 4134 ``` |
1910 cat("\\leavevmode\n\\quad\n\n\\quad\n\n") | 4135 |
1911 vioplot::vioplot( | 4136 ```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'} |
1912 x = lapply(blue_dots, function(x) x[!is.na(x)]), | 4137 if (sum(is.na(quant_data)) > 0) { |
1913 col = "lightblue1", | 4138 # show measured values in blue on left half-violin plot |
1914 side = "left", | 4139 cat("\\leavevmode\n\\quad\n\n\\quad\n\n") |
1915 plotCentre = "line", | 4140 old_par <- par( |
1916 ylim = ylim_save, | 4141 mai = par("mai") + c(g_ppep_distrib_ctl$mai_bottom, 0, 0, 0), |
1917 main = "Distributions of observed and imputed data", | 4142 cex.axis = g_ppep_distrib_ctl$axis, |
1918 sub = "Light blue = observed data; Pink = imputed data", | 4143 cex.lab = 1.2 |
1919 las = 2, | 4144 ) |
1920 cex.axis = 0.9 * sample_name_shrink, | 4145 tryCatch( |
1921 xlab = "Sample", | 4146 { |
1922 ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") | 4147 vioplot::vioplot( |
1923 ) | 4148 x = lapply(blue_dots, function(x) x[!is.na(x)]), |
1924 red_violins <- lapply(red_dots, function(x) x[!is.na(x)]) | 4149 col = "lightblue1", |
1925 cols_to_delete <- c() | 4150 side = "left", |
1926 for (ix in seq_len(length(red_violins))) { | 4151 plotCentre = "line", |
1927 if (length(red_violins[[ix]]) < 1) { | 4152 ylim = ylim_save, |
1928 cols_to_delete <- c(cols_to_delete, ix) | 4153 main = "Distributions of observed and imputed data", |
4154 sub = NULL, | |
4155 las = 2, | |
4156 xlab = NULL, | |
4157 ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") | |
4158 ) | |
4159 title( | |
4160 sub = "Light blue = observed data; Pink = imputed data", | |
4161 cex.sub = 1.0, | |
4162 line = g_ppep_distrib_ctl$xlab_line + 1 | |
4163 ) | |
4164 title( | |
4165 xlab = "Sample", | |
4166 line = g_ppep_distrib_ctl$xlab_line | |
4167 ) | |
4168 red_violins <- lapply(red_dots, function(x) x[!is.na(x)]) | |
4169 cols_to_delete <- c() | |
4170 for (ix in seq_len(length(red_violins))) { | |
4171 if (length(red_violins[[ix]]) < 1) { | |
4172 cols_to_delete <- c(cols_to_delete, ix) | |
4173 } | |
1929 } | 4174 } |
1930 } | 4175 # destroy any unimputable columns |
1931 # destroy any unimputable columns | 4176 if (!is.null(cols_to_delete)) { |
1932 if (!is.null(cols_to_delete)) { | 4177 red_violins <- red_violins[-cols_to_delete] |
1933 red_violins <- red_violins[-cols_to_delete] | 4178 } |
1934 } | 4179 # plot imputed values in red on right half-violin plot |
1935 # plot imputed values in red on right half-violin plot | 4180 vioplot::vioplot( |
1936 vioplot::vioplot( | 4181 x = red_violins, |
1937 x = red_violins, | 4182 col = "lightpink1", |
1938 col = "lightpink1", | 4183 side = "right", |
1939 side = "right", | 4184 plotCentre = "line", |
1940 plotCentre = "line", | 4185 add = TRUE |
1941 add = TRUE | 4186 ) |
1942 ) | 4187 |
1943 } | 4188 }, |
1944 | 4189 finally = par(old_par) |
1945 par(old_par) | 4190 ) |
1946 | 4191 |
1947 # density plot | 4192 # density plot |
1948 cat("\\leavevmode\n\n\n\n\n\n\n") | 4193 cat("\\leavevmode\n\n\n\n\n\n\n") |
1949 if (can_plot_before_after_imp) { | 4194 if (can_plot_before_after_imp) { |
1950 ylim <- c( | 4195 ylim <- c( |
1978 } | 4223 } |
1979 cat("\\leavevmode\\newpage\n") | 4224 cat("\\leavevmode\\newpage\n") |
1980 } | 4225 } |
1981 ``` | 4226 ``` |
1982 | 4227 |
1983 # Perform Quantile Normalization | 4228 # Quantile Normalization |
1984 | 4229 |
1985 The excellent `normalize.quantiles` function from | 4230 The excellent `normalize.quantiles` function from |
1986 *[the `preprocessCore` Bioconductor package](http://bioconductor.org/packages/release/bioc/html/preprocessCore.html)* | 4231 *[the `preprocessCore` Bioconductor package](http://bioconductor.org/packages/release/bioc/html/preprocessCore.html)* |
1987 performs "quantile normalization" as described Bolstad *et al.* (2003), | 4232 performs "quantile normalization" as described Bolstad *et al.* (2003), |
1988 DOI *[10.1093/bioinformatics/19.2.185](https://doi.org/10.1093%2Fbioinformatics%2F19.2.185)* | 4233 DOI *[10.1093/bioinformatics/19.2.185](https://doi.org/10.1093%2Fbioinformatics%2F19.2.185)* |
1989 and *its supplementary material [http://bmbolstad.com/misc/normalize/normalize.html](http://bmbolstad.com/misc/normalize/normalize.html)*, | 4234 and its supplementary material [http://bmbolstad.com/misc/normalize/normalize.html](http://bmbolstad.com/misc/normalize/normalize.html), |
1990 i.e., it assumes that the goal is to detect | 4235 i.e., it assumes that the goal is to detect |
1991 subtle differences among grossly similar samples (having similar distributions) | 4236 subtle differences among grossly similar samples (having similar distributions) |
1992 by equailzing intra-quantile quantitations. | 4237 by equalizing intra-quantile quantitations^[Unfortunately, |
1993 Unfortunately, one software library upon which it depends | 4238 one software library upon which `preprocessCore` depends |
1994 *[suffers from a concurrency defect](https://support.bioconductor.org/p/122925/#9135989)* | 4239 *[suffers from a concurrency defect](https://support.bioconductor.org/p/122925/#9135989)* |
1995 that requires that a specific, non-concurrent version of the library be | 4240 that requires that a specific, non-concurrent version of the library (`openblas` version $0.3.3$) be |
1996 installed. The installation command equivalent to what was used to install the library to produce the results presented here is: | 4241 installed. The installation command equivalent to what was used to install the library to produce the results presented here is: |
1997 ``` | 4242 \linebreak |
1998 conda install bioconductor-preprocesscore openblas=0.3.3 | 4243 ` conda install bioconductor-preprocesscore openblas=0.3.3`]. |
1999 ``` | |
2000 | 4244 |
2001 | 4245 |
2002 <!-- | 4246 <!-- |
2003 # Apply quantile normalization using preprocessCore::normalize.quantiles | 4247 # Apply quantile normalization using preprocessCore::normalize.quantiles |
2004 # --- | 4248 # --- |
2005 # tool repository: http://bioconductor.org/packages/release/bioc/html/preprocessCore.html | 4249 # tool repository: http://bioconductor.org/packages/release/bioc/html/preprocessCore.html |
2006 # except this: https://support.bioconductor.org/p/122925/#9135989 | 4250 # except this: https://support.bioconductor.org/p/122925/#9135989 |
2007 # says to install it like this: | 4251 # says to install it like this: |
2008 # ``` | 4252 # ``` |
2009 # BiocManager::install("preprocessCore", configure.args="--disable-threading", force = TRUE, lib=.libPaths()[1]) | 4253 # BiocManager::install("preprocessCore", configure.args="--disable-threading", force = TRUE, lib=.libPaths()[1]) |
2010 # ``` | 4254 # ``` |
2011 # conda installation (necessary because of a bug in recent openblas): | 4255 # conda installation (necessary because of a bug in recent openblas): |
2012 # conda install bioconductor-preprocesscore openblas=0.3.3 | 4256 # conda install bioconductor-preprocesscore openblas=0.3.3 |
2013 # ... | 4257 # ... |
2014 # --- | 4258 # --- |
2015 # normalize.quantiles {preprocessCore} -- Quantile Normalization | 4259 # normalize.quantiles {preprocessCore} -- Quantile Normalization |
2016 # | 4260 # |
2017 # Description: | 4261 # Description: |
2018 # Using a normalization based upon quantiles, this function normalizes a | 4262 # Using a normalization based upon quantiles, this function normalizes a |
2019 # matrix of probe level intensities. | 4263 # matrix of probe level intensities. |
2020 # | 4264 # |
2021 # THIS FUNCTIONS WILL HANDLE MISSING DATA (ie NA values), based on the | 4265 # THIS FUNCTIONS WILL HANDLE MISSING DATA (ie NA values), based on the |
2022 # assumption that the data is missing at random. | 4266 # assumption that the data is missing at random. |
2023 # | 4267 # |
2024 # Usage: | 4268 # Usage: |
2025 # normalize.quantiles(x, copy = TRUE, keep.names = FALSE) | 4269 # normalize.quantiles(x, copy = TRUE, keep.names = FALSE) |
2026 # | 4270 # |
2027 # Arguments: | 4271 # Arguments: |
2028 # | 4272 # |
2029 # - x: A matrix of intensities where each column corresponds to a chip and each row is a probe. | 4273 # - x: A matrix of intensities where each column corresponds to a chip and each row is a probe. |
2030 # | 4274 # |
2031 # - copy: Make a copy of matrix before normalizing. Usually safer to work with a copy, | 4275 # - copy: Make a copy of matrix before normalizing. Usually safer to work with a copy, |
2032 # but in certain situations not making a copy of the matrix, but instead normalizing | 4276 # but in certain situations not making a copy of the matrix, but instead normalizing |
2033 # it in place will be more memory friendly. | 4277 # it in place will be more memory friendly. |
2034 # | 4278 # |
2035 # - keep.names: Boolean option to preserve matrix row and column names in output. | 4279 # - keep.names: Boolean option to preserve matrix row and column names in output. |
2036 # | 4280 # |
2037 # Details: | 4281 # Details: |
2038 # This method is based upon the concept of a quantile-quantile plot extended to n dimensions. | 4282 # This method is based upon the concept of a quantile-quantile plot extended to n dimensions. |
2039 # No special allowances are made for outliers. If you make use of quantile normalization | 4283 # No special allowances are made for outliers. If you make use of quantile normalization |
2040 # please cite Bolstad et al, Bioinformatics (2003). | 4284 # please cite Bolstad et al, Bioinformatics (2003). |
2041 # | 4285 # |
2042 # This functions will handle missing data (ie NA values), based on | 4286 # This functions will handle missing data (ie NA values), based on |
2043 # the assumption that the data is missing at random. | 4287 # the assumption that the data is missing at random. |
2044 # | 4288 # |
2045 # Note that the current implementation optimizes for better memory usage | 4289 # Note that the current implementation optimizes for better memory usage |
2046 # at the cost of some additional run-time. | 4290 # at the cost of some additional run-time. |
2047 # | 4291 # |
2048 # Value: A normalized matrix. | 4292 # Value: A normalized matrix. |
2049 # | 4293 # |
2050 # Author: Ben Bolstad, bmbolstad.com | 4294 # Author: Ben Bolstad, bmbolstad.com |
2051 # | 4295 # |
2052 # References | 4296 # References |
2053 # | 4297 # |
2054 # - Bolstad, B (2001) Probe Level Quantile Normalization of High Density Oligonucleotide | 4298 # - Bolstad, B (2001) Probe Level Quantile Normalization of High Density Oligonucleotide |
2055 # Array Data. Unpublished manuscript http://bmbolstad.com/stuff/qnorm.pdf | 4299 # Array Data. Unpublished manuscript http://bmbolstad.com/stuff/qnorm.pdf |
2056 # | 4300 # |
2057 # - Bolstad, B. M., Irizarry R. A., Astrand, M, and Speed, T. P. (2003) A Comparison of | 4301 # - Bolstad, B. M., Irizarry R. A., Astrand, M, and Speed, T. P. (2003) A Comparison of |
2058 # Normalization Methods for High Density Oligonucleotide Array Data Based on Bias | 4302 # Normalization Methods for High Density Oligonucleotide Array Data Based on Bias |
2059 # and Variance. Bioinformatics 19(2), pp 185-193. DOI 10.1093/bioinformatics/19.2.185 | 4303 # and Variance. Bioinformatics 19(2), pp 185-193. DOI 10.1093/bioinformatics/19.2.185 |
2060 # http://bmbolstad.com/misc/normalize/normalize.html | 4304 # http://bmbolstad.com/misc/normalize/normalize.html |
2061 # ... | 4305 # ... |
2062 --> | 4306 --> |
2063 ```{r echo = FALSE, results = 'asis'} | 4307 ```{r echo = FALSE, results = 'asis'} |
2064 | 4308 |
4309 if (print_nb_messages) nbe(see_variable(nrow(quant_data_imp)), "\n") | |
2065 if (nrow(quant_data_imp) > 0) { | 4310 if (nrow(quant_data_imp) > 0) { |
2066 quant_data_imp_qn <- preprocessCore::normalize.quantiles( | 4311 quant_data_imp_qn <- preprocessCore::normalize.quantiles( |
2067 as.matrix(quant_data_imp), keep.names = TRUE | 4312 as.matrix(quant_data_imp), keep.names = TRUE |
2068 ) | 4313 ) |
2069 } else { | 4314 } else { |
2070 quant_data_imp_qn <- as.matrix(quant_data_imp) | 4315 quant_data_imp_qn <- as.matrix(quant_data_imp) |
2071 } | 4316 } |
2072 | 4317 |
4318 if (print_nb_messages) nbe(see_variable(nrow(quant_data_imp_qn)), "\n") | |
4319 | |
2073 quant_data_imp_qn <- as.data.frame(quant_data_imp_qn) | 4320 quant_data_imp_qn <- as.data.frame(quant_data_imp_qn) |
2074 | |
2075 write_debug_file(quant_data_imp_qn) | 4321 write_debug_file(quant_data_imp_qn) |
2076 | 4322 |
2077 quant_data_imp_qn_log <- log10(quant_data_imp_qn) | 4323 quant_data_imp_qn_log <- log10(quant_data_imp_qn) |
2078 | |
2079 write_debug_file(quant_data_imp_qn_log) | 4324 write_debug_file(quant_data_imp_qn_log) |
2080 | 4325 |
4326 if (print_nb_messages) nbe(see_variable(nrow(quant_data_imp_qn_log)), "\n") | |
4327 if (print_nb_messages) nbe(see_variable(ncol(quant_data_imp_qn_log)), "\n") | |
4328 | |
2081 quant_data_imp_qn_ls <- t(scale(t(log10(quant_data_imp_qn)))) | 4329 quant_data_imp_qn_ls <- t(scale(t(log10(quant_data_imp_qn)))) |
2082 | 4330 |
2083 sel <- apply(quant_data_imp_qn_ls, 1, any_nan) | 4331 sel <- row_apply(quant_data_imp_qn_ls, any_nan) |
2084 quant_data_imp_qn_ls2 <- quant_data_imp_qn_ls | 4332 quant_data_imp_qn_ls2 <- quant_data_imp_qn_ls |
2085 | 4333 |
2086 quant_data_imp_qn_ls2 <- quant_data_imp_qn_ls2[which(sel), ] | 4334 quant_data_imp_qn_ls2 <- quant_data_imp_qn_ls2[which(sel), ] |
2087 quant_data_imp_qn_ls2 <- as.data.frame(quant_data_imp_qn_ls2) | 4335 quant_data_imp_qn_ls2 <- as.data.frame(quant_data_imp_qn_ls2) |
2088 | 4336 |
2093 | 4341 |
2094 # Create data.frame used by ANOVA analysis | 4342 # Create data.frame used by ANOVA analysis |
2095 data_table_imp_qn_lt <- cbind(full_data[1:9], quant_data_imp_qn_log) | 4343 data_table_imp_qn_lt <- cbind(full_data[1:9], quant_data_imp_qn_log) |
2096 ``` | 4344 ``` |
2097 | 4345 |
2098 <!-- ACE insertion begin --> | |
2099 ## Are normalized, imputed, log-transformed sample distributions similar? | 4346 ## Are normalized, imputed, log-transformed sample distributions similar? |
2100 | 4347 |
2101 ```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'} | 4348 ```{r echo = FALSE, fig.dim = c(9, 6.5), results = 'asis'} |
2102 | 4349 |
2103 # Save unimputed quant_data_log for plotting below | 4350 # Save unimputed quant_data_log for plotting below |
2104 unimputed_quant_data_log <- quant_data_log | 4351 unimputed_quant_data_log <- quant_data_log |
2105 | 4352 |
2106 # log10 transform (after preparing for zero values, | 4353 # log10 transform (after preparing for zero values, |
2119 ) | 4366 ) |
2120 cat("\n\n\n") | 4367 cat("\n\n\n") |
2121 | 4368 |
2122 | 4369 |
2123 # data visualization | 4370 # data visualization |
4371 if (TRUE) { | |
4372 | |
4373 my_ppep_distrib_bxp( | |
4374 x = quant_data_log | |
4375 , sample_name_grow = sample_name_grow | |
4376 , main = "Peptide intensities for each sample" | |
4377 , varwidth = boxplot_varwidth | |
4378 , sub = NULL | |
4379 , xlab = "Sample" | |
4380 , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") | |
4381 , col = const_boxplot_fill | |
4382 , notch = boxplot_notch | |
4383 ) | |
4384 | |
4385 } else { | |
4386 | |
2124 old_par <- par( | 4387 old_par <- par( |
2125 mai = par("mai") + c(0.5, 0, 0, 0) | 4388 mai = par("mai") + c(0.5, 0, 0, 0) |
2126 , oma = par("oma") + c(0.5, 0, 0, 0) | 4389 , oma = par("oma") + c(0.5, 0, 0, 0) |
2127 ) | 4390 ) |
2128 # ref: https://r-charts.com/distribution/add-points-boxplot/ | 4391 # ref: https://r-charts.com/distribution/add-points-boxplot/ |
2129 # Vertical plot | 4392 # Vertical plot |
2130 colnames(quant_data_log) <- sample_name_matches | 4393 colnames(quant_data_log) <- sample_name_matches |
2131 boxplot( | 4394 boxplot( |
2132 quant_data_log | 4395 quant_data_log |
2133 , las = 2 | 4396 , las = 2 |
2134 , cex.axis = 0.9 * sample_name_shrink | 4397 , cex.axis = 0.9 * sample_name_grow |
2135 , col = const_boxplot_fill | 4398 , col = const_boxplot_fill |
2136 , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") | 4399 , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") |
2137 , xlab = "Sample" | 4400 , xlab = "Sample" |
4401 , notch = boxplot_notch | |
4402 , varwidth = boxplot_varwidth | |
2138 ) | 4403 ) |
2139 par(old_par) | 4404 par(old_par) |
4405 } | |
2140 } else { | 4406 } else { |
2141 cat("There are no peptides to plot\n") | 4407 cat("There are no peptides to plot\n") |
2142 } | 4408 } |
2143 | 4409 |
2144 cat("\n\n\n") | 4410 cat("\n\n\n") |
2160 cat("\\leavevmode\\newpage\n") | 4426 cat("\\leavevmode\\newpage\n") |
2161 ``` | 4427 ``` |
2162 | 4428 |
2163 # ANOVA Analysis | 4429 # ANOVA Analysis |
2164 | 4430 |
2165 ```{r, echo = FALSE} | 4431 ## Assignment of $p$-value and quality score |
4432 | |
4433 For each phosphopeptide, ANOVA analysis was performed to compute a $p$-value representing the evidence against the "null hypothesis" ($H_0$) that the intensity does not vary significantly among sample groups. | |
4434 | |
4435 However, because as more and more phosphopeptides are tested, there is increasd probability that, by random chance, a given peptide will have a $p$-value that appears to indicate significance. For this reason, the $p$-values were adjusted by applying the False Discovery Rate (FDR) correction from Benjamini and Hochberg (1995) [doi:10.1111/j.2517-6161.1995.tb02031.x](https:/doi.org/10.1111/j.2517-6161.1995.tb02031.x). | |
4436 | |
4437 Furthermore, to give more weight to phosphopeptides having fewer missing values, an (arbitrarily defined) quality score was assigned to each, defined as: | |
4438 | |
4439 $$ | |
4440 \textit{quality}_j | |
4441 = \frac{1 + o_{j}}{v_{j}(1 + o_{j}) + 0.005} | |
4442 $$ | |
4443 | |
4444 where: | |
4445 | |
4446 - $o_j$ is the minimum number of non-missing observations per sample group for substrate $j$ for all sample groups, and | |
4447 - $v_j$ is the FDR-adjusted ANOVA $p$-value for substrate $j$. | |
4448 | |
4449 | |
4450 ```{r, echo = FALSE, results = 'asis'} | |
4451 cat("\\newpage\n") | |
4452 | |
2166 # Make new data frame containing only Phosphopeptides | 4453 # Make new data frame containing only Phosphopeptides |
2167 # to connect preANOVA to ANOVA (connect_df) | 4454 # to connect preANOVA to ANOVA (connect_df) |
2168 connect_df <- data.frame( | 4455 connect_df <- data.frame( |
2169 data_table_imp_qn_lt$Phosphopeptide | 4456 data_table_imp_qn_lt$Phosphopeptide |
2170 , data_table_imp_qn_lt[, first_data_column] | 4457 , data_table_imp_qn_lt[, first_data_column] |
2171 ) | 4458 ) |
2172 colnames(connect_df) <- c("Phosphopeptide", "Intensity") | 4459 colnames(connect_df) <- c("Phosphopeptide", "Intensity") |
2173 ``` | 4460 ``` |
2174 | 4461 |
2175 ```{r anova, echo = FALSE, fig.dim = c(9, 10), results = 'asis'} | 4462 ```{r anova, echo = FALSE, fig.dim = c(10, 12), results = 'asis'} |
2176 count_of_treatment_levels <- length(levels(sample_treatment_levels)) | 4463 g_can_run_ksea <- FALSE |
4464 old_oma <- par("oma") | |
2177 if (count_of_treatment_levels < 2) { | 4465 if (count_of_treatment_levels < 2) { |
2178 nuke_control_sequences <- | |
2179 function(s) { | |
2180 s <- gsub("[\\]", "xyzzy_plugh", s) | |
2181 s <- gsub("[$]", "\\\\$", s) | |
2182 s <- gsub("xyzzy_plugh", "$\\\\backslash$", s) | |
2183 return(s) | |
2184 } | |
2185 cat( | 4466 cat( |
2186 "ERROR!!!! Cannot perform ANOVA analysis", | 4467 "ERROR!!!! Cannot perform ANOVA analysis", |
2187 "(see next page)\\newpage\n" | 4468 "(see next page)\\newpage\n" |
2188 ) | 4469 ) |
2189 cat( | 4470 cat( |
2195 cat("Unparsed sample names are:\n\n\n", | 4476 cat("Unparsed sample names are:\n\n\n", |
2196 "\\begin{quote}\n", | 4477 "\\begin{quote}\n", |
2197 paste(names(quant_data_imp_qn_log), collapse = "\n\n\n"), | 4478 paste(names(quant_data_imp_qn_log), collapse = "\n\n\n"), |
2198 "\n\\end{quote}\n\n") | 4479 "\n\\end{quote}\n\n") |
2199 | 4480 |
2200 regex_sample_names <- nuke_control_sequences(regex_sample_names) | 4481 regex_sample_names <- latex_printable_control_seqs(regex_sample_names) |
2201 | 4482 |
2202 cat("\\leavevmode\n\n\n") | 4483 cat("\\leavevmode\n\n\n") |
2203 cat("Parsing rule for SampleNames is", | 4484 cat("Parsing rule for SampleNames is", |
2204 "\n\n\n", | 4485 "\n\n\n", |
2205 "\\text{'", | 4486 "\\text{'", |
2211 cat("\nParsed sample names are:\n", | 4492 cat("\nParsed sample names are:\n", |
2212 "\\begin{quote}\n", | 4493 "\\begin{quote}\n", |
2213 paste(sample_name_matches, collapse = "\n\n\n"), | 4494 paste(sample_name_matches, collapse = "\n\n\n"), |
2214 "\n\\end{quote}\n\n") | 4495 "\n\\end{quote}\n\n") |
2215 | 4496 |
2216 regex_sample_grouping <- nuke_control_sequences(regex_sample_grouping) | 4497 regex_sample_grouping <- latex_printable_control_seqs(regex_sample_grouping) |
2217 | 4498 |
2218 cat("\\leavevmode\n\n\n") | 4499 cat("\\leavevmode\n\n\n") |
2219 cat("Parsing rule for SampleGrouping is", | 4500 cat("Parsing rule for SampleGrouping is", |
2220 "\n\n\n", | 4501 "\n\n\n", |
2221 "\\text{'", | 4502 "\\text{'", |
2230 paste(regmatches(sample_name_matches, rx_match), collapse = "\n\n\n"), | 4511 paste(regmatches(sample_name_matches, rx_match), collapse = "\n\n\n"), |
2231 "\n\\end{quote}\n\n") | 4512 "\n\\end{quote}\n\n") |
2232 | 4513 |
2233 } else { | 4514 } else { |
2234 | 4515 |
2235 p_value_data_anova_ps <- | 4516 if (print_nb_messages) nbe("computing p_value_data_anova_ps\n") |
2236 apply( | 4517 if (print_nb_messages) nbe(see_variable(nrow(quant_data_imp_qn_log)), "\n") |
2237 quant_data_imp_qn_log, | 4518 if (print_nb_messages) nbe(see_variable(ncol(quant_data_imp_qn_log)), "\n") |
2238 1, | 4519 if (print_nb_messages) nbe(see_variable(quant_data_imp_qn_log[, ".NE.7C"]), "\n") |
2239 anova_func, | 4520 if (print_nb_messages) nbe(see_variable(quant_data_imp_qn_log), "\n") |
2240 grouping_factor = sample_treatment_levels, | 4521 if (print_nb_messages) nbe(see_variable(anova_func), "\n") |
2241 one_way_f = one_way_all_categories | 4522 if (print_nb_messages) nbe(see_variable(smpl_trt), "\n") |
2242 ) | 4523 if (print_nb_messages) nbe(see_variable(one_way_all_categories), "\n") |
4524 tryCatch( | |
4525 { | |
4526 p_value_data_anova_ps <- | |
4527 row_apply( | |
4528 quant_data_imp_qn_log, | |
4529 anova_func, | |
4530 grouping_factor = smpl_trt, | |
4531 one_way_f = one_way_all_categories | |
4532 ) | |
4533 }, | |
4534 error = function(e) { | |
4535 mesg <- paste("Could not compute ANOVA because", e$message) | |
4536 cat("\n\n", mesg, "\n\n") | |
4537 param_df_noexit(e) | |
4538 sink(stderr()) | |
4539 cat("\n\n", mesg, "\n\n") | |
4540 values <- paste(param_df$parameter, "=", param_df$value, collapse = "\n") | |
4541 cat(values) | |
4542 sink() | |
4543 knitr::knit_exit() | |
4544 exit(code = 1) | |
4545 } | |
4546 ) | |
4547 if (print_nb_messages) nbe(see_variable(p_value_data_anova_ps), "\n") | |
2243 | 4548 |
2244 p_value_data_anova_ps_fdr <- | 4549 p_value_data_anova_ps_fdr <- |
2245 p.adjust(p_value_data_anova_ps, method = "fdr") | 4550 p.adjust(p_value_data_anova_ps, method = "fdr") |
4551 my_ppep_v <- full_data[, 1] | |
4552 p_value_data <- list( | |
4553 phosphopeptide = my_ppep_v, | |
4554 raw_anova_p = p_value_data_anova_ps, | |
4555 fdr_adjusted_anova_p = p_value_data_anova_ps_fdr, | |
4556 missing_values = rowSums(is.na(quant_data)), | |
4557 min_group_obs_count = min_group_obs_count | |
4558 ) | |
2246 p_value_data <- data.frame( | 4559 p_value_data <- data.frame( |
2247 phosphopeptide = full_data[, 1], | 4560 phosphopeptide = my_ppep_v, |
2248 raw_anova_p = p_value_data_anova_ps, | 4561 raw_anova_p = p_value_data_anova_ps, |
2249 fdr_adjusted_anova_p = p_value_data_anova_ps_fdr | 4562 fdr_adjusted_anova_p = p_value_data_anova_ps_fdr, |
2250 ) | 4563 missing_values = rowSums(is.na(quant_data)), |
4564 min_group_obs_count = min_group_obs_count | |
4565 ) | |
4566 p_value_data$quality <- 1.0 / with( | |
4567 p_value_data, | |
4568 fdr_adjusted_anova_p + 0.005 / (1 + min_group_obs_count) | |
4569 ) | |
4570 | |
4571 p_value_data$ranking <- | |
4572 with( | |
4573 p_value_data, | |
4574 switch( | |
4575 g_intensity_hm_criteria, | |
4576 "quality" = order(-quality), | |
4577 "na_count" = order(missing_values, fdr_adjusted_anova_p), | |
4578 # the default is "p_value" | |
4579 order(fdr_adjusted_anova_p) | |
4580 ) | |
4581 ) | |
4582 p_value_data <- p_value_data[p_value_data$ranking, , drop = FALSE] | |
4583 | |
4584 write.table( | |
4585 p_value_data, | |
4586 file = "p_value_data.txt", | |
4587 sep = "\t", | |
4588 col.names = TRUE, | |
4589 row.names = FALSE, | |
4590 quote = FALSE | |
4591 ) | |
4592 | |
2251 | 4593 |
2252 # output ANOVA file to constructed filename, | 4594 # output ANOVA file to constructed filename, |
2253 # e.g. "Outputfile_pST_ANOVA_STEP5.txt" | 4595 # e.g. "Outputfile_pST_ANOVA_STEP5.txt" |
2254 # becomes "Outpufile_pST_ANOVA_STEP5_FDR0.05.txt" | 4596 # becomes "Outputfile_pST_ANOVA_STEP5_FDR0.05.txt" |
2255 | 4597 |
2256 # Re-output datasets to include p-values | 4598 # Re-output datasets to include p-values |
2257 metadata_plus_p <- cbind(full_data[1:9], p_value_data[, 2:3]) | 4599 metadata_plus_p <- cbind(full_data[1:9], p_value_data[, 2:ncol(p_value_data)]) |
2258 write.table( | 4600 write.table( |
2259 cbind(metadata_plus_p, quant_data_imp), | 4601 cbind(metadata_plus_p, quant_data_imp), |
2260 file = imputed_data_filename, | 4602 file = imputed_data_filename, |
2261 sep = "\t", | 4603 sep = "\t", |
2262 col.names = TRUE, | 4604 col.names = TRUE, |
2272 row.names = FALSE, | 4614 row.names = FALSE, |
2273 quote = FALSE | 4615 quote = FALSE |
2274 ) | 4616 ) |
2275 | 4617 |
2276 | 4618 |
2277 p_value_data <- | |
2278 p_value_data[order(p_value_data$fdr_adjusted_anova_p), ] | |
2279 | |
2280 first_page_suppress <- 1 | 4619 first_page_suppress <- 1 |
2281 number_of_peptides_found <- 0 | 4620 number_of_peptides_found <- 0 |
2282 cutoff <- val_fdr[1] | 4621 cutoff <- val_fdr[1] |
2283 for (cutoff in val_fdr) { | 4622 for (cutoff in val_fdr) { |
2284 if (number_of_peptides_found > 49) { | 4623 #loop through FDR cutoffs |
4624 if (number_of_peptides_found > g_intensity_hm_rows - 1) { | |
2285 cat("\\leavevmode\n\n\n") | 4625 cat("\\leavevmode\n\n\n") |
2286 break | 4626 break |
2287 } | 4627 } |
2288 | 4628 |
2289 #loop through FDR cutoffs | 4629 bool_1 <- (p_value_data$fdr_adjusted_anova_p < cutoff) |
4630 bool_2 <- (p_value_data$min_group_obs_count >= g_intensity_min_per_class) | |
4631 g_can_run_ksea <- g_can_run_ksea || (sum(bool_2) > 0) | |
4632 bool_4 <- (p_value_data$quality >= params$minQuality) | |
4633 bool_3 <- as.logical( | |
4634 as.integer(bool_1) * | |
4635 as.integer(bool_2) * | |
4636 as.integer(bool_4) | |
4637 ) | |
4638 if (print_trace_messages) { | |
4639 if (length(bool_1) > 30) { | |
4640 cat_variable(bool_1, force_str = TRUE) | |
4641 cat_variable(bool_2, force_str = TRUE) | |
4642 cat_variable(bool_3, force_str = TRUE) | |
4643 } else { | |
4644 cat_variable(bool_1, suffix = "\n\n") | |
4645 cat_variable(bool_2, suffix = "\n\n") | |
4646 cat_variable(bool_3, suffix = "\n\n") | |
4647 } | |
4648 cat_variable(length(bool_3), digits = 0, suffix = "; ") | |
4649 cat_variable(sum(bool_3), digits = 0, suffix = "\n\n") | |
4650 } | |
2290 | 4651 |
2291 filtered_p <- | 4652 filtered_p <- |
2292 p_value_data[ | 4653 p_value_data[bool_3, , drop = FALSE] |
2293 which(p_value_data$fdr_adjusted_anova_p < cutoff), | 4654 filtered_p <- |
2294 , drop = FALSE | 4655 filtered_p[!is.na(filtered_p$phosphopeptide), , drop = FALSE] |
2295 ] | 4656 |
4657 if (print_trace_messages) | |
4658 cat_variable(filtered_p, force_str = TRUE) | |
4659 | |
4660 have_remaining_peptides <- sum(bool_3, na.rm = TRUE) > 0 | |
4661 filter_result_string <- | |
4662 sprintf( | |
4663 "%s, %s of %0.0f peptides remained having both %s and %s.\n\n", | |
4664 "After filtering for ANOVA results", | |
4665 if (have_remaining_peptides) | |
4666 as.character(sum(bool_3, na.rm = TRUE)) | |
4667 else | |
4668 "none", | |
4669 length(bool_3), | |
4670 sprintf("adjusted p-value < %s", as.character(signif(cutoff, 2))), | |
4671 sprintf( | |
4672 "more than %0.0f observations in some groups", | |
4673 max(0, g_intensity_min_per_class - 1) | |
4674 ) | |
4675 ) | |
4676 | |
2296 filtered_data_filtered <- | 4677 filtered_data_filtered <- |
2297 quant_data_imp_qn_log[ | 4678 quant_data_imp_qn_log[ |
2298 rownames(filtered_p), | 4679 rownames(filtered_p), |
2299 , drop = FALSE | 4680 , drop = FALSE |
2300 ] | 4681 ] |
4682 # order by p-value | |
2301 filtered_data_filtered <- | 4683 filtered_data_filtered <- |
2302 filtered_data_filtered[ | 4684 filtered_data_filtered[ |
2303 order(filtered_p$fdr_adjusted_anova_p), | 4685 order(filtered_p$fdr_adjusted_anova_p), |
2304 , drop = FALSE | 4686 , drop = FALSE |
2305 ] | 4687 ] |
2306 | 4688 |
2307 # <!-- ACE insertion start --> | 4689 if (have_remaining_peptides && nrow(filtered_p) > 0 && nrow(filtered_data_filtered) > 0) { |
2308 | |
2309 if (nrow(filtered_p) && nrow(filtered_data_filtered) > 0) { | |
2310 if (first_page_suppress == 1) { | 4690 if (first_page_suppress == 1) { |
2311 first_page_suppress <- 0 | 4691 first_page_suppress <- 0 |
2312 } else { | 4692 } else { |
2313 cat("\\newpage\n") | 4693 cat("\\newpage\n") |
2314 } | 4694 } |
2315 if (nrow(filtered_data_filtered) > 1) { | 4695 latex_samepage({ |
2316 subsection_header(sprintf( | 4696 cat(filter_result_string) |
2317 "Intensity distributions for %d phosphopeptides whose adjusted p-value < %0.2f\n", | 4697 if (nrow(filtered_data_filtered) > 1) { |
2318 nrow(filtered_data_filtered), | 4698 cat(subsection_header(sprintf( |
2319 cutoff | 4699 "Intensity distributions for %d phosphopeptides\n", |
2320 )) | 4700 nrow(filtered_data_filtered) |
2321 } else { | 4701 ))) |
2322 subsection_header(sprintf( | 4702 } else { |
2323 "Intensity distribution for one phosphopeptide (%s) whose adjusted p-value < %0.2f\n", | 4703 cat(subsection_header(sprintf( |
2324 rownames(filtered_data_filtered)[1], | 4704 "Intensity distribution for one phosphopeptide (%s)\n", |
2325 cutoff | 4705 rownames(filtered_data_filtered)[1] |
2326 )) | 4706 ))) |
2327 } | 4707 } |
2328 cat("\n\n\n") | 4708 }) # end latex_samepage |
2329 cat("\n\n\n") | 4709 |
2330 | |
2331 old_oma <- par("oma") | |
2332 old_par <- par( | 4710 old_par <- par( |
2333 mai = (par("mai") + c(0.7, 0, 0, 0)) * c(1, 1, 0.3, 1), | 4711 mai = (par("mai") + c(0.7, 0, 0, 0)) * c(1, 1, 0.3, 1), |
2334 oma = old_oma * c(1, 1, 0.3, 1), | 4712 oma = old_oma * c(1, 1, 0.3, 1), |
2335 cex.main = 0.9, | 4713 cex.main = 0.9, |
2336 cex.axis = 0.7, | 4714 cex.axis = 0.7, |
2337 fin = c(9, 7.25) | 4715 fin = c(9, 7.25) |
2338 ) | 4716 ) |
2339 # ref: https://r-charts.com/distribution/add-points-boxplot/ | |
2340 # Vertical plot | 4717 # Vertical plot |
2341 colnames(filtered_data_filtered) <- sample_name_matches | 4718 colnames(filtered_data_filtered) <- sample_name_matches |
2342 tryCatch( | 4719 tryCatch( |
2343 boxplot( | 4720 boxplot( |
2344 filtered_data_filtered, | 4721 filtered_data_filtered, |
2345 main = "Imputed, normalized intensities", # no line plot | 4722 main = "Imputed, normalized intensities", # no line plot |
2346 las = 2, | 4723 las = 2, |
2347 cex.axis = 0.9 * sample_name_shrink, | 4724 cex.axis = 0.9 * sample_name_grow, |
2348 col = const_boxplot_fill, | 4725 col = const_boxplot_fill, |
2349 ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") | 4726 ylab = latex2exp::TeX("$log_{10}$(peptide intensity)"), |
4727 notch = FALSE, | |
4728 varwidth = boxplot_varwidth | |
2350 ), | 4729 ), |
2351 error = function(e) print(e) | 4730 error = function(e) { |
4731 print(e) | |
4732 cat_margins() | |
4733 } | |
4734 | |
2352 ) | 4735 ) |
2353 par(old_par) | 4736 par(old_par) |
2354 } else { | 4737 } else { |
2355 cat(sprintf( | 4738 cat(sprintf( |
2356 "%s < %0.2f\n\n\n\n\n", | 4739 "%s < %0.2f\n\n\n\n\n", |
2357 "No peptides were found to have cutoff adjusted p-value", | 4740 "No peptides were found to have cutoff adjusted p-value", |
2358 cutoff | 4741 cutoff |
2359 )) | 4742 )) |
2360 } | 4743 } |
2361 | 4744 |
2362 if (nrow(filtered_data_filtered) > 0) { | 4745 if (have_remaining_peptides && nrow(filtered_data_filtered) > 0) { |
2363 # Add Phosphopeptide column to anova_filtered table | 4746 # Add Phosphopeptide column to anova_filtered table |
2364 # The assumption here is that the first intensity is unique; | 4747 # The assumption here is that the first intensity is unique; |
2365 # this is a hokey assumption but almost definitely will | 4748 # this is a hokey assumption but almost definitely will |
2366 # be true in the real world unless there is a computation | 4749 # be true in the real world unless there is a computation |
2367 # error upstream. | 4750 # error upstream. |
2390 by.y = "Phosphopeptide" | 4773 by.y = "Phosphopeptide" |
2391 ) | 4774 ) |
2392 | 4775 |
2393 # Produce heatmap to visualize significance and the effect of imputation | 4776 # Produce heatmap to visualize significance and the effect of imputation |
2394 | 4777 |
2395 anova_filtered_merge_format <- sapply( | |
2396 X = filtered_p$fdr_adjusted_anova_p | |
2397 , | |
2398 FUN = function(x) { | |
2399 if (x > 0.01) | |
2400 paste0("%s (%0.", 1 + ceiling(-log10(x)), "f)") | |
2401 else | |
2402 paste0("%s (%0.2e)") | |
2403 } | |
2404 ) | |
2405 | |
2406 cat_hm_heading <- function(m, cutoff) { | 4778 cat_hm_heading <- function(m, cutoff) { |
2407 if (nrow(m) > intensity_hm_rows) { | 4779 if (nrow(m) > g_intensity_hm_rows) { |
2408 cat("\\newpage\n") | 4780 cat("\\clearpage\n") |
2409 subsection_header( | 4781 cat(subsection_header( |
2410 paste( | 4782 paste( |
2411 sprintf("Heatmap for the %d most-significant peptides", | 4783 sprintf("Heatmap for the %d most-significant peptides", |
2412 intensity_hm_rows), | 4784 g_intensity_hm_rows), |
2413 sprintf("whose adjusted p-value < %0.2f\n", cutoff) | 4785 sprintf("whose adjusted p-value < %0.2f\n", cutoff) |
2414 ) | 4786 ) |
2415 ) | 4787 )) |
2416 } else { | 4788 } else { |
2417 if (nrow(m) == 0) { | 4789 if (nrow(m) == 0) { |
2418 return(FALSE) | 4790 return(FALSE) |
2419 } else { | 4791 } else { |
2420 subsection_header( | 4792 cat(subsection_header( |
2421 paste( | 4793 paste( |
2422 sprintf("Heatmap for %d usable peptides whose", nrow(m)), | 4794 sprintf("Heatmap for %d usable peptide genes whose", nrow(m)), |
2423 sprintf("adjusted p-value < %0.2f\n", cutoff) | 4795 sprintf("adjusted p-value < %0.2f\n", cutoff) |
2424 ) | 4796 ) |
2425 ) | 4797 )) |
2426 } | 4798 } |
2427 } | 4799 } |
2428 cat("\n\n\n") | 4800 cat("\n\n\n") |
2429 cat("\n\n\n") | 4801 cat("\n\n\n") |
2430 return(TRUE) | 4802 return(TRUE) |
2431 } | 4803 } |
2432 | 4804 |
2433 # construct matrix with appropriate rownames | 4805 # construct matrix with appropriate rownames |
2434 m <- | 4806 m <- |
2435 as.matrix(unimputed_quant_data_log[anova_filtered_merge_order, ]) | 4807 as.matrix(unimputed_quant_data_log[anova_filtered_merge_order, ]) |
2436 if (nrow(m) > 0) { | 4808 nrow_m <- nrow(m) |
4809 ncol_m <- ncol(m) | |
4810 if (nrow_m > 0) { | |
2437 rownames_m <- rownames(m) | 4811 rownames_m <- rownames(m) |
2438 rownames(m) <- sapply( | 4812 q <- data.frame(pepname = rownames_m) |
2439 X = seq_len(nrow(m)) | 4813 g <- sqldf(" |
2440 , | 4814 SELECT q.pepname, substr(met.Gene_Name, 1, 30) AS gene_name |
4815 FROM q, metadata_plus_p AS met | |
4816 WHERE q.pepname = met.Phosphopeptide | |
4817 ORDER BY q.rowid | |
4818 ") | |
4819 tmp <- sapply( | |
4820 X = seq_len(nrow(g)), | |
4821 FUN = function(i) { | |
4822 pre <- strsplit(g$gene_name[i], "; ")[[1]] | |
4823 rslt <- paste(unique(pre), sep = "; ") | |
4824 return(rslt) | |
4825 } | |
4826 ) | |
4827 tmp <- unlist(tmp) | |
4828 tmp <- | |
4829 make.names(tmp, unique = TRUE) | |
4830 tmp <- sub( | |
4831 "No_Gene_Name", | |
4832 "not_found", | |
4833 tmp, | |
4834 fixed = TRUE | |
4835 ) | |
4836 ten_trunc_names <- trunc_ppep(rownames_m) | |
4837 tmp <- sapply( | |
4838 X = seq_len(nrow_m), | |
2441 FUN = function(i) { | 4839 FUN = function(i) { |
2442 sprintf( | 4840 sprintf( |
2443 anova_filtered_merge_format[i], | 4841 "(%s) %s", |
2444 rownames_m[i], | 4842 tmp[i], |
2445 signif(filtered_p$fdr_adjusted_anova_p[i], 2) | 4843 ten_trunc_names[i] |
2446 ) | 4844 ) |
2447 } | 4845 } |
2448 ) | 4846 ) |
4847 rownames(m) <- tmp | |
2449 } | 4848 } |
2450 # draw the heading and heatmap | 4849 # draw the heading and heatmap |
2451 if (nrow(m) > 0) { | 4850 if (nrow_m > 0) { |
2452 number_of_peptides_found <- | 4851 number_of_peptides_found <- |
2453 draw_ppep_heatmap( | 4852 ppep_heatmap( |
2454 m = m, | 4853 m = m, |
2455 cutoff = cutoff, | 4854 cutoff = cutoff, |
2456 hm_heading_function = cat_hm_heading, | 4855 hm_heading_function = cat_hm_heading, |
2457 hm_main_title = | 4856 hm_main_title = |
2458 "log(intensities), row-scaled, unimputed, unnormalized", | 4857 "log(intensities), row-scaled, unimputed, unnormalized", |
2459 suppress_row_dendrogram = FALSE | 4858 suppress_row_dendrogram = FALSE, |
4859 master_cex = 0.35, | |
4860 sepcolor = "black", | |
4861 colsep = sample_colsep | |
2460 ) | 4862 ) |
2461 if (number_of_peptides_found > 1) { | 4863 if (number_of_peptides_found > 1) { |
2462 cat("\\leavevmode\n") | 4864 cat("\\leavevmode\n") |
2463 cat("The adjusted ANOVA \\textit{p}-value is shown in parentheses | |
2464 after the phosphopeptide sequence.\n\n") | |
2465 } | 4865 } |
2466 } | 4866 } |
2467 } | 4867 } |
2468 } | 4868 } |
2469 } | 4869 } |
4870 cat(filter_result_string) | |
2470 cat("\\leavevmode\n") | 4871 cat("\\leavevmode\n") |
4872 | |
4873 if (!g_can_run_ksea) { | |
4874 errmsg <- paste("Cannot proceed with KSEA analysis", | |
4875 "because too many values are missing.") | |
4876 if (FALSE) cat0( | |
4877 errmsg, | |
4878 "\\stepcounter{offset}\n", | |
4879 "\\stepcounter{offset}\n", | |
4880 "\\stepcounter{offset}\n", | |
4881 " in ", | |
4882 table_href(), | |
4883 ".\n\n" | |
4884 ) | |
4885 if (FALSE) { | |
4886 if (print_nb_messages) nbe(see_variable(p_value_data)) | |
4887 } else { | |
4888 if (print_nb_messages) nbe(see_variable(p_value_data)) | |
4889 | |
4890 display_p_value_data <- p_value_data | |
4891 display_p_value_data$raw_anova_p <- | |
4892 sprintf("%0.3g", display_p_value_data$raw_anova_p) | |
4893 display_p_value_data$fdr_adjusted_anova_p <- | |
4894 sprintf("%0.3g", display_p_value_data$fdr_adjusted_anova_p) | |
4895 display_p_value_data$quality <- | |
4896 sprintf("%0.3g", display_p_value_data$quality) | |
4897 | |
4898 headers_1st_line <- | |
4899 c("", "Raw ANOVA", "FDR-adj.", "Missing", "Min. #", "", "") | |
4900 headers_2nd_line <- | |
4901 c("Phosphopeptide", "p-value", "p-value", "values", "group-obs", "Quality", "Ranking") | |
4902 data_frame_tabbing_latex( | |
4903 x = display_p_value_data, | |
4904 tabstops = c(2.75, 0.80, 0.80, 0.5, 0.6, 0.60), | |
4905 use_subsubsection_header = FALSE, | |
4906 headings = c(headers_1st_line, headers_2nd_line), | |
4907 caption = "ANOVA results" | |
4908 | |
4909 ) | |
4910 } | |
4911 data_frame_tabbing_latex( | |
4912 x = save_sample_treatment_df, | |
4913 tabstops = c(1.25, 1.25), | |
4914 caption = "Sample classes", | |
4915 use_subsubsection_header = FALSE | |
4916 ) | |
4917 param_df_exit() | |
4918 knitr::knit_exit() | |
4919 return(invisible(-1)) | |
4920 } | |
4921 | |
2471 ``` | 4922 ``` |
2472 | 4923 |
2473 ```{r sqlite, echo = FALSE, fig.dim = c(9, 10), results = 'asis'} | 4924 ```{r sqlite, echo = FALSE, fig.dim = c(9, 10), results = 'asis'} |
2474 | 4925 |
2475 if (count_of_treatment_levels > 1) { | 4926 if (g_can_run_ksea && count_of_treatment_levels > 1) { |
2476 # Prepare two-way contrasts with adjusted p-values | 4927 # Prepare two-way contrasts with adjusted p-values |
2477 # Strategy: | 4928 # Strategy: |
2478 # - use imputed, log-transformed data: | 4929 # - use imputed, log-transformed data: |
2479 # - remember this when computing log2(fold-change) | 4930 # - remember this when computing log2(fold-change) |
2480 # - each contrast is between a combination of trt levels | 4931 # - each contrast is between a combination of trt levels |
2486 # - adjust p-value, assuming that | 4937 # - adjust p-value, assuming that |
2487 # (# of pppeps)*(# of contrasts) tests were performed | 4938 # (# of pppeps)*(# of contrasts) tests were performed |
2488 | 4939 |
2489 # Each contrast is between a combination of trt levels | 4940 # Each contrast is between a combination of trt levels |
2490 m2 <- combn( | 4941 m2 <- combn( |
2491 x = seq_len(length(levels(sample_treatment_levels))), | 4942 x = seq_len(length(levels(smpl_trt))), |
2492 m = 2, | 4943 m = 2, |
2493 simplify = TRUE | 4944 simplify = TRUE |
2494 ) | 4945 ) |
2495 contrast_count <- ncol(m2) | 4946 contrast_count <- ncol(m2) |
2496 | 4947 |
2500 f_m2 <- | 4951 f_m2 <- |
2501 function(cntrst, lvl1, lvl2) { | 4952 function(cntrst, lvl1, lvl2) { |
2502 return( | 4953 return( |
2503 data.frame( | 4954 data.frame( |
2504 contrast = cntrst, | 4955 contrast = cntrst, |
2505 level = sample_treatment_levels[ | 4956 level = smpl_trt[ |
2506 sample_treatment_levels %in% | 4957 smpl_trt %in% |
2507 levels(sample_treatment_levels)[c(lvl1, lvl2)] | 4958 levels(smpl_trt)[c(lvl1, lvl2)] |
2508 ], | 4959 ], |
2509 label = sample_name_matches[ | 4960 label = sample_name_matches[ |
2510 sample_treatment_levels %in% | 4961 smpl_trt %in% |
2511 levels(sample_treatment_levels)[c(lvl1, lvl2)] | 4962 levels(smpl_trt)[c(lvl1, lvl2)] |
2512 ] | 4963 ] |
2513 ) | 4964 ) |
2514 ) | 4965 ) |
2515 } | 4966 } |
2516 # - compute a df for each contrast | 4967 # - compute a df for each contrast |
2686 ; | 5137 ; |
2687 " | 5138 " |
2688 ) | 5139 ) |
2689 | 5140 |
2690 # - create contrast-metadata table | 5141 # - create contrast-metadata table |
5142 if (print_nb_messages) nbe("CREATE TABLE contrast_lvl_lvl_metadata") | |
2691 dml_no_rows_exec(db, " | 5143 dml_no_rows_exec(db, " |
2692 CREATE TABLE contrast_lvl_lvl_metadata | 5144 CREATE TABLE contrast_lvl_lvl_metadata |
2693 AS | 5145 AS |
2694 SELECT DISTINCT | 5146 SELECT DISTINCT |
2695 a.contrast AS ab_contrast, | 5147 a.contrast AS ab_contrast, |
2796 rownames(grouping_factor) <- grouping_factor$sample | 5248 rownames(grouping_factor) <- grouping_factor$sample |
2797 grouping_factor <- grouping_factor[, "level", drop = FALSE] | 5249 grouping_factor <- grouping_factor[, "level", drop = FALSE] |
2798 | 5250 |
2799 # - run the two-level (one-way) test | 5251 # - run the two-level (one-way) test |
2800 p_value_data_contrast_ps <- | 5252 p_value_data_contrast_ps <- |
2801 apply( | 5253 row_apply( |
2802 X = contrast_cast_data, | 5254 x = contrast_cast_data, |
2803 MARGIN = 1, # apply to rows | 5255 fun = anova_func, |
2804 FUN = anova_func, | |
2805 grouping_factor = | 5256 grouping_factor = |
2806 as.factor(grouping_factor$level), # anova_func arg2 | 5257 as.factor(grouping_factor$level), # anova_func arg2 |
2807 one_way_f = one_way_two_categories, # anova_func arg3 | 5258 one_way_f = one_way_two_categories, # anova_func arg3 |
2808 simplify = TRUE # TRUE is the default for simplify | 5259 simplify = TRUE # TRUE is the default for simplify |
2809 ) | 5260 ) |
3013 AND NOT m.`Gene` = 'No_Gene_Name' | 5464 AND NOT m.`Gene` = 'No_Gene_Name' |
3014 AND NOT v.log2_fc = 0 | 5465 AND NOT v.log2_fc = 0 |
3015 ; | 5466 ; |
3016 " | 5467 " |
3017 ) | 5468 ) |
5469 # We are done with DDL and insertion | |
5470 RSQLite::dbDisconnect(db) | |
3018 } | 5471 } |
3019 ``` | 5472 ``` |
3020 | 5473 |
3021 ```{r echo = FALSE, results = 'asis'} | 5474 ```{r echo = FALSE, results = 'asis'} |
3022 cat("\\newpage\n") | 5475 cat("\\newpage\n") |
3023 ``` | 5476 ``` |
3024 | 5477 |
3025 # KSEA Analysis | 5478 # KSEA Analysis Summaries |
3026 | 5479 |
3027 Results of Kinase-Substrate Enrichment Analysis are presented here, if the substrates for any kinases are relatively enriched. Enrichments are found by the CRAN `KSEAapp` package: | 5480 Results of Kinase-Substrate Enrichment Analysis are presented here, if the substrates for any kinases are relatively enriched. Enrichments are found by the CRAN `KSEAapp` package: |
3028 | 5481 |
3029 - The package is available on CRAN, at https:/cran.r-project.org/package=KSEAapp | 5482 - The package is available on CRAN, at https:/cran.r-project.org/package=KSEAapp |
3030 - The method used is described in Casado et al. (2013) [doi:10.1126/scisignal.2003573](https:/doi.org/10.1126/scisignal.2003573) and Wiredja et al (2017) [doi:10.1093/bioinformatics/btx415](https:/doi.org/10.1093/bioinformatics/btx415). | 5483 - The method used is described in Casado et al. (2013) [doi:10.1126/scisignal.2003573](https:/doi.org/10.1126/scisignal.2003573) and Wiredja et al (2017) [doi:10.1093/bioinformatics/btx415](https:/doi.org/10.1093/bioinformatics/btx415). |
3031 - An online alternative (supporting only analysis of human data) is available at [https:/casecpb.shinyapps.io/ksea/](https:/casecpb.shinyapps.io/ksea/). | 5484 - An online alternative (supporting only analysis of human data) is available at [https:/casecpb.shinyapps.io/ksea/](https:/casecpb.shinyapps.io/ksea/). |
3032 | 5485 |
3033 For each kinase, $i$, and each two-way contrast of treatments, $j$, an enrichment $z$-score is computed as: | 5486 For each kinase, $i$, and each two-way contrast of treatments, $j$, an enrichment $z$-score is computed as: |
3034 | 5487 |
3035 $$ | 5488 $$ |
3036 \text{kinase enrichment score}_{j,i} = \frac{(\overline{s}_{j,i} - \overline{p}_j)\sqrt{m_{j,i}}}{\delta_j} | 5489 \text{kinase enrichment }z\text{-score}_{j,i} = \frac{(\overline{`r sfc`}_{j,i} - \overline{`r pfc`}_j)\sqrt{m_{j,i}}}{\delta_j} |
3037 $$ | 5490 $$ |
3038 | 5491 |
3039 and fold-enrichment is computed as: | 5492 and fold-enrichment is computed as: |
3040 | 5493 |
3041 $$ | 5494 $$ |
3042 \text{Enrichment}_{j,i} = \frac{\overline{s}_{j,i}}{\overline{p}_j} | 5495 \text{Enrichment}_{j,i} = \frac{\overline{`r sfc`}_{j,i}}{\overline{`r pfc`}_j} |
3043 $$ | 5496 $$ |
3044 | 5497 |
3045 where: | 5498 where: |
3046 | 5499 |
3047 - $\overline{s}_{j,i}$ is the mean $\log_2 (|\text{fold-change|})$ in intensities (for contrast $j$) of known substrates of the kinase $i$, | 5500 - $\overline{`r sfc`}_{j,i}$ is the mean `r pfc_txt` in intensities of known substrates of the kinase $i$ in contrast $j$, |
3048 - $\overline{p}_j$ is the mean $\log_2 (|\text{fold-change}|)$ of all phosphosites identified in contrast $j$, and | 5501 - $\overline{`r pfc`}_j$ is the mean `r pfc_txt` of all phosphosites identified in contrast $j$, and |
3049 - $m_{j,i}$ is the total number of phosphosite substrates of kinase $i$ identified in contrast $j$, | 5502 - $m_{j,i}$ is the total number of phosphosite substrates of kinase $i$ identified in contrast $j$, |
3050 - $\delta_j$ is the standard deviation of the $\log_2 (|\text{fold-change}|)$ for contrast $j$ across all phosphosites in the dataset. | 5503 - $\delta_j$ is the standard deviation of the $\log_2 (\text{fold-change})$ for contrast $j$ across all phosphosites in the dataset. |
3051 - Note that the absolute value of fold-change is used so that both increased and decreased substrates of a kinase will contribute to its enrichment score. | 5504 - Note that the absolute value of fold-change is used so that both increased and decreased substrates of a kinase will contribute to its enrichment score. |
3052 | 5505 |
3053 $\text{FDR}_{j,i}$ is computed from the $p$-value for the z-score using the R `stats::p.adjust` function, applying the False Discovery Rate correction from Benjamini and Hochberg (1995) [doi:10.1111/j.2517-6161.1995.tb02031.x](https:/doi.org/10.1111/j.2517-6161.1995.tb02031.x) | 5506 $\text{FDR}_{j,i}$ is the False Discovery Rate corrected kinase enrichment score. |
3054 | 5507 |
3055 Color intensity in heatmaps reflects magnitude of $z$-score for enrichment of respective kinase in respective contrast; hue reflects the sign of the $z$-score (blue, negative; red, positive). | 5508 Color intensity in heatmaps reflects magnitude of $z$-score for enrichment of respective kinase in respective contrast; hue reflects the sign of the $z$-score (blue, negative; red, positive). |
3056 | 5509 |
3057 Asterisks in heatmaps reflect enrichments that are significant at `r ksea_cutoff_statistic` < `r ksea_cutoff_threshold`. | 5510 Asterisks in heatmaps reflect enrichments that are significant at `r ksea_cutoff_statistic` < `r ksea_cutoff_threshold`. |
3058 | 5511 |
3059 - Kinase names are generally as presented at Phospho.ELM [http://phospho.elm.eu.org/kinases.html](http://phospho.elm.eu.org/kinases.html) (when available), although Phospho.ELM data are not yet incorporated into this analysis. | 5512 - Kinase names are generally as presented at Phospho.ELM [http://phospho.elm.eu.org/kinases.html](http://phospho.elm.eu.org/kinases.html) (when available), although Phospho.ELM data are not yet incorporated into this analysis. |
3060 - Kinase names having the suffix '(HPRD)' are as presented at [http://hprd.org/serine_motifs](http://hprd.org/serine_motifs) and [http://hprd.org/tyrosine_motifs](http://hprd.org/tyrosine_motifs) and are as originally reported in the Amanchy et al., 2007 (doi: [10.1038/nbt0307-285](https://doi.org/10.1038/nbt0307-285)). | 5513 - Kinase names having the suffix '(HPRD)' are as presented at [http://hprd.org/serine_motifs](http://hprd.org/serine_motifs) and [http://hprd.org/tyrosine_motifs](http://hprd.org/tyrosine_motifs) and are as originally reported in the Amanchy et al., 2007 (doi: [10.1038/nbt0307-285](https://doi.org/10.1038/nbt0307-285)). |
3061 - Kinase-strate deata were also taken from [http://networkin.science/download.shtml](http://networkin.science/download.shtml) and from PhosphoSitePlus [https://www.phosphosite.org/staticDownloads](https://www.phosphosite.org/staticDownloads). | 5514 - Kinase-substrate data were also taken from [http://networkin.science/download.shtml](http://networkin.science/download.shtml) and from PhosphoSitePlus [https://www.phosphosite.org/staticDownloads](https://www.phosphosite.org/staticDownloads). |
3062 | 5515 |
3063 ```{r ksea, echo = FALSE, fig.dim = c(9, 10), results = 'asis'} | 5516 For each enriched kinase, a heatmap showing the intensities is presented for up to `r g_intensity_hm_rows` substrates, i.e., those substrates having the highest"quality". |
5517 | |
5518 Where possible, a heatmap of the correlations among these the selected substrates is also presented; if correlations cannot be computed (because of too many missing values), then the covariances are heatmapped for substrates having a variance greater than 1. | |
5519 | |
5520 ```{r ksea, echo = FALSE, fig.dim = c(12, 14.5), results = 'asis'} | |
5521 cat("\\clearpage\n") | |
3064 | 5522 |
3065 db <- RSQLite::dbConnect(RSQLite::SQLite(), ksea_app_prep_db) | 5523 db <- RSQLite::dbConnect(RSQLite::SQLite(), ksea_app_prep_db) |
3066 | 5524 |
3067 # -- eliminate the table that's about to be defined | 5525 # -- eliminate the table that's about to be defined |
3068 ddl_exec(db, " | 5526 ddl_exec(db, " |
3170 sub_title <- contrast_longlabel | 5628 sub_title <- contrast_longlabel |
3171 tryCatch( | 5629 tryCatch( |
3172 expr = { | 5630 expr = { |
3173 ksea_scores_rslt <- | 5631 ksea_scores_rslt <- |
3174 ksea_scores( | 5632 ksea_scores( |
3175 ksdata = pseudo_ksdata, # KSEAapp::KSData, | 5633 ksdata = pseudo_ksdata, |
3176 px = kseaapp_input, | 5634 px = kseaapp_input, |
3177 networkin = TRUE, | 5635 networkin = TRUE, |
3178 networkin_cutoff = 2 | 5636 networkin_cutoff = 2, |
5637 minimum_substrate_count = ksea_min_substrate_count | |
3179 ) | 5638 ) |
5639 | |
5640 if (FALSE) { | |
5641 ksea_scores_rslt <- | |
5642 ksea_scores_rslt[ | |
5643 ksea_scores_rslt$m >= ksea_min_substrate_count, | |
5644 , | |
5645 drop = FALSE | |
5646 ] | |
5647 } | |
5648 | |
5649 if (FALSE) { | |
5650 data_frame_tabbing_latex( | |
5651 x = ksea_scores_rslt, | |
5652 tabstops = c(0.8, 0.8, 0.8, 0.8, 0.8, 0.8), | |
5653 caption = paste("KSEA scores for contrast ", | |
5654 cntrst_b_level, "to", cntrst_a_level), | |
5655 use_subsubsection_header = FALSE | |
5656 ) | |
5657 } | |
5658 | |
5659 if (FALSE) { | |
5660 if (print_nb_messages) nbe("Output contents of `ksea_scores_rslt` table\n") | |
5661 cat_variable(ksea_scores_rslt) | |
5662 cat("\n\\clearpage\n") | |
5663 } | |
3180 | 5664 |
3181 if (0 < sum(!is.nan(ksea_scores_rslt$FDR))) { | 5665 if (0 < sum(!is.nan(ksea_scores_rslt$FDR))) { |
3182 next_index <- 1 + next_index | 5666 next_index <- 1 + next_index |
3183 rslt$score_list[[next_index]] <- ksea_scores_rslt | 5667 rslt$score_list[[next_index]] <- ksea_scores_rslt |
3184 rslt$name_list[[next_index]] <- contrast_label | 5668 rslt$name_list[[next_index]] <- contrast_label |
3185 rslt$longname_list[[next_index]] <- contrast_longlabel | 5669 rslt$longname_list[[next_index]] <- contrast_longlabel |
3186 low_fdr_print( | 5670 ksea_low_fdr_print( |
3187 rslt = rslt, | 5671 rslt = rslt, |
3188 i_cntrst = i_cntrst, | 5672 i_cntrst = i_cntrst, |
3189 i = next_index, | 5673 i = next_index, |
3190 a_level = cntrst_a_level, | 5674 a_level = cntrst_a_level, |
3191 b_level = cntrst_b_level, | 5675 b_level = cntrst_b_level, |
3192 fold_change = cntrst_fold_change, | 5676 fold_change = cntrst_fold_change, |
3193 caption = contrast_longlabel | 5677 caption = contrast_longlabel |
3194 ) | 5678 ) |
3195 } | 5679 } |
3196 }, | 5680 }, |
3197 error = function(e) str(e) | 5681 error = function(e) { |
5682 str(e) | |
5683 cat_margins() | |
5684 } | |
3198 ) | 5685 ) |
3199 } | 5686 } |
3200 | 5687 |
3201 plotted_kinases <- NULL | 5688 plotted_kinases <- NULL |
3202 if (length(rslt$score_list) > 1) { | 5689 if (g_can_run_ksea && length(rslt$score_list) > 1) { |
3203 for (i in seq_len(length(ksea_heatmap_titles))) { | 5690 for (i in seq_len(length(ksea_heatmap_titles))) { |
3204 hdr <- ksea_heatmap_titles[[i]] | 5691 hdr <- ksea_heatmap_titles[[i]] |
3205 which_kinases <- i | 5692 which_kinases <- i |
3206 | 5693 |
3207 cat("\\clearpage\n\\begin{center}\n") | 5694 cat("\\clearpage\n\\begin{center}\n") |
3208 if (i == const_ksea_astrsk_kinases) { | 5695 if (i == const_ksea_astrsk_kinases) { |
3209 subsection_header(hdr) | 5696 cat(subsection_header(hdr)) |
3210 } else { | 5697 } else { |
3211 subsection_header(hdr) | 5698 cat(subsection_header(hdr)) |
3212 } | 5699 } |
3213 cat("\\end{center}\n") | 5700 cat("\\end{center}\n") |
3214 | 5701 |
3215 plotted_kinases <- ksea_heatmap( | 5702 plotted_kinases <- ksea_heatmap( |
3216 # the data frame outputs from the KSEA.Scores() function, in list format | 5703 # the data frame outputs from the KSEA.Scores() function, in list format |
3225 # a numeric value between 0 and infinity indicating the min. number of | 5712 # a numeric value between 0 and infinity indicating the min. number of |
3226 # substrates a kinase must have to be included in the heatmap | 5713 # substrates a kinase must have to be included in the heatmap |
3227 m_cutoff = 1, | 5714 m_cutoff = 1, |
3228 # a numeric value between 0 and 1 indicating the p-value/FDR cutoff | 5715 # a numeric value between 0 and 1 indicating the p-value/FDR cutoff |
3229 # for indicating significant kinases in the heatmap | 5716 # for indicating significant kinases in the heatmap |
3230 p_cutoff = 0.05, | 5717 p_cutoff = params$kseaCutoffThreshold, |
3231 # a binary input of TRUE or FALSE, indicating whether or not to perform | 5718 # a binary input of TRUE or FALSE, indicating whether or not to perform |
3232 # hierarchical clustering of the sample columns | 5719 # hierarchical clustering of the sample columns |
3233 sample_cluster = TRUE, | 5720 sample_cluster = TRUE, |
3234 # a binary input of TRUE or FALSE, indicating whether or not to export | 5721 # a binary input of TRUE or FALSE, indicating whether or not to export |
3235 # the heatmap as a .png image into the working directory | 5722 # the heatmap as a .png image into the working directory |
3242 ylab = "Kinase", | 5729 ylab = "Kinase", |
3243 # print which kinases: | 5730 # print which kinases: |
3244 # - 1 : all kinases | 5731 # - 1 : all kinases |
3245 # - 2 : significant kinases | 5732 # - 2 : significant kinases |
3246 # - 3 : non-significant kinases | 5733 # - 3 : non-significant kinases |
3247 which_kinases = which_kinases | 5734 which_kinases = which_kinases, |
5735 margins = c(7, 15) | |
3248 ) | 5736 ) |
3249 if (!is.null(plotted_kinases)) { | 5737 if (!is.null(plotted_kinases)) { |
3250 cat("\\begin{center}\n") | 5738 cat("\\begin{center}\n") |
3251 cat("Color intensity reflects $z$-score magnitudes; hue reflects $z$-score sign.\n") | |
3252 if (which_kinases != const_ksea_nonastrsk_kinases) | 5739 if (which_kinases != const_ksea_nonastrsk_kinases) |
3253 cat("Asterisks reflect significance.\n") | 5740 cat("Asterisks reflect significance.\n") |
3254 cat("\\end{center}\n") | 5741 cat("\\end{center}\n") |
3255 } | 5742 } |
3256 } # end for (i in ... | 5743 } # end for (i in ... |
3257 } # end if (length ... | 5744 } # end if (length ... |
3258 | 5745 ``` |
3259 for (i_cntrst in seq_len(length(rslt$score_list))) { | 5746 |
3260 next_index <- i_cntrst | 5747 ```{r kseabar_calc, echo = FALSE, fig.dim = c(9.5, 6), results = 'asis'} |
3261 cntrst_a_level <- contrast_metadata_df[i_cntrst, "a_level"] | 5748 ksea_prints <- list() |
3262 cntrst_b_level <- contrast_metadata_df[i_cntrst, "b_level"] | 5749 ksea_barplots <- list() |
3263 cntrst_fold_change <- contrast_metadata_df[i_cntrst, 6] | 5750 for (i_cntrst in seq_len(length(rslt$score_list))) { |
3264 contrast_label <- sprintf("%s -> %s", cntrst_b_level, cntrst_a_level) | 5751 next_index <- i_cntrst |
3265 contrast_longlabel <- ( | 5752 cntrst_a_level <- contrast_metadata_df[i_cntrst, "a_level"] |
5753 cntrst_b_level <- contrast_metadata_df[i_cntrst, "b_level"] | |
5754 cntrst_fold_change <- contrast_metadata_df[i_cntrst, 6] | |
5755 contrast_label <- sprintf("%s -> %s", cntrst_b_level, cntrst_a_level) | |
5756 contrast_longlabel <- ( | |
5757 sprintf( | |
5758 "Class %s -> Class %s", | |
5759 contrast_metadata_df[i_cntrst, "b_level"], | |
5760 contrast_metadata_df[i_cntrst, "a_level"] | |
5761 ) | |
5762 ) | |
5763 main_title <- ( | |
5764 sprintf( | |
5765 "Change from treatment %s to treatment %s", | |
5766 contrast_metadata_df[i_cntrst, "b_level"], | |
5767 contrast_metadata_df[i_cntrst, "a_level"] | |
5768 ) | |
5769 ) | |
5770 sub_title <- contrast_longlabel | |
5771 tryCatch( | |
5772 expr = { | |
5773 ksea_scores_rslt <- rslt$score_list[[next_index]] | |
5774 if (print_nb_messages) nbe(see_variable(ksea_scores_rslt)) #ACE | |
5775 | |
5776 if (0 < sum(!is.nan(ksea_scores_rslt$FDR))) { | |
5777 sink(deferred <- file()) | |
5778 ksea_low_fdr_print( | |
5779 rslt = rslt, | |
5780 i_cntrst = i_cntrst, | |
5781 i = next_index, | |
5782 a_level = cntrst_a_level, | |
5783 b_level = cntrst_b_level, | |
5784 fold_change = cntrst_fold_change, | |
5785 caption = contrast_longlabel, | |
5786 write_db = FALSE, | |
5787 anchor = const_table_anchor_t | |
5788 ) | |
5789 cat("\n") | |
5790 sink() | |
5791 lines <- | |
5792 paste( | |
5793 readLines(deferred, warn = FALSE), | |
5794 collapse = "\n" | |
5795 ) | |
5796 close(deferred) | |
5797 sq_put(ksea_prints, lines) | |
5798 sink(stderr()) | |
5799 cat("\n---\n") | |
5800 cat_variable(ksea_prints) | |
5801 barplot_closure <- | |
5802 ksea_low_fdr_barplot_factory( | |
5803 rslt = rslt, | |
5804 i_cntrst = i_cntrst, | |
5805 i = next_index, | |
5806 a_level = cntrst_a_level, | |
5807 b_level = cntrst_b_level, | |
5808 fold_change = cntrst_fold_change, | |
5809 caption = contrast_longlabel | |
5810 ) | |
5811 if (rlang::is_closure(barplot_closure)) | |
5812 sq_put(ksea_barplots, barplot_closure) | |
5813 else | |
5814 sq_put(ksea_barplots, no_op) | |
5815 str(ksea_barplots) | |
5816 cat("\n...\n") | |
5817 sink() | |
5818 } | |
5819 }, | |
5820 error = function(e) { | |
5821 str(e) | |
5822 cat_margins() | |
5823 } | |
5824 ) | |
5825 } | |
5826 ``` | |
5827 | |
5828 ```{r phosphoelm_kinase_upid_desc, echo = FALSE, fig.dim = c(12, 13.7), results = 'asis'} | |
5829 | |
5830 have_kinase_descriptions <- | |
5831 if (!is.null(bzip2df(kinase_uprt_desc_lut, kinase_uprt_desc_lut_bz2)) && | |
5832 !is.null(bzip2df(kinase_name_uprt_lut, kinase_name_uprt_lut_bz2)) | |
5833 ) { | |
5834 rownames(kinase_uprt_desc_lut) <- kinase_uprt_desc_lut$UniProtID | |
5835 kinase_name_to_desc_uprt <- function(s) { | |
5836 rslt <- NULL | |
5837 tryCatch( | |
5838 { | |
5839 which_rows <- eval(s == kinase_name_uprt_lut$kinase) | |
5840 kinase_uprtid <- | |
5841 kinase_name_uprt_lut[which_rows, 2] | |
5842 # filter for first _HUMAN match if any | |
5843 grepl_human <- grepl("_HUMAN$", kinase_uprtid) | |
5844 if (0 < sum(grepl_human)) | |
5845 kinase_uprtid <- kinase_uprtid[grepl_human] | |
5846 # filter for first match if any | |
5847 if (0 < length(kinase_uprtid)) { | |
5848 kinase_uprtid <- kinase_uprtid[1] | |
5849 kinase_desc <- kinase_uprt_desc_lut[kinase_uprtid, 2] | |
5850 if (!is.na(kinase_desc)) | |
5851 rslt <- c(kinase_desc, kinase_uprtid) | |
5852 else | |
5853 rslt <- c(kinase_desc, "") | |
5854 } | |
5855 }, | |
5856 warning = str | |
5857 ) | |
5858 rslt | |
5859 } | |
5860 TRUE | |
5861 } else { | |
5862 kinase_name_to_desc_uprt <- function(s) NULL | |
5863 FALSE | |
5864 } | |
5865 ``` | |
5866 | |
5867 ```{r write_params, echo = FALSE, results = 'asis'} | |
5868 # perhaps this should be moved into the functions section, eventually ... | |
5869 write_params <- function(db) { | |
5870 # write parameters to report | |
5871 | |
5872 # write parameters to SQLite output | |
5873 | |
5874 mqppep_anova_script_param_df <- data.frame( | |
5875 script = "mqppep_anova_script.Rmd", | |
5876 parameter = names(param_unlist), | |
5877 value = param_unlist | |
5878 ) | |
5879 ddl_exec(db, " | |
5880 DROP TABLE IF EXISTS script_parameter; | |
5881 " | |
5882 ) | |
5883 ddl_exec(db, " | |
5884 CREATE TABLE IF NOT EXISTS script_parameter( | |
5885 script TEXT, | |
5886 parameter TEXT, | |
5887 value ANY, | |
5888 UNIQUE (script, parameter) ON CONFLICT REPLACE | |
5889 ) | |
5890 ; | |
5891 " | |
5892 ) | |
5893 RSQLite::dbWriteTable( | |
5894 conn = db, | |
5895 name = "script_parameter", | |
5896 value = mqppep_anova_script_param_df, | |
5897 append = TRUE | |
5898 ) | |
5899 | |
5900 loaded_packages_df <- sessioninfo::package_info("loaded") | |
5901 loaded_packages_df[, "library"] <- as.character(loaded_packages_df$library) | |
5902 loaded_packages_df <- data.frame( | |
5903 package = loaded_packages_df$package, | |
5904 version = loaded_packages_df$loadedversion, | |
5905 date = loaded_packages_df$date | |
5906 ) | |
5907 #ACE cat("\\clearpage\n\\section{R package versions}\n") | |
5908 #ACE data_frame_tabbing_latex( | |
5909 #ACE x = loaded_packages_df, | |
5910 #ACE tabstops = c(2.5, 1.25), | |
5911 #ACE caption = "R package versions" | |
5912 #ACE ) | |
5913 cat("\\clearpage\n\\section{Input parameter settings}\n") | |
5914 data_frame_tabbing_latex( | |
5915 x = param_df, | |
5916 tabstops = c(1.75), | |
5917 underscore_whack = TRUE, | |
5918 caption = "Input parameters", | |
5919 verbatim = FALSE | |
5920 ) | |
5921 } | |
5922 | |
5923 if (!have_kinase_descriptions) { | |
5924 write_params(db) | |
5925 # We are done with output | |
5926 RSQLite::dbDisconnect(db) | |
5927 param_df_exit() | |
5928 knitr::knit_exit() | |
5929 } | |
5930 ``` | |
5931 | |
5932 ```{r kseabar, echo = FALSE, fig.dim = c(9.5, 12.3), results = 'asis'} | |
5933 if (have_kinase_descriptions) { | |
5934 my_section_header <- | |
3266 sprintf( | 5935 sprintf( |
3267 "Class %s -> Class %s", | 5936 "inases whose KSEA %s < %s\n", |
3268 contrast_metadata_df[i_cntrst, "b_level"], | 5937 ksea_cutoff_statistic, |
3269 contrast_metadata_df[i_cntrst, "a_level"] | 5938 signif(ksea_cutoff_threshold, 2) |
3270 ) | 5939 ) |
3271 ) | 5940 |
3272 main_title <- ( | 5941 # Use enriched kinases to find enriched kinase-substrate pairs |
5942 enriched_kinases <- data.frame(kinase = ls(ksea_asterisk_hash)) | |
5943 | |
5944 enriched_kinase_descs <- | |
5945 Reduce( | |
5946 f = function(l, r) { | |
5947 lkup <- kinase_name_to_desc_uprt(r) | |
5948 if (is.null(lkup)) l | |
5949 else r2 <- rbind( | |
5950 l, | |
5951 data.frame( | |
5952 kinase = r, | |
5953 uniprot_id = lkup[2], | |
5954 description = lkup[1] | |
5955 ) | |
5956 ) | |
5957 }, | |
5958 x = enriched_kinases$kinase, | |
5959 init = NULL | |
5960 ) | |
5961 | |
5962 if (length(enriched_kinase_descs) > 0 && nrow(enriched_kinase_descs) > 0) { | |
5963 cat("\n\\clearpage\n") | |
5964 data_frame_tabbing_latex( | |
5965 x = enriched_kinase_descs, | |
5966 tabstops = c(0.9, 1.3), | |
5967 headings = c("Kinase", "UniProt ID", "Description"), | |
5968 caption = paste0("Descriptions of k", my_section_header) | |
5969 ) | |
5970 } | |
5971 | |
5972 if (FALSE) { | |
5973 cat_variable(sqldf("SELECT kinase FROM enriched_kinases")) | |
5974 cat_variable(sqldf(" | |
5975 SELECT DISTINCT gene, sub_gene, SUB_MOD_RSD AS ppep | |
5976 FROM pseudo_ksdata | |
5977 WHERE gene IN (SELECT kinase FROM enriched_kinases) | |
5978 ")) | |
5979 data_frame_table_latex( | |
5980 x = sqldf(" | |
5981 SELECT DISTINCT gene, sub_gene, SUB_MOD_RSD AS ppep | |
5982 FROM pseudo_ksdata | |
5983 WHERE gene IN (SELECT kinase FROM enriched_kinases) | |
5984 "), | |
5985 justification = "l l l", | |
5986 centered = TRUE, | |
5987 caption = "substrates of enriched kinases", | |
5988 anchor = c(const_table_anchor_p, const_table_anchor_t), | |
5989 underscore_whack = TRUE | |
5990 ) | |
5991 data_frame_table_latex( | |
5992 x = sqldf(" | |
5993 SELECT | |
5994 gene AS kinase, | |
5995 ppep, | |
5996 sub_gene, | |
5997 '('||group_concat(gene||'-'||sub_gene)||') '||ppep AS label, | |
5998 fdr_adjusted_anova_p, | |
5999 quality, | |
6000 min_group_obs_count | |
6001 FROM ( | |
6002 SELECT DISTINCT gene, sub_gene, SUB_MOD_RSD AS ppep | |
6003 FROM pseudo_ksdata | |
6004 WHERE gene IN (SELECT kinase FROM enriched_kinases) | |
6005 ), | |
6006 p_value_data | |
6007 WHERE ppep = phosphopeptide | |
6008 GROUP BY kinase, ppep | |
6009 ORDER BY kinase, ppep, p_value_data.quality DESC | |
6010 "), | |
6011 justification = "l l l l l l l", | |
6012 centered = TRUE, | |
6013 caption = "labeled substrates of enriched kinases", | |
6014 anchor = c(const_table_anchor_p, const_table_anchor_t), | |
6015 underscore_whack = TRUE | |
6016 ) | |
6017 } | |
6018 all_enriched_substrates <- sqldf(" | |
6019 SELECT | |
6020 gene AS kinase, | |
6021 ppep, | |
6022 sub_gene, | |
6023 '('||group_concat(gene||'-'||sub_gene)||') '||ppep AS label, | |
6024 fdr_adjusted_anova_p, | |
6025 quality, | |
6026 min_group_obs_count | |
6027 FROM ( | |
6028 SELECT DISTINCT gene, sub_gene, SUB_MOD_RSD AS ppep | |
6029 FROM pseudo_ksdata | |
6030 WHERE gene IN (SELECT kinase FROM enriched_kinases) | |
6031 ), | |
6032 p_value_data | |
6033 WHERE ppep = phosphopeptide | |
6034 GROUP BY kinase, ppep | |
6035 ORDER BY kinase, ppep, p_value_data.quality DESC | |
6036 ") | |
6037 | |
6038 all_enriched_substrates <- | |
6039 all_enriched_substrates[ | |
6040 all_enriched_substrates$quality >= params$minQuality, | |
6041 , | |
6042 drop = FALSE | |
6043 ] | |
6044 | |
6045 all_enriched_substrates$sub_gene <- | |
6046 sub( | |
6047 " ///.*", | |
6048 " ...", | |
6049 all_enriched_substrates$sub_gene | |
6050 ) | |
6051 | |
6052 all_enriched_substrates$label <- | |
6053 with( | |
6054 all_enriched_substrates, | |
3273 sprintf( | 6055 sprintf( |
3274 "Change from treatment %s to treatment %s", | 6056 "(%s-%s) %s", |
3275 contrast_metadata_df[i_cntrst, "b_level"], | 6057 kinase, |
3276 contrast_metadata_df[i_cntrst, "a_level"] | 6058 trunc_subgene(sub_gene), |
3277 ) | 6059 ppep |
3278 ) | 6060 ) |
3279 sub_title <- contrast_longlabel | 6061 ) |
3280 tryCatch( | 6062 |
3281 expr = { | 6063 # this global is set to TRUE by cat_enriched_heading immediately below |
3282 ksea_scores_rslt <- rslt$score_list[[next_index]] | 6064 g_neednewpage <- FALSE |
3283 | 6065 |
3284 if (0 < sum(!is.nan(ksea_scores_rslt$FDR))) { | 6066 # helper used to label per-kinase substrate enrichment figure |
3285 low_fdr_barplot( | 6067 cat_enriched_heading <- function(m, cut_args) { |
3286 rslt = rslt, | 6068 cutoff <- cut_args$cutoff |
3287 i_cntrst = i_cntrst, | 6069 kinase <- cut_args$kinase |
3288 i = next_index, | 6070 if (g_neednewpage) cat("\\newpage\n") |
3289 a_level = cntrst_a_level, | 6071 g_neednewpage <- TRUE |
3290 b_level = cntrst_b_level, | 6072 if (nrow(m) > g_intensity_hm_rows) { |
3291 fold_change = cntrst_fold_change, | 6073 cat(subsection_header( |
3292 caption = contrast_longlabel | |
3293 ) | |
3294 } | |
3295 }, | |
3296 error = function(e) str(e) | |
3297 ) | |
3298 } | |
3299 ``` | |
3300 | |
3301 ```{r enriched, echo = FALSE, fig.dim = c(9, 10), results = 'asis'} | |
3302 | |
3303 # Use enriched kinases to find enriched kinase-substrate pairs | |
3304 enriched_kinases <- data.frame(kinase = ls(ksea_asterisk_hash)) | |
3305 all_enriched_substrates <- sqldf(" | |
3306 SELECT | |
3307 gene AS kinase, | |
3308 ppep, | |
3309 sub_gene, | |
3310 '('||group_concat(gene||'-'||sub_gene)||') '||ppep AS label, | |
3311 fdr_adjusted_anova_p | |
3312 FROM ( | |
3313 SELECT DISTINCT gene, sub_gene, SUB_MOD_RSD AS ppep | |
3314 FROM pseudo_ksdata | |
3315 WHERE gene IN (SELECT kinase FROM enriched_kinases) | |
3316 ), | |
3317 p_value_data | |
3318 WHERE ppep = phosphopeptide | |
3319 GROUP BY ppep | |
3320 ORDER BY fdr_adjusted_anova_p | |
3321 ") | |
3322 | |
3323 # helper used to label per-kinase substrate enrichment figure | |
3324 cat_enriched_heading <- function(m, cut_args) { | |
3325 cutoff <- cut_args$cutoff | |
3326 kinase <- cut_args$kinase | |
3327 statistic <- cut_args$statistic | |
3328 threshold <- cut_args$threshold | |
3329 cat("\\newpage\n") | |
3330 if (nrow(m) > intensity_hm_rows) { | |
3331 subsection_header( | |
3332 paste( | |
3333 sprintf( | 6074 sprintf( |
3334 "Lowest p-valued %d (of %d) enriched %s-substrates,", | 6075 "Highest-quality %d (of %d) enriched %s-substrates", |
3335 intensity_hm_rows, | 6076 g_intensity_hm_rows, |
3336 nrow(m), | 6077 nrow(m), |
3337 kinase | 6078 kinase |
3338 ), | 6079 ) |
3339 sprintf(" KSEA %s < %0.2f\n", statistic, threshold) | 6080 )) |
3340 ) | |
3341 ) | |
3342 } else { | |
3343 if (nrow(m) == 0) { | |
3344 return(FALSE) | |
3345 } else { | 6081 } else { |
3346 subsection_header( | 6082 if (nrow(m) == 0) { |
3347 paste( | 6083 return(FALSE) |
6084 } else { | |
6085 nrow_m <- nrow(m) | |
6086 cat(subsection_header( | |
3348 sprintf( | 6087 sprintf( |
3349 "%d enriched %s-substrates,", | 6088 "%d enriched %s-substrate%s", |
3350 nrow(m), | 6089 nrow_m, |
3351 kinase | 6090 kinase, |
3352 ), | 6091 if (nrow_m > 1) "s" else "" |
3353 sprintf( | |
3354 " KSEA %s < %0.2f\n", | |
3355 statistic, | |
3356 threshold | |
3357 ) | 6092 ) |
6093 )) | |
6094 } | |
6095 } | |
6096 cat("\n\n\n") | |
6097 cat("\n\n\n") | |
6098 return(TRUE) | |
6099 } | |
6100 | |
6101 # -------------------------------- | |
6102 # hack begin - show all substrates | |
6103 enriched_substrates <- all_enriched_substrates | |
6104 # add "FALSE &&" to prevent listing of substrates | |
6105 if (show_enriched_substrates && nrow(enriched_substrates) > 0) { | |
6106 short_row_names <- sub( | |
6107 "$FAILED_MATCH_GENE_NAME", | |
6108 "not_found", | |
6109 enriched_substrates$sub_gene, | |
6110 fixed = TRUE | |
6111 ) | |
6112 | |
6113 if (print_nb_messages) nbe(see_variable(enriched_substrates)) | |
6114 substrates_df <- with( | |
6115 enriched_substrates, | |
6116 data.frame( | |
6117 kinase = kinase, | |
6118 substrate = sub(" ///*", "...", short_row_names), | |
6119 anova_p_value = signif(fdr_adjusted_anova_p, 2), | |
6120 min_group_obs_count = signif(min_group_obs_count, 0), | |
6121 quality = signif(quality, 3), | |
6122 sequence = trunc_n(30)(ppep) | |
3358 ) | 6123 ) |
3359 ) | 6124 ) |
3360 } | 6125 |
3361 } | 6126 substrates_df <- substrates_df[ |
3362 cat("\n\n\n") | 6127 with(substrates_df, order(kinase, -quality)), |
3363 cat("\n\n\n") | 6128 , |
3364 return(TRUE) | 6129 drop = FALSE |
3365 } | 6130 ] |
3366 | 6131 |
3367 # Disabling heatmaps for substrates pending decision whether to eliminate them altogether | 6132 if (print_nb_messages) nbe(see_variable(substrates_df)) |
3368 if (TRUE) | 6133 if (nrow(substrates_df) < 1) |
6134 substrates_df$sequence <- c() | |
6135 if (print_nb_messages) nbe(see_variable(substrates_df)) | |
6136 names(substrates_df) <- headers_2nd_line <- | |
6137 c("Kinase", "Substrate", "p-value", "per group)", "quality", "Sequence") | |
6138 headers_1st_line <- c("", "", "ANOVA", "min(values", "", "") | |
6139 data_frame_tabbing_latex( | |
6140 x = substrates_df, | |
6141 tabstops = c(1.2, 0.8, 0.5, 0.65, 0.5), | |
6142 headings = c(headers_1st_line, headers_2nd_line), | |
6143 caption = "Details for all enriched substrates of enriched kinases" | |
6144 ) | |
6145 rm( | |
6146 enriched_substrates, | |
6147 substrates_df, | |
6148 short_row_names, | |
6149 headers_1st_line, | |
6150 headers_2nd_line | |
6151 ) | |
6152 } | |
6153 cat("\\clearpage\n") | |
6154 # hack end - show all substrates | |
6155 # -------------------------------- | |
6156 | |
6157 # print deferred tables and graphs for kinases from contrasts | |
6158 for (i_cntrst in seq_len(length(ksea_prints))) { | |
6159 #latex_samepage({ | |
6160 cat(ksea_prints[[i_cntrst]]) | |
6161 cat("\n") | |
6162 ksea_barplots[[i_cntrst]]() | |
6163 cat("\n") | |
6164 cat("\\clearpage\n") | |
6165 #}) | |
6166 } | |
6167 | |
6168 } | |
6169 ``` | |
6170 | |
6171 ```{r enriched, echo = FALSE, fig.dim = c(12, 13.7), results = 'asis'} | |
6172 if (g_can_run_ksea) { | |
6173 g_did_enriched_header <- FALSE | |
3369 for (kinase_name in sort(enriched_kinases$kinase)) { | 6174 for (kinase_name in sort(enriched_kinases$kinase)) { |
3370 enriched_substrates <- | 6175 enriched_substrates <- |
3371 all_enriched_substrates[ | 6176 all_enriched_substrates[ |
3372 all_enriched_substrates$kinase == kinase_name, | 6177 all_enriched_substrates$kinase == kinase_name, |
3373 , | 6178 , |
3374 drop = FALSE | 6179 drop = FALSE |
3375 ] | 6180 ] |
6181 ten_trunc_ppep <- trunc_enriched_substrate(enriched_substrates$ppep) | |
3376 enriched_substrates$label <- with( | 6182 enriched_substrates$label <- with( |
3377 enriched_substrates, | 6183 enriched_substrates, |
3378 sprintf( | 6184 sprintf( |
3379 "(%s-%s) %s (%0.2g)", | 6185 "(%s) %s", |
3380 kinase, | 6186 make.names( |
3381 sub("$FAILED_MATCH_GENE_NAME", "unidentified", sub_gene, fixed = TRUE), | 6187 sub("$FAILED_MATCH_GENE_NAME", "not_found", sub_gene, fixed = TRUE), |
3382 ppep, | 6188 unique = TRUE |
3383 fdr_adjusted_anova_p | 6189 ), |
6190 ten_trunc_ppep | |
3384 ) | 6191 ) |
3385 ) | 6192 ) |
3386 # Get the intensity values for the heatmap | 6193 # Get the intensity values for the heatmap |
3387 enriched_intensities <- | 6194 enriched_intensities <- |
3388 as.matrix(unimputed_quant_data_log[enriched_substrates$ppep, , drop = FALSE]) | 6195 as.matrix(unimputed_quant_data_log[enriched_substrates$ppep, , drop = FALSE]) |
6196 | |
3389 # Remove rows having too many NA values to be relevant | 6197 # Remove rows having too many NA values to be relevant |
6198 good_rows <- (rowSums(enriched_intensities, na.rm = TRUE) != 0) | |
6199 #ACE nbe(see_variable(good_rows), "\n") | |
6200 enriched_substrates <- enriched_substrates[good_rows, , drop = FALSE] | |
6201 enriched_intensities <- enriched_intensities[good_rows, , drop = FALSE] | |
6202 | |
3390 # Rename the rows with the display-name for the heatmap | 6203 # Rename the rows with the display-name for the heatmap |
3391 rownames(enriched_intensities) <- | 6204 short_row_names <- sub( |
6205 "$FAILED_MATCH_GENE_NAME", | |
6206 "not_found", | |
6207 enriched_substrates$sub_gene, | |
6208 fixed = TRUE | |
6209 ) | |
6210 short_row_names <- | |
6211 make.names(short_row_names, unique = TRUE) | |
6212 long_row_names <- | |
3392 sapply( | 6213 sapply( |
3393 X = rownames(enriched_intensities), | 6214 X = rownames(enriched_intensities), |
3394 FUN = function(rn) { | 6215 FUN = function(rn) { |
3395 enriched_substrates[enriched_substrates$ppep == rn, "label"] | 6216 enriched_substrates[enriched_substrates$ppep == rn, "label"] |
3396 } | 6217 } |
3397 ) | 6218 ) |
6219 rownames(enriched_intensities) <- long_row_names | |
3398 # Format as matrix for heatmap | 6220 # Format as matrix for heatmap |
3399 m <- as.matrix(enriched_intensities) | 6221 m <- as.matrix(enriched_intensities) |
6222 rownames(m) <- trunc_enriched_substrate(rownames(m)) | |
6223 | |
6224 #ACE nb("m with bad rows: ", see_variable(m), "\n") | |
6225 #ACE good_rows <- (rowSums(m, na.rm = TRUE) != 0) | |
6226 #ACE nb(see_variable(good_rows), "\n") | |
6227 #ACE m <- m[good_rows, , drop = FALSE] | |
6228 #ACE nb("m without(?) bad rows: ", see_variable(m), "\n") | |
6229 #ACE nb(see_variable(short_row_names), "\n") | |
6230 #ACE local_short_row_names <- short_row_names[good_rows] | |
6231 #ACE local_long_row_names <- long_row_names[good_rows] | |
6232 #ACE local_enriched_intensities <- enriched_intensities[local_long_row_names, ] | |
6233 | |
3400 # Draw the heading and heatmap | 6234 # Draw the heading and heatmap |
3401 if (nrow(m) > 0) { | 6235 nrow_m <- nrow(m) |
6236 if (nrow_m > 0) { | |
6237 if (!g_did_enriched_header) { | |
6238 cat("\n\\clearpage\n") | |
6239 cat(section_header(paste0("K", my_section_header))) | |
6240 g_did_enriched_header <- TRUE | |
6241 } | |
6242 is_na_m <- is.na(m) | |
6243 cellnote_m <- is_na_m | |
6244 cellnote_m[!is_na_m] <- "" | |
6245 cellnote_m[is_na_m] <- "NA" | |
3402 cut_args <- new_env() | 6246 cut_args <- new_env() |
3403 cut_args$cutoff <- cutoff | 6247 cut_args$cutoff <- cutoff |
3404 cut_args$kinase <- kinase_name | 6248 cut_args$kinase <- kinase_name |
3405 cut_args$statistic <- ksea_cutoff_statistic | 6249 cut_args$statistic <- ksea_cutoff_statistic |
3406 cut_args$threshold <- ksea_cutoff_threshold | 6250 cut_args$threshold <- ksea_cutoff_threshold |
3407 number_of_peptides_found <- | 6251 number_of_peptides_found <- |
3408 draw_ppep_heatmap( | 6252 ppep_heatmap( |
3409 m = m, | 6253 m = m, |
6254 cellnote = cellnote_m, | |
3410 cutoff = cut_args, | 6255 cutoff = cut_args, |
3411 hm_heading_function = cat_enriched_heading, | 6256 hm_heading_function = cat_enriched_heading, |
3412 hm_main_title | 6257 hm_main_title |
3413 = "Unnormalized (zero-imputed) intensities of enriched kinase-substrates", | 6258 = "Unnormalized (zero-imputed) intensities of enriched kinase-substrates", |
3414 suppress_row_dendrogram = FALSE | 6259 suppress_row_dendrogram = FALSE, |
6260 master_cex = 0.35, | |
6261 sepcolor = "black", | |
6262 colsep = sample_colsep | |
3415 ) | 6263 ) |
3416 if (number_of_peptides_found > 1) { | 6264 if (number_of_peptides_found > 1) { |
3417 cat("\\leavevmode\n") | 6265 |
3418 cat("The kinase-subsrate pair is shown in parentheses | 6266 tryCatch( |
3419 before the phosphopeptide sequence.\n\n") | 6267 { |
3420 cat("The adjusted ANOVA \\textit{p}-value is shown in parentheses | 6268 rownames(m) <- short_row_names |
3421 after the phosphopeptide sequence.\n\n") | 6269 cov_heatmap(m, nrow_m > g_intensity_hm_rows) |
6270 }, | |
6271 error = function(e) { | |
6272 cat( | |
6273 sprintf( | |
6274 "ERROR: %s\n\\newline\n", | |
6275 mget("e") | |
6276 ) | |
6277 ) | |
6278 cat( | |
6279 sprintf( | |
6280 "message: %s\n\\newline\n", | |
6281 e$message | |
6282 ) | |
6283 ) | |
6284 cat_margins() | |
6285 } | |
6286 ) | |
3422 } | 6287 } |
3423 if (nrow(m) == 1) { | 6288 substrates_df <- with( |
3424 cat( | 6289 enriched_substrates, |
3425 sprintf( | 6290 data.frame( |
3426 "\n\nSubstrate is %s, | 6291 substrate = sub(" ///*", "...", short_row_names), |
3427 \nphopshopeptide is %s, | 6292 sequence = trunc_long_ppep(ppep), |
3428 \n\nand adjusted ANOVA \\textit{p}-value is %0.2g.\n", | 6293 anova_p_value = signif(fdr_adjusted_anova_p, 2), |
3429 enriched_substrates[1, "sub_gene"], | 6294 min_group_obs_count = signif(min_group_obs_count, 0), |
3430 enriched_substrates[1, "ppep"], | 6295 quality = signif(quality, 3) |
3431 enriched_substrates[1, "fdr_adjusted_anova_p"] | 6296 ) |
6297 ) | |
6298 excess_substrates <- nrow(substrates_df) > g_intensity_hm_rows | |
6299 if (excess_substrates) | |
6300 substrates_df <- substrates_df[1:g_intensity_hm_rows, ] | |
6301 names(substrates_df) <- headers_2nd_line <- | |
6302 c("Substrate", "Sequence", "p-value", "per group)", "quality") | |
6303 headers_1st_line <- c("", "", "ANOVA", "min(values", "") | |
6304 if (1 < nrow(enriched_substrates)) | |
6305 cat("\n\\newpage\n") | |
6306 cat(subsubsection_header( | |
6307 sprintf( | |
6308 "Details for %s%s-substrates", | |
6309 if (excess_substrates) | |
6310 sprintf( | |
6311 "%s \"highest quality\" ", | |
6312 g_intensity_hm_rows | |
3432 ) | 6313 ) |
6314 else "", | |
6315 kinase_name | |
3433 ) | 6316 ) |
3434 } | 6317 )) |
6318 substrates_df <- substrates_df[order(-substrates_df$quality), ] | |
6319 data_frame_tabbing_latex( | |
6320 x = substrates_df, | |
6321 tabstops = c(0.8, 3.8, 0.6, 0.8), | |
6322 headings = c(headers_1st_line, headers_2nd_line) | |
6323 ) | |
6324 } else { | |
6325 if (print_nb_messages) nbe(see_variable(nrow_m > 0), "\n") | |
3435 } | 6326 } |
3436 } | 6327 if (print_nb_messages) nb("end kinase ", kinase_name, "\n") |
3437 | 6328 } |
3438 # Write output tabular files | 6329 |
3439 | 6330 # Write output tabular files |
3440 # get kinase, ppep, concat(kinase) tuples for enriched kinases | 6331 |
3441 | 6332 # get kinase, ppep, concat(kinase) tuples for enriched kinases |
3442 kinase_ppep_label <- sqldf(" | 6333 |
3443 WITH | 6334 if (print_nb_messages) nb("kinase_ppep_label <- ...\n") |
3444 t(ppep, label) AS | 6335 if (print_nb_messages) nbe("kinase_ppep_label <- ...\n") |
3445 ( | 6336 kinase_ppep_label <- sqldf(" |
3446 SELECT DISTINCT | 6337 WITH |
3447 SUB_MOD_RSD AS ppep, | 6338 t(ppep, label) AS |
3448 group_concat(gene, '; ') AS label | 6339 ( |
6340 SELECT DISTINCT | |
6341 SUB_MOD_RSD AS ppep, | |
6342 group_concat(gene, '; ') AS label | |
6343 FROM pseudo_ksdata | |
6344 WHERE GENE IN (SELECT kinase FROM enriched_kinases) | |
6345 GROUP BY ppep | |
6346 ), | |
6347 k(kinase, ppep_join) AS | |
6348 ( | |
6349 SELECT DISTINCT gene AS kinase, SUB_MOD_RSD AS ppep_join | |
3449 FROM pseudo_ksdata | 6350 FROM pseudo_ksdata |
3450 WHERE GENE IN (SELECT kinase FROM enriched_kinases) | 6351 WHERE GENE IN (SELECT kinase FROM enriched_kinases) |
3451 GROUP BY ppep | 6352 ) |
3452 ), | 6353 SELECT k.kinase, t.ppep, t.label |
3453 k(kinase, ppep_join) AS | 6354 FROM t, k |
3454 ( | 6355 WHERE t.ppep = k.ppep_join |
3455 SELECT DISTINCT gene AS kinase, SUB_MOD_RSD AS ppep_join | 6356 ORDER BY k.kinase, t.ppep |
3456 FROM pseudo_ksdata | 6357 ") |
3457 WHERE GENE IN (SELECT kinase FROM enriched_kinases) | 6358 |
3458 ) | 6359 |
3459 SELECT k.kinase, t.ppep, t.label | 6360 # extract what we need from full_data |
3460 FROM t, k | 6361 impish <- cbind(rownames(quant_data_imp), quant_data_imp) |
3461 WHERE t.ppep = k.ppep_join | 6362 colnames(impish)[1] <- "Phosphopeptide" |
3462 ORDER BY k.kinase, t.ppep | 6363 data_table_imputed_sql <- " |
3463 ") | 6364 SELECT |
3464 | 6365 f.*, |
3465 # extract what we need from full_data | 6366 k.label AS KSEA_enrichments, |
3466 impish <- cbind(rownames(quant_data_imp), quant_data_imp) | 6367 q.* |
3467 colnames(impish)[1] <- "Phosphopeptide" | 6368 FROM |
3468 data_table_imputed_sql <- " | 6369 metadata_plus_p f |
3469 SELECT | 6370 LEFT JOIN kinase_ppep_label k |
3470 f.*, | 6371 ON f.Phosphopeptide = k.ppep, |
3471 k.label AS KSEA_enrichments, | 6372 impish q |
3472 q.* | 6373 WHERE |
3473 FROM | 6374 f.Phosphopeptide = q.Phosphopeptide |
3474 metadata_plus_p f | 6375 " |
3475 LEFT JOIN kinase_ppep_label k | 6376 data_table_imputed <- sqldf(data_table_imputed_sql) |
3476 ON f.Phosphopeptide = k.ppep, | 6377 # Zap the duplicated 'Phosphopeptide' column named 'ppep' |
3477 impish q | 6378 data_table_imputed <- |
3478 WHERE | 6379 data_table_imputed[, c(1:12, 14:ncol(data_table_imputed))] |
3479 f.Phosphopeptide = q.Phosphopeptide | 6380 |
3480 " | 6381 # Output imputed, un-normalized data |
3481 data_table_imputed <- sqldf(data_table_imputed_sql) | 6382 if (print_nb_messages) nb("Output imputed, un-normalized data tabular file\n") |
3482 # Zap the duplicated 'Phosphopeptide' column named 'ppep' | 6383 if (print_nb_messages) nbe("Output imputed, un-normalized data tabular file\n") |
3483 data_table_imputed <- | 6384 write.table( |
3484 data_table_imputed[, c(1:12, 14:ncol(data_table_imputed))] | 6385 data_table_imputed |
3485 | 6386 , file = imputed_data_filename |
3486 # Output with imputed, un-normalized data | 6387 , sep = "\t" |
3487 | 6388 , col.names = TRUE |
3488 write.table( | 6389 , row.names = FALSE |
3489 data_table_imputed | 6390 , quote = FALSE |
3490 , file = imputed_data_filename | 6391 ) |
3491 , sep = "\t" | 6392 |
3492 , col.names = TRUE | 6393 |
3493 , row.names = FALSE | 6394 #output quantile normalized data |
3494 , quote = FALSE | 6395 impish <- cbind(rownames(quant_data_imp_qn_log), quant_data_imp_qn_log) |
3495 ) | 6396 colnames(impish)[1] <- "Phosphopeptide" |
3496 | 6397 data_table_imputed <- sqldf(data_table_imputed_sql) |
3497 | 6398 # Zap the duplicated 'Phosphopeptide' column named 'ppep' |
3498 #output quantile normalized data | 6399 data_table_imputed <- |
3499 impish <- cbind(rownames(quant_data_imp_qn_log), quant_data_imp_qn_log) | 6400 data_table_imputed[, c(1:12, 14:ncol(data_table_imputed))] |
3500 colnames(impish)[1] <- "Phosphopeptide" | 6401 if (print_nb_messages) nb("Output quantile normalized data tabular file\n") |
3501 data_table_imputed <- sqldf(data_table_imputed_sql) | 6402 if (print_nb_messages) nbe("Output quantile normalized data tabular file\n") |
3502 # Zap the duplicated 'Phosphopeptide' column named 'ppep' | 6403 write.table( |
3503 data_table_imputed <- | 6404 data_table_imputed, |
3504 data_table_imputed[, c(1:12, 14:ncol(data_table_imputed))] | 6405 file = imp_qn_lt_data_filenm, |
3505 write.table( | 6406 sep = "\t", |
3506 data_table_imputed, | 6407 col.names = TRUE, |
3507 file = imp_qn_lt_data_filenm, | 6408 row.names = FALSE, |
3508 sep = "\t", | 6409 quote = FALSE |
3509 col.names = TRUE, | 6410 ) |
3510 row.names = FALSE, | 6411 |
3511 quote = FALSE | 6412 ppep_kinase <- sqldf(" |
3512 ) | 6413 SELECT DISTINCT k.ppep, k.kinase |
3513 | 6414 FROM ( |
3514 ppep_kinase <- sqldf(" | 6415 SELECT DISTINCT gene AS kinase, SUB_MOD_RSD AS ppep |
3515 SELECT DISTINCT k.ppep, k.kinase | 6416 FROM pseudo_ksdata |
3516 FROM ( | 6417 WHERE GENE IN (SELECT kinase FROM enriched_kinases) |
3517 SELECT DISTINCT gene AS kinase, SUB_MOD_RSD AS ppep | 6418 ) k |
3518 FROM pseudo_ksdata | 6419 ORDER BY k.ppep, k.kinase |
3519 WHERE GENE IN (SELECT kinase FROM enriched_kinases) | 6420 ") |
3520 ) k | 6421 |
3521 ORDER BY k.ppep, k.kinase | 6422 RSQLite::dbWriteTable( |
3522 ") | 6423 conn = db, |
3523 | 6424 name = "ksea_enriched_ks", |
3524 RSQLite::dbWriteTable( | 6425 value = ppep_kinase, |
3525 conn = db, | 6426 append = FALSE |
3526 name = "ksea_enriched_ks", | 6427 ) |
3527 value = ppep_kinase, | 6428 } |
3528 append = FALSE | 6429 |
3529 ) | 6430 if (print_nb_messages) nb("RSQLite::dbWriteTable anova_signif\n") |
3530 | 6431 |
3531 RSQLite::dbWriteTable( | 6432 RSQLite::dbWriteTable( |
3532 conn = db, | 6433 conn = db, |
3533 name = "anova_signif", | 6434 name = "anova_signif", |
3534 value = p_value_data, | 6435 value = p_value_data, |
3554 ON m.phospho_peptide = kek.ppep | 6455 ON m.phospho_peptide = kek.ppep |
3555 ; | 6456 ; |
3556 " | 6457 " |
3557 ) | 6458 ) |
3558 | 6459 |
6460 if (print_nb_messages) nb("Output contents of `stats_metadata_v` table to tabular file\n") | |
6461 if (print_nb_messages) nbe("Output contents of `stats_metadata_v` table to tabular file\n") | |
3559 write.table( | 6462 write.table( |
3560 dbReadTable(db, "stats_metadata_v"), | 6463 dbReadTable(db, "stats_metadata_v"), |
3561 file = anova_ksea_mtdt_file, | 6464 file = anova_ksea_mtdt_file, |
3562 sep = "\t", | 6465 sep = "\t", |
3563 col.names = TRUE, | 6466 col.names = TRUE, |
3564 row.names = FALSE, | 6467 row.names = FALSE, |
3565 quote = FALSE | 6468 quote = FALSE |
3566 ) | 6469 ) |
3567 | 6470 |
6471 cat("\n\\clearpage\n") | |
3568 | 6472 |
3569 ``` | 6473 ``` |
6474 | |
6475 # Data-processing summary flowchart | |
6476 | |
6477  | |
3570 | 6478 |
3571 ```{r parmlist, echo = FALSE, fig.dim = c(9, 10), results = 'asis'} | 6479 ```{r parmlist, echo = FALSE, fig.dim = c(9, 10), results = 'asis'} |
3572 cat("\\leavevmode\n\n\n") | 6480 cat("\\leavevmode\n\n\n") |
3573 | 6481 |
3574 # write parameters to report | 6482 write_params(db) |
3575 | |
3576 param_unlist <- unlist(as.list(params)) | |
3577 param_df <- data.frame( | |
3578 parameter = paste0("\\verb@", names(param_unlist), "@"), | |
3579 value = paste0( | |
3580 "\n\\begin{tiny}\n\\verb@", | |
3581 gsub("$", "\\$", param_unlist, fixed = TRUE), | |
3582 "@\n\\end{tiny}" | |
3583 ) | |
3584 ) | |
3585 | |
3586 data_frame_latex( | |
3587 x = param_df, | |
3588 justification = "p{0.35\\linewidth} p{0.6\\linewidth}", | |
3589 centered = TRUE, | |
3590 caption = "Input parameters", | |
3591 anchor = const_table_anchor_bp, | |
3592 underscore_whack = FALSE | |
3593 ) | |
3594 | |
3595 # write parameters to SQLite output | |
3596 | |
3597 mqppep_anova_script_param_df <- data.frame( | |
3598 script = "mqppep_anova_script.Rmd", | |
3599 parameter = names(param_unlist), | |
3600 value = param_unlist | |
3601 ) | |
3602 ddl_exec(db, " | |
3603 DROP TABLE IF EXISTS script_parameter; | |
3604 " | |
3605 ) | |
3606 ddl_exec(db, " | |
3607 CREATE TABLE IF NOT EXISTS script_parameter( | |
3608 script TEXT, | |
3609 parameter TEXT, | |
3610 value ANY, | |
3611 UNIQUE (script, parameter) ON CONFLICT REPLACE | |
3612 ) | |
3613 ; | |
3614 " | |
3615 ) | |
3616 RSQLite::dbWriteTable( | |
3617 conn = db, | |
3618 name = "script_parameter", | |
3619 value = mqppep_anova_script_param_df, | |
3620 append = TRUE | |
3621 ) | |
3622 | |
3623 # We are done with output | 6483 # We are done with output |
3624 RSQLite::dbDisconnect(db) | 6484 RSQLite::dbDisconnect(db) |
6485 | |
6486 cat("\\clearpage\n\\section{R package versions}\n") | |
6487 utils::toLatex(utils::sessionInfo()) | |
3625 ``` | 6488 ``` |
3626 <!-- | |
3627 There's gotta be a better way... | |
3628 | |
3629 loaded_packages_df <- sessioninfo::package_info("loaded") | |
3630 loaded_packages_df[, "library"] <- as.character(loaded_packages_df$library) | |
3631 loaded_packages_df <- data.frame( | |
3632 package = loaded_packages_df$package, | |
3633 version = loaded_packages_df$loadedversion, | |
3634 date = loaded_packages_df$date | |
3635 ) | |
3636 data_frame_latex( | |
3637 x = loaded_packages_df, | |
3638 justification = "l | l l", | |
3639 centered = FALSE, | |
3640 caption = "Loaded R packages", | |
3641 anchor = const_table_anchor_bp | |
3642 ) | |
3643 --> |