Mercurial > repos > eschen42 > mqppep_anova
comparison mqppep_anova_script.Rmd @ 13:b41a077af3aa draft
"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit 040e4945da00a279cb60daae799fce9489f99c50"
author | eschen42 |
---|---|
date | Tue, 22 Mar 2022 20:47:40 +0000 |
parents | 4deacfee76ef |
children | 6679616d0c18 |
comparison
equal
deleted
inserted
replaced
12:4deacfee76ef | 13:b41a077af3aa |
---|---|
1 --- | 1 --- |
2 title: "MaxQuant Phospho-Proteomic Enrichment Pipeline ANOVA" | 2 title: "MaxQuant Phospho-Proteomic Enrichment Pipeline ANOVA" |
3 author: "Larry Cheng; Art Eschenlauer" | 3 author: "Larry Cheng; Art Eschenlauer" |
4 date: "May 28, 2018; Nov 16, 2021" | 4 date: "May 28, 2018; Mar 16, 2022" |
5 output: | 5 output: |
6 pdf_document: default | 6 pdf_document: |
7 toc: true | |
8 latex_document: | |
9 toc: true | |
7 params: | 10 params: |
8 inputFile: "test-data/test_input_for_anova.tabular" | 11 inputFile: "test-data/test_input_for_anova.tabular" |
9 alphaFile: "test-data/alpha_levels.tabular" | 12 alphaFile: "test-data/alpha_levels.tabular" |
10 firstDataColumn: "Intensity" | 13 firstDataColumn: "Intensity" |
11 imputationMethod: !r c("group-median", "median", "mean", "random")[1] | 14 imputationMethod: !r c("group-median", "median", "mean", "random")[1] |
12 meanPercentile: 1 | 15 meanPercentile: 1 |
13 sdPercentile: 0.2 | 16 sdPercentile: 1.0 |
14 regexSampleNames: "\\.\\d+[A-Z]$" | 17 regexSampleNames: "\\.\\d+[A-Z]$" |
15 regexSampleGrouping: "\\d+" | 18 regexSampleGrouping: "\\d+" |
16 imputedDataFilename: "Upstream_Map_pST_outputfile_STEP4_QN_LT.txt" | 19 imputedDataFilename: "test-data/imputedDataFilename.txt" |
20 imputedQNLTDataFile: "test-data/imputedQNLTDataFile.txt" | |
21 show_toc: true | |
17 --- | 22 --- |
23 <!-- | |
24 latex_document: default | |
25 inputFile: "test-data/test_input_for_anova.tabular" | |
26 inputFile: "test-data/density_failure.preproc_tab.tabular" | |
27 inputFile: "test-data/UT_Phospho_STY_Sites.preproc_tab" | |
28 date: "May 28, 2018; Mar 16, 2022" | |
29 --> | |
18 ```{r setup, include = FALSE} | 30 ```{r setup, include = FALSE} |
19 # ref for parameterizing Rmd document: https://stackoverflow.com/a/37940285 | 31 # ref for parameterizing Rmd document: https://stackoverflow.com/a/37940285 |
20 knitr::opts_chunk$set(echo = FALSE, fig.dim = c(9, 10)) | 32 knitr::opts_chunk$set(echo = FALSE, fig.dim = c(9, 10)) |
33 | |
34 # freeze the random number generator so the same results will be produced | |
35 # from run to run | |
36 set.seed(28571) | |
37 | |
38 ### CONSTANTS | |
39 | |
40 const_parfin <- par("fin") | |
41 const_boxplot_fill <- "grey94" | |
42 const_stripchart_cex <- 0.5 | |
43 const_stripsmall_cex <- | |
44 sqrt(const_stripchart_cex * const_stripchart_cex / 2) | |
45 const_stripchart_jitter <- 0.3 | |
46 const_write_debug_files <- FALSE | |
21 | 47 |
22 ### FUNCTIONS | 48 ### FUNCTIONS |
23 | 49 |
24 #ANOVA filter function | 50 #ANOVA filter function |
25 anova_func <- function(x, grouping_factor) { | 51 anova_func <- function(x, grouping_factor) { |
26 x_aov <- aov(as.numeric(x) ~ grouping_factor) | 52 x_aov <- aov(as.numeric(x) ~ grouping_factor) |
27 pvalue <- summary(x_aov)[[1]][["Pr(>F)"]][1] | 53 pvalue <- summary(x_aov)[[1]][["Pr(>F)"]][1] |
28 pvalue | 54 pvalue |
29 } | 55 } |
56 | |
57 write_debug_file <- function(s) { | |
58 if (const_write_debug_files) { | |
59 s_path <- sprintf("test-data/%s.txt", deparse(substitute(s))) | |
60 write.table( | |
61 s, | |
62 file = s_path, | |
63 sep = "\t", | |
64 col.names = TRUE, | |
65 row.names = TRUE, | |
66 quote = FALSE | |
67 ) | |
68 } | |
69 } | |
70 | |
71 latex_collapsed_vector <- function(collapse_string, v) { | |
72 cat( | |
73 paste0( | |
74 gsub("_", "\\\\_", v), | |
75 collapse = collapse_string | |
76 ) | |
77 ) | |
78 } | |
79 | |
80 latex_itemized_collapsed <- function(collapse_string, v) { | |
81 cat("\\begin{itemize}\n\\item ") | |
82 latex_collapsed_vector(collapse_string, v) | |
83 cat("\n\\end{itemize}\n") | |
84 } | |
85 | |
86 latex_itemized_list <- function(v) { | |
87 latex_itemized_collapsed("\n\\item ", v) | |
88 } | |
89 | |
90 latex_enumerated_collapsed <- function(collapse_string, v) { | |
91 cat("\\begin{enumerate}\n\\item ") | |
92 latex_collapsed_vector(collapse_string, v) | |
93 cat("\n\\end{enumerate}\n") | |
94 } | |
95 | |
96 latex_enumerated_list <- function(v) { | |
97 latex_enumerated_collapsed("\n\\item ", v) | |
98 } | |
99 | |
100 latex_table_row <- function(v) { | |
101 latex_collapsed_vector(" & ", v) | |
102 cat(" \\\\\n") | |
103 } | |
104 | |
105 # Use this like print.data.frame, from which it is adapted: | |
106 print_data_frame_latex <- | |
107 function( | |
108 x, | |
109 ..., | |
110 # digits to pass to format.data.frame | |
111 digits = NULL, | |
112 # TRUE -> right-justify columns; FALSE -> left-justify | |
113 right = TRUE, | |
114 # maximumn number of rows to print | |
115 max = NULL, | |
116 # string with justification of each column | |
117 justification = NULL, | |
118 # TRUE to center on page | |
119 centered = FALSE, | |
120 # optional capttion | |
121 caption = NULL, | |
122 # h(inline); b(bottom); t (top) or p (separate page) | |
123 anchor = "h" | |
124 ) { | |
125 if (is.null(justification)) | |
126 justification <- | |
127 Reduce( | |
128 f = paste, | |
129 x = rep_len(if (right) "r" else "l", length(colnames(x))) | |
130 ) | |
131 n <- length(rownames(x)) | |
132 if (length(x) == 0L) { | |
133 cat( | |
134 sprintf( | |
135 # if n is one, use singular 'row', else use plural 'rows' | |
136 ngettext( | |
137 n, | |
138 "data frame with 0 columns and %d row", | |
139 "data frame with 0 columns and %d rows" | |
140 ), | |
141 n | |
142 ), | |
143 "\n", | |
144 sep = "" | |
145 ) | |
146 } | |
147 else if (n == 0L) { | |
148 cat("0 rows for:\n") | |
149 latex_itemized_list(names(x)) | |
150 } | |
151 else { | |
152 if (is.null(max)) | |
153 max <- getOption("max.print", 99999L) | |
154 if (!is.finite(max)) | |
155 stop("invalid 'max' / getOption(\"max.print\"): ", | |
156 max) | |
157 omit <- (n0 <- max %/% length(x)) < n | |
158 m <- as.matrix( | |
159 format.data.frame( | |
160 if (omit) x[seq_len(n0), , drop = FALSE] else x, | |
161 digits = digits, | |
162 na.encode = FALSE | |
163 ) | |
164 ) | |
165 cat( | |
166 # h(inline); b(bottom); t (top) or p (separate page) | |
167 paste0("\\begin{table}[", anchor, "]\n") | |
168 ) | |
169 if (!is.null(caption)) | |
170 cat(paste0(" \\caption{", caption, "}")) | |
171 if (centered) cat("\\centering\n") | |
172 cat( | |
173 paste( | |
174 " \\begin{tabular}{", | |
175 justification, | |
176 "}\n", | |
177 sep = "" | |
178 ) | |
179 ) | |
180 if (!is.null(caption)) | |
181 cat(" \\hline\\hline\n") | |
182 latex_table_row(colnames(m)) | |
183 cat("\\hline\n") | |
184 for (i in seq_len(length(m[, 1]))) { | |
185 latex_table_row(m[i, ]) | |
186 } | |
187 cat( | |
188 paste( | |
189 " \\end{tabular}", | |
190 "\\end{table}", | |
191 sep = "\n" | |
192 ) | |
193 ) | |
194 if (omit) | |
195 cat(" [ reached 'max' / getOption(\"max.print\") -- omitted", | |
196 n - n0, "rows ]\n") | |
197 } | |
198 invisible(x) | |
199 } | |
200 | |
30 ``` | 201 ``` |
31 | 202 |
32 ## Purpose: | 203 ## Purpose: |
204 | |
33 Perform imputation of missing values, quantile normalization, and ANOVA. | 205 Perform imputation of missing values, quantile normalization, and ANOVA. |
34 | 206 |
35 <!-- | 207 <!-- |
36 ## Variables to change for each input file | 208 ## Variables to change for each input file |
37 --> | 209 --> |
56 val_fdr <- | 228 val_fdr <- |
57 read.table(file = params$alphaFile, sep = "\t", header = F, quote = "")[, 1] | 229 read.table(file = params$alphaFile, sep = "\t", header = F, quote = "")[, 1] |
58 | 230 |
59 #Imputed Data filename | 231 #Imputed Data filename |
60 imputed_data_filename <- params$imputedDataFilename | 232 imputed_data_filename <- params$imputedDataFilename |
233 imp_qn_lt_data_filenm <- params$imputedQNLTDataFile | |
61 | 234 |
62 #ANOVA data filename | 235 #ANOVA data filename |
63 ``` | 236 ``` |
64 | 237 |
65 ```{r echo = FALSE} | 238 ```{r echo = FALSE} |
76 | 249 |
77 # Regular expression of Sample Names, e.g., "\\.(\\d+)[A-Z]$" | 250 # Regular expression of Sample Names, e.g., "\\.(\\d+)[A-Z]$" |
78 regex_sample_names <- params$regexSampleNames | 251 regex_sample_names <- params$regexSampleNames |
79 | 252 |
80 # Regular expression to extract Sample Grouping from Sample Name; | 253 # Regular expression to extract Sample Grouping from Sample Name; |
81 # if error occurs, compare sample_factor_levels and temp_matches | 254 # if error occurs, compare sample_factor_levels and sample_name_matches |
82 # to see if groupings/pairs line up | 255 # to see if groupings/pairs line up |
83 # e.g., "(\\d+)" | 256 # e.g., "(\\d+)" |
84 regex_sample_grouping <- params$regexSampleGrouping | 257 regex_sample_grouping <- params$regexSampleGrouping |
85 | 258 |
86 ``` | 259 ``` |
99 quote = "", | 272 quote = "", |
100 check.names = FALSE | 273 check.names = FALSE |
101 ) | 274 ) |
102 ``` | 275 ``` |
103 | 276 |
104 ### Column names from input file | 277 ### Parse column names, sample names, and factor levels from input file |
105 | 278 |
106 ```{r echo = FALSE, results = 'markup'} | 279 ```{r echo = FALSE, results = 'asis'} |
107 print(colnames(full_data)) | 280 # Write column naames as an enumerated list. |
281 column_name_df <- data.frame( | |
282 column = seq_len(length(colnames(full_data))), | |
283 name = colnames(full_data) | |
284 ) | |
285 print_data_frame_latex( | |
286 x = column_name_df, | |
287 justification = "l l", | |
288 centered = TRUE, | |
289 caption = "Input data column name", | |
290 anchor = "h" | |
291 ) | |
292 | |
108 data_column_indices <- grep(first_data_column, names(full_data), perl = TRUE) | 293 data_column_indices <- grep(first_data_column, names(full_data), perl = TRUE) |
109 cat(sprintf("First data column: %d\n", min(data_column_indices))) | 294 cat( |
110 cat(sprintf("Last data column: %d\n", max(data_column_indices))) | 295 sprintf( |
111 ``` | 296 "\n\nData columns: [%d,%d]\n\n", |
112 | 297 min(data_column_indices), |
113 ```{r echo = FALSE, results = 'asis'} | 298 max(data_column_indices) |
114 cat("\\newpage\n") | 299 ) |
115 ``` | 300 ) |
116 | |
117 ### Checking that log-transformed sample distributions are similar: | |
118 | |
119 ```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'} | |
120 | 301 |
121 if (FALSE == fdc_is_integer) { | 302 if (FALSE == fdc_is_integer) { |
122 | |
123 if (length(data_column_indices) > 0) { | 303 if (length(data_column_indices) > 0) { |
124 first_data_column <- data_column_indices[1] | 304 first_data_column <- data_column_indices[1] |
125 } else { | 305 } else { |
126 stop(paste("failed to convert firstDataColumn:", first_data_column)) | 306 stop(paste("failed to convert firstDataColumn:", first_data_column)) |
127 } | 307 } |
128 } | 308 } |
129 | 309 |
130 quant_data0 <- full_data[first_data_column:length(full_data)] | 310 ``` |
311 | |
312 ```{r echo = FALSE, results = 'asis'} | |
313 #```{r echo = FALSE, results = 'asis'} | |
131 quant_data <- full_data[first_data_column:length(full_data)] | 314 quant_data <- full_data[first_data_column:length(full_data)] |
315 quant_data[quant_data == 0] <- NA | |
316 rownames(quant_data) <- full_data$Phosphopeptide | |
317 # Get factors -> group replicates (as indicated by terminal letter) | |
318 # by the preceding digits; | |
319 # Assuming that regex_sample_names <- "\\.(\\d+)[A-Z]$" | |
320 # get factors -> | |
321 # group runs (samples) by ignoring terminal [A-Z] in sample names | |
322 # e.g. | |
323 # group .1A .1B .1C into group 1; | |
324 # group .2A .2B .2C, into group 2; | |
325 # etc. | |
326 m <- regexpr(regex_sample_names, colnames(quant_data), perl = TRUE) | |
327 sample_name_matches <- regmatches(names(quant_data), m) | |
328 colnames(quant_data) <- sample_name_matches | |
329 | |
330 write_debug_file(quant_data) | |
331 | |
332 m2 <- regexpr(regex_sample_grouping, sample_name_matches, perl = TRUE) | |
333 sample_factor_levels <- as.factor(regmatches(sample_name_matches, m2)) | |
334 number_of_samples <- length(sample_name_matches) | |
335 sample_factor_df <- data.frame( | |
336 sample = sample_name_matches, | |
337 level = sample_factor_levels | |
338 ) | |
339 print_data_frame_latex( | |
340 x = sample_factor_df, | |
341 justification = "c c", | |
342 centered = TRUE, | |
343 caption = "Factor level", | |
344 anchor = "h" | |
345 ) | |
346 ``` | |
347 ```{r echo = FALSE, results = 'asis'} | |
348 cat("\\newpage\n") | |
349 ``` | |
350 | |
351 ### Are the log-transformed sample distributions similar? | |
352 | |
353 ```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'} | |
354 | |
132 quant_data[quant_data == 0] <- NA #replace 0 with NA | 355 quant_data[quant_data == 0] <- NA #replace 0 with NA |
133 quant_data_log <- log10(quant_data) | 356 quant_data_log <- log10(quant_data) |
134 | 357 |
135 rownames(quant_data_log) <- full_data$Phosphopeptide | 358 rownames(quant_data_log) <- rownames(quant_data) |
359 colnames(quant_data_log) <- sample_name_matches | |
360 | |
361 write_debug_file(quant_data_log) | |
136 | 362 |
137 # data visualization | 363 # data visualization |
138 old_par <- par( | 364 old_par <- par( |
139 mai = par("mai") + c(0.5, 0, 0, 0) | 365 mai = par("mai") + c(0.5, 0, 0, 0) |
140 ) | 366 ) |
367 # ref: https://r-charts.com/distribution/add-points-boxplot/ | |
368 # Vertical plot | |
141 boxplot( | 369 boxplot( |
142 quant_data_log | 370 quant_data_log |
143 , las = 2 | 371 , las = 1 |
372 , col = const_boxplot_fill | |
144 ) | 373 ) |
374 # Points | |
375 stripchart( | |
376 quant_data_log, # Data | |
377 method = "jitter", # Random noise | |
378 jitter = const_stripchart_jitter, | |
379 pch = 19, # Pch symbols | |
380 cex = const_stripchart_cex, # Size of symbols reduced | |
381 col = "goldenrod", # Color of the symbol | |
382 vertical = TRUE, # Vertical mode | |
383 add = TRUE # Add it over | |
384 ) | |
145 par(old_par) | 385 par(old_par) |
146 | 386 |
147 | 387 |
148 | 388 |
149 cat("\\newline\n") | 389 cat("\n\n\n") |
150 cat("\\newline\n") | 390 cat("\n\n\n") |
151 | 391 |
152 ``` | 392 ``` |
153 | 393 |
154 ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 4), warning = FALSE} | 394 ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 4), results = 'asis'} |
155 quant_data_log_stack <- stack(quant_data_log) | |
156 library(ggplot2) | 395 library(ggplot2) |
157 ggplot( | 396 if (nrow(quant_data_log) > 1) { |
158 quant_data_log_stack, | 397 quant_data_log_stack <- stack(quant_data_log) |
159 aes(x = values)) + geom_density(aes(group = ind, colour = ind)) | 398 ggplot( |
160 ``` | 399 quant_data_log_stack, |
161 | 400 aes(x = values) |
162 ### Globally, are phosphopeptide intensities are approximately unimodal? | 401 ) + |
402 geom_density( | |
403 aes(group = ind, colour = ind), | |
404 na.rm = TRUE | |
405 ) | |
406 } else { | |
407 cat("No density plot because there are too few peptides.\n\n") | |
408 } | |
409 ``` | |
410 | |
411 ### Globally, are peptide intensities are approximately unimodal? | |
163 | 412 |
164 <!-- | 413 <!-- |
165 # ref for bquote below particularly and plotting math expressions generally: | 414 # ref for bquote below particularly and plotting math expressions generally: |
166 # https://www.r-bloggers.com/2018/03/math-notation-for-r-plot-titles-expression-and-bquote/ | 415 # https://www.r-bloggers.com/2018/03/math-notation-for-r-plot-titles-expression-and-bquote/ |
167 --> | 416 --> |
168 ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 5)} | 417 ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 5), results = 'asis'} |
169 | 418 |
170 # identify the location of missing values | 419 # identify the location of missing values |
171 fin <- is.finite(as.numeric(as.matrix(quant_data_log))) | 420 fin <- is.finite(as.numeric(as.matrix(quant_data_log))) |
172 | 421 |
173 logvalues <- as.numeric(as.matrix(quant_data_log))[fin] | 422 logvalues <- as.numeric(as.matrix(quant_data_log))[fin] |
183 , main = bquote("Frequency vs." ~ log[10](intensity)) | 432 , main = bquote("Frequency vs." ~ log[10](intensity)) |
184 , xlab = bquote(log[10](intensity)) | 433 , xlab = bquote(log[10](intensity)) |
185 ) | 434 ) |
186 ``` | 435 ``` |
187 | 436 |
188 ### Distribution of standard deviations of phosphopeptides, ignoring missing values: | 437 ### Distribution of standard deviations of $log_{10}(\text{intensity})$, ignoring missing values: |
189 | 438 |
190 ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 5)} | 439 ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 5), results = 'asis'} |
191 # determine quantile | 440 # determine quantile |
192 q1 <- quantile(logvalues, probs = mean_percentile)[1] | 441 q1 <- quantile(logvalues, probs = mean_percentile)[1] |
193 | 442 |
194 # determine standard deviation of quantile to impute | 443 # determine standard deviation of quantile to impute |
195 sd_finite <- function(x) { | 444 sd_finite <- function(x) { |
196 ok <- is.finite(x) | 445 ok <- is.finite(x) |
197 sd(x[ok]) * sd_percentile | 446 sd(x[ok]) |
198 } | 447 } |
199 # 1 = row of matrix (ie, phosphopeptide) | 448 # 1 = row of matrix (ie, phosphopeptide) |
200 sds <- apply(quant_data_log, 1, sd_finite) | 449 sds <- apply(quant_data_log, 1, sd_finite) |
201 plot( | 450 if (sum(!is.na(sds)) > 2) { |
202 density(sds, na.rm = T) | 451 plot( |
203 , main = "Smoothed estimated probability density vs. std. deviation" | 452 density(sds, na.rm = T) |
204 , sub = "(probability estimation made with Gaussian smoothing)" | 453 , main = "Smoothed estimated probability density vs. std. deviation" |
205 ) | 454 , sub = "(probability estimation made with Gaussian smoothing)" |
206 | 455 ) |
207 m1 <- median(sds, na.rm = T) #sd to be used is the median sd | 456 } else { |
208 | 457 cat( |
209 ``` | 458 "At least two non-missing values are required to plot", |
210 | 459 "probability density.\n\n" |
211 | 460 ) |
212 | 461 } |
213 <!-- | 462 |
214 The number of missing values are: | 463 ``` |
215 --> | 464 |
465 ```{r echo = FALSE} | |
466 # Determine number of cells to impute | |
467 temp <- quant_data[is.na(quant_data)] | |
468 | |
469 # Determine number of values to impute | |
470 number_to_impute <- length(temp) | |
471 | |
472 # Determine percent of missing values | |
473 pct_missing_values <- | |
474 round(length(temp) / (length(logvalues) + length(temp)) * 100) | |
475 ``` | |
476 | |
477 ```{r echo = FALSE} | |
478 | |
479 # prep for trt-median based imputation | |
480 | |
481 ``` | |
482 ## Impute Missing Values | |
483 | |
484 ```{r echo = FALSE} | |
485 | |
486 imp_smry_potential_before <- length(logvalues) | |
487 imp_smry_missing_before <- number_to_impute | |
488 imp_smry_pct_missing <- pct_missing_values | |
489 | |
490 ``` | |
491 | |
216 ```{r echo = FALSE} | 492 ```{r echo = FALSE} |
217 #Determine number of cells to impute | 493 #Determine number of cells to impute |
218 temp <- quant_data[is.na(quant_data)] | |
219 | |
220 #Determine number of values to impute | |
221 number_to_impute <- length(temp) | |
222 ``` | |
223 | |
224 <!-- | |
225 % of values that are missing: | |
226 --> | |
227 ```{r echo = FALSE} | |
228 pct_missing_values <- length(temp) / (length(logvalues) + length(temp)) * 100 | |
229 ``` | |
230 | |
231 <!-- | |
232 First few rows of data before imputation: | |
233 --> | |
234 ```{r echo = FALSE, results = 'asis'} | |
235 cat("\\newpage\n") | |
236 ``` | |
237 | |
238 ## Parse sample names | |
239 | |
240 Parse the names of the samples to deduce the factor level for each sample: | |
241 | |
242 ```{r echo = FALSE} | |
243 | |
244 # prep for trt-median based imputation | |
245 | |
246 # Assuming that regex_sample_names <- "\\.(\\d+)[A-Z]$" | |
247 # get factors -> | |
248 # group runs (samples) by ignoring terminal [A-Z] in sample names | |
249 | |
250 m <- regexpr(regex_sample_names, names(quant_data), perl = TRUE) | |
251 temp_matches <- regmatches(names(quant_data), m) | |
252 print("Extracted sample names") | |
253 print(temp_matches) | |
254 m2 <- regexpr(regex_sample_grouping, temp_matches, perl = TRUE) | |
255 sample_factor_levels <- as.factor(regmatches(temp_matches, m2)) | |
256 print("Factor levels") | |
257 print(sample_factor_levels) | |
258 | |
259 ``` | |
260 ## Impute missing values | |
261 | |
262 ```{r echo = FALSE} | |
263 | |
264 #Determine number of cells to impute | |
265 cat("Before imputation,", | |
266 sprintf( | |
267 "there are:\n %d peptides\n %d missing values (%2.0f%s)", | |
268 sum(rep.int(TRUE, nrow(quant_data))), | |
269 sum(is.na(quant_data)), | |
270 pct_missing_values, | |
271 "%" | |
272 ) | |
273 ) | |
274 | 494 |
275 ``` | 495 ``` |
276 ```{r echo = FALSE} | 496 ```{r echo = FALSE} |
277 | 497 |
278 #Impute data | 498 #Impute data |
280 | 500 |
281 # Identify which values are missing and need to be imputed | 501 # Identify which values are missing and need to be imputed |
282 ind <- which(is.na(quant_data_imp), arr.ind = TRUE) | 502 ind <- which(is.na(quant_data_imp), arr.ind = TRUE) |
283 | 503 |
284 ``` | 504 ``` |
285 ```{r echo = FALSE} | 505 ```{r echo = FALSE, results = 'asis'} |
286 | 506 |
287 # Apply imputation | 507 # Apply imputation |
288 switch( | 508 switch( |
289 imputation_method | 509 imputation_method |
290 , "group-median" = { | 510 , "group-median" = { |
291 cat("Imputation method:\n substitute missing value", | 511 imputation_method_description <- |
292 "with median peptide-intensity for sample-group\n") | 512 paste("Substitute missing value with", |
293 | 513 "median peptide intensity for sample group\n" |
514 ) | |
294 sample_level_integers <- as.integer(sample_factor_levels) | 515 sample_level_integers <- as.integer(sample_factor_levels) |
295 for (i in seq_len(length(levels(sample_factor_levels)))) { | 516 for (i in seq_len(length(levels(sample_factor_levels)))) { |
296 level_cols <- i == sample_level_integers | 517 level_cols <- i == sample_level_integers |
297 ind <- which(is.na(quant_data_imp[, level_cols]), arr.ind = TRUE) | 518 ind <- which(is.na(quant_data_imp[, level_cols]), arr.ind = TRUE) |
298 quant_data_imp[ind, level_cols] <- | 519 quant_data_imp[ind, level_cols] <- |
299 apply(quant_data_imp[, level_cols], 1, median, na.rm = T)[ind[, 1]] | 520 apply(quant_data_imp[, level_cols], 1, median, na.rm = T)[ind[, 1]] |
300 } | 521 } |
301 good_rows <- !is.na(rowMeans(quant_data_imp)) | 522 good_rows <- !is.na(rowMeans(quant_data_imp)) |
302 } | 523 } |
303 , "median" = { | 524 , "median" = { |
304 cat("Imputation method:\n substitute missing value with", | 525 imputation_method_description <- |
305 "median peptide-intensity across all sample classes\n") | 526 paste("Substitute missing value with", |
527 "median peptide intensity across all sample classes\n" | |
528 ) | |
306 quant_data_imp[ind] <- apply(quant_data_imp, 1, median, na.rm = T)[ind[, 1]] | 529 quant_data_imp[ind] <- apply(quant_data_imp, 1, median, na.rm = T)[ind[, 1]] |
307 good_rows <- !is.na(rowMeans(quant_data_imp)) | 530 good_rows <- !is.na(rowMeans(quant_data_imp)) |
308 } | 531 } |
309 , "mean" = { | 532 , "mean" = { |
310 cat("Imputation method:\n substitute missing value with", | 533 imputation_method_description <- |
311 "mean peptide-intensity across all sample classes\n") | 534 paste("Substitute missing value with", |
535 "mean peptide intensity across all sample classes\n" | |
536 ) | |
312 quant_data_imp[ind] <- apply(quant_data_imp, 1, mean, na.rm = T)[ind[, 1]] | 537 quant_data_imp[ind] <- apply(quant_data_imp, 1, mean, na.rm = T)[ind[, 1]] |
313 good_rows <- !is.na(rowMeans(quant_data_imp)) | 538 good_rows <- !is.na(rowMeans(quant_data_imp)) |
314 } | 539 } |
315 , "random" = { | 540 , "random" = { |
316 cat( | 541 m1 <- median(sds, na.rm = T) * sd_percentile #sd to be used is the median sd |
317 "Imputation method:\n substitute missing value with\n ", | 542 # If you want results to be reproducible, you will want to call |
318 sprintf( | 543 # base::set.seed before calling stats::rnorm |
319 "random intensity N ~ (%0.2f, %0.2f)\n" | 544 imputation_method_description <- |
320 , q1, m1 | 545 paste("Substitute each missing value with random intensity", |
321 ) | 546 sprintf( |
322 ) | 547 "random intensity $N \\sim (%0.2f, %0.2f)$\n", |
323 quant_data_imp[is.na(quant_data_imp)] <- | 548 q1, m1 |
549 ) | |
550 ) | |
551 cat(sprintf("mean_percentile (from input parameter) is %2.0f\n\n", | |
552 100 * mean_percentile)) | |
553 cat(sprintf("sd_percentile (from input parameter) is %0.2f\n\n", | |
554 sd_percentile)) | |
555 #ACE cat(sprintf("sd for rnorm is %0.4f\n\n", m1)) | |
556 quant_data_imp[ind] <- | |
324 10 ^ rnorm(number_to_impute, mean = q1, sd = m1) | 557 10 ^ rnorm(number_to_impute, mean = q1, sd = m1) |
325 good_rows <- !is.na(rowMeans(quant_data_imp)) | 558 good_rows <- !is.na(rowMeans(quant_data_imp)) |
326 } | 559 } |
327 ) | 560 ) |
328 | 561 |
329 ``` | 562 if (length(good_rows) < 1) { |
563 print("ERROR: Cannot impute data; there are no good rows!") | |
564 return(-1) | |
565 } | |
566 ``` | |
567 | |
568 ```{r echo = FALSE, results = 'asis'} | |
569 cat("\\quad\n\nImputation method:\n\n\n", imputation_method_description) | |
570 ``` | |
571 | |
330 ```{r echo = FALSE} | 572 ```{r echo = FALSE} |
331 | 573 |
332 #Determine number of cells to impute | 574 imp_smry_potential_after <- sum(good_rows) |
333 temp <- quant_data_imp[is.na(quant_data_imp)] | 575 imp_smry_rejected_after <- sum(!good_rows) |
334 cat("After imputation, there are:", | 576 imp_smry_missing_after <- sum(is.na(quant_data_imp[good_rows, ])) |
577 ``` | |
578 ```{r echo = FALSE, results = 'asis'} | |
579 # ref: http://www1.maths.leeds.ac.uk/latex/TableHelp1.pdf | |
580 tabular_lines <- paste( | |
581 "\\begin{table}[hb]", # h(inline); b(bottom); t (top) or p (separate page) | |
582 " \\caption{Imputation Results}", | |
583 " \\centering", # \centering centers the table on the page | |
584 " \\begin{tabular}{l c c c}", | |
585 " \\hline\\hline", | |
586 " \\ & potential peptides & missing values & rejected", | |
587 " peptides \\\\ [0.5ex]", | |
588 " \\hline", | |
589 " before imputation & %d & %d (%d\\%s) & \\\\", | |
590 " after imputation & %d & %d & %d \\\\ [1ex]", | |
591 " \\hline", | |
592 " \\end{tabular}", | |
593 #" \\label{table:nonlin}", # may be used to refer this table in the text | |
594 "\\end{table}", | |
595 sep = "\n" | |
596 ) | |
597 tabular_lines <- | |
335 sprintf( | 598 sprintf( |
336 "\n %d missing values\n %d usable peptides analysis" | 599 tabular_lines, |
337 , sum(is.na(quant_data_imp[good_rows, ])) | 600 imp_smry_potential_before, |
338 , sum(good_rows) | 601 imp_smry_missing_before, |
339 ), | 602 imp_smry_pct_missing, |
340 sprintf( | 603 "%", |
341 "\n %d peptides with too many missing values for further analysis" | 604 imp_smry_potential_after, |
342 , sum(!good_rows) | 605 imp_smry_missing_after, |
343 ) | 606 imp_smry_rejected_after |
344 ) | 607 ) |
608 cat(tabular_lines) | |
345 ``` | 609 ``` |
346 ```{r echo = FALSE} | 610 ```{r echo = FALSE} |
347 | 611 |
348 | 612 |
349 # Zap rows where imputation was ineffective | 613 # Zap rows where imputation was ineffective |
350 full_data <- full_data [good_rows, ] | 614 full_data <- full_data [good_rows, ] |
351 quant_data <- quant_data [good_rows, ] | 615 quant_data <- quant_data [good_rows, ] |
616 | |
617 write_debug_file(quant_data_imp) | |
618 | |
352 quant_data_imp <- quant_data_imp[good_rows, ] | 619 quant_data_imp <- quant_data_imp[good_rows, ] |
353 | 620 quant_data_imp_good_rows <- quant_data_imp |
354 ``` | 621 |
355 ```{r echo = FALSE} | 622 write_debug_file(quant_data_imp_good_rows) |
356 | 623 ``` |
357 d_combined <- (density(as.numeric(as.matrix( | 624 |
358 log10(quant_data_imp) | 625 ```{r echo = FALSE, results = 'asis'} |
359 )))) | 626 |
627 can_plot_before_after_imp <- TRUE | |
628 d_combined <- | |
629 as.numeric( | |
630 as.matrix( | |
631 log10(quant_data_imp) | |
632 ) | |
633 ) | |
360 d_original <- | 634 d_original <- |
361 density(as.numeric(as.matrix( | 635 as.numeric( |
362 log10(quant_data_imp[!is.na(quant_data)])))) | 636 as.matrix( |
363 | 637 log10(quant_data_imp[!is.na(quant_data)]) |
364 ``` | 638 ) |
365 ```{r echo = FALSE} | 639 ) |
640 | |
641 if (sum(!is.na(d_combined)) > 2 && sum(!is.na(d_original)) > 2) { | |
642 d_combined <- density(d_combined) | |
643 d_original <- density(d_original) | |
644 } else { | |
645 can_plot_before_after_imp <- FALSE | |
646 } | |
366 | 647 |
367 if (sum(is.na(quant_data)) > 0) { | 648 if (sum(is.na(quant_data)) > 0) { |
368 # There ARE missing values | 649 # There ARE missing values |
369 d_imputed <- | 650 d_imputed <- |
370 (density(as.numeric(as.matrix( | 651 as.numeric( |
371 log10(quant_data_imp[is.na(quant_data)]) | 652 as.matrix( |
372 )))) | 653 log10(quant_data_imp[is.na(quant_data)]) |
654 ) | |
655 ) | |
656 if (sum(!is.na(d_combined)) > 2) { | |
657 d_imputed <- (density(d_imputed)) | |
658 } else { | |
659 can_plot_before_after_imp <- FALSE | |
660 } | |
373 } else { | 661 } else { |
374 # There are NO missing values | 662 # There are NO missing values |
375 d_imputed <- d_combined | 663 d_imputed <- d_combined |
376 } | 664 } |
377 | 665 |
378 ``` | 666 ``` |
379 | 667 |
380 ```{r echo = FALSE, fig.dim = c(9, 5)} | 668 ```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'} |
381 ylim <- c(0, max(d_combined$y, d_original$y, d_imputed$y)) | 669 if (sum(is.na(quant_data)) > 0) { |
382 plot( | 670 cat("\\leavevmode\\newpage\n") |
383 d_combined, | 671 # data visualization |
384 ylim = ylim, | 672 old_par <- par( |
385 sub = "Blue = data before imputation; Red = imputed data", | 673 mai = par("mai") + c(0.5, 0, 0, 0) |
386 main = "Density vs. log10(intensity) before and after imputation" | 674 ) |
387 ) | 675 x <- quant_data |
388 lines(d_original, col = "blue") | 676 x <- blue_dots <- x / x |
389 lines(d_imputed, col = "red") | 677 blue_dots <- log10(blue_dots * quant_data) |
678 x[is.na(x)] <- 0 | |
679 x[x == 1] <- NA | |
680 x[x == 0] <- 1 | |
681 quant_data_imp_log10 <- | |
682 log10(quant_data_imp) | |
683 | |
684 write_debug_file(quant_data_imp_log10) | |
685 | |
686 red_dots <- quant_data_imp_log10 * x | |
687 ylim <- c( | |
688 min(red_dots, blue_dots, na.rm = TRUE), | |
689 max(red_dots, blue_dots, na.rm = TRUE) | |
690 ) | |
691 # ref: https://r-charts.com/distribution/add-points-boxplot/ | |
692 # Vertical plot | |
693 colnames(blue_dots) <- sample_name_matches | |
694 boxplot( | |
695 blue_dots | |
696 , las = 1 # "always horizontal" | |
697 , col = const_boxplot_fill | |
698 , ylim = ylim | |
699 , main = "Peptide intensities before and after imputation" | |
700 , sub = "Light blue = data before imputation; Red = imputed data" | |
701 , xlab = "Sample" | |
702 , ylab = "log10(peptide intensity)" | |
703 ) | |
704 # Points | |
705 # NA values are not plotted | |
706 stripchart( | |
707 blue_dots, # Data | |
708 method = "jitter", # Random noise | |
709 jitter = const_stripchart_jitter, | |
710 pch = 19, # Pch symbols | |
711 cex = const_stripsmall_cex, # Size of symbols reduced | |
712 col = "lightblue", # Color of the symbol | |
713 vertical = TRUE, # Vertical mode | |
714 add = TRUE # Add it over | |
715 ) | |
716 stripchart( | |
717 red_dots, # Data | |
718 method = "jitter", # Random noise | |
719 jitter = const_stripchart_jitter, | |
720 pch = 19, # Pch symbols | |
721 cex = const_stripsmall_cex, # Size of symbols reduced | |
722 col = "red", # Color of the symbol | |
723 vertical = TRUE, # Vertical mode | |
724 add = TRUE # Add it over | |
725 ) | |
726 par(old_par) | |
727 | |
728 # density plot | |
729 cat("\\leavevmode\n\n\n\n\n\n\n") | |
730 if (can_plot_before_after_imp) { | |
731 ylim <- c(0, max(d_combined$y, d_original$y, d_imputed$y)) | |
732 plot( | |
733 d_combined, | |
734 ylim = ylim, | |
735 sub = | |
736 paste( | |
737 "Blue = data before imputation; Red = imputed data;", | |
738 "Black = combined" | |
739 ), | |
740 main = "Density of peptide intensity before and after imputation", | |
741 xlab = "log10(peptide intensity)", | |
742 ylab = "Probability density" | |
743 ) | |
744 lines(d_original, col = "blue") | |
745 lines(d_imputed, col = "red") | |
746 } else { | |
747 cat( | |
748 "There are too few points to plot the density of peptide intensity", | |
749 "before and after imputation." | |
750 ) | |
751 } | |
752 cat("\\leavevmode\\newpage\n") | |
753 } | |
390 ``` | 754 ``` |
391 | 755 |
392 ## Perform Quantile Normalization | 756 ## Perform Quantile Normalization |
393 | 757 |
394 <!-- | 758 <!-- |
446 # Normalization Methods for High Density Oligonucleotide Array Data Based on Bias | 810 # Normalization Methods for High Density Oligonucleotide Array Data Based on Bias |
447 # and Variance. Bioinformatics 19(2), pp 185-193. DOI 10.1093/bioinformatics/19.2.185 | 811 # and Variance. Bioinformatics 19(2), pp 185-193. DOI 10.1093/bioinformatics/19.2.185 |
448 # http://bmbolstad.com/misc/normalize/normalize.html | 812 # http://bmbolstad.com/misc/normalize/normalize.html |
449 # ... | 813 # ... |
450 --> | 814 --> |
451 ```{r echo = FALSE} | 815 ```{r echo = FALSE, results = 'asis'} |
452 library(preprocessCore) | 816 library(preprocessCore) |
453 | 817 |
454 if (TRUE) { | 818 if (nrow(quant_data_imp) > 0) { |
455 quant_data_imp_qn <- normalize.quantiles(as.matrix(quant_data_imp)) | 819 quant_data_imp_qn <- normalize.quantiles(as.matrix(quant_data_imp)) |
456 } else { | 820 } else { |
457 quant_data_imp_qn <- as.matrix(quant_data_imp) | 821 quant_data_imp_qn <- as.matrix(quant_data_imp) |
458 } | 822 } |
459 | 823 |
460 quant_data_imp_qn <- as.data.frame(quant_data_imp_qn) | 824 quant_data_imp_qn <- as.data.frame(quant_data_imp_qn) |
461 names(quant_data_imp_qn) <- names(quant_data_imp) | 825 names(quant_data_imp_qn) <- names(quant_data_imp) |
826 | |
827 write_debug_file(quant_data_imp_qn) | |
828 | |
462 quant_data_imp_qn_log <- log10(quant_data_imp_qn) | 829 quant_data_imp_qn_log <- log10(quant_data_imp_qn) |
463 | 830 rownames(quant_data_imp_qn_log) <- rownames(quant_data_imp) |
464 rownames(quant_data_imp_qn_log) <- full_data[, 1] | 831 |
832 write_debug_file(quant_data_imp_qn_log) | |
465 | 833 |
466 quant_data_imp_qn_ls <- t(scale(t(log10(quant_data_imp_qn)))) | 834 quant_data_imp_qn_ls <- t(scale(t(log10(quant_data_imp_qn)))) |
835 | |
467 any_nan <- function(x) { | 836 any_nan <- function(x) { |
468 !any(x == "NaN") | 837 !any(x == "NaN") |
469 } | 838 } |
470 sel <- apply(quant_data_imp_qn_ls, 1, any_nan) | 839 sel <- apply(quant_data_imp_qn_ls, 1, any_nan) |
471 quant_data_imp_qn_ls2 <- quant_data_imp_qn_ls[which(sel), ] | 840 quant_data_imp_qn_ls2 <- quant_data_imp_qn_ls[which(sel), ] |
472 quant_data_imp_qn_ls2 <- as.data.frame(quant_data_imp_qn_ls2) | 841 quant_data_imp_qn_ls2 <- as.data.frame(quant_data_imp_qn_ls2) |
842 rownames(quant_data_imp_qn_ls2) <- rownames(quant_data_imp)[sel] | |
843 | |
844 quant_data_imp_qn_ls <- as.data.frame(quant_data_imp_qn_ls) | |
845 rownames(quant_data_imp_qn_ls) <- rownames(quant_data_imp) | |
846 | |
847 write_debug_file(quant_data_imp_qn_ls) | |
848 write_debug_file(quant_data_imp_qn_ls2) | |
473 | 849 |
474 #output quantile normalized data | 850 #output quantile normalized data |
475 data_table_imp_qn_lt <- cbind(full_data[1:9], quant_data_imp_qn_log) | 851 data_table_imp_qn_lt <- cbind(full_data[1:9], quant_data_imp_qn_log) |
476 write.table( | 852 write.table( |
477 data_table_imp_qn_lt, | 853 data_table_imp_qn_lt, |
478 file = paste(paste( | 854 file = imp_qn_lt_data_filenm, |
479 strsplit(imputed_data_filename, ".txt"), "QN_LT", sep = "_" | |
480 ), ".txt", sep = ""), | |
481 sep = "\t", | 855 sep = "\t", |
482 col.names = TRUE, | 856 col.names = TRUE, |
483 row.names = FALSE | 857 row.names = FALSE, |
858 quote = FALSE | |
484 ) | 859 ) |
485 | 860 |
486 ``` | 861 ``` |
487 | 862 |
488 <!-- ACE insertion begin --> | 863 <!-- ACE insertion begin --> |
489 ### Checking that normalized, imputed, log-transformed sample distributions are similar: | 864 ### Checking that normalized, imputed, log-transformed sample distributions are similar: |
490 | 865 |
491 ```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'} | 866 ```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'} |
492 | |
493 | 867 |
494 # Save unimputed quant_data_log for plotting below | 868 # Save unimputed quant_data_log for plotting below |
495 unimputed_quant_data_log <- quant_data_log | 869 unimputed_quant_data_log <- quant_data_log |
496 | 870 |
497 # log10 transform (after preparing for zero values, | 871 # log10 transform (after preparing for zero values, |
498 # which should never happen...) | 872 # which should never happen...) |
499 quant_data_imp_qn[quant_data_imp_qn == 0] <- .000000001 | 873 quant_data_imp_qn[quant_data_imp_qn == 0] <- .000000001 |
500 quant_data_log <- log10(quant_data_imp_qn) | 874 quant_data_log <- log10(quant_data_imp_qn) |
501 | 875 |
502 # Output quantile-normalized log-transformed dataset | 876 # Output with imputed, un-normalized data |
503 # with imputed, normalized data | 877 |
504 | 878 data_table_imputed <- cbind(full_data[1:9], quant_data_imp) |
505 data_table_imputed <- cbind(full_data[1:9], quant_data_log) | |
506 write.table( | 879 write.table( |
507 data_table_imputed | 880 data_table_imputed |
508 , file = imputed_data_filename | 881 , file = imputed_data_filename |
509 , sep = "\t" | 882 , sep = "\t" |
510 , col.names = TRUE | 883 , col.names = TRUE |
511 , row.names = FALSE | 884 , row.names = FALSE |
512 , quote = FALSE | 885 , quote = FALSE |
513 ) | 886 ) |
514 | 887 |
515 | 888 how_many_peptides <- nrow(quant_data_log) |
516 | 889 |
517 # data visualization | 890 if ((how_many_peptides) > 0) { |
518 old_par <- par( | 891 cat( |
519 mai = par("mai") + c(0.5, 0, 0, 0) | 892 sprintf( |
520 , oma = par("oma") + c(0.5, 0, 0, 0) | 893 "Intensities for %d peptides:\n\n\n", |
521 ) | 894 how_many_peptides |
522 boxplot( | 895 ) |
523 quant_data_log | 896 ) |
524 , las = 2 | 897 cat("\n\n\n") |
525 ) | 898 |
526 par(old_par) | 899 |
527 | 900 # data visualization |
528 | 901 old_par <- par( |
529 | 902 mai = par("mai") + c(0.5, 0, 0, 0) |
530 cat("\\newline\n") | 903 , oma = par("oma") + c(0.5, 0, 0, 0) |
531 cat("\\newline\n") | 904 ) |
532 | 905 # ref: https://r-charts.com/distribution/add-points-boxplot/ |
533 ``` | 906 # Vertical plot |
534 | 907 colnames(quant_data_log) <- sample_name_matches |
535 ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 4)} | 908 boxplot( |
536 quant_data_log_stack <- stack(quant_data_log) | 909 quant_data_log |
537 ggplot( | 910 , las = 1 |
538 quant_data_log_stack, | 911 , col = const_boxplot_fill |
539 aes(x = values) | 912 ) |
540 ) + geom_density(aes(group = ind, colour = ind)) | 913 # Points |
914 stripchart( | |
915 quant_data_log, # Data | |
916 method = "jitter", # Random noise | |
917 jitter = const_stripchart_jitter, | |
918 pch = 19, # Pch symbols | |
919 cex = const_stripchart_cex, # Size of symbols reduced | |
920 col = "goldenrod", # Color of the symbol | |
921 vertical = TRUE, # Vertical mode | |
922 add = TRUE # Add it over | |
923 ) | |
924 par(old_par) | |
925 } else { | |
926 cat("There are no peptides to plot\n") | |
927 } | |
928 | |
929 cat("\n\n\n") | |
930 | |
931 ``` | |
932 | |
933 ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 4), results = 'asis'} | |
934 if (nrow(quant_data_log) > 1) { | |
935 quant_data_log_stack <- stack(quant_data_log) | |
936 ggplot( | |
937 quant_data_log_stack, | |
938 aes(x = values) | |
939 ) + | |
940 geom_density( | |
941 aes(group = ind, colour = ind), | |
942 na.rm = TRUE | |
943 ) | |
944 } else { | |
945 cat("No density plot because there are fewer than two peptides to plot.\n\n") | |
946 } | |
947 ``` | |
948 ```{r echo = FALSE, results = 'asis'} | |
949 cat("\\leavevmode\\newpage\n") | |
541 ``` | 950 ``` |
542 | 951 |
543 ## Perform ANOVA filters | 952 ## Perform ANOVA filters |
544 | |
545 (see following pages) | |
546 | 953 |
547 ```{r, echo = FALSE} | 954 ```{r, echo = FALSE} |
548 # Make new data frame containing only Phosphopeptides | 955 # Make new data frame containing only Phosphopeptides |
549 # to connect preANOVA to ANOVA (connect_df) | 956 # to connect preANOVA to ANOVA (connect_df) |
550 connect_df <- data.frame( | 957 connect_df <- data.frame( |
553 ) | 960 ) |
554 colnames(connect_df) <- c("Phosphopeptide", "Intensity") | 961 colnames(connect_df) <- c("Phosphopeptide", "Intensity") |
555 ``` | 962 ``` |
556 | 963 |
557 ```{r echo = FALSE, fig.dim = c(9, 10), results = 'asis'} | 964 ```{r echo = FALSE, fig.dim = c(9, 10), results = 'asis'} |
558 # Get factors -> group replicates (as indicated by terminal letter) | 965 count_of_factor_levels <- length(levels(sample_factor_levels)) |
559 # by the preceding digits; | 966 if (count_of_factor_levels < 2) { |
560 # e.g., group .1A .1B .1C into group 1; .2A .2B .2C, into group 2; etc.. | |
561 m <- | |
562 regexpr(regex_sample_names, names(quant_data_imp_qn_log), perl = TRUE) | |
563 | |
564 temp_matches <- regmatches(names(quant_data_imp_qn_log), m) | |
565 | |
566 number_of_samples <- length(temp_matches) | |
567 | |
568 m2 <- regexpr(regex_sample_grouping, temp_matches, perl = TRUE) | |
569 | |
570 | |
571 sample_factor_levels <- as.factor(regmatches(temp_matches, m2)) | |
572 | |
573 if (length(levels(sample_factor_levels)) < 2) { | |
574 nuke_control_sequences <- | 967 nuke_control_sequences <- |
575 function(s) { | 968 function(s) { |
576 s <- gsub("[\\]", "xyzzy_plugh", s) | 969 s <- gsub("[\\]", "xyzzy_plugh", s) |
577 s <- gsub("[$]", "\\\\$", s) | 970 s <- gsub("[$]", "\\\\$", s) |
578 s <- gsub("xyzzy_plugh", "$\\\\backslash$", s) | 971 s <- gsub("xyzzy_plugh", "$\\\\backslash$", s) |
582 "ERROR!!!! Cannot perform ANOVA analysis", | 975 "ERROR!!!! Cannot perform ANOVA analysis", |
583 "(see next page)\\newpage\n" | 976 "(see next page)\\newpage\n" |
584 ) | 977 ) |
585 cat( | 978 cat( |
586 "ERROR: ANOVA analysis", | 979 "ERROR: ANOVA analysis", |
587 "requires two or more factor levels!\\newline\n" | 980 "requires two or more factor levels!\n\n\n" |
588 ) | 981 ) |
589 | 982 |
590 cat("\\newline\\newline\n") | 983 cat("\n\n\n\n\n") |
591 cat("Unparsed sample names are:\\newline\n", | 984 cat("Unparsed sample names are:\n\n\n", |
592 "\\begin{quote}\n", | 985 "\\begin{quote}\n", |
593 paste(names(quant_data_imp_qn_log), collapse = "\\newline\n"), | 986 paste(names(quant_data_imp_qn_log), collapse = "\n\n\n"), |
594 "\n\\end{quote}\n\n") | 987 "\n\\end{quote}\n\n") |
595 | 988 |
596 regex_sample_names <- nuke_control_sequences(regex_sample_names) | 989 regex_sample_names <- nuke_control_sequences(regex_sample_names) |
597 | 990 |
598 cat("\\leavevmode\\newline\n") | 991 cat("\\leavevmode\n\n\n") |
599 cat("Parsing rule for SampleNames is", | 992 cat("Parsing rule for SampleNames is", |
600 "\\newline\n", | 993 "\n\n\n", |
601 "\\text{'", | 994 "\\text{'", |
602 regex_sample_names, | 995 regex_sample_names, |
603 "'}\\newline\n", | 996 "'}\n\n\n", |
604 sep = "" | 997 sep = "" |
605 ) | 998 ) |
606 | 999 |
607 cat("\nParsed sample names are:\n", | 1000 cat("\nParsed sample names are:\n", |
608 "\\begin{quote}\n", | 1001 "\\begin{quote}\n", |
609 paste(temp_matches, collapse = "\\newline\n"), | 1002 paste(sample_name_matches, collapse = "\n\n\n"), |
610 "\n\\end{quote}\n\n") | 1003 "\n\\end{quote}\n\n") |
611 | 1004 |
612 regex_sample_grouping <- nuke_control_sequences(regex_sample_grouping) | 1005 regex_sample_grouping <- nuke_control_sequences(regex_sample_grouping) |
613 | 1006 |
614 cat("\\leavevmode\\newline\n") | 1007 cat("\\leavevmode\n\n\n") |
615 cat("Parsing rule for SampleGrouping is", | 1008 cat("Parsing rule for SampleGrouping is", |
616 "\\newline\n", | 1009 "\n\n\n", |
617 "\\text{'", | 1010 "\\text{'", |
618 regex_sample_grouping, | 1011 regex_sample_grouping, |
619 "'}\\newline\n", | 1012 "'}\n\n\n", |
620 sep = "" | 1013 sep = "" |
621 ) | 1014 ) |
622 | 1015 |
623 cat("\\newline\n") | 1016 cat("\n\n\n") |
624 cat("Sample group assignments are:\n", | 1017 cat("Sample group assignments are:\n", |
625 "\\begin{quote}\n", | 1018 "\\begin{quote}\n", |
626 paste(regmatches(temp_matches, m2), collapse = "\\newline\n"), | 1019 paste(regmatches(sample_name_matches, m2), collapse = "\n\n\n"), |
627 "\n\\end{quote}\n\n") | 1020 "\n\\end{quote}\n\n") |
628 | 1021 |
629 } else { | 1022 } else { |
630 p_value_data_anova_ps <- | 1023 p_value_data_anova_ps <- |
631 apply( | 1024 apply( |
647 | 1040 |
648 # output ANOVA file to constructed filename, | 1041 # output ANOVA file to constructed filename, |
649 # e.g. "Outputfile_pST_ANOVA_STEP5.txt" | 1042 # e.g. "Outputfile_pST_ANOVA_STEP5.txt" |
650 # becomes "Outpufile_pST_ANOVA_STEP5_FDR0.05.txt" | 1043 # becomes "Outpufile_pST_ANOVA_STEP5_FDR0.05.txt" |
651 | 1044 |
652 # Re-output quantile-normalized log-transformed dataset | 1045 # Re-output datasets to include p-values |
653 # with imputed, normalized data to include p-values | |
654 | |
655 data_table_imputed <- | |
656 cbind(full_data[1:9], p_value_data[, 2:3], quant_data_log) | |
657 write.table( | 1046 write.table( |
658 data_table_imputed, | 1047 cbind(full_data[1:9], p_value_data[, 2:3], quant_data_imp), |
659 file = imputed_data_filename, | 1048 file = imputed_data_filename, |
660 sep = "\t", | 1049 sep = "\t", |
661 col.names = TRUE, | 1050 col.names = TRUE, |
662 row.names = FALSE, | 1051 row.names = FALSE, |
663 quote = FALSE | 1052 quote = FALSE |
664 ) | 1053 ) |
665 | 1054 |
1055 write.table( | |
1056 cbind(full_data[1:9], p_value_data[, 2:3], quant_data_imp_qn_log), | |
1057 file = imp_qn_lt_data_filenm, | |
1058 sep = "\t", | |
1059 col.names = TRUE, | |
1060 row.names = FALSE, | |
1061 quote = FALSE | |
1062 ) | |
1063 | |
666 | 1064 |
667 p_value_data <- | 1065 p_value_data <- |
668 p_value_data[order(p_value_data$fdr_adjusted_anova_p), ] | 1066 p_value_data[order(p_value_data$fdr_adjusted_anova_p), ] |
669 | 1067 |
1068 first_page_suppress <- 1 | |
1069 number_of_peptides_found <- 0 | |
670 cutoff <- val_fdr[1] | 1070 cutoff <- val_fdr[1] |
671 for (cutoff in val_fdr) { | 1071 for (cutoff in val_fdr) { |
1072 if (number_of_peptides_found > 49) { | |
1073 cat("\\leavevmode\n\n\n") | |
1074 break | |
1075 } | |
1076 | |
672 #loop through FDR cutoffs | 1077 #loop through FDR cutoffs |
673 | 1078 |
674 filtered_p <- | 1079 filtered_p <- |
675 p_value_data[ | 1080 p_value_data[ |
676 which(p_value_data$fdr_adjusted_anova_p < cutoff), | 1081 which(p_value_data$fdr_adjusted_anova_p < cutoff), |
677 , | 1082 , drop = FALSE |
678 drop = FALSE | |
679 ] | 1083 ] |
680 filtered_data_filtered <- | 1084 filtered_data_filtered <- |
681 quant_data_imp_qn_log[ | 1085 quant_data_imp_qn_log[ |
682 rownames(filtered_p), | 1086 rownames(filtered_p), |
683 , | 1087 , drop = FALSE |
684 drop = FALSE | |
685 ] | 1088 ] |
686 filtered_data_filtered <- | 1089 filtered_data_filtered <- |
687 filtered_data_filtered[ | 1090 filtered_data_filtered[ |
688 order(filtered_p$fdr_adjusted_anova_p), | 1091 order(filtered_p$fdr_adjusted_anova_p), |
689 , | 1092 , drop = FALSE |
690 drop = FALSE | |
691 ] | 1093 ] |
692 | 1094 |
693 # <!-- ACE insertion start --> | 1095 # <!-- ACE insertion start --> |
694 old_oma <- par("oma") | 1096 |
695 old_par <- par( | 1097 if (nrow(filtered_p) && nrow(filtered_data_filtered) > 0) { |
696 mai = (par("mai") + c(0.7, 0, 0, 0)) * c(1, 1, 0.3, 1), | 1098 if (first_page_suppress == 1) { |
697 oma = old_oma * c(1, 1, 0.3, 1), | 1099 first_page_suppress <- 0 |
698 cex.main = 0.9, | 1100 } |
699 cex.axis = 0.7 | 1101 else { |
700 ) | 1102 cat("\\newpage\n") |
701 | 1103 } |
702 cat("\\newpage\n") | |
703 if (nrow(filtered_data_filtered) > 0) { | |
704 cat(sprintf( | 1104 cat(sprintf( |
705 "Intensities for peptides whose adjusted p-value < %0.2f\n", | 1105 "Intensities for %d peptides whose adjusted p-value < %0.2f:\n", |
1106 nrow(filtered_data_filtered), | |
706 cutoff | 1107 cutoff |
707 )) | 1108 )) |
708 cat("\\newline\n") | 1109 cat("\n\n\n") |
709 cat("\\newline\n") | 1110 cat("\n\n\n") |
710 | 1111 |
1112 old_oma <- par("oma") | |
1113 old_par <- par( | |
1114 mai = (par("mai") + c(0.7, 0, 0, 0)) * c(1, 1, 0.3, 1), | |
1115 oma = old_oma * c(1, 1, 0.3, 1), | |
1116 cex.main = 0.9, | |
1117 cex.axis = 0.7, | |
1118 fin = c(9, 7.25) | |
1119 ) | |
1120 # ref: https://r-charts.com/distribution/add-points-boxplot/ | |
1121 # Vertical plot | |
1122 colnames(filtered_data_filtered) <- sample_name_matches | |
711 boxplot( | 1123 boxplot( |
712 filtered_data_filtered, | 1124 filtered_data_filtered, |
713 main = "Imputed, normalized intensities", # no line plot | 1125 main = "Imputed, normalized intensities", # no line plot |
714 las = 2, | 1126 las = 1, |
1127 col = const_boxplot_fill, | |
715 ylab = expression(log[10](intensity)) | 1128 ylab = expression(log[10](intensity)) |
716 ) | 1129 ) |
1130 # Points | |
1131 stripchart( | |
1132 filtered_data_filtered, # Data | |
1133 method = "jitter", # Random noise | |
1134 jitter = const_stripchart_jitter, | |
1135 pch = 19, # Pch symbols | |
1136 cex = const_stripchart_cex, # Size of symbols reduced | |
1137 col = "goldenrod", # Color of the symbol | |
1138 vertical = TRUE, # Vertical mode | |
1139 add = TRUE # Add it over | |
1140 ) | |
1141 par(old_par) | |
717 } else { | 1142 } else { |
718 cat(sprintf( | 1143 cat(sprintf( |
719 "No peptides were found to have cutoff adjusted p-value < %0.2f\n", | 1144 "%s < %0.2f\n\n\n\n\n", |
1145 "No peptides were found to have cutoff adjusted p-value <", | |
720 cutoff | 1146 cutoff |
721 )) | 1147 )) |
722 } | 1148 } |
723 par(old_par) | |
724 | 1149 |
725 if (nrow(filtered_data_filtered) > 0) { | 1150 if (nrow(filtered_data_filtered) > 0) { |
726 #Add Phosphopeptide column to anova_filtered table | 1151 #Add Phosphopeptide column to anova_filtered table |
727 anova_filtered_merge <- merge( | 1152 anova_filtered_merge <- merge( |
728 x = connect_df | 1153 x = connect_df, |
729 , | 1154 y = filtered_data_filtered, |
730 y = filtered_data_filtered | 1155 by.x = "Intensity", |
731 , | |
732 by.x = "Intensity" | |
733 , | |
734 by.y = 1 | 1156 by.y = 1 |
735 ) | 1157 ) |
736 anova_filtered_merge_order <- rownames(filtered_p) | 1158 anova_filtered_merge_order <- rownames(filtered_p) |
737 | 1159 |
738 anova_filtered_merge_format <- sapply( | 1160 anova_filtered_merge_format <- sapply( |
757 c("Phosphopeptide", colnames(filtered_data_filtered)) | 1179 c("Phosphopeptide", colnames(filtered_data_filtered)) |
758 | 1180 |
759 # Merge qualitative columns into the ANOVA data | 1181 # Merge qualitative columns into the ANOVA data |
760 output_table <- data.frame(anova_filtered$Phosphopeptide) | 1182 output_table <- data.frame(anova_filtered$Phosphopeptide) |
761 output_table <- merge( | 1183 output_table <- merge( |
762 x = output_table | 1184 x = output_table, |
763 , | 1185 y = data_table_imp_qn_lt, |
764 y = data_table_imp_qn_lt | 1186 by.x = "anova_filtered.Phosphopeptide", |
765 , | |
766 by.x = "anova_filtered.Phosphopeptide" | |
767 , | |
768 by.y = "Phosphopeptide" | 1187 by.y = "Phosphopeptide" |
769 ) | 1188 ) |
770 | 1189 |
771 # Produce heatmap to visualize significance and the effect of imputation | 1190 # Produce heatmap to visualize significance and the effect of imputation |
772 m <- | 1191 m <- |
775 matrix( | 1194 matrix( |
776 as.integer(is.na(m)), | 1195 as.integer(is.na(m)), |
777 nrow = nrow(m) | 1196 nrow = nrow(m) |
778 ) | 1197 ) |
779 ) | 1198 ) |
780 m <- m[!m_nan_rows, ] | 1199 m <- m[!m_nan_rows, , drop = FALSE] |
781 if (nrow(m) > 0) { | 1200 if (nrow(m) > 0) { |
782 rownames_m <- rownames(m) | 1201 rownames_m <- rownames(m) |
783 rownames(m) <- sapply( | 1202 rownames(m) <- sapply( |
784 X = seq_len(nrow(m)) | 1203 X = seq_len(nrow(m)) |
785 , | 1204 , |
789 filtered_p$fdr_adjusted_anova_p[i], | 1208 filtered_p$fdr_adjusted_anova_p[i], |
790 rownames_m[i] | 1209 rownames_m[i] |
791 ) | 1210 ) |
792 } | 1211 } |
793 ) | 1212 ) |
794 margins <- | |
795 c(max(nchar(colnames(m))) * 10 / 16 # col | |
796 , max(nchar(rownames(m))) * 5 / 16 # row | |
797 ) | |
798 how_many_peptides <- min(50, nrow(m)) | 1213 how_many_peptides <- min(50, nrow(m)) |
1214 number_of_peptides_found <- how_many_peptides | |
1215 if (nrow(m) > 1) { | |
1216 m_margin <- m[how_many_peptides:1, ] | |
1217 margins <- | |
1218 c(max(nchar(colnames(m_margin))) * 10 / 16 # col | |
1219 , max(nchar(rownames(m_margin))) * 5 / 16 # row | |
1220 ) | |
1221 } | |
799 | 1222 |
800 cat("\\newpage\n") | 1223 cat("\\newpage\n") |
801 if (nrow(m) > 50) { | 1224 if (nrow(m) > 50) { |
802 cat("Heatmap for the 50 most-significant peptides", | 1225 cat("Heatmap for the 50 most-significant peptides", |
803 sprintf( | 1226 sprintf( |
804 "whose adjusted p-value < %0.2f\n", | 1227 "whose adjusted p-value < %0.2f\n", |
805 cutoff) | 1228 cutoff) |
806 ) | 1229 ) |
807 } else { | 1230 } else { |
808 cat("Heatmap for peptides whose", | 1231 if (nrow(m) == 1) { |
809 sprintf("adjusted p-value < %0.2f\n", | 1232 cat( |
810 cutoff) | 1233 sprintf("Heatmap for %d usable peptides whose", nrow(m)), |
811 ) | 1234 sprintf("adjusted p-value < %0.2f\n", cutoff) |
1235 ) | |
1236 next | |
1237 } | |
812 } | 1238 } |
813 cat("\\newline\n") | 1239 cat("\n\n\n") |
814 cat("\\newline\n") | 1240 cat("\n\n\n") |
815 op <- par("cex.main") | |
816 try( | 1241 try( |
817 if (nrow(m) > 1) { | 1242 if (nrow(m) > 1) { |
1243 old_oma <- par("oma") | |
818 par(cex.main = 0.6) | 1244 par(cex.main = 0.6) |
819 heatmap( | 1245 heatmap( |
820 m[how_many_peptides:1, ], | 1246 m[how_many_peptides:1, ], |
821 Rowv = NA, | 1247 Rowv = NA, |
822 Colv = NA, | 1248 Colv = NA, |
823 cexRow = 0.7, | 1249 cexRow = 0.7, |
824 cexCol = 0.8, | 1250 cexCol = 0.8, |
825 scale = "row", | 1251 scale = "row", |
826 #ACE scale = "none", | |
827 margins = margins, | 1252 margins = margins, |
828 main = | 1253 main = |
829 "Heatmap of unimputed, unnormalized intensities", | 1254 "Unimputed, unnormalized intensities", |
830 xlab = "" | 1255 xlab = "", |
1256 las = 1 #, fin = c(9, 5.5) | |
831 ) | 1257 ) |
832 } | 1258 } |
833 ) | 1259 ) |
834 par(op) | |
835 } | 1260 } |
836 } | 1261 } |
837 } | 1262 } |
838 } | 1263 } |
1264 cat("\\leavevmode\n\n\n") | |
839 ``` | 1265 ``` |
840 | 1266 |
841 <!-- | 1267 <!-- |
842 ## Peptide IDs, etc. | 1268 ## Peptide IDs, etc. |
843 | 1269 |