comparison mqppep_anova_script.Rmd @ 13:b41a077af3aa draft

"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit 040e4945da00a279cb60daae799fce9489f99c50"
author eschen42
date Tue, 22 Mar 2022 20:47:40 +0000
parents 4deacfee76ef
children 6679616d0c18
comparison
equal deleted inserted replaced
12:4deacfee76ef 13:b41a077af3aa
1 --- 1 ---
2 title: "MaxQuant Phospho-Proteomic Enrichment Pipeline ANOVA" 2 title: "MaxQuant Phospho-Proteomic Enrichment Pipeline ANOVA"
3 author: "Larry Cheng; Art Eschenlauer" 3 author: "Larry Cheng; Art Eschenlauer"
4 date: "May 28, 2018; Nov 16, 2021" 4 date: "May 28, 2018; Mar 16, 2022"
5 output: 5 output:
6 pdf_document: default 6 pdf_document:
7 toc: true
8 latex_document:
9 toc: true
7 params: 10 params:
8 inputFile: "test-data/test_input_for_anova.tabular" 11 inputFile: "test-data/test_input_for_anova.tabular"
9 alphaFile: "test-data/alpha_levels.tabular" 12 alphaFile: "test-data/alpha_levels.tabular"
10 firstDataColumn: "Intensity" 13 firstDataColumn: "Intensity"
11 imputationMethod: !r c("group-median", "median", "mean", "random")[1] 14 imputationMethod: !r c("group-median", "median", "mean", "random")[1]
12 meanPercentile: 1 15 meanPercentile: 1
13 sdPercentile: 0.2 16 sdPercentile: 1.0
14 regexSampleNames: "\\.\\d+[A-Z]$" 17 regexSampleNames: "\\.\\d+[A-Z]$"
15 regexSampleGrouping: "\\d+" 18 regexSampleGrouping: "\\d+"
16 imputedDataFilename: "Upstream_Map_pST_outputfile_STEP4_QN_LT.txt" 19 imputedDataFilename: "test-data/imputedDataFilename.txt"
20 imputedQNLTDataFile: "test-data/imputedQNLTDataFile.txt"
21 show_toc: true
17 --- 22 ---
23 <!--
24 latex_document: default
25 inputFile: "test-data/test_input_for_anova.tabular"
26 inputFile: "test-data/density_failure.preproc_tab.tabular"
27 inputFile: "test-data/UT_Phospho_STY_Sites.preproc_tab"
28 date: "May 28, 2018; Mar 16, 2022"
29 -->
18 ```{r setup, include = FALSE} 30 ```{r setup, include = FALSE}
19 # ref for parameterizing Rmd document: https://stackoverflow.com/a/37940285 31 # ref for parameterizing Rmd document: https://stackoverflow.com/a/37940285
20 knitr::opts_chunk$set(echo = FALSE, fig.dim = c(9, 10)) 32 knitr::opts_chunk$set(echo = FALSE, fig.dim = c(9, 10))
33
34 # freeze the random number generator so the same results will be produced
35 # from run to run
36 set.seed(28571)
37
38 ### CONSTANTS
39
40 const_parfin <- par("fin")
41 const_boxplot_fill <- "grey94"
42 const_stripchart_cex <- 0.5
43 const_stripsmall_cex <-
44 sqrt(const_stripchart_cex * const_stripchart_cex / 2)
45 const_stripchart_jitter <- 0.3
46 const_write_debug_files <- FALSE
21 47
22 ### FUNCTIONS 48 ### FUNCTIONS
23 49
24 #ANOVA filter function 50 #ANOVA filter function
25 anova_func <- function(x, grouping_factor) { 51 anova_func <- function(x, grouping_factor) {
26 x_aov <- aov(as.numeric(x) ~ grouping_factor) 52 x_aov <- aov(as.numeric(x) ~ grouping_factor)
27 pvalue <- summary(x_aov)[[1]][["Pr(>F)"]][1] 53 pvalue <- summary(x_aov)[[1]][["Pr(>F)"]][1]
28 pvalue 54 pvalue
29 } 55 }
56
57 write_debug_file <- function(s) {
58 if (const_write_debug_files) {
59 s_path <- sprintf("test-data/%s.txt", deparse(substitute(s)))
60 write.table(
61 s,
62 file = s_path,
63 sep = "\t",
64 col.names = TRUE,
65 row.names = TRUE,
66 quote = FALSE
67 )
68 }
69 }
70
71 latex_collapsed_vector <- function(collapse_string, v) {
72 cat(
73 paste0(
74 gsub("_", "\\\\_", v),
75 collapse = collapse_string
76 )
77 )
78 }
79
80 latex_itemized_collapsed <- function(collapse_string, v) {
81 cat("\\begin{itemize}\n\\item ")
82 latex_collapsed_vector(collapse_string, v)
83 cat("\n\\end{itemize}\n")
84 }
85
86 latex_itemized_list <- function(v) {
87 latex_itemized_collapsed("\n\\item ", v)
88 }
89
90 latex_enumerated_collapsed <- function(collapse_string, v) {
91 cat("\\begin{enumerate}\n\\item ")
92 latex_collapsed_vector(collapse_string, v)
93 cat("\n\\end{enumerate}\n")
94 }
95
96 latex_enumerated_list <- function(v) {
97 latex_enumerated_collapsed("\n\\item ", v)
98 }
99
100 latex_table_row <- function(v) {
101 latex_collapsed_vector(" & ", v)
102 cat(" \\\\\n")
103 }
104
105 # Use this like print.data.frame, from which it is adapted:
106 print_data_frame_latex <-
107 function(
108 x,
109 ...,
110 # digits to pass to format.data.frame
111 digits = NULL,
112 # TRUE -> right-justify columns; FALSE -> left-justify
113 right = TRUE,
114 # maximumn number of rows to print
115 max = NULL,
116 # string with justification of each column
117 justification = NULL,
118 # TRUE to center on page
119 centered = FALSE,
120 # optional capttion
121 caption = NULL,
122 # h(inline); b(bottom); t (top) or p (separate page)
123 anchor = "h"
124 ) {
125 if (is.null(justification))
126 justification <-
127 Reduce(
128 f = paste,
129 x = rep_len(if (right) "r" else "l", length(colnames(x)))
130 )
131 n <- length(rownames(x))
132 if (length(x) == 0L) {
133 cat(
134 sprintf(
135 # if n is one, use singular 'row', else use plural 'rows'
136 ngettext(
137 n,
138 "data frame with 0 columns and %d row",
139 "data frame with 0 columns and %d rows"
140 ),
141 n
142 ),
143 "\n",
144 sep = ""
145 )
146 }
147 else if (n == 0L) {
148 cat("0 rows for:\n")
149 latex_itemized_list(names(x))
150 }
151 else {
152 if (is.null(max))
153 max <- getOption("max.print", 99999L)
154 if (!is.finite(max))
155 stop("invalid 'max' / getOption(\"max.print\"): ",
156 max)
157 omit <- (n0 <- max %/% length(x)) < n
158 m <- as.matrix(
159 format.data.frame(
160 if (omit) x[seq_len(n0), , drop = FALSE] else x,
161 digits = digits,
162 na.encode = FALSE
163 )
164 )
165 cat(
166 # h(inline); b(bottom); t (top) or p (separate page)
167 paste0("\\begin{table}[", anchor, "]\n")
168 )
169 if (!is.null(caption))
170 cat(paste0(" \\caption{", caption, "}"))
171 if (centered) cat("\\centering\n")
172 cat(
173 paste(
174 " \\begin{tabular}{",
175 justification,
176 "}\n",
177 sep = ""
178 )
179 )
180 if (!is.null(caption))
181 cat(" \\hline\\hline\n")
182 latex_table_row(colnames(m))
183 cat("\\hline\n")
184 for (i in seq_len(length(m[, 1]))) {
185 latex_table_row(m[i, ])
186 }
187 cat(
188 paste(
189 " \\end{tabular}",
190 "\\end{table}",
191 sep = "\n"
192 )
193 )
194 if (omit)
195 cat(" [ reached 'max' / getOption(\"max.print\") -- omitted",
196 n - n0, "rows ]\n")
197 }
198 invisible(x)
199 }
200
30 ``` 201 ```
31 202
32 ## Purpose: 203 ## Purpose:
204
33 Perform imputation of missing values, quantile normalization, and ANOVA. 205 Perform imputation of missing values, quantile normalization, and ANOVA.
34 206
35 <!-- 207 <!--
36 ## Variables to change for each input file 208 ## Variables to change for each input file
37 --> 209 -->
56 val_fdr <- 228 val_fdr <-
57 read.table(file = params$alphaFile, sep = "\t", header = F, quote = "")[, 1] 229 read.table(file = params$alphaFile, sep = "\t", header = F, quote = "")[, 1]
58 230
59 #Imputed Data filename 231 #Imputed Data filename
60 imputed_data_filename <- params$imputedDataFilename 232 imputed_data_filename <- params$imputedDataFilename
233 imp_qn_lt_data_filenm <- params$imputedQNLTDataFile
61 234
62 #ANOVA data filename 235 #ANOVA data filename
63 ``` 236 ```
64 237
65 ```{r echo = FALSE} 238 ```{r echo = FALSE}
76 249
77 # Regular expression of Sample Names, e.g., "\\.(\\d+)[A-Z]$" 250 # Regular expression of Sample Names, e.g., "\\.(\\d+)[A-Z]$"
78 regex_sample_names <- params$regexSampleNames 251 regex_sample_names <- params$regexSampleNames
79 252
80 # Regular expression to extract Sample Grouping from Sample Name; 253 # Regular expression to extract Sample Grouping from Sample Name;
81 # if error occurs, compare sample_factor_levels and temp_matches 254 # if error occurs, compare sample_factor_levels and sample_name_matches
82 # to see if groupings/pairs line up 255 # to see if groupings/pairs line up
83 # e.g., "(\\d+)" 256 # e.g., "(\\d+)"
84 regex_sample_grouping <- params$regexSampleGrouping 257 regex_sample_grouping <- params$regexSampleGrouping
85 258
86 ``` 259 ```
99 quote = "", 272 quote = "",
100 check.names = FALSE 273 check.names = FALSE
101 ) 274 )
102 ``` 275 ```
103 276
104 ### Column names from input file 277 ### Parse column names, sample names, and factor levels from input file
105 278
106 ```{r echo = FALSE, results = 'markup'} 279 ```{r echo = FALSE, results = 'asis'}
107 print(colnames(full_data)) 280 # Write column naames as an enumerated list.
281 column_name_df <- data.frame(
282 column = seq_len(length(colnames(full_data))),
283 name = colnames(full_data)
284 )
285 print_data_frame_latex(
286 x = column_name_df,
287 justification = "l l",
288 centered = TRUE,
289 caption = "Input data column name",
290 anchor = "h"
291 )
292
108 data_column_indices <- grep(first_data_column, names(full_data), perl = TRUE) 293 data_column_indices <- grep(first_data_column, names(full_data), perl = TRUE)
109 cat(sprintf("First data column: %d\n", min(data_column_indices))) 294 cat(
110 cat(sprintf("Last data column: %d\n", max(data_column_indices))) 295 sprintf(
111 ``` 296 "\n\nData columns: [%d,%d]\n\n",
112 297 min(data_column_indices),
113 ```{r echo = FALSE, results = 'asis'} 298 max(data_column_indices)
114 cat("\\newpage\n") 299 )
115 ``` 300 )
116
117 ### Checking that log-transformed sample distributions are similar:
118
119 ```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'}
120 301
121 if (FALSE == fdc_is_integer) { 302 if (FALSE == fdc_is_integer) {
122
123 if (length(data_column_indices) > 0) { 303 if (length(data_column_indices) > 0) {
124 first_data_column <- data_column_indices[1] 304 first_data_column <- data_column_indices[1]
125 } else { 305 } else {
126 stop(paste("failed to convert firstDataColumn:", first_data_column)) 306 stop(paste("failed to convert firstDataColumn:", first_data_column))
127 } 307 }
128 } 308 }
129 309
130 quant_data0 <- full_data[first_data_column:length(full_data)] 310 ```
311
312 ```{r echo = FALSE, results = 'asis'}
313 #```{r echo = FALSE, results = 'asis'}
131 quant_data <- full_data[first_data_column:length(full_data)] 314 quant_data <- full_data[first_data_column:length(full_data)]
315 quant_data[quant_data == 0] <- NA
316 rownames(quant_data) <- full_data$Phosphopeptide
317 # Get factors -> group replicates (as indicated by terminal letter)
318 # by the preceding digits;
319 # Assuming that regex_sample_names <- "\\.(\\d+)[A-Z]$"
320 # get factors ->
321 # group runs (samples) by ignoring terminal [A-Z] in sample names
322 # e.g.
323 # group .1A .1B .1C into group 1;
324 # group .2A .2B .2C, into group 2;
325 # etc.
326 m <- regexpr(regex_sample_names, colnames(quant_data), perl = TRUE)
327 sample_name_matches <- regmatches(names(quant_data), m)
328 colnames(quant_data) <- sample_name_matches
329
330 write_debug_file(quant_data)
331
332 m2 <- regexpr(regex_sample_grouping, sample_name_matches, perl = TRUE)
333 sample_factor_levels <- as.factor(regmatches(sample_name_matches, m2))
334 number_of_samples <- length(sample_name_matches)
335 sample_factor_df <- data.frame(
336 sample = sample_name_matches,
337 level = sample_factor_levels
338 )
339 print_data_frame_latex(
340 x = sample_factor_df,
341 justification = "c c",
342 centered = TRUE,
343 caption = "Factor level",
344 anchor = "h"
345 )
346 ```
347 ```{r echo = FALSE, results = 'asis'}
348 cat("\\newpage\n")
349 ```
350
351 ### Are the log-transformed sample distributions similar?
352
353 ```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'}
354
132 quant_data[quant_data == 0] <- NA #replace 0 with NA 355 quant_data[quant_data == 0] <- NA #replace 0 with NA
133 quant_data_log <- log10(quant_data) 356 quant_data_log <- log10(quant_data)
134 357
135 rownames(quant_data_log) <- full_data$Phosphopeptide 358 rownames(quant_data_log) <- rownames(quant_data)
359 colnames(quant_data_log) <- sample_name_matches
360
361 write_debug_file(quant_data_log)
136 362
137 # data visualization 363 # data visualization
138 old_par <- par( 364 old_par <- par(
139 mai = par("mai") + c(0.5, 0, 0, 0) 365 mai = par("mai") + c(0.5, 0, 0, 0)
140 ) 366 )
367 # ref: https://r-charts.com/distribution/add-points-boxplot/
368 # Vertical plot
141 boxplot( 369 boxplot(
142 quant_data_log 370 quant_data_log
143 , las = 2 371 , las = 1
372 , col = const_boxplot_fill
144 ) 373 )
374 # Points
375 stripchart(
376 quant_data_log, # Data
377 method = "jitter", # Random noise
378 jitter = const_stripchart_jitter,
379 pch = 19, # Pch symbols
380 cex = const_stripchart_cex, # Size of symbols reduced
381 col = "goldenrod", # Color of the symbol
382 vertical = TRUE, # Vertical mode
383 add = TRUE # Add it over
384 )
145 par(old_par) 385 par(old_par)
146 386
147 387
148 388
149 cat("\\newline\n") 389 cat("\n\n\n")
150 cat("\\newline\n") 390 cat("\n\n\n")
151 391
152 ``` 392 ```
153 393
154 ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 4), warning = FALSE} 394 ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 4), results = 'asis'}
155 quant_data_log_stack <- stack(quant_data_log)
156 library(ggplot2) 395 library(ggplot2)
157 ggplot( 396 if (nrow(quant_data_log) > 1) {
158 quant_data_log_stack, 397 quant_data_log_stack <- stack(quant_data_log)
159 aes(x = values)) + geom_density(aes(group = ind, colour = ind)) 398 ggplot(
160 ``` 399 quant_data_log_stack,
161 400 aes(x = values)
162 ### Globally, are phosphopeptide intensities are approximately unimodal? 401 ) +
402 geom_density(
403 aes(group = ind, colour = ind),
404 na.rm = TRUE
405 )
406 } else {
407 cat("No density plot because there are too few peptides.\n\n")
408 }
409 ```
410
411 ### Globally, are peptide intensities are approximately unimodal?
163 412
164 <!-- 413 <!--
165 # ref for bquote below particularly and plotting math expressions generally: 414 # ref for bquote below particularly and plotting math expressions generally:
166 # https://www.r-bloggers.com/2018/03/math-notation-for-r-plot-titles-expression-and-bquote/ 415 # https://www.r-bloggers.com/2018/03/math-notation-for-r-plot-titles-expression-and-bquote/
167 --> 416 -->
168 ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 5)} 417 ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 5), results = 'asis'}
169 418
170 # identify the location of missing values 419 # identify the location of missing values
171 fin <- is.finite(as.numeric(as.matrix(quant_data_log))) 420 fin <- is.finite(as.numeric(as.matrix(quant_data_log)))
172 421
173 logvalues <- as.numeric(as.matrix(quant_data_log))[fin] 422 logvalues <- as.numeric(as.matrix(quant_data_log))[fin]
183 , main = bquote("Frequency vs." ~ log[10](intensity)) 432 , main = bquote("Frequency vs." ~ log[10](intensity))
184 , xlab = bquote(log[10](intensity)) 433 , xlab = bquote(log[10](intensity))
185 ) 434 )
186 ``` 435 ```
187 436
188 ### Distribution of standard deviations of phosphopeptides, ignoring missing values: 437 ### Distribution of standard deviations of $log_{10}(\text{intensity})$, ignoring missing values:
189 438
190 ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 5)} 439 ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 5), results = 'asis'}
191 # determine quantile 440 # determine quantile
192 q1 <- quantile(logvalues, probs = mean_percentile)[1] 441 q1 <- quantile(logvalues, probs = mean_percentile)[1]
193 442
194 # determine standard deviation of quantile to impute 443 # determine standard deviation of quantile to impute
195 sd_finite <- function(x) { 444 sd_finite <- function(x) {
196 ok <- is.finite(x) 445 ok <- is.finite(x)
197 sd(x[ok]) * sd_percentile 446 sd(x[ok])
198 } 447 }
199 # 1 = row of matrix (ie, phosphopeptide) 448 # 1 = row of matrix (ie, phosphopeptide)
200 sds <- apply(quant_data_log, 1, sd_finite) 449 sds <- apply(quant_data_log, 1, sd_finite)
201 plot( 450 if (sum(!is.na(sds)) > 2) {
202 density(sds, na.rm = T) 451 plot(
203 , main = "Smoothed estimated probability density vs. std. deviation" 452 density(sds, na.rm = T)
204 , sub = "(probability estimation made with Gaussian smoothing)" 453 , main = "Smoothed estimated probability density vs. std. deviation"
205 ) 454 , sub = "(probability estimation made with Gaussian smoothing)"
206 455 )
207 m1 <- median(sds, na.rm = T) #sd to be used is the median sd 456 } else {
208 457 cat(
209 ``` 458 "At least two non-missing values are required to plot",
210 459 "probability density.\n\n"
211 460 )
212 461 }
213 <!-- 462
214 The number of missing values are: 463 ```
215 --> 464
465 ```{r echo = FALSE}
466 # Determine number of cells to impute
467 temp <- quant_data[is.na(quant_data)]
468
469 # Determine number of values to impute
470 number_to_impute <- length(temp)
471
472 # Determine percent of missing values
473 pct_missing_values <-
474 round(length(temp) / (length(logvalues) + length(temp)) * 100)
475 ```
476
477 ```{r echo = FALSE}
478
479 # prep for trt-median based imputation
480
481 ```
482 ## Impute Missing Values
483
484 ```{r echo = FALSE}
485
486 imp_smry_potential_before <- length(logvalues)
487 imp_smry_missing_before <- number_to_impute
488 imp_smry_pct_missing <- pct_missing_values
489
490 ```
491
216 ```{r echo = FALSE} 492 ```{r echo = FALSE}
217 #Determine number of cells to impute 493 #Determine number of cells to impute
218 temp <- quant_data[is.na(quant_data)]
219
220 #Determine number of values to impute
221 number_to_impute <- length(temp)
222 ```
223
224 <!--
225 % of values that are missing:
226 -->
227 ```{r echo = FALSE}
228 pct_missing_values <- length(temp) / (length(logvalues) + length(temp)) * 100
229 ```
230
231 <!--
232 First few rows of data before imputation:
233 -->
234 ```{r echo = FALSE, results = 'asis'}
235 cat("\\newpage\n")
236 ```
237
238 ## Parse sample names
239
240 Parse the names of the samples to deduce the factor level for each sample:
241
242 ```{r echo = FALSE}
243
244 # prep for trt-median based imputation
245
246 # Assuming that regex_sample_names <- "\\.(\\d+)[A-Z]$"
247 # get factors ->
248 # group runs (samples) by ignoring terminal [A-Z] in sample names
249
250 m <- regexpr(regex_sample_names, names(quant_data), perl = TRUE)
251 temp_matches <- regmatches(names(quant_data), m)
252 print("Extracted sample names")
253 print(temp_matches)
254 m2 <- regexpr(regex_sample_grouping, temp_matches, perl = TRUE)
255 sample_factor_levels <- as.factor(regmatches(temp_matches, m2))
256 print("Factor levels")
257 print(sample_factor_levels)
258
259 ```
260 ## Impute missing values
261
262 ```{r echo = FALSE}
263
264 #Determine number of cells to impute
265 cat("Before imputation,",
266 sprintf(
267 "there are:\n %d peptides\n %d missing values (%2.0f%s)",
268 sum(rep.int(TRUE, nrow(quant_data))),
269 sum(is.na(quant_data)),
270 pct_missing_values,
271 "%"
272 )
273 )
274 494
275 ``` 495 ```
276 ```{r echo = FALSE} 496 ```{r echo = FALSE}
277 497
278 #Impute data 498 #Impute data
280 500
281 # Identify which values are missing and need to be imputed 501 # Identify which values are missing and need to be imputed
282 ind <- which(is.na(quant_data_imp), arr.ind = TRUE) 502 ind <- which(is.na(quant_data_imp), arr.ind = TRUE)
283 503
284 ``` 504 ```
285 ```{r echo = FALSE} 505 ```{r echo = FALSE, results = 'asis'}
286 506
287 # Apply imputation 507 # Apply imputation
288 switch( 508 switch(
289 imputation_method 509 imputation_method
290 , "group-median" = { 510 , "group-median" = {
291 cat("Imputation method:\n substitute missing value", 511 imputation_method_description <-
292 "with median peptide-intensity for sample-group\n") 512 paste("Substitute missing value with",
293 513 "median peptide intensity for sample group\n"
514 )
294 sample_level_integers <- as.integer(sample_factor_levels) 515 sample_level_integers <- as.integer(sample_factor_levels)
295 for (i in seq_len(length(levels(sample_factor_levels)))) { 516 for (i in seq_len(length(levels(sample_factor_levels)))) {
296 level_cols <- i == sample_level_integers 517 level_cols <- i == sample_level_integers
297 ind <- which(is.na(quant_data_imp[, level_cols]), arr.ind = TRUE) 518 ind <- which(is.na(quant_data_imp[, level_cols]), arr.ind = TRUE)
298 quant_data_imp[ind, level_cols] <- 519 quant_data_imp[ind, level_cols] <-
299 apply(quant_data_imp[, level_cols], 1, median, na.rm = T)[ind[, 1]] 520 apply(quant_data_imp[, level_cols], 1, median, na.rm = T)[ind[, 1]]
300 } 521 }
301 good_rows <- !is.na(rowMeans(quant_data_imp)) 522 good_rows <- !is.na(rowMeans(quant_data_imp))
302 } 523 }
303 , "median" = { 524 , "median" = {
304 cat("Imputation method:\n substitute missing value with", 525 imputation_method_description <-
305 "median peptide-intensity across all sample classes\n") 526 paste("Substitute missing value with",
527 "median peptide intensity across all sample classes\n"
528 )
306 quant_data_imp[ind] <- apply(quant_data_imp, 1, median, na.rm = T)[ind[, 1]] 529 quant_data_imp[ind] <- apply(quant_data_imp, 1, median, na.rm = T)[ind[, 1]]
307 good_rows <- !is.na(rowMeans(quant_data_imp)) 530 good_rows <- !is.na(rowMeans(quant_data_imp))
308 } 531 }
309 , "mean" = { 532 , "mean" = {
310 cat("Imputation method:\n substitute missing value with", 533 imputation_method_description <-
311 "mean peptide-intensity across all sample classes\n") 534 paste("Substitute missing value with",
535 "mean peptide intensity across all sample classes\n"
536 )
312 quant_data_imp[ind] <- apply(quant_data_imp, 1, mean, na.rm = T)[ind[, 1]] 537 quant_data_imp[ind] <- apply(quant_data_imp, 1, mean, na.rm = T)[ind[, 1]]
313 good_rows <- !is.na(rowMeans(quant_data_imp)) 538 good_rows <- !is.na(rowMeans(quant_data_imp))
314 } 539 }
315 , "random" = { 540 , "random" = {
316 cat( 541 m1 <- median(sds, na.rm = T) * sd_percentile #sd to be used is the median sd
317 "Imputation method:\n substitute missing value with\n ", 542 # If you want results to be reproducible, you will want to call
318 sprintf( 543 # base::set.seed before calling stats::rnorm
319 "random intensity N ~ (%0.2f, %0.2f)\n" 544 imputation_method_description <-
320 , q1, m1 545 paste("Substitute each missing value with random intensity",
321 ) 546 sprintf(
322 ) 547 "random intensity $N \\sim (%0.2f, %0.2f)$\n",
323 quant_data_imp[is.na(quant_data_imp)] <- 548 q1, m1
549 )
550 )
551 cat(sprintf("mean_percentile (from input parameter) is %2.0f\n\n",
552 100 * mean_percentile))
553 cat(sprintf("sd_percentile (from input parameter) is %0.2f\n\n",
554 sd_percentile))
555 #ACE cat(sprintf("sd for rnorm is %0.4f\n\n", m1))
556 quant_data_imp[ind] <-
324 10 ^ rnorm(number_to_impute, mean = q1, sd = m1) 557 10 ^ rnorm(number_to_impute, mean = q1, sd = m1)
325 good_rows <- !is.na(rowMeans(quant_data_imp)) 558 good_rows <- !is.na(rowMeans(quant_data_imp))
326 } 559 }
327 ) 560 )
328 561
329 ``` 562 if (length(good_rows) < 1) {
563 print("ERROR: Cannot impute data; there are no good rows!")
564 return(-1)
565 }
566 ```
567
568 ```{r echo = FALSE, results = 'asis'}
569 cat("\\quad\n\nImputation method:\n\n\n", imputation_method_description)
570 ```
571
330 ```{r echo = FALSE} 572 ```{r echo = FALSE}
331 573
332 #Determine number of cells to impute 574 imp_smry_potential_after <- sum(good_rows)
333 temp <- quant_data_imp[is.na(quant_data_imp)] 575 imp_smry_rejected_after <- sum(!good_rows)
334 cat("After imputation, there are:", 576 imp_smry_missing_after <- sum(is.na(quant_data_imp[good_rows, ]))
577 ```
578 ```{r echo = FALSE, results = 'asis'}
579 # ref: http://www1.maths.leeds.ac.uk/latex/TableHelp1.pdf
580 tabular_lines <- paste(
581 "\\begin{table}[hb]", # h(inline); b(bottom); t (top) or p (separate page)
582 " \\caption{Imputation Results}",
583 " \\centering", # \centering centers the table on the page
584 " \\begin{tabular}{l c c c}",
585 " \\hline\\hline",
586 " \\ & potential peptides & missing values & rejected",
587 " peptides \\\\ [0.5ex]",
588 " \\hline",
589 " before imputation & %d & %d (%d\\%s) & \\\\",
590 " after imputation & %d & %d & %d \\\\ [1ex]",
591 " \\hline",
592 " \\end{tabular}",
593 #" \\label{table:nonlin}", # may be used to refer this table in the text
594 "\\end{table}",
595 sep = "\n"
596 )
597 tabular_lines <-
335 sprintf( 598 sprintf(
336 "\n %d missing values\n %d usable peptides analysis" 599 tabular_lines,
337 , sum(is.na(quant_data_imp[good_rows, ])) 600 imp_smry_potential_before,
338 , sum(good_rows) 601 imp_smry_missing_before,
339 ), 602 imp_smry_pct_missing,
340 sprintf( 603 "%",
341 "\n %d peptides with too many missing values for further analysis" 604 imp_smry_potential_after,
342 , sum(!good_rows) 605 imp_smry_missing_after,
343 ) 606 imp_smry_rejected_after
344 ) 607 )
608 cat(tabular_lines)
345 ``` 609 ```
346 ```{r echo = FALSE} 610 ```{r echo = FALSE}
347 611
348 612
349 # Zap rows where imputation was ineffective 613 # Zap rows where imputation was ineffective
350 full_data <- full_data [good_rows, ] 614 full_data <- full_data [good_rows, ]
351 quant_data <- quant_data [good_rows, ] 615 quant_data <- quant_data [good_rows, ]
616
617 write_debug_file(quant_data_imp)
618
352 quant_data_imp <- quant_data_imp[good_rows, ] 619 quant_data_imp <- quant_data_imp[good_rows, ]
353 620 quant_data_imp_good_rows <- quant_data_imp
354 ``` 621
355 ```{r echo = FALSE} 622 write_debug_file(quant_data_imp_good_rows)
356 623 ```
357 d_combined <- (density(as.numeric(as.matrix( 624
358 log10(quant_data_imp) 625 ```{r echo = FALSE, results = 'asis'}
359 )))) 626
627 can_plot_before_after_imp <- TRUE
628 d_combined <-
629 as.numeric(
630 as.matrix(
631 log10(quant_data_imp)
632 )
633 )
360 d_original <- 634 d_original <-
361 density(as.numeric(as.matrix( 635 as.numeric(
362 log10(quant_data_imp[!is.na(quant_data)])))) 636 as.matrix(
363 637 log10(quant_data_imp[!is.na(quant_data)])
364 ``` 638 )
365 ```{r echo = FALSE} 639 )
640
641 if (sum(!is.na(d_combined)) > 2 && sum(!is.na(d_original)) > 2) {
642 d_combined <- density(d_combined)
643 d_original <- density(d_original)
644 } else {
645 can_plot_before_after_imp <- FALSE
646 }
366 647
367 if (sum(is.na(quant_data)) > 0) { 648 if (sum(is.na(quant_data)) > 0) {
368 # There ARE missing values 649 # There ARE missing values
369 d_imputed <- 650 d_imputed <-
370 (density(as.numeric(as.matrix( 651 as.numeric(
371 log10(quant_data_imp[is.na(quant_data)]) 652 as.matrix(
372 )))) 653 log10(quant_data_imp[is.na(quant_data)])
654 )
655 )
656 if (sum(!is.na(d_combined)) > 2) {
657 d_imputed <- (density(d_imputed))
658 } else {
659 can_plot_before_after_imp <- FALSE
660 }
373 } else { 661 } else {
374 # There are NO missing values 662 # There are NO missing values
375 d_imputed <- d_combined 663 d_imputed <- d_combined
376 } 664 }
377 665
378 ``` 666 ```
379 667
380 ```{r echo = FALSE, fig.dim = c(9, 5)} 668 ```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'}
381 ylim <- c(0, max(d_combined$y, d_original$y, d_imputed$y)) 669 if (sum(is.na(quant_data)) > 0) {
382 plot( 670 cat("\\leavevmode\\newpage\n")
383 d_combined, 671 # data visualization
384 ylim = ylim, 672 old_par <- par(
385 sub = "Blue = data before imputation; Red = imputed data", 673 mai = par("mai") + c(0.5, 0, 0, 0)
386 main = "Density vs. log10(intensity) before and after imputation" 674 )
387 ) 675 x <- quant_data
388 lines(d_original, col = "blue") 676 x <- blue_dots <- x / x
389 lines(d_imputed, col = "red") 677 blue_dots <- log10(blue_dots * quant_data)
678 x[is.na(x)] <- 0
679 x[x == 1] <- NA
680 x[x == 0] <- 1
681 quant_data_imp_log10 <-
682 log10(quant_data_imp)
683
684 write_debug_file(quant_data_imp_log10)
685
686 red_dots <- quant_data_imp_log10 * x
687 ylim <- c(
688 min(red_dots, blue_dots, na.rm = TRUE),
689 max(red_dots, blue_dots, na.rm = TRUE)
690 )
691 # ref: https://r-charts.com/distribution/add-points-boxplot/
692 # Vertical plot
693 colnames(blue_dots) <- sample_name_matches
694 boxplot(
695 blue_dots
696 , las = 1 # "always horizontal"
697 , col = const_boxplot_fill
698 , ylim = ylim
699 , main = "Peptide intensities before and after imputation"
700 , sub = "Light blue = data before imputation; Red = imputed data"
701 , xlab = "Sample"
702 , ylab = "log10(peptide intensity)"
703 )
704 # Points
705 # NA values are not plotted
706 stripchart(
707 blue_dots, # Data
708 method = "jitter", # Random noise
709 jitter = const_stripchart_jitter,
710 pch = 19, # Pch symbols
711 cex = const_stripsmall_cex, # Size of symbols reduced
712 col = "lightblue", # Color of the symbol
713 vertical = TRUE, # Vertical mode
714 add = TRUE # Add it over
715 )
716 stripchart(
717 red_dots, # Data
718 method = "jitter", # Random noise
719 jitter = const_stripchart_jitter,
720 pch = 19, # Pch symbols
721 cex = const_stripsmall_cex, # Size of symbols reduced
722 col = "red", # Color of the symbol
723 vertical = TRUE, # Vertical mode
724 add = TRUE # Add it over
725 )
726 par(old_par)
727
728 # density plot
729 cat("\\leavevmode\n\n\n\n\n\n\n")
730 if (can_plot_before_after_imp) {
731 ylim <- c(0, max(d_combined$y, d_original$y, d_imputed$y))
732 plot(
733 d_combined,
734 ylim = ylim,
735 sub =
736 paste(
737 "Blue = data before imputation; Red = imputed data;",
738 "Black = combined"
739 ),
740 main = "Density of peptide intensity before and after imputation",
741 xlab = "log10(peptide intensity)",
742 ylab = "Probability density"
743 )
744 lines(d_original, col = "blue")
745 lines(d_imputed, col = "red")
746 } else {
747 cat(
748 "There are too few points to plot the density of peptide intensity",
749 "before and after imputation."
750 )
751 }
752 cat("\\leavevmode\\newpage\n")
753 }
390 ``` 754 ```
391 755
392 ## Perform Quantile Normalization 756 ## Perform Quantile Normalization
393 757
394 <!-- 758 <!--
446 # Normalization Methods for High Density Oligonucleotide Array Data Based on Bias 810 # Normalization Methods for High Density Oligonucleotide Array Data Based on Bias
447 # and Variance. Bioinformatics 19(2), pp 185-193. DOI 10.1093/bioinformatics/19.2.185 811 # and Variance. Bioinformatics 19(2), pp 185-193. DOI 10.1093/bioinformatics/19.2.185
448 # http://bmbolstad.com/misc/normalize/normalize.html 812 # http://bmbolstad.com/misc/normalize/normalize.html
449 # ... 813 # ...
450 --> 814 -->
451 ```{r echo = FALSE} 815 ```{r echo = FALSE, results = 'asis'}
452 library(preprocessCore) 816 library(preprocessCore)
453 817
454 if (TRUE) { 818 if (nrow(quant_data_imp) > 0) {
455 quant_data_imp_qn <- normalize.quantiles(as.matrix(quant_data_imp)) 819 quant_data_imp_qn <- normalize.quantiles(as.matrix(quant_data_imp))
456 } else { 820 } else {
457 quant_data_imp_qn <- as.matrix(quant_data_imp) 821 quant_data_imp_qn <- as.matrix(quant_data_imp)
458 } 822 }
459 823
460 quant_data_imp_qn <- as.data.frame(quant_data_imp_qn) 824 quant_data_imp_qn <- as.data.frame(quant_data_imp_qn)
461 names(quant_data_imp_qn) <- names(quant_data_imp) 825 names(quant_data_imp_qn) <- names(quant_data_imp)
826
827 write_debug_file(quant_data_imp_qn)
828
462 quant_data_imp_qn_log <- log10(quant_data_imp_qn) 829 quant_data_imp_qn_log <- log10(quant_data_imp_qn)
463 830 rownames(quant_data_imp_qn_log) <- rownames(quant_data_imp)
464 rownames(quant_data_imp_qn_log) <- full_data[, 1] 831
832 write_debug_file(quant_data_imp_qn_log)
465 833
466 quant_data_imp_qn_ls <- t(scale(t(log10(quant_data_imp_qn)))) 834 quant_data_imp_qn_ls <- t(scale(t(log10(quant_data_imp_qn))))
835
467 any_nan <- function(x) { 836 any_nan <- function(x) {
468 !any(x == "NaN") 837 !any(x == "NaN")
469 } 838 }
470 sel <- apply(quant_data_imp_qn_ls, 1, any_nan) 839 sel <- apply(quant_data_imp_qn_ls, 1, any_nan)
471 quant_data_imp_qn_ls2 <- quant_data_imp_qn_ls[which(sel), ] 840 quant_data_imp_qn_ls2 <- quant_data_imp_qn_ls[which(sel), ]
472 quant_data_imp_qn_ls2 <- as.data.frame(quant_data_imp_qn_ls2) 841 quant_data_imp_qn_ls2 <- as.data.frame(quant_data_imp_qn_ls2)
842 rownames(quant_data_imp_qn_ls2) <- rownames(quant_data_imp)[sel]
843
844 quant_data_imp_qn_ls <- as.data.frame(quant_data_imp_qn_ls)
845 rownames(quant_data_imp_qn_ls) <- rownames(quant_data_imp)
846
847 write_debug_file(quant_data_imp_qn_ls)
848 write_debug_file(quant_data_imp_qn_ls2)
473 849
474 #output quantile normalized data 850 #output quantile normalized data
475 data_table_imp_qn_lt <- cbind(full_data[1:9], quant_data_imp_qn_log) 851 data_table_imp_qn_lt <- cbind(full_data[1:9], quant_data_imp_qn_log)
476 write.table( 852 write.table(
477 data_table_imp_qn_lt, 853 data_table_imp_qn_lt,
478 file = paste(paste( 854 file = imp_qn_lt_data_filenm,
479 strsplit(imputed_data_filename, ".txt"), "QN_LT", sep = "_"
480 ), ".txt", sep = ""),
481 sep = "\t", 855 sep = "\t",
482 col.names = TRUE, 856 col.names = TRUE,
483 row.names = FALSE 857 row.names = FALSE,
858 quote = FALSE
484 ) 859 )
485 860
486 ``` 861 ```
487 862
488 <!-- ACE insertion begin --> 863 <!-- ACE insertion begin -->
489 ### Checking that normalized, imputed, log-transformed sample distributions are similar: 864 ### Checking that normalized, imputed, log-transformed sample distributions are similar:
490 865
491 ```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'} 866 ```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'}
492
493 867
494 # Save unimputed quant_data_log for plotting below 868 # Save unimputed quant_data_log for plotting below
495 unimputed_quant_data_log <- quant_data_log 869 unimputed_quant_data_log <- quant_data_log
496 870
497 # log10 transform (after preparing for zero values, 871 # log10 transform (after preparing for zero values,
498 # which should never happen...) 872 # which should never happen...)
499 quant_data_imp_qn[quant_data_imp_qn == 0] <- .000000001 873 quant_data_imp_qn[quant_data_imp_qn == 0] <- .000000001
500 quant_data_log <- log10(quant_data_imp_qn) 874 quant_data_log <- log10(quant_data_imp_qn)
501 875
502 # Output quantile-normalized log-transformed dataset 876 # Output with imputed, un-normalized data
503 # with imputed, normalized data 877
504 878 data_table_imputed <- cbind(full_data[1:9], quant_data_imp)
505 data_table_imputed <- cbind(full_data[1:9], quant_data_log)
506 write.table( 879 write.table(
507 data_table_imputed 880 data_table_imputed
508 , file = imputed_data_filename 881 , file = imputed_data_filename
509 , sep = "\t" 882 , sep = "\t"
510 , col.names = TRUE 883 , col.names = TRUE
511 , row.names = FALSE 884 , row.names = FALSE
512 , quote = FALSE 885 , quote = FALSE
513 ) 886 )
514 887
515 888 how_many_peptides <- nrow(quant_data_log)
516 889
517 # data visualization 890 if ((how_many_peptides) > 0) {
518 old_par <- par( 891 cat(
519 mai = par("mai") + c(0.5, 0, 0, 0) 892 sprintf(
520 , oma = par("oma") + c(0.5, 0, 0, 0) 893 "Intensities for %d peptides:\n\n\n",
521 ) 894 how_many_peptides
522 boxplot( 895 )
523 quant_data_log 896 )
524 , las = 2 897 cat("\n\n\n")
525 ) 898
526 par(old_par) 899
527 900 # data visualization
528 901 old_par <- par(
529 902 mai = par("mai") + c(0.5, 0, 0, 0)
530 cat("\\newline\n") 903 , oma = par("oma") + c(0.5, 0, 0, 0)
531 cat("\\newline\n") 904 )
532 905 # ref: https://r-charts.com/distribution/add-points-boxplot/
533 ``` 906 # Vertical plot
534 907 colnames(quant_data_log) <- sample_name_matches
535 ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 4)} 908 boxplot(
536 quant_data_log_stack <- stack(quant_data_log) 909 quant_data_log
537 ggplot( 910 , las = 1
538 quant_data_log_stack, 911 , col = const_boxplot_fill
539 aes(x = values) 912 )
540 ) + geom_density(aes(group = ind, colour = ind)) 913 # Points
914 stripchart(
915 quant_data_log, # Data
916 method = "jitter", # Random noise
917 jitter = const_stripchart_jitter,
918 pch = 19, # Pch symbols
919 cex = const_stripchart_cex, # Size of symbols reduced
920 col = "goldenrod", # Color of the symbol
921 vertical = TRUE, # Vertical mode
922 add = TRUE # Add it over
923 )
924 par(old_par)
925 } else {
926 cat("There are no peptides to plot\n")
927 }
928
929 cat("\n\n\n")
930
931 ```
932
933 ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 4), results = 'asis'}
934 if (nrow(quant_data_log) > 1) {
935 quant_data_log_stack <- stack(quant_data_log)
936 ggplot(
937 quant_data_log_stack,
938 aes(x = values)
939 ) +
940 geom_density(
941 aes(group = ind, colour = ind),
942 na.rm = TRUE
943 )
944 } else {
945 cat("No density plot because there are fewer than two peptides to plot.\n\n")
946 }
947 ```
948 ```{r echo = FALSE, results = 'asis'}
949 cat("\\leavevmode\\newpage\n")
541 ``` 950 ```
542 951
543 ## Perform ANOVA filters 952 ## Perform ANOVA filters
544
545 (see following pages)
546 953
547 ```{r, echo = FALSE} 954 ```{r, echo = FALSE}
548 # Make new data frame containing only Phosphopeptides 955 # Make new data frame containing only Phosphopeptides
549 # to connect preANOVA to ANOVA (connect_df) 956 # to connect preANOVA to ANOVA (connect_df)
550 connect_df <- data.frame( 957 connect_df <- data.frame(
553 ) 960 )
554 colnames(connect_df) <- c("Phosphopeptide", "Intensity") 961 colnames(connect_df) <- c("Phosphopeptide", "Intensity")
555 ``` 962 ```
556 963
557 ```{r echo = FALSE, fig.dim = c(9, 10), results = 'asis'} 964 ```{r echo = FALSE, fig.dim = c(9, 10), results = 'asis'}
558 # Get factors -> group replicates (as indicated by terminal letter) 965 count_of_factor_levels <- length(levels(sample_factor_levels))
559 # by the preceding digits; 966 if (count_of_factor_levels < 2) {
560 # e.g., group .1A .1B .1C into group 1; .2A .2B .2C, into group 2; etc..
561 m <-
562 regexpr(regex_sample_names, names(quant_data_imp_qn_log), perl = TRUE)
563
564 temp_matches <- regmatches(names(quant_data_imp_qn_log), m)
565
566 number_of_samples <- length(temp_matches)
567
568 m2 <- regexpr(regex_sample_grouping, temp_matches, perl = TRUE)
569
570
571 sample_factor_levels <- as.factor(regmatches(temp_matches, m2))
572
573 if (length(levels(sample_factor_levels)) < 2) {
574 nuke_control_sequences <- 967 nuke_control_sequences <-
575 function(s) { 968 function(s) {
576 s <- gsub("[\\]", "xyzzy_plugh", s) 969 s <- gsub("[\\]", "xyzzy_plugh", s)
577 s <- gsub("[$]", "\\\\$", s) 970 s <- gsub("[$]", "\\\\$", s)
578 s <- gsub("xyzzy_plugh", "$\\\\backslash$", s) 971 s <- gsub("xyzzy_plugh", "$\\\\backslash$", s)
582 "ERROR!!!! Cannot perform ANOVA analysis", 975 "ERROR!!!! Cannot perform ANOVA analysis",
583 "(see next page)\\newpage\n" 976 "(see next page)\\newpage\n"
584 ) 977 )
585 cat( 978 cat(
586 "ERROR: ANOVA analysis", 979 "ERROR: ANOVA analysis",
587 "requires two or more factor levels!\\newline\n" 980 "requires two or more factor levels!\n\n\n"
588 ) 981 )
589 982
590 cat("\\newline\\newline\n") 983 cat("\n\n\n\n\n")
591 cat("Unparsed sample names are:\\newline\n", 984 cat("Unparsed sample names are:\n\n\n",
592 "\\begin{quote}\n", 985 "\\begin{quote}\n",
593 paste(names(quant_data_imp_qn_log), collapse = "\\newline\n"), 986 paste(names(quant_data_imp_qn_log), collapse = "\n\n\n"),
594 "\n\\end{quote}\n\n") 987 "\n\\end{quote}\n\n")
595 988
596 regex_sample_names <- nuke_control_sequences(regex_sample_names) 989 regex_sample_names <- nuke_control_sequences(regex_sample_names)
597 990
598 cat("\\leavevmode\\newline\n") 991 cat("\\leavevmode\n\n\n")
599 cat("Parsing rule for SampleNames is", 992 cat("Parsing rule for SampleNames is",
600 "\\newline\n", 993 "\n\n\n",
601 "\\text{'", 994 "\\text{'",
602 regex_sample_names, 995 regex_sample_names,
603 "'}\\newline\n", 996 "'}\n\n\n",
604 sep = "" 997 sep = ""
605 ) 998 )
606 999
607 cat("\nParsed sample names are:\n", 1000 cat("\nParsed sample names are:\n",
608 "\\begin{quote}\n", 1001 "\\begin{quote}\n",
609 paste(temp_matches, collapse = "\\newline\n"), 1002 paste(sample_name_matches, collapse = "\n\n\n"),
610 "\n\\end{quote}\n\n") 1003 "\n\\end{quote}\n\n")
611 1004
612 regex_sample_grouping <- nuke_control_sequences(regex_sample_grouping) 1005 regex_sample_grouping <- nuke_control_sequences(regex_sample_grouping)
613 1006
614 cat("\\leavevmode\\newline\n") 1007 cat("\\leavevmode\n\n\n")
615 cat("Parsing rule for SampleGrouping is", 1008 cat("Parsing rule for SampleGrouping is",
616 "\\newline\n", 1009 "\n\n\n",
617 "\\text{'", 1010 "\\text{'",
618 regex_sample_grouping, 1011 regex_sample_grouping,
619 "'}\\newline\n", 1012 "'}\n\n\n",
620 sep = "" 1013 sep = ""
621 ) 1014 )
622 1015
623 cat("\\newline\n") 1016 cat("\n\n\n")
624 cat("Sample group assignments are:\n", 1017 cat("Sample group assignments are:\n",
625 "\\begin{quote}\n", 1018 "\\begin{quote}\n",
626 paste(regmatches(temp_matches, m2), collapse = "\\newline\n"), 1019 paste(regmatches(sample_name_matches, m2), collapse = "\n\n\n"),
627 "\n\\end{quote}\n\n") 1020 "\n\\end{quote}\n\n")
628 1021
629 } else { 1022 } else {
630 p_value_data_anova_ps <- 1023 p_value_data_anova_ps <-
631 apply( 1024 apply(
647 1040
648 # output ANOVA file to constructed filename, 1041 # output ANOVA file to constructed filename,
649 # e.g. "Outputfile_pST_ANOVA_STEP5.txt" 1042 # e.g. "Outputfile_pST_ANOVA_STEP5.txt"
650 # becomes "Outpufile_pST_ANOVA_STEP5_FDR0.05.txt" 1043 # becomes "Outpufile_pST_ANOVA_STEP5_FDR0.05.txt"
651 1044
652 # Re-output quantile-normalized log-transformed dataset 1045 # Re-output datasets to include p-values
653 # with imputed, normalized data to include p-values
654
655 data_table_imputed <-
656 cbind(full_data[1:9], p_value_data[, 2:3], quant_data_log)
657 write.table( 1046 write.table(
658 data_table_imputed, 1047 cbind(full_data[1:9], p_value_data[, 2:3], quant_data_imp),
659 file = imputed_data_filename, 1048 file = imputed_data_filename,
660 sep = "\t", 1049 sep = "\t",
661 col.names = TRUE, 1050 col.names = TRUE,
662 row.names = FALSE, 1051 row.names = FALSE,
663 quote = FALSE 1052 quote = FALSE
664 ) 1053 )
665 1054
1055 write.table(
1056 cbind(full_data[1:9], p_value_data[, 2:3], quant_data_imp_qn_log),
1057 file = imp_qn_lt_data_filenm,
1058 sep = "\t",
1059 col.names = TRUE,
1060 row.names = FALSE,
1061 quote = FALSE
1062 )
1063
666 1064
667 p_value_data <- 1065 p_value_data <-
668 p_value_data[order(p_value_data$fdr_adjusted_anova_p), ] 1066 p_value_data[order(p_value_data$fdr_adjusted_anova_p), ]
669 1067
1068 first_page_suppress <- 1
1069 number_of_peptides_found <- 0
670 cutoff <- val_fdr[1] 1070 cutoff <- val_fdr[1]
671 for (cutoff in val_fdr) { 1071 for (cutoff in val_fdr) {
1072 if (number_of_peptides_found > 49) {
1073 cat("\\leavevmode\n\n\n")
1074 break
1075 }
1076
672 #loop through FDR cutoffs 1077 #loop through FDR cutoffs
673 1078
674 filtered_p <- 1079 filtered_p <-
675 p_value_data[ 1080 p_value_data[
676 which(p_value_data$fdr_adjusted_anova_p < cutoff), 1081 which(p_value_data$fdr_adjusted_anova_p < cutoff),
677 , 1082 , drop = FALSE
678 drop = FALSE
679 ] 1083 ]
680 filtered_data_filtered <- 1084 filtered_data_filtered <-
681 quant_data_imp_qn_log[ 1085 quant_data_imp_qn_log[
682 rownames(filtered_p), 1086 rownames(filtered_p),
683 , 1087 , drop = FALSE
684 drop = FALSE
685 ] 1088 ]
686 filtered_data_filtered <- 1089 filtered_data_filtered <-
687 filtered_data_filtered[ 1090 filtered_data_filtered[
688 order(filtered_p$fdr_adjusted_anova_p), 1091 order(filtered_p$fdr_adjusted_anova_p),
689 , 1092 , drop = FALSE
690 drop = FALSE
691 ] 1093 ]
692 1094
693 # <!-- ACE insertion start --> 1095 # <!-- ACE insertion start -->
694 old_oma <- par("oma") 1096
695 old_par <- par( 1097 if (nrow(filtered_p) && nrow(filtered_data_filtered) > 0) {
696 mai = (par("mai") + c(0.7, 0, 0, 0)) * c(1, 1, 0.3, 1), 1098 if (first_page_suppress == 1) {
697 oma = old_oma * c(1, 1, 0.3, 1), 1099 first_page_suppress <- 0
698 cex.main = 0.9, 1100 }
699 cex.axis = 0.7 1101 else {
700 ) 1102 cat("\\newpage\n")
701 1103 }
702 cat("\\newpage\n")
703 if (nrow(filtered_data_filtered) > 0) {
704 cat(sprintf( 1104 cat(sprintf(
705 "Intensities for peptides whose adjusted p-value < %0.2f\n", 1105 "Intensities for %d peptides whose adjusted p-value < %0.2f:\n",
1106 nrow(filtered_data_filtered),
706 cutoff 1107 cutoff
707 )) 1108 ))
708 cat("\\newline\n") 1109 cat("\n\n\n")
709 cat("\\newline\n") 1110 cat("\n\n\n")
710 1111
1112 old_oma <- par("oma")
1113 old_par <- par(
1114 mai = (par("mai") + c(0.7, 0, 0, 0)) * c(1, 1, 0.3, 1),
1115 oma = old_oma * c(1, 1, 0.3, 1),
1116 cex.main = 0.9,
1117 cex.axis = 0.7,
1118 fin = c(9, 7.25)
1119 )
1120 # ref: https://r-charts.com/distribution/add-points-boxplot/
1121 # Vertical plot
1122 colnames(filtered_data_filtered) <- sample_name_matches
711 boxplot( 1123 boxplot(
712 filtered_data_filtered, 1124 filtered_data_filtered,
713 main = "Imputed, normalized intensities", # no line plot 1125 main = "Imputed, normalized intensities", # no line plot
714 las = 2, 1126 las = 1,
1127 col = const_boxplot_fill,
715 ylab = expression(log[10](intensity)) 1128 ylab = expression(log[10](intensity))
716 ) 1129 )
1130 # Points
1131 stripchart(
1132 filtered_data_filtered, # Data
1133 method = "jitter", # Random noise
1134 jitter = const_stripchart_jitter,
1135 pch = 19, # Pch symbols
1136 cex = const_stripchart_cex, # Size of symbols reduced
1137 col = "goldenrod", # Color of the symbol
1138 vertical = TRUE, # Vertical mode
1139 add = TRUE # Add it over
1140 )
1141 par(old_par)
717 } else { 1142 } else {
718 cat(sprintf( 1143 cat(sprintf(
719 "No peptides were found to have cutoff adjusted p-value < %0.2f\n", 1144 "%s < %0.2f\n\n\n\n\n",
1145 "No peptides were found to have cutoff adjusted p-value <",
720 cutoff 1146 cutoff
721 )) 1147 ))
722 } 1148 }
723 par(old_par)
724 1149
725 if (nrow(filtered_data_filtered) > 0) { 1150 if (nrow(filtered_data_filtered) > 0) {
726 #Add Phosphopeptide column to anova_filtered table 1151 #Add Phosphopeptide column to anova_filtered table
727 anova_filtered_merge <- merge( 1152 anova_filtered_merge <- merge(
728 x = connect_df 1153 x = connect_df,
729 , 1154 y = filtered_data_filtered,
730 y = filtered_data_filtered 1155 by.x = "Intensity",
731 ,
732 by.x = "Intensity"
733 ,
734 by.y = 1 1156 by.y = 1
735 ) 1157 )
736 anova_filtered_merge_order <- rownames(filtered_p) 1158 anova_filtered_merge_order <- rownames(filtered_p)
737 1159
738 anova_filtered_merge_format <- sapply( 1160 anova_filtered_merge_format <- sapply(
757 c("Phosphopeptide", colnames(filtered_data_filtered)) 1179 c("Phosphopeptide", colnames(filtered_data_filtered))
758 1180
759 # Merge qualitative columns into the ANOVA data 1181 # Merge qualitative columns into the ANOVA data
760 output_table <- data.frame(anova_filtered$Phosphopeptide) 1182 output_table <- data.frame(anova_filtered$Phosphopeptide)
761 output_table <- merge( 1183 output_table <- merge(
762 x = output_table 1184 x = output_table,
763 , 1185 y = data_table_imp_qn_lt,
764 y = data_table_imp_qn_lt 1186 by.x = "anova_filtered.Phosphopeptide",
765 ,
766 by.x = "anova_filtered.Phosphopeptide"
767 ,
768 by.y = "Phosphopeptide" 1187 by.y = "Phosphopeptide"
769 ) 1188 )
770 1189
771 # Produce heatmap to visualize significance and the effect of imputation 1190 # Produce heatmap to visualize significance and the effect of imputation
772 m <- 1191 m <-
775 matrix( 1194 matrix(
776 as.integer(is.na(m)), 1195 as.integer(is.na(m)),
777 nrow = nrow(m) 1196 nrow = nrow(m)
778 ) 1197 )
779 ) 1198 )
780 m <- m[!m_nan_rows, ] 1199 m <- m[!m_nan_rows, , drop = FALSE]
781 if (nrow(m) > 0) { 1200 if (nrow(m) > 0) {
782 rownames_m <- rownames(m) 1201 rownames_m <- rownames(m)
783 rownames(m) <- sapply( 1202 rownames(m) <- sapply(
784 X = seq_len(nrow(m)) 1203 X = seq_len(nrow(m))
785 , 1204 ,
789 filtered_p$fdr_adjusted_anova_p[i], 1208 filtered_p$fdr_adjusted_anova_p[i],
790 rownames_m[i] 1209 rownames_m[i]
791 ) 1210 )
792 } 1211 }
793 ) 1212 )
794 margins <-
795 c(max(nchar(colnames(m))) * 10 / 16 # col
796 , max(nchar(rownames(m))) * 5 / 16 # row
797 )
798 how_many_peptides <- min(50, nrow(m)) 1213 how_many_peptides <- min(50, nrow(m))
1214 number_of_peptides_found <- how_many_peptides
1215 if (nrow(m) > 1) {
1216 m_margin <- m[how_many_peptides:1, ]
1217 margins <-
1218 c(max(nchar(colnames(m_margin))) * 10 / 16 # col
1219 , max(nchar(rownames(m_margin))) * 5 / 16 # row
1220 )
1221 }
799 1222
800 cat("\\newpage\n") 1223 cat("\\newpage\n")
801 if (nrow(m) > 50) { 1224 if (nrow(m) > 50) {
802 cat("Heatmap for the 50 most-significant peptides", 1225 cat("Heatmap for the 50 most-significant peptides",
803 sprintf( 1226 sprintf(
804 "whose adjusted p-value < %0.2f\n", 1227 "whose adjusted p-value < %0.2f\n",
805 cutoff) 1228 cutoff)
806 ) 1229 )
807 } else { 1230 } else {
808 cat("Heatmap for peptides whose", 1231 if (nrow(m) == 1) {
809 sprintf("adjusted p-value < %0.2f\n", 1232 cat(
810 cutoff) 1233 sprintf("Heatmap for %d usable peptides whose", nrow(m)),
811 ) 1234 sprintf("adjusted p-value < %0.2f\n", cutoff)
1235 )
1236 next
1237 }
812 } 1238 }
813 cat("\\newline\n") 1239 cat("\n\n\n")
814 cat("\\newline\n") 1240 cat("\n\n\n")
815 op <- par("cex.main")
816 try( 1241 try(
817 if (nrow(m) > 1) { 1242 if (nrow(m) > 1) {
1243 old_oma <- par("oma")
818 par(cex.main = 0.6) 1244 par(cex.main = 0.6)
819 heatmap( 1245 heatmap(
820 m[how_many_peptides:1, ], 1246 m[how_many_peptides:1, ],
821 Rowv = NA, 1247 Rowv = NA,
822 Colv = NA, 1248 Colv = NA,
823 cexRow = 0.7, 1249 cexRow = 0.7,
824 cexCol = 0.8, 1250 cexCol = 0.8,
825 scale = "row", 1251 scale = "row",
826 #ACE scale = "none",
827 margins = margins, 1252 margins = margins,
828 main = 1253 main =
829 "Heatmap of unimputed, unnormalized intensities", 1254 "Unimputed, unnormalized intensities",
830 xlab = "" 1255 xlab = "",
1256 las = 1 #, fin = c(9, 5.5)
831 ) 1257 )
832 } 1258 }
833 ) 1259 )
834 par(op)
835 } 1260 }
836 } 1261 }
837 } 1262 }
838 } 1263 }
1264 cat("\\leavevmode\n\n\n")
839 ``` 1265 ```
840 1266
841 <!-- 1267 <!--
842 ## Peptide IDs, etc. 1268 ## Peptide IDs, etc.
843 1269