Mercurial > repos > eschen42 > mqppep_anova
view mqppep_anova_script.Rmd @ 21:f3deca1a3c84 draft
"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit 00881af5b1373174e5afe706add2f33b8614828c"
author | eschen42 |
---|---|
date | Wed, 13 Apr 2022 19:48:32 +0000 |
parents | 2c5f1a2fe16a |
children | 61adb8801b73 |
line wrap: on
line source
--- title: "MaxQuant Phospho-Proteomic Enrichment Pipeline ANOVA" author: "Larry Cheng; Art Eschenlauer" date: "May 28, 2018; Mar 16, 2022" output: pdf_document: toc: true latex_document: toc: true params: alphaFile: "test-data/alpha_levels.tabular" inputFile: "test-data/UT_Phospho_ST_Sites.preproc.tabular" firstDataColumn: "^Intensity[^_]" imputationMethod: !r c("group-median", "median", "mean", "random")[4] meanPercentile: 1 sdPercentile: 1.0 regexSampleNames: "\\.\\d+[A-Z]$" regexSampleGrouping: "\\d+" imputedDataFilename: "test-data/limbo/imputedDataFilename.txt" imputedQNLTDataFile: "test-data/limbo/imputedQNLTDataFile.txt" show_toc: true --- <!-- alphaFile: "test-data/alpha_levels.tabular" inputFile: "test-data/test_input_for_anova.tabular" inputFile: "test-data/UT_Phospho_ST_Sites.preproc.tabular" inputFile: "test-data/density_failure.preproc_tab.tabular" latex_document: default --> ```{r setup, include = FALSE} # ref for parameterizing Rmd document: https://stackoverflow.com/a/37940285 knitr::opts_chunk$set(echo = FALSE, fig.dim = c(9, 10)) # freeze the random number generator so the same results will be produced # from run to run set.seed(28571) ### CONSTANTS const_parfin <- par("fin") const_boxplot_fill <- "grey94" const_stripchart_cex <- 0.5 const_stripsmall_cex <- sqrt(const_stripchart_cex * const_stripchart_cex / 2) const_stripchart_jitter <- 0.3 const_write_debug_files <- FALSE const_table_anchor <- "tbp" ### FUNCTIONS #ANOVA filter function anova_func <- function(x, grouping_factor) { x_aov <- aov(as.numeric(x) ~ grouping_factor) pvalue <- summary(x_aov)[[1]][["Pr(>F)"]][1] pvalue } write_debug_file <- function(s) { if (const_write_debug_files) { s_path <- sprintf("test-data/%s.txt", deparse(substitute(s))) write.table( s, file = s_path, sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE ) } } latex_collapsed_vector <- function(collapse_string, v) { cat( paste0( gsub("_", "\\\\_", v), collapse = collapse_string ) ) } latex_itemized_collapsed <- function(collapse_string, v) { cat("\\begin{itemize}\n\\item ") latex_collapsed_vector(collapse_string, v) cat("\n\\end{itemize}\n") } latex_itemized_list <- function(v) { latex_itemized_collapsed("\n\\item ", v) } latex_enumerated_collapsed <- function(collapse_string, v) { cat("\\begin{enumerate}\n\\item ") latex_collapsed_vector(collapse_string, v) cat("\n\\end{enumerate}\n") } latex_enumerated_list <- function(v) { latex_enumerated_collapsed("\n\\item ", v) } latex_table_row <- function(v) { latex_collapsed_vector(" & ", v) cat(" \\\\\n") } # Use this like print.data.frame, from which it is adapted: data_frame_latex <- function( x, ..., # digits to pass to format.data.frame digits = NULL, # TRUE -> right-justify columns; FALSE -> left-justify right = TRUE, # maximumn number of rows to print max = NULL, # string with justification of each column justification = NULL, # TRUE to center on page centered = TRUE, # optional capttion caption = NULL, # h(inline); b(bottom); t (top) or p (separate page) anchor = "h" ) { if (is.null(justification)) justification <- Reduce( f = paste, x = rep_len(if (right) "r" else "l", length(colnames(x))) ) n <- length(rownames(x)) if (length(x) == 0L) { cat( sprintf( # if n is one, use singular 'row', else use plural 'rows' ngettext( n, "data frame with 0 columns and %d row", "data frame with 0 columns and %d rows" ), n ), "\n", sep = "" ) } else if (n == 0L) { cat("0 rows for:\n") latex_itemized_list(names(x)) } else { if (is.null(max)) max <- getOption("max.print", 99999L) if (!is.finite(max)) stop("invalid 'max' / getOption(\"max.print\"): ", max) omit <- (n0 <- max %/% length(x)) < n m <- as.matrix( format.data.frame( if (omit) x[seq_len(n0), , drop = FALSE] else x, digits = digits, na.encode = FALSE ) ) cat( # h(inline); b(bottom); t (top) or p (separate page) paste0("\\begin{table}[", anchor, "]\n") ) if (!is.null(caption)) cat(paste0(" \\caption{", caption, "}")) if (centered) cat("\\centering\n") cat( paste( " \\begin{tabular}{", justification, "}\n", sep = "" ) ) if (!is.null(caption)) cat(" \\hline\\hline\n") latex_table_row(colnames(m)) cat("\\hline\n") for (i in seq_len(length(m[, 1]))) { latex_table_row(m[i, ]) } cat( paste( " \\end{tabular}", "\\end{table}", sep = "\n" ) ) if (omit) cat(" [ reached 'max' / getOption(\"max.print\") -- omitted", n - n0, "rows ]\n") } invisible(x) } ``` ## Purpose Perform imputation of missing values, quantile normalization, and ANOVA. ```{r include = FALSE} # Input Filename input_file <- params$inputFile # First data column - ideally, this could be detected via regexSampleNames, # but for now leave it as is. first_data_column <- params$firstDataColumn fdc_is_integer <- is.integer(first_data_column) if (fdc_is_integer) { first_data_column <- as.integer(params$firstDataColumn) } # False discovery rate adjustment for ANOVA # Since pY abundance is low, set to 0.10 and 0.20 in addition to 0.05 val_fdr <- read.table(file = params$alphaFile, sep = "\t", header = F, quote = "") if ( ncol(val_fdr) != 1 || sum(!is.numeric(val_fdr[, 1])) || sum(val_fdr[, 1] < 0) || sum(val_fdr[, 1] > 1) ) { stop("alphaFile should be one column of numbers within the range [0.0,1.0]") } val_fdr <- val_fdr[, 1] #Imputed Data filename imputed_data_filename <- params$imputedDataFilename imp_qn_lt_data_filenm <- params$imputedQNLTDataFile #ANOVA data filename ``` ```{r echo = FALSE} # Imputation method, should be one of # "random", "group-median", "median", or "mean" imputation_method <- params$imputationMethod # Selection of percentile of logvalue data to set the mean for random number # generation when using random imputation mean_percentile <- params$meanPercentile / 100.0 # deviation adjustment-factor for random values; real number. sd_percentile <- params$sdPercentile # Regular expression of Sample Names, e.g., "\\.(\\d+)[A-Z]$" regex_sample_names <- params$regexSampleNames # Regular expression to extract Sample Grouping from Sample Name; # if error occurs, compare sample_factor_levels and sample_name_matches # to see if groupings/pairs line up # e.g., "(\\d+)" regex_sample_grouping <- params$regexSampleGrouping ``` ```{r echo = FALSE} ### READ DATA library(data.table) # read.table reads a file in table format and creates a data frame from it. # - note that `quote = ""` means that quotation marks are treated literally. full_data <- read.table( file = input_file, sep = "\t", header = T, quote = "", check.names = FALSE ) ``` ## Extract Sample Names and Factor Levels Column names parsed from input file are shown in Table 1; sample names and factor levels, in Table 2. ```{r echo = FALSE, results = 'asis'} data_column_indices <- grep(first_data_column, names(full_data), perl = TRUE) if (!fdc_is_integer) { if (length(data_column_indices) > 0) { first_data_column <- data_column_indices[1] } else { stop(paste("failed to convert firstDataColumn:", first_data_column)) } } cat( sprintf( paste( "\n\nPeptide-intensity data for each sample is", "in one of columns %d through %d.\n\n" ), min(data_column_indices), max(data_column_indices) ) ) # Write column names as a LaTeX enumerated list. column_name_df <- data.frame( column = seq_len(length(colnames(full_data))), name = colnames(full_data) ) data_frame_latex( x = column_name_df, justification = "l l", centered = TRUE, caption = "Input data column name", anchor = const_table_anchor ) ``` ```{r echo = FALSE, results = 'asis'} #```{r echo = FALSE, results = 'asis'} quant_data <- full_data[first_data_column:length(full_data)] quant_data[quant_data == 0] <- NA rownames(quant_data) <- full_data$Phosphopeptide # Get factors -> group replicates (as indicated by terminal letter) # by the preceding digits; # Assuming that regex_sample_names <- "\\.(\\d+)[A-Z]$" # get factors -> # group runs (samples) by ignoring terminal [A-Z] in sample names # e.g. # group .1A .1B .1C into group 1; # group .2A .2B .2C, into group 2; # etc. m <- regexpr(regex_sample_names, colnames(quant_data), perl = TRUE) sample_name_matches <- regmatches(names(quant_data), m) colnames(quant_data) <- sample_name_matches write_debug_file(quant_data) m2 <- regexpr(regex_sample_grouping, sample_name_matches, perl = TRUE) sample_factor_levels <- as.factor(regmatches(sample_name_matches, m2)) number_of_samples <- length(sample_name_matches) sample_factor_df <- data.frame( sample = sample_name_matches, level = sample_factor_levels ) data_frame_latex( x = sample_factor_df, justification = "c c", centered = TRUE, caption = "Factor level", anchor = const_table_anchor ) ``` ```{r echo = FALSE, results = 'asis'} cat("\\newpage\n") ``` ### Are the log-transformed sample distributions similar? ```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'} quant_data[quant_data == 0] <- NA #replace 0 with NA quant_data_log <- log10(quant_data) rownames(quant_data_log) <- rownames(quant_data) colnames(quant_data_log) <- sample_name_matches write_debug_file(quant_data_log) # data visualization old_par <- par( mai = par("mai") + c(0.5, 0, 0, 0) ) # ref: https://r-charts.com/distribution/add-points-boxplot/ # Vertical plot boxplot( quant_data_log , las = 1 , col = const_boxplot_fill , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") , xlab = "Sample" ) par(old_par) cat("\n\n\n") cat("\n\n\n") ``` ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 4), results = 'asis'} library(ggplot2) if (nrow(quant_data_log) > 1) { quant_data_log_stack <- stack(quant_data_log) ggplot( quant_data_log_stack, aes(x = values) ) + xlab(latex2exp::TeX("$log_{10}$(peptide intensity)")) + ylab("Probability density") + geom_density( aes(group = ind, colour = ind), na.rm = TRUE ) } else { cat("No density plot because there are too few peptides.\n\n") } ``` ### Globally, are peptide intensities are approximately unimodal? <!-- # bquote could be used as an alternative to latex2exp::TeX below particularly # and when plotting math expressions generally, at the expense of mastering # another syntax, which hardly seems worthwhile when I need to use TeX # elsewhere; here's an introduction to bquote: # https://www.r-bloggers.com/2018/03/math-notation-for-r-plot-titles-expression-and-bquote/ --> ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 5), results = 'asis'} # identify the location of missing values fin <- is.finite(as.numeric(as.matrix(quant_data_log))) logvalues <- as.numeric(as.matrix(quant_data_log))[fin] logvalues_density <- density(logvalues) plot( x = logvalues_density, main = latex2exp::TeX( "Smoothed estimated probability density vs. $log_{10}$(peptide intensity)" ), xlab = latex2exp::TeX("$log_{10}$(peptide intensity)"), ylab = "Probability density" ) hist( x = as.numeric(as.matrix(quant_data_log)), xlim = c(min(logvalues_density$x), max(logvalues_density$x)), breaks = 100, main = latex2exp::TeX("Frequency vs. $log_{10}$(peptide intensity)"), xlab = latex2exp::TeX("$log_{10}$(peptide intensity)") ) ``` ### Distribution of standard deviations of $log_{10}(\text{intensity})$, ignoring missing values: ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 5), results = 'asis'} # determine quantile q1 <- quantile(logvalues, probs = mean_percentile)[1] # determine standard deviation of quantile to impute sd_finite <- function(x) { ok <- is.finite(x) sd(x[ok]) } # 1 = row of matrix (ie, phosphopeptide) sds <- apply(quant_data_log, 1, sd_finite) if (sum(!is.na(sds)) > 2) { plot( density(sds, na.rm = T) , main = "Smoothed estimated probability density vs. std. deviation" , sub = "(probability estimation made with Gaussian smoothing)" , ylab = "Probability density" ) } else { cat( "At least two non-missing values are required to plot", "probability density.\n\n" ) } ``` ```{r echo = FALSE} # Determine number of cells to impute temp <- quant_data[is.na(quant_data)] # Determine number of values to impute number_to_impute <- length(temp) # Determine percent of missing values pct_missing_values <- round(length(temp) / (length(logvalues) + length(temp)) * 100) ``` ```{r echo = FALSE} # prep for trt-median based imputation ``` ## Impute Missing Values ```{r echo = FALSE} imp_smry_potential_before <- length(logvalues) imp_smry_missing_before <- number_to_impute imp_smry_pct_missing <- pct_missing_values ``` ```{r echo = FALSE} #Determine number of cells to impute ``` ```{r echo = FALSE} #Impute data quant_data_imp <- quant_data # Identify which values are missing and need to be imputed ind <- which(is.na(quant_data_imp), arr.ind = TRUE) ``` ```{r echo = FALSE, results = 'asis'} # Apply imputation switch( imputation_method , "group-median" = { imputation_method_description <- paste("Substitute missing value with", "median peptide intensity for sample group.\n" ) sample_level_integers <- as.integer(sample_factor_levels) for (i in seq_len(length(levels(sample_factor_levels)))) { level_cols <- i == sample_level_integers ind <- which(is.na(quant_data_imp[, level_cols]), arr.ind = TRUE) quant_data_imp[ind, level_cols] <- apply(quant_data_imp[, level_cols], 1, median, na.rm = T)[ind[, 1]] } good_rows <- !is.na(rowMeans(quant_data_imp)) } , "median" = { imputation_method_description <- paste("Substitute missing value with", "median peptide intensity across all sample classes.\n" ) quant_data_imp[ind] <- apply(quant_data_imp, 1, median, na.rm = T)[ind[, 1]] good_rows <- !is.na(rowMeans(quant_data_imp)) } , "mean" = { imputation_method_description <- paste("Substitute missing value with", "mean peptide intensity across all sample classes.\n" ) quant_data_imp[ind] <- apply(quant_data_imp, 1, mean, na.rm = T)[ind[, 1]] good_rows <- !is.na(rowMeans(quant_data_imp)) } , "random" = { m1 <- median(sds, na.rm = T) * sd_percentile #sd to be used is the median sd # If you want results to be reproducible, you will want to call # base::set.seed before calling stats::rnorm imputation_method_description <- paste("Substitute each missing value with random intensity", sprintf( "random intensity $N \\sim (%0.2f, %0.2f)$.\n", q1, m1 ) ) cat(sprintf("mean_percentile (from input parameter) is %2.0f\n\n", 100 * mean_percentile)) cat(sprintf("sd_percentile (from input parameter) is %0.2f\n\n", sd_percentile)) quant_data_imp[ind] <- 10 ^ rnorm(number_to_impute, mean = q1, sd = m1) good_rows <- !is.na(rowMeans(quant_data_imp)) } ) if (length(good_rows) < 1) { print("ERROR: Cannot impute data; there are no good rows!") return(-1) } ``` ```{r echo = FALSE, results = 'asis'} cat("\\quad\n\nImputation method:\n\n\n", imputation_method_description) ``` ```{r echo = FALSE} imp_smry_potential_after <- sum(good_rows) imp_smry_rejected_after <- sum(!good_rows) imp_smry_missing_after <- sum(is.na(quant_data_imp[good_rows, ])) ``` ```{r echo = FALSE, results = 'asis'} # ref: http://www1.maths.leeds.ac.uk/latex/TableHelp1.pdf tabular_lines <- paste( "\\begin{table}[hb]", # h(inline); b(bottom); t (top) or p (separate page) " \\caption{Imputation Results}", " \\centering", # \centering centers the table on the page " \\begin{tabular}{l c c c}", " \\hline\\hline", " \\ & potential peptides & missing values & rejected", " peptides \\\\ [0.5ex]", " \\hline", " before imputation & %d & %d (%d\\%s) & \\\\", " after imputation & %d & %d & %d \\\\ [1ex]", " \\hline", " \\end{tabular}", #" \\label{table:nonlin}", # may be used to refer this table in the text "\\end{table}", sep = "\n" ) tabular_lines <- sprintf( tabular_lines, imp_smry_potential_before, imp_smry_missing_before, imp_smry_pct_missing, "%", imp_smry_potential_after, imp_smry_missing_after, imp_smry_rejected_after ) cat(tabular_lines) ``` ```{r echo = FALSE} # Zap rows where imputation was ineffective full_data <- full_data [good_rows, ] quant_data <- quant_data [good_rows, ] write_debug_file(quant_data_imp) quant_data_imp <- quant_data_imp[good_rows, ] quant_data_imp_good_rows <- quant_data_imp write_debug_file(quant_data_imp_good_rows) ``` ```{r echo = FALSE, results = 'asis'} can_plot_before_after_imp <- TRUE d_combined <- as.numeric( as.matrix( log10(quant_data_imp) ) ) d_original <- as.numeric( as.matrix( log10(quant_data_imp[!is.na(quant_data)]) ) ) if (sum(!is.na(d_combined)) > 2 && sum(!is.na(d_original)) > 2) { d_combined <- density(d_combined) d_original <- density(d_original) } else { can_plot_before_after_imp <- FALSE } if (sum(is.na(quant_data)) > 0) { # There ARE missing values d_imputed <- as.numeric( as.matrix( log10(quant_data_imp[is.na(quant_data)]) ) ) if (sum(!is.na(d_combined)) > 2) { d_imputed <- (density(d_imputed)) } else { can_plot_before_after_imp <- FALSE } } else { # There are NO missing values d_imputed <- d_combined } ``` ```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'} if (sum(is.na(quant_data)) > 0) { cat("\\leavevmode\\newpage\n") # data visualization old_par <- par( mai = par("mai") + c(0.5, 0, 0, 0) ) x <- quant_data x <- blue_dots <- x / x blue_dots <- log10(blue_dots * quant_data) x[is.na(x)] <- 0 x[x == 1] <- NA x[x == 0] <- 1 quant_data_imp_log10 <- log10(quant_data_imp) write_debug_file(quant_data_imp_log10) red_dots <- quant_data_imp_log10 * x count_red <- sum(!is.na(red_dots)) count_blue <- sum(!is.na(blue_dots)) ylim_save <- ylim <- c( min(red_dots, blue_dots, na.rm = TRUE), max(red_dots, blue_dots, na.rm = TRUE) ) show_stripchart <- 50 > (count_red + count_blue) / length(sample_name_matches) if (show_stripchart) { boxplot_sub <- "Light blue = data before imputation; Red = imputed data" } else { boxplot_sub <- "" } # Vertical plot colnames(blue_dots) <- sample_name_matches boxplot( blue_dots , las = 1 # "always horizontal" , col = const_boxplot_fill , ylim = ylim , main = "Peptide intensities before and after imputation" , sub = boxplot_sub , xlab = "Sample" , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") ) if (show_stripchart) { # Points # ref: https://r-charts.com/distribution/add-points-boxplot/ # NA values are not plotted stripchart( blue_dots, # Data method = "jitter", # Random noise jitter = const_stripchart_jitter, pch = 19, # Pch symbols cex = const_stripsmall_cex, # Size of symbols reduced col = "lightblue", # Color of the symbol vertical = TRUE, # Vertical mode add = TRUE # Add it over ) stripchart( red_dots, # Data method = "jitter", # Random noise jitter = const_stripchart_jitter, pch = 19, # Pch symbols cex = const_stripsmall_cex, # Size of symbols reduced col = "red", # Color of the symbol vertical = TRUE, # Vertical mode add = TRUE # Add it over ) } else { # violin plot cat("\\leavevmode\n\\quad\n\n\\quad\n\n") vioplot::vioplot( x = lapply(blue_dots, function(x) x[!is.na(x)]), col = "lightblue1", side = "left", plotCentre = "line", ylim = ylim_save, main = "Distributions of observed and imputed data", sub = "Light blue = observed data; Pink = imputed data", xlab = "Sample", ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") ) vioplot::vioplot( x = lapply(red_dots, function(x) x[!is.na(x)]), col = "lightpink1", side = "right", plotCentre = "line", add = T ) } par(old_par) # density plot cat("\\leavevmode\n\n\n\n\n\n\n") if (can_plot_before_after_imp) { ylim <- c(0, max(d_combined$y, d_original$y, d_imputed$y)) plot( d_combined, ylim = ylim, sub = paste( "Blue = data before imputation; Red = imputed data;", "Black = combined" ), main = "Density of peptide intensity before and after imputation", xlab = latex2exp::TeX("$log_{10}$(peptide intensity)"), ylab = "Probability density" ) lines(d_original, col = "blue") lines(d_imputed, col = "red") } else { cat( "There are too few points to plot the density of peptide intensity", "before and after imputation." ) } cat("\\leavevmode\\newpage\n") } ``` ## Perform Quantile Normalization <!-- # Apply quantile normalization using preprocessCore::normalize.quantiles # --- # tool repository: http://bioconductor.org/packages/release/bioc/html/preprocessCore.html # except this: https://support.bioconductor.org/p/122925/#9135989 # says to install it like this: # ``` # BiocManager::install("preprocessCore", configure.args="--disable-threading", force = TRUE, lib=.libPaths()[1]) # ``` # conda installation (necessary because of a bug in recent openblas): # conda install bioconductor-preprocesscore openblas=0.3.3 # ... # --- # normalize.quantiles {preprocessCore} -- Quantile Normalization # # Description: # Using a normalization based upon quantiles, this function normalizes a matrix of probe level intensities. # # Usage: # normalize.quantiles(x, copy = TRUE, keep.names = FALSE) # # Arguments: # # - x: A matrix of intensities where each column corresponds to a chip and each row is a probe. # # - copy: Make a copy of matrix before normalizing. Usually safer to work with a copy, # but in certain situations not making a copy of the matrix, but instead normalizing # it in place will be more memory friendly. # # - keep.names: Boolean option to preserve matrix row and column names in output. # # Details: # This method is based upon the concept of a quantile-quantile plot extended to n dimensions. # No special allowances are made for outliers. If you make use of quantile normalization # please cite Bolstad et al, Bioinformatics (2003). # # This functions will handle missing data (ie NA values), based on # the assumption that the data is missing at random. # # Note that the current implementation optimizes for better memory usage # at the cost of some additional run-time. # # Value: A normalized matrix. # # Author: Ben Bolstad, bmbolstad.com # # References # # - Bolstad, B (2001) Probe Level Quantile Normalization of High Density Oligonucleotide # Array Data. Unpublished manuscript http://bmbolstad.com/stuff/qnorm.pdf # # - Bolstad, B. M., Irizarry R. A., Astrand, M, and Speed, T. P. (2003) A Comparison of # Normalization Methods for High Density Oligonucleotide Array Data Based on Bias # and Variance. Bioinformatics 19(2), pp 185-193. DOI 10.1093/bioinformatics/19.2.185 # http://bmbolstad.com/misc/normalize/normalize.html # ... --> ```{r echo = FALSE, results = 'asis'} library(preprocessCore) if (nrow(quant_data_imp) > 0) { quant_data_imp_qn <- normalize.quantiles(as.matrix(quant_data_imp)) } else { quant_data_imp_qn <- as.matrix(quant_data_imp) } quant_data_imp_qn <- as.data.frame(quant_data_imp_qn) names(quant_data_imp_qn) <- names(quant_data_imp) write_debug_file(quant_data_imp_qn) quant_data_imp_qn_log <- log10(quant_data_imp_qn) rownames(quant_data_imp_qn_log) <- rownames(quant_data_imp) write_debug_file(quant_data_imp_qn_log) quant_data_imp_qn_ls <- t(scale(t(log10(quant_data_imp_qn)))) any_nan <- function(x) { !any(x == "NaN") } sel <- apply(quant_data_imp_qn_ls, 1, any_nan) quant_data_imp_qn_ls2 <- quant_data_imp_qn_ls[which(sel), ] quant_data_imp_qn_ls2 <- as.data.frame(quant_data_imp_qn_ls2) rownames(quant_data_imp_qn_ls2) <- rownames(quant_data_imp)[sel] quant_data_imp_qn_ls <- as.data.frame(quant_data_imp_qn_ls) rownames(quant_data_imp_qn_ls) <- rownames(quant_data_imp) write_debug_file(quant_data_imp_qn_ls) write_debug_file(quant_data_imp_qn_ls2) #output quantile normalized data data_table_imp_qn_lt <- cbind(full_data[1:9], quant_data_imp_qn_log) write.table( data_table_imp_qn_lt, file = imp_qn_lt_data_filenm, sep = "\t", col.names = TRUE, row.names = FALSE, quote = FALSE ) ``` <!-- ACE insertion begin --> ### Checking that normalized, imputed, log-transformed sample distributions are similar: ```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'} # Save unimputed quant_data_log for plotting below unimputed_quant_data_log <- quant_data_log # log10 transform (after preparing for zero values, # which should never happen...) quant_data_imp_qn[quant_data_imp_qn == 0] <- .000000001 quant_data_log <- log10(quant_data_imp_qn) # Output with imputed, un-normalized data data_table_imputed <- cbind(full_data[1:9], quant_data_imp) write.table( data_table_imputed , file = imputed_data_filename , sep = "\t" , col.names = TRUE , row.names = FALSE , quote = FALSE ) how_many_peptides <- nrow(quant_data_log) if ((how_many_peptides) > 0) { cat( sprintf( "Intensities for %d peptides:\n\n\n", how_many_peptides ) ) cat("\n\n\n") # data visualization old_par <- par( mai = par("mai") + c(0.5, 0, 0, 0) , oma = par("oma") + c(0.5, 0, 0, 0) ) # ref: https://r-charts.com/distribution/add-points-boxplot/ # Vertical plot colnames(quant_data_log) <- sample_name_matches boxplot( quant_data_log , las = 1 , col = const_boxplot_fill , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") , xlab = "Sample" ) par(old_par) } else { cat("There are no peptides to plot\n") } cat("\n\n\n") ``` ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 4), results = 'asis'} if (nrow(quant_data_log) > 1) { quant_data_log_stack <- stack(quant_data_log) ggplot( quant_data_log_stack, aes(x = values) ) + xlab(latex2exp::TeX("$log_{10}$(peptide intensity)")) + ylab("Probability density") + geom_density( aes(group = ind, colour = ind), na.rm = TRUE ) } else { cat("No density plot because there are fewer than two peptides to plot.\n\n") } ``` ```{r echo = FALSE, results = 'asis'} cat("\\leavevmode\\newpage\n") ``` ## Perform ANOVA Filters ```{r, echo = FALSE} # Make new data frame containing only Phosphopeptides # to connect preANOVA to ANOVA (connect_df) connect_df <- data.frame( data_table_imp_qn_lt$Phosphopeptide , data_table_imp_qn_lt[, first_data_column] ) colnames(connect_df) <- c("Phosphopeptide", "Intensity") ``` ```{r echo = FALSE, fig.dim = c(9, 10), results = 'asis'} count_of_factor_levels <- length(levels(sample_factor_levels)) if (count_of_factor_levels < 2) { nuke_control_sequences <- function(s) { s <- gsub("[\\]", "xyzzy_plugh", s) s <- gsub("[$]", "\\\\$", s) s <- gsub("xyzzy_plugh", "$\\\\backslash$", s) return(s) } cat( "ERROR!!!! Cannot perform ANOVA analysis", "(see next page)\\newpage\n" ) cat( "ERROR: ANOVA analysis", "requires two or more factor levels!\n\n\n" ) cat("\n\n\n\n\n") cat("Unparsed sample names are:\n\n\n", "\\begin{quote}\n", paste(names(quant_data_imp_qn_log), collapse = "\n\n\n"), "\n\\end{quote}\n\n") regex_sample_names <- nuke_control_sequences(regex_sample_names) cat("\\leavevmode\n\n\n") cat("Parsing rule for SampleNames is", "\n\n\n", "\\text{'", regex_sample_names, "'}\n\n\n", sep = "" ) cat("\nParsed sample names are:\n", "\\begin{quote}\n", paste(sample_name_matches, collapse = "\n\n\n"), "\n\\end{quote}\n\n") regex_sample_grouping <- nuke_control_sequences(regex_sample_grouping) cat("\\leavevmode\n\n\n") cat("Parsing rule for SampleGrouping is", "\n\n\n", "\\text{'", regex_sample_grouping, "'}\n\n\n", sep = "" ) cat("\n\n\n") cat("Sample group assignments are:\n", "\\begin{quote}\n", paste(regmatches(sample_name_matches, m2), collapse = "\n\n\n"), "\n\\end{quote}\n\n") } else { p_value_data_anova_ps <- apply( quant_data_imp_qn_log, 1, anova_func, grouping_factor = sample_factor_levels ) p_value_data_anova_ps_fdr <- p.adjust(p_value_data_anova_ps, method = "fdr") p_value_data <- data.frame( phosphopeptide = full_data[, 1] , raw_anova_p = p_value_data_anova_ps , fdr_adjusted_anova_p = p_value_data_anova_ps_fdr ) # output ANOVA file to constructed filename, # e.g. "Outputfile_pST_ANOVA_STEP5.txt" # becomes "Outpufile_pST_ANOVA_STEP5_FDR0.05.txt" # Re-output datasets to include p-values write.table( cbind(full_data[1:9], p_value_data[, 2:3], quant_data_imp), file = imputed_data_filename, sep = "\t", col.names = TRUE, row.names = FALSE, quote = FALSE ) write.table( cbind(full_data[1:9], p_value_data[, 2:3], quant_data_imp_qn_log), file = imp_qn_lt_data_filenm, sep = "\t", col.names = TRUE, row.names = FALSE, quote = FALSE ) p_value_data <- p_value_data[order(p_value_data$fdr_adjusted_anova_p), ] first_page_suppress <- 1 number_of_peptides_found <- 0 cutoff <- val_fdr[1] for (cutoff in val_fdr) { if (number_of_peptides_found > 49) { cat("\\leavevmode\n\n\n") break } #loop through FDR cutoffs filtered_p <- p_value_data[ which(p_value_data$fdr_adjusted_anova_p < cutoff), , drop = FALSE ] filtered_data_filtered <- quant_data_imp_qn_log[ rownames(filtered_p), , drop = FALSE ] filtered_data_filtered <- filtered_data_filtered[ order(filtered_p$fdr_adjusted_anova_p), , drop = FALSE ] # <!-- ACE insertion start --> if (nrow(filtered_p) && nrow(filtered_data_filtered) > 0) { if (first_page_suppress == 1) { first_page_suppress <- 0 } else { cat("\\newpage\n") } cat(sprintf( "Intensities for %d peptides whose adjusted p-value < %0.2f:\n", nrow(filtered_data_filtered), cutoff )) cat("\n\n\n") cat("\n\n\n") old_oma <- par("oma") old_par <- par( mai = (par("mai") + c(0.7, 0, 0, 0)) * c(1, 1, 0.3, 1), oma = old_oma * c(1, 1, 0.3, 1), cex.main = 0.9, cex.axis = 0.7, fin = c(9, 7.25) ) # ref: https://r-charts.com/distribution/add-points-boxplot/ # Vertical plot colnames(filtered_data_filtered) <- sample_name_matches boxplot( filtered_data_filtered, main = "Imputed, normalized intensities", # no line plot las = 1, col = const_boxplot_fill, ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") ) par(old_par) } else { cat(sprintf( "%s < %0.2f\n\n\n\n\n", "No peptides were found to have cutoff adjusted p-value", cutoff )) } if (nrow(filtered_data_filtered) > 0) { #Add Phosphopeptide column to anova_filtered table anova_filtered_merge <- merge( x = connect_df, y = filtered_data_filtered, by.x = "Intensity", by.y = 1 ) anova_filtered_merge_order <- rownames(filtered_p) anova_filtered_merge_format <- sapply( X = filtered_p$fdr_adjusted_anova_p , FUN = function(x) { if (x > 0.0001) paste0("(%0.", 1 + ceiling(-log10(x)), "f) %s") else paste0("(%0.4e) %s") } ) anova_filtered <- data.table( anova_filtered_merge$Phosphopeptide , anova_filtered_merge$Intensity , anova_filtered_merge[, 2:number_of_samples + 1] ) colnames(anova_filtered) <- c("Phosphopeptide", colnames(filtered_data_filtered)) # Merge qualitative columns into the ANOVA data output_table <- data.frame(anova_filtered$Phosphopeptide) output_table <- merge( x = output_table, y = data_table_imp_qn_lt, by.x = "anova_filtered.Phosphopeptide", by.y = "Phosphopeptide" ) # Produce heatmap to visualize significance and the effect of imputation m <- as.matrix(unimputed_quant_data_log[anova_filtered_merge_order, ]) m_nan_rows <- rowSums( matrix( as.integer(is.na(m)), nrow = nrow(m) ) ) m <- m[!m_nan_rows, , drop = FALSE] if (nrow(m) > 0) { rownames_m <- rownames(m) rownames(m) <- sapply( X = seq_len(nrow(m)) , FUN = function(i) { sprintf( anova_filtered_merge_format[i], filtered_p$fdr_adjusted_anova_p[i], rownames_m[i] ) } ) how_many_peptides <- min(50, nrow(m)) number_of_peptides_found <- how_many_peptides if (nrow(m) > 1) { m_margin <- m[how_many_peptides:1, ] margins <- c(max(nchar(colnames(m_margin))) * 10 / 16 # col , max(nchar(rownames(m_margin))) * 5 / 16 # row ) } cat("\\newpage\n") if (nrow(m) > 50) { cat("Heatmap for the 50 most-significant peptides", sprintf( "whose adjusted p-value < %0.2f\n", cutoff) ) } else { if (nrow(m) == 1) { next } else { cat( sprintf("Heatmap for %d usable peptides whose", nrow(m)), sprintf("adjusted p-value < %0.2f\n", cutoff) ) } } cat("\n\n\n") cat("\n\n\n") try( if (nrow(m) > 1) { old_oma <- par("oma") par(cex.main = 0.6) heatmap( m[how_many_peptides:1, ], Rowv = NA, Colv = NA, cexRow = 0.7, cexCol = 0.8, scale = "row", margins = margins, main = "Unimputed, unnormalized intensities", xlab = "", las = 1 #, fin = c(9, 5.5) ) } ) } } } } cat("\\leavevmode\n\n\n") ```