Mercurial > repos > eschen42 > mqppep_anova
view mqppep_anova_script.Rmd @ 7:d728198f1ba5 draft
"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit 9a0fa6d0f9aadc069a5551a54da6daf307885637"
author | eschen42 |
---|---|
date | Tue, 15 Mar 2022 00:35:16 +0000 |
parents | c1403d18c189 |
children | 4deacfee76ef |
line wrap: on
line source
--- title: "MaxQuant Phospho-Proteomic Enrichment Pipeline ANOVA" author: "Larry Cheng; Art Eschenlauer" date: "May 28, 2018; Nov 16, 2021" output: pdf_document: default params: inputFile: "test-data/test_input_for_anova.tabular" alphaFile: "test-data/alpha_levels.tabular" firstDataColumn: "Intensity" imputationMethod: !r c("group-median", "median", "mean", "random")[1] meanPercentile: 1 sdPercentile: 0.2 regexSampleNames: "\\.(\\d+)[A-Z]$" regexSampleGrouping: "(\\d+)" imputedDataFilename: "Upstream_Map_pST_outputfile_STEP4_QN_LT.txt" --- ```{r setup, include = FALSE} # ref for parameterizing Rmd document: https://stackoverflow.com/a/37940285 knitr::opts_chunk$set(echo = FALSE, fig.dim = c(9, 10)) ### FUNCTIONS #ANOVA filter function anova_func <- function(x, grouping_factor) { x_aov <- aov(as.numeric(x) ~ grouping_factor) pvalue <- summary(x_aov)[[1]][["Pr(>F)"]][1] pvalue } ``` ## Purpose: Perform imputation of missing values, quantile normalization, and ANOVA. <!-- ## Variables to change for each input file --> ```{r include = FALSE} # Input Filename input_file <- params$inputFile # First data column - ideally, this could be detected via regexSampleNames, # but for now leave it as is. first_data_column <- params$firstDataColumn fdc_is_integer <- TRUE first_data_column <- withCallingHandlers( as.integer(first_data_column) , warning = function(w) fdc_is_integer <<- FALSE ) if (FALSE == fdc_is_integer) { first_data_column <- params$firstDataColumn } # False discovery rate adjustment for ANOVA # Since pY abundance is low, set to 0.10 and 0.20 in addition to 0.05 val_fdr <- read.table(file = params$alphaFile, sep = "\t", header = F, quote = "")[, 1] #Imputed Data filename imputed_data_filename <- params$imputedDataFilename #ANOVA data filename ``` ```{r echo = FALSE} # Imputation method, should be one of # "random", "group-median", "median", or "mean" imputation_method <- params$imputationMethod # Selection of percentile of logvalue data to set the mean for random number # generation when using random imputation mean_percentile <- params$meanPercentile / 100.0 # deviation adjustment-factor for random values; real number. sd_percentile <- params$sdPercentile # Regular expression of Sample Names, e.g., "\\.(\\d+)[A-Z]$" regex_sample_names <- params$regexSampleNames # Regular expression to extract Sample Grouping from Sample Name; # if error occurs, compare sample_factor_levels and temp_matches # to see if groupings/pairs line up # e.g., "(\\d+)" regex_sample_grouping <- params$regexSampleGrouping ``` ```{r echo = FALSE} ### READ DATA library(data.table) # read.table reads a file in table format and creates a data frame from it. # - note that `quote = ""` means that quotation marks are treated literally. full_data <- read.table( file = input_file, sep = "\t", header = T, quote = "", check.names = FALSE ) ``` ### Column names from input file ```{r echo = FALSE, results = 'markup'} print(colnames(full_data)) data_column_indices <- grep(first_data_column, names(full_data), perl = TRUE) cat(sprintf("First data column: %d\n", min(data_column_indices))) cat(sprintf("Last data column: %d\n", max(data_column_indices))) ``` ```{r echo = FALSE, results = 'asis'} cat("\\newpage\n") ``` ### Checking that log-transformed sample distributions are similar: ```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'} if (FALSE == fdc_is_integer) { if (length(data_column_indices) > 0) { first_data_column <- data_column_indices[1] } else { stop(paste("failed to convert firstDataColumn:", first_data_column)) } } quant_data0 <- full_data[first_data_column:length(full_data)] quant_data <- full_data[first_data_column:length(full_data)] quant_data[quant_data == 0] <- NA #replace 0 with NA quant_data_log <- log10(quant_data) rownames(quant_data_log) <- full_data$Phosphopeptide # data visualization old_par <- par( mai = par("mai") + c(0.5, 0, 0, 0) ) boxplot( quant_data_log , las = 2 ) par(old_par) cat("\\newline\n") cat("\\newline\n") ``` ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 4), warning = FALSE} quant_data_log_stack <- stack(quant_data_log) library(ggplot2) ggplot( quant_data_log_stack, aes(x = values)) + geom_density(aes(group = ind, colour = ind)) ``` ### Globally, are phosphopeptide intensities are approximately unimodal? <!-- # ref for bquote below particularly and plotting math expressions generally: # https://www.r-bloggers.com/2018/03/math-notation-for-r-plot-titles-expression-and-bquote/ --> ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 5)} # identify the location of missing values fin <- is.finite(as.numeric(as.matrix(quant_data_log))) logvalues <- as.numeric(as.matrix(quant_data_log))[fin] plot( density(logvalues), main = bquote( "Smoothed estimated probability density vs." ~ log[10](intensity)), xlab = bquote(log[10](intensity)) ) hist( x = as.numeric(as.matrix(quant_data_log)) , breaks = 100 , main = bquote("Frequency vs." ~ log[10](intensity)) , xlab = bquote(log[10](intensity)) ) ``` ### Distribution of standard deviations of phosphopeptides, ignoring missing values: ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 5)} # determine quantile q1 <- quantile(logvalues, probs = mean_percentile)[1] # determine standard deviation of quantile to impute sd_finite <- function(x) { ok <- is.finite(x) sd(x[ok]) * sd_percentile } # 1 = row of matrix (ie, phosphopeptide) sds <- apply(quant_data_log, 1, sd_finite) plot( density(sds, na.rm = T) , main = "Smoothed estimated probability density vs. std. deviation" , sub = "(probability estimation made with Gaussian smoothing)" ) m1 <- median(sds, na.rm = T) #sd to be used is the median sd ``` <!-- The number of missing values are: --> ```{r echo = FALSE} #Determine number of cells to impute temp <- quant_data[is.na(quant_data)] #Determine number of values to impute number_to_impute <- length(temp) ``` <!-- % of values that are missing: --> ```{r echo = FALSE} pct_missing_values <- length(temp) / (length(logvalues) + length(temp)) * 100 ``` <!-- First few rows of data before imputation: --> ```{r echo = FALSE, results = 'asis'} cat("\\newpage\n") ``` ## Parse sample names Parse the names of the samples to deduce the factor level for each sample: ```{r echo = FALSE} # prep for trt-median based imputation # Assuming that regex_sample_names <- "\\.(\\d+)[A-Z]$" # get factors -> # group runs (samples) by ignoring terminal [A-Z] in sample names m <- regexpr(regex_sample_names, names(quant_data), perl = TRUE) temp_matches <- regmatches(names(quant_data), m) print("Extracted sample names") print(temp_matches) m2 <- regexpr(regex_sample_grouping, temp_matches, perl = TRUE) sample_factor_levels <- as.factor(regmatches(temp_matches, m2)) print("Factor levels") print(sample_factor_levels) ``` ## Impute missing values ```{r echo = FALSE} #Determine number of cells to impute cat("Before imputation,", sprintf( "there are:\n %d peptides\n %d missing values (%2.0f%s)", sum(rep.int(TRUE, nrow(quant_data))), sum(is.na(quant_data)), pct_missing_values, "%" ) ) ``` ```{r echo = FALSE} #Impute data quant_data_imp <- quant_data # Identify which values are missing and need to be imputed ind <- which(is.na(quant_data_imp), arr.ind = TRUE) ``` ```{r echo = FALSE} # Apply imputation switch( imputation_method , "group-median" = { cat("Imputation method:\n substitute missing value", "with median peptide-intensity for sample-group\n") sample_level_integers <- as.integer(sample_factor_levels) for (i in seq_len(length(levels(sample_factor_levels)))) { level_cols <- i == sample_level_integers ind <- which(is.na(quant_data_imp[, level_cols]), arr.ind = TRUE) quant_data_imp[ind, level_cols] <- apply(quant_data_imp[, level_cols], 1, median, na.rm = T)[ind[, 1]] } good_rows <- !is.na(rowMeans(quant_data_imp)) } , "median" = { cat("Imputation method:\n substitute missing value with", "median peptide-intensity across all sample classes\n") quant_data_imp[ind] <- apply(quant_data_imp, 1, median, na.rm = T)[ind[, 1]] good_rows <- !is.na(rowMeans(quant_data_imp)) } , "mean" = { cat("Imputation method:\n substitute missing value with", "mean peptide-intensity across all sample classes\n") quant_data_imp[ind] <- apply(quant_data_imp, 1, mean, na.rm = T)[ind[, 1]] good_rows <- !is.na(rowMeans(quant_data_imp)) } , "random" = { cat( "Imputation method:\n substitute missing value with\n ", sprintf( "random intensity N ~ (%0.2f, %0.2f)\n" , q1, m1 ) ) quant_data_imp[is.na(quant_data_imp)] <- 10 ^ rnorm(number_to_impute, mean = q1, sd = m1) good_rows <- !is.na(rowMeans(quant_data_imp)) } ) ``` ```{r echo = FALSE} #Determine number of cells to impute temp <- quant_data_imp[is.na(quant_data_imp)] cat("After imputation, there are:", sprintf( "\n %d missing values\n %d usable peptides analysis" , sum(is.na(quant_data_imp[good_rows, ])) , sum(good_rows) ), sprintf( "\n %d peptides with too many missing values for further analysis" , sum(!good_rows) ) ) ``` ```{r echo = FALSE} # Zap rows where imputation was ineffective full_data <- full_data [good_rows, ] quant_data <- quant_data [good_rows, ] quant_data_imp <- quant_data_imp[good_rows, ] ``` ```{r echo = FALSE} d_combined <- (density(as.numeric(as.matrix( log10(quant_data_imp) )))) d_original <- density(as.numeric(as.matrix( log10(quant_data_imp[!is.na(quant_data)])))) ``` ```{r echo = FALSE} if (sum(is.na(quant_data)) > 0) { # There ARE missing values d_imputed <- (density(as.numeric(as.matrix( log10(quant_data_imp[is.na(quant_data)]) )))) } else { # There are NO missing values d_imputed <- d_combined } ``` ```{r echo = FALSE, fig.dim = c(9, 5)} ylim <- c(0, max(d_combined$y, d_original$y, d_imputed$y)) plot( d_combined, ylim = ylim, sub = "Blue = data before imputation; Red = imputed data", main = "Density vs. log10(intensity) before and after imputation" ) lines(d_original, col = "blue") lines(d_imputed, col = "red") ``` ## Perform Quantile Normalization <!-- # Apply quantile normalization using preprocessCore::normalize.quantiles # --- # tool repository: http://bioconductor.org/packages/release/bioc/html/preprocessCore.html # except this: https://support.bioconductor.org/p/122925/#9135989 # says to install it like this: # ``` # BiocManager::install("preprocessCore", configure.args="--disable-threading", force = TRUE, lib=.libPaths()[1]) # ``` # conda installation (necessary because of a bug in recent openblas): # conda install bioconductor-preprocesscore openblas=0.3.3 # ... # --- # normalize.quantiles {preprocessCore} -- Quantile Normalization # # Description: # Using a normalization based upon quantiles, this function normalizes a matrix of probe level intensities. # # Usage: # normalize.quantiles(x, copy = TRUE, keep.names = FALSE) # # Arguments: # # - x: A matrix of intensities where each column corresponds to a chip and each row is a probe. # # - copy: Make a copy of matrix before normalizing. Usually safer to work with a copy, # but in certain situations not making a copy of the matrix, but instead normalizing # it in place will be more memory friendly. # # - keep.names: Boolean option to preserve matrix row and column names in output. # # Details: # This method is based upon the concept of a quantile-quantile plot extended to n dimensions. # No special allowances are made for outliers. If you make use of quantile normalization # please cite Bolstad et al, Bioinformatics (2003). # # This functions will handle missing data (ie NA values), based on # the assumption that the data is missing at random. # # Note that the current implementation optimizes for better memory usage # at the cost of some additional run-time. # # Value: A normalized matrix. # # Author: Ben Bolstad, bmbolstad.com # # References # # - Bolstad, B (2001) Probe Level Quantile Normalization of High Density Oligonucleotide # Array Data. Unpublished manuscript http://bmbolstad.com/stuff/qnorm.pdf # # - Bolstad, B. M., Irizarry R. A., Astrand, M, and Speed, T. P. (2003) A Comparison of # Normalization Methods for High Density Oligonucleotide Array Data Based on Bias # and Variance. Bioinformatics 19(2), pp 185-193. DOI 10.1093/bioinformatics/19.2.185 # http://bmbolstad.com/misc/normalize/normalize.html # ... --> ```{r echo = FALSE} library(preprocessCore) if (TRUE) { quant_data_imp_qn <- normalize.quantiles(as.matrix(quant_data_imp)) } else { quant_data_imp_qn <- as.matrix(quant_data_imp) } quant_data_imp_qn <- as.data.frame(quant_data_imp_qn) names(quant_data_imp_qn) <- names(quant_data_imp) quant_data_imp_qn_log <- log10(quant_data_imp_qn) rownames(quant_data_imp_qn_log) <- full_data[, 1] quant_data_imp_qn_ls <- t(scale(t(log10(quant_data_imp_qn)))) any_nan <- function(x) { !any(x == "NaN") } sel <- apply(quant_data_imp_qn_ls, 1, any_nan) quant_data_imp_qn_ls2 <- quant_data_imp_qn_ls[which(sel), ] quant_data_imp_qn_ls2 <- as.data.frame(quant_data_imp_qn_ls2) #output quantile normalized data data_table_imp_qn_lt <- cbind(full_data[1:9], quant_data_imp_qn_log) write.table( data_table_imp_qn_lt, file = paste(paste( strsplit(imputed_data_filename, ".txt"), "QN_LT", sep = "_" ), ".txt", sep = ""), sep = "\t", col.names = TRUE, row.names = FALSE ) ``` <!-- ACE insertion begin --> ### Checking that normalized, imputed, log-transformed sample distributions are similar: ```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'} # Save unimputed quant_data_log for plotting below unimputed_quant_data_log <- quant_data_log # log10 transform (after preparing for zero values, # which should never happen...) quant_data_imp_qn[quant_data_imp_qn == 0] <- .000000001 quant_data_log <- log10(quant_data_imp_qn) # Output quantile-normalized log-transformed dataset # with imputed, normalized data data_table_imputed <- cbind(full_data[1:9], quant_data_log) write.table( data_table_imputed , file = imputed_data_filename , sep = "\t" , col.names = TRUE , row.names = FALSE , quote = FALSE ) # data visualization old_par <- par( mai = par("mai") + c(0.5, 0, 0, 0) , oma = par("oma") + c(0.5, 0, 0, 0) ) boxplot( quant_data_log , las = 2 ) par(old_par) cat("\\newline\n") cat("\\newline\n") ``` ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 4)} quant_data_log_stack <- stack(quant_data_log) ggplot( quant_data_log_stack, aes(x = values) ) + geom_density(aes(group = ind, colour = ind)) ``` ## Perform ANOVA filters (see following pages) ```{r, echo = FALSE} # Make new data frame containing only Phosphopeptides # to connect preANOVA to ANOVA (connect_df) connect_df <- data.frame( data_table_imp_qn_lt$Phosphopeptide , data_table_imp_qn_lt[, first_data_column] ) colnames(connect_df) <- c("Phosphopeptide", "Intensity") ``` ```{r echo = FALSE, fig.dim = c(9, 10), results = 'asis'} # Get factors -> group replicates (as indicated by terminal letter) # by the preceding digits; # e.g., group .1A .1B .1C into group 1; .2A .2B .2C, into group 2; etc.. m <- regexpr(regex_sample_names, names(quant_data_imp_qn_log), perl = TRUE) temp_matches <- regmatches(names(quant_data_imp_qn_log), m) number_of_samples <- length(temp_matches) m2 <- regexpr(regex_sample_grouping, temp_matches, perl = TRUE) sample_factor_levels <- as.factor(regmatches(temp_matches, m2)) if (length(levels(sample_factor_levels)) < 2) { cat( "ERROR!!!! Cannot perform ANOVA analysis", "because it requires two or more factor levels\n" ) cat("Unparsed sample names are:\n") print(names(quant_data_imp_qn_log)) cat(sprintf("Parsing rule for SampleNames is '%s'\n", regex_sample_names)) cat("Parsed names are:\n") print(temp_matches) cat(sprintf( "Parsing rule for SampleGrouping is '%s'\n", regex_sample_grouping )) cat("Sample group assignments are:\n") print(regmatches(temp_matches, m2)) } else { p_value_data_anova_ps <- apply( quant_data_imp_qn_log, 1, anova_func, grouping_factor = sample_factor_levels ) p_value_data_anova_ps_fdr <- p.adjust(p_value_data_anova_ps, method = "fdr") p_value_data <- data.frame( phosphopeptide = full_data[, 1] , raw_anova_p = p_value_data_anova_ps , fdr_adjusted_anova_p = p_value_data_anova_ps_fdr ) # output ANOVA file to constructed filename, # e.g. "Outputfile_pST_ANOVA_STEP5.txt" # becomes "Outpufile_pST_ANOVA_STEP5_FDR0.05.txt" # Re-output quantile-normalized log-transformed dataset # with imputed, normalized data to include p-values data_table_imputed <- cbind(full_data[1:9], p_value_data[, 2:3], quant_data_log) write.table( data_table_imputed, file = imputed_data_filename, sep = "\t", col.names = TRUE, row.names = FALSE, quote = FALSE ) p_value_data <- p_value_data[order(p_value_data$fdr_adjusted_anova_p), ] cutoff <- val_fdr[1] for (cutoff in val_fdr) { #loop through FDR cutoffs filtered_p <- p_value_data[ which(p_value_data$fdr_adjusted_anova_p < cutoff), , drop = FALSE ] filtered_data_filtered <- quant_data_imp_qn_log[ rownames(filtered_p), , drop = FALSE ] filtered_data_filtered <- filtered_data_filtered[ order(filtered_p$fdr_adjusted_anova_p), , drop = FALSE ] # <!-- ACE insertion start --> old_oma <- par("oma") old_par <- par( mai = (par("mai") + c(0.7, 0, 0, 0)) * c(1, 1, 0.3, 1), oma = old_oma * c(1, 1, 0.3, 1), cex.main = 0.9, cex.axis = 0.7 ) cat("\\newpage\n") if (nrow(filtered_data_filtered) > 0) { cat(sprintf( "Intensities for peptides whose adjusted p-value < %0.2f\n", cutoff )) cat("\\newline\n") cat("\\newline\n") boxplot( filtered_data_filtered, main = "Imputed, normalized intensities", # no line plot las = 2, ylab = expression(log[10](intensity)) ) } else { cat(sprintf( "No peptides were found to have cutoff adjusted p-value < %0.2f\n", cutoff )) } par(old_par) if (nrow(filtered_data_filtered) > 0) { #Add Phosphopeptide column to anova_filtered table anova_filtered_merge <- merge( x = connect_df , y = filtered_data_filtered , by.x = "Intensity" , by.y = 1 ) anova_filtered_merge_order <- rownames(filtered_p) anova_filtered_merge_format <- sapply( X = filtered_p$fdr_adjusted_anova_p , FUN = function(x) { if (x > 0.0001) paste0("(%0.", 1 + ceiling(-log10(x)), "f) %s") else paste0("(%0.4e) %s") } ) anova_filtered <- data.table( anova_filtered_merge$Phosphopeptide , anova_filtered_merge$Intensity , anova_filtered_merge[, 2:number_of_samples + 1] ) colnames(anova_filtered) <- c("Phosphopeptide", colnames(filtered_data_filtered)) # merge qualitative columns into the ANOVA data output_table <- data.frame(anova_filtered$Phosphopeptide) output_table <- merge( x = output_table , y = data_table_imp_qn_lt , by.x = "anova_filtered.Phosphopeptide" , by.y = "Phosphopeptide" ) #Produce heatmap to visualize significance and the effect of imputation m <- as.matrix(unimputed_quant_data_log[anova_filtered_merge_order, ]) if (nrow(m) > 0) { rownames_m <- rownames(m) rownames(m) <- sapply( X = seq_len(nrow(m)) , FUN = function(i) { sprintf( anova_filtered_merge_format[i] , filtered_p$fdr_adjusted_anova_p[i] , rownames_m[i] ) } ) margins <- c(max(nchar(colnames(m))) * 10 / 16 # col , max(nchar(rownames(m))) * 5 / 16 # row ) how_many_peptides <- min(50, nrow(m)) cat("\\newpage\n") if (nrow(m) > 50) { cat("Heatmap for the 50 most-significant peptides", sprintf( "whose adjusted p-value < %0.2f\n", cutoff) ) } else { cat("Heatmap for peptides whose", sprintf("adjusted p-value < %0.2f\n", cutoff) ) } cat("\\newline\n") cat("\\newline\n") op <- par("cex.main") try( if (nrow(m) > 1) { par(cex.main = 0.6) heatmap( m[how_many_peptides:1, ], Rowv = NA, Colv = NA, cexRow = 0.7, cexCol = 0.8, scale = "row", margins = margins, main = "Heatmap of unimputed, unnormalized intensities", xlab = "" ) } ) par(op) } } } } ``` <!-- ## Peptide IDs, etc. See output files. -->