Mercurial > repos > eschen42 > mqppep_anova
view mqppep_anova_script.Rmd @ 6:922d309640db draft
"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit 9dfb7e07a3673d7de4b0a1b7e6ce1b75a8a4f42b"
author | eschen42 |
---|---|
date | Fri, 11 Mar 2022 20:04:05 +0000 |
parents | c1403d18c189 |
children | d728198f1ba5 |
line wrap: on
line source
--- title: "Quant Data Processing Script" author: "Larry Cheng; Art Eschenlauer" date: "May 28, 2018; Nov 16, 2021" output: html_document: default pdf_document: default params: inputFile: "Upstream_Map_pST_outputfile_STEP4.txt" alphaFile: "alpha_levels.txt" firstDataColumn: "Intensity" imputationMethod: !r c("group-median","median","mean","random")[4] meanPercentile: 1 sdPercentile: 0.2 regexSampleNames: "\\.(\\d+)[A-Z]$" regexSampleGrouping: "(\\d+)" imputedDataFilename: "Upstream_Map_pST_outputfile_STEP4_QN_LT.txt" --- ```{r setup, include=FALSE} # ref for parameterizing Rmd document: https://stackoverflow.com/a/37940285 knitr::opts_chunk$set(echo = FALSE, fig.dim=c(9,10)) ``` ## Purpose: Perform imputation of missing values, quantile normalization, and ANOVA. <!-- ## Variables to change for each input file --> ```{r include = FALSE} #Input Filename inputFile <- params$inputFile #First data column - ideally, this could be detected via regexSampleNames, but for now leave it as is. firstDataColumn <- params$firstDataColumn FDC_is_integer <- TRUE firstDataColumn <- withCallingHandlers( as.integer(firstDataColumn) , warning = function(w) FDC_is_integer <<- FALSE ) if (FALSE == FDC_is_integer) { firstDataColumn <- params$firstDataColumn } #False discovery rate adjustment for ANOVA (Since pY abundance is low, set to 0.10 and 0.20 in addition to 0.05) valFDR <- read.table(file = params$alphaFile, sep = "\t", header=F, quote="")[,1] #Imputed Data filename imputedDataFilename <- params$imputedDataFilename #ANOVA data filename ``` ```{r include = FALSE} #Imputation method, should be one of c("random","group-median","median","mean") imputationMethod <- params$imputationMethod #Selection of percentile of logvalue data to set the mean for random number generation when using random imputation meanPercentile <- params$meanPercentile / 100.0 #deviation adjustment-factor for random values; real number. sdPercentile <- params$sdPercentile #Regular expression of Sample Names, e.g., "\\.(\\d+)[A-Z]$" regexSampleNames <- params$regexSampleNames #Regular expression to extract Sample Grouping from Sample Name (if error occurs, compare sampleNumbers and tempMatches to see if groupings/pairs line up) # e.g., "(\\d+)" regexSampleGrouping <- params$regexSampleGrouping ``` ```{r include = FALSE} ### FUNCTIONS #ANOVA filter function anovaFunc <- function(x, groupingFactor) { x.aov = aov(as.numeric(x) ~ groupingFactor) pvalue = summary(x.aov)[[1]][["Pr(>F)"]][1] pvalue } ``` ### Checking that log-transformed sample distributions are similar: ```{r echo=FALSE} library(data.table) # read.table reads a file in table format and creates a data frame from it. # - note that `quote=""` means that quotation marks are treated literally. fullData <- read.table(file = inputFile, sep = "\t", header=T, quote="", check.names=FALSE) print(colnames(fullData)) #head(fullData) if (FALSE == FDC_is_integer) { dataColumnIndices <- grep(firstDataColumn, names(fullData), perl=TRUE) str(dataColumnIndices) if (length(dataColumnIndices) > 0) { firstDataColumn <- dataColumnIndices[1] } else { stop(paste("failed to convert firstDataColumn:", firstDataColumn)) } } quantData0 <- fullData[firstDataColumn:length(fullData)] quantData <- fullData[firstDataColumn:length(fullData)] quantData[quantData==0] <- NA #replace 0 with NA quantDataLog <- log10(quantData) rownames(quantDataLog) <- fullData$Phosphopeptide summary(quantDataLog) #data visualization old_par <- par( mai=par("mai") + c(0.5,0,0,0) ) boxplot( quantDataLog , las=2 ) par(old_par) quantDataLog_stack <- stack(quantDataLog) ``` ```{r echo = FALSE, fig.align="left", fig.dim=c(9,5)} library(ggplot2) ggplot(quantDataLog_stack, aes(x=values)) + geom_density(aes(group=ind, colour=ind)) ``` ### Globally, are phosphopeptide intensities are approximately unimodal? ```{r echo = FALSE,fig.align="left", fig.dim=c(9,5)} # ref for bquote particularly and plotting math expressions generally: # https://www.r-bloggers.com/2018/03/math-notation-for-r-plot-titles-expression-and-bquote/ #identify the location of missing values fin <- is.finite(as.numeric(as.matrix(quantDataLog))) logvalues <- as.numeric(as.matrix(quantDataLog))[fin] plot( density(logvalues) , main = bquote("Smoothed estimated probability density vs." ~ log[10](intensity)) , xlab = bquote(log[10](intensity)) ) hist( x = as.numeric(as.matrix(quantDataLog)) , breaks = 100 , main = bquote("Frequency vs." ~ log[10](intensity)) , xlab = bquote(log[10](intensity)) ) ``` <!-- ## Impute missing values --> ### Distribution of standard deviations of phosphopeptides, ignoring missing values: ```{r echo = FALSE, fig.align="left", fig.dim=c(9,5)} #determine quantile q1 <- quantile(logvalues, probs = meanPercentile)[1] #determine standard deviation of quantile to impute sd_finite <- function(x) { ok <- is.finite(x) sd(x[ok]) * sdPercentile } sds <- apply(quantDataLog, 1, sd_finite) # 1 = row of matrix (ie, phosphopeptide) plot( density(sds, na.rm=T) , main="Smoothed estimated probability density vs. std. deviation" , sub="(probability estimation made with Gaussian smoothing)" ) m1 <- median(sds, na.rm=T) #sd to be used is the median sd ``` <!-- The number of missing values are: --> ```{r echo=FALSE} #Determine number of cells to impute temp <- quantData[is.na(quantData)] #Determine number of values to impute NoToImpute <- length(temp) ``` <!-- % of values that are missing: --> ```{r echo=FALSE} pct_missing_values <- length(temp)/(length(logvalues)+length(temp)) * 100 ``` <!-- First few rows of data before imputation: --> ## Impute missing values ```{r echo = FALSE} #ACE start segment: trt-median based imputation # prep for trt-median based imputation # Assuming that regexSampleNames <- "\\.(\\d+)[A-Z]$" # get factors -> group runs (samples) by ignoring terminal [A-Z] in sample names # regexpr(pattern, text, ignore.case = FALSE, perl = FALSE, fixed = FALSE, useBytes = FALSE) m <- regexpr(regexSampleNames, names(quantData), perl=TRUE) tempMatches <- regmatches(names(quantData), m) print("Extracted sample names") print(tempMatches) m2 <- regexpr(regexSampleGrouping, tempMatches, perl=TRUE) sampleNumbers <- as.factor(regmatches(tempMatches, m2)) print("Factor levels") print(sampleNumbers) ``` ```{r echo = FALSE} #ACE hack begin #Determine number of cells to impute cat( sprintf("Before imputation, there are:\n %d peptides\n %d missing values (%2.0f%s)" , sum(rep.int(TRUE, nrow(quantData))) , sum(is.na(quantData)) , pct_missing_values , "%" ) ) #ACE hack end ``` ```{r echo = FALSE} #Impute data quantDataImputed <- quantData # Identify which values are missing and need to be imputed ind <- which(is.na(quantDataImputed), arr.ind=TRUE) ``` ```{r echo = FALSE} # Apply imputation switch( imputationMethod , "group-median"={ cat("Imputation method: substitute missing value with median peptide-intensity for sample-group\n") #goodRows <- rep.int(TRUE, nrow(quantDataImputed)) sampleLevelIntegers <- as.integer(sampleNumbers) for (i in 1:length(levels(sampleNumbers))) { levelCols <- i == sampleLevelIntegers ind <- which(is.na(quantDataImputed[,levelCols]), arr.ind=TRUE) quantDataImputed[ind,levelCols] <- apply(quantDataImputed[,levelCols], 1, median, na.rm=T)[ind[,1]] } goodRows <- !is.na(rowMeans(quantDataImputed)) } , "median"={ cat("Imputation method: substitute missing value with median peptide-intensity across all sample classes\n") quantDataImputed[ind] <- apply(quantDataImputed, 1, median, na.rm=T)[ind[,1]] goodRows <- !is.na(rowMeans(quantDataImputed)) } , "mean"={ cat("Imputation method: substitute missing value with mean peptide-intensity across all sample classes\n") quantDataImputed[ind] <- apply(quantDataImputed, 1, mean, na.rm=T)[ind[,1]] goodRows <- !is.na(rowMeans(quantDataImputed)) } , "random"={ cat( sprintf( "Imputation method: substitute missing value with random intensity N ~ (%0.2f, %0.2f)\n" , q1, m1 ) ) quantDataImputed[is.na(quantDataImputed)] <- 10^rnorm(NoToImpute, mean= q1, sd = m1) goodRows <- !is.na(rowMeans(quantDataImputed)) } ) ``` ```{r echo = FALSE} #Determine number of cells to impute temp <- quantDataImputed[is.na(quantDataImputed)] cat( sprintf( "After imputation, there are:\n %d missing values\n %d usable peptides\n %d peptides with too many missing values for further analysis" , sum(is.na(quantDataImputed[goodRows,])) , sum(goodRows) , sum(!goodRows) ) ) ``` ```{r echo = FALSE} # Zap rows where imputation was ineffective fullData <- fullData [goodRows, ] quantData <- quantData [goodRows, ] quantDataImputed <- quantDataImputed[goodRows, ] ``` ```{r echo = FALSE} d_combined <- (density(as.numeric(as.matrix(log10(quantDataImputed))))) d_original <- density(as.numeric(as.matrix(log10(quantDataImputed[!is.na(quantData)])))) ``` ```{r echo = FALSE} if (sum(is.na(quantData)) > 0) { # There ARE missing values d_imputed <- (density(as.numeric(as.matrix(log10(quantDataImputed[is.na(quantData)]))))) } else { # There are NO missing values d_imputed <- d_combined } ``` <!-- ```{r echo = FALSE, fig.cap = "Blue = Data before imputation; Red = Imputed data"} --> ```{r echo = FALSE, fig.dim=c(9,5)} ylim <- c(0, max(d_combined$y, d_original$y, d_imputed$y)) plot( d_combined , ylim = ylim , sub = "Blue = data before imputation; Red = imputed data" , main = "Density vs. log10(intensity) before and after imputation" ) lines(d_original, col="blue") lines(d_imputed, col="red") ``` ## Perform Quantile Normalization ```{r echo=FALSE} library(preprocessCore) # Apply quantile normalization using preprocessCore::normalize.quantiles # --- # tool repository: http://bioconductor.org/packages/release/bioc/html/preprocessCore.html # except this: https://support.bioconductor.org/p/122925/#9135989 # says to install it like this: # ``` # BiocManager::install("preprocessCore", configure.args="--disable-threading", force = TRUE,lib=.libPaths()[1]) # ``` # conda installation (necessary because of a bug in recent openblas): # conda install bioconductor-preprocesscore openblas=0.3.3 # ... # --- # normalize.quantiles {preprocessCore} -- Quantile Normalization # # Description: # Using a normalization based upon quantiles, this function normalizes a matrix of probe level intensities. # # Usage: # normalize.quantiles(x,copy=TRUE, keep.names=FALSE) # # Arguments: # # - x: A matrix of intensities where each column corresponds to a chip and each row is a probe. # # - copy: Make a copy of matrix before normalizing. Usually safer to work with a copy, # but in certain situations not making a copy of the matrix, but instead normalizing # it in place will be more memory friendly. # # - keep.names: Boolean option to preserve matrix row and column names in output. # # Details: # This method is based upon the concept of a quantile-quantile plot extended to n dimensions. # No special allowances are made for outliers. If you make use of quantile normalization # please cite Bolstad et al, Bioinformatics (2003). # # This functions will handle missing data (ie NA values), based on # the assumption that the data is missing at random. # # Note that the current implementation optimizes for better memory usage # at the cost of some additional run-time. # # Value: A normalized matrix. # # Author: Ben Bolstad, bmbolstad.com # # References # # - Bolstad, B (2001) Probe Level Quantile Normalization of High Density Oligonucleotide # Array Data. Unpublished manuscript http://bmbolstad.com/stuff/qnorm.pdf # # - Bolstad, B. M., Irizarry R. A., Astrand, M, and Speed, T. P. (2003) A Comparison of # Normalization Methods for High Density Oligonucleotide Array Data Based on Bias # and Variance. Bioinformatics 19(2), pp 185-193. DOI 10.1093/bioinformatics/19.2.185 # http://bmbolstad.com/misc/normalize/normalize.html # ... if (TRUE) { quantDataImputed.qn <- normalize.quantiles(as.matrix(quantDataImputed)) } else { quantDataImputed.qn <- as.matrix(quantDataImputed) } quantDataImputed.qn = as.data.frame(quantDataImputed.qn) names(quantDataImputed.qn) = names(quantDataImputed) quantDataImputed_QN_log <- log10(quantDataImputed.qn) rownames(quantDataImputed_QN_log) <- fullData[,1] quantDataImputed.qn.LS = t(scale(t(log10(quantDataImputed.qn)))) anyNaN <- function (x) { !any(x == "NaN") } sel = apply(quantDataImputed.qn.LS, 1, anyNaN) quantDataImputed.qn.LS2 <- quantDataImputed.qn.LS[which(sel),] quantDataImputed.qn.LS2 = as.data.frame(quantDataImputed.qn.LS2) #output quantile normalized data dataTableImputed_QN_LT <- cbind(fullData[1:9], quantDataImputed_QN_log) write.table(dataTableImputed_QN_LT, file = paste(paste(strsplit(imputedDataFilename, ".txt"),"QN_LT",sep="_"),".txt",sep=""), sep = "\t", col.names=TRUE, row.names=FALSE) ``` <!-- ACE insertion begin --> ### Checking that normalized, imputed, log-transformed sample distributions are similar: ```{r echo=FALSE} #library(data.table) #Save unimputed quantDataLog for plotting below unimputedQuantDataLog <- quantDataLog #Log10 transform (after preparing for zero values, which should never happen...) quantDataImputed.qn[quantDataImputed.qn == 0] <- .000000001 quantDataLog <- log10(quantDataImputed.qn) summary(quantDataLog) #Output quantile-normalized log-transformed dataset with imputed, normalized data dataTableImputed <- cbind(fullData[1:9], quantDataLog) write.table( dataTableImputed , file=imputedDataFilename , sep="\t" , col.names=TRUE , row.names=FALSE , quote=FALSE ) #data visualization old_par <- par( mai=par("mai") + c(0.5,0,0,0) , oma=par("oma") + c(0.5,0,0,0) ) boxplot( quantDataLog , las=2 ) par(old_par) ``` ```{r echo=FALSE, fig.dim=c(9,5)} quantDataLog_stack <- stack(quantDataLog) ggplot(quantDataLog_stack, aes(x=values)) + geom_density(aes(group=ind, colour=ind)) ``` ## Perform ANOVA filters ```{r,echo=FALSE} #Make new data frame containing only Phosphopeptides to connect preANOVA to ANOVA (connect_df) connect_df <- data.frame( dataTableImputed_QN_LT$Phosphopeptide , dataTableImputed_QN_LT[,firstDataColumn] ) colnames(connect_df) <- c("Phosphopeptide","Intensity") ``` ```{r echo=FALSE, fig.dim=c(9,10)} # Get factors -> group replicates (as indicated by terminal letter) by the preceding digits # For example, group .1A .1B .1C into group 1; .2A .2B .2C, into group 2; etc.. m <- regexpr(regexSampleNames, names(quantDataImputed_QN_log), perl=TRUE) #ACE str(m) tempMatches <- regmatches(names(quantDataImputed_QN_log), m) #ACE str(tempMatches) numSamples <- length(tempMatches) #ACE str(numSamples) m2 <- regexpr(regexSampleGrouping, tempMatches, perl=TRUE) #ACE str(m2) #ACE str(regmatches(tempMatches, m2)) sampleNumbers <- as.factor(regmatches(tempMatches, m2)) #ACE str(sampleNumbers) if (length(levels(sampleNumbers))<2) { cat("ERROR!!!! Cannot perform ANOVA analysis because it requires two or more factor levels\n") cat("Unparsed sample names are:\n") print(names(quantDataImputed_QN_log)) cat(sprintf("Parsing rule for SampleNames is '%s'\n", regexSampleNames)) cat("Parsed names are:\n") print(tempMatches) cat(sprintf("Parsing rule for SampleGrouping is '%s'\n", regexSampleGrouping)) cat("Sample group assignments are:\n") print(regmatches(tempMatches, m2)) } else { pValueData.anovaPs <- apply(quantDataImputed_QN_log, 1, anovaFunc, groupingFactor=sampleNumbers) pValueData.anovaPs.FDR <- p.adjust(pValueData.anovaPs, method="fdr") pValueData <- data.frame( phosphopeptide = fullData[,1] , rawANOVAp = pValueData.anovaPs , FDRadjustedANOVAp = pValueData.anovaPs.FDR ) #ACE rownames(pValueData) <- fullData[,1] # output ANOVA file to constructed filename, # e.g. "Outputfile_pST_ANOVA_STEP5.txt" # becomes "Outpufile_pST_ANOVA_STEP5_FDR0.05.txt" #Re-output quantile-normalized log-transformed dataset with imputed, normalized data to include p-values dataTableImputed <- cbind(fullData[1:9], pValueData[,2:3], quantDataLog) write.table( dataTableImputed , file=imputedDataFilename , sep="\t" , col.names=TRUE , row.names=FALSE , quote=FALSE ) pValueData <- pValueData[order(pValueData$FDRadjustedANOVAp),] cutoff <- valFDR[1] for (cutoff in valFDR){ #loop through FDR cutoffs filtered_p <- pValueData[which(pValueData$FDRadjustedANOVAp < cutoff),, drop = FALSE] filteredData.filtered <- quantDataImputed_QN_log[rownames(filtered_p),, drop = FALSE] filteredData.filtered <- filteredData.filtered[order(filtered_p$FDRadjustedANOVAp),, drop = FALSE] # <!-- ACE insertion start --> old_oma <- par("oma") old_par <- par( mai=(par("mai") + c(0.7,0,0,0)) * c(1,1,0.3,1) , oma=old_oma * c(1,1,0.3,1) , cex.main=0.9 , cex.axis=0.7 ) if (nrow(filteredData.filtered) > 0) { boxplot( filteredData.filtered , main = sprintf("Imputed, normalized intensities where adjusted p-value < %0.2f", cutoff) # no line plot , main = "" , las = 2 # , ylim = c(5.5,10) , ylab = expression(log[10](intensity)) ) } else { cat(sprintf("No peptides were found to have cutoff adjusted p-value < %0.2f\n", cutoff)) } par(old_par) #Add Phosphopeptide column to ANOVA filtered table ANOVA.filtered_merge <- merge( x = connect_df , y = filteredData.filtered , by.x="Intensity" , by.y=1 ) ANOVA.filtered_merge.order <- rownames(filtered_p) ANOVA.filtered_merge.format <- sapply( X = filtered_p$FDRadjustedANOVAp , FUN = function(x) { if (x > 0.0001) paste0("(%0.",1+ceiling(-log10(x)),"f) %s") else paste0("(%0.4e) %s") } ) #ANOVA.filtered_merge.format <- paste0("(%0.",1+ceiling(-log10(filtered_p$FDRadjustedANOVAp)),"f) %s") ANOVA.filtered <- data.table( ANOVA.filtered_merge$Phosphopeptide , ANOVA.filtered_merge$Intensity , ANOVA.filtered_merge[, 2:numSamples+1] ) colnames(ANOVA.filtered) <- c("Phosphopeptide", colnames(filteredData.filtered)) # merge qualitative columns into the ANOVA data output_table <- data.frame(ANOVA.filtered$Phosphopeptide) output_table <- merge( x = output_table , y = dataTableImputed_QN_LT , by.x = "ANOVA.filtered.Phosphopeptide" , by.y="Phosphopeptide" ) #Produce heatmap to visualize significance and the effect of imputation m <- as.matrix(unimputedQuantDataLog[ANOVA.filtered_merge.order,]) if (nrow(m) > 0) { rownames_m <- rownames(m) rownames(m) <- sapply( X = 1:nrow(m) , FUN = function(i) { sprintf( ANOVA.filtered_merge.format[i] , filtered_p$FDRadjustedANOVAp[i] , rownames_m[i] ) } ) margins <- c( max(nchar(colnames(m))) * 10 / 16 # col , max(nchar(rownames(m))) * 5 / 16 # row ) how_many_peptides <- min(50, nrow(m)) op <- par("cex.main") try( if (nrow(m) > 1) { par(cex.main=0.6) heatmap( m[how_many_peptides:1,] , Rowv = NA , Colv = NA , cexRow = 0.7 , cexCol = 0.8 , scale="row" , margins = margins , main = "Heatmap of unimputed, unnormalized intensities" , xlab = "" # , main = bquote( # .( how_many_peptides ) # ~ " peptides with adjusted p-value <" # ~ .(sprintf("%0.2f", cutoff)) # ) ) } ) #ACE fig_dim knitr::opts_chunk$set(fig.dim = fig_dim) par(op) } } } ``` ## Peptide IDs, etc. See output files.