view mqppep_anova_script.Rmd @ 6:922d309640db draft

"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit 9dfb7e07a3673d7de4b0a1b7e6ce1b75a8a4f42b"
author eschen42
date Fri, 11 Mar 2022 20:04:05 +0000
parents c1403d18c189
children d728198f1ba5
line wrap: on
line source

---
title: "Quant Data Processing Script"
author: "Larry Cheng; Art Eschenlauer"
date: "May 28, 2018; Nov 16, 2021"
output:
  html_document: default
  pdf_document: default
params:
  inputFile: "Upstream_Map_pST_outputfile_STEP4.txt"
  alphaFile: "alpha_levels.txt"
  firstDataColumn: "Intensity"
  imputationMethod: !r c("group-median","median","mean","random")[4]
  meanPercentile: 1
  sdPercentile: 0.2
  regexSampleNames: "\\.(\\d+)[A-Z]$"
  regexSampleGrouping: "(\\d+)"
  imputedDataFilename: "Upstream_Map_pST_outputfile_STEP4_QN_LT.txt"
---
```{r setup, include=FALSE}
# ref for parameterizing Rmd document: https://stackoverflow.com/a/37940285
knitr::opts_chunk$set(echo = FALSE, fig.dim=c(9,10))
```

## Purpose:
Perform imputation of missing values, quantile normalization, and ANOVA.

<!--
## Variables to change for each input file
-->
```{r include = FALSE}
#Input Filename
inputFile <- params$inputFile

#First data column - ideally, this could be detected via regexSampleNames, but for now leave it as is.
firstDataColumn <- params$firstDataColumn
FDC_is_integer <- TRUE
firstDataColumn <- withCallingHandlers(
    as.integer(firstDataColumn)
  , warning = function(w) FDC_is_integer <<- FALSE
  )
if (FALSE == FDC_is_integer) {
  firstDataColumn <- params$firstDataColumn
}

#False discovery rate adjustment for ANOVA (Since pY abundance is low, set to 0.10 and 0.20 in addition to 0.05)
valFDR <- read.table(file = params$alphaFile, sep = "\t", header=F, quote="")[,1]

#Imputed Data filename
imputedDataFilename <- params$imputedDataFilename

#ANOVA data filename
```

```{r include = FALSE}
#Imputation method, should be one of c("random","group-median","median","mean")
imputationMethod <- params$imputationMethod

#Selection of percentile of logvalue data to set the mean for random number generation when using random imputation
meanPercentile <- params$meanPercentile / 100.0

#deviation adjustment-factor for random values; real number.
sdPercentile <- params$sdPercentile

#Regular expression of Sample Names, e.g., "\\.(\\d+)[A-Z]$"
regexSampleNames <- params$regexSampleNames

#Regular expression to extract Sample Grouping from Sample Name (if error occurs, compare sampleNumbers and tempMatches to see if groupings/pairs line up)
# e.g., "(\\d+)"
regexSampleGrouping <- params$regexSampleGrouping

```


```{r include = FALSE}
### FUNCTIONS

#ANOVA filter function
anovaFunc <- function(x, groupingFactor) {
  x.aov = aov(as.numeric(x) ~ groupingFactor)
  pvalue = summary(x.aov)[[1]][["Pr(>F)"]][1]
  pvalue
}
```



### Checking that log-transformed sample distributions are similar:
```{r echo=FALSE}

library(data.table)

# read.table reads a file in table format and creates a data frame from it.
#   - note that `quote=""` means that quotation marks are treated literally.
fullData <- read.table(file = inputFile, sep = "\t", header=T, quote="", check.names=FALSE)
print(colnames(fullData))
#head(fullData)

if (FALSE == FDC_is_integer) {
  dataColumnIndices <- grep(firstDataColumn, names(fullData), perl=TRUE)
  str(dataColumnIndices)
  if (length(dataColumnIndices) > 0) {
    firstDataColumn <- dataColumnIndices[1]
  } else {
    stop(paste("failed to convert firstDataColumn:", firstDataColumn))
  }
}
 
quantData0 <- fullData[firstDataColumn:length(fullData)]
quantData <- fullData[firstDataColumn:length(fullData)]
quantData[quantData==0] <- NA  #replace 0 with NA
quantDataLog <- log10(quantData)

rownames(quantDataLog) <- fullData$Phosphopeptide

summary(quantDataLog)

#data visualization
old_par <- par(
  mai=par("mai") + c(0.5,0,0,0)
)
boxplot(
  quantDataLog
, las=2
)
par(old_par)

quantDataLog_stack <- stack(quantDataLog)
```

```{r echo = FALSE, fig.align="left", fig.dim=c(9,5)}
library(ggplot2)
ggplot(quantDataLog_stack, aes(x=values)) + geom_density(aes(group=ind, colour=ind))
```

### Globally, are phosphopeptide intensities are approximately unimodal?
```{r echo = FALSE,fig.align="left", fig.dim=c(9,5)}

# ref for bquote particularly and plotting math expressions generally:
#   https://www.r-bloggers.com/2018/03/math-notation-for-r-plot-titles-expression-and-bquote/

#identify the location of missing values
fin <- is.finite(as.numeric(as.matrix(quantDataLog)))

logvalues <- as.numeric(as.matrix(quantDataLog))[fin]
plot(
  density(logvalues)
, main = bquote("Smoothed estimated probability density vs." ~ log[10](intensity))
, xlab = bquote(log[10](intensity))
)
hist(
  x = as.numeric(as.matrix(quantDataLog))
, breaks = 100
, main = bquote("Frequency vs." ~ log[10](intensity))
, xlab = bquote(log[10](intensity))
)
```

<!--
## Impute missing values
-->

### Distribution of standard deviations of phosphopeptides, ignoring missing values:

```{r echo = FALSE, fig.align="left", fig.dim=c(9,5)}
#determine quantile
q1 <- quantile(logvalues, probs = meanPercentile)[1]

#determine standard deviation of quantile to impute
sd_finite <- function(x) {
  ok <- is.finite(x)
  sd(x[ok]) * sdPercentile
}
sds <- apply(quantDataLog, 1, sd_finite) # 1 = row of matrix (ie, phosphopeptide)
plot(
  density(sds, na.rm=T)
, main="Smoothed estimated probability density vs. std. deviation"
, sub="(probability estimation made with Gaussian smoothing)"
)

m1 <- median(sds, na.rm=T) #sd to be used is the median sd

```



<!--
The number of missing values are:
-->
```{r echo=FALSE}
#Determine number of cells to impute
temp <- quantData[is.na(quantData)]

#Determine number of values to impute
NoToImpute <- length(temp)
```

<!--
% of values that are missing:
-->
```{r echo=FALSE}
pct_missing_values <- length(temp)/(length(logvalues)+length(temp)) * 100
```

<!--
First few rows of data before imputation:
-->
## Impute missing values
```{r echo = FALSE}

#ACE start segment: trt-median based imputation
# prep for trt-median based imputation

# Assuming that regexSampleNames <- "\\.(\\d+)[A-Z]$"
#   get factors -> group runs (samples) by ignoring terminal [A-Z] in sample names
# regexpr(pattern, text, ignore.case = FALSE, perl = FALSE, fixed = FALSE, useBytes = FALSE)
m <- regexpr(regexSampleNames, names(quantData), perl=TRUE)
tempMatches <- regmatches(names(quantData), m)
print("Extracted sample names")
print(tempMatches)
m2 <- regexpr(regexSampleGrouping, tempMatches, perl=TRUE)
sampleNumbers <- as.factor(regmatches(tempMatches, m2))
print("Factor levels")
print(sampleNumbers)

```
```{r echo = FALSE}

#ACE hack begin
#Determine number of cells to impute
cat(
  sprintf("Before imputation, there are:\n %d peptides\n %d missing values (%2.0f%s)"
  , sum(rep.int(TRUE, nrow(quantData)))
  , sum(is.na(quantData))
  , pct_missing_values
  , "%"
  )
)
#ACE hack end

```
```{r echo = FALSE}

#Impute data
quantDataImputed <- quantData

# Identify which values are missing and need to be imputed
ind <- which(is.na(quantDataImputed), arr.ind=TRUE)

```
```{r echo = FALSE}

# Apply imputation
switch(
  imputationMethod
, "group-median"={
    cat("Imputation method: substitute missing value with median peptide-intensity for sample-group\n")
    #goodRows <- rep.int(TRUE, nrow(quantDataImputed))
    sampleLevelIntegers <- as.integer(sampleNumbers)
    for (i in 1:length(levels(sampleNumbers))) {
      levelCols <- i == sampleLevelIntegers
      ind <- which(is.na(quantDataImputed[,levelCols]), arr.ind=TRUE)
      quantDataImputed[ind,levelCols] <- apply(quantDataImputed[,levelCols], 1, median, na.rm=T)[ind[,1]]
    }
    goodRows <- !is.na(rowMeans(quantDataImputed))
  }
, "median"={
    cat("Imputation method: substitute missing value with median peptide-intensity across all sample classes\n")
    quantDataImputed[ind] <- apply(quantDataImputed, 1, median, na.rm=T)[ind[,1]]
    goodRows <- !is.na(rowMeans(quantDataImputed))
  }
, "mean"={
    cat("Imputation method: substitute missing value with mean peptide-intensity across all sample classes\n")
    quantDataImputed[ind] <- apply(quantDataImputed, 1, mean, na.rm=T)[ind[,1]]
    goodRows <- !is.na(rowMeans(quantDataImputed))
  }
, "random"={
    cat(
      sprintf(
        "Imputation method: substitute missing value with random intensity N ~ (%0.2f, %0.2f)\n"
      , q1, m1
      )
    )
    quantDataImputed[is.na(quantDataImputed)] <- 10^rnorm(NoToImpute, mean= q1, sd = m1)
    goodRows <- !is.na(rowMeans(quantDataImputed))
  }
)

```
```{r echo = FALSE}

#Determine number of cells to impute
temp <- quantDataImputed[is.na(quantDataImputed)]
cat(
  sprintf(
    "After imputation, there are:\n  %d missing values\n  %d usable peptides\n  %d peptides with too many missing values for further analysis"
  , sum(is.na(quantDataImputed[goodRows,]))
  , sum(goodRows)
  , sum(!goodRows)
  )
)
```
```{r echo = FALSE}


# Zap rows where imputation was ineffective
fullData         <- fullData        [goodRows, ]
quantData        <- quantData       [goodRows, ]
quantDataImputed <- quantDataImputed[goodRows, ]

```
```{r echo = FALSE}

d_combined <- (density(as.numeric(as.matrix(log10(quantDataImputed)))))
d_original <- density(as.numeric(as.matrix(log10(quantDataImputed[!is.na(quantData)]))))

```
```{r echo = FALSE}

if (sum(is.na(quantData)) > 0) {
  # There ARE missing values
  d_imputed <- (density(as.numeric(as.matrix(log10(quantDataImputed[is.na(quantData)])))))
} else {
  # There are NO missing values
  d_imputed <- d_combined
}

```

<!-- ```{r echo = FALSE, fig.cap = "Blue =  Data before imputation; Red = Imputed data"} -->
```{r echo = FALSE, fig.dim=c(9,5)}
ylim <- c(0, max(d_combined$y, d_original$y, d_imputed$y))
plot(
  d_combined
, ylim = ylim
, sub = "Blue = data before imputation; Red = imputed data"
, main = "Density vs. log10(intensity) before and after imputation"
)
lines(d_original, col="blue")
lines(d_imputed, col="red")
```

## Perform Quantile Normalization
```{r echo=FALSE}
library(preprocessCore)
# Apply quantile normalization using preprocessCore::normalize.quantiles
# ---
# tool repository: http://bioconductor.org/packages/release/bioc/html/preprocessCore.html
#   except this: https://support.bioconductor.org/p/122925/#9135989
#   says to install it like this:
#     ```
#     BiocManager::install("preprocessCore", configure.args="--disable-threading", force = TRUE,lib=.libPaths()[1])
#     ```
# conda installation (necessary because of a bug in recent openblas):
#   conda install bioconductor-preprocesscore openblas=0.3.3
# ...
# ---
# normalize.quantiles {preprocessCore}	--  Quantile Normalization
#
# Description:
#   Using a normalization based upon quantiles, this function normalizes a matrix of probe level intensities.
#
# Usage:
#   normalize.quantiles(x,copy=TRUE, keep.names=FALSE)
#
# Arguments:
#
#   - x: A matrix of intensities where each column corresponds to a chip and each row is a probe.
#
#   - copy: Make a copy of matrix before normalizing. Usually safer to work with a copy,
#       but in certain situations not making a copy of the matrix, but instead normalizing
#       it in place will be more memory friendly.
#
#   - keep.names: Boolean option to preserve matrix row and column names in output.
#
# Details:
#   This method is based upon the concept of a quantile-quantile plot extended to n dimensions.
#     No special allowances are made for outliers. If you make use of quantile normalization
#     please cite Bolstad et al, Bioinformatics (2003).
#
#   This functions will handle missing data (ie NA values), based on
#     the assumption that the data is missing at random.
#
#   Note that the current implementation optimizes for better memory usage
#     at the cost of some additional run-time.
#
# Value: A normalized matrix.
#
# Author: Ben Bolstad, bmbolstad.com
#
# References
#
#   - Bolstad, B (2001) Probe Level Quantile Normalization of High Density Oligonucleotide
#       Array Data. Unpublished manuscript http://bmbolstad.com/stuff/qnorm.pdf
#
#   - Bolstad, B. M., Irizarry R. A., Astrand, M, and Speed, T. P. (2003) A Comparison of
#       Normalization Methods for High Density Oligonucleotide Array Data Based on Bias
#       and Variance. Bioinformatics 19(2), pp 185-193. DOI 10.1093/bioinformatics/19.2.185
#       http://bmbolstad.com/misc/normalize/normalize.html
# ...

if (TRUE) {
  quantDataImputed.qn <- normalize.quantiles(as.matrix(quantDataImputed)) 
} else {
  quantDataImputed.qn <- as.matrix(quantDataImputed)
}

quantDataImputed.qn = as.data.frame(quantDataImputed.qn)
names(quantDataImputed.qn) = names(quantDataImputed)
quantDataImputed_QN_log <- log10(quantDataImputed.qn)

rownames(quantDataImputed_QN_log) <- fullData[,1]

quantDataImputed.qn.LS = t(scale(t(log10(quantDataImputed.qn))))
anyNaN <- function (x) {
  !any(x == "NaN")
}
sel = apply(quantDataImputed.qn.LS, 1, anyNaN)
quantDataImputed.qn.LS2 <- quantDataImputed.qn.LS[which(sel),]
quantDataImputed.qn.LS2 = as.data.frame(quantDataImputed.qn.LS2)

#output quantile normalized data
dataTableImputed_QN_LT <- cbind(fullData[1:9], quantDataImputed_QN_log)
write.table(dataTableImputed_QN_LT, file = paste(paste(strsplit(imputedDataFilename, ".txt"),"QN_LT",sep="_"),".txt",sep=""), sep = "\t", col.names=TRUE, row.names=FALSE)

```

<!-- ACE insertion begin -->
### Checking that normalized, imputed, log-transformed sample distributions are similar:

```{r echo=FALSE}
#library(data.table)

#Save unimputed quantDataLog for plotting below
unimputedQuantDataLog <- quantDataLog

#Log10 transform (after preparing for zero values, which should never happen...)
quantDataImputed.qn[quantDataImputed.qn == 0] <- .000000001
quantDataLog <- log10(quantDataImputed.qn)

summary(quantDataLog)

#Output quantile-normalized log-transformed dataset with imputed, normalized data

dataTableImputed <- cbind(fullData[1:9], quantDataLog)
write.table(
    dataTableImputed
  , file=imputedDataFilename
  , sep="\t"
  , col.names=TRUE
  , row.names=FALSE
  , quote=FALSE
  )



#data visualization
old_par <- par(
  mai=par("mai") + c(0.5,0,0,0)
, oma=par("oma") + c(0.5,0,0,0)
)
boxplot(
  quantDataLog
, las=2
)
par(old_par)
```

```{r echo=FALSE, fig.dim=c(9,5)}
quantDataLog_stack <- stack(quantDataLog)
ggplot(quantDataLog_stack, aes(x=values)) + geom_density(aes(group=ind, colour=ind))
```

## Perform ANOVA filters

```{r,echo=FALSE}
#Make new data frame containing only Phosphopeptides to connect preANOVA to ANOVA (connect_df)
connect_df <- data.frame(
    dataTableImputed_QN_LT$Phosphopeptide
  , dataTableImputed_QN_LT[,firstDataColumn]
  )
colnames(connect_df) <- c("Phosphopeptide","Intensity")
```

```{r echo=FALSE, fig.dim=c(9,10)}
# Get factors -> group replicates (as indicated by terminal letter) by the preceding digits
#   For example, group .1A .1B .1C into group 1; .2A .2B .2C, into group 2; etc..
m <- regexpr(regexSampleNames, names(quantDataImputed_QN_log), perl=TRUE)
#ACE str(m)
tempMatches <- regmatches(names(quantDataImputed_QN_log), m)
#ACE str(tempMatches)
numSamples <- length(tempMatches)
#ACE str(numSamples)
m2 <- regexpr(regexSampleGrouping, tempMatches, perl=TRUE)
#ACE str(m2)
#ACE str(regmatches(tempMatches, m2))
sampleNumbers <- as.factor(regmatches(tempMatches, m2))
#ACE str(sampleNumbers)

if (length(levels(sampleNumbers))<2) {
  cat("ERROR!!!! Cannot perform ANOVA analysis because it requires two or more factor levels\n")
  cat("Unparsed sample names are:\n")
  print(names(quantDataImputed_QN_log))
  cat(sprintf("Parsing rule for SampleNames is '%s'\n", regexSampleNames))
  cat("Parsed names are:\n")
  print(tempMatches)
  cat(sprintf("Parsing rule for SampleGrouping is '%s'\n", regexSampleGrouping))
  cat("Sample group assignments are:\n")
  print(regmatches(tempMatches, m2))
} else {
  pValueData.anovaPs <- apply(quantDataImputed_QN_log, 1, anovaFunc, groupingFactor=sampleNumbers)

  pValueData.anovaPs.FDR <- p.adjust(pValueData.anovaPs, method="fdr")
  pValueData <- data.frame(
    phosphopeptide = fullData[,1]
  , rawANOVAp = pValueData.anovaPs
  , FDRadjustedANOVAp = pValueData.anovaPs.FDR
  )
  #ACE rownames(pValueData) <- fullData[,1]
  # output ANOVA file to constructed filename, 
  #   e.g.    "Outputfile_pST_ANOVA_STEP5.txt"
  #   becomes "Outpufile_pST_ANOVA_STEP5_FDR0.05.txt"

  #Re-output quantile-normalized log-transformed dataset with imputed, normalized data to include p-values

  dataTableImputed <- cbind(fullData[1:9], pValueData[,2:3], quantDataLog)
  write.table(
      dataTableImputed
    , file=imputedDataFilename
    , sep="\t"
    , col.names=TRUE
    , row.names=FALSE
    , quote=FALSE
    )


  pValueData <- pValueData[order(pValueData$FDRadjustedANOVAp),]

  cutoff <- valFDR[1]
  for (cutoff in valFDR){ #loop through FDR cutoffs

    filtered_p <- pValueData[which(pValueData$FDRadjustedANOVAp < cutoff),, drop = FALSE]
    filteredData.filtered <- quantDataImputed_QN_log[rownames(filtered_p),, drop = FALSE]
    filteredData.filtered <- filteredData.filtered[order(filtered_p$FDRadjustedANOVAp),, drop = FALSE]

    # <!-- ACE insertion start -->
    old_oma <- par("oma")
    old_par <- par(
      mai=(par("mai") + c(0.7,0,0,0)) * c(1,1,0.3,1)
    , oma=old_oma * c(1,1,0.3,1)
    , cex.main=0.9
    , cex.axis=0.7
    )
    
    if (nrow(filteredData.filtered) > 0) {
      boxplot(
        filteredData.filtered
      , main = sprintf("Imputed, normalized intensities where adjusted p-value < %0.2f", cutoff)
      # no line plot , main = ""
      , las = 2
      # , ylim = c(5.5,10)
      , ylab = expression(log[10](intensity))
      )
    } else {
      cat(sprintf("No peptides were found to have cutoff adjusted p-value < %0.2f\n", cutoff))
    }
    par(old_par)
    
    #Add Phosphopeptide column to ANOVA filtered table
    ANOVA.filtered_merge <- merge(
        x = connect_df
      , y = filteredData.filtered
      , by.x="Intensity"
      , by.y=1
      )
    ANOVA.filtered_merge.order <- rownames(filtered_p)
    
    ANOVA.filtered_merge.format <- sapply(
      X = filtered_p$FDRadjustedANOVAp
    , FUN = function(x) {
        if (x > 0.0001)
          paste0("(%0.",1+ceiling(-log10(x)),"f) %s")
        else
          paste0("(%0.4e) %s")
        }
    )

    #ANOVA.filtered_merge.format <- paste0("(%0.",1+ceiling(-log10(filtered_p$FDRadjustedANOVAp)),"f) %s")

    ANOVA.filtered <- data.table(
        ANOVA.filtered_merge$Phosphopeptide
      , ANOVA.filtered_merge$Intensity
      , ANOVA.filtered_merge[, 2:numSamples+1]
      )
    colnames(ANOVA.filtered) <- c("Phosphopeptide", colnames(filteredData.filtered))
    
    # merge qualitative columns into the ANOVA data
    output_table <- data.frame(ANOVA.filtered$Phosphopeptide)
    output_table <- merge(
        x = output_table
      , y = dataTableImputed_QN_LT
      , by.x = "ANOVA.filtered.Phosphopeptide"
      , by.y="Phosphopeptide"
      )

    #Produce heatmap to visualize significance and the effect of imputation
    m <- as.matrix(unimputedQuantDataLog[ANOVA.filtered_merge.order,])
    if (nrow(m) > 0) {
      rownames_m <- rownames(m)
      rownames(m) <- sapply(
          X = 1:nrow(m)
        , FUN = function(i) {
            sprintf(
              ANOVA.filtered_merge.format[i]
            , filtered_p$FDRadjustedANOVAp[i]
            , rownames_m[i]
            )
          }
        )
      margins <- c(
        max(nchar(colnames(m))) * 10 / 16 # col
      , max(nchar(rownames(m))) * 5 / 16 # row
      )
      how_many_peptides <- min(50, nrow(m))

      op <- par("cex.main")
      try(
        if (nrow(m) > 1) {
          par(cex.main=0.6)
          heatmap(
            m[how_many_peptides:1,]
          , Rowv = NA
          , Colv = NA
          , cexRow = 0.7
          , cexCol = 0.8
          , scale="row"
          , margins = margins
          , main = "Heatmap of unimputed, unnormalized intensities"
          , xlab = ""
          # , main = bquote(
          #     .( how_many_peptides )
          #       ~ " peptides with adjusted p-value <"
          #       ~ .(sprintf("%0.2f", cutoff))
          #     )
          )
        } 
      )
      #ACE fig_dim knitr::opts_chunk$set(fig.dim = fig_dim)
      par(op)
    }
    
  }
}
```

## Peptide IDs, etc.

See output files.