view mqppep_anova_script.Rmd @ 7:d728198f1ba5 draft

"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit 9a0fa6d0f9aadc069a5551a54da6daf307885637"
author eschen42
date Tue, 15 Mar 2022 00:35:16 +0000
parents c1403d18c189
children 4deacfee76ef
line wrap: on
line source

---
title: "MaxQuant Phospho-Proteomic Enrichment Pipeline ANOVA"
author: "Larry Cheng; Art Eschenlauer"
date: "May 28, 2018; Nov 16, 2021"
output:
  pdf_document: default
params:
  inputFile: "test-data/test_input_for_anova.tabular"
  alphaFile: "test-data/alpha_levels.tabular"
  firstDataColumn: "Intensity"
  imputationMethod: !r c("group-median", "median", "mean", "random")[1]
  meanPercentile: 1
  sdPercentile: 0.2
  regexSampleNames: "\\.(\\d+)[A-Z]$"
  regexSampleGrouping: "(\\d+)"
  imputedDataFilename: "Upstream_Map_pST_outputfile_STEP4_QN_LT.txt"
---
```{r setup, include = FALSE}
# ref for parameterizing Rmd document: https://stackoverflow.com/a/37940285
knitr::opts_chunk$set(echo = FALSE, fig.dim = c(9, 10))

### FUNCTIONS

#ANOVA filter function
anova_func <- function(x, grouping_factor) {
  x_aov <- aov(as.numeric(x) ~ grouping_factor)
  pvalue <- summary(x_aov)[[1]][["Pr(>F)"]][1]
  pvalue
}
```

## Purpose:
Perform imputation of missing values, quantile normalization, and ANOVA.

<!--
## Variables to change for each input file
-->
```{r include = FALSE}
# Input Filename
input_file <- params$inputFile

# First data column - ideally, this could be detected via regexSampleNames,
#   but for now leave it as is.
first_data_column <- params$firstDataColumn
fdc_is_integer <- TRUE
first_data_column <- withCallingHandlers(
    as.integer(first_data_column)
  , warning = function(w) fdc_is_integer <<- FALSE
  )
if (FALSE == fdc_is_integer) {
  first_data_column <- params$firstDataColumn
}

# False discovery rate adjustment for ANOVA
#  Since pY abundance is low, set to 0.10 and 0.20 in addition to 0.05
val_fdr <-
  read.table(file = params$alphaFile, sep = "\t", header = F, quote = "")[, 1]

#Imputed Data filename
imputed_data_filename <- params$imputedDataFilename

#ANOVA data filename
```

```{r echo = FALSE}
# Imputation method, should be one of
#   "random", "group-median", "median", or "mean"
imputation_method <- params$imputationMethod

# Selection of percentile of logvalue data to set the mean for random number
#   generation when using random imputation
mean_percentile <- params$meanPercentile / 100.0

# deviation adjustment-factor for random values; real number.
sd_percentile <- params$sdPercentile

# Regular expression of Sample Names, e.g., "\\.(\\d+)[A-Z]$"
regex_sample_names <- params$regexSampleNames

# Regular expression to extract Sample Grouping from Sample Name;
#   if error occurs, compare sample_factor_levels and temp_matches
#   to see if groupings/pairs line up
#   e.g., "(\\d+)"
regex_sample_grouping <- params$regexSampleGrouping

```

```{r echo = FALSE}
### READ DATA

library(data.table)

# read.table reads a file in table format and creates a data frame from it.
#   - note that `quote = ""` means that quotation marks are treated literally.
full_data <- read.table(
  file = input_file,
  sep = "\t",
  header = T,
  quote = "",
  check.names = FALSE
  )
```

### Column names from input file

```{r echo = FALSE, results = 'markup'}
print(colnames(full_data))
data_column_indices <- grep(first_data_column, names(full_data), perl = TRUE)
cat(sprintf("First data column:  %d\n", min(data_column_indices)))
cat(sprintf("Last data column:   %d\n", max(data_column_indices)))
```

```{r echo = FALSE, results = 'asis'}
cat("\\newpage\n")
```

### Checking that log-transformed sample distributions are similar:

```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'}

if (FALSE == fdc_is_integer) {

  if (length(data_column_indices) > 0) {
    first_data_column <- data_column_indices[1]
  } else {
    stop(paste("failed to convert firstDataColumn:", first_data_column))
  }
}

quant_data0 <- full_data[first_data_column:length(full_data)]
quant_data <- full_data[first_data_column:length(full_data)]
quant_data[quant_data == 0] <- NA  #replace 0 with NA
quant_data_log <- log10(quant_data)

rownames(quant_data_log) <- full_data$Phosphopeptide

# data visualization
old_par <- par(
  mai = par("mai") + c(0.5, 0, 0, 0)
)
boxplot(
  quant_data_log
, las = 2
)
par(old_par)



cat("\\newline\n")
cat("\\newline\n")

```

```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 4), warning = FALSE}
quant_data_log_stack <- stack(quant_data_log)
library(ggplot2)
ggplot(
  quant_data_log_stack,
  aes(x = values)) + geom_density(aes(group = ind, colour = ind))
```

### Globally, are phosphopeptide intensities are approximately unimodal?

<!--
# ref for bquote below particularly and plotting math expressions generally:
#   https://www.r-bloggers.com/2018/03/math-notation-for-r-plot-titles-expression-and-bquote/
-->
```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 5)}

# identify the location of missing values
fin <- is.finite(as.numeric(as.matrix(quant_data_log)))

logvalues <- as.numeric(as.matrix(quant_data_log))[fin]
plot(
  density(logvalues),
  main = bquote(
    "Smoothed estimated probability density vs." ~ log[10](intensity)),
  xlab = bquote(log[10](intensity))
  )
hist(
  x = as.numeric(as.matrix(quant_data_log))
, breaks = 100
, main = bquote("Frequency vs." ~ log[10](intensity))
, xlab = bquote(log[10](intensity))
)
```

### Distribution of standard deviations of phosphopeptides, ignoring missing values:

```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 5)}
# determine quantile
q1 <- quantile(logvalues, probs = mean_percentile)[1]

# determine standard deviation of quantile to impute
sd_finite <- function(x) {
  ok <- is.finite(x)
  sd(x[ok]) * sd_percentile
}
# 1 = row of matrix (ie, phosphopeptide)
sds <- apply(quant_data_log, 1, sd_finite)
plot(
  density(sds, na.rm = T)
, main = "Smoothed estimated probability density vs. std. deviation"
, sub = "(probability estimation made with Gaussian smoothing)"
)

m1 <- median(sds, na.rm = T) #sd to be used is the median sd

```



<!--
The number of missing values are:
-->
```{r echo = FALSE}
#Determine number of cells to impute
temp <- quant_data[is.na(quant_data)]

#Determine number of values to impute
number_to_impute <- length(temp)
```

<!--
% of values that are missing:
-->
```{r echo = FALSE}
pct_missing_values <- length(temp) / (length(logvalues) + length(temp)) * 100
```

<!--
First few rows of data before imputation:
-->
```{r echo = FALSE, results = 'asis'}
cat("\\newpage\n")
```

## Parse sample names

Parse the names of the samples to deduce the factor level for each sample:

```{r echo = FALSE}

# prep for trt-median based imputation

# Assuming that regex_sample_names <- "\\.(\\d+)[A-Z]$"
#   get factors ->
#      group runs (samples) by ignoring terminal [A-Z] in sample names

m <- regexpr(regex_sample_names, names(quant_data), perl = TRUE)
temp_matches <- regmatches(names(quant_data), m)
print("Extracted sample names")
print(temp_matches)
m2 <- regexpr(regex_sample_grouping, temp_matches, perl = TRUE)
sample_factor_levels <- as.factor(regmatches(temp_matches, m2))
print("Factor levels")
print(sample_factor_levels)

```
## Impute missing values

```{r echo = FALSE}

#Determine number of cells to impute
cat("Before imputation,",
  sprintf(
    "there are:\n  %d peptides\n  %d missing values (%2.0f%s)",
    sum(rep.int(TRUE, nrow(quant_data))),
    sum(is.na(quant_data)),
    pct_missing_values,
    "%"
    )
)

```
```{r echo = FALSE}

#Impute data
quant_data_imp <- quant_data

# Identify which values are missing and need to be imputed
ind <- which(is.na(quant_data_imp), arr.ind = TRUE)

```
```{r echo = FALSE}

# Apply imputation
switch(
  imputation_method
, "group-median" = {
    cat("Imputation method:\n   substitute missing value",
      "with median peptide-intensity for sample-group\n")

    sample_level_integers <- as.integer(sample_factor_levels)
    for (i in seq_len(length(levels(sample_factor_levels)))) {
      level_cols <- i == sample_level_integers
      ind <- which(is.na(quant_data_imp[, level_cols]), arr.ind = TRUE)
      quant_data_imp[ind, level_cols] <-
        apply(quant_data_imp[, level_cols], 1, median, na.rm = T)[ind[, 1]]
    }
    good_rows <- !is.na(rowMeans(quant_data_imp))
  }
, "median" = {
    cat("Imputation method:\n   substitute missing value with",
      "median peptide-intensity across all sample classes\n")
    quant_data_imp[ind] <- apply(quant_data_imp, 1, median, na.rm = T)[ind[, 1]]
    good_rows <- !is.na(rowMeans(quant_data_imp))
  }
, "mean" = {
    cat("Imputation method:\n   substitute missing value with",
      "mean peptide-intensity across all sample classes\n")
    quant_data_imp[ind] <- apply(quant_data_imp, 1, mean, na.rm = T)[ind[, 1]]
    good_rows <- !is.na(rowMeans(quant_data_imp))
  }
, "random" = {
    cat(
      "Imputation method:\n   substitute missing value with\n  ",
      sprintf(
        "random intensity N ~ (%0.2f, %0.2f)\n"
      , q1, m1
      )
    )
    quant_data_imp[is.na(quant_data_imp)] <-
      10 ^ rnorm(number_to_impute, mean = q1, sd = m1)
    good_rows <- !is.na(rowMeans(quant_data_imp))
  }
)

```
```{r echo = FALSE}

#Determine number of cells to impute
temp <- quant_data_imp[is.na(quant_data_imp)]
cat("After imputation, there are:",
  sprintf(
    "\n  %d missing values\n  %d usable peptides analysis"
  , sum(is.na(quant_data_imp[good_rows, ]))
  , sum(good_rows)
  ),
  sprintf(
    "\n  %d peptides with too many missing values for further analysis"
  , sum(!good_rows)
  )
)
```
```{r echo = FALSE}


# Zap rows where imputation was ineffective
full_data         <- full_data        [good_rows, ]
quant_data        <- quant_data       [good_rows, ]
quant_data_imp <- quant_data_imp[good_rows, ]

```
```{r echo = FALSE}

d_combined <- (density(as.numeric(as.matrix(
  log10(quant_data_imp)
))))
d_original <-
  density(as.numeric(as.matrix(
    log10(quant_data_imp[!is.na(quant_data)]))))

```
```{r echo = FALSE}

if (sum(is.na(quant_data)) > 0) {
  # There ARE missing values
  d_imputed <-
    (density(as.numeric(as.matrix(
      log10(quant_data_imp[is.na(quant_data)])
    ))))
} else {
  # There are NO missing values
  d_imputed <- d_combined
}

```

```{r echo = FALSE, fig.dim = c(9, 5)}
ylim <- c(0, max(d_combined$y, d_original$y, d_imputed$y))
plot(
  d_combined,
  ylim = ylim,
  sub = "Blue = data before imputation; Red = imputed data",
  main = "Density vs. log10(intensity) before and after imputation"
)
lines(d_original, col = "blue")
lines(d_imputed, col = "red")
```

## Perform Quantile Normalization

<!--
# Apply quantile normalization using preprocessCore::normalize.quantiles
# ---
# tool repository: http://bioconductor.org/packages/release/bioc/html/preprocessCore.html
#   except this: https://support.bioconductor.org/p/122925/#9135989
#   says to install it like this:
#     ```
#     BiocManager::install("preprocessCore", configure.args="--disable-threading", force = TRUE, lib=.libPaths()[1])
#     ```
# conda installation (necessary because of a bug in recent openblas):
#   conda install bioconductor-preprocesscore openblas=0.3.3
# ...
# ---
# normalize.quantiles {preprocessCore}	--  Quantile Normalization
#
# Description:
#   Using a normalization based upon quantiles, this function normalizes a matrix of probe level intensities.
#
# Usage:
#   normalize.quantiles(x, copy = TRUE, keep.names = FALSE)
#
# Arguments:
#
#   - x: A matrix of intensities where each column corresponds to a chip and each row is a probe.
#
#   - copy: Make a copy of matrix before normalizing. Usually safer to work with a copy,
#       but in certain situations not making a copy of the matrix, but instead normalizing
#       it in place will be more memory friendly.
#
#   - keep.names: Boolean option to preserve matrix row and column names in output.
#
# Details:
#   This method is based upon the concept of a quantile-quantile plot extended to n dimensions.
#     No special allowances are made for outliers. If you make use of quantile normalization
#     please cite Bolstad et al, Bioinformatics (2003).
#
#   This functions will handle missing data (ie NA values), based on
#     the assumption that the data is missing at random.
#
#   Note that the current implementation optimizes for better memory usage
#     at the cost of some additional run-time.
#
# Value: A normalized matrix.
#
# Author: Ben Bolstad, bmbolstad.com
#
# References
#
#   - Bolstad, B (2001) Probe Level Quantile Normalization of High Density Oligonucleotide
#       Array Data. Unpublished manuscript http://bmbolstad.com/stuff/qnorm.pdf
#
#   - Bolstad, B. M., Irizarry R. A., Astrand, M, and Speed, T. P. (2003) A Comparison of
#       Normalization Methods for High Density Oligonucleotide Array Data Based on Bias
#       and Variance. Bioinformatics 19(2), pp 185-193. DOI 10.1093/bioinformatics/19.2.185
#       http://bmbolstad.com/misc/normalize/normalize.html
# ...
-->
```{r echo = FALSE}
library(preprocessCore)

if (TRUE) {
  quant_data_imp_qn <- normalize.quantiles(as.matrix(quant_data_imp))
} else {
  quant_data_imp_qn <- as.matrix(quant_data_imp)
}

quant_data_imp_qn <- as.data.frame(quant_data_imp_qn)
names(quant_data_imp_qn) <- names(quant_data_imp)
quant_data_imp_qn_log <- log10(quant_data_imp_qn)

rownames(quant_data_imp_qn_log) <- full_data[, 1]

quant_data_imp_qn_ls <- t(scale(t(log10(quant_data_imp_qn))))
any_nan <- function(x) {
  !any(x == "NaN")
}
sel <- apply(quant_data_imp_qn_ls, 1, any_nan)
quant_data_imp_qn_ls2 <- quant_data_imp_qn_ls[which(sel), ]
quant_data_imp_qn_ls2 <- as.data.frame(quant_data_imp_qn_ls2)

#output quantile normalized data
data_table_imp_qn_lt <- cbind(full_data[1:9], quant_data_imp_qn_log)
write.table(
  data_table_imp_qn_lt,
  file = paste(paste(
    strsplit(imputed_data_filename, ".txt"), "QN_LT", sep = "_"
  ), ".txt", sep = ""),
  sep = "\t",
  col.names = TRUE,
  row.names = FALSE
)

```

<!-- ACE insertion begin -->
### Checking that normalized, imputed, log-transformed sample distributions are similar:

```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'}


# Save unimputed quant_data_log for plotting below
unimputed_quant_data_log <- quant_data_log

# log10 transform (after preparing for zero values,
#   which should never happen...)
quant_data_imp_qn[quant_data_imp_qn == 0] <- .000000001
quant_data_log <- log10(quant_data_imp_qn)

# Output quantile-normalized log-transformed dataset
#   with imputed, normalized data

data_table_imputed <- cbind(full_data[1:9], quant_data_log)
write.table(
    data_table_imputed
  , file = imputed_data_filename
  , sep = "\t"
  , col.names = TRUE
  , row.names = FALSE
  , quote = FALSE
  )



# data visualization
old_par <- par(
  mai = par("mai") + c(0.5, 0, 0, 0)
, oma = par("oma") + c(0.5, 0, 0, 0)
)
boxplot(
  quant_data_log
, las = 2
)
par(old_par)



cat("\\newline\n")
cat("\\newline\n")

```

```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 4)}
quant_data_log_stack <- stack(quant_data_log)
ggplot(
  quant_data_log_stack,
  aes(x = values)
  ) + geom_density(aes(group = ind, colour = ind))
```

## Perform ANOVA filters

(see following pages)

```{r, echo = FALSE}
# Make new data frame containing only Phosphopeptides
#   to connect preANOVA to ANOVA (connect_df)
connect_df <- data.frame(
    data_table_imp_qn_lt$Phosphopeptide
  , data_table_imp_qn_lt[, first_data_column]
  )
colnames(connect_df) <- c("Phosphopeptide", "Intensity")
```

```{r echo = FALSE, fig.dim = c(9, 10), results = 'asis'}
# Get factors -> group replicates (as indicated by terminal letter)
#   by the preceding digits;
#   e.g., group .1A .1B .1C into group 1; .2A .2B .2C, into group 2; etc..
m <-
  regexpr(regex_sample_names, names(quant_data_imp_qn_log), perl = TRUE)

temp_matches <- regmatches(names(quant_data_imp_qn_log), m)

number_of_samples <- length(temp_matches)

m2 <- regexpr(regex_sample_grouping, temp_matches, perl = TRUE)


sample_factor_levels <- as.factor(regmatches(temp_matches, m2))


if (length(levels(sample_factor_levels)) < 2) {
  cat(
    "ERROR!!!! Cannot perform ANOVA analysis",
    "because it requires two or more factor levels\n"
  )
  cat("Unparsed sample names are:\n")
  print(names(quant_data_imp_qn_log))
  cat(sprintf("Parsing rule for SampleNames is '%s'\n", regex_sample_names))
  cat("Parsed names are:\n")
  print(temp_matches)
  cat(sprintf(
    "Parsing rule for SampleGrouping is '%s'\n",
    regex_sample_grouping
  ))
  cat("Sample group assignments are:\n")
  print(regmatches(temp_matches, m2))
} else {
  p_value_data_anova_ps <-
    apply(
      quant_data_imp_qn_log,
      1,
      anova_func,
      grouping_factor = sample_factor_levels
      )

  p_value_data_anova_ps_fdr <-
    p.adjust(p_value_data_anova_ps, method = "fdr")
  p_value_data <- data.frame(
    phosphopeptide = full_data[, 1]
    ,
    raw_anova_p = p_value_data_anova_ps
    ,
    fdr_adjusted_anova_p = p_value_data_anova_ps_fdr
  )

  # output ANOVA file to constructed filename,
  #   e.g.    "Outputfile_pST_ANOVA_STEP5.txt"
  #   becomes "Outpufile_pST_ANOVA_STEP5_FDR0.05.txt"

  # Re-output quantile-normalized log-transformed dataset
  #   with imputed, normalized data to include p-values

  data_table_imputed <-
    cbind(full_data[1:9], p_value_data[, 2:3], quant_data_log)
  write.table(
    data_table_imputed,
    file = imputed_data_filename,
    sep = "\t",
    col.names = TRUE,
    row.names = FALSE,
    quote = FALSE
    )


  p_value_data <-
    p_value_data[order(p_value_data$fdr_adjusted_anova_p), ]

  cutoff <- val_fdr[1]
  for (cutoff in val_fdr) {
    #loop through FDR cutoffs

    filtered_p <-
      p_value_data[
        which(p_value_data$fdr_adjusted_anova_p < cutoff),
        ,
        drop = FALSE
        ]
    filtered_data_filtered <-
      quant_data_imp_qn_log[
        rownames(filtered_p),
        ,
        drop = FALSE
        ]
    filtered_data_filtered <-
      filtered_data_filtered[
        order(filtered_p$fdr_adjusted_anova_p),
        ,
        drop = FALSE
        ]

    # <!-- ACE insertion start -->
    old_oma <- par("oma")
    old_par <- par(
      mai = (par("mai") + c(0.7, 0, 0, 0)) * c(1, 1, 0.3, 1),
      oma = old_oma * c(1, 1, 0.3, 1),
      cex.main = 0.9,
      cex.axis = 0.7
      )

    cat("\\newpage\n")
    if (nrow(filtered_data_filtered) > 0) {
      cat(sprintf(
        "Intensities for peptides whose adjusted p-value < %0.2f\n",
        cutoff
      ))
      cat("\\newline\n")
      cat("\\newline\n")

      boxplot(
        filtered_data_filtered,
        main = "Imputed, normalized intensities", # no line plot
        las = 2,
        ylab = expression(log[10](intensity))
      )
    } else {
      cat(sprintf(
        "No peptides were found to have cutoff adjusted p-value < %0.2f\n",
        cutoff
      ))
    }
    par(old_par)

    if (nrow(filtered_data_filtered) > 0) {
      #Add Phosphopeptide column to anova_filtered table
      anova_filtered_merge <- merge(
        x = connect_df
        ,
        y = filtered_data_filtered
        ,
        by.x = "Intensity"
        ,
        by.y = 1
      )
      anova_filtered_merge_order <- rownames(filtered_p)

      anova_filtered_merge_format <- sapply(
        X = filtered_p$fdr_adjusted_anova_p
        ,
        FUN = function(x) {
          if (x > 0.0001)
            paste0("(%0.", 1 + ceiling(-log10(x)), "f) %s")
          else
            paste0("(%0.4e) %s")
        }
      )



      anova_filtered <- data.table(
        anova_filtered_merge$Phosphopeptide
        ,
        anova_filtered_merge$Intensity
        ,
        anova_filtered_merge[, 2:number_of_samples + 1]
      )
      colnames(anova_filtered) <-
        c("Phosphopeptide", colnames(filtered_data_filtered))

      # merge qualitative columns into the ANOVA data
      output_table <- data.frame(anova_filtered$Phosphopeptide)
      output_table <- merge(
        x = output_table
        ,
        y = data_table_imp_qn_lt
        ,
        by.x = "anova_filtered.Phosphopeptide"
        ,
        by.y = "Phosphopeptide"
      )

      #Produce heatmap to visualize significance and the effect of imputation
      m <-
        as.matrix(unimputed_quant_data_log[anova_filtered_merge_order, ])
      if (nrow(m) > 0) {
        rownames_m <- rownames(m)
        rownames(m) <- sapply(
          X = seq_len(nrow(m))
          ,
          FUN = function(i) {
            sprintf(
              anova_filtered_merge_format[i]
              ,
              filtered_p$fdr_adjusted_anova_p[i]
              ,
              rownames_m[i]
            )
          }
        )
        margins <- c(max(nchar(colnames(m))) * 10 / 16 # col
                     , max(nchar(rownames(m))) * 5 / 16 # row
                     )
                     how_many_peptides <- min(50, nrow(m))

                     cat("\\newpage\n")
                     if (nrow(m) > 50) {
                       cat("Heatmap for the 50 most-significant peptides",
                         sprintf(
                           "whose adjusted p-value < %0.2f\n",
                           cutoff)
                       )
                     } else {
                       cat("Heatmap for peptides whose",
                         sprintf("adjusted p-value < %0.2f\n",
                         cutoff)
                       )
                     }
                     cat("\\newline\n")
                     cat("\\newline\n")
                     op <- par("cex.main")
                     try(
                       if (nrow(m) > 1) {
                         par(cex.main = 0.6)
                         heatmap(
                           m[how_many_peptides:1, ],
                           Rowv = NA,
                           Colv = NA,
                           cexRow = 0.7,
                           cexCol = 0.8,
                           scale = "row",
                           margins = margins,
                           main =
                             "Heatmap of unimputed, unnormalized intensities",
                           xlab = ""
                           )
                       }
                     )
                     par(op)
      }
    }
  }
}
```

<!--
## Peptide IDs, etc.

See output files.
-->