Mercurial > repos > galaxyp > maxquant_phosphopeptide_intensity

#!/usr/bin/env Rscript
# libraries
library(optparse)
library(data.table)
library(stringr)
#library(ggplot2)
#library(PTXQC)
#require(PTXQC)
#require(methods)

# title: "MaxQuant Processing Script"
# author: "Larry Cheng"
# date: "February 19, 2018"
#
# # MaxQuant Processing Script
# Takes MaxQuant Phospho (STY)sites.txt file as input and performs the following (in order):
# 1) Runs the Proteomics Quality Control software
# 2) Remove contaminant and reverse sequence rows
# 3) Filters rows based on localization probability
# 4) Extract the quantitative data
# 5) Sequences phosphopeptides
# 6) Merges multiply phosphorylated peptides
# 7) Filters out phosphopeptides based on enrichment
# The output file contains the phosphopeptide (first column) and the quantitative values for each sample
#
# ## Revision History
# Rev. 2018-02-19 :break up analysis script into "MaxQuant Processing Script" and "Phosphopeptide Processing Script"
# Rev. 2017-12-12 :added PTXQC
#                  added additional plots and table outputs for quality control
#                  allowed for more than 2 samples to be grouped together (up to 26 (eg, 1A, 1B, 1C, etc))regexSampleNames <-
#                  "\\.(\\d+)[A-Z]$"
#                  converted from .r to .rmd file to knit report for quality control
# Rev. 2016-09-11 :automated the FDR cutoffs; removed the option to data impute multiple times
# Rev. 2016-09-09 :added filter to eliminate contaminant and reverse sequence rows
# Rev. 2016-09-01 :moved the collapse step from after ANOVA filter to prior to preANOVA file output
# Rev. 2016-08-22 :changed regexpression to regexSampleNames <- "\\.(\\d+)[AB]$" so that it looks at the end of string
# Rev. 2016-08-05 :Removed vestigial line (ppeptides <- ....)
# Rev. 2016-07-03 :Removed row names from the write.table() output for ANOVA and PreANOVA
# Rev. 2016-06-25 :Set default Localization Probability cutoff to 0.75
# Rev. 2016-06-23 :fixed a bug in filtering for pY enrichment by resetting the row numbers afterwards
# Rev. 2016-06-21 :test18 + standardized the regexpression in protocol

## Variables to change for each input file
##  ```{r}
##  #Location of MaxQuant txt folder
##  # C:\Users\art\src\ppenrich\postMQanalysis\step2.1
##  dir_txtfolder <- "C:/Users/art/src/ppenrich/postMQanalysis/step2.1"
##  #Input Filename
##  inputFilename <- paste0(dir"/Phospho (STY)Sites.txt"
##  #Column number for the Intensity of the first sample
##  startCol <- 58
##  #Number of runs
##  numSamples <- 6
##  #Column number of "Number.of.Phospho..STY."
##  phosphoCol <- 37
##  #pY or pST enriched samples (ie, "Y" or "ST")
##  enriched <- "ST"
##  #output filename
##  outputfilename = "outputfile_STEP2.txt"
##  ```
##
##  ## Other modifiable variables
##  ```{r}
##  #Column interval between the Intensities of samples (eg, 1 if subsequent column; 2 if every other column)
##  intervalCol <- 1
##  #Localization Probability Cutoff
##  localProbCutoff <- 0.75
##  #merge identical phosphopeptides by ("sum" or "average") the intensities
##  collapse_FUN = "sum"
##  ```

# parse options
option_list <- list(
  make_option(
    c("-i", "--input"),
    action = "store",
    default = NA,
    type = "character",
    help = "A MaxQuant Phospho (STY)Sites.txt"
  ),
  make_option(
    c("-o", "--output"),
    action = "store",
    default = "output.tsv",
    type = "character",
    help = "output file"
  ),
  make_option(
    c("-e", "--enriched"),
    action = "store",
    default = "output.tsv",
    type = "character",
    help = "pY or pST enriched samples (ie, 'Y' or 'ST')"
  ),
  make_option(
    c("-n", "--numSamples"),
    action = "store",
    default = 3,
    type = "integer",
    help = "Number of samples"
  ),
  make_option(
    c("-p", "--phosphoCol"),
    action = "store",
    default = 13,
    type = "integer",
    help = "Column number of Number.of.Phospho..STY."
  ),
  make_option(
    c("-s", "--startCol"),
    action = "store",
    default = 24,
    type = "integer",
    help = "Column number for the Intensity of the first sample"
  ),
  make_option(
    c("-I", "--intervalCol"),
    action = "store",
    default = 1,
    type = "integer",
    help = "Column interval between the Intensities of samples (eg, 1 if subsequent column; 2 if every other column"
  ),
  make_option(
    c("-l", "--localProbCutoff"),
    action = "store",
    default = 0.75,
    type = "double",
    help = "Localization Probability Cutoff"
  ),
  make_option(
    c("-f", "--collapse_func"),
    action = "store",
    default = "sum",
    type = "character",
    help = "merge identical phosphopeptides by ('sum' or 'average') the intensities"
  )
)
args <- parse_args(OptionParser(option_list=option_list))
# Check parameter values

if (! file.exists(args$input)) {
  stop((paste("File", args$input, "does not exist")))
}
inputFilename <- args$input
outputfilename <- args$output
localProbCutoff <- args$localProbCutoff
numSamples <- args$numSamples
phosphoCol <- args$phosphoCol
startCol <- args$startCol
intervalCol <- args$intervalCol
enriched <- "ST"
collapse_FUN = args$collapse_func

### FUNCTIONS

#function to generate phosphopeptide and build list when applied
phosphopeptide_func <- function(df) {

  #generate peptide sequence and list of phosphopositions
  phosphoprobsequence <- strsplit(as.character(df["Phospho (STY) Score diffs"]), "")[[1]]
  output <- vector()
  phosphopeptide <- ""
  counter <- 0 #keep track of position in peptide
  phosphopositions <- vector() #keep track of phosphorylation positions in peptide
  score_diff <- ""
  for (chara in phosphoprobsequence){
    #build peptide sequence
    if (!(chara == " " | chara == "(" | chara == ")" | chara =="." | chara =="-" | chara == "0" | chara == "1" | chara == "2" | chara == "3" | chara =="4" | chara == "5" | chara == "6" | chara == "7" | chara =="8" | chara =="9")) {
      phosphopeptide <- paste(phosphopeptide,chara,sep="")
      counter <- counter + 1
    }
    #generate score_diff
    if (chara == "-" | chara =="." | chara == "0" | chara == "1" | chara == "2" | chara == "3" | chara =="4" | chara == "5" | chara == "6" | chara == "7" | chara =="8" | chara =="9"){
      score_diff <- paste(score_diff,chara,sep="")
    }
    #evaluate score_diff
    if (chara == ")" ){
      score_diff <- as.numeric(score_diff)
      #only consider a phosphoresidue if score_diff > 0
      if (score_diff > 0) {
        phosphopositions <- append(phosphopositions, counter)
      }
      score_diff <- ""
    }
  }

  #generate phosphopeptide sequence (ie, peptide sequence with "p"'s)
  counter <- 1
  phosphoposition_correction1 <- -1 #used to correct phosphosposition as "p"'s are inserted into the phosphopeptide string
  phosphoposition_correction2 <- 0 #used to correct phosphosposition as "p"'s are inserted into the phosphopeptide string
  while (counter <= length(phosphopositions) ) {
    phosphopeptide <- paste(substr(phosphopeptide,0,phosphopositions[counter]+phosphoposition_correction1),"p",substr(phosphopeptide,phosphopositions[counter]+phosphoposition_correction2,nchar(phosphopeptide)),sep="")
    counter <- counter + 1
    phosphoposition_correction1 <- phosphoposition_correction1 + 1
    phosphoposition_correction2 <- phosphoposition_correction2 + 1
  }

  #building phosphopeptide list
  output <- append(output,phosphopeptide)
  return(output)
}
##
##  ## Run Proteomics Quality Control for MaxQuant Results
##  (Bielow C et al. J Proteome Res. 2016 PMID: 26653327)
##
##  ```{r echo=FALSE}
##  library(PTXQC)
##  require(PTXQC)
##  require(methods)
##  txt_folder <- dir_txtfolder
##  r = createReport(txt_folder)
##  cat(paste0("\nReport has been generated inside the MaxQuant txt folder '", r$report_file, "'\n\n"))
##  ```
##
##
## Perform filters on contaminants, reverse sequences, and localization probability

fullData <- read.table(file = inputFilename, sep ="\t", header=T, quote="")

#Filter out contaminant rows and reverse rows
filteredData <- subset(fullData,!grepl("CON__", Proteins))
filteredData <- subset(filteredData,!grepl("_MYCOPLASMA", Proteins))
filteredData <- subset(filteredData,!grepl("CONTAMINANT_", Proteins))
filteredData <- subset(filteredData,!grepl("REV__", Protein)) #since REV__ rows are blank in the first column (Proteins)
write.table(filteredData, file = "filteredData.txt", sep = "\t", col.names=TRUE, row.names=FALSE)

#Data filtered by localization probability
locProbFilteredData <- filteredData[filteredData$Localization.prob>=localProbCutoff,]

##  Localization probability
##  #visualize locprob cutoff
locProbGraphData <- data.frame(
  group = c(paste(">",toString(localProbCutoff),sep=""), paste("<",toString(localProbCutoff),sep="")),
  value = c(nrow(locProbFilteredData)/nrow(filteredData)*100, (nrow(filteredData)-nrow(locProbFilteredData))/nrow(filteredData)*100)
)
##  library(ggplot2)
##  locProbGraph <- ggplot(locProbGraphData, aes(x="", y=value, fill=group)) +
##  geom_bar(width = 0.5, stat = "identity", color="black")
##
##  locProbGraph

## Extract out quantitative values
#Extract out quantitative values
quantData <- locProbFilteredData[,seq(from=startCol, by=intervalCol, length.out=numSamples)]
write.table(quantData, file = "quantdata.txt", sep = "\t", col.names=TRUE, row.names=FALSE)
#head(quantData)
##  ```
##
##
##  ## Generate Phosphopeptide Sequence
##  First few rows of the phosphopeptide column:
#for latest version of MaxQuant (Version 1.5.3.30)
dataTable <- data.frame(locProbFilteredData[,1:8],locProbFilteredData[,phosphoCol],locProbFilteredData[,phosphoCol+1],locProbFilteredData[,phosphoCol+2],locProbFilteredData[,phosphoCol+3],locProbFilteredData[,phosphoCol+4],locProbFilteredData[,phosphoCol+5],locProbFilteredData[,phosphoCol+6],locProbFilteredData[,phosphoCol+7],quantData)
colnames(dataTable) <- c("Proteins","Positions within proteins", "Leading proteins", "Protein", "Protein names", "Gene names", "Fasta headers", "Localization prob", "Number of Phospho (STY)", "Amino Acid", "Sequence window","Modification window", "Peptide window coverage", "Phospho (STY) Probabilities", "Phospho (STY) Score diffs", "Position in peptide", colnames(quantData))
##
#Generate column in dataTable for the predicted phosphopeptide
dataTable$Phosphopeptide <- apply(dataTable,1,phosphopeptide_func)
##
#Move Quant columns to the end
movetolast <- function(data, move) { #moves columns to the end of dataframe
  data[c(setdiff(names(data), move), move)]
}
dataTable <- movetolast(dataTable,c(colnames(quantData)))
##
#Make new data frame containing only Phosphopeptides to be mapped to quant data (merge_df)
dataTable <- setDT(dataTable, keep.rownames=TRUE) #row name will be used to map
merge_df <- data.frame(as.integer(dataTable$rn), dataTable$Phosphopeptide) #row index to merge data frames
colnames(merge_df) <- c("rn", "Phosphopeptide")
##
##
head(merge_df)
##
##
##  First few rows of the MaxQuant phosphoresidue score differences: Sequence representation for each of the possible PTM positions in each possible configuration, the difference is calculated between the identification score with the PTM added to that position and the best scoring identification where no PTM is added to that position. When this value is negative, it is unlikely that the particular modification is located at this position.
##  ```{r echo=FALSE}
##
##  head(locProbFilteredData$Phospho..STY..Score.diffs)
##  ```
##
##  First few rows of data with Phosphopeptide Sequence.
##  ```{r echo=FALSE }
##  # Add Phosphopeptide column to quant columns for quality control checking
quantData_qc <- as.data.frame(quantData)
setDT(quantData_qc, keep.rownames=TRUE) #will use to match rowname to data
quantData_qc$rn <- as.integer(quantData_qc$rn)
quantData_qc <- merge(merge_df,quantData_qc, by="rn")
quantData_qc$rn <- NULL #remove rn column
head(quantData_qc)
##
##  ```
##
##
##  ## Collapse multiphosphorylated peptides
##
##  Before collapse:
##  ```{r echo=FALSE}
##  summary(quantData_qc)
##  ```
##
##
##  After collapse:
##  ```{r echo=FALSE}
#Collapse multiphosphorylated peptides
quantData_qc_collapsed <- data.table(quantData_qc, key = "Phosphopeptide")
quantData_qc_collapsed <- aggregate(. ~ Phosphopeptide,quantData_qc, FUN= collapse_FUN)
summary(quantData_qc_collapsed)
##
##  ```
##
##  % of phosphopeptides that are multiphosphorylated is roughly ~
##  ```{r echo=FALSE}
(nrow(quantData_qc) - nrow(quantData_qc_collapsed))/nrow(quantData_qc)/2
##  ```
##
##  Breakdown of pY, pS, and pT before enrichment filter
##  ```{r echo=FALSE}
##  library(stringr)
pY_data <- quantData_qc_collapsed[str_detect(quantData_qc_collapsed$Phosphopeptide, "pY"),]
pS_data <- quantData_qc_collapsed[str_detect(quantData_qc_collapsed$Phosphopeptide, "pS"),]
pT_data <- quantData_qc_collapsed[str_detect(quantData_qc_collapsed$Phosphopeptide, "pT"),]
##
pY_num <- nrow(pY_data)
pS_num <- nrow(pS_data)
pT_num <- nrow(pT_data)
##
##  #visualize enrichment
enrichGraphData <- data.frame(
  group = c("pY", "pS", "pT"),
  value = c(pY_num, pS_num, pT_num)
)
##
##  library(ggplot2)
##  enrichGraph <- ggplot(enrichGraphData, aes(x="", y=value, fill=group)) +
##  geom_bar(width = 0.5, stat = "identity", color="black") + coord_polar("y", start=0)
##
##  enrichGraph
##  ```
##
##  ## Filter phosphopeptides by enrichment
##  ```{r echo=FALSE}
##  #Data filtered by enrichment
##  library(stringr)
if (enriched == "Y"){
  quantData_qc_enrichment <- quantData_qc_collapsed[str_detect(quantData_qc_collapsed$Phosphopeptide, "pY"),]
} else if ( enriched == "ST" ) {
  quantData_qc_enrichment <- quantData_qc_collapsed[str_detect(quantData_qc_collapsed$Phosphopeptide, "pS") | str_detect(quantData_qc_collapsed$Phosphopeptide, "pT"),]
} else { print("Error in enriched variable. Set to either 'Y' or 'ST'")}
##  ```
##
##
##  ```{r echo=FALSE}
##  #output files
##  #write.table(quantData_qc, file="BeforeCollapse.txt", sep="\t", row.names = FALSE) #for quality control
##  #write.table(quantData_qc_collapsed, file="AfterCollapse.txt", sep="\t", row.names = FALSE) #for quality control
write.table(quantData_qc_enrichment, file=outputfilename, sep="\t", quote = FALSE, row.names = FALSE)
##  ```
author	galaxyp
date	Thu, 04 Nov 2021 18:35:10 +0000
parents
children