Mercurial > repos > galaxyp > maxquant_phosphopeptide_intensity

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/MaxQuantProcessingScript.R	Thu Nov 04 18:35:10 2021 +0000
@@ -0,0 +1,356 @@
+#!/usr/bin/env Rscript
+# libraries
+library(optparse)
+library(data.table)
+library(stringr)
+#library(ggplot2)
+#library(PTXQC)
+#require(PTXQC)
+#require(methods)
+
+# title: "MaxQuant Processing Script"
+# author: "Larry Cheng"
+# date: "February 19, 2018"
+#
+# # MaxQuant Processing Script
+# Takes MaxQuant Phospho (STY)sites.txt file as input and performs the following (in order):
+# 1) Runs the Proteomics Quality Control software
+# 2) Remove contaminant and reverse sequence rows
+# 3) Filters rows based on localization probability
+# 4) Extract the quantitative data
+# 5) Sequences phosphopeptides
+# 6) Merges multiply phosphorylated peptides
+# 7) Filters out phosphopeptides based on enrichment
+# The output file contains the phosphopeptide (first column) and the quantitative values for each sample
+#
+# ## Revision History
+# Rev. 2018-02-19 :break up analysis script into "MaxQuant Processing Script" and "Phosphopeptide Processing Script"
+# Rev. 2017-12-12 :added PTXQC
+#                  added additional plots and table outputs for quality control
+#                  allowed for more than 2 samples to be grouped together (up to 26 (eg, 1A, 1B, 1C, etc))regexSampleNames <-
+#                  "\\.(\\d+)[A-Z]$"
+#                  converted from .r to .rmd file to knit report for quality control
+# Rev. 2016-09-11 :automated the FDR cutoffs; removed the option to data impute multiple times
+# Rev. 2016-09-09 :added filter to eliminate contaminant and reverse sequence rows
+# Rev. 2016-09-01 :moved the collapse step from after ANOVA filter to prior to preANOVA file output
+# Rev. 2016-08-22 :changed regexpression to regexSampleNames <- "\\.(\\d+)[AB]$" so that it looks at the end of string
+# Rev. 2016-08-05 :Removed vestigial line (ppeptides <- ....)
+# Rev. 2016-07-03 :Removed row names from the write.table() output for ANOVA and PreANOVA
+# Rev. 2016-06-25 :Set default Localization Probability cutoff to 0.75
+# Rev. 2016-06-23 :fixed a bug in filtering for pY enrichment by resetting the row numbers afterwards
+# Rev. 2016-06-21 :test18 + standardized the regexpression in protocol
+
+## Variables to change for each input file
+##  ```{r}
+##  #Location of MaxQuant txt folder
+##  # C:\Users\art\src\ppenrich\postMQanalysis\step2.1
+##  dir_txtfolder <- "C:/Users/art/src/ppenrich/postMQanalysis/step2.1"
+##  #Input Filename
+##  inputFilename <- paste0(dir"/Phospho (STY)Sites.txt"
+##  #Column number for the Intensity of the first sample
+##  startCol <- 58
+##  #Number of runs
+##  numSamples <- 6
+##  #Column number of "Number.of.Phospho..STY."
+##  phosphoCol <- 37
+##  #pY or pST enriched samples (ie, "Y" or "ST")
+##  enriched <- "ST"
+##  #output filename
+##  outputfilename = "outputfile_STEP2.txt"
+##  ```
+##
+##  ## Other modifiable variables
+##  ```{r}
+##  #Column interval between the Intensities of samples (eg, 1 if subsequent column; 2 if every other column)
+##  intervalCol <- 1
+##  #Localization Probability Cutoff
+##  localProbCutoff <- 0.75
+##  #merge identical phosphopeptides by ("sum" or "average") the intensities
+##  collapse_FUN = "sum"
+##  ```
+
+# parse options
+option_list <- list(
+  make_option(
+    c("-i", "--input"),
+    action = "store",
+    default = NA,
+    type = "character",
+    help = "A MaxQuant Phospho (STY)Sites.txt"
+  ),
+  make_option(
+    c("-o", "--output"),
+    action = "store",
+    default = "output.tsv",
+    type = "character",
+    help = "output file"
+  ),
+  make_option(
+    c("-e", "--enriched"),
+    action = "store",
+    default = "output.tsv",
+    type = "character",
+    help = "pY or pST enriched samples (ie, 'Y' or 'ST')"
+  ),
+  make_option(
+    c("-n", "--numSamples"),
+    action = "store",
+    default = 3,
+    type = "integer",
+    help = "Number of samples"
+  ),
+  make_option(
+    c("-p", "--phosphoCol"),
+    action = "store",
+    default = 13,
+    type = "integer",
+    help = "Column number of Number.of.Phospho..STY."
+  ),
+  make_option(
+    c("-s", "--startCol"),
+    action = "store",
+    default = 24,
+    type = "integer",
+    help = "Column number for the Intensity of the first sample"
+  ),
+  make_option(
+    c("-I", "--intervalCol"),
+    action = "store",
+    default = 1,
+    type = "integer",
+    help = "Column interval between the Intensities of samples (eg, 1 if subsequent column; 2 if every other column"
+  ),
+  make_option(
+    c("-l", "--localProbCutoff"),
+    action = "store",
+    default = 0.75,
+    type = "double",
+    help = "Localization Probability Cutoff"
+  ),
+  make_option(
+    c("-f", "--collapse_func"),
+    action = "store",
+    default = "sum",
+    type = "character",
+    help = "merge identical phosphopeptides by ('sum' or 'average') the intensities"
+  )
+)
+args <- parse_args(OptionParser(option_list=option_list))
+# Check parameter values
+
+if (! file.exists(args$input)) {
+  stop((paste("File", args$input, "does not exist")))
+}
+inputFilename <- args$input
+outputfilename <- args$output
+localProbCutoff <- args$localProbCutoff
+numSamples <- args$numSamples
+phosphoCol <- args$phosphoCol
+startCol <- args$startCol
+intervalCol <- args$intervalCol
+enriched <- "ST"
+collapse_FUN = args$collapse_func
+
+### FUNCTIONS
+
+#function to generate phosphopeptide and build list when applied
+phosphopeptide_func <- function(df) {
+
+  #generate peptide sequence and list of phosphopositions
+  phosphoprobsequence <- strsplit(as.character(df["Phospho (STY) Score diffs"]), "")[[1]]
+  output <- vector()
+  phosphopeptide <- ""
+  counter <- 0 #keep track of position in peptide
+  phosphopositions <- vector() #keep track of phosphorylation positions in peptide
+  score_diff <- ""
+  for (chara in phosphoprobsequence){
+    #build peptide sequence
+    if (!(chara == " " | chara == "(" | chara == ")" | chara =="." | chara =="-" | chara == "0" | chara == "1" | chara == "2" | chara == "3" | chara =="4" | chara == "5" | chara == "6" | chara == "7" | chara =="8" | chara =="9")) {
+      phosphopeptide <- paste(phosphopeptide,chara,sep="")
+      counter <- counter + 1
+    }
+    #generate score_diff
+    if (chara == "-" | chara =="." | chara == "0" | chara == "1" | chara == "2" | chara == "3" | chara =="4" | chara == "5" | chara == "6" | chara == "7" | chara =="8" | chara =="9"){
+      score_diff <- paste(score_diff,chara,sep="")
+    }
+    #evaluate score_diff
+    if (chara == ")" ){
+      score_diff <- as.numeric(score_diff)
+      #only consider a phosphoresidue if score_diff > 0
+      if (score_diff > 0) {
+        phosphopositions <- append(phosphopositions, counter)
+      }
+      score_diff <- ""
+    }
+  }
+
+  #generate phosphopeptide sequence (ie, peptide sequence with "p"'s)
+  counter <- 1
+  phosphoposition_correction1 <- -1 #used to correct phosphosposition as "p"'s are inserted into the phosphopeptide string
+  phosphoposition_correction2 <- 0 #used to correct phosphosposition as "p"'s are inserted into the phosphopeptide string
+  while (counter <= length(phosphopositions) ) {
+    phosphopeptide <- paste(substr(phosphopeptide,0,phosphopositions[counter]+phosphoposition_correction1),"p",substr(phosphopeptide,phosphopositions[counter]+phosphoposition_correction2,nchar(phosphopeptide)),sep="")
+    counter <- counter + 1
+    phosphoposition_correction1 <- phosphoposition_correction1 + 1
+    phosphoposition_correction2 <- phosphoposition_correction2 + 1
+  }
+
+  #building phosphopeptide list
+  output <- append(output,phosphopeptide)
+  return(output)
+}
+##
+##  ## Run Proteomics Quality Control for MaxQuant Results
+##  (Bielow C et al. J Proteome Res. 2016 PMID: 26653327)
+##
+##  ```{r echo=FALSE}
+##  library(PTXQC)
+##  require(PTXQC)
+##  require(methods)
+##  txt_folder <- dir_txtfolder
+##  r = createReport(txt_folder)
+##  cat(paste0("\nReport has been generated inside the MaxQuant txt folder '", r$report_file, "'\n\n"))
+##  ```
+##
+##
+## Perform filters on contaminants, reverse sequences, and localization probability
+
+fullData <- read.table(file = inputFilename, sep ="\t", header=T, quote="")
+
+#Filter out contaminant rows and reverse rows
+filteredData <- subset(fullData,!grepl("CON__", Proteins))
+filteredData <- subset(filteredData,!grepl("_MYCOPLASMA", Proteins))
+filteredData <- subset(filteredData,!grepl("CONTAMINANT_", Proteins))
+filteredData <- subset(filteredData,!grepl("REV__", Protein)) #since REV__ rows are blank in the first column (Proteins)
+write.table(filteredData, file = "filteredData.txt", sep = "\t", col.names=TRUE, row.names=FALSE)
+
+#Data filtered by localization probability
+locProbFilteredData <- filteredData[filteredData$Localization.prob>=localProbCutoff,]
+
+##  Localization probability
+##  #visualize locprob cutoff
+locProbGraphData <- data.frame(
+  group = c(paste(">",toString(localProbCutoff),sep=""), paste("<",toString(localProbCutoff),sep="")),
+  value = c(nrow(locProbFilteredData)/nrow(filteredData)*100, (nrow(filteredData)-nrow(locProbFilteredData))/nrow(filteredData)*100)
+)
+##  library(ggplot2)
+##  locProbGraph <- ggplot(locProbGraphData, aes(x="", y=value, fill=group)) +
+##  geom_bar(width = 0.5, stat = "identity", color="black")
+##
+##  locProbGraph
+
+## Extract out quantitative values
+#Extract out quantitative values
+quantData <- locProbFilteredData[,seq(from=startCol, by=intervalCol, length.out=numSamples)]
+write.table(quantData, file = "quantdata.txt", sep = "\t", col.names=TRUE, row.names=FALSE)
+#head(quantData)
+##  ```
+##
+##
+##  ## Generate Phosphopeptide Sequence
+##  First few rows of the phosphopeptide column:
+#for latest version of MaxQuant (Version 1.5.3.30)
+dataTable <- data.frame(locProbFilteredData[,1:8],locProbFilteredData[,phosphoCol],locProbFilteredData[,phosphoCol+1],locProbFilteredData[,phosphoCol+2],locProbFilteredData[,phosphoCol+3],locProbFilteredData[,phosphoCol+4],locProbFilteredData[,phosphoCol+5],locProbFilteredData[,phosphoCol+6],locProbFilteredData[,phosphoCol+7],quantData)
+colnames(dataTable) <- c("Proteins","Positions within proteins", "Leading proteins", "Protein", "Protein names", "Gene names", "Fasta headers", "Localization prob", "Number of Phospho (STY)", "Amino Acid", "Sequence window","Modification window", "Peptide window coverage", "Phospho (STY) Probabilities", "Phospho (STY) Score diffs", "Position in peptide", colnames(quantData))
+##
+#Generate column in dataTable for the predicted phosphopeptide
+dataTable$Phosphopeptide <- apply(dataTable,1,phosphopeptide_func)
+##
+#Move Quant columns to the end
+movetolast <- function(data, move) { #moves columns to the end of dataframe
+  data[c(setdiff(names(data), move), move)]
+}
+dataTable <- movetolast(dataTable,c(colnames(quantData)))
+##
+#Make new data frame containing only Phosphopeptides to be mapped to quant data (merge_df)
+dataTable <- setDT(dataTable, keep.rownames=TRUE) #row name will be used to map
+merge_df <- data.frame(as.integer(dataTable$rn), dataTable$Phosphopeptide) #row index to merge data frames
+colnames(merge_df) <- c("rn", "Phosphopeptide")
+##
+##
+head(merge_df)
+##
+##
+##  First few rows of the MaxQuant phosphoresidue score differences: Sequence representation for each of the possible PTM positions in each possible configuration, the difference is calculated between the identification score with the PTM added to that position and the best scoring identification where no PTM is added to that position. When this value is negative, it is unlikely that the particular modification is located at this position.
+##  ```{r echo=FALSE}
+##
+##  head(locProbFilteredData$Phospho..STY..Score.diffs)
+##  ```
+##
+##  First few rows of data with Phosphopeptide Sequence.
+##  ```{r echo=FALSE }
+##  # Add Phosphopeptide column to quant columns for quality control checking
+quantData_qc <- as.data.frame(quantData)
+setDT(quantData_qc, keep.rownames=TRUE) #will use to match rowname to data
+quantData_qc$rn <- as.integer(quantData_qc$rn)
+quantData_qc <- merge(merge_df,quantData_qc, by="rn")
+quantData_qc$rn <- NULL #remove rn column
+head(quantData_qc)
+##
+##  ```
+##
+##
+##  ## Collapse multiphosphorylated peptides
+##
+##  Before collapse:
+##  ```{r echo=FALSE}
+##  summary(quantData_qc)
+##  ```
+##
+##
+##  After collapse:
+##  ```{r echo=FALSE}
+#Collapse multiphosphorylated peptides
+quantData_qc_collapsed <- data.table(quantData_qc, key = "Phosphopeptide")
+quantData_qc_collapsed <- aggregate(. ~ Phosphopeptide,quantData_qc, FUN= collapse_FUN)
+summary(quantData_qc_collapsed)
+##
+##  ```
+##
+##  % of phosphopeptides that are multiphosphorylated is roughly ~
+##  ```{r echo=FALSE}
+(nrow(quantData_qc) - nrow(quantData_qc_collapsed))/nrow(quantData_qc)/2
+##  ```
+##
+##  Breakdown of pY, pS, and pT before enrichment filter
+##  ```{r echo=FALSE}
+##  library(stringr)
+pY_data <- quantData_qc_collapsed[str_detect(quantData_qc_collapsed$Phosphopeptide, "pY"),]
+pS_data <- quantData_qc_collapsed[str_detect(quantData_qc_collapsed$Phosphopeptide, "pS"),]
+pT_data <- quantData_qc_collapsed[str_detect(quantData_qc_collapsed$Phosphopeptide, "pT"),]
+##
+pY_num <- nrow(pY_data)
+pS_num <- nrow(pS_data)
+pT_num <- nrow(pT_data)
+##
+##  #visualize enrichment
+enrichGraphData <- data.frame(
+  group = c("pY", "pS", "pT"),
+  value = c(pY_num, pS_num, pT_num)
+)
+##
+##  library(ggplot2)
+##  enrichGraph <- ggplot(enrichGraphData, aes(x="", y=value, fill=group)) +
+##  geom_bar(width = 0.5, stat = "identity", color="black") + coord_polar("y", start=0)
+##
+##  enrichGraph
+##  ```
+##
+##  ## Filter phosphopeptides by enrichment
+##  ```{r echo=FALSE}
+##  #Data filtered by enrichment
+##  library(stringr)
+if (enriched == "Y"){
+  quantData_qc_enrichment <- quantData_qc_collapsed[str_detect(quantData_qc_collapsed$Phosphopeptide, "pY"),]
+} else if ( enriched == "ST" ) {
+  quantData_qc_enrichment <- quantData_qc_collapsed[str_detect(quantData_qc_collapsed$Phosphopeptide, "pS") | str_detect(quantData_qc_collapsed$Phosphopeptide, "pT"),]
+} else { print("Error in enriched variable. Set to either 'Y' or 'ST'")}
+##  ```
+##
+##
+##  ```{r echo=FALSE}
+##  #output files
+##  #write.table(quantData_qc, file="BeforeCollapse.txt", sep="\t", row.names = FALSE) #for quality control
+##  #write.table(quantData_qc_collapsed, file="AfterCollapse.txt", sep="\t", row.names = FALSE) #for quality control
+write.table(quantData_qc_enrichment, file=outputfilename, sep="\t", quote = FALSE, row.names = FALSE)
+##  ```
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/maxquant_phosphopeptide_intensity.xml	Thu Nov 04 18:35:10 2021 +0000
@@ -0,0 +1,63 @@
+<tool id="maxquant_phosphopeptide_intensity" name="MaxQuant Phosphopeptide Intensity" version="0.1.0" python_template_version="3.5">
+    <description>for each sample</description>
+    <requirements>
+        <requirement type="package" version="1.14.2">r-data.table</requirement>
+        <requirement type="package" version="1.7.1">r-optparse</requirement>
+        <requirement type="package" version="1.4.0">r-stringr</requirement>
+    </requirements>
+    <command detect_errors="exit_code"><![CDATA[
+Rscript '$__tool_directory__/MaxQuantProcessingScript.R'
+-i '$phospho_sites'
+--enriched $enriched
+--phosphoCol $phosphoCol
+--numSamples $numSamples
+--startCol $startCol
+--intervalCol $intervalCol
+--localProbCutoff $localProbCutoff
+--collapse_func $collapse_func
+-o phosphopeptide_intensities.tsv
+&& head phosphopeptide_intensities.tsv
+
+    ]]></command>
+    <inputs>
+        <param name="phospho_sites" type="data" format="tabular" label="MaxQuant Phospho (STY)Sites.txt"/>
+        <param name="phosphoCol" type="data_column" numerical="false" data_ref="phospho_sites" label="Column: Number of Phospho (STY)" />
+        <param name="numSamples" type="integer" value="1" min="1" label="Number of samples or runs"/>
+        <param name="startCol" type="data_column" numerical="false" data_ref="phospho_sites" label="Column: Intensity of the first sample" />
+        <param name="intervalCol" type="integer" value="1" min="1" label="Interval between the intensity column of samples" help="eg, 1 if subsequent column; 2 if every other column"/>
+        <param name="enriched" type="select" label="Phospho enrichemnt type">
+            <option value="ST">ST</option>
+            <option value="Y">Y</option>
+        </param>
+        <param name="collapse_func" type="select" label="Intensity merge function">
+            <option value="sum">sum</option>
+            <option value="averge">averge</option>
+        </param>
+        <param name="localProbCutoff" type="float" value="0.75" min="0" max="1.0" label="Localization Probability Cutoff"/>
+    </inputs>
+    <outputs>
+        <data name="output" format="tabular" from_work_dir="phosphopeptide_intensities.tsv"/>
+    </outputs>
+    <tests>
+        <test>
+            <param name="phospho_sites" ftype="tabular" value="Phospho (ST)Sites_NancyDu.txt"/>
+            <param name="phosphoCol" value="37"/>
+            <param name="numSamples" value="6"/>
+            <param name="startCol" value="58"/>
+            <param name="intervalCol" value="1"/>
+            <param name="enriched" value="ST"/>
+            <param name="collapse_func" value="sum"/>
+            <param name="localProbCutoff" value="0.75"/>
+            <output name="output">
+                <assert_contents>
+                    <has_text text="Phosphopeptide" />
+                    <has_text text="AAAAAAAGDpSDpSWDADAFSVEDPVRK" />
+                    <has_text text="997800000" />
+                </assert_contents>
+            </output>
+        </test>
+    </tests>
+    <help><![CDATA[
+        TODO: Fill in help.
+    ]]></help>
+</tool>