Mercurial > repos > galaxyp > maxquant_phosphopeptide_intensity
view MaxQuantProcessingScript.R @ 0:b09ed1684301 draft default tip
"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/maxquant_phosphopeptide_intensity commit c41945617c468ca66813c95de6a5e6fe0edb0c15"
author | galaxyp |
---|---|
date | Thu, 04 Nov 2021 18:35:10 +0000 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/env Rscript # libraries library(optparse) library(data.table) library(stringr) #library(ggplot2) #library(PTXQC) #require(PTXQC) #require(methods) # title: "MaxQuant Processing Script" # author: "Larry Cheng" # date: "February 19, 2018" # # # MaxQuant Processing Script # Takes MaxQuant Phospho (STY)sites.txt file as input and performs the following (in order): # 1) Runs the Proteomics Quality Control software # 2) Remove contaminant and reverse sequence rows # 3) Filters rows based on localization probability # 4) Extract the quantitative data # 5) Sequences phosphopeptides # 6) Merges multiply phosphorylated peptides # 7) Filters out phosphopeptides based on enrichment # The output file contains the phosphopeptide (first column) and the quantitative values for each sample # # ## Revision History # Rev. 2018-02-19 :break up analysis script into "MaxQuant Processing Script" and "Phosphopeptide Processing Script" # Rev. 2017-12-12 :added PTXQC # added additional plots and table outputs for quality control # allowed for more than 2 samples to be grouped together (up to 26 (eg, 1A, 1B, 1C, etc))regexSampleNames <- # "\\.(\\d+)[A-Z]$" # converted from .r to .rmd file to knit report for quality control # Rev. 2016-09-11 :automated the FDR cutoffs; removed the option to data impute multiple times # Rev. 2016-09-09 :added filter to eliminate contaminant and reverse sequence rows # Rev. 2016-09-01 :moved the collapse step from after ANOVA filter to prior to preANOVA file output # Rev. 2016-08-22 :changed regexpression to regexSampleNames <- "\\.(\\d+)[AB]$" so that it looks at the end of string # Rev. 2016-08-05 :Removed vestigial line (ppeptides <- ....) # Rev. 2016-07-03 :Removed row names from the write.table() output for ANOVA and PreANOVA # Rev. 2016-06-25 :Set default Localization Probability cutoff to 0.75 # Rev. 2016-06-23 :fixed a bug in filtering for pY enrichment by resetting the row numbers afterwards # Rev. 2016-06-21 :test18 + standardized the regexpression in protocol ## Variables to change for each input file ## ```{r} ## #Location of MaxQuant txt folder ## # C:\Users\art\src\ppenrich\postMQanalysis\step2.1 ## dir_txtfolder <- "C:/Users/art/src/ppenrich/postMQanalysis/step2.1" ## #Input Filename ## inputFilename <- paste0(dir"/Phospho (STY)Sites.txt" ## #Column number for the Intensity of the first sample ## startCol <- 58 ## #Number of runs ## numSamples <- 6 ## #Column number of "Number.of.Phospho..STY." ## phosphoCol <- 37 ## #pY or pST enriched samples (ie, "Y" or "ST") ## enriched <- "ST" ## #output filename ## outputfilename = "outputfile_STEP2.txt" ## ``` ## ## ## Other modifiable variables ## ```{r} ## #Column interval between the Intensities of samples (eg, 1 if subsequent column; 2 if every other column) ## intervalCol <- 1 ## #Localization Probability Cutoff ## localProbCutoff <- 0.75 ## #merge identical phosphopeptides by ("sum" or "average") the intensities ## collapse_FUN = "sum" ## ``` # parse options option_list <- list( make_option( c("-i", "--input"), action = "store", default = NA, type = "character", help = "A MaxQuant Phospho (STY)Sites.txt" ), make_option( c("-o", "--output"), action = "store", default = "output.tsv", type = "character", help = "output file" ), make_option( c("-e", "--enriched"), action = "store", default = "output.tsv", type = "character", help = "pY or pST enriched samples (ie, 'Y' or 'ST')" ), make_option( c("-n", "--numSamples"), action = "store", default = 3, type = "integer", help = "Number of samples" ), make_option( c("-p", "--phosphoCol"), action = "store", default = 13, type = "integer", help = "Column number of Number.of.Phospho..STY." ), make_option( c("-s", "--startCol"), action = "store", default = 24, type = "integer", help = "Column number for the Intensity of the first sample" ), make_option( c("-I", "--intervalCol"), action = "store", default = 1, type = "integer", help = "Column interval between the Intensities of samples (eg, 1 if subsequent column; 2 if every other column" ), make_option( c("-l", "--localProbCutoff"), action = "store", default = 0.75, type = "double", help = "Localization Probability Cutoff" ), make_option( c("-f", "--collapse_func"), action = "store", default = "sum", type = "character", help = "merge identical phosphopeptides by ('sum' or 'average') the intensities" ) ) args <- parse_args(OptionParser(option_list=option_list)) # Check parameter values if (! file.exists(args$input)) { stop((paste("File", args$input, "does not exist"))) } inputFilename <- args$input outputfilename <- args$output localProbCutoff <- args$localProbCutoff numSamples <- args$numSamples phosphoCol <- args$phosphoCol startCol <- args$startCol intervalCol <- args$intervalCol enriched <- "ST" collapse_FUN = args$collapse_func ### FUNCTIONS #function to generate phosphopeptide and build list when applied phosphopeptide_func <- function(df) { #generate peptide sequence and list of phosphopositions phosphoprobsequence <- strsplit(as.character(df["Phospho (STY) Score diffs"]), "")[[1]] output <- vector() phosphopeptide <- "" counter <- 0 #keep track of position in peptide phosphopositions <- vector() #keep track of phosphorylation positions in peptide score_diff <- "" for (chara in phosphoprobsequence){ #build peptide sequence if (!(chara == " " | chara == "(" | chara == ")" | chara =="." | chara =="-" | chara == "0" | chara == "1" | chara == "2" | chara == "3" | chara =="4" | chara == "5" | chara == "6" | chara == "7" | chara =="8" | chara =="9")) { phosphopeptide <- paste(phosphopeptide,chara,sep="") counter <- counter + 1 } #generate score_diff if (chara == "-" | chara =="." | chara == "0" | chara == "1" | chara == "2" | chara == "3" | chara =="4" | chara == "5" | chara == "6" | chara == "7" | chara =="8" | chara =="9"){ score_diff <- paste(score_diff,chara,sep="") } #evaluate score_diff if (chara == ")" ){ score_diff <- as.numeric(score_diff) #only consider a phosphoresidue if score_diff > 0 if (score_diff > 0) { phosphopositions <- append(phosphopositions, counter) } score_diff <- "" } } #generate phosphopeptide sequence (ie, peptide sequence with "p"'s) counter <- 1 phosphoposition_correction1 <- -1 #used to correct phosphosposition as "p"'s are inserted into the phosphopeptide string phosphoposition_correction2 <- 0 #used to correct phosphosposition as "p"'s are inserted into the phosphopeptide string while (counter <= length(phosphopositions) ) { phosphopeptide <- paste(substr(phosphopeptide,0,phosphopositions[counter]+phosphoposition_correction1),"p",substr(phosphopeptide,phosphopositions[counter]+phosphoposition_correction2,nchar(phosphopeptide)),sep="") counter <- counter + 1 phosphoposition_correction1 <- phosphoposition_correction1 + 1 phosphoposition_correction2 <- phosphoposition_correction2 + 1 } #building phosphopeptide list output <- append(output,phosphopeptide) return(output) } ## ## ## Run Proteomics Quality Control for MaxQuant Results ## (Bielow C et al. J Proteome Res. 2016 PMID: 26653327) ## ## ```{r echo=FALSE} ## library(PTXQC) ## require(PTXQC) ## require(methods) ## txt_folder <- dir_txtfolder ## r = createReport(txt_folder) ## cat(paste0("\nReport has been generated inside the MaxQuant txt folder '", r$report_file, "'\n\n")) ## ``` ## ## ## Perform filters on contaminants, reverse sequences, and localization probability fullData <- read.table(file = inputFilename, sep ="\t", header=T, quote="") #Filter out contaminant rows and reverse rows filteredData <- subset(fullData,!grepl("CON__", Proteins)) filteredData <- subset(filteredData,!grepl("_MYCOPLASMA", Proteins)) filteredData <- subset(filteredData,!grepl("CONTAMINANT_", Proteins)) filteredData <- subset(filteredData,!grepl("REV__", Protein)) #since REV__ rows are blank in the first column (Proteins) write.table(filteredData, file = "filteredData.txt", sep = "\t", col.names=TRUE, row.names=FALSE) #Data filtered by localization probability locProbFilteredData <- filteredData[filteredData$Localization.prob>=localProbCutoff,] ## Localization probability ## #visualize locprob cutoff locProbGraphData <- data.frame( group = c(paste(">",toString(localProbCutoff),sep=""), paste("<",toString(localProbCutoff),sep="")), value = c(nrow(locProbFilteredData)/nrow(filteredData)*100, (nrow(filteredData)-nrow(locProbFilteredData))/nrow(filteredData)*100) ) ## library(ggplot2) ## locProbGraph <- ggplot(locProbGraphData, aes(x="", y=value, fill=group)) + ## geom_bar(width = 0.5, stat = "identity", color="black") ## ## locProbGraph ## Extract out quantitative values #Extract out quantitative values quantData <- locProbFilteredData[,seq(from=startCol, by=intervalCol, length.out=numSamples)] write.table(quantData, file = "quantdata.txt", sep = "\t", col.names=TRUE, row.names=FALSE) #head(quantData) ## ``` ## ## ## ## Generate Phosphopeptide Sequence ## First few rows of the phosphopeptide column: #for latest version of MaxQuant (Version 1.5.3.30) dataTable <- data.frame(locProbFilteredData[,1:8],locProbFilteredData[,phosphoCol],locProbFilteredData[,phosphoCol+1],locProbFilteredData[,phosphoCol+2],locProbFilteredData[,phosphoCol+3],locProbFilteredData[,phosphoCol+4],locProbFilteredData[,phosphoCol+5],locProbFilteredData[,phosphoCol+6],locProbFilteredData[,phosphoCol+7],quantData) colnames(dataTable) <- c("Proteins","Positions within proteins", "Leading proteins", "Protein", "Protein names", "Gene names", "Fasta headers", "Localization prob", "Number of Phospho (STY)", "Amino Acid", "Sequence window","Modification window", "Peptide window coverage", "Phospho (STY) Probabilities", "Phospho (STY) Score diffs", "Position in peptide", colnames(quantData)) ## #Generate column in dataTable for the predicted phosphopeptide dataTable$Phosphopeptide <- apply(dataTable,1,phosphopeptide_func) ## #Move Quant columns to the end movetolast <- function(data, move) { #moves columns to the end of dataframe data[c(setdiff(names(data), move), move)] } dataTable <- movetolast(dataTable,c(colnames(quantData))) ## #Make new data frame containing only Phosphopeptides to be mapped to quant data (merge_df) dataTable <- setDT(dataTable, keep.rownames=TRUE) #row name will be used to map merge_df <- data.frame(as.integer(dataTable$rn), dataTable$Phosphopeptide) #row index to merge data frames colnames(merge_df) <- c("rn", "Phosphopeptide") ## ## head(merge_df) ## ## ## First few rows of the MaxQuant phosphoresidue score differences: Sequence representation for each of the possible PTM positions in each possible configuration, the difference is calculated between the identification score with the PTM added to that position and the best scoring identification where no PTM is added to that position. When this value is negative, it is unlikely that the particular modification is located at this position. ## ```{r echo=FALSE} ## ## head(locProbFilteredData$Phospho..STY..Score.diffs) ## ``` ## ## First few rows of data with Phosphopeptide Sequence. ## ```{r echo=FALSE } ## # Add Phosphopeptide column to quant columns for quality control checking quantData_qc <- as.data.frame(quantData) setDT(quantData_qc, keep.rownames=TRUE) #will use to match rowname to data quantData_qc$rn <- as.integer(quantData_qc$rn) quantData_qc <- merge(merge_df,quantData_qc, by="rn") quantData_qc$rn <- NULL #remove rn column head(quantData_qc) ## ## ``` ## ## ## ## Collapse multiphosphorylated peptides ## ## Before collapse: ## ```{r echo=FALSE} ## summary(quantData_qc) ## ``` ## ## ## After collapse: ## ```{r echo=FALSE} #Collapse multiphosphorylated peptides quantData_qc_collapsed <- data.table(quantData_qc, key = "Phosphopeptide") quantData_qc_collapsed <- aggregate(. ~ Phosphopeptide,quantData_qc, FUN= collapse_FUN) summary(quantData_qc_collapsed) ## ## ``` ## ## % of phosphopeptides that are multiphosphorylated is roughly ~ ## ```{r echo=FALSE} (nrow(quantData_qc) - nrow(quantData_qc_collapsed))/nrow(quantData_qc)/2 ## ``` ## ## Breakdown of pY, pS, and pT before enrichment filter ## ```{r echo=FALSE} ## library(stringr) pY_data <- quantData_qc_collapsed[str_detect(quantData_qc_collapsed$Phosphopeptide, "pY"),] pS_data <- quantData_qc_collapsed[str_detect(quantData_qc_collapsed$Phosphopeptide, "pS"),] pT_data <- quantData_qc_collapsed[str_detect(quantData_qc_collapsed$Phosphopeptide, "pT"),] ## pY_num <- nrow(pY_data) pS_num <- nrow(pS_data) pT_num <- nrow(pT_data) ## ## #visualize enrichment enrichGraphData <- data.frame( group = c("pY", "pS", "pT"), value = c(pY_num, pS_num, pT_num) ) ## ## library(ggplot2) ## enrichGraph <- ggplot(enrichGraphData, aes(x="", y=value, fill=group)) + ## geom_bar(width = 0.5, stat = "identity", color="black") + coord_polar("y", start=0) ## ## enrichGraph ## ``` ## ## ## Filter phosphopeptides by enrichment ## ```{r echo=FALSE} ## #Data filtered by enrichment ## library(stringr) if (enriched == "Y"){ quantData_qc_enrichment <- quantData_qc_collapsed[str_detect(quantData_qc_collapsed$Phosphopeptide, "pY"),] } else if ( enriched == "ST" ) { quantData_qc_enrichment <- quantData_qc_collapsed[str_detect(quantData_qc_collapsed$Phosphopeptide, "pS") | str_detect(quantData_qc_collapsed$Phosphopeptide, "pT"),] } else { print("Error in enriched variable. Set to either 'Y' or 'ST'")} ## ``` ## ## ## ```{r echo=FALSE} ## #output files ## #write.table(quantData_qc, file="BeforeCollapse.txt", sep="\t", row.names = FALSE) #for quality control ## #write.table(quantData_qc_collapsed, file="AfterCollapse.txt", sep="\t", row.names = FALSE) #for quality control write.table(quantData_qc_enrichment, file=outputfilename, sep="\t", quote = FALSE, row.names = FALSE) ## ```