comparison MaxQuantProcessingScript.R @ 0:c1403d18c189 draft

"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
author eschen42
date Mon, 07 Mar 2022 19:05:01 +0000
parents
children d4d531006735
comparison
equal deleted inserted replaced
-1:000000000000 0:c1403d18c189
1 #!/usr/bin/env Rscript
2
3 # This is the implementation for the
4 # "MaxQuant Phosphopeptide Localization Probability Cutoff"
5 # Galaxy tool (mqppep_lclztn_filter)
6 # It is adapted from the MaxQuant Processing Script written by Larry Cheng.
7
8 # libraries
9 library(optparse)
10 library(data.table)
11 library(stringr)
12 library(ggplot2)
13 #library(PTXQC)
14 #require(PTXQC)
15 #require(methods)
16
17 # title: "MaxQuant Processing Script"
18 # author: "Larry Cheng"
19 # date: "February 19, 2018"
20 #
21 # # MaxQuant Processing Script
22 # Takes MaxQuant Phospho (STY)sites.txt file as input and performs the following (in order):
23 # 1) Runs the Proteomics Quality Control software
24 # 2) Remove contaminant and reverse sequence rows
25 # 3) Filters rows based on localization probability
26 # 4) Extract the quantitative data
27 # 5) Sequences phosphopeptides
28 # 6) Merges multiply phosphorylated peptides
29 # 7) Filters out phosphopeptides based on enrichment
30 # The output file contains the phosphopeptide (first column) and the quantitative values for each sample
31 #
32 # ## Revision History
33 # Rev. 2022-02-10 :wrap for inclusion in Galaxy
34 # Rev. 2018-02-19 :break up analysis script into "MaxQuant Processing Script" and "Phosphopeptide Processing Script"
35 # Rev. 2017-12-12 :added PTXQC
36 # added additional plots and table outputs for quality control
37 # allowed for more than 2 samples to be grouped together (up to 26 (eg, 1A, 1B, 1C, etc))regexSampleNames <-
38 # "\\.(\\d+)[A-Z]$"
39 # converted from .r to .rmd file to knit report for quality control
40 # Rev. 2016-09-11 :automated the FDR cutoffs; removed the option to data impute multiple times
41 # Rev. 2016-09-09 :added filter to eliminate contaminant and reverse sequence rows
42 # Rev. 2016-09-01 :moved the collapse step from after ANOVA filter to prior to preANOVA file output
43 # Rev. 2016-08-22 :changed regexpression to regexSampleNames <- "\\.(\\d+)[AB]$" so that it looks at the end of string
44 # Rev. 2016-08-05 :Removed vestigial line (ppeptides <- ....)
45 # Rev. 2016-07-03 :Removed row names from the write.table() output for ANOVA and PreANOVA
46 # Rev. 2016-06-25 :Set default Localization Probability cutoff to 0.75
47 # Rev. 2016-06-23 :fixed a bug in filtering for pY enrichment by resetting the row numbers afterwards
48 # Rev. 2016-06-21 :test18 + standardized the regexpression in protocol
49
50
51 ### FUNCTION DECLARATIONS begin ----------------------------------------------
52
53 # Read first line of file at filePath
54 # adapted from: https://stackoverflow.com/a/35761217/15509512
55 readFirstLine <- function(filepath) {
56 con = file(filepath, "r")
57 line = readLines(con, n = 1)
58 close(con)
59 return(line)
60 }
61
62 # Move columns to the end of dataframe
63 # - data: the dataframe
64 # - move: a vector of column names, each of which is an element of names(data)
65 movetolast <- function(data, move) {
66 data[c(setdiff(names(data), move), move)]
67 }
68
69 # Generate phosphopeptide and build list when applied
70 phosphopeptide_func <- function(df) {
71
72 #generate peptide sequence and list of phosphopositions
73 phosphoprobsequence <- strsplit(as.character(df["Phospho (STY) Score diffs"]), "")[[1]]
74 output <- vector()
75 phosphopeptide <- ""
76 counter <- 0 #keep track of position in peptide
77 phosphopositions <- vector() #keep track of phosphorylation positions in peptide
78 score_diff <- ""
79 for (chara in phosphoprobsequence){
80 #build peptide sequence
81 if (!(chara == " " | chara == "(" | chara == ")" | chara =="." | chara =="-" | chara == "0" | chara == "1" | chara == "2" | chara == "3" | chara =="4" | chara == "5" | chara == "6" | chara == "7" | chara =="8" | chara =="9")) {
82 phosphopeptide <- paste(phosphopeptide,chara,sep="")
83 counter <- counter + 1
84 }
85 #generate score_diff
86 if (chara == "-" | chara =="." | chara == "0" | chara == "1" | chara == "2" | chara == "3" | chara =="4" | chara == "5" | chara == "6" | chara == "7" | chara =="8" | chara =="9"){
87 score_diff <- paste(score_diff,chara,sep="")
88 }
89 #evaluate score_diff
90 if (chara == ")" ){
91 score_diff <- as.numeric(score_diff)
92 #only consider a phosphoresidue if score_diff > 0
93 if (score_diff > 0) {
94 phosphopositions <- append(phosphopositions, counter)
95 }
96 score_diff <- ""
97 }
98 }
99
100 #generate phosphopeptide sequence (ie, peptide sequence with "p"'s)
101 counter <- 1
102 phosphoposition_correction1 <- -1 #used to correct phosphosposition as "p"'s are inserted into the phosphopeptide string
103 phosphoposition_correction2 <- 0 #used to correct phosphosposition as "p"'s are inserted into the phosphopeptide string
104 while (counter <= length(phosphopositions) ) {
105 phosphopeptide <- paste(substr(phosphopeptide,0,phosphopositions[counter]+phosphoposition_correction1),"p",substr(phosphopeptide,phosphopositions[counter]+phosphoposition_correction2,nchar(phosphopeptide)),sep="")
106 counter <- counter + 1
107 phosphoposition_correction1 <- phosphoposition_correction1 + 1
108 phosphoposition_correction2 <- phosphoposition_correction2 + 1
109 }
110
111 #building phosphopeptide list
112 output <- append(output,phosphopeptide)
113 return(output)
114 }
115
116 ### FUNCTION DECLARATIONS end ------------------------------------------------
117
118
119 ### EXTRACT ARGUMENTS begin --------------------------------------------------
120
121 # parse options
122 option_list <- list(
123 make_option(
124 c("-i", "--input"),
125 action = "store",
126 type = "character",
127 help = "A MaxQuant Phospho (STY)Sites.txt"
128 )
129 , make_option(
130 c("-o", "--output"),
131 action = "store",
132 type = "character",
133 help = "path to output file"
134 )
135 , make_option(
136 c("-E", "--enrichGraph"),
137 action = "store",
138 type = "character",
139 help = "path to enrichment graph PDF"
140 )
141 , make_option(
142 c("-F", "--enrichGraph_svg"),
143 action = "store",
144 type = "character",
145 help = "path to enrichment graph SVG"
146 )
147 , make_option(
148 c("-L", "--locProbCutoffGraph"),
149 action = "store",
150 type = "character",
151 help = "path to location-proability cutoff graph PDF"
152 )
153 , make_option(
154 c("-M", "--locProbCutoffGraph_svg"),
155 action = "store",
156 type = "character",
157 help = "path to location-proability cutoff graph SVG"
158 )
159 , make_option(
160 c("-e", "--enriched"),
161 action = "store",
162 type = "character",
163 help = "pY or pST enriched samples (ie, 'Y' or 'ST')"
164 )
165 # default = "^Number of Phospho [(]STY[)]$",
166 , make_option(
167 c("-p", "--phosphoCol"),
168 action = "store",
169 type = "character",
170 help = "PERL-compatible regular expression matching header of column having number of 'Phospho (STY)'"
171 )
172 # default = "^Intensity[^_]",
173 , make_option(
174 c("-s", "--startCol"),
175 action = "store",
176 type = "character",
177 help = "PERL-compatible regular expression matching column header having first sample intensity"
178 )
179 # default = 1,
180 , make_option(
181 c("-I", "--intervalCol"),
182 action = "store",
183 type = "integer",
184 help = "Column interval between the Intensities of samples (eg, 1 if subsequent column; 2 if every other column"
185 )
186 # default = 0.75,
187 , make_option(
188 c("-l", "--localProbCutoff"),
189 action = "store",
190 type = "double",
191 help = "Localization Probability Cutoff"
192 )
193 # default = "sum",
194 , make_option(
195 c("-f", "--collapse_func"),
196 action = "store",
197 type = "character",
198 help = "merge identical phosphopeptides by ('sum' or 'average') the intensities"
199 )
200 # default = "filteredData.txt",
201 , make_option(
202 c("-r", "--filtered_data"),
203 action = "store",
204 type = "character",
205 help = "filteredData.txt"
206 )
207 # default = "quantData.txt",
208 , make_option(
209 c("-q", "--quant_data"),
210 action = "store",
211 type = "character",
212 help = "quantData.txt"
213 )
214 )
215 args <- parse_args(OptionParser(option_list=option_list))
216 # Check parameter values
217
218 ### EXTRACT ARGUMENTS end ----------------------------------------------------
219
220
221 ### EXTRACT PARAMETERS from arguments begin ----------------------------------
222
223 if (! file.exists(args$input)) {
224 stop((paste("File", args$input, "does not exist")))
225 }
226
227 phosphoColPattern <- "^Number of Phospho [(][STY][STY]*[)]$"
228 startColPattern <- "^Intensity[^_]"
229 phosphoColPattern <- readFirstLine(args$phosphoCol)
230 startColPattern <- readFirstLine(args$startCol)
231
232 sink(getConnection(2))
233 #ACE print(paste("phosphoColPattern", phosphoColPattern))
234 #ACE print(paste("startColPattern", startColPattern))
235
236 inputFilename <- args$input
237 filteredFilename <- args$filtered_data
238 quantFilename <- args$quant_data
239 intervalCol <- as.integer(args$intervalCol)
240
241 firstLine <- readFirstLine(inputFilename)
242 columnHeaders <- unlist(strsplit(x=firstLine, split=c('\t'), fixed=TRUE))
243 sink(getConnection(2))
244 #ACE print("columnHeaders")
245 #ACE print(columnHeaders)
246 sink()
247
248
249 intensityHeaderCols <- grep(pattern=startColPattern, x=columnHeaders, perl=TRUE)
250 if ( length(intensityHeaderCols) == 0) {
251 err_msg <- paste("Found no intensity columns matching pattern:", startColPattern)
252 # Divert output to stderr
253 sink(getConnection(2))
254 print(err_msg)
255 sink()
256 stop(err_msg)
257 }
258
259
260 phosphoCol <- grep(pattern=phosphoColPattern, x=columnHeaders, perl=TRUE)[1]
261 if (is.na(phosphoCol)) {
262 err_msg <- paste("Found no 'number of phospho sites' columns matching pattern:", phosphoColPattern)
263 # Divert output to stderr
264 sink(getConnection(2))
265 print(err_msg)
266 sink()
267 stop(err_msg)
268 }
269
270
271 i_count <- 0
272 this_column <- 1
273 last_value <- intensityHeaderCols[1]
274 intensityCols <- c(last_value)
275
276 while ( length(intensityHeaderCols) >= intervalCol * i_count ) {
277 i_count <- 1 + i_count
278 this_column <- intervalCol + this_column
279 if ( last_value + intervalCol != intensityHeaderCols[this_column] ) break
280 last_value <- intensityHeaderCols[this_column]
281 if (length(intensityHeaderCols) < intervalCol * i_count) break
282 intensityCols <- c(intensityCols, intensityHeaderCols[this_column])
283 }
284
285 startCol <- intensityCols[1]
286 numSamples <- i_count
287
288 outputfilename <- args$output
289 enrichGraphFilename <- args$enrichGraph
290 locProbCutoffGraphFilename <- args$locProbCutoffGraph
291 enrichGraphFilename_svg <- args$enrichGraph_svg
292 locProbCutoffGraphFilename_svg <- args$locProbCutoffGraph_svg
293
294 localProbCutoff <- args$localProbCutoff
295 enriched <- args$enriched
296 collapse_FUN <- args$collapse_func
297
298 ### EXTRACT PARAMETERS from arguments end ------------------------------------
299
300
301 # Proteomics Quality Control for MaxQuant Results
302 # (Bielow C et al. J Proteome Res. 2016 PMID: 26653327)
303 # is run by the Galaxy MaxQuant wrapper and need not be invoked here.
304
305
306 # Read data, filtering out contaminants, reverse sequences, and localization probability
307 # ---
308 fullData <- read.table(file = inputFilename, sep ="\t", header=T, quote="")
309
310 #Filter out contaminant rows and reverse rows
311 filteredData <- subset(fullData,!grepl("CON__", Proteins))
312 filteredData <- subset(filteredData,!grepl("_MYCOPLASMA", Proteins))
313 filteredData <- subset(filteredData,!grepl("CONTAMINANT_", Proteins))
314 filteredData <- subset(filteredData,!grepl("REV__", Protein)) #since REV__ rows are blank in the first column (Proteins)
315 write.table(filteredData, file = filteredFilename, sep = "\t", quote=FALSE, col.names=TRUE, row.names=FALSE)
316 # ...
317
318
319 # Filter out data with localization probability below localProbCutoff
320 # ---
321 #Data filtered by localization probability
322 locProbFilteredData <- filteredData[filteredData$Localization.prob>=localProbCutoff,]
323 # ...
324
325
326 # Localization probability -- visualize locprob cutoff
327 # ---
328 locProbGraphData <- data.frame(
329 group = c(paste(">",toString(localProbCutoff),sep=""), paste("<",toString(localProbCutoff),sep="")),
330 value = c(nrow(locProbFilteredData)/nrow(filteredData)*100, (nrow(filteredData)-nrow(locProbFilteredData))/nrow(filteredData)*100)
331 )
332 gigi <-
333 ggplot(locProbGraphData, aes(x = "", y = value, fill = group)) +
334 geom_bar(width = 0.5, stat = "identity", color = "black") +
335 labs(
336 x = NULL
337 , y = "percent"
338 , title = "Phosphopeptides partitioned by localization-probability cutoff"
339 ) +
340 scale_fill_discrete(name = "phosphopeptide\nlocalization-\nprobability") +
341 theme_minimal() +
342 theme(
343 legend.position = "right"
344 , legend.title=element_text()
345 , plot.title = element_text(hjust = 0.5)
346 , plot.subtitle = element_text(hjust = 0.5)
347 , plot.title.position = "plot"
348 )
349 pdf(locProbCutoffGraphFilename)
350 print(gigi)
351 dev.off()
352 svg(locProbCutoffGraphFilename_svg)
353 print(gigi)
354 dev.off()
355 # ...
356
357
358 # Extract quantitative values from filtered data
359 # ---
360 quantData <- locProbFilteredData[,seq(from=startCol, by=intervalCol, length.out=numSamples)]
361 # ...
362
363
364 # Generate Phosphopeptide Sequence
365 # for latest version of MaxQuant (Version 1.5.3.30)
366 # ---
367 dataTable <- data.frame(locProbFilteredData[,1:8],locProbFilteredData[,phosphoCol],locProbFilteredData[,phosphoCol+1],locProbFilteredData[,phosphoCol+2],locProbFilteredData[,phosphoCol+3],locProbFilteredData[,phosphoCol+4],locProbFilteredData[,phosphoCol+5],locProbFilteredData[,phosphoCol+6],locProbFilteredData[,phosphoCol+7],quantData)
368 colnames(dataTable) <- c("Proteins","Positions within proteins", "Leading proteins", "Protein", "Protein names", "Gene names", "Fasta headers", "Localization prob", "Number of Phospho (STY)", "Amino Acid", "Sequence window","Modification window", "Peptide window coverage", "Phospho (STY) Probabilities", "Phospho (STY) Score diffs", "Position in peptide", colnames(quantData))
369 # 'phosphopeptide_func' generates a phosphopeptide sequence for each row of data.
370 # for the 'apply' function: MARGIN 1 == rows, 2 == columns, c(1,2) = both
371 dataTable$Phosphopeptide <- apply(X=dataTable, MARGIN=1, FUN=phosphopeptide_func)
372 # Move the quant data columns to the right end of the data.frame
373 dataTable <- movetolast(dataTable,c(colnames(quantData)))
374 # ...
375
376
377 # Write quantitative values for debugging purposes
378 # ---
379 quantWrite <- cbind( dataTable[,"Sequence window"], quantData )
380 colnames(quantWrite)[1] <- "Sequence.Window"
381 write.table(quantWrite, file = quantFilename, sep = "\t", quote=FALSE, col.names=TRUE, row.names=FALSE)
382 # ...
383
384
385 # Make new data frame containing only Phosphopeptides to be mapped to quant data (merge_df)
386 # ---
387 dataTable <- setDT(dataTable, keep.rownames=TRUE) #row name will be used to map
388 merge_df <- data.frame(as.integer(dataTable$rn), dataTable$Phosphopeptide) #row index to merge data frames
389 colnames(merge_df) <- c("rn", "Phosphopeptide")
390 # ...
391
392
393 # Add Phosphopeptide column to quant columns for quality control checking
394 # ---
395 quantData_qc <- as.data.frame(quantData)
396 setDT(quantData_qc, keep.rownames=TRUE) #will use to match rowname to data
397 quantData_qc$rn <- as.integer(quantData_qc$rn)
398 quantData_qc <- merge(merge_df,quantData_qc, by="rn")
399 quantData_qc$rn <- NULL #remove rn column
400 # ...
401
402
403 # Collapse multiphosphorylated peptides
404 # ---
405 quantData_qc_collapsed <- data.table(quantData_qc, key = "Phosphopeptide")
406 quantData_qc_collapsed <- aggregate(. ~ Phosphopeptide,quantData_qc, FUN= collapse_FUN)
407 # ...
408
409
410 # Compute (as string) % of phosphopeptides that are multiphosphorylated (for use in next step)
411 # ---
412 pct_multiphos <- (nrow(quantData_qc) - nrow(quantData_qc_collapsed)) / (2 * nrow(quantData_qc))
413 pct_multiphos <- sprintf("%0.1f%s", 100 * pct_multiphos, "%")
414 # ...
415
416
417 # Compute and visualize breakdown of pY, pS, and pT before enrichment filter
418 # ---
419 pY_data <- quantData_qc_collapsed[str_detect(quantData_qc_collapsed$Phosphopeptide, "pY"),]
420 pS_data <- quantData_qc_collapsed[str_detect(quantData_qc_collapsed$Phosphopeptide, "pS"),]
421 pT_data <- quantData_qc_collapsed[str_detect(quantData_qc_collapsed$Phosphopeptide, "pT"),]
422
423 pY_num <- nrow(pY_data)
424 pS_num <- nrow(pS_data)
425 pT_num <- nrow(pT_data)
426
427 # Visualize enrichment
428 enrichGraphData <- data.frame(
429 group = c("pY", "pS", "pT"),
430 value = c(pY_num, pS_num, pT_num)
431 )
432
433 enrichGraphData <- enrichGraphData[enrichGraphData$value > 0,]
434
435 # Plot pie chart with legend
436 # start: https://stackoverflow.com/a/62522478/15509512
437 # refine: https://www.statology.org/ggplot-pie-chart/
438 # colors: https://colorbrewer2.org/#type=diverging&scheme=BrBG&n=8
439 slices <- enrichGraphData$value
440 phosphoresidue <- enrichGraphData$group
441 pct <- round(100 * slices / sum(slices))
442 lbls <- paste(enrichGraphData$group,"\n",pct, "%\n(", slices, ")", sep="")
443 slc_ctr <- c()
444 run_tot <- 0
445 for (p in pct) {
446 slc_ctr <- c(slc_ctr, run_tot + p/2.0)
447 run_tot <- run_tot + p
448 }
449 lbl_y <- 100 - slc_ctr
450 df <- data.frame(slices, pct, lbls, phosphoresidue = factor(phosphoresidue, levels = phosphoresidue))
451 gigi <- ggplot(
452 df
453 , aes(x = 1, y = pct, fill = phosphoresidue)) +
454 geom_col(position = "stack", orientation = "x") +
455 geom_text(aes(x = 1, y = lbl_y, label = lbls), col = "black") +
456 coord_polar(theta = "y", direction = -1) +
457 labs(
458 x = NULL
459 , y = NULL
460 , title = "Percentages (and counts) of phosphosites, by type of residue"
461 , caption = sprintf("Roughly %s of peptides have multiple phosphosites.", pct_multiphos)
462 ) +
463 labs(x = NULL, y = NULL, fill = NULL) +
464 theme_classic() +
465 theme( legend.position="right"
466 , axis.line = element_blank()
467 , axis.text = element_blank()
468 , axis.ticks = element_blank()
469 , plot.title = element_text(hjust = 0.5)
470 , plot.subtitle = element_text(hjust = 0.5)
471 , plot.caption = element_text(hjust = 0.5)
472 , plot.title.position = "plot"
473 ) +
474 scale_fill_manual(breaks = phosphoresidue, values=c("#c7eae5", "#f6e8c3", "#dfc27d"))
475
476 pdf(enrichGraphFilename)
477 print(gigi)
478 dev.off()
479 svg(enrichGraphFilename_svg)
480 print(gigi)
481 dev.off()
482 # ...
483
484
485 # Filter phosphopeptides by enrichment
486 # --
487 if (enriched == "Y"){
488 quantData_qc_enrichment <- quantData_qc_collapsed[str_detect(quantData_qc_collapsed$Phosphopeptide, "pY"),]
489 } else if ( enriched == "ST" ) {
490 quantData_qc_enrichment <- quantData_qc_collapsed[str_detect(quantData_qc_collapsed$Phosphopeptide, "pS") | str_detect(quantData_qc_collapsed$Phosphopeptide, "pT"),]
491 } else {
492 print("Error in enriched variable. Set to either 'Y' or 'ST'")
493 }
494 # ...
495
496
497 # Write phosphopeptides filtered by enrichment
498 # --
499 write.table(quantData_qc_enrichment, file=outputfilename, sep="\t", quote = FALSE, row.names = FALSE)
500 # ...